# BBC Text Representations - Final Submission

**Roll Number:** SE22UARI195  
**Course:** Computational Sequence Modeling (CSM)  

---

## Assignment Overview
This notebook loads the previously built artifacts for, and compiles and compares the final results of, multiple text representation methods:
- **Sparse:** OHE, BoW, N-grams, TF-IDF
- **Dense:** Word2Vec (Skip-gram/CBOW with NS/HS), GloVe
- **Tasks:** Classification (Logistic Regression), Retrieval (Cosine Similarity)

---

## 1. Setup and Imports

In [1]:
# Core libraries
import pandas as pd
import numpy as np
import json
import pickle
import hashlib
import zlib
from pathlib import Path
from collections import Counter
import time
import sys

# Sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Word embeddings
from gensim.models import Word2Vec

# Ensure NLTK data
# The punkt tokenizer is looked up under 'tokenizers/'; every other resource
# requested here is looked up under 'corpora/'.
missing_nltk_resources = []
for resource in ['punkt', 'stopwords', 'wordnet', 'omw-1.4']:
    try:
        nltk.data.find(f'tokenizers/{resource}' if resource == 'punkt' else f'corpora/{resource}')
    except LookupError:
        # nltk.download() reports failure through its return value rather than
        # by raising, so check it instead of assuming the download worked.
        if not nltk.download(resource, quiet=True):
            missing_nltk_resources.append(resource)

if missing_nltk_resources:
    # Surface the problem instead of letting the cell look fully successful.
    print(f"WARNING: NLTK resources not available: {missing_nltk_resources}")

print("All imports successful!")

All imports successful!


[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1032)>
[nltk_data] Error loading omw-1.4: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1032)>


## 2. Load Data and Compute Deterministic Splits

In [2]:
from pathlib import Path
import pandas as pd
import zlib

# Configuration - CORRECTED PATHS
ROLL = 'SE22UARI195'
MASTER_CSV = '../data/master.csv'
CACHE_DIR = Path('../cache')
MODELS_DIR = Path('../models')
OUTPUTS_DIR = Path('../outputs')

# Read the master table; its `fold5` column drives the split below.
df = pd.read_csv(MASTER_CSV)

# Derive the DEV / TEST folds from the CRC32 of the roll number so the split
# is reproducible: DEV is the hash modulo 5, TEST is the next base-5 digit,
# bumped by one (mod 5) when it would collide with DEV.
r = zlib.crc32(ROLL.encode())
fold_quotient, dev_fold = divmod(r, 5)
test_fold = fold_quotient % 5
if test_fold == dev_fold:
    test_fold = (test_fold + 1) % 5

# Row masks for the two held-out folds; everything else is training data.
in_dev = df['fold5'] == dev_fold
in_test = df['fold5'] == test_fold
DEV = df.loc[in_dev].reset_index(drop=True)
TEST = df.loc[in_test].reset_index(drop=True)
TRAIN = df.loc[~(in_dev | in_test)].reset_index(drop=True)

train_folds = sorted(fold for fold in range(5) if fold not in (dev_fold, test_fold))
total_docs = len(df)

print(f"CRC32 Hash: {r}")
print(f"Fold Assignment:")
print(f"  DEV fold:  {dev_fold}")
print(f"  TEST fold: {test_fold}")
print(f"  TRAIN folds: {train_folds}")
print(f"\nSplit Sizes:")
print(f"  TRAIN: {len(TRAIN)} documents ({len(TRAIN)/total_docs*100:.1f}%)")
print(f"  DEV:   {len(DEV)} documents ({len(DEV)/total_docs*100:.1f}%)")
print(f"  TEST:  {len(TEST)} documents ({len(TEST)/total_docs*100:.1f}%)")
print(f"  TOTAL: {total_docs} documents")


CRC32 Hash: 1507797122
Fold Assignment:
  DEV fold:  2
  TEST fold: 4
  TRAIN folds: [0, 1, 3]

Split Sizes:
  TRAIN: 1335 documents (60.0%)
  DEV:   445 documents (20.0%)
  TEST:  445 documents (20.0%)
  TOTAL: 2225 documents


## 3. Load Cached Processed Data

In [3]:
# Load preprocessed data from cache
# NOTE: read_pickle deserializes arbitrary Python objects; only use it on
# cache files you produced yourself.
train_processed, dev_processed, test_processed, vocab_counter = (
    pd.read_pickle(CACHE_DIR / cache_file)
    for cache_file in (
        'train_processed.pkl',
        'dev_processed.pkl',
        'test_processed.pkl',
        'vocab_counter.pkl',
    )
)

# Token totals are the summed lengths of each split's `tokens` entries.
train_token_total = sum(len(doc_tokens) for doc_tokens in train_processed['tokens'])
dev_token_total = sum(len(doc_tokens) for doc_tokens in dev_processed['tokens'])
test_token_total = sum(len(doc_tokens) for doc_tokens in test_processed['tokens'])

print(f" Loaded processed data from cache")
print(f"  Vocabulary size: {len(vocab_counter):,}")
print(f"  TRAIN tokens: {train_token_total:,}")
print(f"  DEV tokens: {dev_token_total:,}")
print(f"  TEST tokens: {test_token_total:,}")

 Loaded processed data from cache
  Vocabulary size: 20,404
  TRAIN tokens: 285,829
  DEV tokens: 97,572
  TEST tokens: 100,831


## 4. Load Sparse Representations

In [4]:
import numpy as np
from scipy import sparse

# Load sparse representations (saved as .npz files)
X_train_ohe, X_train_bow, X_train_ngram, X_train_tfidf = (
    sparse.load_npz(MODELS_DIR / npz_file)
    for npz_file in ('X_train_ohe.npz', 'X_train_bow.npz', 'X_train_ngram.npz', 'X_train_tfidf.npz')
)

X_test_ohe, X_test_bow, X_test_ngram, X_test_tfidf = (
    sparse.load_npz(MODELS_DIR / npz_file)
    for npz_file in ('X_test_ohe.npz', 'X_test_bow.npz', 'X_test_ngram.npz', 'X_test_tfidf.npz')
)

# Load vectorizers (saved as .pkl files)
import pickle


def _unpickle(path):
    """Open `path` in binary mode and return the object stored in it.

    NOTE: pickle.load can execute arbitrary code; only use it on files
    you produced yourself.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


ohe_vec = _unpickle(MODELS_DIR / 'ohe_vectorizer.pkl')
bow_vec = _unpickle(MODELS_DIR / 'bow_vectorizer.pkl')
ngram_vec = _unpickle(MODELS_DIR / 'ngram_vectorizer.pkl')
tfidf_vec = _unpickle(MODELS_DIR / 'tfidf_vectorizer.pkl')

# Report the training-matrix shape of each sparse representation.
print(" Loaded all sparse representations")
print(f"  OHE shape: {X_train_ohe.shape}")
print(f"  BoW shape: {X_train_bow.shape}")
print(f"  N-gram shape: {X_train_ngram.shape}")
print(f"  TF-IDF shape: {X_train_tfidf.shape}")


 Loaded all sparse representations
  OHE shape: (1335, 2000)
  BoW shape: (1335, 11515)
  N-gram shape: (1335, 18625)
  TF-IDF shape: (1335, 11515)


## 5. Load Dense Representations

In [5]:
from gensim.models import Word2Vec
import pickle

# Load Word2Vec models
w2v_sg_ns, w2v_cbow_ns, w2v_sg_hs, w2v_cbow_hs = (
    Word2Vec.load(str(MODELS_DIR / model_file))
    for model_file in ('w2v_sg_ns.model', 'w2v_cbow_ns.model', 'w2v_sg_hs.model', 'w2v_cbow_hs.model')
)

# Load GloVe embeddings dictionary: prefer the pickled dict, otherwise parse
# the original text file (one "<word> <v1> <v2> ..." entry per line).
glove_pkl_path = MODELS_DIR / 'glove_embeddings.pkl'
if not glove_pkl_path.exists():
    print("⚠️ glove_embeddings.pkl not found, loading from txt (this may take a moment)...")
    glove_path = Path('../data/glove.6B.100d.txt')
    glove_dict = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # First field is the word, the remaining fields are its coordinates.
            glove_dict[values[0]] = np.array(values[1:], dtype='float32')
    print(f"Loaded GloVe: {len(glove_dict):,} words")
else:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(glove_pkl_path, 'rb') as f:
        glove_dict = pickle.load(f)
    print("Loaded GloVe from pickle")

# Load dense document representations (saved as .npy files)
X_train_w2v_ns, X_train_w2v_hs, X_train_glove = (
    np.load(MODELS_DIR / npy_file)
    for npy_file in ('X_train_w2v_sg_ns.npy', 'X_train_w2v_sg_hs.npy', 'X_train_glove.npy')
)
X_test_w2v_ns, X_test_w2v_hs, X_test_glove = (
    np.load(MODELS_DIR / npy_file)
    for npy_file in ('X_test_w2v_sg_ns.npy', 'X_test_w2v_sg_hs.npy', 'X_test_glove.npy')
)

print(" Loaded all dense representations")
print(f"  W2V Skip-gram NS shape: {X_train_w2v_ns.shape}")
print(f"  W2V Skip-gram HS shape: {X_train_w2v_hs.shape}")
print(f"  GloVe shape: {X_train_glove.shape}")



⚠️ glove_embeddings.pkl not found, loading from txt (this may take a moment)...
Loaded GloVe: 400,000 words
 Loaded all dense representations
  W2V Skip-gram NS shape: (1335, 100)
  W2V Skip-gram HS shape: (1335, 100)
  GloVe shape: (1335, 100)


## 6. Aggregate Health Metrics

In [6]:
# Health metrics from your notebook outputs
# Only 'nnz' is read live from the loaded training matrices; every other
# number below is a literal typed into this cell and is NOT recomputed here,
# so it goes stale if the representations are rebuilt.
# The 'V' literals match the column counts printed when the sparse matrices
# were loaded (2000 / 11515 / 18625 / 11515).
# NOTE(review): 'topk_100' / 'topk_500' are exactly 0.0 for 'ohe' and 'ngram' —
# these look like unfilled placeholders; confirm against the source notebook.
health_metrics = {
    'ohe': {
        'V': 2000,
        'nnz': int(X_train_ohe.nnz),
        'sparsity': 0.9498,
        'oov': 0.2863,
        'topk_100': 0.0,
        'topk_500': 0.0,
        'fit_s': 0.224,
        'ms_per_doc': 0.124,
        'mem_mb': 1.539
    },
    'bow': {
        'V': 11515,
        'nnz': int(X_train_bow.nnz),
        'sparsity': 0.9879,
        'oov': 0.0706,
        'topk_100': 0.1195,
        'topk_500': 0.3344,
        'fit_s': 0.221,
        'ms_per_doc': 0.173,
        'mem_mb': 2.142
    },
    'ngram': {
        'V': 18625,
        'nnz': int(X_train_ngram.nnz),
        'sparsity': 0.9907,
        'oov': 0.0935,
        'topk_100': 0.0,
        'topk_500': 0.0,
        'fit_s': 0.724,
        'ms_per_doc': 0.374,
        'mem_mb': 2.648
    },
    'tfidf': {
        'V': 11515,
        'nnz': int(X_train_tfidf.nnz),
        'sparsity': 0.9879,
        'oov': 0.0706,
        'topk_100': 0.1195,
        'topk_500': 0.3344,
        'fit_s': 0.200,
        'ms_per_doc': 0.141,
        'mem_mb': 2.142
    },
    # The dense methods record only training throughput (Word2Vec) or nothing (GloVe).
    'w2v_ns': {
        'tokens_per_sec': 45760.0
    },
    'w2v_hs': {
        'tokens_per_sec': 44775.0
    },
    'glove': {}
}

print(" Health metrics compiled")

 Health metrics compiled


## 7. Classification Results

In [7]:
# Classification metrics from your notebook outputs
# These are literals typed into this cell; nothing here re-trains or
# re-evaluates a classifier.
# NOTE(review): 'ngram' and 'tfidf' carry identical scores — confirm this is a
# genuine tie and not a copy-paste slip.
classification_metrics = {
    'ohe': {
        'macro_f1': 0.9654,
        'accuracy': 0.9663
    },
    'bow': {
        'macro_f1': 0.9683,
        'accuracy': 0.9685
    },
    'ngram': {
        'macro_f1': 0.9639,
        'accuracy': 0.9640
    },
    'tfidf': {
        'macro_f1': 0.9639,
        'accuracy': 0.9640
    },
    'w2v_ns_tfidf': {
        'macro_f1': 0.9306,
        'accuracy': 0.9326
    },
    'w2v_hs_tfidf': {
        'macro_f1': 0.9327,
        'accuracy': 0.9348
    },
    'glove_tfidf': {
        'macro_f1': 0.9267,
        'accuracy': 0.9281
    }
}

# Derive the winner from the table so the report cannot disagree with the
# numbers above if they are edited later (ties resolve to the first entry).
best_method = max(classification_metrics, key=lambda name: classification_metrics[name]['macro_f1'])

print(" Classification metrics compiled")
print(f"\n Best method: {best_method} (Macro-F1: {classification_metrics[best_method]['macro_f1']:.4f})")

 Classification metrics compiled

 Best method: bow (Macro-F1: 0.9683)


## 8. Load and Compute Retrieval Metrics

In [8]:
# Load queries and rankings
# Decode explicitly as UTF-8 (the encoding this notebook uses when it writes
# results.json) rather than relying on the platform's default text encoding,
# which is not UTF-8 everywhere.
with open(OUTPUTS_DIR / 'queries.json', 'r', encoding='utf-8') as f:
    queries = json.load(f)

with open(OUTPUTS_DIR / 'rankings.json', 'r', encoding='utf-8') as f:
    rankings = json.load(f)

print(f" Loaded {len(queries)} queries and rankings")

# Helper functions for retrieval metrics
def map_at_k(truth, ranks, k=5):
    """Mean average precision over the top-k results of every query.

    Args:
        truth: mapping qid -> iterable of relevant document ids.
        ranks: mapping qid -> ranked list of document ids (best first).
        k: cut-off rank; must be >= 1.

    Returns:
        The mean of the per-query AP@k values as a float. Each query's
        precision sum is normalised by min(k, number of relevant docs).
        Queries with no relevant documents are skipped; 0.0 is returned
        when no query could be scored.

    Raises:
        ValueError: if k < 1 (previously surfaced as a ZeroDivisionError).
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    ap_scores = []
    for qid, ranked_docs in ranks.items():
        # A set gives O(1) membership tests and ignores duplicate truth ids,
        # which would otherwise inflate the normalising denominator.
        relevant = set(truth.get(qid) or ())
        if not relevant:
            continue
        hits = 0
        score = 0.0
        for position, doc_id in enumerate(ranked_docs[:k], 1):
            if doc_id in relevant:
                hits += 1
                score += hits / position  # precision at this hit's rank
        ap_scores.append(score / min(k, len(relevant)))
    return float(np.mean(ap_scores)) if ap_scores else 0.0

def recall_at_k(truth, ranks, k=10):
    """Mean recall over the top-k results of every query.

    Args:
        truth: mapping qid -> iterable of relevant document ids.
        ranks: mapping qid -> ranked list of document ids (best first).
        k: cut-off rank.

    Returns:
        The mean, as a float, of (distinct relevant docs found in the top k)
        / (distinct relevant docs). Queries with no relevant documents are
        skipped; 0.0 is returned when no query could be scored.
    """
    recalls = []
    for qid, ranked_docs in ranks.items():
        relevant = set(truth.get(qid) or ())
        if not relevant:
            continue
        # Set intersection counts each relevant document at most once, so a
        # ranking that repeats an id can no longer push recall above 1.
        retrieved = len(relevant.intersection(ranked_docs[:k]))
        recalls.append(retrieved / len(relevant))
    return float(np.mean(recalls)) if recalls else 0.0

def neg_top1_accuracy(queries_list, ranks, docs=None):
    """Share of negation queries whose top-ranked document keeps the source label.

    Args:
        queries_list: list of query dicts with 'qid', 'type' and
            'source_label' keys; only entries with type == 'neg' are scored.
        ranks: mapping qid -> ranked list of document ids (best first).
        docs: table with 'id' and 'label' columns used to look up the label
            of a ranked document. Defaults to the notebook-level TEST frame.

    Returns:
        correct / number_of_negation_queries as a float, or 0.0 when there
        are no negation queries. A query with no ranking, or whose top
        document is not present in `docs`, counts as incorrect.
    """
    if docs is None:
        docs = TEST  # notebook-level TEST split

    neg_queries = [q for q in queries_list if q['type'] == 'neg']
    if not neg_queries:
        return 0.0

    # Build the id -> label lookup once instead of filtering the whole table
    # for every query. setdefault keeps the first label seen for an id.
    label_by_id = {}
    for doc_id, label in zip(docs['id'], docs['label']):
        label_by_id.setdefault(doc_id, label)

    correct = 0
    for q in neg_queries:
        source_label = q['source_label']
        ranked = ranks.get(q['qid'], [])
        if not ranked:
            continue
        top_doc_id = ranked[0]
        if top_doc_id in label_by_id and label_by_id[top_doc_id] == source_label:
            correct += 1
    return float(correct / len(neg_queries))

# Build ground truth (same-label docs as relevant): for every query, the
# relevant set is all TEST documents whose label equals the query's source label.
ground_truth = {
    q['qid']: TEST.loc[TEST['label'] == q['source_label'], 'id'].tolist()
    for q in queries
}

# Compute retrieval metrics (using TF-IDF rankings from your output).
# The dense methods are entered as 0.0 placeholders in this cell.
tfidf_scores = {
    'map@5': map_at_k(ground_truth, rankings, k=5),
    'recall@10': recall_at_k(ground_truth, rankings, k=10),
    'neg_top1': neg_top1_accuracy(queries, rankings),
}
retrieval_metrics = {'tfidf': tfidf_scores}
for dense_method in ('w2v_ns_tfidf', 'w2v_hs_tfidf', 'glove_tfidf'):
    retrieval_metrics[dense_method] = {'map@5': 0.0, 'recall@10': 0.0, 'neg_top1': 0.0}

print(" Retrieval metrics computed")
print(f"  MAP@5: {tfidf_scores['map@5']:.4f}")
print(f"  Recall@10: {tfidf_scores['recall@10']:.4f}")
print(f"  Negation Top-1%: {tfidf_scores['neg_top1']:.4f}")

 Loaded 20 queries and rankings
 Retrieval metrics computed
  MAP@5: 0.6993
  Recall@10: 0.0710
  Negation Top-1%: 1.0000


In [9]:
# ============================================
# Compute Retrieval Metrics for W2V and GloVe
# ============================================

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

print("Computing retrieval rankings for dense methods...")
print("=" * 60)

# Helper function to get document vectors for queries
def _embedding_dim(lookup, default=100):
    """Best-effort dimensionality of an embedding table.

    Uses a `vector_size` attribute when the table has one, otherwise the
    length of the first vector of a non-empty dict, otherwise `default`.
    """
    size = getattr(lookup, 'vector_size', None)
    if size is None and isinstance(lookup, dict) and lookup:
        size = len(next(iter(lookup.values())))
    return int(size) if size is not None else default


def get_query_vector(query_text, model_type='w2v_ns', tfidf_vec=None, model=None):
    """Embed a query as the unweighted mean of its in-vocabulary word vectors.

    Args:
        query_text: raw query string; it is lower-cased and split on
            whitespace (no other preprocessing is applied here).
        model_type: 'w2v_ns', 'w2v_hs' or 'glove'. The Word2Vec types read
            vectors from `model.wv`; 'glove' treats the model as a plain
            word -> vector mapping.
        tfidf_vec: unused; kept so existing callers keep working.
        model: optional embedding model to use instead of the notebook-level
            w2v_sg_ns / w2v_sg_hs / glove_dict selected by `model_type`.

    Returns:
        A 1-D float64 array. When no query token is in the vocabulary, a zero
        vector sized to the embedding table (100 if the size cannot be
        determined).

    Raises:
        ValueError: for an unknown `model_type` (previously this fell through
            to an unbound-variable error).

    NOTE(review): the documents were presumably embedded from the cached
    preprocessed tokens; confirm whether queries should go through the same
    preprocessing and weighting instead of a bare lower()/split().
    """
    if model_type not in ('w2v_ns', 'w2v_hs', 'glove'):
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'w2v_ns', 'w2v_hs' or 'glove'"
        )

    if model is None:
        # Fall back to the models loaded earlier in the notebook.
        if model_type == 'w2v_ns':
            model = w2v_sg_ns
        elif model_type == 'w2v_hs':
            model = w2v_sg_hs
        else:
            model = glove_dict

    # Word2Vec models keep their vectors on `.wv`; GloVe is already a mapping.
    lookup = model.wv if model_type in ('w2v_ns', 'w2v_hs') else model

    tokens = query_text.lower().split()
    word_vectors = [lookup[token] for token in tokens if token in lookup]

    if not word_vectors:
        # Size the zero vector from the table rather than assuming 100 dimensions.
        return np.zeros(_embedding_dim(lookup))

    # All weights were 1.0 in the original weighted average, i.e. a plain mean.
    return np.mean(np.asarray(word_vectors, dtype=np.float64), axis=0)

# Function to compute rankings for a method
def compute_rankings_for_method(X_test_dense, method_name, top_k=10):
    """Rank TEST documents for every query by cosine similarity.

    Args:
        X_test_dense: dense document matrix whose rows are positionally
            aligned with the rows of the notebook-level TEST frame.
        method_name: name used to pick the query embedding: names containing
            'w2v_ns' or 'sg_ns' use Word2Vec NS, names containing 'w2v_hs' or
            'sg_hs' use Word2Vec HS, anything else uses GloVe.
        top_k: number of documents to keep per query (default 10).

    Returns:
        dict mapping each query's 'qid' to the list of the `top_k` TEST
        document ids with the highest cosine similarity, best first.
    """
    # The embedding choice depends only on method_name, so resolve it once
    # rather than once per query.
    if 'w2v_ns' in method_name or 'sg_ns' in method_name:
        model_type = 'w2v_ns'
    elif 'w2v_hs' in method_name or 'sg_hs' in method_name:
        model_type = 'w2v_hs'
    else:  # GloVe
        model_type = 'glove'

    rankings_dict = {}
    for query in queries:
        query_vec = get_query_vector(query['query_text'], model_type=model_type)

        # cosine_similarity expects 2-D input; row 0 holds this query's scores
        # against every test document.
        similarities = cosine_similarity(query_vec.reshape(1, -1), X_test_dense)[0]

        # Indices of the highest-scoring documents, best first.
        top_indices = np.argsort(similarities)[::-1][:top_k]

        # Translate row positions into document ids.
        rankings_dict[query['qid']] = TEST.iloc[top_indices]['id'].tolist()

    return rankings_dict

# Rank the TEST documents under each dense representation
print("\n1. Computing W2V Skip-gram NS rankings...")
rankings_w2v_ns = compute_rankings_for_method(X_test_w2v_ns, 'w2v_ns')

print("2. Computing W2V Skip-gram HS rankings...")
rankings_w2v_hs = compute_rankings_for_method(X_test_w2v_hs, 'w2v_hs')

print("3. Computing GloVe rankings...")
rankings_glove = compute_rankings_for_method(X_test_glove, 'glove')

print("\n All rankings computed!")

# Banner for the scoring stage
separator = "=" * 60
print("\n" + separator)
print("Computing Retrieval Metrics")
print(separator)

# Relevance judgement per query: every TEST document sharing the query's source label
ground_truth = {
    q['qid']: TEST.loc[TEST['label'] == q['source_label'], 'id'].tolist()
    for q in queries
}


def _score_rankings(method_rankings):
    """Return (MAP@5, Recall@10, negation top-1 accuracy) for one set of rankings."""
    return (
        map_at_k(ground_truth, method_rankings, k=5),
        recall_at_k(ground_truth, method_rankings, k=10),
        neg_top1_accuracy(queries, method_rankings),
    )


def _report_scores(scores):
    """Print the three retrieval scores with four decimals."""
    map5, recall10, neg_acc = scores
    print(f"  MAP@5: {map5:.4f}")
    print(f"  Recall@10: {recall10:.4f}")
    print(f"  Negation Top-1%: {neg_acc:.4f}")


# W2V Skip-gram with negative sampling
map_w2v_ns, recall_w2v_ns, neg_w2v_ns = _score_rankings(rankings_w2v_ns)
print(f"\nW2V Skip-gram NS:")
_report_scores((map_w2v_ns, recall_w2v_ns, neg_w2v_ns))

# W2V Skip-gram with hierarchical softmax
map_w2v_hs, recall_w2v_hs, neg_w2v_hs = _score_rankings(rankings_w2v_hs)
print(f"\nW2V Skip-gram HS:")
_report_scores((map_w2v_hs, recall_w2v_hs, neg_w2v_hs))

# GloVe
map_glove, recall_glove, neg_glove = _score_rankings(rankings_glove)
print(f"\nGloVe:")
_report_scores((map_glove, recall_glove, neg_glove))

# Rebuild retrieval_metrics: carry the existing TF-IDF scores over unchanged
# and replace the dense-method entries with the values computed above.
previous_tfidf = retrieval_metrics['tfidf']
retrieval_metrics = {
    'tfidf': {
        'map@5': previous_tfidf['map@5'],
        'recall@10': previous_tfidf['recall@10'],
        'neg_top1': previous_tfidf['neg_top1'],
    },
    'w2v_ns_tfidf': {'map@5': map_w2v_ns, 'recall@10': recall_w2v_ns, 'neg_top1': neg_w2v_ns},
    'w2v_hs_tfidf': {'map@5': map_w2v_hs, 'recall@10': recall_w2v_hs, 'neg_top1': neg_w2v_hs},
    'glove_tfidf': {'map@5': map_glove, 'recall@10': recall_glove, 'neg_top1': neg_glove},
}

print("\n" + separator)
print(" All retrieval metrics computed!")
print(separator)


Computing retrieval rankings for dense methods...

1. Computing W2V Skip-gram NS rankings...
2. Computing W2V Skip-gram HS rankings...
3. Computing GloVe rankings...

 All rankings computed!

Computing Retrieval Metrics

W2V Skip-gram NS:
  MAP@5: 0.8325
  Recall@10: 0.0911
  Negation Top-1%: 1.0000

W2V Skip-gram HS:
  MAP@5: 0.8402
  Recall@10: 0.0916
  Negation Top-1%: 1.0000

GloVe:
  MAP@5: 0.8673
  Recall@10: 0.0943
  Negation Top-1%: 1.0000

 All retrieval metrics computed!


## 9. Generate results.json

In [10]:
# Compile all results
# Fold numbers are cast to plain int so they serialise cleanly as JSON numbers.
results = {
    'roll': ROLL,
    'dev_fold': int(dev_fold),
    'test_fold': int(test_fold),
    'health': health_metrics,
    'classification': classification_metrics,
    'retrieval': retrieval_metrics
}

# Save results.json
results_path = OUTPUTS_DIR / 'results.json'
with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

# Pick the best classifier from the metrics table instead of hard-coding its
# name, so the summary cannot contradict the data that was just written
# (ties resolve to the first entry).
best_classifier = max(classification_metrics, key=lambda name: classification_metrics[name]['macro_f1'])

print(f" Saved: {results_path}")
print(f"\n Results Summary:")
print(f"  Roll: {ROLL}")
print(f"  DEV fold: {dev_fold}, TEST fold: {test_fold}")
print(f"  Best classification: {best_classifier} (F1: {classification_metrics[best_classifier]['macro_f1']:.4f})")
print(f"  TF-IDF MAP@5: {retrieval_metrics['tfidf']['map@5']:.4f}")

 Saved: ../outputs/results.json

 Results Summary:
  Roll: SE22UARI195
  DEV fold: 2, TEST fold: 4
  Best classification: bow (F1: 0.9683)
  TF-IDF MAP@5: 0.6993


## 10. Print Query and Ranking Signatures

In [11]:
# Generate deterministic signatures
def _sha256_of_lines(lines):
    """Join `lines` with newlines, encode them, and return the SHA-256 hex digest."""
    joined = '\n'.join(lines)
    return hashlib.sha256(joined.encode()).hexdigest()


# One "qid|query text" line per query, sorted so the hash ignores query order.
query_strings = sorted(f"{q['qid']}|{q['query_text']}" for q in queries)
QUERY_SIGNATURE = _sha256_of_lines(query_strings)

# One "qid|doc1,doc2,..." line per query (first ten ranked ids), ordered by qid.
rank_strings = []
for qid in sorted(rankings.keys()):
    top_ten = rankings[qid][:10]
    rank_strings.append(f"{qid}|{','.join(top_ten)}")
RANK_SIGNATURE = _sha256_of_lines(rank_strings)

print(" Submission Signatures:")
print(f"QUERY_SIGNATURE: {QUERY_SIGNATURE}")
print(f"RANK_SIGNATURE:  {RANK_SIGNATURE}")

 Submission Signatures:
QUERY_SIGNATURE: 76a19dbc0010bdf730183e42c05c434bbe5f0d1c3066dbc6563d55f0555208d8
RANK_SIGNATURE:  30da9df4e76e94f4af33e5e276067e51c6471c1d87140e5dde634114d82f94cc


## 11. Submission Validation

In [2]:
# Validation checks
# This cell relies on names created by the earlier cells (OUTPUTS_DIR, ROLL,
# dev_fold, test_fold, TEST, queries, rankings and the json / pd / zlib
# imports). Run it after them: on a fresh kernel every check fails with a
# NameError, which says nothing about the submission files themselves.
print(" Running submission validation...\n")


def _check_results_json():
    """CHECK 1: results.json parses and echoes the roll number and fold assignment."""
    with open(OUTPUTS_DIR / 'results.json', 'r', encoding='utf-8') as f:
        loaded_results = json.load(f)
    assert loaded_results['roll'] == ROLL, \
        f"roll is {loaded_results['roll']!r}, expected {ROLL!r}"
    assert loaded_results['dev_fold'] == dev_fold, \
        f"dev_fold is {loaded_results['dev_fold']}, expected {dev_fold}"
    assert loaded_results['test_fold'] == test_fold, \
        f"test_fold is {loaded_results['test_fold']}, expected {test_fold}"


def _check_preds_csv():
    """CHECK 2: preds_test.csv has exactly the columns id/pred and covers the TEST ids."""
    preds = pd.read_csv(OUTPUTS_DIR / 'preds_test.csv')
    assert set(preds.columns) == {'id', 'pred'}, \
        f"columns are {sorted(preds.columns)}, expected ['id', 'pred']"
    assert len(preds) == len(TEST), \
        f"{len(preds)} prediction rows, expected {len(TEST)}"
    assert set(preds['id']) == set(TEST['id']), \
        "prediction ids do not match the TEST ids"


def _check_queries():
    """CHECK 3: there are 20 queries, 15 of type 'tfidf' and 5 of type 'neg'."""
    assert len(queries) == 20, f"{len(queries)} queries, expected 20"
    tfidf_queries = [q for q in queries if q['type'] == 'tfidf']
    neg_queries = [q for q in queries if q['type'] == 'neg']
    assert len(tfidf_queries) == 15, f"{len(tfidf_queries)} TF-IDF queries, expected 15"
    assert len(neg_queries) == 5, f"{len(neg_queries)} negation queries, expected 5"


def _check_rankings():
    """CHECK 4: every one of the 20 rankings lists 10 ids that all exist in TEST."""
    assert len(rankings) == 20, f"{len(rankings)} rankings, expected 20"
    test_ids = set(TEST['id'])  # built once instead of scanning the column per id
    for qid, docs in rankings.items():
        assert len(docs) == 10, f"query {qid} has {len(docs)} ranked docs, expected 10"
        unknown = [doc_id for doc_id in docs if doc_id not in test_ids]
        assert not unknown, f"query {qid} ranks ids that are not in TEST: {unknown}"


def _check_fold_assignment():
    """CHECK 5: recomputing the folds from the roll number reproduces dev_fold/test_fold."""
    r_check = zlib.crc32(ROLL.encode())
    dev_check = r_check % 5
    test_check = (r_check // 5) % 5
    if test_check == dev_check:
        test_check = (test_check + 1) % 5
    assert dev_fold == dev_check, f"dev_fold is {dev_fold}, recomputed {dev_check}"
    assert test_fold == test_check, f"test_fold is {test_fold}, recomputed {test_check}"


validation_checks = [
    (1, "results.json valid", _check_results_json),
    (2, "preds_test.csv valid", _check_preds_csv),
    (3, "queries.json valid (20 queries: 15 TF-IDF + 5 negation)", _check_queries),
    (4, "rankings.json valid (10 TEST docs per query)", _check_rankings),
    (5, "Fold assignment deterministic", _check_fold_assignment),
]

checks_passed = 0
total_checks = 0

for check_number, success_message, run_check in validation_checks:
    total_checks += 1
    try:
        run_check()
    except NameError as e:
        # Missing notebook state, not a problem with the submission files.
        print(f" CHECK {check_number} FAILED: {e} (run the earlier notebook cells first)")
    except Exception as e:
        print(f" CHECK {check_number} FAILED: {e}")
    else:
        print(f" CHECK {check_number}: {success_message}")
        checks_passed += 1

# Report the tally that the counters above were collecting.
print(f"\n Validation summary: {checks_passed}/{total_checks} checks passed")


 Running submission validation...

 CHECK 1 FAILED: name 'OUTPUTS_DIR' is not defined
 CHECK 2 FAILED: name 'pd' is not defined
 CHECK 3 FAILED: name 'queries' is not defined
 CHECK 4 FAILED: name 'rankings' is not defined
 CHECK 5 FAILED: name 'zlib' is not defined
