<a href="https://colab.research.google.com/github/ashnegiii/air_check-that_group22/blob/main/Traditional_IR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install rank_bm25
import heapq

import pandas as pd
from rank_bm25 import BM25Okapi

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [3]:
# Load datasets
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects, so this
# file must come from a trusted source.
# Document collection; later cells read its 'title', 'abstract' and
# 'cord_uid' columns.
df_collection = pd.read_pickle('subtask4b_collection_data.pkl')
# Query tweets, one tab-separated file per split. Later cells read
# 'tweet_text' (dev, test), 'cord_uid' (dev, test gold) and 'post_id'
# (test, test gold). df_train is loaded but not referenced by the cells
# visible in this notebook.
df_train = pd.read_csv('subtask4b_query_tweets_train.tsv', sep='\t')
df_dev = pd.read_csv('subtask4b_query_tweets_dev.tsv', sep='\t')
df_test = pd.read_csv('subtask4b_query_tweets_test.tsv', sep='\t')
df_test_gold = pd.read_csv('subtask4b_query_tweets_test_gold.tsv', sep='\t')

In [4]:
# Report the (rows, columns) shape of every loaded frame, one per line.
for label, frame in [
    ("Collection shape:", df_collection),
    ("Train set shape:", df_train),
    ("Dev set shape:", df_dev),
    ("Test set shape:", df_test),
    ("Test gold shape:", df_test_gold),
]:
    print(label, frame.shape)

Collection shape: (7718, 17)
Train set shape: (12853, 3)
Dev set shape: (1400, 3)
Test set shape: (1446, 2)
Test gold shape: (1446, 3)


In [5]:
# Step 1: Preparing the documents
# Each document is "<title> <abstract>", with missing values replaced by ''.
# The two columns are iterated in lockstep instead of using a row-wise
# DataFrame.apply(axis=1): the resulting strings are the same, it avoids
# building a Series per row, and it yields [] for an empty collection
# (row-wise apply returns a DataFrame there, which has no .tolist()).
text_fields = df_collection[['title', 'abstract']].fillna('')
docs = [
    f"{title} {abstract}"
    for title, abstract in zip(text_fields['title'], text_fields['abstract'])
]
# cord_uids[i] is the identifier of docs[i]; both follow df_collection row order.
cord_uids = df_collection['cord_uid'].tolist()

In [6]:
# Step 2: Tokenizing and building the BM25 model
# Tokenization at this stage is only lowercasing plus a whitespace split.
tokenized_docs = list(map(lambda text: text.lower().split(), docs))

# Index the tokenized corpus with the library's default parameters.
bm25 = BM25Okapi(tokenized_docs)

print(f"BM25 index built on {len(tokenized_docs)} documents.")

BM25 index built on 7718 documents.


In [7]:
def get_top5(query, k=5):
    """Return the cord_uids of the ``k`` best-scoring documents for a query.

    The query is lowercased and split on whitespace, scored against every
    indexed document with the module-level ``bm25`` model, and the winning
    positions are mapped to identifiers through ``cord_uids``.

    Args:
        query: Raw query text.
        k: Maximum number of identifiers to return. Defaults to 5, the
            cutoff this function was originally hard-coded to.

    Returns:
        A list of at most ``k`` cord_uids, highest score first. Documents
        with equal scores keep their corpus order.
    """
    tokens = query.lower().split()
    scores = bm25.get_scores(tokens)
    # nlargest is documented as equivalent to sorted(..., reverse=True)[:k],
    # so order and tie-breaking match a full stable sort, but only k
    # candidates are tracked instead of sorting every document index.
    top_indices = heapq.nlargest(k, range(len(scores)), key=lambda i: scores[i])
    return [cord_uids[i] for i in top_indices]

# Applying to the dev set: one ranked list of cord_uids per tweet,
# stored in a new 'bm25_top5' column.
dev_queries = df_dev['tweet_text']
df_dev['bm25_top5'] = dev_queries.apply(get_top5)

In [8]:
def mrr_score(df, gold_col='cord_uid', pred_col='bm25_top5', k=None):
    """Mean reciprocal rank of the gold id within each row's ranked predictions.

    Args:
        df: DataFrame with one row per query.
        gold_col: Column holding the single correct identifier.
        pred_col: Column holding a ranked sequence of predicted identifiers
            (best first).
        k: Optional cutoff. When given, only the first ``k`` predictions of
            each row count (MRR@k). ``None`` (default) scores the lists as
            they are.

    Returns:
        The mean over rows of ``1 / rank`` of the gold identifier (rank is
        1-based), where a row whose gold identifier is absent contributes
        0.0. Returns 0.0 for a frame with no rows.
    """
    def reciprocal_rank(preds, gold):
        ranked = preds if k is None else preds[:k]
        try:
            return 1 / (ranked.index(gold) + 1)
        except ValueError:
            # Gold id not among the (possibly truncated) predictions.
            return 0.0

    # A row-wise apply on an empty frame returns a DataFrame, whose .mean()
    # is not a scalar; give the empty case a defined value instead.
    if len(df) == 0:
        return 0.0

    # Walk the two columns in lockstep rather than materialising a Series
    # per row with DataFrame.apply(axis=1).
    ranks = [reciprocal_rank(preds, gold) for preds, gold in zip(df[pred_col], df[gold_col])]
    return pd.Series(ranks, dtype='float64').mean()

# Evaluate on dev set: gold ids live in 'cord_uid', predictions in 'bm25_top5'.
mrr_dev = mrr_score(df_dev, gold_col='cord_uid', pred_col='bm25_top5')
print("MRR@5 on dev set:", round(mrr_dev, 4))

MRR@5 on dev set: 0.5874


In [9]:
# Apply BM25 to the test set: one ranked list of cord_uids per tweet,
# stored in a new 'bm25_top5' column.
test_queries = df_test['tweet_text']
df_test['bm25_top5'] = test_queries.apply(get_top5)

In [10]:
# Merge predictions with gold labels for evaluation.
# validate='one_to_one' makes pandas raise MergeError if a post_id is
# duplicated on either side, instead of silently multiplying rows.
df_test_eval = df_test.merge(
    df_test_gold[['post_id', 'cord_uid']], on='post_id', validate='one_to_one'
)
# The default inner join drops queries without a gold row, which would make
# the mean below cover only a subset of the test set; fail loudly instead.
assert len(df_test_eval) == len(df_test), "some test queries have no gold label"
mrr_test = mrr_score(df_test_eval)
print("MRR@5 on test set:", round(mrr_test, 4))

MRR@5 on test set: 0.4748


In [11]:
import string
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stop-word corpus (needed before stopwords.words can be read).
nltk.download('stopwords')

# English stop words, held in a set for membership tests.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

# Translation table that maps every character of string.punctuation to None,
# i.e. str.translate deletes those characters.
punct_table = {ord(ch): None for ch in string.punctuation}

def preprocess(text):
    """Tokenize text for BM25: lowercase, strip punctuation, drop stop words.

    Characters listed in ``punct_table`` are deleted (not replaced by a
    space) before the whitespace split, and stop words are filtered after
    that deletion.

    Returns:
        List of the remaining tokens in their original order.
    """
    normalized = text.lower().translate(punct_table)
    kept = []
    for token in normalized.split():
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Recreate tokenized corpus with preprocessing
# (same documents, same order, now tokenized via preprocess()).
tokenized_cleaned_docs = list(map(preprocess, docs))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
from rank_bm25 import BM25Okapi

# Define combinations to test, as (k1, b) pairs
param_grid = [
    (1.2, 0.75),
    (1.5, 0.75),  # default
    (2.0, 0.75),
    (1.5, 0.5),
    (1.5, 1.0),
]


def get_top5_custom(query):
    """Return the top-5 cord_uids for a query under the current ``bm25_custom``.

    The query goes through ``preprocess`` (the same tokenization as the
    indexed corpus). ``bm25_custom`` is looked up at call time, so this
    helper is defined once here and always scores against whichever model
    the loop below assigned most recently.
    """
    tokens = preprocess(query)
    scores = bm25_custom.get_scores(tokens)
    # Same order and tie-breaking as a full stable sort, without sorting
    # every document index for each query.
    top_indices = heapq.nlargest(5, range(len(scores)), key=lambda i: scores[i])
    return [cord_uids[i] for i in top_indices]


# Store results as (k1, b, dev MRR rounded to 4 decimals)
results = []

# Try each combination
for k1, b in param_grid:
    bm25_custom = BM25Okapi(tokenized_cleaned_docs, k1=k1, b=b)

    # Note: this overwrites df_dev['bm25_top5'] on every iteration, so after
    # the loop the column holds the predictions of the LAST combination.
    df_dev['bm25_top5'] = df_dev['tweet_text'].apply(get_top5_custom)
    score = mrr_score(df_dev)
    results.append((k1, b, round(score, 4)))
    print(f"MRR@5 with k1={k1}, b={b} → {round(score, 4)}")

# Show all results
results

MRR@5 with k1=1.2, b=0.75 → 0.6264
MRR@5 with k1=1.5, b=0.75 → 0.6274
MRR@5 with k1=2.0, b=0.75 → 0.6239
MRR@5 with k1=1.5, b=0.5 → 0.625
MRR@5 with k1=1.5, b=1.0 → 0.6215


[(1.2, 0.75, np.float64(0.6264)),
 (1.5, 0.75, np.float64(0.6274)),
 (2.0, 0.75, np.float64(0.6239)),
 (1.5, 0.5, np.float64(0.625)),
 (1.5, 1.0, np.float64(0.6215))]

In [14]:
# Rebuild best BM25 model
# Select the (k1, b) pair with the highest dev score recorded in `results`
# instead of copying the numbers by hand from the printed output, so this
# cell stays correct when the grid or the data changes. On ties, max() keeps
# the first entry in grid order.
best_k1, best_b, best_dev_mrr = max(results, key=lambda entry: entry[2])
final_bm25 = BM25Okapi(tokenized_cleaned_docs, k1=best_k1, b=best_b)

# Use best model to generate predictions on test
def get_top5_final(query, k=5):
    """Return the cord_uids of the ``k`` best documents under ``final_bm25``.

    The query is tokenized with ``preprocess`` and scored against every
    indexed document; winning positions are mapped to identifiers through
    ``cord_uids``.

    Args:
        query: Raw query text.
        k: Maximum number of identifiers to return. Defaults to 5, the
            cutoff this function was originally hard-coded to.

    Returns:
        A list of at most ``k`` cord_uids, highest score first. Documents
        with equal scores keep their corpus order.
    """
    tokens = preprocess(query)
    scores = final_bm25.get_scores(tokens)
    # nlargest is documented as equivalent to sorted(..., reverse=True)[:k],
    # so order and tie-breaking match a full stable sort, but only k
    # candidates are tracked instead of sorting every document index.
    top_indices = heapq.nlargest(k, range(len(scores)), key=lambda i: scores[i])
    return [cord_uids[i] for i in top_indices]

# Replace the earlier test predictions in 'bm25_top5' with those of the
# final model.
test_queries = df_test['tweet_text']
df_test['bm25_top5'] = test_queries.apply(get_top5_final)

In [15]:
# Merge predictions with gold labels.
# validate='one_to_one' makes pandas raise MergeError if a post_id is
# duplicated on either side, instead of silently multiplying rows.
df_test_eval = df_test.merge(
    df_test_gold[['post_id', 'cord_uid']], on='post_id', validate='one_to_one'
)
# The default inner join drops queries without a gold row, which would make
# the mean below cover only a subset of the test set; fail loudly instead.
assert len(df_test_eval) == len(df_test), "some test queries have no gold label"
mrr_final_test = mrr_score(df_test_eval)
print("Enhanced MRR@5 on test set:", round(mrr_final_test, 4))

Enhanced MRR@5 on test set: 0.5137


In [16]:
# Save enhanced predictions to a TSV file with columns 'post_id' and 'preds'
# (the ranked cord_uid list, renamed from 'bm25_top5'); no index column.
submission = df_test[['post_id', 'bm25_top5']].rename(columns={'bm25_top5': 'preds'})
submission.to_csv('bm25_enhanced_predictions.tsv', sep='\t', index=False)