In [8]:
import pandas as pd

# Load the documents
doc_columns = ['ID', 'URL', 'TITLE', 'ABSTRACT']
documents = pd.read_csv('./nfcorpus/raw/doc_dump.txt', sep='\t', names=doc_columns, header=None)

qrel_columns = ['QUERY_ID', 'UNUSED', 'DOC_ID', 'RELEVANCE_LEVEL']
qrels = pd.read_csv('./nfcorpus/merged.qrel', sep='\t', names=qrel_columns, header=None)

In [9]:
query_columns = ['QUERY_ID', 'QUERY_TEXT']
queries = pd.read_csv('./nfcorpus/train.titles.queries', sep='\t', names=query_columns, header=None)

# Filter qrels to only include train query IDs
train_qrels = qrels[qrels['QUERY_ID'].isin(queries['QUERY_ID'])]
print(len(qrels))

# Check if all query IDs from qrels are present in queries
missing_queries = qrels[~qrels['QUERY_ID'].isin(queries['QUERY_ID'])]
print("Missing query IDs: ", missing_queries['QUERY_ID'].unique())
# print number of missing queries
print("Number of missing queries: ", len(missing_queries))
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Combine titles and abstracts for better content representation
documents['CONTENT'] = documents['TITLE'] + " " + documents['ABSTRACT']

# Vectorize documents
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(documents['CONTENT'])

# Vectorize queries
query_vectors = vectorizer.transform(queries['QUERY_TEXT'])

# Create a mapping from document IDs to their vector indices for quick access
doc_id_to_index = pd.Series(data=documents.index, index=documents['ID']).to_dict()

# Initialize lists to hold your training data
X_train = []
Y_train = []

# Iterate over each entry in the relevance judgments
# print number of rows in qrels
print("Number of rows in qrels: ", len(qrels))


169759
Missing query IDs:  ['PLAIN-1' 'PLAIN-11' 'PLAIN-22' 'PLAIN-32' 'PLAIN-43' 'PLAIN-55'
 'PLAIN-67' 'PLAIN-77' 'PLAIN-90' 'PLAIN-101' 'PLAIN-111' 'PLAIN-122'
 'PLAIN-132' 'PLAIN-142' 'PLAIN-152' 'PLAIN-164' 'PLAIN-174' 'PLAIN-185'
 'PLAIN-195' 'PLAIN-206' 'PLAIN-216' 'PLAIN-226' 'PLAIN-237' 'PLAIN-247'
 'PLAIN-258' 'PLAIN-269' 'PLAIN-279' 'PLAIN-290' 'PLAIN-304' 'PLAIN-319'
 'PLAIN-331' 'PLAIN-343' 'PLAIN-357' 'PLAIN-370' 'PLAIN-382' 'PLAIN-394'
 'PLAIN-406' 'PLAIN-417' 'PLAIN-428' 'PLAIN-440' 'PLAIN-456' 'PLAIN-467'
 'PLAIN-477' 'PLAIN-487' 'PLAIN-498' 'PLAIN-508' 'PLAIN-519' 'PLAIN-530'
 'PLAIN-540' 'PLAIN-550' 'PLAIN-560' 'PLAIN-570' 'PLAIN-582' 'PLAIN-592'
 'PLAIN-602' 'PLAIN-612' 'PLAIN-622' 'PLAIN-633' 'PLAIN-644' 'PLAIN-656'
 'PLAIN-670' 'PLAIN-680' 'PLAIN-690' 'PLAIN-700' 'PLAIN-710' 'PLAIN-720'
 'PLAIN-730' 'PLAIN-740' 'PLAIN-750' 'PLAIN-760' 'PLAIN-770' 'PLAIN-781'
 'PLAIN-791' 'PLAIN-805' 'PLAIN-816' 'PLAIN-826' 'PLAIN-837' 'PLAIN-848'
 'PLAIN-859' 'PLAIN-871' 'PLAIN-88

In [10]:

for index, row in train_qrels.iterrows():
    print(f"Processing row {index}", end='\r')
    query_id = row['QUERY_ID']
    doc_id = row['DOC_ID']
    relevance_label = row['RELEVANCE_LEVEL']
    # print(f"Query ID: {query_id}, Doc ID: {doc_id}, Relevance: {relevance_label}")
    # Check if the query ID is found within the queries DataFrame
    query_indices = queries[queries['QUERY_ID'] == query_id].index.tolist()
    if query_indices:  # This checks if the list is non-empty
        query_vector = query_vectors[query_indices[0]]
        if doc_id in doc_id_to_index:
            doc_vector = doc_vectors[doc_id_to_index[doc_id]]
            # Calculate features and append to training data
            features = cosine_similarity(query_vector, doc_vector).flatten()
            X_train.append(features)
            Y_train.append(relevance_label)
        # print(f"Query ID {query_id} found in training queries")
    else:
        print(f"Query ID {query_id} not found in training queries")

Processing row 153938

In [11]:
# Load test queries
test_query_columns = ['QUERY_ID', 'QUERY_TEXT']
test_queries = pd.read_csv('./nfcorpus/test.titles.queries', sep='\t', names=test_query_columns, header=None)

# Vectorize test queries using the already fitted vectorizer
test_query_vectors = vectorizer.transform(test_queries['QUERY_TEXT'])

# Filter qrels to only include test query IDs
test_qrels = qrels[qrels['QUERY_ID'].isin(test_queries['QUERY_ID'])]
print(len(test_qrels))

# Initialize lists for test data
X_test = []
Y_test = []

# Iterate over each entry in the test relevance judgments
for index, row in test_qrels.iterrows():
    print(f"Processing row {index}", end='\r')
    query_id = row['QUERY_ID']
    doc_id = row['DOC_ID']
    relevance_label = row['RELEVANCE_LEVEL']
        
    # Find the query index and corresponding vector
    query_indices = test_queries[test_queries['QUERY_ID'] == query_id].index.tolist()
    if query_indices:  # This checks if the list is non-empty
        query_vector = test_query_vectors[query_indices[0]]
        if doc_id in doc_id_to_index:
            doc_vector = doc_vectors[doc_id_to_index[doc_id]]
            
            # Calculate features
            features = cosine_similarity(query_vector, doc_vector).flatten()
            X_test.append(features)
            Y_test.append(relevance_label)
    else:
        print(f"Test Query ID {query_id} not found in test queries")

15820
Processing row 169758

In [12]:
# val set
val_query_columns = ['QUERY_ID', 'QUERY_TEXT']
val_queries = pd.read_csv('./nfcorpus/dev.titles.queries', sep='\t', names=val_query_columns, header=None)

# Vectorize test queries using the already fitted vectorizer
val_query_vectors = vectorizer.transform(val_queries['QUERY_TEXT'])

# Filter qrels to only include test query IDs
val_qrels = qrels[qrels['QUERY_ID'].isin(val_queries['QUERY_ID'])]
print(len(val_qrels))

# Initialize lists for test data
X_val = []
Y_val = []

# Iterate over each entry in the test relevance judgments
for index, row in val_qrels.iterrows():
    print(f"Processing row {index}", end='\r')
    query_id = row['QUERY_ID']
    doc_id = row['DOC_ID']
    relevance_label = row['RELEVANCE_LEVEL']
        
    # Find the query index and corresponding vector
    query_indices = val_queries[val_queries['QUERY_ID'] == query_id].index.tolist()
    if query_indices:  # This checks if the list is non-empty
        query_vector = val_query_vectors[query_indices[0]]
        if doc_id in doc_id_to_index:
            doc_vector = doc_vectors[doc_id_to_index[doc_id]]
            
            # Calculate features
            features = cosine_similarity(query_vector, doc_vector).flatten()
            X_val.append(features)
            Y_val.append(relevance_label)
    else:
        print(f"Val Query ID {query_id} not found in val queries")

14589
Processing row 14588

In [13]:
# Convert lists to arrays if needed, for compatibility with scikit-learn
import numpy as np
X_train = np.array(X_train)
Y_train = np.array(Y_train)

from sklearn.ensemble import RandomForestClassifier

# Train a simple model
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, Y_train)

In [14]:
# NDCG evaluation
from sklearn.metrics import ndcg_score

# Predict on the test set
Y_pred = model.predict(X_test)

# Calculate NDCG
ndcg = ndcg_score([Y_test], [Y_pred], k=10)
print(f"NDCG@10: {ndcg}")


NDCG@10: 0.6732186732186731
