In [1]:
import sys
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import gzip
import json
sys.path.append('..')

from src.document_preprocessor import RegexTokenizer
from src.indexing import BasicInvertedIndex, Indexer, IndexType
from src.ranker import BM25, Ranker
from src.relevance import run_relevance_tests
from src.l2r import L2RFeatureExtractor, L2RRanker
from src.vector_ranker import VectorRanker

from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locations of the input files and cached artifacts used by this notebook.
# All paths are relative ('../data/...').

# Text file read line by line to build the stopword set.
STOPWORD_PATH = '../data/stopwords_updated.txt'
# Article CSV referenced by the (commented-out) index-build and embedding steps.
DATASET_PATH = '../data/processed_articles_dedup.csv'
# Relevance judgments; passed both to l2r_ranker.train and to run_relevance_tests.
RELEVANCE_TRAIN_DATA = '../data/relevance_test.csv'
# Saved index that is loaded into main_index.
MAIN_INDEX_PATH = '../data/index'
# Saved index that is loaded into headline_index.
HEADLINE_INDEX_PATH = '../data/index_headline'
# .npy file of document embeddings, read with np.load.
BODY_EMBEDDINGS_PATH = '../data/body_embeddings.npy'

In [3]:
# Build the stopword set: each line of the file contributes one entry,
# with surrounding whitespace (including the newline) stripped.
stopwords = set()
with open(STOPWORD_PATH, 'r', encoding='utf-8') as stopword_file:
    stopwords.update(line.strip() for line in stopword_file)

# Last expression of the cell, so the notebook displays the count.
f'Stopwords collected {len(stopwords)}'

'Stopwords collected 550'

In [4]:
# Token pattern: a run of word characters, optionally extended by hyphen-joined
# runs (e.g. "state-of-the-art"), then by apostrophe segments whose first
# character after the apostrophe is not one of s/t/m/r/v/l/d -- presumably so
# that contraction/possessive suffixes ('s, 't, 'm, 're, 've, 'll, 'd) are not
# absorbed into the token.
#
# The pattern is written as a raw string so that `\w` reaches the regex engine
# verbatim instead of being an invalid string escape (DeprecationWarning since
# Python 3.6, SyntaxWarning on 3.12+). The resulting pattern text is unchanged.
#
# NOTE(review): `[^stmrvld]` is a negated class, so it also matches whitespace
# and punctuation; an apostrophe followed by a space could pull the next word
# into the same token. Confirm this is intended. The pattern is deliberately
# left as-is here because the commented-out index build below uses this same
# preprocessor, so the saved indexes presumably depend on it.
preprocessor = RegexTokenizer(r"\w+(?:-\w+)*(?:'[^stmrvld]\w*)*", lowercase=True)

# main_index = Indexer.create_index(
#     index_type=IndexType.BasicInvertedIndex,
#     dataset_path=DATASET_PATH,
#     document_preprocessor=preprocessor,
#     stopwords=stopwords,
#     minimum_word_frequency=1,
#     text_key='body',
#     id_key='docid',
#     max_docs=35292
# )

# headline_index = Indexer.create_index(
#     index_type=IndexType.BasicInvertedIndex,
#     dataset_path=DATASET_PATH,
#     document_preprocessor=preprocessor,
#     stopwords=stopwords,
#     minimum_word_frequency=1,
#     text_key='headline',
#     id_key='docid',
#     max_docs=35292
# )

# main_index.save(MAIN_INDEX_PATH)
# headline_index.save(HEADLINE_INDEX_PATH)

In [5]:
# # Get the list of document IDs from the index
# valid_doc_ids = list(main_index.document_metadata.keys())

# # Load the dataset
# df = pd.read_csv(DATASET_PATH)

# # Filter to only include documents that are in the index
# df = df[df['docid'].isin(valid_doc_ids)]

# # Create new embeddings with only the valid documents
# model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L12-cos-v5')
# body_embeddings = model.encode(df['body'].tolist(),
#                              batch_size=32,
#                              show_progress_bar=True,
#                              convert_to_tensor=False)

# # Save the embeddings
# np.save(BODY_EMBEDDINGS_PATH, body_embeddings)

In [6]:
# Create two empty inverted indexes, then populate each from its saved
# location on disk (body index first, headline index second).
main_index, headline_index = BasicInvertedIndex(), BasicInvertedIndex()

for index_obj, index_path in (
    (main_index, MAIN_INDEX_PATH),
    (headline_index, HEADLINE_INDEX_PATH),
):
    index_obj.load(index_path)

In [7]:
# Feature extractor handed to the learning-to-rank ranker; it receives both
# indexes plus the tokenizer and the stopword set.
fe = L2RFeatureExtractor(main_index, headline_index, preprocessor, stopwords)

# Document embeddings were computed ahead of time and are read back from disk.
encoded_docs = np.load(BODY_EMBEDDINGS_PATH)

# NOTE(review): the embeddings are paired with the index's document ids in
# iteration order -- presumably row i of encoded_docs belongs to the i-th id;
# confirm the file was generated in that same order.
embedding_doc_ids = list(main_index.document_metadata.keys())
vector_ranker = VectorRanker(
    'sentence-transformers/msmarco-MiniLM-L12-cos-v5',
    encoded_docs,
    embedding_doc_ids,
)

l2r_ranker = L2RRanker(
    main_index, headline_index, preprocessor, stopwords, vector_ranker, fe
)

# Train the ranker on the relevance judgments file.
l2r_ranker.train(RELEVANCE_TRAIN_DATA)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000024 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 723
[LightGBM] [Info] Number of data points in the train set: 654, number of used features: 9


In [9]:
# Score the trained L2R ranker and report MAP and NDCG to four decimals.
# NOTE(review): RELEVANCE_TRAIN_DATA is the same file the ranker was trained
# on in the previous cell, so these numbers are not held-out scores -- confirm
# whether a separate evaluation file should be used.
results = run_relevance_tests(RELEVANCE_TRAIN_DATA, l2r_ranker)

print("L2R Ranker Evaluation Results:")
for metric_label, metric_key in (
    ("Mean Average Precision (MAP)", 'map'),
    ("Normalized Discounted Cumulative Gain (NDCG)", 'ndcg'),
):
    print(f"{metric_label}: {results[metric_key]:.4f}")

L2R Ranker Evaluation Results:
Mean Average Precision (MAP): 0.1538
Normalized Discounted Cumulative Gain (NDCG): 0.2595
