In [None]:
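# Optional setup: uncomment the next line to install the project dependencies into the kernel's environment.
#%pip install -r requirements.txt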
import os, numpy as np
from src.data_loader import data_loader
from src.preprocessing import clean_data
from src.models.bm25 import BM25Model
from src.models.lsa import LSAModel
from src.evaluation import evaluate_model

In [None]:
# Dataset to load. Options listed by the author:
# "nfcorpus", "arguana", "scifact", "scidocs".
dataset = "nfcorpus"

# Load the raw collection, then clean the corpus and the queries in one call.
# qrels is not cleaned; it is passed as-is to evaluate_model in the model cells.
corpus, queries, qrels = data_loader(dataset)
cleaned_corpus, cleaned_queries = clean_data(corpus, queries)

# Cell output: (len(cleaned_corpus), len(cleaned_queries)) as a quick size check.
tuple(map(len, (cleaned_corpus, cleaned_queries)))

In [None]:
# BM25 model: fit on the cleaned corpus, then evaluate on the cleaned queries using qrels.
bm25 = BM25Model()
bm25.fit(cleaned_corpus)

# evaluate_model returns four results, unpacked in this order.
# Only two values are reported below; the f-string indexes ndcg and recall by metric name.
(ndcg, _map, recall, precision) = evaluate_model(bm25, cleaned_queries, qrels)
print(
    f"BM25 — NDCG@10: {ndcg['NDCG@10']:.3f} | Recall@100: {recall['Recall@100']:.3f}"
)

In [None]:
# LSA model: fit on the cleaned corpus with plotting enabled.
# Set gif=True if you want to save/record the result
# (presumably an animation of the plot; confirm in LSAModel.fit).
lsa = LSAModel()
lsa.fit(cleaned_corpus, plot=True, gif=False)

# Same evaluation call and unpacking order as the other model cells.
# NOTE: this rebinds ndcg, _map, recall and precision from any previously run model cell.
(ndcg, _map, recall, precision) = evaluate_model(lsa, cleaned_queries, qrels)
print(
    f"LSA — NDCG@10: {ndcg['NDCG@10']:.3f} | Recall@100: {recall['Recall@100']:.3f}"
)

In [None]:
# LSA variant constructed with rocchio_prf=True; fit() is called with its default options
# (no plot/gif arguments are passed here).
lsa_prf = LSAModel(rocchio_prf=True)
lsa_prf.fit(cleaned_corpus)

# Same evaluation call and unpacking order as the other model cells.
# NOTE: this rebinds ndcg, _map, recall and precision from any previously run model cell.
(ndcg, _map, recall, precision) = evaluate_model(lsa_prf, cleaned_queries, qrels)
print(
    f"LSA + Rocchio PRF — NDCG@10: {ndcg['NDCG@10']:.3f} | Recall@100: {recall['Recall@100']:.3f}"
)