In [None]:
import subprocess
import sys

# Colab-only bootstrap: install the notebook's dependencies when running inside
# Google Colab. Outside Colab this cell does nothing.
if "google.colab" in sys.modules:
    # Run pip through the kernel's own interpreter so packages are installed
    # into the environment this notebook imports from, not whichever `pip`
    # happens to be first on PATH.
    install = subprocess.run(
        [
            sys.executable, "-m", "pip", "install", "-q",
            "pandas", "numpy", "scikit-learn", "requests", "pydantic", "jsonschema",
        ]
    )
    # Surface a failed install instead of silently continuing; do not raise so
    # that already-installed packages can still be used.
    if install.returncode != 0:
        print(
            f"pip install exited with status {install.returncode}; later imports may fail.",
            file=sys.stderr,
        )


# Build Retrieval Index

Goal: construct and persist a TF-IDF index over synthetic article abstracts.

Why it matters: indexing enables reproducible retrieval experiments without external services.

How to run and adapt: run this notebook after generating the synthetic data (`scripts/generate_synthetic_data.py`); to try new configurations, adjust the vectorizer parameters or swap in alternative embeddings.

In [None]:
from pathlib import Path


def find_data_dir() -> Path:
    """Locate the ``data`` directory relative to the current working directory.

    Checks the working directory, its parent, and its grandparent, in that
    order, and returns the first ``data`` folder that contains
    ``sample_texts/articles_sample.csv``.

    Returns:
        Path of the first matching ``data`` directory.

    Raises:
        FileNotFoundError: If none of the three locations holds the sample file.
    """
    here = Path.cwd()
    marker = Path("sample_texts") / "articles_sample.csv"
    for base in (here, here.parent, here.parent.parent):
        data_dir = base / "data"
        if (data_dir / marker).exists():
            return data_dir
    raise FileNotFoundError("data directory not found. Run scripts/generate_synthetic_data.py.")

# Resolve the data directory once, when this cell runs; find_data_dir() raises
# FileNotFoundError if the sample CSV cannot be located.
DATA_DIR = find_data_dir()


In [None]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the article table from the sample_texts folder of the data directory.
articles = pd.read_csv(DATA_DIR / "sample_texts" / "articles_sample.csv")

# Fit TF-IDF on the abstracts; missing abstracts are replaced with "" so they
# are treated as empty documents rather than NaN values.
vectorizer = TfidfVectorizer(stop_words="english")
abstract_texts = articles["abstract"].fillna("")
tfidf_matrix = vectorizer.fit_transform(abstract_texts)

# Bundle the fitted vectorizer, the TF-IDF matrix, and the article ids (taken
# from the same frame, in the same row order as the matrix).
index_payload = dict(
    vectorizer=vectorizer,
    tfidf_matrix=tfidf_matrix,
    article_ids=articles["article_id"].tolist(),
)

# Persist the whole payload as a single pickle inside the data directory.
index_path = DATA_DIR / "vector_index.pkl"
with index_path.open("wb") as handle:
    pickle.dump(index_payload, handle)
print(f"Index saved to {index_path} with shape {tfidf_matrix.shape}")
