In [None]:
# Colab-only bootstrap: install the notebook's dependencies when running on Google Colab.
# Local runs are expected to have the environment prepared already, so nothing happens there.
import subprocess
import sys

if "google.colab" in sys.modules:
    # Run pip through the interpreter that backs this kernel so the packages are
    # installed into the environment the notebook actually imports from
    # (a bare "pip" on PATH may belong to a different Python).
    _install_result = subprocess.run(
        [
            sys.executable, "-m", "pip", "install", "-q",
            "pandas", "numpy", "scikit-learn", "requests", "pydantic", "jsonschema",
        ]
    )
    # Stay best-effort (no exception), but do not hide a failed install from the reader.
    if _install_result.returncode != 0:
        print(
            f"Warning: pip install exited with status {_install_result.returncode}; "
            "later imports may fail.",
            file=sys.stderr,
        )


# Build a TF-IDF Retrieval Index

**What this notebook does:** Constructs and saves a TF-IDF index over synthetic article abstracts for later querying.

**Why it matters:** Retrieval-augmented workflows rely on consistent indexing; practicing locally ensures repeatability before touching sensitive data.

**How to use it:**
1. Generate the synthetic data with `scripts/generate_synthetic_data.py`, then run this notebook locally or in Colab.
2. Execute cells to fit the vectorizer and persist `vector_index.pkl`.
3. Tweak vectorizer parameters to match your domain.

**Expected outcome:** A saved index file (`vector_index.pkl` in the data directory) containing the fitted vectorizer, the TF-IDF matrix, and the matching article IDs, ready for querying or evaluation.

In [None]:
from pathlib import Path


def find_data_dir(start: "Path | str | None" = None, max_levels: int = 2) -> Path:
    """Locate the ``data`` directory that holds the sample articles CSV.

    The search checks ``<dir>/data`` for the marker file
    ``sample_texts/articles_sample.csv``, beginning at ``start`` and then moving
    up one parent directory at a time.

    Args:
        start: Directory to begin the search from. Defaults to the current
            working directory.
        max_levels: How many parent levels above ``start`` to inspect in
            addition to ``start`` itself. The default of 2 checks the start
            directory, its parent, and its grandparent.

    Returns:
        Path of the first ``data`` directory that contains the marker file.

    Raises:
        ValueError: If ``max_levels`` is negative.
        FileNotFoundError: If no inspected directory contains the marker file.
    """
    if max_levels < 0:
        raise ValueError("max_levels must be >= 0")

    # Resolve an explicit start so that relative paths can be walked upwards;
    # the default keeps using Path.cwd() unchanged.
    current = Path.cwd() if start is None else Path(start).resolve()

    for _ in range(max_levels + 1):
        candidate = current / "data"
        # The marker must be a regular file because it is read as CSV later on.
        if (candidate / "sample_texts" / "articles_sample.csv").is_file():
            return candidate
        current = current.parent

    raise FileNotFoundError("data directory not found. Run scripts/generate_synthetic_data.py.")

# Resolved once when this cell runs; find_data_dir() raises FileNotFoundError
# if the sample articles CSV cannot be located.
DATA_DIR = find_data_dir()


In [None]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the sample articles table from the data directory.
articles = pd.read_csv(DATA_DIR / "sample_texts" / "articles_sample.csv")

# Replace missing abstracts with empty strings before vectorizing.
abstracts = articles["abstract"].fillna("")

# Fit the TF-IDF vocabulary (English stop words removed) and vectorize in one pass.
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(abstracts)

# Bundle the fitted vectorizer, the matrix, and the article IDs into one payload.
article_ids = articles["article_id"].tolist()
index_payload = {
    "vectorizer": vectorizer,
    "tfidf_matrix": tfidf_matrix,
    "article_ids": article_ids,
}

# Persist the payload next to the source data.
index_path = DATA_DIR / "vector_index.pkl"
with index_path.open("wb") as handle:
    pickle.dump(index_payload, handle)

print(f"Index saved to {index_path} with shape {tfidf_matrix.shape}")


### If you get stuck / What to try next

If you get stuck: confirm that the synthetic data has been generated (`scripts/generate_synthetic_data.py`), rerun the install cell, and check that the TF-IDF matrix produced by your vectorizer settings fits in available RAM. What to try next: query the index in `pipelines/rag/rag_query.ipynb` and compare queries in `pipelines/rag/rag_evaluation.ipynb`.