In [None]:
import sys, subprocess, os
from pathlib import Path

# Colab Setup
# Everything below runs only when the notebook is executed inside Google Colab;
# in any other environment this cell is a no-op.
if "google.colab" in sys.modules:
    print("Running in Google Colab. Installing dependencies...")
    # Call pip through the kernel's own interpreter so the packages are
    # installed into the environment this notebook imports from.
    pip_result = subprocess.run(
        [
            sys.executable, "-m", "pip", "install", "-q",
            "pandas", "numpy", "scikit-learn", "requests", "pydantic",
            "jsonschema", "plotly", "tqdm",
        ]
    )
    if pip_result.returncode != 0:
        # Best-effort: report the failure instead of hiding it, but keep going
        # because the packages may already be present.
        print(f"Warning: pip install exited with code {pip_result.returncode}.")

    # Check for data
    if not (Path.cwd() / "data").exists():
        print("Data directory not found. Cloning repository...")
        repo_dir = Path("_repo")
        # A checkout left over from an interrupted run would make `git clone`
        # refuse to write into the existing directory.
        if repo_dir.exists():
            subprocess.run(["rm", "-rf", str(repo_dir)])
        clone_result = subprocess.run(
            ["git", "clone", "https://github.com/aire-program/aire-researcher-sandbox.git", str(repo_dir)]
        )
        if clone_result.returncode != 0:
            print(f"Warning: git clone exited with code {clone_result.returncode}.")

        # Move data and scripts to current directory
        if (repo_dir / "data").exists():
            print("Moving data and scripts...")
            subprocess.run(["mv", str(repo_dir / "data"), "."])
            # Only move scripts when the clone has them and nothing would be
            # overwritten in the working directory.
            if (repo_dir / "scripts").exists() and not Path("scripts").exists():
                subprocess.run(["mv", str(repo_dir / "scripts"), "."])
            subprocess.run(["rm", "-rf", str(repo_dir)])
        else:
            print("Warning: Data not found in cloned repo.")
    else:
        print("Data directory found.")


# Build a TF-IDF Retrieval Index

**What**: Create and persist a TF-IDF vectorizer and index from synthetic abstracts.

**Why**: A pre-computed index allows for fast and consistent retrieval in downstream applications such as retrieval-augmented generation (RAG).

**How**:
1. **Load synthetic abstracts**.
2. **Fit a TF-IDF vectorizer** to the text corpus.
3. **Save the index** and vectorizer to disk for later use.

**Key Concept**: **Vectorization** is the process of converting text into numerical vectors that computers can process and compare.

By the end of this notebook, you will have fitted a TF-IDF vectorizer on the synthetic abstracts and saved it, together with the TF-IDF matrix and the article IDs, to `vector_index.pkl` in the data directory.

### Success criteria
- You fit a TF-IDF vectorizer on abstracts.
- You saved an index file (vector_index.pkl).
- You know the document and feature counts (the two numbers in the matrix shape printed by the last code cell).

In [None]:
from pathlib import Path


def find_data_dir() -> Path:
    """Locate the ``data`` directory relative to the current working directory.

    The working directory, its parent, and its grandparent are checked in that
    order. A ``data`` folder only counts if it contains
    ``sample_texts/articles_sample.csv``.

    Returns:
        Path of the first matching ``data`` directory.

    Raises:
        FileNotFoundError: If none of the three locations holds the sample file.
    """
    here = Path.cwd()
    marker = Path("sample_texts") / "articles_sample.csv"
    for base in (here, here.parent, here.parent.parent):
        data_dir = base / "data"
        if (data_dir / marker).exists():
            return data_dir
    raise FileNotFoundError("data directory not found. Run scripts/generate_synthetic_data.py.")

# Resolve the data directory once when this cell runs; find_data_dir() raises
# FileNotFoundError here if the sample CSV cannot be located.
DATA_DIR = find_data_dir()


In [None]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the sample articles; missing abstracts become empty strings before
# being passed to the vectorizer.
corpus_csv = DATA_DIR / "sample_texts" / "articles_sample.csv"
articles = pd.read_csv(corpus_csv)
abstracts = articles["abstract"].fillna("")

# Fit the vocabulary and transform the corpus in a single pass.
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(abstracts)

# Bundle the fitted vectorizer, the matrix, and the article IDs (taken from the
# same frame, in the same row order) into one payload and pickle it.
index_path = DATA_DIR / "vector_index.pkl"
index_payload = dict(
    vectorizer=vectorizer,
    tfidf_matrix=tfidf_matrix,
    article_ids=articles["article_id"].tolist(),
)
with index_path.open("wb") as sink:
    pickle.dump(index_payload, sink)
print(f"Index saved to {index_path} with shape {tfidf_matrix.shape}")


### If you get stuck / What to try next

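**If you get stuck:** confirm that the synthetic data has been generated (run `scripts/generate_synthetic_data.py`), rerun the dependency installs, and check that the TF-IDF parameters fit in available RAM (for example, cap the vocabulary size with `max_features`).

**What to try next:** query the index in `pipelines/rag/rag_query.ipynb` and compare queries in `pipelines/rag/rag_evaluation.ipynb`.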