In [None]:
import sys, subprocess

# Colab-only bootstrap: install the packages this notebook needs when it is
# running inside Google Colab. Outside Colab this cell does nothing.
if "google.colab" in sys.modules:
    # Run pip through the interpreter that backs this kernel so the packages
    # land in the kernel's environment, not whichever `pip` is first on PATH.
    install_result = subprocess.run(
        [
            sys.executable, "-m", "pip", "install", "-q",
            "pandas", "numpy", "scikit-learn", "requests", "pydantic", "jsonschema",
        ]
    )
    # Stay best-effort (no exception), but do not hide a failed install.
    if install_result.returncode != 0:
        print(
            f"pip install exited with status {install_result.returncode}; later imports may fail.",
            file=sys.stderr,
        )


# Query the Retrieval Index

**What**: Retrieve relevant documents from the pre-built TF-IDF index using natural language queries.

**Why**: Retrieval is the core component of RAG (Retrieval-Augmented Generation) systems, allowing them to access external knowledge.

**How**:
1. **Load the saved index** (or build and cache it if the index file does not exist yet).
2. **Transform a user query** into a vector.
3. **Calculate cosine similarity** to find the closest documents.

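**Key Concept**: **Cosine Similarity** measures the cosine of the angle between two vectors. A value close to 1 means the vectors (and thus the texts) are very similar. Because TF-IDF weights are never negative, the scores in this notebook range from 0 (no terms in common) to 1 (same direction).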

By the end of this notebook, you will have loaded the TF-IDF index, run a natural language query against it, and viewed the top-ranked articles with their similarity scores.

### Success criteria
- You loaded the TF-IDF index.
- You ran at least one query.
- You viewed a ranked list with scores.

In [None]:
from pathlib import Path


def find_data_dir() -> Path:
    """Locate the ``data`` directory relative to the current working directory.

    Checks ``data`` under the working directory, then under its parent, then
    under its grandparent, and accepts the first one that contains
    ``sample_texts/articles_sample.csv``.

    Returns:
        Path to the first matching ``data`` directory.

    Raises:
        FileNotFoundError: If none of the three locations contains the sample
            articles file.
    """
    here = Path.cwd()
    marker = Path("sample_texts") / "articles_sample.csv"
    # Search order matters: the closest directory wins.
    for base in (here, here.parent, here.parent.parent):
        data_dir = base / "data"
        if (data_dir / marker).exists():
            return data_dir
    raise FileNotFoundError("data directory not found. Run scripts/generate_synthetic_data.py.")


# Resolve the data directory once so later cells can build paths from it.
# find_data_dir() raises FileNotFoundError if no candidate location matches.
DATA_DIR = find_data_dir()


In [None]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the article table and point at the cached TF-IDF index file.
articles = pd.read_csv(DATA_DIR / "sample_texts" / "articles_sample.csv")
index_path = DATA_DIR / "vector_index.pkl"

vectorizer = None
tfidf_matrix = None

if index_path.exists():
    # NOTE(security): pickle.load can execute arbitrary code. Only load index
    # files that were generated locally by this project.
    with open(index_path, "rb") as handle:
        payload = pickle.load(handle)
    vectorizer = payload["vectorizer"]
    tfidf_matrix = payload["tfidf_matrix"]
    if tfidf_matrix.shape[0] != len(articles):
        # The cache was built from a different version of the CSV: there would
        # be one score per matrix row, which cannot be attached to the article
        # rows below. Discard it and rebuild from the current data.
        vectorizer = None
        tfidf_matrix = None

if tfidf_matrix is None:
    # No usable cache: fit TF-IDF on the abstracts (missing abstracts become
    # empty strings) and save the result for later runs.
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(articles["abstract"].fillna(""))
    with open(index_path, "wb") as handle:
        pickle.dump({"vectorizer": vectorizer, "tfidf_matrix": tfidf_matrix}, handle)


# Embed the query with the same vectorizer, score it against every article,
# and show the five best matches.
query = "methods for reproducible ai"
query_vec = vectorizer.transform([query])
scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
articles = articles.assign(score=scores)
articles.sort_values("score", ascending=False).head(5)[["title", "score"]]


In [None]:
import pickle
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Reload the article table and the pickled index, then rank the articles
# against a single query.
articles = pd.read_csv(DATA_DIR / "sample_texts" / "articles_sample.csv")

# The pickle holds a dict with the fitted vectorizer and the document matrix.
with (DATA_DIR / "vector_index.pkl").open("rb") as handle:
    payload = pickle.load(handle)
vectorizer, tfidf_matrix = payload["vectorizer"], payload["tfidf_matrix"]

query = "methods for reproducible ai"
query_vec = vectorizer.transform([query])

# One similarity score per row of the matrix, flattened to 1-D.
similarity = cosine_similarity(query_vec, tfidf_matrix)
scores = similarity.flatten()

articles = articles.assign(score=scores)
articles[["title", "score"]].sort_values(by="score", ascending=False).head(5)


### If you get stuck / What to try next

If you get stuck: rebuild the index by deleting `data/vector_index.pkl` and rerunning the cell that builds it, or restart the kernel and run all cells from the top. What to try next: tune prompts in pipelines/rag/rag_evaluation.ipynb or integrate answers in pipelines/prototypes/minimal_research_assistant.ipynb.