In [None]:
import sys, subprocess
# Colab-only bootstrap: install the notebook's dependencies when the
# google.colab module is loaded (i.e. the kernel is a Colab runtime).
if "google.colab" in sys.modules:
    # Invoke pip through the kernel's own interpreter so the packages are
    # installed into the environment this notebook imports from; a bare
    # "pip" found on PATH may belong to a different Python installation.
    install = subprocess.run(
        [
            sys.executable, "-m", "pip", "install", "-q",
            "pandas", "numpy", "scikit-learn", "requests", "pydantic", "jsonschema",
        ],
        capture_output=True,
        text=True,
    )
    if install.returncode != 0:
        # Stay best-effort (the packages may already be present), but surface
        # pip's error output instead of discarding it silently.
        print(install.stderr, file=sys.stderr)


# Query the Retrieval Index

**What this notebook does:** Loads the TF-IDF index (building it if absent) and retrieves top-matching articles for sample queries.

**Why it matters:** Demonstrates the retrieval step in retrieval-augmented generation without external services.

**How to use it:**
1. Ensure the index file (`vector_index.pkl` in the data directory) exists, or let the notebook build it on the first run.
2. Modify the `query` text to reflect your research questions.

**Expected outcome:** A ranked list of articles with similarity scores for each query, showing how indexing choices affect relevance.

In [None]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the article sample and make sure a TF-IDF index over the "abstract"
# column is available, reusing the pickled index on disk when it is usable.
# NOTE(review): DATA_DIR is not defined in any visible cell -- confirm it is
# set (as a pathlib.Path) before this cell runs.
articles = pd.read_csv(DATA_DIR / "sample_texts" / "articles_sample.csv")
index_path = DATA_DIR / "vector_index.pkl"

vectorizer = None
tfidf_matrix = None

if index_path.exists():
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load an index that this project wrote itself.
    with open(index_path, "rb") as handle:
        payload = pickle.load(handle)
    vectorizer = payload["vectorizer"]
    tfidf_matrix = payload["tfidf_matrix"]
    if tfidf_matrix.shape[0] != len(articles):
        # The cached index was built from a different set of articles; scoring
        # with it would fail with a length mismatch when the scores are
        # attached to the frame. Discard it and rebuild below.
        vectorizer = None
        tfidf_matrix = None

if tfidf_matrix is None:
    # Missing or stale index: fit on the abstracts (missing abstracts become
    # empty strings) and persist the result for later cells and runs.
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(articles["abstract"].fillna(""))
    with open(index_path, "wb") as handle:
        pickle.dump({"vectorizer": vectorizer, "tfidf_matrix": tfidf_matrix}, handle)


# Project the query into the same TF-IDF space and score it against every article.
query = "methods for reproducible ai"
query_vec = vectorizer.transform([query])
# cosine_similarity yields one row per query; flatten to one score per article.
scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
articles = articles.assign(score=scores)
# Show the five best-scoring articles.
articles.sort_values("score", ascending=False).head(5)[["title", "score"]]


In [None]:
import pickle
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Query-only variant: reload the article sample and the persisted TF-IDF index
# from disk, then rank the articles against a query.
# NOTE(review): DATA_DIR is not defined in any visible cell -- confirm it is
# set (as a pathlib.Path) before this cell runs.
articles = pd.read_csv(DATA_DIR / "sample_texts" / "articles_sample.csv")
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load an index that this project wrote itself.
with open(DATA_DIR / "vector_index.pkl", "rb") as handle:
    payload = pickle.load(handle)
vectorizer = payload["vectorizer"]
tfidf_matrix = payload["tfidf_matrix"]

# Fail early with an actionable message when the index and the CSV disagree;
# otherwise attaching the scores below fails with an opaque length mismatch.
if tfidf_matrix.shape[0] != len(articles):
    raise ValueError(
        f"vector_index.pkl covers {tfidf_matrix.shape[0]} documents but "
        f"articles_sample.csv has {len(articles)} rows; delete the index file "
        "and rerun the notebook to rebuild it."
    )

# Project the query into the index's TF-IDF space and score every article.
query = "methods for reproducible ai"
query_vec = vectorizer.transform([query])
# cosine_similarity yields one row per query; flatten to one score per article.
scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
articles = articles.assign(score=scores)
# Show the five best-scoring articles.
articles.sort_values("score", ascending=False).head(5)[["title", "score"]]


### If you get stuck / What to try next

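**If you get stuck:** delete `vector_index.pkl` from the data directory so the notebook rebuilds the index on the next run, or restart the kernel and rerun all cells from the top. **What to try next:** tune prompts in `pipelines/rag/rag_evaluation.ipynb`, or integrate answers in `pipelines/prototypes/minimal_research_assistant.ipynb`.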