In [None]:
import subprocess
import sys

# Packages this notebook needs on a fresh Colab runtime.
COLAB_PACKAGES = ["pandas", "numpy", "scikit-learn", "requests", "pydantic", "jsonschema"]

# Only install when running inside Google Colab; local environments are left untouched.
if "google.colab" in sys.modules:
    # Invoke pip through the kernel's own interpreter so the packages land in the
    # environment this notebook imports from, not whichever `pip` is first on PATH.
    install = subprocess.run([sys.executable, "-m", "pip", "install", "-q", *COLAB_PACKAGES])
    if install.returncode != 0:
        # Report a failed install here rather than letting it show up later
        # as a confusing ImportError in another cell.
        print(f"pip install failed with exit code {install.returncode}", file=sys.stderr)


# Evaluate Retrieval Quality

**What:** Compare similarity scores for a set of example queries against the TF-IDF index.

**Why:** Quick, offline evaluation guides prompt design and indexing choices before deploying assistants.

**How:** Run the install cell if needed, confirm the index exists (the first code cell builds it if it is missing), then execute the cells in order. Similarity here is cosine similarity—a measure of how closely two vectors point in the same direction.

**You will learn:** How to judge retrieval strength across queries and spot where indexing or prompts may need adjustment.

By the end of this notebook, you will have a table showing the best similarity score for each example query, which is the output the success criteria below refer to.

### Success criteria
- You evaluated several queries.
- You recorded best scores.
- You compared which queries perform better.

In [None]:
import os
import pickle
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Location of the sample corpus relative to the data directory.
ARTICLES_RELPATH = Path("sample_texts") / "articles_sample.csv"


def _find_data_dir() -> Path:
    """Locate the data directory when no earlier cell has defined ``DATA_DIR``.

    Resolution order:
      1. The ``DATA_DIR`` environment variable, if it is set and non-empty.
      2. The first ``data`` folder, in the current working directory or any of
         its parents, that contains ``sample_texts/articles_sample.csv``.

    Returns:
        Path to the data directory.

    Raises:
        FileNotFoundError: if neither lookup succeeds.
    """
    from_env = os.environ.get("DATA_DIR")
    if from_env:
        return Path(from_env)

    # NOTE(review): the folder name "data" is an assumption about the repository
    # layout — confirm, or set the DATA_DIR environment variable instead.
    here = Path.cwd()
    for base in (here, *here.parents):
        candidate = base / "data"
        if (candidate / ARTICLES_RELPATH).exists():
            return candidate

    raise FileNotFoundError(
        f"Could not locate a data directory containing {ARTICLES_RELPATH}; "
        "define DATA_DIR before running this cell or set the DATA_DIR environment variable."
    )


# Keep a DATA_DIR that an earlier setup cell (or the user) already defined;
# otherwise resolve it here so the cell does not fail with NameError on a fresh kernel.
if "DATA_DIR" not in globals():
    DATA_DIR = _find_data_dir()

articles = pd.read_csv(DATA_DIR / ARTICLES_RELPATH)
index_path = DATA_DIR / "vector_index.pkl"

if index_path.exists():
    # SECURITY: pickle.load can execute arbitrary code. Only load an index file
    # that you (or this notebook) created; never one from an untrusted source.
    with open(index_path, "rb") as handle:
        payload = pickle.load(handle)
    vectorizer = payload["vectorizer"]
    tfidf_matrix = payload["tfidf_matrix"]
else:
    # No cached index yet: fit TF-IDF on the abstracts (missing abstracts are
    # treated as empty strings) and persist it for later cells and notebooks.
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(articles["abstract"].fillna(""))
    with open(index_path, "wb") as handle:
        pickle.dump({"vectorizer": vectorizer, "tfidf_matrix": tfidf_matrix}, handle)


queries = [
    "quantitative study design",
    "community impacts of technology",
    "statistical methods for small samples",
]

# For each query, record the highest cosine similarity against any indexed document.
results = []
for q in queries:
    score = cosine_similarity(vectorizer.transform([q]), tfidf_matrix).flatten().max()
    results.append({"query": q, "best_score": float(score)})

pd.DataFrame(results)


In [None]:
import pickle
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Reload the sample corpus and the persisted TF-IDF index from disk.
articles = pd.read_csv(DATA_DIR / "sample_texts" / "articles_sample.csv")
with open(DATA_DIR / "vector_index.pkl", "rb") as index_file:
    payload = pickle.load(index_file)
vectorizer, tfidf_matrix = payload["vectorizer"], payload["tfidf_matrix"]

# Example queries to score against the index.
queries = [
    "quantitative study design",
    "community impacts of technology",
    "statistical methods for small samples",
]

# One record per query: the query text and its highest cosine similarity
# against any row of the TF-IDF matrix.
results = [
    {
        "query": query_text,
        "best_score": float(
            cosine_similarity(vectorizer.transform([query_text]), tfidf_matrix)
            .flatten()
            .max()
        ),
    }
    for query_text in queries
]

pd.DataFrame(results)


### If you get stuck / What to try next

If you get stuck: verify that the index file `vector_index.pkl` exists in `DATA_DIR`, or rerun `build_index`. What to try next: apply the scoring insights to `pipelines/prototypes/minimal_research_assistant.ipynb`.