In [None]:
import sys, subprocess
# Colab-only bootstrap: install the notebook's dependencies when the kernel
# is running inside Google Colab (detected via the already-imported module).
if "google.colab" in sys.modules:
    # Invoke pip through the running interpreter so the packages are installed
    # into the kernel's own environment, not whichever `pip` is first on PATH.
    pip_result = subprocess.run(
        [
            sys.executable, "-m", "pip", "install", "-q",
            "pandas", "numpy", "scikit-learn", "requests", "pydantic", "jsonschema",
        ],
        check=False,
    )
    if pip_result.returncode != 0:
        # Report the failure here rather than letting it show up later as an
        # unexplained ImportError; do not raise, the packages may already exist.
        print(f"pip install exited with code {pip_result.returncode}", file=sys.stderr)


# Prototype: Minimal Research Assistant

**What**: Build a simple end-to-end research assistant that retrieves information and formats an answer.

**Why**: Prototyping the full flow helps understand how retrieval and generation (or formatting) components interact.

**How**:
1. **Accept a user question**.
2. **Retrieve the most relevant abstract** using a TF-IDF index (built on the first run, then loaded from a cached file).
3. **Format a response** (simulated generation) based on the retrieved context.

**Key Concept**: **RAG** (Retrieval-Augmented Generation) combines a retriever (finding docs) with a generator (writing answers) to produce grounded responses.

By the end of this notebook, you will have a working `answer()` function that looks up the best-matching abstract for a question and wraps it in a templated response.

### Success criteria
- You combined retrieval with templated answers.
- You saw responses grounded in retrieved abstracts.
- You identified how retrieval impacts answers.

In [None]:
from pathlib import Path


def find_data_dir() -> Path:
    """Locate the ``data`` directory near the current working directory.

    Checks ``data`` under the working directory, then under its parent, then
    under its grandparent, and returns the first one that contains
    ``sample_texts/articles_sample.csv``.

    Returns:
        Path to the matching ``data`` directory.

    Raises:
        FileNotFoundError: if none of the three locations holds the sample file.
    """
    here = Path.cwd()
    marker = Path("sample_texts") / "articles_sample.csv"
    search_roots = (here, here.parent, here.parent.parent)
    # Closest location wins: the generator is consumed in order, nearest first.
    match = next(
        (root / "data" for root in search_roots if (root / "data" / marker).exists()),
        None,
    )
    if match is None:
        raise FileNotFoundError("data directory not found. Run scripts/generate_synthetic_data.py.")
    return match


# Resolved once when this cell runs; find_data_dir() raises FileNotFoundError
# here if no data directory containing the sample articles is found.
DATA_DIR = find_data_dir()


In [None]:
import pickle
from pathlib import Path
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the article table; each row's "abstract" is one document in the index.
articles = pd.read_csv(DATA_DIR / "sample_texts" / "articles_sample.csv")
# On-disk cache for the fitted vectorizer and TF-IDF matrix.
index_path = Path("/tmp/vector_index.pkl")

# Try the cache first, but only accept it if it is readable, has the expected
# keys, and has exactly one matrix row per article. A stale or foreign cache
# would otherwise misalign matrix rows with `articles` and make answer()
# return the wrong article or raise IndexError.
# The row-count check cannot detect edited abstracts when the number of rows
# is unchanged; delete the cache file to force a rebuild in that case.
vectorizer = None
tfidf_matrix = None
if index_path.exists():
    try:
        # NOTE(security): pickle.load can execute arbitrary code. This is only
        # safe if the file at index_path was written by this project; /tmp may
        # be writable by other users on a shared machine.
        with open(index_path, "rb") as handle:
            payload = pickle.load(handle)
        cached_vectorizer = payload["vectorizer"]
        cached_matrix = payload["tfidf_matrix"]
        if cached_matrix.shape[0] == len(articles):
            vectorizer, tfidf_matrix = cached_vectorizer, cached_matrix
    except (pickle.UnpicklingError, EOFError, KeyError, TypeError,
            AttributeError, ImportError, IndexError):
        # Corrupt or incompatible cache: fall through and rebuild it below.
        vectorizer = None
        tfidf_matrix = None

if vectorizer is None or tfidf_matrix is None:
    # No usable cache: fit on the abstracts (missing ones become empty
    # documents) and overwrite the cache with the fresh index.
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(articles["abstract"].fillna(""))
    with open(index_path, "wb") as handle:
        pickle.dump({"vectorizer": vectorizer, "tfidf_matrix": tfidf_matrix}, handle)


def answer(query: str, snippet_chars: int = 150) -> str:
    """Return a templated answer built from the best-matching article.

    The query is projected into the TF-IDF space of the module-level
    ``vectorizer`` and compared by cosine similarity against every row of
    ``tfidf_matrix``; the highest-scoring row selects the article from
    ``articles``.

    Args:
        query: The user's question.
        snippet_chars: Maximum number of abstract characters to quote
            (defaults to 150).

    Returns:
        A sentence citing the article title and the start of its abstract, or
        a "no relevant article" message when the query matches nothing in the
        index.
    """
    scores = cosine_similarity(vectorizer.transform([query]), tfidf_matrix).flatten()
    top_idx = scores.argmax()
    # A top score of zero means the query shares no indexed term with any
    # abstract; argmax would then pick row 0 and present an unrelated article
    # as if it were grounded evidence.
    if scores[top_idx] <= 0:
        return "No relevant article found for this question."
    article = articles.iloc[top_idx]
    abstract = article["abstract"]
    # Missing abstracts are read from CSV as NaN (a float), which cannot be
    # sliced; treat them as empty text, matching the fillna("") used at index time.
    if not isinstance(abstract, str):
        abstract = ""
    return f"Based on {article['title']}, consider: {abstract[:snippet_chars]}..."


# Smoke-test the assistant on two sample questions, printing each question,
# its templated answer, and a separator line.
sample_questions = (
    "How to document AI methods?",
    "Ways to improve study reproducibility?",
)
for q in sample_questions:
    print(q)
    print(answer(q), "---", sep="\n")
