In [None]:
import sys
import subprocess

# When the google.colab module is loaded, pip-install the notebook's
# dependencies into the interpreter that is running this kernel.
running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    print("Detected Google Colab runtime. Installing dependencies...")
    packages = ["streamlit", "pandas", "numpy", "scikit-learn", "requests"]
    pip_command = [sys.executable, "-m", "pip", "install"] + packages
    # check_call raises CalledProcessError if pip exits with a non-zero status.
    subprocess.check_call(pip_command)


# Minimal Research Assistant

Goal: combine TF-IDF retrieval with templated responses to simulate a lightweight research assistant.

Why it matters: provides a safe, offline prototype for exploring RAG-style interactions using synthetic data.

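How to run and adapt: make sure `DATA_DIR` is defined and that the synthetic data (`sample_texts/articles_sample.csv`) has been generated under it before running the cells below; then adjust the `answer` function, in particular its response template, to reflect your domain.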

In [None]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the article table and obtain a TF-IDF index over its abstracts, reusing
# the pickled copy under DATA_DIR when it still matches the table.
# NOTE(review): DATA_DIR is not defined in this cell; it is assumed to be a
# pathlib.Path set up earlier in the notebook — confirm.
articles = pd.read_csv(DATA_DIR / "sample_texts" / "articles_sample.csv")
index_path = DATA_DIR / "vector_index.pkl"

payload = None
if index_path.exists():
    # SECURITY: pickle.load can execute arbitrary code. Only load an index file
    # that this notebook (or another trusted source) wrote.
    with open(index_path, "rb") as handle:
        payload = pickle.load(handle)
    # A cached matrix whose row count differs from the table was built from a
    # different version of the CSV; its row numbers would point at the wrong
    # articles, so discard it and rebuild below.
    if payload["tfidf_matrix"].shape[0] != len(articles):
        payload = None

if payload is None:
    vectorizer = TfidfVectorizer(stop_words="english")
    # Missing abstracts are indexed as empty strings so every article keeps a row.
    tfidf_matrix = vectorizer.fit_transform(articles["abstract"].fillna(""))
    payload = {"vectorizer": vectorizer, "tfidf_matrix": tfidf_matrix}
    with open(index_path, "wb") as handle:
        pickle.dump(payload, handle)
else:
    vectorizer = payload["vectorizer"]
    tfidf_matrix = payload["tfidf_matrix"]


def answer(query: str) -> str:
    """Return a templated reply built from the article most similar to ``query``.

    The query is transformed with the fitted ``vectorizer``, compared against
    every row of ``tfidf_matrix`` by cosine similarity, and the best-scoring
    row of ``articles`` supplies the title and the first 150 characters of
    its abstract.

    Args:
        query: Free-text question to match against the indexed abstracts.

    Returns:
        ``"Based on <title>, consider: <abstract snippet>..."`` for the best
        match, or a short "no match" message when the index is empty or no
        article has a positive similarity to the query.
    """
    scores = cosine_similarity(vectorizer.transform([query]), tfidf_matrix).flatten()
    # argmax raises on an empty index, and on all-zero scores it would just
    # return row 0 and present an unrelated article as a match.
    if scores.size == 0 or scores.max() <= 0:
        return "No matching article found for this query."
    article = articles.iloc[scores.argmax()]
    abstract = article["abstract"]
    # A missing abstract is a NaN float in the frame; slicing it raises
    # TypeError, so fall back to an empty snippet.
    snippet = abstract[:150] if isinstance(abstract, str) else ""
    return f"Based on {article['title']}, consider: {snippet}..."

# Smoke-test the assistant on two sample questions, one block per question.
demo_queries = (
    "How to document AI methods?",
    "Ways to improve study reproducibility?",
)
for q in demo_queries:
    print(q)
    # The reply and the separator go on their own lines.
    print(answer(q), "---", sep="\n")


In [None]:
import pickle
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Reload the article table and the pickled TF-IDF index from DATA_DIR.
# NOTE(review): DATA_DIR is not defined in this cell; presumably a
# pathlib.Path set up earlier in the notebook — confirm.
articles = pd.read_csv(DATA_DIR / "sample_texts" / "articles_sample.csv")
with (DATA_DIR / "vector_index.pkl").open("rb") as handle:
    # SECURITY: pickle.load can execute arbitrary code; only read index files
    # written by a trusted source.
    payload = pickle.load(handle)
vectorizer, tfidf_matrix = payload["vectorizer"], payload["tfidf_matrix"]


def answer(query: str) -> str:
    """Return a templated reply built from the article most similar to ``query``.

    The query is transformed with the fitted ``vectorizer``, compared against
    every row of ``tfidf_matrix`` by cosine similarity, and the best-scoring
    row of ``articles`` supplies the title and the first 150 characters of
    its abstract.

    Args:
        query: Free-text question to match against the indexed abstracts.

    Returns:
        ``"Based on <title>, consider: <abstract snippet>..."`` for the best
        match, or a short "no match" message when the index is empty or no
        article has a positive similarity to the query.
    """
    scores = cosine_similarity(vectorizer.transform([query]), tfidf_matrix).flatten()
    # argmax raises on an empty index, and on all-zero scores it would just
    # return row 0 and present an unrelated article as a match.
    if scores.size == 0 or scores.max() <= 0:
        return "No matching article found for this query."
    article = articles.iloc[scores.argmax()]
    abstract = article["abstract"]
    # A missing abstract is a NaN float in the frame; slicing it raises
    # TypeError, so fall back to an empty snippet.
    snippet = abstract[:150] if isinstance(abstract, str) else ""
    return f"Based on {article['title']}, consider: {snippet}..."

# Run two sample questions through answer() and print each question, its
# reply, and a separator line.
sample_questions = [
    "How to document AI methods?",
    "Ways to improve study reproducibility?",
]
for q in sample_questions:
    print(q)
    reply = answer(q)
    print(reply)
    print("---")
