In [None]:
import sys
import subprocess

# When the notebook runs under Google Colab (detected by the already-loaded
# ``google.colab`` module), pip-install the dependencies with the same
# interpreter that is executing this kernel.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    print("Detected Google Colab runtime. Installing dependencies...")
    packages = ["streamlit", "pandas", "numpy", "scikit-learn", "requests"]
    pip_install = [sys.executable, "-m", "pip", "install"]
    # check_call raises CalledProcessError if pip exits with a non-zero status.
    subprocess.check_call(pip_install + packages)


# Semantic Search Demo

Goal: illustrate similarity search over abstracts using TF-IDF cosine similarity.

Why it matters: shows how to rank documents without external embedding services, useful for quick literature scans.

How to run and adapt: run this notebook after generating the sample data with `scripts/generate_synthetic_data.py`; for deeper experiments, replace the TF-IDF vectorizer with your preferred vectorizer.

In [None]:
from pathlib import Path


def find_data_dir() -> Path:
    """Locate the ``data`` directory that contains the sample articles CSV.

    The current working directory, its parent, and its grandparent are
    checked in that order; the first ``data`` folder that contains
    ``sample_texts/articles_sample.csv`` wins.

    Returns:
        Path: the first matching ``data`` directory.

    Raises:
        FileNotFoundError: if none of the three locations holds the sample file.
    """
    here = Path.cwd()
    marker = Path("sample_texts") / "articles_sample.csv"
    # Nearest location first, so a local ``data`` folder shadows one higher up.
    for base in (here, here.parent, here.parent.parent):
        data_dir = base / "data"
        if (data_dir / marker).exists():
            return data_dir
    raise FileNotFoundError("data directory not found. Run scripts/generate_synthetic_data.py.")

# Resolve the data directory once at cell execution; raises FileNotFoundError
# if the sample CSV cannot be found near the current working directory.
DATA_DIR = find_data_dir()


In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the sample articles; a "most similar pair" needs at least two of them.
articles = pd.read_csv(DATA_DIR / "sample_texts" / "articles_sample.csv")
if len(articles) < 2:
    raise ValueError("Need at least two articles to find a most-similar pair.")

# Vectorize the abstracts with TF-IDF (English stop words removed).
# Missing abstracts are replaced with empty strings before vectorizing.
vectorizer = TfidfVectorizer(stop_words="english")
embeddings = vectorizer.fit_transform(articles["abstract"].fillna(""))

# Pairwise cosine similarity between all abstracts.
similarity_matrix = cosine_similarity(embeddings)

# Show the most similar pair (excluding diagonal): zero out each article's
# similarity with itself so it cannot win the argmax below.
np.fill_diagonal(similarity_matrix, 0)

# argmax() indexes the flattened matrix; unravel_index maps that flat position
# back to (row, col) using the matrix's actual shape.
max_idx = similarity_matrix.argmax()
row, col = np.unravel_index(max_idx, similarity_matrix.shape)

print("Most similar pair:")
print(articles.iloc[row]["title"])
print(articles.iloc[col]["title"])
print("Similarity score", similarity_matrix[row, col])
