In [None]:
import subprocess
import sys

# Colab-only bootstrap: install the notebook's dependencies when running
# inside Google Colab (detected by the `google.colab` module being loaded).
if "google.colab" in sys.modules:
    # Run pip through the kernel's own interpreter so the packages land in
    # the environment this notebook imports from, not whichever `pip`
    # happens to be first on PATH.
    install_result = subprocess.run(
        [
            sys.executable, "-m", "pip", "install", "-q",
            "pandas", "numpy", "scikit-learn", "requests", "pydantic", "jsonschema",
        ],
        check=False,
    )
    # Stay best-effort (the packages may already be present), but do not
    # let a failed install pass silently.
    if install_result.returncode != 0:
        print(
            f"pip install exited with status {install_result.returncode}; "
            "later imports may fail.",
            file=sys.stderr,
        )


# Semantic-Style Similarity Demo

**What this notebook does:** Uses TF-IDF cosine similarity as a stand-in for embedding-based search over synthetic abstracts.

**Why it matters:** Shows how to rank documents without remote embedding services, enabling quick literature scans.

**How to use it:**
1. Generate the synthetic data with `scripts/generate_synthetic_data.py`, then run the notebook locally or in Colab.
2. Inspect the most similar pair of documents; experiment with preprocessing to see how the similarity shifts.

**Expected outcome:** Similarity scores and examples of closely related abstracts to guide exploratory search designs.

In [None]:
from pathlib import Path


def find_data_dir() -> Path:
    """Locate the ``data`` directory relative to the current working directory.

    The working directory, its parent, and its grandparent are checked in
    that order. The first ``data`` folder that contains
    ``sample_texts/articles_sample.csv`` is returned.

    Returns:
        Path to the matching ``data`` directory.

    Raises:
        FileNotFoundError: If none of the three locations holds the sample file.
    """
    here = Path.cwd()
    marker = Path("sample_texts") / "articles_sample.csv"
    for base in (here, here.parent, here.parent.parent):
        data_dir = base / "data"
        if (data_dir / marker).exists():
            return data_dir
    raise FileNotFoundError("data directory not found. Run scripts/generate_synthetic_data.py.")

# Resolve the data directory once when this cell runs; find_data_dir() raises
# FileNotFoundError if the sample CSV cannot be located.
DATA_DIR = find_data_dir()


In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the sample articles; the "abstract" and "title" columns are used below.
articles = pd.read_csv(DATA_DIR / "sample_texts" / "articles_sample.csv")
if len(articles) < 2:
    # With fewer than two rows the only "pair" would be a document with itself.
    raise ValueError("Need at least two articles to find a most similar pair.")

# TF-IDF vectors act as the stand-in "embeddings"; missing abstracts become
# empty strings so the vectorizer does not choke on NaN.
vectorizer = TfidfVectorizer(stop_words="english")
embeddings = vectorizer.fit_transform(articles["abstract"].fillna(""))

similarity_matrix = cosine_similarity(embeddings)

# Show the most similar pair (excluding diagonal)
# similarity_matrix keeps a zeroed diagonal so self-matches never look like hits.
np.fill_diagonal(similarity_matrix, 0)

# TF-IDF cosine scores are never negative, so a 0 on the diagonal can tie with
# unrelated documents and argmax would then pick (0, 0). Search a copy whose
# diagonal is -inf so that a self-pair can never be selected.
off_diagonal = similarity_matrix.copy()
np.fill_diagonal(off_diagonal, -np.inf)
max_idx = off_diagonal.argmax()
row, col = np.unravel_index(max_idx, off_diagonal.shape)

print("Most similar pair:")
print(articles.iloc[row]["title"])
print(articles.iloc[col]["title"])
print("Similarity score", similarity_matrix[row, col])


### If you get stuck / What to try next

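If you get stuck: rerun the install cell and confirm that `data/sample_texts/articles_sample.csv` exists (generate it with `scripts/generate_synthetic_data.py` if it does not). What to try next: adjust the preprocessing, or move on to the retrieval notebooks to test how ranking changes.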