In [None]:
import sys
import subprocess

# On Google Colab, install the notebook's dependencies with the interpreter
# that backs the running kernel; on any other runtime this cell does nothing.
running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    print("Detected Google Colab runtime. Installing dependencies...")
    packages = ["streamlit", "pandas", "numpy", "scikit-learn", "requests"]
    pip_command = [sys.executable, "-m", "pip", "install"] + packages
    subprocess.check_call(pip_command)


# Clustering and Topics

Goal: cluster abstracts with TF-IDF + k-means to reveal themes in the synthetic corpus.

Why it matters: grouping documents helps researchers triage readings and discover topic gaps before deeper modeling.

How to run and adapt: run this notebook after generating the data with `scripts/generate_synthetic_data.py`; tweak `n_clusters` or the preprocessing to experiment with different groupings.

In [None]:
from pathlib import Path


def find_data_dir() -> Path:
    """Locate the ``data`` directory by searching upward from the working directory.

    Starting at the current working directory and moving through each of its
    ancestors in turn, the first ``data`` folder that contains
    ``sample_texts/articles_sample.csv`` is returned. Searching every ancestor
    (rather than a fixed number of levels) lets the notebook run from any
    depth inside the project tree.

    Returns:
        Path: the ``data`` directory that holds the sample articles file.

    Raises:
        FileNotFoundError: if no ancestor has a matching ``data`` directory.
    """
    # The sample file acts as a marker so an unrelated "data" folder is not picked up.
    marker = Path("sample_texts") / "articles_sample.csv"
    cwd = Path.cwd()
    # Nearest directory first: cwd, then its parent, grandparent, and so on up to the root.
    for base in (cwd, *cwd.parents):
        candidate = base / "data"
        if (candidate / marker).exists():
            return candidate
    raise FileNotFoundError("data directory not found. Run scripts/generate_synthetic_data.py.")

# Resolve the data directory once; later cells build file paths from DATA_DIR.
DATA_DIR = find_data_dir()


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Load the sample articles; missing abstracts become empty strings so the
# vectorizer only ever receives text.
articles = pd.read_csv(DATA_DIR / "sample_texts" / "articles_sample.csv")
abstracts = articles["abstract"].fillna("")

# Build TF-IDF features with English stop words removed.
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(abstracts)

# Fit k-means with a fixed seed and store each article's cluster label.
model = KMeans(
    n_clusters=4,
    random_state=42,
    n_init=10,
)
cluster_labels = model.fit_predict(tfidf_matrix)
articles["cluster"] = cluster_labels

# Preview the assignment for the first few articles.
articles.loc[:, ["title", "cluster"]].head()


## Inspect cluster composition

In [None]:
# Tally how many articles were assigned to each cluster label.
sizes_by_cluster = articles.groupby("cluster").size()
cluster_counts = sizes_by_cluster.rename("count").reset_index()
cluster_counts
