In [None]:
import sys
import subprocess

# Only install when the Colab module is loaded in this interpreter;
# every other environment is left untouched.
if "google.colab" in sys.modules:
    print("Detected Google Colab runtime. Installing dependencies...")
    packages = ["streamlit", "pandas", "numpy", "scikit-learn", "requests"]
    # Run pip through the current interpreter so the install targets the
    # same Python that is executing this notebook. check_call raises
    # CalledProcessError if pip exits with a non-zero status.
    pip_install = [sys.executable, "-m", "pip", "install"]
    subprocess.check_call(pip_install + packages)


# Ingest and Clean

Goal: load synthetic article abstracts and apply quick normalization.

Why it matters: researchers need a reproducible baseline cleaning step before clustering, retrieval, or labeling.

How to run and adapt: generate the sample data first (`scripts/generate_synthetic_data.py`), then execute the cells top-to-bottom; adjust the `clean_text` function to mirror your own preprocessing rules.

In [None]:
from pathlib import Path


def find_data_dir() -> Path:
    """Locate the ``data`` directory relative to the current working directory.

    Looks for a ``data`` folder in the working directory, then its parent,
    then its grandparent, and accepts the first one that contains
    ``sample_texts/articles_sample.csv``.

    Returns:
        Path to the first matching ``data`` directory.

    Raises:
        FileNotFoundError: If none of the three candidates contains the
            sample CSV.
    """
    here = Path.cwd()
    marker = Path("sample_texts") / "articles_sample.csv"
    # Nearest location first, so a local data folder wins over one further up.
    for base in (here, here.parent, here.parent.parent):
        data_dir = base / "data"
        if (data_dir / marker).exists():
            return data_dir
    raise FileNotFoundError("data directory not found. Run scripts/generate_synthetic_data.py.")

# Resolve the data directory once; raises FileNotFoundError if the sample CSV is missing.
DATA_DIR = find_data_dir()


In [None]:
import pandas as pd

# Build the CSV location from the resolved data directory.
articles_path = DATA_DIR.joinpath("sample_texts", "articles_sample.csv")

# Read the whole file into a DataFrame and report how many rows were loaded.
articles = pd.read_csv(articles_path)
print(f"Loaded {len(articles)} articles from {articles_path}")

# Preview the first rows as the cell output.
articles.head()


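## Basic cleaning

Lowercase each abstract, replace every character other than `a-z`, `0-9`, and whitespace with a space, then collapse runs of whitespace into single spaces. Note that non-ASCII letters (for example accented characters) are replaced as well.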

In [None]:
import re

def clean_text(text: str) -> str:
    """Normalize a piece of text for downstream processing.

    The text is lowercased, every character other than ``a-z``, ``0-9`` and
    whitespace is replaced by a space, and runs of whitespace are collapsed
    to single spaces with no leading or trailing whitespace.

    Args:
        text: Raw text. Non-string values (for example ``None`` or the float
            NaN that pandas uses for empty CSV cells) are treated as missing.

    Returns:
        The normalized text, or an empty string when ``text`` is not a string.
    """
    # Missing values reach this function as float NaN / None when it is
    # applied over a DataFrame column; calling .lower() on them would raise.
    if not isinstance(text, str):
        return ""
    lowered = text.lower().strip()
    # Substitute a space (not an empty string) so adjacent words stay separated.
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    # split() with no arguments drops all whitespace runs, including ends.
    return " ".join(alnum_only.split())

# Run the cleaner over every abstract and keep the result in a new column
# alongside the raw text.
articles["cleaned"] = articles["abstract"].map(clean_text)

# Preview titles next to their cleaned abstracts.
articles.loc[:, ["title", "cleaned"]].head()
