In [None]:
import sys
import subprocess

# When the notebook is running inside Google Colab (detected by the
# "google.colab" module already being loaded), install the listed packages
# with the same interpreter that runs this kernel.
packages = ["streamlit", "pandas", "numpy", "scikit-learn", "requests"]
if "google.colab" in sys.modules:
    print("Detected Google Colab runtime. Installing dependencies...")
    pip_command = [sys.executable, "-m", "pip", "install"] + packages
    # check_call raises CalledProcessError if pip exits with a non-zero status.
    subprocess.check_call(pip_command)


# Batch Summarization

Goal: create lightweight summaries of abstracts without external model calls.

Why it matters: quick summaries help researchers scan literature safely when external APIs are unavailable or restricted.

How to run and adapt: run this notebook after generating the sample data (`scripts/generate_synthetic_data.py`); to adapt it, replace the heuristic summarizer (`simple_summary`) with your preferred local model.

In [None]:
from pathlib import Path


def find_data_dir() -> Path:
    """Locate the ``data`` directory that holds the sample articles CSV.

    Looks for ``data/sample_texts/articles_sample.csv`` under the current
    working directory, then its parent, then its grandparent, and returns
    the first ``data`` directory that contains the file.

    Returns:
        Path to the matching ``data`` directory.

    Raises:
        FileNotFoundError: If none of the three locations contains the CSV.
    """
    here = Path.cwd()
    marker = Path("sample_texts") / "articles_sample.csv"
    # Search order matters: the nearest directory wins.
    for base in (here, here.parent, here.parent.parent):
        data_dir = base / "data"
        if (data_dir / marker).exists():
            return data_dir
    raise FileNotFoundError("data directory not found. Run scripts/generate_synthetic_data.py.")

# Resolve the data directory once at cell execution time; raises
# FileNotFoundError if the sample CSV cannot be found.
DATA_DIR = find_data_dir()


In [None]:
import pandas as pd

# Load the sample articles; the code below reads its "abstract" and "title" columns.
articles = pd.read_csv(DATA_DIR / "sample_texts" / "articles_sample.csv")


def simple_summary(text: str, sentences: int = 2) -> str:
    """Return the first ``sentences`` sentences of ``text`` as a short summary.

    This is a purely heuristic, offline summarizer: the text is split on
    every ``'.'`` character, blank fragments are dropped, and the leading
    fragments are re-joined with ``'. '`` and terminated with a period.
    Because every period counts as a sentence boundary, decimals and
    abbreviations are split too (``"3.5"`` becomes ``"3. 5."``).

    Args:
        text: The text to summarize. Non-string values (for example the
            float ``NaN`` pandas uses for empty CSV cells, or ``None``)
            produce an empty summary instead of raising.
        sentences: Maximum number of sentences to keep. Zero or a negative
            number yields an empty summary.

    Returns:
        The summary string, or ``""`` when there is nothing to summarize.
    """
    # Missing values read from a CSV are not strings and have no .split().
    if not isinstance(text, str):
        return ""
    # A non-positive count selects nothing; without this guard a count of 0
    # produced a lone "." and negative counts dropped trailing sentences.
    # (None is left alone so that a ``[:None]`` slice still keeps everything.)
    if sentences is not None and sentences <= 0:
        return ""
    parts = [part.strip() for part in text.split('.') if part.strip()]
    selected = parts[:sentences]
    if not selected:
        return ""
    return '. '.join(selected) + '.'

# Summarize every abstract with the default sentence count and store the
# result alongside the original columns.
abstract_summaries = articles["abstract"].apply(simple_summary)
articles["summary"] = abstract_summaries
# Preview the first rows of titles next to their generated summaries.
articles.loc[:, ["title", "summary"]].head()
