In [None]:
import sys, subprocess, os
from pathlib import Path

# Colab Setup
# Runs only inside Google Colab: installs the notebook's dependencies and, when the
# working directory has no "data" folder, fetches data/ and scripts/ from the project repo.
if "google.colab" in sys.modules:
    print("Running in Google Colab. Installing dependencies...")
    # Invoke pip through the kernel's own interpreter so packages are installed into
    # the environment this notebook imports from, not whichever "pip" is first on PATH.
    pip_result = subprocess.run(
        [
            sys.executable, "-m", "pip", "install", "-q",
            "pandas", "numpy", "scikit-learn", "requests", "pydantic", "jsonschema", "plotly", "tqdm",
        ]
    )
    if pip_result.returncode != 0:
        print("Warning: pip install failed; some imports in later cells may not work.")

    # Check for data
    if not (Path.cwd() / "data").exists():
        print("Data directory not found. Cloning repository...")
        clone_result = subprocess.run(["git", "clone", "https://github.com/aire-program/aire-researcher-sandbox.git", "_repo"])
        if clone_result.returncode != 0:
            # Surface the failure here; the check below then reports the missing data.
            print("Warning: git clone failed (exit code %d)." % clone_result.returncode)

        # Move data and scripts to current directory
        if (Path("_repo/data").exists()):
            print("Moving data and scripts...")
            subprocess.run(["mv", "_repo/data", "."])
            # Skip the move when the clone has no scripts directory instead of letting mv fail.
            if Path("_repo/scripts").exists():
                subprocess.run(["mv", "_repo/scripts", "."])
        else:
            print("Warning: Data not found in cloned repo.")
        # Remove the clone on both paths so re-running this cell can clone again cleanly.
        subprocess.run(["rm", "-rf", "_repo"])
    else:
        print("Data directory found.")


# Ingest and Clean Synthetic Articles

**What**: Normalize and prepare synthetic article titles and abstracts for downstream analysis.

**Why**: Text data often contains noise that can degrade the performance of clustering and retrieval algorithms. Establishing a clean baseline is a critical first step in any NLP pipeline.

**How**:
1. **Install dependencies** (if running in Colab).
2. **Load data** from the synthetic dataset.
3. **Apply cleaning functions** to normalize text (lowercase, remove special characters).
4. **Verify** the output structure.

**Key Concept**: **Normalization** is the process of transforming text into a standard format (e.g., lowercase, no punctuation) to ensure consistency during analysis.

By the end of this notebook, you will have loaded the synthetic articles, added a `cleaned` text column built from each abstract, and have a DataFrame ready for clustering or retrieval.

### Success criteria
- You loaded synthetic articles.
- You produced cleaned text and saw the shape of your dataset.
- You have a DataFrame ready for clustering or retrieval.

In [None]:
from pathlib import Path


def find_data_dir(start: "Path | None" = None, max_levels: int = 3) -> Path:
    """Locate the ``data`` directory by searching upward from a start directory.

    A ``data`` directory only qualifies if it contains
    ``sample_texts/articles_sample.csv``.

    Args:
        start: Directory to begin the search from. Defaults to the current
            working directory.
        max_levels: Number of directories to inspect, counting ``start`` itself
            and then its ancestors. The default of 3 checks ``start``, its
            parent, and its grandparent.

    Returns:
        Path of the first ``data`` directory that contains the sample file.

    Raises:
        FileNotFoundError: If none of the inspected directories qualifies.
    """
    base = Path.cwd() if start is None else Path(start).resolve()
    # ``parents`` stops at the filesystem root, so a shallow start directory simply
    # yields fewer candidates rather than repeated ones.
    search_roots = [base, *base.parents][: max(max_levels, 0)]
    candidates = [root / "data" for root in search_roots]
    for candidate in candidates:
        if (candidate / "sample_texts" / "articles_sample.csv").exists():
            return candidate
    searched = ", ".join(str(candidate) for candidate in candidates) or "no locations"
    raise FileNotFoundError(
        f"data directory not found. Run scripts/generate_synthetic_data.py. Searched: {searched}"
    )

# Resolve the data directory once when this cell runs; find_data_dir raises
# FileNotFoundError if no matching directory is located.
DATA_DIR = find_data_dir()


In [None]:
import pandas as pd

# Build the path to the sample articles CSV and read it into a DataFrame.
articles_path = DATA_DIR.joinpath("sample_texts", "articles_sample.csv")
articles = pd.read_csv(articles_path)
# Report the row count and source path, then preview the first rows.
print(f"Loaded {len(articles)} articles from {articles_path}")
articles.head()


## Basic cleaning

In [None]:
import re

def clean_text(text: str) -> str:
    """Normalize text to lowercase ASCII letters, digits, and single spaces.

    Args:
        text: Raw text. Non-string values (for example ``None`` or the NaN that
            pandas uses for an empty CSV cell) are treated as empty text.

    Returns:
        The lowercased text with every character outside ``a-z``, ``0-9`` and
        whitespace replaced by a space, whitespace runs collapsed to one space,
        and no leading or trailing whitespace.
    """
    # Missing cells reach this function as non-strings when it is applied to a
    # DataFrame column; return an empty string instead of failing on .lower().
    if not isinstance(text, str):
        return ""
    text = re.sub(r"[^a-z0-9\s]", " ", text.lower())
    # split() with no arguments drops leading/trailing whitespace and collapses runs.
    return " ".join(text.split())

# Store the cleaned form of every abstract in a new "cleaned" column.
articles["cleaned"] = articles["abstract"].map(clean_text)
# Preview each title next to its cleaned abstract.
articles.loc[:, ["title", "cleaned"]].head()


### If you get stuck / What to try next

**If you get stuck:** rerun the Colab setup cell at the top of the notebook and make sure `scripts/generate_synthetic_data.py` has been run, so that `data/sample_texts/articles_sample.csv` exists. **What to try next:** cluster the cleaned abstracts in `pipelines/text/clustering_and_topics.ipynb` (see `docs/colab_index.md` for a Colab badge).