# Sentinel-SLM Data Preparation

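This notebook orchestrates the data preparation pipeline by importing logic from the `src.sentinel` package (`data.download`, `data.processing`, and `utils.taxonomy`).
Run the cells in order, one step at a time, to verify the downloads and label mappings. Start the notebook from its own directory: the import setup and the `../data/...` paths are resolved relative to the current working directory.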

In [None]:
import os
import sys
import pandas as pd

# Make the repository root importable so the `src.*` imports below resolve.
# The root is taken to be the parent of the current working directory, so the
# kernel must be started from a first-level subdirectory of the repository.
# The membership check keeps re-runs of this cell from piling up duplicate
# entries on sys.path.
repo_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if repo_root not in sys.path:
    sys.path.append(repo_root)

from src.sentinel.data.download import download_all
from src.sentinel.data.processing import map_all_raw_data
from src.sentinel.utils.taxonomy import CATEGORY_NAMES

## 1. Download Public Datasets
This step fetches data from Hugging Face and saves it to `data/raw/*.parquet`.

In [None]:
# Print a banner for this step, then run the project's downloader.
# NOTE(review): per the section text above, this fetches from Hugging Face and
# writes parquet files under data/raw/ — presumably needs network access;
# confirm against src/sentinel/data/download.py.
print("--- Downloading All Data ---")
download_all()

## 2. Inspect Raw Data
Let's check one of the downloaded files to see what it looks like.

In [None]:
# Peek at one raw file to sanity-check the download step.
# The path is relative to the current working directory.
raw_path = "../data/raw/civil_comments_sample.parquet"
if not os.path.exists(raw_path):
    # Nothing to inspect: report the missing file instead of raising.
    print(f"{raw_path} not found.")
else:
    df_raw = pd.read_parquet(raw_path)
    print(f"Loaded {raw_path} with {len(df_raw)} rows.")
    # `display` is the IPython built-in; it renders the preview as a rich table.
    display(df_raw.head())

## 3. Map & Standardize Labels
Run the mapping logic to convert all raw datasets into our 8-category taxonomy.

In [None]:
# Run the main processing pipeline over the raw datasets.
# NOTE(review): the audit cell below reads
# ../data/processed/unified_dataset.parquet, presumably written by this call —
# confirm against src/sentinel/data/processing.py.
map_all_raw_data()

## 4. Final Distribution Audit
Analyze the class distribution in the unified dataset and print a few mapped examples as a sanity check.

In [None]:
# Audit the unified dataset: total size, per-category label counts, and a
# handful of example rows with their labels translated to category names.
SAMPLE_SIZE = 5   # number of example rows to print
SAMPLE_SEED = 42  # fixed seed so Restart & Run All shows the same examples

processed_path = "../data/processed/unified_dataset.parquet"
if os.path.exists(processed_path):
    df_uni = pd.read_parquet(processed_path)
    print(f"Total Processed Samples: {len(df_uni)}")

    # Each row's "labels" is list-like; explode to one row per (sample, label)
    # so the labels can be counted individually. Rows with an empty label list
    # become NaN after explode and are dropped by value_counts().
    exploded = df_uni.explode("labels")
    counts = exploded["labels"].value_counts().rename(index=CATEGORY_NAMES)

    print("\n--- Class Balance ---")
    print(counts)

    # Visual check of some mapped examples.
    print("\n--- Random Examples ---")
    # Cap the sample size: DataFrame.sample raises ValueError when asked for
    # more rows than exist (without replacement).
    sample = df_uni.sample(n=min(SAMPLE_SIZE, len(df_uni)), random_state=SAMPLE_SEED)
    for _, row in sample.iterrows():
        # Fall back to the raw label value when it has no entry in CATEGORY_NAMES.
        cats = [CATEGORY_NAMES.get(label, label) for label in row['labels']]
        # str() guards against null / non-string text values, which cannot be sliced.
        print(f"Text (trunc): {str(row['text'])[:100]}...\nLabels: {cats}\n")
else:
    print("Unified dataset not found. Did mapping fail?")