# Synthetic Data Augmentation

This notebook inspects the LLM-generated synthetic data and merges it with the main dataset.

In [None]:
import pandas as pd
import json
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.sentinel.data.processing import merge_synthetic
from src.sentinel.utils.taxonomy import CATEGORY_NAMES

## 1. Inspect Generated JSONL
Let's look at the raw output from the LLM.

In [None]:
def read_jsonl_records(path):
    """Read a JSON Lines file and return ``(raw_lines, records)``.

    Args:
        path: Location of the ``.jsonl`` file to read.

    Returns:
        A pair of equally long lists: the stripped, non-blank raw lines and
        the objects obtained by parsing each of those lines as JSON.

    Raises:
        ValueError: If a non-blank line is not valid JSON. The message names
            the file and the 1-based line number so the bad sample can be found.
    """
    raw_lines, records = [], []
    # Decode as UTF-8 explicitly; otherwise the platform default encoding is
    # used and non-ASCII text can be garbled or fail to decode.
    with open(path, 'r', encoding='utf-8') as f:
        for lineno, line in enumerate(f, start=1):
            text = line.strip()
            if not text:
                # Blank lines (e.g. a trailing newline) are not samples.
                continue
            try:
                records.append(json.loads(text))
            except json.JSONDecodeError as exc:
                raise ValueError(f"{path}:{lineno}: invalid JSON ({exc})") from exc
            raw_lines.append(text)
    return raw_lines, records


synth_path = "../data/synthetic/synthetic_data.jsonl"
if os.path.exists(synth_path):
    lines, data = read_jsonl_records(synth_path)

    print(f"Found {len(lines)} synthetic samples.")
    print("--- Sample Raw Lines ---")
    for line in lines[:5]:
        print(line)

    # Load into DF for nicer view
    df_synth = pd.DataFrame(data)
    display(df_synth.head())
    print("\nDistribution of Generated Labels:")
    # An empty file (or records without a 'label' field) yields no such column.
    if 'label' in df_synth.columns:
        print(df_synth['label'].value_counts())
    else:
        print("No 'label' column found in the synthetic data.")
else:
    print("Synthetic data file not found yet. Is the script still running?")

## 2. Run Merge Script
The cell below calls `merge_synthetic` to combine the synthetic data with `unified_dataset.parquet`.

In [None]:
# Location of the main (unified) dataset; it has to be loaded before merging.
main_path = "../data/processed/unified_dataset.parquet"
if not os.path.exists(main_path):
    print("Main dataset not found.")
else:
    # Read the main dataset and pass it to the project's merge helper.
    df_main = pd.read_parquet(main_path)
    merge_synthetic(df_main)

## 3. Final Dataset Verification

In [None]:
final_path = "../data/processed/final_augmented_dataset.parquet"
if not os.path.exists(final_path):
    print("Final dataset not found.")
else:
    df = pd.read_parquet(final_path)
    print(f"Final Dataset Size: {len(df)}")
    # Show the last rows; the appended synthetic items are expected there
    # (NOTE(review): confirm that merge_synthetic appends them at the end).
    display(df.tail())