In [3]:
import pandas as pd
import os

# Load data
data_path = os.path.join(os.getcwd(), 'data.csv')

if not os.path.exists(data_path):
    print(f"Data file not found at {data_path}. Run prepare_data.py first.")
else:
    print("Loading data...")
    df = pd.read_csv(data_path)
    print(f"Initial shape: {df.shape}")

    # Remove duplicates
    print("Removing duplicates based on 'text' column if available, else all columns...")
    
    # We generated data.csv with features/embeddings but we need to know which cols are which to verify what to dedup on.
    # However, exact dedup usually means entire row or just the text source. 
    # Since prepare_data.py output might not have 'text' column depending on how it was saved (in my last edit I removed 'text' from final_df concatenation to save space/memory if not requested, but wait... let me check my last edit to prepare_data.py).
    
    # Actually, looking at my last edit to prepare_data.py:
    # final_df = pd.concat([full_df[['label']], emb_df], axis=1)
    # It does NOT verify 'text' is preserved in the CSV.
    # If the user wants to dedup, they might want to dedup based on embeddings if text is missing.
    # BUT, if text is missing, exact dedup on float embeddings might be tricky due to precision, though pandas handles it reasonably well.
    # Ideally we should have kept 'text'. 
    
    # Let's try deduping on all columns (which are embeddings + label).
    initial_len = len(df)
    df.drop_duplicates(inplace=True)
    new_len = len(df)
    
    print(f"Removed {initial_len - new_len} duplicates.")
    print(f"New shape: {df.shape}")
    
    # Check Class Imbalance
    print("\nClass Distribution:")
    counts = df['label'].value_counts()
    print(counts)
    
    print("\nClass Ratios:")
    print(df['label'].value_counts(normalize=True))
    
    # Save back
    df.to_csv(data_path, index=False)
    print(f"Saved cleaned data to {data_path}")

Loading data...
Initial shape: (335779, 385)
Removing duplicates based on 'text' column if available, else all columns...
Removed 0 duplicates.
New shape: (335779, 385)

Class Distribution:
label
0    174063
1    161716
Name: count, dtype: int64

Class Ratios:
label
0    0.518386
1    0.481614
Name: proportion, dtype: float64


KeyboardInterrupt: 