# Part 2: Data Cleaning

Taking the messy real-world data and getting it ready for analysis.

**What we need to do:**
- Fix currency formatting
- Handle missing values
- Standardize categorical variables
- Remove/flag outliers
- Save cleaned data

In [None]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from src.data_processing import (
    load_grant_data, 
    clean_currency, 
    categorize_program,
    validate_data_quality
)

import warnings
warnings.filterwarnings('ignore')

## Load Raw Data

In [None]:
# Read the raw grantee-details export through the project's loader helper
raw_csv_path = '../data/raw/MCHB_Data_GranteeDetails.csv'
df = load_grant_data(raw_csv_path)

record_count = len(df)
print(f"Loaded {record_count} records")

# Last expression in the cell, so the notebook renders the preview
df.head()

## Fix Currency Formatting

In [None]:
# The load function is expected to have cleaned this already
# (NOTE(review): confirm in src/data_processing); this cell only verifies it.
print("Sample of cleaned amounts:")
print(df[['Awardee Amount']].head(10))

# Check for any remaining non-numeric values.
# Coerce before testing for NaN: isna() alone would miss leftover strings such
# as "$1,234" if the column were still object dtype. The coerced Series is used
# only for this check; df itself is not modified.
amount_as_number = pd.to_numeric(df['Awardee Amount'], errors='coerce')
non_numeric = df[amount_as_number.isna()]
print(f"\nRows with invalid amounts: {len(non_numeric)}")

## Handle Missing Values

In [None]:
# Per-column count and percentage of missing values; only columns with gaps are shown
missing_summary = (
    df.isnull()
    .sum()
    .to_frame('Missing')
    .assign(Percent=lambda counts: (counts['Missing'] / len(df) * 100).round(2))
)
print(missing_summary.loc[missing_summary['Missing'] > 0])

In [None]:
# Decision on missing values:
# - County: Keep as is (some grants are state-level)
# - Congressional District: Keep as is (same reason)
# - Amount: Drop these rows (can't analyze without amounts)

print(f"Records before dropping: {len(df)}")
# .copy() makes df_clean an independent frame. Later cells add and overwrite
# columns on it; without the copy those assignments can hit pandas'
# SettingWithCopy ambiguity, which the notebook-wide warning filter would hide.
df_clean = df.dropna(subset=['Awardee Amount']).copy()
print(f"Records after dropping: {len(df_clean)}")
print(f"Dropped {len(df) - len(df_clean)} rows with missing amounts")

## Standardize State Names

In [None]:
# Check current state values.
# NaN is dropped before sorting: sorted() raises TypeError when a float NaN is
# compared with string codes, and nunique() below already ignores NaN, so the
# printed list and the total stay consistent.
print("Unique states/territories:")
print(sorted(df_clean['State'].dropna().unique()))
print(f"\nTotal: {df_clean['State'].nunique()}")

In [None]:
# State values already appear to be 2-letter codes; normalise case and
# surrounding whitespace so that grouping on State is reliable.
upper_cased = df_clean['State'].str.upper()
df_clean['State'] = upper_cased.str.strip()

## Create Program Categories

In [None]:
# Map every program name to a category with the project helper
program_names = df_clean['Program Name']
df_clean['program_category'] = program_names.apply(categorize_program)

# Report how many grants landed in each category
category_counts = df_clean['program_category'].value_counts()
print("Program categories created:")
print(category_counts)

## Handle Outliers

In [None]:
# Flag outliers with the IQR rule: anything further than 1.5 * IQR
# below the first quartile or above the third quartile.
amounts = df_clean['Awardee Amount']
Q1, Q3 = (amounts.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

fence_width = 1.5 * IQR
lower_bound = Q1 - fence_width
upper_bound = Q3 + fence_width

# Strict comparisons: values exactly on a bound are not outliers
below_fence = amounts.lt(lower_bound)
above_fence = amounts.gt(upper_bound)
df_clean['is_outlier'] = below_fence | above_fence

outlier_count = df_clean['is_outlier'].sum()
print(f"Outliers identified: {outlier_count}")
print(f"\nOutlier bounds: ${lower_bound:,.0f} to ${upper_bound:,.0f}")

In [None]:
# Show the ten largest flagged grants
print("Sample of outlier grants:")
outliers = df_clean.loc[df_clean['is_outlier']]

display_cols = ['State', 'Program Name', 'Awardee Amount']
largest_first = outliers[display_cols].sort_values('Awardee Amount', ascending=False)
print(largest_first.head(10))

In [None]:
# Decision: Keep outliers but flag them
# These are legitimate large grants (like state MCH services blocks)
# We'll just be aware of them in analysis

## Create Additional Useful Columns

In [None]:
# Fiscal year as int
# NOTE(review): astype(int) raises if 'Fiscal Year' contains missing values — confirm it never does.
df_clean['fiscal_year'] = df_clean['Fiscal Year'].astype(int)

# Binary flag for mental health programs
# (presumably 'Mental Health' is one of the labels categorize_program returns — verify)
df_clean['is_mental_health'] = df_clean['program_category'] == 'Mental Health'

# Binary flag for maternal health
# (same assumption about the 'Maternal Health' label)
df_clean['is_maternal_health'] = df_clean['program_category'] == 'Maternal Health'

# Log transform of amount for modeling.
# log10 is undefined for amounts <= 0: a zero would become -inf and a negative
# value NaN, with the warning hidden by the notebook-wide filter. Mask
# non-positive amounts to NaN first so the column never contains infinities.
amount = df_clean['Awardee Amount']
df_clean['log_amount'] = np.log10(amount.where(amount > 0))

print("New columns created:")
print(df_clean[['program_category', 'is_mental_health', 'is_maternal_health', 'log_amount']].head())

## Final Data Quality Check

In [None]:
# Run the project's validation helper (imported from src.data_processing) on the cleaned frame.
# NOTE(review): what it checks and returns is not visible here — see its definition.
# As the last expression in the cell, any return value is displayed by the notebook.
validate_data_quality(df_clean)

In [None]:
# Check for duplicates on Grant Number, keeping the first occurrence.
# duplicated() treats missing keys as equal to each other, so rows without a
# Grant Number would be collapsed into one; exclude them from the duplicate mask.
# NOTE(review): if one grant number legitimately spans several fiscal years,
# the key should probably include 'Fiscal Year' — confirm against the data.
has_grant_number = df_clean['Grant Number'].notna()
is_duplicate = has_grant_number & df_clean.duplicated(subset=['Grant Number'], keep='first')
dups = is_duplicate.sum()
print(f"Duplicate grant numbers: {dups}")

if dups > 0:
    # Remove duplicates if any; copy so that later column assignments act on an independent frame
    df_clean = df_clean[~is_duplicate].copy()
    print(f"Removed {dups} duplicates")

## Save Cleaned Data

In [None]:
# Save to processed folder
from pathlib import Path

output_path = '../data/processed/mchb_grants_cleaned.csv'
# Create the folder on a fresh checkout; to_csv does not create missing directories
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(output_path, index=False)

print(f"Saved {len(df_clean)} cleaned records to {output_path}")
print(f"\nColumns in cleaned dataset: {len(df_clean.columns)}")
# Report the size of the CSV on disk; the in-memory footprint of the DataFrame
# is a different number and is not a file size.
print(f"File size: {Path(output_path).stat().st_size / 1024**2:.2f} MB")

## Summary

**What we did:**
1. ✅ Verified that currency strings were converted to numeric (the conversion itself happens in `load_grant_data`)
2. ✅ Handled missing values (dropped rows without amounts)
3. ✅ Standardized state codes
4. ✅ Created program categories
5. ✅ Identified and flagged outliers
6. ✅ Added useful derived columns
7. ✅ Saved cleaned data

**Next up:** Feature engineering and merging with external health outcome data!