# Data Preprocessing
This notebook handles data cleaning, merging datasets, encoding categorical variables, and preparing data for model training.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Single seed reused wherever randomness is involved (it is passed to
# train_test_split further down in this notebook).
RANDOM_STATE = 42
# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(RANDOM_STATE)

## Load Data
Loading the three datasets: SHED, RPP, and FRED minimum wage data

In [None]:
# Read the three raw inputs: FRED (CSV), RPP (Excel, first 5 rows skipped)
# and SHED (CSV).
fred = pd.read_csv("../data/raw/fredgraph.csv")
rpp = pd.read_excel("../data/raw/rpp1224.xlsx", skiprows=5)
shed = pd.read_csv("../data/raw/public2024.csv")

# Replace the RPP headers positionally with short snake_case names.
rpp_column_names = [
    'state',
    'real_pce_2022', 'real_pce_2023', 'pce_pct_change',
    'real_income_2022', 'real_income_2023', 'income_pct_change',
]
rpp.columns = rpp_column_names

# Normalise the state label (lowercase, trimmed) so it can serve as a join key.
normalised_state = rpp['state'].str.lower().str.strip()
rpp['state'] = normalised_state

# Keep only real state rows: drop the national aggregate, 'nan'/blank labels,
# and rows whose label is missing altogether.
non_state_labels = ['united states', 'nan', '']
is_state_row = ~rpp['state'].isin(non_state_labels)
rpp = rpp.loc[is_state_row].dropna(subset=['state'])

# Quick size check of everything that was loaded.
print(f"SHED shape: {shed.shape}")
print(f"RPP shape: {rpp.shape}")
print(f"FRED shape: {fred.shape}")

## Define Target Variable
Creating a binary target: can cover a \$400 emergency expense (1) vs. cannot cover it (0).

**Note:** Update the column name below based on your SHED codebook. Look for the question about covering a \$400 emergency expense.

In [None]:
# TODO: Replace 'EF3' with the actual column name from the SHED codebook.
# It should be the question about covering a $400 emergency expense.
# Common column names: EF3, EF3_a, or similar.

target_column = 'EF3'  # UPDATE THIS based on your codebook

# Build the binary target (1 = can cover, 0 = cannot cover).
# Adjust the positive code (== 1) to match how responses are coded in your data.
#
# Missing responses are kept as NaN rather than being mapped to 0: otherwise
# every non-response would be counted as "cannot cover" and the
# "drop rows with missing target" step later in the notebook could never
# remove them. The column is float (1.0 / 0.0 / NaN) for that reason.
responses = shed[target_column]
shed['can_cover_400'] = (responses == 1).astype(float).where(responses.notna())

# value_counts() and mean() both skip NaN, so these figures describe
# respondents who actually answered the question.
print(f"Target variable distribution:")
print(shed['can_cover_400'].value_counts())
print(f"\nPercentage who can cover: {shed['can_cover_400'].mean()*100:.1f}%")
print(f"Missing responses (left as NaN): {shed['can_cover_400'].isna().sum()}")

## Merge Datasets
Adding state-level economic indicators to individual-level SHED data

In [None]:
# Assuming SHED has a state column - update the column name if different.
# Common names: ppstaten, state, ppstate

# Lowercased, trimmed state name used as the join key against RPP.
shed['state_lower'] = shed['ppstaten'].str.lower().str.strip()  # UPDATE column name if needed

# Attach the state-level RPP indicators to every respondent.
# - how='left' keeps all SHED rows, matched or not.
# - validate='m:1' makes pandas raise MergeError if a state appears more than
#   once in `rpp`; without it a duplicate key would silently duplicate
#   respondents and inflate the dataset.
# - the temporary indicator column lets us count respondents with no RPP match
#   and is dropped again so the resulting columns are unchanged.
df = shed.merge(
    rpp,
    left_on='state_lower',
    right_on='state',
    how='left',
    validate='m:1',
    indicator='_rpp_merge',
)
unmatched_rows = int((df['_rpp_merge'] == 'left_only').sum())
df = df.drop(columns='_rpp_merge')

# TODO: Merge FRED minimum wage data
# This depends on how FRED data is structured - adjust as needed

print(f"Merged dataset shape: {df.shape}")
print(f"Missing values after merge: {df.isnull().sum().sum()}")
print(f"SHED rows without a matching RPP state: {unmatched_rows}")

## Select Features
Choosing relevant demographic and economic features

**Based on your KWK results, key features include:**
- pphhsize (household size)
- ppkid017 (children under 18)
- ppage (age)
- ppeduc (education)
- income variables
- race/ethnicity
- state economic indicators

In [None]:
# TODO: Update this list based on actual columns in your SHED dataset
# Check the codebook for exact column names

# Respondent-level demographics taken from SHED.
demographic_features = [
    'ppage',       # Age
    'pphhsize',    # Household size
    'ppkid017',    # Number of children under 18
    'ppeduc',      # Education level
    'ppethm',      # Race/ethnicity
    'ppgender',    # Gender
    'ppincimp',    # Income
    'ppwork',      # Employment status
]

# State-level economic indicators that come from the RPP merge.
state_indicator_features = [
    'pce_pct_change',
    'income_pct_change',
    'real_income_2023',
]

# Full candidate list, demographics first (add other relevant SHED columns
# to the lists above).
feature_columns = demographic_features + state_indicator_features

# Retain, in the order requested, only the candidates actually present in df.
available_features = []
for candidate in feature_columns:
    if candidate in df.columns:
        available_features.append(candidate)

missing_features = set(feature_columns) - set(available_features)
print(f"Available features: {len(available_features)} out of {len(feature_columns)}")
print(f"Missing features: {missing_features}")

# Feature matrix and target vector, copied so later edits do not touch df.
X = df.loc[:, available_features].copy()
y = df['can_cover_400'].copy()

## Handle Missing Values

In [None]:
print("Missing values before handling:")
print(X.isnull().sum())

# Drop rows with missing target. Explicit copies make X and y independent
# objects, so the assignments below never write into a view of `df`.
valid_indices = y.notna()
X = X.loc[valid_indices].copy()
y = y.loc[valid_indices].copy()

# Handle missing values in features.
# Strategy: fill with median for numeric columns, mode for everything else.
#
# Each column is reassigned (X[col] = ...) instead of calling
# X[col].fillna(..., inplace=True): the chained in-place form is deprecated
# and has no effect on X when pandas Copy-on-Write is active.
#
# NOTE(review): the medians/modes are computed on the full dataset, before the
# train/test split later in the notebook, so test rows influence the fill
# values. Consider fitting the imputation on the training split only.
for col in X.columns:
    column = X[col]
    if not column.isnull().any():
        continue  # nothing to fill

    if pd.api.types.is_numeric_dtype(column):
        # Covers every numeric width (int32, float32, ...), not only int64/float64.
        fill_value = column.median()
    else:
        modes = column.mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to fill with.
            # Leave it as-is; the summary below will report it.
            continue
        fill_value = modes.iloc[0]

    X[col] = column.fillna(fill_value)

print(f"\nDataset shape after cleaning: {X.shape}")
print(f"Missing values after handling: {X.isnull().sum().sum()}")

## Encode Categorical Variables

In [None]:
# Identify categorical columns: everything that is not already numeric or
# boolean. Selecting only 'object' would skip 'category' and string-dtype
# columns, which would then stay non-numeric despite the message printed below.
categorical_cols = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()

print(f"Categorical columns to encode: {categorical_cols}")

# Label encode categorical variables, keeping one fitted encoder per column so
# the integer codes can be mapped back to the original labels.
# NOTE(review): LabelEncoder assigns codes by sorted label order, which implies
# an ordering the categories may not have. Confirm that is acceptable for the
# downstream model.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Cast to str so mixed-type values are encoded consistently.
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

print(f"\nAll features are now numeric")
print(X.dtypes.value_counts())

## Train/Test Split

In [None]:
# 80/20 hold-out split. Stratifying on y keeps the class proportions the same
# in both partitions; the fixed seed makes the split repeatable.
split_options = dict(
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Partition sizes.
print(f"Training set: {X_train.shape[0]:,} samples")
print(f"Test set: {X_test.shape[0]:,} samples")

# Class shares per partition, to confirm the stratification worked.
train_class_share = y_train.value_counts(normalize=True)
test_class_share = y_test.value_counts(normalize=True)
print(f"\nClass distribution in train:")
print(train_class_share)
print(f"\nClass distribution in test:")
print(test_class_share)

## Feature Scaling

In [None]:
# Learn the scaling statistics from the training partition only, then apply
# that same fitted transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Features scaled using StandardScaler")
print(f"Training set shape: {X_train_scaled.shape}")
print(f"Test set shape: {X_test_scaled.shape}")

## Save Processed Data

In [None]:
# Save processed data for use in next notebooks
import os
import pickle

# Create the output directory first: np.save / open do not create missing
# parent directories, so on a fresh checkout this cell would otherwise fail.
os.makedirs('../data/processed', exist_ok=True)

# Scaled feature matrices and the matching target vectors.
np.save('../data/processed/X_train_scaled.npy', X_train_scaled)
np.save('../data/processed/X_test_scaled.npy', X_test_scaled)
np.save('../data/processed/y_train.npy', y_train)
np.save('../data/processed/y_test.npy', y_test)

# Save feature names and scaler. The scaled arrays no longer carry column
# labels, so the names are stored separately, in column order.
with open('../data/processed/feature_names.pkl', 'wb') as f:
    pickle.dump(X.columns.tolist(), f)
with open('../data/processed/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("Processed data saved to ../data/processed/")
print("\nSummary:")
print(f"- Features: {X_train_scaled.shape[1]}")
print(f"- Training samples: {X_train_scaled.shape[0]:,}")
print(f"- Test samples: {X_test_scaled.shape[0]:,}")
print(f"- Target: can_cover_400 (binary)")