# Data Exploration
Initial exploration of the three raw datasets used in this project: SHED 2024 (Survey of Household Economics and Decisionmaking), Regional Price Parities (RPP), and FRED minimum wage data.

In [None]:
!pip install pandas numpy scikit-learn matplotlib seaborn plotly imbalanced-learn shap openpyxl -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import warnings

# Silence library warnings so the notebook output stays readable.
# NOTE(review): this hides every warning category, including deprecations.
warnings.filterwarnings('ignore')

# Fixed seed so NumPy-based randomness in this notebook is reproducible.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*".
# Fall back to the legacy name so the notebook also runs on older releases;
# if neither name exists, plt.style.use() raises as it did before.
PLOT_STYLE = 'seaborn-v0_8-darkgrid'
if PLOT_STYLE not in plt.style.available:
    PLOT_STYLE = 'seaborn-darkgrid'
plt.style.use(PLOT_STYLE)
sns.set_palette("husl")

print("All libraries imported successfully")

## Load Datasets

In [None]:
# Load datasets
# All raw inputs live in one directory; keep the location in a single place.
from pathlib import Path

RAW_DATA_DIR = Path("../data/raw")

fred = pd.read_csv(RAW_DATA_DIR / "fredgraph.csv")
# skiprows=5 drops the first five rows of the sheet before parsing
# (presumably title/banner rows -- confirm against the workbook).
rpp = pd.read_excel(RAW_DATA_DIR / "rpp1224.xlsx", skiprows=5)
shed = pd.read_csv(RAW_DATA_DIR / "public2024.csv")

# Clean RPP data
RPP_COLUMNS = ['state', 'real_pce_2022', 'real_pce_2023', 'pce_pct_change',
               'real_income_2022', 'real_income_2023', 'income_pct_change']
# Fail with an explicit message if the sheet layout is not the expected one,
# instead of pandas' generic length-mismatch error.
if rpp.shape[1] != len(RPP_COLUMNS):
    raise ValueError(
        f"Expected {len(RPP_COLUMNS)} columns in the RPP sheet, "
        f"found {rpp.shape[1]}: {rpp.columns.tolist()}"
    )
rpp.columns = RPP_COLUMNS

# Normalise the label column so it can be compared case-insensitively.
rpp['state'] = rpp['state'].str.lower().str.strip()

# Drop the national aggregate and rows without a usable label.
is_excluded_label = rpp['state'].isna() | rpp['state'].isin(['united states', 'nan', ''])
# Drop rows that carry a label but no data at all (blank separators, notes):
# they are not state records and would inflate the state count downstream.
has_any_value = rpp[RPP_COLUMNS[1:]].notna().any(axis=1)
rpp = rpp[~is_excluded_label & has_any_value].copy()

## Basic Dataset Information

In [None]:
# Overview of the FRED table: dimensions, row count and column names.
fred_row_count = len(fred)
fred_column_names = fred.columns.tolist()

print("FRED (Minimum Wage Data)")
print("Dataset shape: {}".format(fred.shape))
print("Total samples: {:,}".format(fred_row_count))
print("\nColumns: {}".format(fred_column_names))

In [None]:
# Preview the first rows of the FRED table (the notebook renders the returned frame).
fred.head()

In [None]:
# Print a concise summary of the FRED table (columns, dtypes, non-null counts).
fred.info()

In [None]:
# Overview of the SHED table: dimensions, row count, and a sample of column names.
shed_row_count = len(shed)
shed_feature_count = shed.shape[1]
shed_leading_columns = shed.columns[:20].tolist()

print("SHED 2024 (Survey of Household Economics and Decisionmaking)")
print("Dataset shape: {}".format(shed.shape))
print("Total samples: {:,}".format(shed_row_count))
print("\nColumns: {} features".format(shed_feature_count))
print("\nFirst 20 columns: {}".format(shed_leading_columns))

In [None]:
# Preview the first rows of the SHED table (the notebook renders the returned frame).
shed.head()

In [None]:
# Print a concise summary of the SHED table (columns, dtypes, non-null counts).
shed.info()

In [None]:
# Overview of the cleaned RPP table: dimensions, row count and column names.
rpp_row_count = len(rpp)
rpp_column_names = rpp.columns.tolist()

print("RPP (Regional Price Parities)")
print("Dataset shape: {}".format(rpp.shape))
print("Total samples: {:,}".format(rpp_row_count))
print("\nColumns: {}".format(rpp_column_names))

In [None]:
# Preview the first rows of the cleaned RPP table (the notebook renders the returned frame).
rpp.head()

In [None]:
# Print a concise summary of the cleaned RPP table (columns, dtypes, non-null counts).
rpp.info()

## Statistical Summaries

In [None]:
# Descriptive statistics for the FRED table (the notebook renders the returned frame).
fred.describe()

In [None]:
# Descriptive statistics for the SHED table (the notebook renders the returned frame).
shed.describe()

In [None]:
# Descriptive statistics for the cleaned RPP table (the notebook renders the returned frame).
rpp.describe()

## Data Visualizations

### Target Variable Distribution

In [None]:
# TODO: Update 'EF3' with actual column name for $400 emergency question
target_column = 'EF3'  # UPDATE THIS

# Bar chart plus percentage table of the target responses; if the column is
# absent, point the reader at the codebook instead.
if target_column not in shed.columns:
    print(f"Column '{target_column}' not found. Check SHED codebook for correct column name.")
else:
    target_responses = shed[target_column]

    fig, ax = plt.subplots(figsize=(10, 6))
    target_responses.value_counts().plot.bar(color='#3498db', ax=ax)
    ax.set_title('Can Cover $400 Emergency Expense Distribution')
    ax.set_xlabel('Response')
    ax.set_ylabel('Count')
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    plt.show()

    print("\nPercentage distribution:")
    print(target_responses.value_counts(normalize=True) * 100)

### Demographic Distributions

In [None]:
# Age distribution: histogram (left panel) and box plot (right panel).
if 'ppage' in shed.columns:
    ages = shed['ppage']
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

    ages.hist(bins=30, color='#2ecc71', edgecolor='black', ax=hist_ax)
    hist_ax.set_title('Age Distribution')
    hist_ax.set_xlabel('Age')
    hist_ax.set_ylabel('Frequency')

    ages.plot.box(color='#2ecc71', ax=box_ax)
    box_ax.set_title('Age Box Plot')
    box_ax.set_ylabel('Age')

    fig.tight_layout()
    plt.show()

In [None]:
# Household size distribution, with bars ordered by household size.
if 'pphhsize' in shed.columns:
    household_size_counts = shed['pphhsize'].value_counts().sort_index()

    fig, ax = plt.subplots(figsize=(10, 6))
    household_size_counts.plot.bar(color='#e74c3c', ax=ax)
    ax.set_title('Household Size Distribution')
    ax.set_xlabel('Household Size')
    ax.set_ylabel('Count')
    fig.tight_layout()
    plt.show()

In [None]:
# Education level distribution, with bars ordered by frequency.
if 'ppeduc' in shed.columns:
    education_counts = shed['ppeduc'].value_counts()

    fig, ax = plt.subplots(figsize=(10, 6))
    education_counts.plot.bar(color='#9b59b6', ax=ax)
    ax.set_title('Education Level Distribution')
    ax.set_xlabel('Education Level')
    ax.set_ylabel('Count')
    # Slanted, right-aligned labels keep long category names readable.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

### Geographic Analysis

In [None]:
# State distribution in SHED data
if 'ppstaten' in shed.columns:
    # value_counts() sorts by frequency (descending), so head(20) holds the
    # 20 most-sampled states.
    state_counts = shed['ppstaten'].value_counts().head(20)

    plt.figure(figsize=(12, 8))
    state_counts.plot(kind='barh', color='#3498db')
    plt.title('Top 20 States by Sample Size')
    plt.xlabel('Count')
    plt.ylabel('State')
    # barh draws the first row at the bottom; flip the axis so the largest
    # state is at the top, consistent with the RPP bar charts in this notebook.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

In [None]:
# Regional Price Parities visualization
# Two side-by-side horizontal bar charts: the 15 highest rows by 2023 real
# income and by 2023 real PCE, each with the largest value at the top.
top_income = rpp.nlargest(15, 'real_income_2023')
top_pce = rpp.nlargest(15, 'real_pce_2023')

rpp_panels = [
    (top_income, 'real_income_2023', 'Real Income 2023', 'Top 15 States by Real Income', '#2ecc71'),
    (top_pce, 'real_pce_2023', 'Real PCE 2023', 'Top 15 States by Personal Consumption', '#e74c3c'),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for ax, (ranked, value_column, x_label, panel_title, bar_color) in zip(axes, rpp_panels):
    bar_positions = range(len(ranked))
    ax.barh(bar_positions, ranked[value_column], color=bar_color)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(ranked['state'])
    ax.set_xlabel(x_label)
    ax.set_title(panel_title)
    ax.invert_yaxis()

fig.tight_layout()
plt.show()

### Correlation Analysis

In [None]:
# Select numeric columns for correlation analysis
numeric_cols = shed.select_dtypes(include=[np.number]).columns.tolist()

# Calculate correlation with target variable (if available)
if target_column in numeric_cols:
    correlations = (
        shed[numeric_cols].corr()[target_column]
        # The target's correlation with itself carries no information.
        .drop(labels=target_column)
        # Undefined correlations (e.g. constant columns) are NaN; sort_values
        # would place them last, where they would masquerade as the "most
        # negative" entries in tail().
        .dropna()
        .sort_values(ascending=False)
    )

    if correlations.empty:
        print(f"No numeric features with a defined correlation to '{target_column}'.")
    else:
        # Plot top correlations
        n_each_side = 15
        if len(correlations) > 2 * n_each_side:
            top_corr = pd.concat([correlations.head(n_each_side),
                                  correlations.tail(n_each_side)])
        else:
            # Too few features for distinct head/tail slices: show them all
            # rather than plotting the overlapping rows twice.
            top_corr = correlations

        colors = ['#2ecc71' if x > 0 else '#e74c3c' for x in top_corr]
        plt.figure(figsize=(12, 8))
        plt.barh(range(len(top_corr)), top_corr, color=colors)
        plt.yticks(range(len(top_corr)), top_corr.index)
        plt.xlabel('Correlation with Target')
        plt.title('Top Positive and Negative Correlations with $400 Emergency Coverage')
        plt.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
        # Most positive correlation at the top, most negative at the bottom.
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.show()

        print("\nTop 10 Positive Correlations:")
        print(correlations.head(10))
        print("\nTop 10 Negative Correlations:")
        print(correlations.tail(10))
else:
    # Say why nothing was produced instead of silently skipping the cell.
    print(f"Column '{target_column}' is missing or not numeric; skipping correlation analysis.")

### Missing Data Analysis

In [None]:
# Calculate missing percentages
missing_pct = (shed.isnull().sum() / len(shed)) * 100
# Keep only columns with at least one missing value, worst first.
missing_pct = missing_pct[missing_pct > 0].sort_values(ascending=False)

if not missing_pct.empty:
    plt.figure(figsize=(12, 8))
    missing_pct.head(20).plot(kind='barh', color='#e67e22')
    plt.xlabel('Percentage Missing')
    plt.title('Top 20 Features with Missing Data')
    # barh draws the first row at the bottom; flip the axis so the feature
    # with the most missing data is at the top, consistent with the RPP bar
    # charts in this notebook.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

    print(f"\nTotal features with missing data: {len(missing_pct)}")
    print("\nTop 10 features with most missing data:")
    print(missing_pct.head(10))
else:
    print("No missing data found in SHED dataset")

## Exploration Summary

The cell below prints headline counts for each dataset. Key findings from this exploration will inform preprocessing and feature selection.

In [None]:
# Print a plain-text recap of the three datasets, framed by horizontal rules.
banner_rule = "=" * 80

print(banner_rule)
print("DATA EXPLORATION SUMMARY")
print(banner_rule)

# SHED: size of the survey table and a split of its columns by dtype.
print("\nSHED Dataset:")
print("  - Total respondents: {:,}".format(len(shed)))
print("  - Total features: {}".format(shed.shape[1]))
print("  - Numeric features: {}".format(shed.select_dtypes(include=[np.number]).shape[1]))
print("  - Categorical features: {}".format(shed.select_dtypes(include=['object']).shape[1]))

# RPP: row count of the cleaned table and mean year-over-year changes.
print("\nRPP Dataset:")
print("  - States covered: {}".format(len(rpp)))
print("  - Average income change: {:.2f}%".format(rpp['income_pct_change'].mean()))
print("  - Average PCE change: {:.2f}%".format(rpp['pce_pct_change'].mean()))

# FRED: table dimensions only.
print("\nFRED Dataset:")
print("  - Records: {:,}".format(len(fred)))
print("  - Features: {}".format(fred.shape[1]))

print("\n" + banner_rule)
print("Ready to proceed to preprocessing notebook")
print(banner_rule)