In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix

# Load the merged dataset (path is relative to this notebook's directory).
df = pd.read_csv('../processed_data/merged_data_2013-2022.csv')
# Remove the saved index column. errors="ignore" keeps the cell runnable when
# the CSV was written without that column, matching the loader used further down.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Create meaningful features
# Sum of the four "unhealthy or worse" day counts.
df['total_unhealthy_days'] = (df['unhealthy_for_sensitive_groups_days'] +
                              df['unhealthy_days'] +
                              df['very_unhealthy_days'] +
                              df['hazardous_days'])

# A row with days_with_aqi == 0 would otherwise produce inf (or NaN for 0/0),
# which then contaminates the correlations below. Treat it as missing instead.
monitored_days = df['days_with_aqi'].replace(0, np.nan)
df['percent_good_days'] = (df['good_days'] / monitored_days) * 100
df['percent_unhealthy_days'] = (df['total_unhealthy_days'] / monitored_days) * 100

# Air quality severity categories
# Bins are right-closed: (0, 50], (50, 100], ... include_lowest=True closes the
# first bin on the left so that a median AQI of exactly 0 is labelled 'Good'
# rather than falling outside every bin and becoming NaN.
df['aqi_category'] = pd.cut(df['median_aqi'],
                           bins=[0, 50, 100, 150, 200, float('inf')],
                           labels=['Good', 'Moderate', 'Unhealthy for Sensitive', 'Unhealthy', 'Very Unhealthy'],
                           include_lowest=True)

# High asthma rate binary target (for logistic regression)
asthma_threshold = df['asthma_rate'].quantile(0.75)  # Top 25% of asthma rate per 10,000
# NOTE(review): a missing asthma_rate compares as False and is coded 0 here;
# confirm that is acceptable or drop those rows before building the target.
df['high_asthma'] = (df['asthma_rate'] > asthma_threshold).astype(int)

print(f"High asthma threshold: {asthma_threshold:.1f} cases per 10k")
# The sum counts rows of df, not distinct counties (a county can appear in
# several rows, as the per-county groupby later in this cell implies).
print(f"County-year rows with high asthma rates: {df['high_asthma'].sum()}")

####

# --- Q1: which air-quality summary metrics move with the asthma rate? ---
air_quality_cols = [
    'median_aqi',
    'max_aqi',
    '90th_percentile_aqi',
    'total_unhealthy_days',
    'percent_unhealthy_days',
]

# Build the correlation matrix over the metrics plus the target, then keep only
# the target's column (the target itself shows up with a correlation of 1.0).
aq_corr_matrix = df[air_quality_cols + ['asthma_rate']].corr()
correlations = aq_corr_matrix['asthma_rate'].sort_values(ascending=False)
print("Correlations with asthma rate:")
print(correlations)

# --- Q2: same question, per dominant-pollutant day counts ---
pollutant_days = ['days_co', 'days_no2', 'days_ozone', 'days_pm2.5', 'days_pm10']
pollutant_corr_matrix = df[pollutant_days + ['asthma_rate']].corr()
pollutant_corr = pollutant_corr_matrix['asthma_rate'].sort_values(ascending=False)
print("\nPollutant correlations:")
print(pollutant_corr)

# --- Q3: per-county averages, ranked by mean asthma rate (highest first) ---
county_mean_cols = ['median_aqi', 'asthma_rate', 'total_unhealthy_days']
county_stats = df.groupby('county').agg(dict.fromkeys(county_mean_cols, 'mean'))
county_stats = county_stats.sort_values('asthma_rate', ascending=False)

print("\nTop 10 counties by asthma rate:")
print(county_stats.head(10))

In [None]:
# `sm` is not imported by any earlier cell, so import it here; without this the
# cell raises NameError when the notebook is run top to bottom on a fresh kernel.
import statsmodels.api as sm

# Simple OLS of asthma_rate on median_aqi, using the `df` prepared in the cells above.
# Drop rows where either variable is missing so the fit sees complete cases only.
df = df.dropna(subset=['asthma_rate', 'median_aqi'])

# Predictor (with an added constant column for the intercept) and response.
X = sm.add_constant(df['median_aqi'])
y = df['asthma_rate']

# Fit the model using Ordinary Least Squares (OLS)
model = sm.OLS(y, X).fit()

# Get the summary of the model
print(model.summary())

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# 1. Load and dummy-encode
# Load into a separate name: rebinding `df` to the raw CSV here would discard
# the engineered columns (e.g. total_unhealthy_days) that later cells read
# from `df`, breaking a top-to-bottom run.
panel_df = pd.read_csv('../processed_data/merged_data_2013-2022.csv')
# LinearRegression does not accept NaN; keep rows where target and AQI are present.
panel_df = panel_df.dropna(subset=['asthma_rate', 'median_aqi'])
# get_dummies only expands object/category columns, so a numeric `year` stays a
# single column (a linear trend) rather than one indicator per year.
# NOTE(review): cast `year` to category first if per-year effects are intended.
X = pd.get_dummies(panel_df[['median_aqi', 'county', 'year']], drop_first=True)
y = panel_df['asthma_rate']

# 2. Cross-validated predictions
# Shuffled 5-fold split with a fixed seed; every row gets exactly one
# out-of-fold prediction.
lr = LinearRegression()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
y_pred = cross_val_predict(lr, X, y, cv=kf)

# 3. Compute CV RMSE
# Pooled over all out-of-fold predictions (not an average of per-fold RMSEs).
rmse = np.sqrt(mean_squared_error(y, y_pred))
print(f"Multiple-OLS CV RMSE: {rmse:.3f}")

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Regress the continuous asthma rate on a fixed set of air-quality features.
features = [
    'median_aqi',
    'max_aqi',
    'total_unhealthy_days',
    'days_pm2.5',
    'days_ozone',
    'percent_unhealthy_days',
]

X, y = df[features], df['asthma_rate']

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise with statistics learned from the training split only, then apply
# that same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# --- Multiple linear regression on the standardised features ---
lr_model = LinearRegression().fit(X_train_scaled, y_train)

# Score on the held-out rows.
y_pred = lr_model.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Linear Regression R²: {r2:.3f}")
print(f"RMSE: {rmse:.2f}")

# Rank features by coefficient magnitude (features were standardised before
# fitting, so the coefficients share a common scale).
feature_importance = (
    pd.DataFrame({'feature': features, 'coefficient': lr_model.coef_})
    .assign(abs_coefficient=lambda tbl: tbl['coefficient'].abs())
    .sort_values('abs_coefficient', ascending=False)
)

print("\nFeature Importance:")
print(feature_importance)

Linear Regression R²: -0.067
RMSE: 16.56

Feature Importance:
                  feature  coefficient  abs_coefficient
3              days_pm2.5    -6.208504         6.208504
0              median_aqi     5.929189         5.929189
5  percent_unhealthy_days    -5.322920         5.322920
4              days_ozone    -4.047424         4.047424
2    total_unhealthy_days     2.050874         2.050874
1                 max_aqi    -1.483063         1.483063


In [13]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest on the unscaled training split produced by the
# previous cell; the fixed seed keeps the forest reproducible.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Evaluate on the same held-out rows used for the linear model.
y_pred_rf = rf_model.predict(X_test)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest R²: {r2_rf:.3f}")

# Table of the forest's feature_importances_, largest first.
rf_importance = pd.DataFrame(
    {'feature': features, 'importance': rf_model.feature_importances_}
)
rf_importance = rf_importance.sort_values('importance', ascending=False)

print("\nRandom Forest Feature Importance:")
print(rf_importance)

Random Forest R²: 0.200

Random Forest Feature Importance:
                  feature  importance
1                 max_aqi    0.248335
4              days_ozone    0.208928
3              days_pm2.5    0.189073
0              median_aqi    0.161116
5  percent_unhealthy_days    0.109296
2    total_unhealthy_days    0.083251


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# KFold is needed for the shuffled splitter below; this cell's own import lines
# only bring in train_test_split and cross_val_score.
from sklearn.model_selection import KFold

# 1. Load and prepare your data
df = pd.read_csv('../processed_data/merged_data_2013-2022.csv')
# LinearRegression does not accept NaN; keep rows where target and AQI are present.
df = df.dropna(subset=['asthma_rate', 'median_aqi'])
# Only object/category columns are expanded into indicators by get_dummies;
# a numeric `year` remains a single column.
X = pd.get_dummies(df[['median_aqi', 'county', 'year']], drop_first=True)
y = df['asthma_rate']

# 2. Out-of-sample test (train/test split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = LinearRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Compute RMSE manually for compatibility
mse_test = mean_squared_error(y_test, preds)
rmse_test = np.sqrt(mse_test)
print(f"Test RMSE: {rmse_test:.3f}")

# 3. K-fold cross-validation (5 folds)
# An integer `cv` gives contiguous, unshuffled folds. If the file is ordered by
# county or year, whole groups land in a single fold and the scores stop being
# comparable with the random split above. Use an explicit shuffled, seeded
# splitter (same settings as the earlier cross_val_predict cell).
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)
# Use negative MSE scoring, then convert to RMSE
cv_neg_mse = cross_val_score(
    LinearRegression(), X, y,
    cv=cv_splitter, scoring='neg_mean_squared_error'
)
mse_cv = -cv_neg_mse
rmse_cv = np.sqrt(mse_cv)
print(f"CV RMSE scores: {rmse_cv.round(3)}")
print(f"Mean CV RMSE: {rmse_cv.mean():.3f}")

In [None]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# load the merged data you already produced (dropping the saved index column if present)
df = pd.read_csv("../processed_data/merged_data_2013-2022.csv").drop(columns=["Unnamed: 0"], errors="ignore")

# Fit all three models on the same complete-case sample. Without this, the
# formula-based fits drop rows with missing values on their own, the
# array-based GLM does not, and the `>` comparison below would code a missing
# rate as 0 ("not high").
df = df.dropna(subset=["asthma_rate", "median_aqi"])

# 1) OLS
ols = smf.ols("asthma_rate ~ median_aqi", data=df).fit()
print("OLS:\n", ols.summary(), "\n")

# 2) Poisson
# NOTE(review): asthma_rate looks like a rate rather than a raw count; confirm
# a Poisson family (possibly with an exposure/offset) is the intended model.
X = sm.add_constant(df["median_aqi"])
y = df["asthma_rate"]
pois = sm.GLM(y, X, family=sm.families.Poisson()).fit()
print("Poisson:\n", pois.summary(), "\n")

# 3) Logistic
# Binary target: "high" if the rate is above the median rate.
df["high_asthma"] = (df["asthma_rate"] > df["asthma_rate"].median()).astype(int)
logit = smf.logit("high_asthma ~ median_aqi", data=df).fit()
print("Logit:\n", logit.summary())