In [3]:
import pandas as pd

In [7]:
df = pd.read_csv('../processed_data/merged_data_2013-2022.csv')
df = df.drop(columns=["Unnamed: 0"])

In [13]:
import statsmodels.formula.api as smf

mod = smf.ols('asthma_rate ~ median_aqi + C(county) + C(year)', data=df).fit()
mod.summary()

0,1,2,3
Dep. Variable:,asthma_rate,R-squared:,0.876
Model:,OLS,Adj. R-squared:,0.859
Method:,Least Squares,F-statistic:,52.9
Date:,"Mon, 23 Jun 2025",Prob (F-statistic):,6.4e-174
Time:,04:35:01,Log-Likelihood:,-1677.4
No. Observations:,529,AIC:,3481.0
Df Residuals:,466,BIC:,3750.0
Df Model:,62,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,45.8562,3.917,11.706,0.000,38.159,53.554
C(county)[T.Alpine],-5.075e-13,7.88e-14,-6.443,0.000,-6.62e-13,-3.53e-13
C(county)[T.Amador],5.4041,2.882,1.875,0.061,-0.259,11.068
C(county)[T.Butte],-11.2520,2.751,-4.090,0.000,-16.657,-5.847
C(county)[T.Calaveras],-0.9634,2.760,-0.349,0.727,-6.387,4.460
C(county)[T.Colusa],-10.1819,2.770,-3.676,0.000,-15.625,-4.739
C(county)[T.Contra Costa],3.4170,2.753,1.241,0.215,-1.992,8.826
C(county)[T.Del Norte],6.9385,3.170,2.189,0.029,0.710,13.167
C(county)[T.El Dorado],-14.9365,2.799,-5.336,0.000,-20.437,-9.436

0,1,2,3
Omnibus:,29.893,Durbin-Watson:,2.073
Prob(Omnibus):,0.0,Jarque-Bera (JB):,55.098
Skew:,0.367,Prob(JB):,1.09e-12
Kurtosis:,4.401,Cond. No.,9.16e+17


In [14]:
import statsmodels.api as sm

# "only aqi matters" ~ simple regression
# 3) Fit an OLS model: asthma_rate ~ median_aqi
model = smf.ols("asthma_rate ~ median_aqi", data=df).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:            asthma_rate   R-squared:                       0.012
Model:                            OLS   Adj. R-squared:                  0.010
Method:                 Least Squares   F-statistic:                     6.446
Date:                Mon, 23 Jun 2025   Prob (F-statistic):             0.0114
Time:                        04:35:02   Log-Likelihood:                -2225.4
No. Observations:                 529   AIC:                             4455.
Df Residuals:                     527   BIC:                             4463.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     36.3234      2.554     14.221      0.0

In [None]:
# Create meaningful features
df['total_unhealthy_days'] = (df['unhealthy_for_sensitive_groups_days'] + 
                              df['unhealthy_days'] + 
                              df['very_unhealthy_days'] + 
                              df['hazardous_days'])

df['percent_good_days'] = (df['good_days'] / df['days_with_aqi']) * 100
df['percent_unhealthy_days'] = (df['total_unhealthy_days'] / df['days_with_aqi']) * 100

# Air quality severity categories
df['aqi_category'] = pd.cut(df['median_aqi'], 
                           bins=[0, 50, 100, 150, 200, float('inf')],
                           labels=['Good', 'Moderate', 'Unhealthy for Sensitive', 'Unhealthy', 'Very Unhealthy'])

# High asthma rate binary target (for logistic regression)
asthma_threshold = df['asthma_rate'].quantile(0.75)  # Top 25% of asthma rate per 10,000
df['high_asthma'] = (df['asthma_rate'] > asthma_threshold).astype(int)

print(f"High asthma threshold: {asthma_threshold:.1f} cases per 10k")
print(f"Counties with high asthma rates: {df['high_asthma'].sum()}")

In [None]:
# 3. County and temporal patterns
county_stats = df.groupby('county').agg({
    'median_aqi': 'mean',
    'asthma_rate': 'mean',
    'total_unhealthy_days': 'mean'
}).sort_values('asthma_rate', ascending=False)

print("\nTop 10 counties by asthma rate:")
print(county_stats.head(10))