In [189]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import StackingRegressor


In [32]:
elec = pd.read_csv('../output/charging-stations-population.csv', index_col='Unnamed: 0')
aq = pd.read_csv('../output/air_quality_by_state.csv')

In [33]:
elec.isnull().sum()

state              0
year               0
population         0
biodiesel          0
cng                0
e85                0
electric           0
hydrogen           0
lng                0
propane            0
total              0
electric_by_pop    0
dtype: int64

In [34]:
elec.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 720 entries, 0 to 719
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   state            720 non-null    object 
 1   year             720 non-null    int64  
 2   population       720 non-null    float64
 3   biodiesel        720 non-null    float64
 4   cng              720 non-null    float64
 5   e85              720 non-null    float64
 6   electric         720 non-null    float64
 7   hydrogen         720 non-null    float64
 8   lng              720 non-null    float64
 9   propane          720 non-null    float64
 10  total            720 non-null    float64
 11  electric_by_pop  720 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 73.1+ KB


In [35]:
aq = aq[aq.year > 2007]

In [40]:
aq = aq.replace('Alaska', np.nan).dropna()
aq = aq.replace('Hawaii', np.nan).dropna()
aq = aq.replace('District Of Columbia', np.nan).dropna()
aq = aq.replace('Virgin Islands', np.nan).dropna()


In [41]:
aq.shape, elec.shape

((720, 18), (720, 12))

In [48]:
elec = elec[['state', 'year', 'electric_by_pop']]

In [49]:
df = aq.merge(elec, on=['state', 'year'])


In [47]:
elec.columns

Index(['state', 'year', 'population', 'biodiesel', 'cng', 'e85', 'electric',
       'hydrogen', 'lng', 'propane', 'total', 'electric_by_pop'],
      dtype='object')

In [50]:
df

Unnamed: 0,state,year,days_with_aqi,max_aqi,90th_percentile_aqi,median_aqi,pct_good_days,pct_moderate_days,pct_unhealthy_for_sensitive_groups_days,pct_unhealthy_days,pct_very_unhealthy_days,pct_hazardous_days,pct_days_co,pct_days_no2,pct_days_ozone,pct_days_pm2.5,pct_days_pm10,pct_bad_days,electric_by_pop
0,Alabama,2008,267.052632,114.263158,68.421053,44.315789,0.649062,0.339415,0.010577,0.000946,0.000000,0.000000,0.003020,0.000000,0.474684,0.501658,0.020638,0.350938,0.000000
1,Alabama,2009,272.368421,98.894737,59.052632,39.473684,0.752995,0.244101,0.002904,0.000000,0.000000,0.000000,0.000577,0.000000,0.435996,0.542251,0.021177,0.247005,0.000000
2,Alabama,2010,282.263158,112.263158,70.315789,46.157895,0.606986,0.379970,0.012089,0.000955,0.000000,0.000000,0.000144,0.000000,0.425478,0.549293,0.025085,0.393014,0.000000
3,Alabama,2011,248.526316,111.526316,69.421053,42.789474,0.695727,0.290738,0.012484,0.001052,0.000000,0.000000,0.000288,0.000000,0.566842,0.418499,0.014370,0.304273,0.000000
4,Alabama,2012,263.588235,110.235294,62.176471,40.117647,0.777177,0.211164,0.011017,0.000643,0.000000,0.000000,0.000000,0.000000,0.597273,0.392065,0.010662,0.222823,0.008306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,Wyoming,2018,329.666667,106.500000,58.611111,39.611111,0.789477,0.206487,0.003579,0.000457,0.000000,0.000000,0.000000,0.007854,0.779945,0.047889,0.164312,0.210523,2.094872
716,Wyoming,2019,343.888889,98.000000,51.944444,38.833333,0.871004,0.125178,0.003362,0.000457,0.000000,0.000000,0.000000,0.004811,0.818655,0.018977,0.157558,0.128996,2.747257
717,Wyoming,2020,338.352941,131.705882,50.352941,34.882353,0.876343,0.113637,0.007989,0.001768,0.000263,0.000000,0.000000,0.061075,0.654444,0.079556,0.204924,0.123657,2.977814
718,Wyoming,2021,349.941176,128.823529,65.235294,36.294118,0.783710,0.199038,0.016930,0.000161,0.000000,0.000161,0.000323,0.061565,0.693599,0.089626,0.154887,0.216290,3.175244


In [131]:
to_drop = [
    'days_with_aqi',
    'pct_hazardous_days',
    'pct_very_unhealthy_days'
]

In [133]:
df.drop(columns = to_drop, inplace=True)

In [134]:
X = df.drop(columns = ['state', 'year', 'electric_by_pop'])
y = df['electric_by_pop']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [135]:
knn = KNeighborsRegressor()

knn.fit(X_train, y_train)
knn.score(X_train, y_train), knn.score(X_test, y_test)

(0.31853011368664663, 0.14769635659668934)

In [136]:
lr = LinearRegression()

lr.fit(X_train, y_train)
lr.score(X_train, y_train), knn.score(X_test, y_test)

(0.1573054640072119, 0.14769635659668934)

In [137]:
ss = StandardScaler()

X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [138]:
knn = KNeighborsRegressor()

knn.fit(X_train_sc, y_train)
knn.score(X_train_sc, y_train), knn.score(X_test_sc, y_test)

(0.519455243709505, 0.37493153993800754)

In [165]:
gs_knn = GridSearchCV(
    knn,
    param_grid={
        'n_neighbors' : [3,5, 6,10,15],
        'weights' : ['uniform'],
        'p' : [1,2,3,4,5,6,7]
    },
    cv = 3,
    verbose = 1,
    n_jobs = -1
)

In [166]:
gs_knn.fit(X_train_sc, y_train)

Fitting 3 folds for each of 35 candidates, totalling 105 fits


In [167]:
gs_knn.best_estimator_

In [168]:
gs_knn.best_estimator_.fit(X_train_sc, y_train)

In [169]:
gs_knn.best_estimator_.score(X_train_sc, y_train)

0.49388970830995693

In [170]:
gs_knn.best_estimator_.score(X_test_sc, y_test)

0.42431497153541864

In [171]:
poly = PolynomialFeatures(degree = 3, interaction_only=True)
X_train_poly = poly.fit_transform(X_train_sc)
X_test_poly = poly.transform(X_test_sc)

In [172]:
gs_knn.fit(X_train_poly, y_train)

Fitting 3 folds for each of 35 candidates, totalling 105 fits


In [173]:
gs_knn.best_estimator_

In [174]:
gs_knn.best_estimator_.score(X_train_poly, y_train)

0.48181823232434773

In [175]:
gs_knn.best_estimator_.score(X_test_poly, y_test)

0.36528104505596126

In [176]:
lr.fit(X_train_poly, y_train)

In [177]:
lr.score(X_train_poly, y_train)

0.8070642995835334

In [178]:
lr.score(X_test_poly, y_test)

-128.94805721513794

In [186]:
ridge = RidgeCV()

ridge.fit(X_train_poly, y_train)

In [187]:
ridge.score(X_train_poly, y_train)

0.5813655346818316

In [188]:
ridge.score(X_test_poly, y_test)

-0.16359183688082335

In [202]:
stack = StackingRegressor(
    [('ridge', RidgeCV()),
    ('lr', LinearRegression())],
    final_estimator=gs_knn.best_estimator_
)

In [203]:
stack.fit(X_train_poly, y_train)

In [204]:
stack.score(X_train_poly, y_train)

0.47090187015876306

In [205]:
stack.score(X_test_poly, y_test)

-0.025891194838093545