In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.neighbors import KNeighborsRegressor

from sklearn.linear_model import LinearRegression, LassoCV
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, AdaBoostRegressor, BaggingClassifier, BaggingRegressor, StackingRegressor

In [2]:
df = pd.read_csv('../data/epa_final.csv')

In [3]:
df.head()

Unnamed: 0,Date Local,Latitude,Longitude,PM2.5 Monthly Mean,lat_2dec,lon_2dec,no2,aod,temp,precip
0,2019-01-31,30.497478,-87.880258,6.825,30.5,-87.88,1541665000000000.0,0.052667,290.88,3.993249
1,2019-01-31,38.013333,-87.577222,9.86,30.5,-87.88,1120693000000000.0,0.03525,290.56,4.091899
2,2019-01-31,41.874999,-90.177574,10.29,30.5,-87.88,1374970000000000.0,0.2015,288.74,4.167067
3,2019-01-31,38.013333,-87.577222,7.222581,30.5,-87.88,1120693000000000.0,0.03525,290.56,4.091899
4,2019-01-31,41.1644,-81.2352,8.7625,30.5,-87.88,1421505000000000.0,0.03525,290.56,4.091899


In [4]:
X = df.drop(columns = ['Date Local','PM2.5 Monthly Mean', 'lat_2dec', 'lon_2dec'])
y = df['PM2.5 Monthly Mean']

In [5]:
X.head()

Unnamed: 0,Latitude,Longitude,no2,aod,temp,precip
0,30.497478,-87.880258,1541665000000000.0,0.052667,290.88,3.993249
1,38.013333,-87.577222,1120693000000000.0,0.03525,290.56,4.091899
2,41.874999,-90.177574,1374970000000000.0,0.2015,288.74,4.167067
3,38.013333,-87.577222,1120693000000000.0,0.03525,290.56,4.091899
4,41.1644,-81.2352,1421505000000000.0,0.03525,290.56,4.091899


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [7]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [8]:
lr = LinearRegression()
lr.fit(X_train_sc, y_train)
lr_train_pred = lr.predict(X_train_sc)
lr_test_pred = lr.predict(X_test_sc)
print(f'LR training score: {lr.score(X_train_sc,y_train)}')
print(f'LR test score: {lr.score(X_test_sc,y_test)}')

LR training score: 0.07870805165936012
LR test score: 0.0762504865919128


In [9]:
# Decision tree model:
dt = DecisionTreeRegressor()
dt.fit(X_train_sc, y_train)
dt_train_pred = dt.predict(X_train_sc)
dt_test_pred = dt.predict(X_test_sc)
print(f'Decision Tree training score: {dt.score(X_train_sc,y_train)}')
print(f'Decision Tree test score: {dt.score(X_test_sc,y_test)}')

Decision Tree training score: 0.9656854220585229
Decision Tree test score: 0.5337269533730442


In [10]:
bag = BaggingRegressor()
bag.fit(X_train_sc, y_train)
bag_train_pred = bag.predict(X_train_sc)
bag_test_pred = bag.predict(X_test_sc)
print(f'Bagged DT training score: {bag.score(X_train_sc,y_train)}')
print(f'Bagged DT test score: {bag.score(X_test_sc,y_test)}')

Bagged DT training score: 0.9205668292225033
Bagged DT test score: 0.6779221421893081


In [11]:
rf = RandomForestRegressor()
rf.fit(X_train_sc, y_train)
rf_train_pred = rf.predict(X_train_sc)
rf_test_pred = rf.predict(X_test_sc)
print(f'Random forest training score: {rf.score(X_train_sc,y_train)}')
print(f'Random forest test score: {rf.score(X_test_sc,y_test)}')

Random forest training score: 0.9353196832649067
Random forest test score: 0.7056661862851011


In [12]:
ada = AdaBoostRegressor(base_estimator=RandomForestRegressor())
ada.fit(X_train_sc, y_train)
ada_train_pred = ada.predict(X_train_sc)
ada_test_pred = ada.predict(X_test_sc)
print(f'Adaboost training score: {ada.score(X_train_sc,y_train)}')
print(f'Adaboost test score: {ada.score(X_test_sc,y_test)}')

Adaboost training score: 0.8643759438172689
Adaboost test score: 0.6813568958161513


In [13]:
X.shape

(13829, 6)

In [14]:
# Create the model
level1_models = [
    ('bag', BaggingRegressor()),
    ('knn_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('knn', KNeighborsRegressor())
    ])),
    ('lasso_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('lasso', LassoCV())
    ]))
]

stack = StackingRegressor(estimators=level1_models,
                         final_estimator=RandomForestRegressor())

In [15]:
# Fit
stack.fit(X_train_sc,y_train)

StackingRegressor(estimators=[('bag', BaggingRegressor()),
                              ('knn_pipe',
                               Pipeline(steps=[('ss', StandardScaler()),
                                               ('knn',
                                                KNeighborsRegressor())])),
                              ('lasso_pipe',
                               Pipeline(steps=[('ss', StandardScaler()),
                                               ('lasso', LassoCV())]))],
                  final_estimator=RandomForestRegressor())

In [16]:
# Train score
stack.score(X_train_sc, y_train)

0.8534422133895596

In [17]:
# Test score
stack.score(X_test_sc, y_test)

0.6462665016602321

In [18]:
level2_models = [
    ('bag', BaggingRegressor()),
    ('ada_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('ada', AdaBoostRegressor(base_estimator=RandomForestRegressor()))
    ])),
    ('lasso_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('lasso', LassoCV())
    ]))
]

stack1 = StackingRegressor(estimators=level2_models,
                         final_estimator=RandomForestRegressor())

In [19]:
# Fit
stack1.fit(X_train_sc,y_train)

StackingRegressor(estimators=[('bag', BaggingRegressor()),
                              ('ada_pipe',
                               Pipeline(steps=[('ss', StandardScaler()),
                                               ('ada',
                                                AdaBoostRegressor(base_estimator=RandomForestRegressor()))])),
                              ('lasso_pipe',
                               Pipeline(steps=[('ss', StandardScaler()),
                                               ('lasso', LassoCV())]))],
                  final_estimator=RandomForestRegressor())

In [20]:
# Train score
stack1.score(X_train_sc, y_train)

0.8623705337629469

In [21]:
# Test score
stack1.score(X_test_sc, y_test)

0.6503793631585513

In [22]:
level3_models = [
    ('bag', BaggingRegressor()),
    ('lasso_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('lasso', LassoCV())
    ]))
]

stack2 = StackingRegressor(estimators=level3_models,
                         final_estimator=RandomForestRegressor())

In [23]:
# Fit
stack2.fit(X_train_sc,y_train)

StackingRegressor(estimators=[('bag', BaggingRegressor()),
                              ('lasso_pipe',
                               Pipeline(steps=[('ss', StandardScaler()),
                                               ('lasso', LassoCV())]))],
                  final_estimator=RandomForestRegressor())

In [24]:
# Train score
stack2.score(X_train_sc, y_train)

0.8356019077920918

In [25]:
# Test score
stack2.score(X_test_sc, y_test)

0.6088294577777917

In [26]:
level4_models = [
    ('bag', BaggingRegressor()),
    ('dt_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('rf', RandomForestRegressor())
    ])),
    ('lasso_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('lasso', LassoCV())
    ]))
]

stack3 = StackingRegressor(estimators=level4_models)

In [27]:
# Fit
stack3.fit(X_train_sc,y_train)

StackingRegressor(estimators=[('bag', BaggingRegressor()),
                              ('dt_pipe',
                               Pipeline(steps=[('ss', StandardScaler()),
                                               ('rf',
                                                RandomForestRegressor())])),
                              ('lasso_pipe',
                               Pipeline(steps=[('ss', StandardScaler()),
                                               ('lasso', LassoCV())]))])

In [28]:
# Train score
stack3.score(X_train_sc, y_train)

0.9319131589005882

In [29]:
# Test score
stack3.score(X_test_sc, y_test)

0.7089475794160607

In [30]:
level5_models = [
    ('bag', BaggingRegressor()),
    ('rf_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('rf', RandomForestRegressor())
    ])),
    ('lasso_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('lasso', LassoCV())
    ]))
]

stack4 = StackingRegressor(estimators=level5_models)

In [31]:
params = {
    'rf_pipe__rf__n_estimators': [92],
    'lasso_pipe__lasso__n_alphas': [105],
    'lasso_pipe__lasso__max_iter': [925],
    'bag__n_estimators': [10],
    'rf_pipe__rf__min_samples_split': [5]
    
}

In [32]:
gs = GridSearchCV(estimator = stack4, param_grid = params, cv = 5)

In [33]:
gs.fit(X_train_sc,y_train)

  w = ((singvals_sq + alpha) ** -1) - (alpha ** -1)
  w = ((singvals_sq + alpha) ** -1) - (alpha ** -1)


GridSearchCV(cv=5,
             estimator=StackingRegressor(estimators=[('bag',
                                                      BaggingRegressor()),
                                                     ('rf_pipe',
                                                      Pipeline(steps=[('ss',
                                                                       StandardScaler()),
                                                                      ('rf',
                                                                       RandomForestRegressor())])),
                                                     ('lasso_pipe',
                                                      Pipeline(steps=[('ss',
                                                                       StandardScaler()),
                                                                      ('lasso',
                                                                       LassoCV())]))]),
             param_grid={'bag__n_

In [34]:
gs.best_score_

0.6816816476878576

In [35]:
gs.best_params_

{'bag__n_estimators': 10,
 'lasso_pipe__lasso__max_iter': 925,
 'lasso_pipe__lasso__n_alphas': 105,
 'rf_pipe__rf__min_samples_split': 5,
 'rf_pipe__rf__n_estimators': 92}

In [36]:
gs.score(X_train_sc, y_train)

0.9059438204956484

In [37]:
gs.score(X_test_sc,y_test)

0.7067165512794302

In [38]:
stack4.get_params().keys()

dict_keys(['cv', 'estimators', 'final_estimator', 'n_jobs', 'passthrough', 'verbose', 'bag', 'rf_pipe', 'lasso_pipe', 'bag__base_estimator', 'bag__bootstrap', 'bag__bootstrap_features', 'bag__max_features', 'bag__max_samples', 'bag__n_estimators', 'bag__n_jobs', 'bag__oob_score', 'bag__random_state', 'bag__verbose', 'bag__warm_start', 'rf_pipe__memory', 'rf_pipe__steps', 'rf_pipe__verbose', 'rf_pipe__ss', 'rf_pipe__rf', 'rf_pipe__ss__copy', 'rf_pipe__ss__with_mean', 'rf_pipe__ss__with_std', 'rf_pipe__rf__bootstrap', 'rf_pipe__rf__ccp_alpha', 'rf_pipe__rf__criterion', 'rf_pipe__rf__max_depth', 'rf_pipe__rf__max_features', 'rf_pipe__rf__max_leaf_nodes', 'rf_pipe__rf__max_samples', 'rf_pipe__rf__min_impurity_decrease', 'rf_pipe__rf__min_samples_leaf', 'rf_pipe__rf__min_samples_split', 'rf_pipe__rf__min_weight_fraction_leaf', 'rf_pipe__rf__n_estimators', 'rf_pipe__rf__n_jobs', 'rf_pipe__rf__oob_score', 'rf_pipe__rf__random_state', 'rf_pipe__rf__verbose', 'rf_pipe__rf__warm_start', 'lasso_p

In [39]:
bag.get_params()

{'base_estimator': None,
 'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [40]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [41]:
LassoCV().get_params()

{'alphas': None,
 'copy_X': True,
 'cv': None,
 'eps': 0.001,
 'fit_intercept': True,
 'max_iter': 1000,
 'n_alphas': 100,
 'n_jobs': None,
 'normalize': 'deprecated',
 'positive': False,
 'precompute': 'auto',
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'verbose': False}