# Regression Data Set for ToyotaCorolla

## Data in-set and Preprocessing

In [247]:
import warnings
warnings.filterwarnings("ignore")

In [248]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso,LinearRegression,Ridge
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.preprocessing import StandardScaler,PolynomialFeatures, MinMaxScaler
from termcolor import colored
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error,accuracy_score,r2_score,recall_score,precision_score
from sklearn.svm import SVR,LinearSVR
from sklearn.pipeline import make_pipeline
import seaborn as sns
from sklearn.ensemble import BaggingRegressor,BaggingClassifier,AdaBoostClassifier,AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.wrappers.scikit_learn import KerasClassifier

def prRed(skk): print("\033[1m \033[91m {}\033[00m" .format(skk)) 
def prGreen(skk): print("\033[1m \033[92m {}\033[00m" .format(skk)) 
def prYellow(skk): print("\033[1m \033[93m {}\033[00m" .format(skk)) 
def prBlack(skk): print("\033[1m \033[98m {}\033[00m" .format(skk)) 

In [249]:
corolla = pd.read_csv('ToyotaCorolla_kaggle.csv')
corolla.head()

Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,...,Central_Lock,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Tow_Bar
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,,1,...,1,1,1,0,0,0,1,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,,1,...,1,0,1,0,0,0,1,0,0,0
2,3,?TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13950,24,9,2002,41711,Diesel,90.0,1,...,0,0,1,0,0,0,1,0,0,0
3,4,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,14950,26,7,2002,48000,Diesel,90.0,0,...,0,0,1,0,0,0,1,0,0,0
4,5,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,13750,30,3,2002,38500,Diesel,90.0,0,...,1,1,1,0,1,0,1,0,0,0


In [250]:
corolla.drop(['Id','Model'], axis=1, inplace=True)
corolla.head()


Unnamed: 0,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,Automatic,cc,...,Central_Lock,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Tow_Bar
0,13500,23,10,2002,46986,Diesel,,1,0,2000.0,...,1,1,1,0,0,0,1,0,0,0
1,13750,23,10,2002,72937,Diesel,,1,0,2000.0,...,1,0,1,0,0,0,1,0,0,0
2,13950,24,9,2002,41711,Diesel,90.0,1,0,2000.0,...,0,0,1,0,0,0,1,0,0,0
3,14950,26,7,2002,48000,Diesel,90.0,0,0,2000.0,...,0,0,1,0,0,0,1,0,0,0
4,13750,30,3,2002,38500,Diesel,90.0,0,0,2000.0,...,1,1,1,0,1,0,1,0,0,0


##### NULL VALUES
We have null values in these columns - 'HP' , 'cc' , 'Doors', 'Cylinders', 'Gears', 'Weight'. )

In [251]:
corolla.isnull().sum()



Price                0
Age_08_04            0
Mfg_Month            0
Mfg_Year             0
KM                   0
Fuel_Type            0
HP                  70
Met_Color            0
Automatic            0
cc                  51
Doors               12
Cylinders           38
Gears               18
Quarterly_Tax        0
Weight               5
Mfr_Guarantee        0
BOVAG_Guarantee      0
Guarantee_Period     0
ABS                  0
Airbag_1             0
Airbag_2             0
Airco                0
Automatic_airco      0
Boardcomputer        0
CD_Player            0
Central_Lock         0
Powered_Windows      0
Power_Steering       0
Radio                0
Mistlamps            0
Sport_Model          0
Backseat_Divider     0
Metallic_Rim         0
Radio_cassette       0
Tow_Bar              0
dtype: int64

##### To impute null values, best categorical column to groupby would be 'Fuel_type'

In [252]:
g_corolla = corolla.groupby(['Fuel_Type'])
g_corolla.first()

Unnamed: 0_level_0,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,HP,Met_Color,Automatic,cc,Doors,...,Central_Lock,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Tow_Bar
Fuel_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CNG,7750,43,2,2001,178858,110.0,0,0,1600.0,3.0,...,0,0,1,1,0,0,1,0,1,0
Diesel,13500,23,10,2002,46986,90.0,1,0,2000.0,3.0,...,1,1,1,0,0,0,1,0,0,0
Petrol,21500,27,6,2002,19700,192.0,0,0,1800.0,3.0,...,1,1,1,1,0,0,0,1,1,0


In [253]:
g1_corolla = g_corolla.transform(lambda grp: grp.fillna(grp.mean()))

In [254]:
g1_corolla

Unnamed: 0,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,HP,Met_Color,Automatic,cc,Doors,...,Central_Lock,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Tow_Bar
0,13500,23,10,2002,46986,78.306452,1,0,2000.0,3.0,...,1,1,1,0,0,0,1,0,0,0
1,13750,23,10,2002,72937,78.306452,1,0,2000.0,3.0,...,1,0,1,0,0,0,1,0,0,0
2,13950,24,9,2002,41711,90.000000,1,0,2000.0,3.0,...,0,0,1,0,0,0,1,0,0,0
3,14950,26,7,2002,48000,90.000000,0,0,2000.0,3.0,...,0,0,1,0,0,0,1,0,0,0
4,13750,30,3,2002,38500,90.000000,0,0,2000.0,3.0,...,1,1,1,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1431,7500,69,12,1998,20544,86.000000,1,0,1300.0,3.0,...,1,1,1,0,1,1,1,0,0,0
1432,10845,72,9,1998,19000,86.000000,0,0,1300.0,3.0,...,0,0,1,0,0,1,1,0,0,0
1433,8500,71,10,1998,17016,86.000000,0,0,1300.0,3.0,...,0,0,1,0,0,0,1,0,0,0
1434,7250,70,11,1998,16916,86.000000,1,0,1300.0,3.0,...,0,0,0,0,0,0,1,0,0,0


##### Group transformation with mean values in the above 6 columns

In [255]:
g1_corolla['HP']=(g_corolla['HP'].apply(lambda grp:grp.fillna(grp.mean())))

In [256]:
g1_corolla['cc']=(g_corolla['cc'].apply(lambda grp:grp.fillna(grp.mean())))


In [257]:
g1_corolla['Doors']=(g_corolla['Doors'].apply(lambda grp:grp.fillna(grp.median())))


In [258]:
g1_corolla['Cylinders']=(g_corolla['Cylinders'].apply(lambda grp:grp.fillna(grp.median())))


In [259]:
g1_corolla['Gears']=(g_corolla['Gears'].apply(lambda grp:grp.fillna(grp.median())))


In [260]:
g1_corolla['Weight']=(g_corolla['Weight'].apply(lambda grp:grp.fillna(grp.mean())))


In [261]:
g1_corolla= pd.concat([g1_corolla,corolla['Fuel_Type']], axis=1)


In [262]:
g1_corolla = pd.get_dummies(g1_corolla)

In [263]:
y1 = g1_corolla['Price']

In [264]:
g1_corolla.drop('Price', axis=1, inplace=True) #Since it's target variable

In [265]:
g1_corolla.head()

Unnamed: 0,Age_08_04,Mfg_Month,Mfg_Year,KM,HP,Met_Color,Automatic,cc,Doors,Cylinders,...,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Tow_Bar,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol
0,23,10,2002,46986,78.306452,1,0,2000.0,3.0,4.0,...,0,0,0,1,0,0,0,0,1,0
1,23,10,2002,72937,78.306452,1,0,2000.0,3.0,4.0,...,0,0,0,1,0,0,0,0,1,0
2,24,9,2002,41711,90.0,1,0,2000.0,3.0,4.0,...,0,0,0,1,0,0,0,0,1,0
3,26,7,2002,48000,90.0,0,0,2000.0,3.0,4.0,...,0,0,0,1,0,0,0,0,1,0
4,30,3,2002,38500,90.0,0,0,2000.0,3.0,4.0,...,0,1,0,1,0,0,0,0,1,0


##### Selected Standardscaler because of difference in the values sizes and hence needed to be normalized.

In [266]:
X_train_org, X_test_org, y_train, y_test = train_test_split(g1_corolla,y1,test_size=0.25, random_state=0)

In [267]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train_org)
X_test = sc.transform(X_test_org)

In [268]:
X_train

array([[ 1.17930639, -0.75471747, -1.04764096, ..., -0.11053527,
        -0.34067899,  0.3623569 ],
       [-0.88746902,  0.13556809,  0.86582876, ..., -0.11053527,
        -0.34067899,  0.3623569 ],
       [-0.03956116, -1.05147933,  0.22800552, ..., -0.11053527,
         2.93531454, -2.75971013],
       ...,
       [ 0.75535246,  1.61937736, -1.04764096, ..., -0.11053527,
         2.93531454, -2.75971013],
       [-0.30453237,  0.43232994,  0.22800552, ..., -0.11053527,
        -0.34067899,  0.3623569 ],
       [ 0.64936398, -1.34824118, -0.40981772, ..., -0.11053527,
        -0.34067899,  0.3623569 ]])

In [269]:
X_train.shape

(1077, 36)

## ENSEMBLE MODELS

#### Bagging

##### With DecisionTreeRegressor

In [270]:
p_dt_reg = {'max_depth':[1,2,3,4,5]}
dt_reg = DecisionTreeRegressor(random_state=0)
grid_dt_reg = GridSearchCV(dt_reg,p_dt_reg,cv=3,iid='False')
grid_dt_reg.fit(X_train,y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse',
                                             max_depth=None, max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort='deprecated',
                                             random_state=0, splitter='best'),
             iid='False', n_jobs=None,
             param_grid={'max_depth': [1, 2, 3, 4, 5]}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=0)

In [271]:
print("Best max_depth :",grid_dt_reg.best_params_)

Best max_depth : {'max_depth': 5}


In [272]:
params = {'max_features': [2,5,10] ,'n_estimators': [100, 200, 300, 500],'max_samples': [0.1, 0.5, 1]}
dt_reg = DecisionTreeRegressor(max_depth=5,random_state=0)
bagging = BaggingRegressor(dt_reg,random_state=0)
grid_bag = GridSearchCV(bagging, params, cv=5,iid='False')

grid_bag.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=BaggingRegressor(base_estimator=DecisionTreeRegressor(ccp_alpha=0.0,
                                                                             criterion='mse',
                                                                             max_depth=5,
                                                                             max_features=None,
                                                                             max_leaf_nodes=None,
                                                                             min_impurity_decrease=0.0,
                                                                             min_impurity_split=None,
                                                                             min_samples_leaf=1,
                                                                             min_samples_split=2,
                                                                             min_weight_fract

In [273]:
prRed("\nBagging wiht DT Regressor")
print("Best Params :", grid_bag.best_params_)
print("Training Score  -->", grid_bag.score(X_train,y_train))
print("Test Score      -->", grid_bag.score(X_test,y_test))

[1m [91m 
Bagging wiht DT Regressor[00m
Best Params : {'max_features': 10, 'max_samples': 0.5, 'n_estimators': 100}
Training Score  --> 0.9048272785655762
Test Score      --> 0.8312525493300356


##### With SVR

In [274]:
# From Project 1, SVR best parameters were already determined as
# SVR -> Best parameters: {'C': 100, 'gamma': 0.001, 'kernel': 'linear'}

In [275]:
params = {'max_features': [2,5,10] ,'n_estimators': [100,200,300,500],'max_samples': [0.1,0.5,1]}
lsvr = SVR(C=100,gamma=0.001,kernel='linear')
bagging_lsvr = BaggingRegressor(lsvr,random_state=0)
grid_bag_lsvr = GridSearchCV(bagging_lsvr, params, cv=5,iid='False')

grid_bag_lsvr.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=BaggingRegressor(base_estimator=SVR(C=100,
                                                           cache_size=200,
                                                           coef0=0.0, degree=3,
                                                           epsilon=0.1,
                                                           gamma=0.001,
                                                           kernel='linear',
                                                           max_iter=-1,
                                                           shrinking=True,
                                                           tol=0.001,
                                                           verbose=False),
                                        bootstrap=True,
                                        bootstrap_features=False,
                                        max_features=1.0, max_samples=1.0,
                                  

In [276]:
prRed("\nBagging with Linear SVR")
print("Best Params :", grid_bag_lsvr.best_params_)
print("Training Score  -->", grid_bag_lsvr.score(X_train,y_train))
print("Test Score      -->", grid_bag_lsvr.score(X_test,y_test))

[1m [91m 
Bagging with Linear SVR[00m
Best Params : {'max_features': 10, 'max_samples': 0.5, 'n_estimators': 100}
Training Score  --> 0.8043223862288902
Test Score      --> 0.7728180779522372


#### PASTING

##### With Decision Tree Regressor

In [277]:
params = {'max_features': [2,5,10] ,'n_estimators': [100, 200, 300, 500],'max_samples': [0.1, 0.5, 1]}
dt_reg = DecisionTreeRegressor(max_depth=5,random_state=0)
pasting = BaggingRegressor(dt_reg,random_state=0,bootstrap='False')
grid_pas = GridSearchCV(pasting, params, cv=5,iid='False')

grid_pas.fit(X_train,y_train)

prRed("\nPasting with DT Regressor")
print("Best Params :", grid_pas.best_params_)
print("Training Score  -->", grid_pas.score(X_train,y_train))
print("Test Score      -->", grid_pas.score(X_test,y_test))

[1m [91m 
Pasting with DT Regressor[00m
Best Params : {'max_features': 10, 'max_samples': 0.5, 'n_estimators': 100}
Training Score  --> 0.9048272785655762
Test Score      --> 0.8312525493300356


##### With Linear SVR

In [278]:
params = {'max_features': [2,5,10] ,'n_estimators': [100,200,300,500],'max_samples': [0.1,0.5,1]}
lsvr = SVR(C=100,gamma=0.001,kernel='linear')
pasting_lsvr = BaggingRegressor(lsvr,random_state=0,bootstrap='False')
grid_pas_lsvr = GridSearchCV(pasting_lsvr, params, cv=5,iid='False')

grid_pas_lsvr.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=BaggingRegressor(base_estimator=SVR(C=100,
                                                           cache_size=200,
                                                           coef0=0.0, degree=3,
                                                           epsilon=0.1,
                                                           gamma=0.001,
                                                           kernel='linear',
                                                           max_iter=-1,
                                                           shrinking=True,
                                                           tol=0.001,
                                                           verbose=False),
                                        bootstrap='False',
                                        bootstrap_features=False,
                                        max_features=1.0, max_samples=1.0,
                               

In [279]:
prRed("\nPasting with Lasso")
print("Best Params :", grid_pas_lsvr.best_params_)
print("Training Score  -->", grid_pas_lsvr.score(X_train,y_train))
print("Test Score      -->", grid_pas_lsvr.score(X_test,y_test))

[1m [91m 
Pasting with Lasso[00m
Best Params : {'max_features': 10, 'max_samples': 0.5, 'n_estimators': 100}
Training Score  --> 0.8043223862288902
Test Score      --> 0.7728180779522372


### ADABOOSTING REGRESSOR

#### - As we have already found the best parameters for SVR and Decision Tree Regressor, we will use the same parameters.

#### with DT Regressor

In [280]:
params = {'n_estimators': [50, 100, 200, 500],'learning_rate': [0.1, 0.25, 0.5, 0.75, 1]}
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=5), random_state=0)

grid_ada = GridSearchCV(ada_reg, params, cv=5, iid='False')
grid_ada.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostRegressor(base_estimator=DecisionTreeRegressor(ccp_alpha=0.0,
                                                                              criterion='mse',
                                                                              max_depth=5,
                                                                              max_features=None,
                                                                              max_leaf_nodes=None,
                                                                              min_impurity_decrease=0.0,
                                                                              min_impurity_split=None,
                                                                              min_samples_leaf=1,
                                                                              min_samples_split=2,
                                                                              min_we

In [281]:
prRed("\nAdaBoost Regressor with DT Regressor")
print("Best Params :", grid_ada.best_params_)
print("Training Score  -->", grid_ada.score(X_train,y_train))
print("Test Score      -->", grid_ada.score(X_test,y_test))

[1m [91m 
AdaBoost Regressor with DT Regressor[00m
Best Params : {'learning_rate': 0.1, 'n_estimators': 500}
Training Score  --> 0.9508459766847791
Test Score      --> 0.8809075179420086


#### With SVR

In [282]:
params = {'n_estimators': [50, 100, 200, 500],'learning_rate': [0.1,  0.5,   1]}
ada_reg_svr = AdaBoostRegressor(SVR(C=100,gamma=0.001,kernel='linear'), random_state=0)

grid_ada_svr = GridSearchCV(ada_reg_svr, params, cv=5, iid='False')
grid_ada_svr.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostRegressor(base_estimator=SVR(C=100,
                                                            cache_size=200,
                                                            coef0=0.0, degree=3,
                                                            epsilon=0.1,
                                                            gamma=0.001,
                                                            kernel='linear',
                                                            max_iter=-1,
                                                            shrinking=True,
                                                            tol=0.001,
                                                            verbose=False),
                                         learning_rate=1.0, loss='linear',
                                         n_estimators=50, random_state=0),
             iid='False', n_jobs=None,
             param_grid={'learni

In [283]:
prRed("\nAdaBoost Regressor with SVR")
print("Best Params :", grid_ada_svr.best_params_)
print("Training Score  -->", grid_ada_svr.score(X_train,y_train))
print("Test Score      -->", grid_ada_svr.score(X_test,y_test))

[1m [91m 
AdaBoost Regressor with SVR[00m
Best Params : {'learning_rate': 0.1, 'n_estimators': 50}
Training Score  --> 0.9211399353196834
Test Score      --> 0.8581108198426068


### GRADIENT BOOSTING

### With DT Reg

In [284]:
params = {'max_features': [2,5,10,20] ,'n_estimators': [50, 100, 200, 500],'learning_rate': [0.1, 0.25, 0.5, 0.75, 1]}
grad_gboost = GradientBoostingRegressor(random_state=0)

grid_gboost = GridSearchCV(grad_gboost,params,cv=5,iid='False')
grid_gboost.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
                 

In [285]:
prRed("\nGradient Boosting with DT Regressor")
print("Best Params :", grid_gboost.best_params_)
print("Training Score  -->", grid_gboost.score(X_train,y_train))
print("Test Score      -->", grid_gboost.score(X_test,y_test))

[1m [91m 
Gradient Boosting with DT Regressor[00m
Best Params : {'learning_rate': 0.1, 'max_features': 10, 'n_estimators': 100}
Training Score  --> 0.9553517920714709
Test Score      --> 0.8867838592519329


## Prinicipal Component Ananlysis

##### With 95% variance explained, Train and Test datasets were transformed.

In [286]:
from sklearn.decomposition import PCA
pca = PCA(.95)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

In [287]:
prRed("\nPost PCA Transformation :")
print("New dimension for the datasets is :",X_test_pca.shape[1])

[1m [91m 
Post PCA Transformation :[00m
New dimension for the datasets is : 24


### <font color='red'> Models from Project 1 with PCA transformed 24 dimensions:
-Taking only grid searched optimum parameters and re executing all the models below.

#### Linear Regression 

In [288]:
g1_corolla.columns

Index(['Age_08_04', 'Mfg_Month', 'Mfg_Year', 'KM', 'HP', 'Met_Color',
       'Automatic', 'cc', 'Doors', 'Cylinders', 'Gears', 'Quarterly_Tax',
       'Weight', 'Mfr_Guarantee', 'BOVAG_Guarantee', 'Guarantee_Period', 'ABS',
       'Airbag_1', 'Airbag_2', 'Airco', 'Automatic_airco', 'Boardcomputer',
       'CD_Player', 'Central_Lock', 'Powered_Windows', 'Power_Steering',
       'Radio', 'Mistlamps', 'Sport_Model', 'Backseat_Divider', 'Metallic_Rim',
       'Radio_cassette', 'Tow_Bar', 'Fuel_Type_CNG', 'Fuel_Type_Diesel',
       'Fuel_Type_Petrol'],
      dtype='object')

In [289]:
lreg= LinearRegression()
lreg.fit(X_train_pca,y_train)
#lreg.fit(X_test_pca,y_test)
lreg_train_score_pca=lreg.score(X_train_pca,y_train)
lreg_test_score_pca=lreg.score(X_test_pca,y_test)
print("Linear Test Score -->", lreg_test_score_pca)

Linear Test Score --> 0.8332554899569575


#### Lasso Regression

In [290]:
lasso = Lasso(alpha=5)
lasso.fit(X_train_pca,y_train)
lasso_train_score_pca = lasso.score(X_train_pca,y_train)
lasso_test_score_pca = lasso.score(X_test_pca,y_test)
print("Lasso Test Score -->",lasso_test_score_pca)

Lasso Test Score --> 0.8339444787661593


#### Ridge Regression

In [291]:
ridge = Ridge(alpha = 15)
ridge.fit(X_train_pca,y_train)
ridge_train_score_pca=ridge.score(X_train_pca,y_train)
ridge_test_score_pca=ridge.score(X_test_pca,y_test)
print("Ridge Test Score -->",ridge_test_score_pca)

Ridge Test Score --> 0.8338818902061921


#### Polynomial Regression

In [292]:
poly = PolynomialFeatures(1)
poly.fit_transform(X_train_pca,y_train)
poly_train_score_pca=lreg.score(X_train_pca,y_train)
poly_test_score_pca=lreg.score(X_test_pca,y_test)
print("Poly Test Score -->",poly_test_score_pca)

Poly Test Score --> 0.8332554899569575


#### kNN Regressor

In [293]:
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_pca,y_train)
knn_train_score_pca=knn.score(X_train_pca,y_train)
knn_test_score_pca=knn.score(X_test_pca,y_test)
print("kNN Test Score -->",knn_test_score_pca)

kNN Test Score --> 0.8002188764797802


#### LinearSVR

In [294]:
param_grid = {'C':[10,50,100,150,200,250]}
lsvr = GridSearchCV(LinearSVR(), param_grid, cv=5, iid='False')
#print(lsvr.best_params_)
lsvr.fit(X_train_pca,y_train)
lsvr_train_score_pca=lsvr.score(X_train_pca,y_train)
lsvr_test_score_pca=lsvr.score(X_test_pca,y_test)
print("Linear SVR Test Score -->",lsvr_test_score_pca)

Linear SVR Test Score --> 0.8321815987158626


#### SVR

In [295]:
svr = SVR(C=100,gamma=0.001,kernel='linear')

svr.fit(X_train_pca,y_train)
svr_train_score_pca=svr.score(X_train_pca,y_train)
svr_test_score_pca=svr.score(X_test_pca,y_test)
print("SVR Test Score -->",svr_test_score_pca)

SVR Test Score --> 0.8330682575646479


In [296]:
PCA_score_model =  [('Linear ', lreg_train_score_pca, lreg_test_score_pca),
          ('Ridge ', ridge_train_score_pca, ridge_test_score_pca),
          ('Lasso ', lasso_train_score_pca, lasso_test_score_pca),
          ('Polynomial ', poly_train_score_pca, poly_test_score_pca),
          ('kNN ', knn_train_score_pca, knn_test_score_pca),
          ('LinearSVR ', lsvr_train_score_pca, lsvr_test_score_pca),
		  ('SVR ', svr_train_score_pca, svr_test_score_pca)]

In [297]:
stat = pd.DataFrame(data = PCA_score_model, columns = ['Models', 'PCA Train_Score', 'PCA Test_Score'])
prRed("With PCA")
print("\n",stat)

[1m [91m With PCA[00m

         Models  PCA Train_Score  PCA Test_Score
0      Linear          0.907934        0.833255
1       Ridge          0.907908        0.833882
2       Lasso          0.907888        0.833944
3  Polynomial          0.907934        0.833255
4         kNN          0.897985        0.800219
5   LinearSVR          0.904882        0.832182
6         SVR          0.904971        0.833068


### <font color='red'> Without PCA Transformation
             Models      Score_Train  Score_Test 
              Linear      0.922860    0.862248  
               Ridge      0.922295    0.860712  
               Lasso      0.922643    0.860758  
          Polynomial      0.922859    0.862262  
                 kNN      0.907167    0.818258  
                 SVM      0.917875    0.845226 
                 
### <font color='red'> After PCA Transformation
         Models  PCA Train_Score  PCA Test_Score
        Linear          0.907934      0.833255
         Ridge          0.907908      0.833882
         Lasso          0.907888      0.833944
    Polynomial          0.907934      0.833255
           kNN          0.897985      0.800219
     LinearSVR          0.904880      0.831845
           SVR          0.904971      0.833068

 - Test scores of models post PCA transformation are ~3%(average) lower than test scores without PCA transformation, which is unusual.

### NUERAL NETWORKS

#### Recurrent Neural Network

In [298]:
np.random.seed(10)
rnn = Sequential()
#Additon of input layer
rnn.add(Dense(36,input_dim=36,kernel_initializer='normal',activation='relu'))

#Addition of hidden layer
rnn.add(Dense(8,activation='relu',kernel_initializer='normal'))

#Output layer
rnn.add(Dense(1, kernel_initializer='normal'))

In [299]:
rnn.compile(loss='mse',optimizer='adam',metrics=['mse'])

In [300]:
rnn.fit(X_train,y_train,epochs=500, batch_size=20)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

Epoch 146/500
Epoch 147/500
Epoch 148/500
Epoch 149/500
Epoch 150/500
Epoch 151/500
Epoch 152/500
Epoch 153/500
Epoch 154/500
Epoch 155/500
Epoch 156/500
Epoch 157/500
Epoch 158/500
Epoch 159/500
Epoch 160/500
Epoch 161/500
Epoch 162/500
Epoch 163/500
Epoch 164/500
Epoch 165/500
Epoch 166/500
Epoch 167/500
Epoch 168/500
Epoch 169/500
Epoch 170/500
Epoch 171/500
Epoch 172/500
Epoch 173/500
Epoch 174/500
Epoch 175/500
Epoch 176/500
Epoch 177/500
Epoch 178/500
Epoch 179/500
Epoch 180/500
Epoch 181/500
Epoch 182/500
Epoch 183/500
Epoch 184/500
Epoch 185/500
Epoch 186/500
Epoch 187/500
Epoch 188/500
Epoch 189/500
Epoch 190/500
Epoch 191/500
Epoch 192/500
Epoch 193/500
Epoch 194/500
Epoch 195/500
Epoch 196/500
Epoch 197/500
Epoch 198/500
Epoch 199/500
Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 208/500
Epoch 209/500
Epoch 210/500
Epoch 211/500
Epoch 212/500
Epoch 213/500
Epoch 214/500
Epoch 215/500
Epoch 216/500
Epoch 

Epoch 290/500
Epoch 291/500
Epoch 292/500
Epoch 293/500
Epoch 294/500
Epoch 295/500
Epoch 296/500
Epoch 297/500
Epoch 298/500
Epoch 299/500
Epoch 300/500
Epoch 301/500
Epoch 302/500
Epoch 303/500
Epoch 304/500
Epoch 305/500
Epoch 306/500
Epoch 307/500
Epoch 308/500
Epoch 309/500
Epoch 310/500
Epoch 311/500
Epoch 312/500
Epoch 313/500
Epoch 314/500
Epoch 315/500
Epoch 316/500
Epoch 317/500
Epoch 318/500
Epoch 319/500
Epoch 320/500
Epoch 321/500
Epoch 322/500
Epoch 323/500
Epoch 324/500
Epoch 325/500
Epoch 326/500
Epoch 327/500
Epoch 328/500
Epoch 329/500
Epoch 330/500
Epoch 331/500
Epoch 332/500
Epoch 333/500
Epoch 334/500
Epoch 335/500
Epoch 336/500
Epoch 337/500
Epoch 338/500
Epoch 339/500
Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 349/500
Epoch 350/500
Epoch 351/500
Epoch 352/500
Epoch 353/500
Epoch 354/500
Epoch 355/500
Epoch 356/500
Epoch 357/500
Epoch 358/500
Epoch 359/500
Epoch 360/500
Epoch 

Epoch 436/500
Epoch 437/500
Epoch 438/500
Epoch 439/500
Epoch 440/500
Epoch 441/500
Epoch 442/500
Epoch 443/500
Epoch 444/500
Epoch 445/500
Epoch 446/500
Epoch 447/500
Epoch 448/500
Epoch 449/500
Epoch 450/500
Epoch 451/500
Epoch 452/500
Epoch 453/500
Epoch 454/500
Epoch 455/500
Epoch 456/500
Epoch 457/500
Epoch 458/500
Epoch 459/500
Epoch 460/500
Epoch 461/500
Epoch 462/500
Epoch 463/500
Epoch 464/500
Epoch 465/500
Epoch 466/500
Epoch 467/500
Epoch 468/500
Epoch 469/500
Epoch 470/500
Epoch 471/500
Epoch 472/500
Epoch 473/500
Epoch 474/500
Epoch 475/500
Epoch 476/500
Epoch 477/500
Epoch 478/500
Epoch 479/500
Epoch 480/500
Epoch 481/500
Epoch 482/500
Epoch 483/500
Epoch 484/500
Epoch 485/500
Epoch 486/500
Epoch 487/500
Epoch 488/500
Epoch 489/500
Epoch 490/500
Epoch 491/500
Epoch 492/500
Epoch 493/500
Epoch 494/500
Epoch 495/500
Epoch 496/500
Epoch 497/500
Epoch 498/500
Epoch 499/500
Epoch 500/500


<keras.callbacks.callbacks.History at 0x2561cec0a88>

In [301]:
#Model Evaluation

rnn.evaluate(X_test,y_test)

y_train_pred = rnn.predict(X_train)
y_test_pred = rnn.predict(X_test)

prRed("\nRNN Scores :")
print("Train Score -->", r2_score(y_train_pred,y_train))
print("Test Score  -->", r2_score(y_test_pred,y_test))



[1m [91m 
RNN Scores :[00m
Train Score --> 0.948001308610764
Test Score  --> 0.8661763993633015


#### Grid Search for RNN with best parameters 

In [302]:
def rnn_fn():
    
    rnn = Sequential()
    rnn.add(Dense(33,input_dim=36,activation='relu'))
    rnn.add(Dense(12,activation='relu'))
    rnn.add(Dense(1, activation = 'sigmoid'))
    
    rnn.compile(loss='mse',optimizer='adam',metrics=['accuracy'])
    return rnn


In [303]:
rnn_1 = KerasClassifier(build_fn = rnn_fn,verbose = 0)

params = {'batch_size':[10,15,20],'epochs':[100,250,500]}
rnn_grid = GridSearchCV(estimator=rnn_1 ,param_grid=params, cv=5,n_jobs=-1)

In [304]:
rnn_grid.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=<keras.wrappers.scikit_learn.KerasClassifier object at 0x000002561F924FC8>,
             iid='deprecated', n_jobs=-1,
             param_grid={'batch_size': [10, 15, 20], 'epochs': [100, 250, 500]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

#### RNN with the best parameters found -->

In [305]:
np.random.seed(10)
rnn = Sequential()
#Additon of input layer
rnn.add(Dense(36,input_dim=36,kernel_initializer='normal',activation='relu'))
rnn.add(Dense(8,activation='relu',kernel_initializer='normal'))
rnn.add(Dense(1, kernel_initializer='normal'))

In [306]:
rnn.compile(loss='mse',optimizer='adam',metrics=['mse'])

In [307]:
rnn.fit(X_train,y_train,epochs=500, batch_size=10)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

Epoch 145/500
Epoch 146/500
Epoch 147/500
Epoch 148/500
Epoch 149/500
Epoch 150/500
Epoch 151/500
Epoch 152/500
Epoch 153/500
Epoch 154/500
Epoch 155/500
Epoch 156/500
Epoch 157/500
Epoch 158/500
Epoch 159/500
Epoch 160/500
Epoch 161/500
Epoch 162/500
Epoch 163/500
Epoch 164/500
Epoch 165/500
Epoch 166/500
Epoch 167/500
Epoch 168/500
Epoch 169/500
Epoch 170/500
Epoch 171/500
Epoch 172/500
Epoch 173/500
Epoch 174/500
Epoch 175/500
Epoch 176/500
Epoch 177/500
Epoch 178/500
Epoch 179/500
Epoch 180/500
Epoch 181/500
Epoch 182/500
Epoch 183/500
Epoch 184/500
Epoch 185/500
Epoch 186/500
Epoch 187/500
Epoch 188/500
Epoch 189/500
Epoch 190/500
Epoch 191/500
Epoch 192/500
Epoch 193/500
Epoch 194/500
Epoch 195/500
Epoch 196/500
Epoch 197/500
Epoch 198/500
Epoch 199/500
Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 208/500
Epoch 209/500
Epoch 210/500
Epoch 211/500
Epoch 212/500
Epoch 213/500
Epoch 214/500
Epoch 215/500
Epoch 

Epoch 290/500
Epoch 291/500
Epoch 292/500
Epoch 293/500
Epoch 294/500
Epoch 295/500
Epoch 296/500
Epoch 297/500
Epoch 298/500
Epoch 299/500
Epoch 300/500
Epoch 301/500
Epoch 302/500
Epoch 303/500
Epoch 304/500
Epoch 305/500
Epoch 306/500
Epoch 307/500
Epoch 308/500
Epoch 309/500
Epoch 310/500
Epoch 311/500
Epoch 312/500
Epoch 313/500
Epoch 314/500
Epoch 315/500
Epoch 316/500
Epoch 317/500
Epoch 318/500
Epoch 319/500
Epoch 320/500
Epoch 321/500
Epoch 322/500
Epoch 323/500
Epoch 324/500
Epoch 325/500
Epoch 326/500
Epoch 327/500
Epoch 328/500
Epoch 329/500
Epoch 330/500
Epoch 331/500
Epoch 332/500
Epoch 333/500
Epoch 334/500
Epoch 335/500
Epoch 336/500
Epoch 337/500
Epoch 338/500
Epoch 339/500
Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 349/500
Epoch 350/500
Epoch 351/500
Epoch 352/500
Epoch 353/500
Epoch 354/500
Epoch 355/500
Epoch 356/500
Epoch 357/500
Epoch 358/500
Epoch 359/500
Epoch 360/500
Epoch 

Epoch 436/500
Epoch 437/500
Epoch 438/500
Epoch 439/500
Epoch 440/500
Epoch 441/500
Epoch 442/500
Epoch 443/500
Epoch 444/500
Epoch 445/500
Epoch 446/500
Epoch 447/500
Epoch 448/500
Epoch 449/500
Epoch 450/500
Epoch 451/500
Epoch 452/500
Epoch 453/500
Epoch 454/500
Epoch 455/500
Epoch 456/500
Epoch 457/500
Epoch 458/500
Epoch 459/500
Epoch 460/500
Epoch 461/500
Epoch 462/500
Epoch 463/500
Epoch 464/500
Epoch 465/500
Epoch 466/500
Epoch 467/500
Epoch 468/500
Epoch 469/500
Epoch 470/500
Epoch 471/500
Epoch 472/500
Epoch 473/500
Epoch 474/500
Epoch 475/500
Epoch 476/500
Epoch 477/500
Epoch 478/500
Epoch 479/500
Epoch 480/500
Epoch 481/500
Epoch 482/500
Epoch 483/500
Epoch 484/500
Epoch 485/500
Epoch 486/500
Epoch 487/500
Epoch 488/500
Epoch 489/500
Epoch 490/500
Epoch 491/500
Epoch 492/500
Epoch 493/500
Epoch 494/500
Epoch 495/500
Epoch 496/500
Epoch 497/500
Epoch 498/500
Epoch 499/500
Epoch 500/500


<keras.callbacks.callbacks.History at 0x2561c71ba88>

In [308]:
rnn.evaluate(X_test,y_test)

y_train_pred = rnn.predict(X_train)
y_test_pred = rnn.predict(X_test)

prRed("\nRNN Scores :")
print("Train Score -->", r2_score(y_train_pred,y_train))
print("Test Score  -->", r2_score(y_test_pred,y_test))

[1m [91m 
RNN Scores :[00m
Train Score --> 0.9511968672357237
Test Score  --> 0.8596356894515929


In [309]:
from IPython.display import display, Javascript
display(Javascript("""
require(
    ["base/js/dialog"], 
    function(dialog) {
        dialog.modal({
            title: 'Project_2_Regression Summary',
            body: 'Looking at post PCA scores, Linear regression has the highest score.',
            buttons: {
                'Best Model is yet again Linear!!': {}
            }
        });
    })
"""))

<IPython.core.display.Javascript object>