In [2]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

mercedes_data=pd.read_csv('Mercedes.csv')

mercedes_data

mercedes_data.isnull().sum()

cols_to_drop=[]

for col in mercedes_data.columns:
    if mercedes_data[col].nunique()==mercedes_data.shape[0] or mercedes_data[col].nunique()==1:
        cols_to_drop.append(col)
        
cols_to_drop

mercedes_data.drop(columns=cols_to_drop,inplace=True)

mercedes_data

y_output=mercedes_data['y']

mercedes_data.drop(columns='y',inplace=True)

cols_to_drop


['ID',
 'X11',
 'X93',
 'X107',
 'X233',
 'X235',
 'X268',
 'X289',
 'X290',
 'X293',
 'X297',
 'X330',
 'X347']

In [3]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(mercedes_data,y_output,test_size=0.3,random_state=45)



In [4]:
from sklearn.preprocessing import LabelEncoder
import numpy as np


class LabelEncoderExt(object):
    def __init__(self):
        """
        It differs from LabelEncoder by handling new classes and providing a value for it [Unknown]
        Unknown will be added in fit and transform will take care of new item. It gives unknown class id
        """
        self.label_encoder = LabelEncoder()
        # self.classes_ = self.label_encoder.classes_

    def fit(self, data_list):
        """
        This will fit the encoder for all the unique values and introduce unknown value
        :param data_list: A list of string
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        This will transform the data_list to id list where the new values get assigned to Unknown class
        :param data_list:
        :return:
        """
        new_data_list = list(data_list)
        for unique_item in np.unique(data_list):
            if unique_item not in self.label_encoder.classes_:
                new_data_list = ['Unknown' if x==unique_item else x for x in new_data_list]

        return self.label_encoder.transform(new_data_list)

In [5]:
from sklearn.preprocessing import LabelEncoder

le_ext=LabelEncoderExt()

for col in x_train.columns:
    if x_train[col].dtype=='object':
        le_ext.fit(x_train[col])
        x_train[col]=le_ext.transform(x_train[col])
        x_test[col]=le_ext.transform(x_test[col])
        

In [6]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif=pd.DataFrame()

vif['VIF Factor']=[variance_inflation_factor(x_train.values, i) for i in range(x_train.shape[1])]

vif['Column Name']=x_train.columns

vif

Unnamed: 0,VIF Factor,Column Name
0,2.627822,X0
1,6.379043,X1
2,19.723915,X2
3,2.935504,X3
4,1.088730,X4
5,1.237694,X5
6,1.302293,X6
7,1.229668,X8
8,inf,X10
9,inf,X12


In [7]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
pca=PCA()

pca_data=pca.fit_transform(x_train)

pca_data

pca_train=pd.DataFrame(data=pca_data)

pca_train

plt.plot(np.cumsum(pca.explained_variance_ratio_))

pca.explained_variance_ratio_

array([3.72908151e-01, 2.23800357e-01, 1.31390653e-01, 1.12965285e-01,
       9.66633037e-02, 1.64581591e-02, 7.73988976e-03, 4.58133696e-03,
       3.07592511e-03, 2.48169062e-03, 2.44282888e-03, 2.10782291e-03,
       1.77152867e-03, 1.54278655e-03, 1.35026559e-03, 1.06076880e-03,
       9.99621382e-04, 9.39822296e-04, 8.16785985e-04, 7.22659797e-04,
       6.84492998e-04, 6.57403012e-04, 6.10710407e-04, 5.33660373e-04,
       5.15050572e-04, 4.48063162e-04, 4.39687585e-04, 3.93498366e-04,
       3.87368533e-04, 3.75872243e-04, 3.40847145e-04, 3.33493867e-04,
       3.07268390e-04, 3.02613919e-04, 2.85410651e-04, 2.61308562e-04,
       2.29615631e-04, 2.27519038e-04, 2.13549199e-04, 2.00929298e-04,
       1.94941446e-04, 1.90677090e-04, 1.86018833e-04, 1.81808317e-04,
       1.78206915e-04, 1.61222000e-04, 1.58256976e-04, 1.52602405e-04,
       1.51146472e-04, 1.45499425e-04, 1.38754196e-04, 1.33914982e-04,
       1.31034759e-04, 1.24089767e-04, 1.19835985e-04, 1.15488071e-04,
      

In [8]:
from sklearn.decomposition import PCA

pca1=PCA(n_components=50)
pca_data1=pca1.fit_transform(x_train)
pca_train1=pd.DataFrame(data=pca_data1)

pca_train1

pca_data2=pca1.transform(x_test)

pca_test1=pd.DataFrame(data=pca_data2)

pca_test1


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-15.372219,-11.737919,-5.120292,-4.049192,9.535208,1.042008,0.455557,0.609389,-1.126864,0.265668,...,0.147103,0.121572,-0.194761,0.265488,-0.153102,0.227343,0.003027,0.075583,0.010143,-0.014403
1,-18.514274,-5.243239,-9.620923,9.731743,-10.156683,0.878293,-1.555467,1.448271,-0.805423,2.330617,...,-0.037392,0.085048,0.000011,0.070971,0.009612,0.034156,-0.054528,-0.275958,0.242284,0.101158
2,2.711259,-2.360252,3.686402,-5.625177,1.845585,-1.254257,-2.513207,-1.146189,-0.532408,-1.015116,...,0.025381,-0.003411,-0.353460,-0.055062,-0.318842,-0.088788,-0.150135,0.078145,0.310084,0.119036
3,-10.405716,19.092552,-7.804337,6.288636,-0.823775,-1.594811,1.052971,0.316309,0.033349,-0.650252,...,-0.165899,0.156514,-0.140746,0.209359,0.197412,0.092124,-0.048366,-0.052259,-0.099565,-0.094539
4,-4.943357,-9.334090,-11.376598,2.435650,8.819421,-3.984359,-3.494229,0.790858,0.768638,0.201103,...,-0.152804,0.362768,0.209176,-0.351069,0.078570,0.247623,-0.313839,-0.105703,-0.010806,0.221329
5,9.343065,-3.954618,-2.818850,-4.307149,-2.552516,-2.575934,1.410287,-1.385250,2.958394,-0.596393,...,0.055131,0.042133,0.155848,0.149265,0.190609,-0.165945,-0.045399,0.259755,0.162644,-0.105565
6,-6.503586,-7.415721,15.261489,5.739679,-11.804038,-2.945480,1.530998,2.031503,0.709611,3.575135,...,0.310413,-0.204779,0.143180,0.101079,0.225949,-0.159625,0.385432,-0.241688,0.201176,-0.404116
7,21.512494,-7.623719,-1.036448,-7.791665,-2.864098,6.574987,-2.962539,-1.537966,0.976287,-0.363017,...,0.047034,0.131612,0.080954,0.029103,0.184378,-0.155099,-0.084240,-0.146039,-0.008282,-0.127782
8,6.933409,22.259784,9.040623,-5.584496,-3.987993,-2.117065,-0.115390,0.722188,0.938220,0.981957,...,0.160523,-0.191177,0.745619,-0.113841,-0.795377,0.098621,0.196281,-0.464006,0.159441,0.172736
9,22.922327,16.065580,0.403121,-11.913205,0.321973,0.230028,1.017027,1.634796,-2.207176,0.173530,...,-0.080645,0.204795,-0.373548,-0.255181,0.366217,-0.296058,-0.020053,0.189794,-0.026469,-0.073352


In [9]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif1=pd.DataFrame()

vif1['VIF Factor']=[variance_inflation_factor(pca_train1.values, i) for i in range(pca_train1.shape[1])]

vif1['Column Name']=pca_train1.columns

vif1

Unnamed: 0,VIF Factor,Column Name
0,1.0,0
1,1.0,1
2,1.0,2
3,1.0,3
4,1.0,4
5,1.0,5
6,1.0,6
7,1.0,7
8,1.0,8
9,1.0,9


In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

lin_reg=LinearRegression()
lin_reg.fit(pca_train1,y_train)

y_pred=lin_reg.predict(pca_test1)

print('R score:',r2_score(y_test,y_pred))
print('MSE:',mean_squared_error(y_test,y_pred))




R score: 0.5546441555357454
MSE: 64.69074809474682


In [11]:
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

laso=Lasso()
laso.fit(pca_train1,y_train)

y_pred_laso=laso.predict(pca_test1)

print('R score:',r2_score(y_test,y_pred_laso))
print('MSE:',mean_squared_error(y_test,y_pred_laso))

laso.coef_


R score: 0.48699761506515904
MSE: 74.51683517423284


array([ 1.39156855e-01,  6.89531721e-03, -1.19916144e-01,  6.44989924e-04,
        2.47496630e-02, -0.00000000e+00,  9.01145526e-01, -2.21104357e+00,
        0.00000000e+00,  6.89212034e-01,  2.45153626e+00, -1.95532481e+00,
        6.56390765e-02, -1.55932930e+00,  0.00000000e+00,  3.42897746e-01,
        4.15809818e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00])

In [12]:
from sklearn.tree import DecisionTreeRegressor

d_tree=DecisionTreeRegressor()
d_tree.fit(pca_train1,y_train)

y_pred_tree=d_tree.predict(pca_test1)

print('R score:',r2_score(y_test,y_pred_tree))
print('MSE:',mean_squared_error(y_test,y_pred_tree))

R score: -0.03713532319737545
MSE: 150.65045349036689


In [13]:
from sklearn.ensemble import RandomForestRegressor

r_forest=RandomForestRegressor()
r_forest.fit(pca_train1,y_train)

r_pred_for=r_forest.predict(pca_test1)


print('R score:',r2_score(y_test,r_pred_for))
print('MSE:',mean_squared_error(y_test,r_pred_for))

R score: 0.45376358446256027
MSE: 79.3442879372523


In [14]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

ran_dict={'max_depth':[2,6,8,4],
         'min_samples_leaf':[2,5,7,9],
         'min_samples_split':[3,4,6,7],
         'n_estimators':np.arange(10,100)}

ran_search=RandomizedSearchCV(r_forest,ran_dict,cv=10,n_iter=20)
ran_search.fit(pca_train1,y_train)
ran_predict=ran_search.predict(pca_test1)

print('R score:',r2_score(y_test,ran_predict))
print('MSE:',mean_squared_error(y_test,ran_predict))

print(ran_search.best_estimator_)
print(ran_search.best_score_)

R score: 0.6112030891792464
MSE: 56.47520590681246
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=7, min_samples_split=4,
                      min_weight_fraction_leaf=0.0, n_estimators=31,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
0.5638636953348694


In [15]:
from sklearn.ensemble import AdaBoostRegressor

ab=AdaBoostRegressor()
ab.fit(pca_train1,y_train)

y_ab=ab.predict(pca_test1)


print('R score:',r2_score(y_test,y_ab))
print('MSE:',mean_squared_error(y_test,y_ab))


R score: -0.19458353025738795
MSE: 173.52079959111578


In [16]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

ran_ada={'learning_rate':[1,2,3],
         'n_estimators':np.arange(10,100)}

ada_search=RandomizedSearchCV(ab,ran_ada,cv=10,n_iter=20)
ada_search.fit(pca_train1,y_train)
ada_predict=ada_search.predict(pca_test1)

print('R score:',r2_score(y_test,ada_predict))
print('MSE:',mean_squared_error(y_test,ada_predict))

print(ada_search.best_estimator_)
print(ada_search.best_score_)



R score: -0.0226490239811854
MSE: 148.54622707217086
AdaBoostRegressor(base_estimator=None, learning_rate=3, loss='linear',
                  n_estimators=73, random_state=None)
0.06792227938836758


In [17]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

grid_ada={'learning_rate':[0.5,1,1.5,2],
         'n_estimators':np.arange(10,30)}

grid_search=GridSearchCV(ab,grid_ada,cv=10)
grid_search.fit(pca_train1,y_train)
grid_predict=grid_search.predict(pca_test1)

print('R score:',r2_score(y_test,grid_predict))
print('MSE:',mean_squared_error(y_test,grid_predict))

print(grid_search.best_estimator_)
print(grid_search.best_score_)



R score: 0.361692016336864
MSE: 92.71826448733616
AdaBoostRegressor(base_estimator=None, learning_rate=0.5, loss='linear',
                  n_estimators=13, random_state=None)
0.3448773019362117


In [18]:
from sklearn.ensemble import GradientBoostingRegressor

gb=GradientBoostingRegressor()
gb.fit(pca_train1,y_train)

y_gb=gb.predict(pca_test1)


print('R score:',r2_score(y_test,y_gb))
print('MSE:',mean_squared_error(y_test,y_gb))

R score: 0.5070584292902802
MSE: 71.60287525713787


In [19]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

ran_grad={'max_depth':[2,6,8,4],
         'min_samples_leaf':[2,5,7,9],
         'min_samples_split':[3,4,6,7],
         'n_estimators':np.arange(80,110)}

grad_search=RandomizedSearchCV(gb,ran_grad,cv=10)
grad_search.fit(pca_train1,y_train)
grad_predict=grad_search.predict(pca_test1)

print('R score:',r2_score(y_test,grad_predict))
print('MSE:',mean_squared_error(y_test,grad_predict))

print(grad_search.best_estimator_)
print(grad_search.best_score_)

R score: 0.5275028986373087
MSE: 68.63318701143766
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=9, min_samples_split=7,
                          min_weight_fraction_leaf=0.0, n_estimators=103,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
0.5043199610265621


In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

grid_grad={'max_depth':[3,4,5],
         'min_samples_leaf':[6,7,8],
         'min_samples_split':[6,7,8],
          'n_estimators':np.arange(100,110)}

grad_search1=GridSearchCV(gb,grid_grad,cv=10)
grad_search1.fit(pca_train1,y_train)
grad_predict1=grad_search1.predict(pca_test1)

print('R score:',r2_score(y_test,grad_predict1))
print('MSE:',mean_squared_error(y_test,grad_predict1))

print(grad_search1.best_estimator_)
print(grad_search1.best_score_)

In [21]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')

params= {'min_child_weight':[4,6],'gamma':[i/10.0 for i in range(3,6)],
         'subsample':[i/10 for i in range(6,11)], 'colsample_bytree':[i/10.0 for i in range(6,11)],
        'max_depth':[2,3,4]}

xgb= XGBRegressor(nthread=-1, objective='reg:linear')

grid= GridSearchCV(xgb, params, cv=5)
grid.fit(pca_train1,y_train)
grid.best_estimator_.fit(pca_train1,y_train)

y_pred=grid.best_estimator_.predict(pca_test1)

print('R score:',r2_score(y_test,y_pred))
print('MSE:',mean_squared_error(y_test,y_pred))































































R score: 0.5284048675229747
MSE: 68.50217033635144


In [22]:
import statsmodels.api as sm

#Initialise and fit linear regression model using 'statsmodels'
X=sm.add_constant(x_train)
model=sm.OLS(y_train,X).fit()

model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.58
Model:,OLS,Adj. R-squared:,0.541
Method:,Least Squares,F-statistic:,14.85
Date:,"Sun, 26 Jan 2020",Prob (F-statistic):,0.0
Time:,11:57:21,Log-Likelihood:,-10442.0
No. Observations:,2946,AIC:,21390.0
Df Residuals:,2694,BIC:,22900.0
Df Model:,251,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
X0,0.0874,0.020,4.431,0.000,0.049,0.126
X1,-0.0165,0.048,-0.345,0.730,-0.110,0.077
X2,-0.0229,0.066,-0.348,0.728,-0.152,0.106
X3,-0.3439,0.159,-2.159,0.031,-0.656,-0.032
X4,-4.2853,4.089,-1.048,0.295,-12.303,3.732
X5,-0.0524,0.023,-2.230,0.026,-0.098,-0.006
X6,0.0595,0.064,0.935,0.350,-0.065,0.184
X8,-0.0226,0.025,-0.889,0.374,-0.072,0.027
X10,7.1179,5.925,1.201,0.230,-4.501,18.736

0,1,2,3
Omnibus:,2941.625,Durbin-Watson:,2.026
Prob(Omnibus):,0.0,Jarque-Bera (JB):,423855.883
Skew:,4.461,Prob(JB):,0.0
Kurtosis:,61.081,Cond. No.,1.32e+16


In [23]:
x_train['X1'].dtype

dtype('int64')

In [24]:
lin_reg.coef_

array([ 0.14442256,  0.01566932, -0.13486107,  0.01802753,  0.04506372,
       -0.01582024,  1.15484732, -2.63965729,  0.12337981,  1.48045648,
        3.25536822, -2.88691346,  1.17407382, -2.83210674,  0.23439404,
        2.19403054,  6.12246584, -0.28650434,  0.03584965,  0.72181364,
       -0.05925043, -1.71803704,  1.22676384, -0.47662485, -0.85620422,
        0.3403356 ,  1.3462844 , -0.81009671,  0.11596911,  0.32047949,
        0.03908514,  0.95373618,  1.74861098, -0.97234155,  0.22077854,
        0.50650878, -0.31757126, -0.5700503 ,  0.70355517, -1.41970074,
        0.19488087,  0.06353637,  0.95894048, -1.81561681, -1.27370606,
       -0.33862758, -0.9008841 ,  1.26412412,  1.46470156, -0.10004297])

In [25]:
y_output

0       130.81
1        88.53
2        76.26
3        80.62
4        78.02
5        92.93
6       128.76
7        91.91
8       108.67
9       126.99
10      102.09
11       98.12
12       82.62
13       94.12
14       99.15
15       93.64
16      106.10
17      114.13
18       89.81
19       90.81
20       90.56
21       94.57
22      108.14
23      120.77
24       84.84
25       93.59
26      104.07
27       89.37
28       90.08
29      128.19
         ...  
4179     85.93
4180     90.45
4181     90.06
4182     90.38
4183     95.56
4184    109.00
4185    109.64
4186    131.98
4187     98.15
4188    102.33
4189    102.42
4190     89.11
4191     88.93
4192    103.03
4193    107.24
4194     91.13
4195     86.23
4196     99.93
4197     89.25
4198     97.09
4199     88.24
4200    108.59
4201    107.39
4202    123.34
4203     85.71
4204    107.39
4205    108.77
4206    109.22
4207     87.48
4208    110.85
Name: y, Length: 4209, dtype: float64

In [26]:
mercedes_data['X1'].nunique()

27