# Importing required libraries

In [183]:
import pandas as pd
import matplotlib as plt
from scipy import stats
import numpy as np
from scipy.stats import norm, skew #for some statistics
from sklearn.model_selection import train_test_split
from math import sqrt
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot
from statsmodels.graphics.gofplots import qqplot

## Importing Data

In [154]:
data=pd.read_csv('mpg.csv')
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [160]:
# Checking data
print(data.apply(pd.Series.count))

mpg             392
cylinders       392
displacement    392
horsepower      392
weight          392
acceleration    392
model_year      392
origin          392
name            392
dtype: int64


In [161]:
print(data.dtypes)

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model_year        int64
origin            int64
name             object
dtype: object


In [162]:
#Data Cleaning
print('Dataset Length:',len(data.mpg))
data = data.replace('?', np.nan)
data = data.dropna()
print('Dataset Length:',len(data.mpg))

Dataset Length: 392
Dataset Length: 392


In [163]:
data=data.drop(['name'],axis=1)

In [164]:
X=data.drop(['mpg'],axis=1)

In [165]:
y=data.mpg

# Check for normality/ Distribution

In [209]:
alpha=0.05
from scipy.stats import shapiro
stat,p=shapiro(np.log(data.mpg))
if p > alpha:
    print('Sample looks Gaussian (fail to reject H0)')
else:
    print('Sample does not look Gaussian (reject H0)')

# Split Data

In [166]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=1)

# Linear regression

In [245]:
from sklearn.linear_model import LinearRegression
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [246]:
print( "R Squared Value:",regression_model.score(X_test, y_test))

R Squared Value: 0.824572132070496


In [247]:
# Predict values using a model
y_predict=regression_model.predict(X=X_test)

In [248]:
r2_score(y_test,y_predict)

0.824572132070496

In [249]:
# Get Mean Squared Error
regression_model_mse = mean_squared_error(y_predict, y_test)
regression_model_mse

11.69822884987538

In [250]:
# Get Root Mean Squared Error
print("Root mean Squared error:",sqrt(regression_model_mse))

Root mean Squared error: 3.420267365261871


In [251]:
# Look at coefficients
for i in enumerate(regression_model.coef_):
    print(X_test.columns[i[0]],':',i[1])
    

cylinders : -0.08244289632082673
displacement : 0.015620692129260552
horsepower : -0.0030398367863882603
weight : -0.007022610340509868
acceleration : 0.23030228302693775
model_year : 0.7118374249486005
origin : 1.4236527094547786


# Ridge regression

In [252]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

In [253]:
# Setting tuning Parameters
parameters={'alpha': [1e-15,1e-10,1e-8,1e-4,1e-3,1e-2,1,5,10,20]}
ridge = Ridge(normalize=True)
ridge_regressor = GridSearchCV(ridge, parameters,scoring='neg_mean_squared_error', cv=5)

In [254]:
ridge_regressor.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1, 5, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [255]:
# The best tuning value 'Alpha'
ridge_regressor.best_params_

{'alpha': 0.001}

In [256]:
# Finding MSE
y_predict=ridge_regressor.best_estimator_.predict(X_test)
mean_squared_error(y_predict, y_test)

11.688355583302979

In [257]:
# R squared value
print('R Squared Value:',r2_score(y_test,y_predict))

R Squared Value: 0.8247201926125253


# Lasso

In [259]:
from sklearn.linear_model import Lasso
lasso = Lasso(normalize=True)

parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]}

lasso_regressor = GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv = 5)

lasso_regressor.fit(X_train, y_train)


GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1, 5, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [262]:
lasso_regressor.best_params_

{'alpha': 0.01}

In [263]:
# Finding MSE
y_predict=lasso_regressor.best_estimator_.predict(X_test)
mean_squared_error(y_predict, y_test)

11.882747043637583

In [264]:
# R squared value
print('R Squared Value:',r2_score(y_test,y_predict))

R Squared Value: 0.8218050778658544


# Random Forest Regressor

In [265]:
from sklearn.ensemble import RandomForestRegressor

  from numpy.core.umath_tests import inner1d


In [266]:
rf = RandomForestRegressor(random_state = 27)

In [270]:
# Get current parameters
print('Parameters currently in use:\n')
print(rf.get_params())

Parameters currently in use:

{'bootstrap': True, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 10, 'n_jobs': 1, 'oob_score': False, 'random_state': 27, 'verbose': 0, 'warm_start': False}


In [277]:
# since the tune grid is going to be huge for this one due to various hyperparameters, we are going to use randomizedsearch cross Val
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in the Forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
#Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
#Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [283]:
# Creating Hyper Parameter Grid
rf_hyperparam_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(rf_hyperparam_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [285]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = rf_hyperparam_grid, n_iter = 100, cv = 3, verbose=2, random_state=27, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)





Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=100, bootstrap=True 
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=100, bootstrap=True 
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=100, bootstrap=True 
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=30, bootstrap=False 
[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=30, bootstrap=False, total=   0.6s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=30, bootstrap=False 
[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=30, bootstrap=False, total=   0.5s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=30, bootstrap

[CV] n_estimators=1800, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=False 
[CV]  n_estimators=1600, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=20, bootstrap=False, total=   5.4s
[CV] n_estimators=1400, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=60, bootstrap=False 


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   32.5s


[CV]  n_estimators=1800, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=False, total=   6.4s
[CV] n_estimators=1400, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=60, bootstrap=False 
[CV]  n_estimators=1400, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=60, bootstrap=False, total=   4.8s
[CV] n_estimators=1400, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=60, bootstrap=False 
[CV]  n_estimators=1800, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=False, total=   6.7s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=60, bootstrap=False 
[CV]  n_estimators=1800, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=False, total=   7.0s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=60, bootstrap=False 
[CV]  n_estimator

[CV] n_estimators=600, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=100, bootstrap=True 
[CV]  n_estimators=400, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=70, bootstrap=True, total=   1.3s
[CV] n_estimators=600, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=100, bootstrap=True 
[CV]  n_estimators=400, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=70, bootstrap=True, total=   1.2s
[CV] n_estimators=1400, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=40, bootstrap=True 
[CV]  n_estimators=600, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=100, bootstrap=True, total=   1.2s
[CV] n_estimators=1400, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=40, bootstrap=True 
[CV]  n_estimators=600, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=100, bootstrap=True, total=   1.2s
[CV] n_estimators=1400, min_

[CV] n_estimators=1400, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=80, bootstrap=False 
[CV]  n_estimators=1600, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=40, bootstrap=False, total=   4.1s
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=None, bootstrap=True 
[CV]  n_estimators=1400, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=80, bootstrap=False, total=   4.1s
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=None, bootstrap=True 
[CV]  n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=None, bootstrap=True, total=   2.1s
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=None, bootstrap=True 
[CV]  n_estimators=1400, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=80, bootstrap=False, total=   4.5s
[CV] n_estimators=6

[CV]  n_estimators=1200, min_samples_split=5, min_samples_leaf=4, max_features=auto, max_depth=80, bootstrap=True, total=   2.6s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=False 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=100, bootstrap=True, total=   2.9s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=False 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=100, bootstrap=True, total=   2.8s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=False 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=100, bootstrap=True, total=   2.6s
[CV] n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=20, bootstrap=True 
[CV]  n_estimators=180

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.2min


[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=110, bootstrap=True, total=   0.5s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=110, bootstrap=True 
[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=70, bootstrap=False, total=   2.6s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=90, bootstrap=True 
[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=70, bootstrap=False, total=   2.5s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=90, bootstrap=True 
[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=110, bootstrap=True, total=   0.5s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=90, bootstrap=True 
[CV]  n_estimators=1000, min_

[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=60, bootstrap=False, total=   2.5s
[CV] n_estimators=1600, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=70, bootstrap=False 
[CV]  n_estimators=2000, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=60, bootstrap=False, total=   4.3s
[CV] n_estimators=1600, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=70, bootstrap=False 
[CV]  n_estimators=2000, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=60, bootstrap=False, total=   4.2s
[CV] n_estimators=1600, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=70, bootstrap=False 
[CV]  n_estimators=1600, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=70, bootstrap=False, total=   3.7s
[CV] n_estimators=1200, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=None, bootstrap=False 
[CV]  n_estimat

[CV] n_estimators=1800, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=None, bootstrap=False 
[CV]  n_estimators=1400, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=30, bootstrap=True, total=   2.7s
[CV] n_estimators=1800, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=None, bootstrap=False 
[CV]  n_estimators=1400, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=30, bootstrap=True, total=   2.7s
[CV] n_estimators=1800, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=None, bootstrap=False 
[CV]  n_estimators=1400, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=30, bootstrap=True, total=   2.7s
[CV] n_estimators=800, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=110, bootstrap=True 
[CV]  n_estimators=1800, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=None, bootstrap=False, total=   3.4s
[CV] n_estima

[CV]  n_estimators=1600, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=80, bootstrap=False, total=   3.5s
[CV] n_estimators=1600, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=70, bootstrap=False 
[CV]  n_estimators=200, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=100, bootstrap=True, total=   0.6s
[CV] n_estimators=1600, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=70, bootstrap=False 
[CV]  n_estimators=200, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=100, bootstrap=True, total=   0.5s
[CV] n_estimators=600, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=60, bootstrap=False 
[CV]  n_estimators=600, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=60, bootstrap=False, total=   1.1s
[CV] n_estimators=600, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=60, bootstrap=False 
[CV]  n_estimators=600

[CV] n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=False 
[CV]  n_estimators=1600, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=20, bootstrap=False, total=   3.1s
[CV] n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=False 
[CV]  n_estimators=1600, min_samples_split=5, min_samples_leaf=4, max_features=sqrt, max_depth=20, bootstrap=False, total=   3.0s
[CV] n_estimators=800, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=10, bootstrap=False 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=False, total=   2.5s
[CV] n_estimators=800, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=10, bootstrap=False 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=False, total=   2.6s
[CV] n_estimators=800,

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.6min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=27, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [287]:
rf_random.best_params_

{'n_estimators': 1200,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': False}

In [289]:
# Finding MSE
y_predict=rf_random.predict(X_test)
mean_squared_error(y_predict, y_test)

10.95401491686797

In [290]:
# R squared value
print('R Squared Value:',r2_score(y_test,y_predict))

R Squared Value: 0.8357324423385166


In [294]:
for i in range(0,len(X_train.columns)):
    print(X_train.columns[i],':',rf_random.best_estimator_.feature_importances_[i])

cylinders : 0.17713620175153924
displacement : 0.23612751463348844
horsepower : 0.1766204145535035
weight : 0.22513354927147583
acceleration : 0.037576723644542044
model_year : 0.11477259924706362
origin : 0.03263299689838735


# Gradient Boosted Tree Regressor

In [295]:
from sklearn.ensemble import GradientBoostingRegressor 

In [296]:
#Simple model
gbrt=GradientBoostingRegressor(n_estimators=100) 
gbrt.fit(X_train, y_train)
y_pred=gbrt.predict(X_test) 

In [299]:
#R Squared Value for Train Data 
gbrt.score(X_train, y_train)

0.9786311413869238

In [301]:
# R Squared Value for Test Data
gbrt.score(X_test, y_test)
# Looks like this model is overfitting.

0.8419718203728827

In [305]:
param_grid={'n_estimators':[100], 
            'learning_rate': [0.1, 0.05, 0.02, 0.01],
            'max_depth':[6,4,6],
            'min_samples_leaf':[3,5,9,17],
            'max_features':[1.0,0.3,0.1] 
           }

n_jobs=4 

In [314]:
gb=GradientBoostingRegressor()
gbrt_random = RandomizedSearchCV(estimator = gb, param_distributions = param_grid,cv=10,n_iter =100,random_state=27,verbose=1 ,n_jobs = -1)
gbrt_random.fit(X_train, y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   12.0s finished


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [100], 'learning_rate': [0.1, 0.05, 0.02, 0.01], 'max_depth': [6, 4, 6], 'min_samples_leaf': [3, 5, 9, 17], 'max_features': [1.0, 0.3, 0.1]},
          pre_dispatch='2*n_jobs', random_state=27, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [319]:
# Training  R Squared
gbrt_random.best_estimator_.score(X_train,y_train)

0.9750267703652288

In [320]:
#Test R Squared
gbrt_random.best_estimator_.score(X_test,y_test)

0.8444390547950473

In [321]:
mean_squared_error(y_true=y_test,y_pred=gbrt.predict(X_test))

10.537948342107857