# Gradient Boosting (Regression)

In [2]:
import numpy as np
import pandas as pd
import sklearn.ensemble
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [3]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [4]:
data = pd.read_csv('after_lasso.csv')
data = data.drop(['Id', 'TotRmsAbvGrd'], axis = 1)
data.head()

Unnamed: 0,MSZoning,LotArea,LotShape,LotConfig,Neighborhood,Condition1,BldgType,OverallQual,OverallCond,YearBuilt,...,GarageCars,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,MiscVal,SaleType,SaleCondition,LogSalePrice
0,2,8450,3,4,5,0,0,7,5,104,...,2,0,61,0,0,0,0,8,0,12.247694
1,2,9600,3,2,24,1,0,6,6,77,...,2,298,0,0,0,0,0,8,0,12.109011
2,2,11250,0,4,5,0,0,7,5,102,...,2,0,42,0,0,0,0,8,0,12.317167
3,2,9550,0,0,6,0,0,7,5,19,...,3,0,35,272,0,0,0,8,1,11.849398
4,2,14260,0,2,15,0,0,8,5,101,...,3,192,84,0,0,0,0,8,0,12.429216


In [5]:
X= data.drop(['LogSalePrice'], axis = 1)
Y = np.ravel(data.LogSalePrice)

In [6]:
test_csv = pd.read_csv('after_lasso_test.csv')
test = test_csv.drop(['Id', 'TotRmsAbvGrd'], axis = 1)
test.head()

Unnamed: 0,MSZoning,LotArea,LotShape,LotConfig,Neighborhood,Condition1,BldgType,OverallQual,OverallCond,YearBuilt,...,GarageType,GarageCars,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,MiscVal,SaleType,SaleCondition
0,1,11622,3,4,12,1,0,5,6,56,...,0,1.0,140,0,0,0,120,0,9,0
1,2,14267,0,0,12,0,0,6,6,53,...,0,1.0,393,36,0,0,0,12500,9,0
2,2,13830,0,4,8,0,0,5,5,92,...,0,2.0,212,34,0,0,0,0,9,0
3,2,9978,0,4,8,0,0,6,6,93,...,0,2.0,360,36,0,0,0,0,9,0
4,2,5005,0,4,22,0,1,8,5,87,...,0,2.0,0,82,0,0,144,0,9,0


In [7]:
test.isnull().sum().sum()

0

In [8]:
X.shape

(1460, 39)

In [9]:
gbm = GradientBoostingRegressor() # creating an instance of the Class

In [10]:
gbm.fit(X,Y) # baseline model without any alteration of parameters

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [11]:
gbm.score(X,Y) # before train_test_split (baseline model)

0.9505385799456486

In [12]:
X_test, X_train, Y_test, Y_train = train_test_split(X, Y, test_size=0.3, random_state=42) # cross validation: splitting the data into train & test group

In [13]:
# setting parameters to use for Gradient Boosting
gbm.set_params(learning_rate=0.01, n_estimators=1500, verbose=1, subsample=0.7, warm_start=False, loss='ls', max_depth=2, max_features=2, min_impurity_decrease=0.01)
gbm.fit(X_train,Y_train) # fitting the model

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.1638           0.0014           42.43s
         2           0.1659           0.0020           23.25s
         3           0.1738           0.0016           17.30s
         4           0.1596           0.0021           13.86s
         5           0.1703           0.0011           11.95s
         6           0.1646           0.0009           11.46s
         7           0.1555           0.0014           10.43s
         8           0.1609           0.0014            9.48s
         9           0.1595           0.0013            8.84s
        10           0.1590           0.0008            8.14s
        20           0.1514           0.0016            4.62s
        30           0.1320           0.0006            3.45s
        40           0.1409           0.0003            3.03s
        50           0.1162           0.0003            2.60s
        60           0.1076           0.0013            2.48s
       

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='ls', max_depth=2,
                          max_features=2, max_leaf_nodes=None,
                          min_impurity_decrease=0.01, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=1500,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=0.7, tol=0.0001,
                          validation_fraction=0.1, verbose=1, warm_start=False)

In [14]:
print("The train set R^2 is %.3f" %(gbm.score(X_train, Y_train)))
print("The test set R^2 is %.3f" %(gbm.score(X_test,Y_test)))

The train set R^2 is 0.939
The test set R^2 is 0.874


In [15]:
train_error = (1 - gbm.score(X_train, Y_train))
test_error = (1 - gbm.score(X_test, Y_test))
print("The training error is: %.5f" %train_error)
print("The test      error is: %.5f" %test_error)

The training error is: 0.06093
The test      error is: 0.12575


In [16]:
# plt.rcParams['figure.figsize'] = (16,5)
# a, b = zip(*sorted_importance) # this splits the list of tuples into two lists
# pd.DataFrame({'feature_name':a, 'importance_score':b}).plot.bar(x='feature_name',y='importance_score', legend=None)
# plt.title('Feature Importance Plot of 1500-Tree GBM')

NameError: name 'sorted_importance' is not defined

In [None]:
# plt.rcParams['figure.figsize'] = (10,5)
# ## Top 10 features Bar Chart
# a, b = zip(*sorted_importance) # this splits the list of tuples into two lists
# pd.DataFrame({'feature_name':a[0:10], 'importance_score':b[0:10]}).plot.bar(x='feature_name',y='importance_score', legend=None)
# plt.title('Feature Importance Plot of 1500-Tree GBM')

# Running Grid Search to find optimal Hyperparameters

In [None]:
# gbm.set_params(learning_rate=0.01, n_estimators=1500, verbose=1, subsample=0.7, warm_start=False, loss='ls', max_depth=2, max_features=2, min_impurity_decrease=0.01)

In [None]:
# grid_para_gbm = [{
#     "learning_rate": [0.01],
#     "verbose": [1],
#     "subsample": [0.7],
#     "loss": ['ls'],
#     "n_estimators": range(500, 2000, 500),
#     "max_depth": range(2, 5),
#     "max_features": range(2, 5),  #  sqrt(number of samples) 
#     "min_impurity_decrease": np.linspace(0.01,0.05, 3), 
#     "min_samples_split": range(2,5),
#     "random_state": [42] }]

# gbm.set_params()
# grid_search_gbm = GridSearchCV(gbm, grid_para_gbm, cv = 5, scoring = 'r2', n_jobs = -1)
# %time grid_search_gbm.fit(X, Y)

In [17]:
# ## best parameters
# grid_search_gbm.best_params_

NameError: name 'grid_search_gbm' is not defined

In [None]:
# ## best score
# grid_search_gbm.best_score_

In [None]:
# ## The overall accuracy on the training set:
# grid_search_gbm.score(X_train, Y_train)

In [None]:
# ## The overall accuracy on the test set:
# grid_search_gbm.score(X_test, Y_test)

# Predictions

In [18]:
# setting parameters to use for Gradient Boosting after GridSearch
gbm.set_params(learning_rate=0.01, n_estimators=1500, verbose=1, subsample=0.7, warm_start=False, loss='ls', min_samples_split = 2, max_depth=4, max_features=4, min_impurity_decrease=0.01)
gbm.fit(X,Y) # fitting the model

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.1518           0.0019            7.85s
         2           0.1550           0.0022            6.80s
         3           0.1533           0.0018            5.70s
         4           0.1495           0.0019            5.67s
         5           0.1479           0.0018            5.22s
         6           0.1417           0.0019            4.92s
         7           0.1475           0.0018            4.81s
         8           0.1475           0.0019            5.18s
         9           0.1464           0.0016            5.74s
        10           0.1361           0.0017            5.76s
        20           0.1251           0.0013            4.33s
        30           0.1078           0.0014            4.02s
        40           0.0974           0.0011            4.24s
        50           0.0911           0.0008            3.73s
        60           0.0791           0.0009            3.43s
       

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='ls', max_depth=4,
                          max_features=4, max_leaf_nodes=None,
                          min_impurity_decrease=0.01, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=1500,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=0.7, tol=0.0001,
                          validation_fraction=0.1, verbose=1, warm_start=False)

In [19]:
test_y = gbm.predict(test)
test_y = np.exp(test_y)
test_y

array([117173.30999233, 168750.81749219, 173718.45885726, ...,
       154115.67973052, 120790.9289408 , 206855.87592695])

In [20]:
sorted_importance = sorted(zip(test.columns, gbm.feature_importances_), key=lambda t:t[1], reverse=True)
pd.DataFrame(sorted_importance, columns = ['FeatureName',  'Importance'])

Unnamed: 0,FeatureName,Importance
0,OverallQual,0.147584
1,GrLivArea,0.130318
2,TotalBsmtSF,0.089251
3,GarageCars,0.070691
4,YearBuilt,0.065789
5,ExterQual,0.062677
6,FullBath,0.058999
7,Fireplaces,0.049183
8,KitchenQual,0.040634
9,LotArea,0.040619


In [21]:
# Making a dataframe for predictions to import as csv
predictions = pd.concat([test_csv['Id'], pd.Series(test_y)], axis = 1)
predictions = predictions.rename(columns={0 : "SalePrice"})
predictions

Unnamed: 0,Id,SalePrice
0,1461,117173.309992
1,1462,168750.817492
2,1463,173718.458857
3,1464,194913.661552
4,1465,200099.496717
...,...,...
1454,2915,83900.047494
1455,2916,86341.543548
1456,2917,154115.679731
1457,2918,120790.928941


In [22]:
print("The train set R^2 is %.3f" %(gbm.score(X, Y)))

The train set R^2 is 0.964


In [23]:
#predictions.to_csv('Predictions_Gradient_Boosting.csv', index = False)

In [25]:
# Iowa_importance = pd.DataFrame([model.feature_importances_ for model in gbm_models], index=steps, columns=X_train.columns)
# Iowa_importance

In [None]:
# plt.rcParams['figure.figsize'] = (14,20)
# for feature in Iowa_importance.columns:
#     plt.plot(Iowa_importance.index, Iowa_importance[feature], label=feature)
    
# plt.legend(loc=1)    
# plt.xlabel('n_estimators')
# plt.ylabel('feature importance')