In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('fivethirtyeight')

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, mean_absolute_error, SCORERS, r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

## Load Data

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Explore Data

In [4]:
data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

### Check columns with missing values

In [6]:
missing = list(data.isnull().sum()[data.isnull().sum() > 0].index)
print(missing)

['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']


In [7]:
mis_cols_toberemoved = data.isnull().sum()[data.isnull().sum() > 0]/data.shape[0]
mis_cols_toberemoved = mis_cols_toberemoved > 0.5
mis_cols_toberemoved = list(mis_cols_toberemoved[mis_cols_toberemoved].index)
mis_cols_toberemoved

['Alley', 'PoolQC', 'Fence', 'MiscFeature']

### List of Columns with Categorical data

In [8]:
cat_cols = [col for col in data.columns if data[col].dtype == 'O' ]
print('Total Columns: {}'.format(len(data.columns)))
print('Categorical Columns: {}\n'.format(len(cat_cols)))
print(cat_cols)

Total Columns: 81
Categorical Columns: 43

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


### Check Cardinality of Categorical Columns & Remove columns with High Cardinality

In [9]:
print('Categorical Columns before Removal: {}'.format(len(cat_cols)))
card_col = [col for col in cat_cols if data[col].nunique()<10]
print('Categorical Columns after Removal: {}\n'.format(len(card_col)))
print(card_col)

Categorical Columns before Removal: 43
Categorical Columns after Removal: 40

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


### List Numerical Columns

In [10]:
num_cols = [col for col in data.columns if data[col].dtype in ('int64', 'float64') ]
num_cols.remove('Id')
print('Total Columns: {}'.format(len(data.columns)))
print('Numerical Columns: {}\n'.format(len(num_cols)))
print(num_cols)

Total Columns: 81
Numerical Columns: 37

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']


### Select Numerical Columns based on Correlation above 0.3

In [11]:
print('Numerical Columns before Removal: {}'.format(len(num_cols)))
num_cols = list(data[num_cols].corr()['SalePrice'][abs(data[num_cols].corr()['SalePrice'])>0.3].index)
print('Numerical Columns after Removal: {}\n'.format(len(num_cols)))
print(num_cols)

Numerical Columns before Removal: 37
Numerical Columns after Removal: 19

['LotFrontage', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'SalePrice']


### Select Required Columns

In [12]:
card_col = [col for col in card_col if col not in mis_cols_toberemoved]
my_cols = card_col + num_cols
len(my_cols)

55

### Impute Numerical Columns

In [13]:
impt = SimpleImputer(strategy = 'median')
X_imp = pd.DataFrame(impt.fit_transform(data[num_cols]))
X_imp.columns = data[num_cols].columns
print('Type of Imputed Numerical Columns: {}'.format(type(X_imp)))
print('Type of Imputed Numerical Columns: {}'.format(X_imp.shape))

Type of Imputed Numerical Columns: <class 'pandas.core.frame.DataFrame'>
Type of Imputed Numerical Columns: (1460, 19)


### Impute Categorical Columns

In [14]:
imptC = SimpleImputer(strategy = 'most_frequent')
X_impC = pd.DataFrame(imptC.fit_transform(data[card_col]))
X_impC.columns = data[card_col].columns
print('Type of Imputed Numerical Columns: {}'.format(type(X_impC)))
print('Type of Imputed Numerical Columns: {}'.format(X_impC.shape))

Type of Imputed Numerical Columns: <class 'pandas.core.frame.DataFrame'>
Type of Imputed Numerical Columns: (1460, 36)


### Join Numerical/Categoical Imputed Data and Split into Features and Output

In [15]:
sel_data = pd.concat([X_imp, X_impC], axis='columns')
y = sel_data['SalePrice']
X = sel_data.drop(['SalePrice'], axis=1)
print('Shape of Features Data {}'.format(X.shape))
print('Shape of Output Columns {}'.format(y.shape))

Shape of Features Data (1460, 54)
Shape of Output Columns (1460,)


### OneHotEncode Categorical Columns

In [16]:
X = pd.get_dummies(X)
print('Shape of Features Data after OneHotEncoding {}'.format(X.shape))

Shape of Features Data after OneHotEncoding (1460, 201)


### Train-Validation Split

In [17]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y,train_size=0.8, test_size=0.2,random_state=0)
print('Shape of Train Features Data {}'.format(X_train.shape))
print('Shape of Train Output Columns {}'.format(y_train.shape))
print('Shape of Validation Features Data {}'.format(X_valid.shape))
print('Shape of Validation Output Columns {}'.format(y_valid.shape))

Shape of Train Features Data (1168, 201)
Shape of Train Output Columns (1168,)
Shape of Validation Features Data (292, 201)
Shape of Validation Output Columns (292,)


### Function for Error Calculations

In [18]:
def errors(y_valid, X_valid):
    print('Mean Absulute Error (MAE):', mean_absolute_error(y_valid, model.predict(X_valid)))
    print('Mean Absulute Percentage Error (MAPE):{:.2%}' .format(mean_absolute_percentage_error(y_valid, model.predict(X_valid))))
    print('Mean Squared Error (MSE):', mean_squared_error(y_valid, model.predict(X_valid)))
    print('Root Mean Squared Error (RMSE):', mean_squared_error(y_valid, model.predict(X_valid), squared=False))

### <font color='green'> 1. Model (Decesion Tree)

In [19]:
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train)
errors(y_valid = y_valid, X_valid = X_valid)

Mean Absulute Error (MAE): 28296.777397260274
Mean Absulute Percentage Error (MAPE):16.04%
Mean Squared Error (MSE): 2118650375.7294521
Root Mean Squared Error (RMSE): 46028.79941655498


### <font color='green'> 2. Model (Random Forest)

In [20]:
model = RandomForestRegressor(n_estimators = 500, random_state=0)
model.fit(X_train, y_train)
errors(y_valid = y_valid, X_valid = X_valid)

Mean Absulute Error (MAE): 17503.999410958902
Mean Absulute Percentage Error (MAPE):9.83%
Mean Squared Error (MSE): 1073582467.4738611
Root Mean Squared Error (RMSE): 32765.56832215582


### <font color='green'> 3. Model (XG Boost)

In [21]:
model = XGBRegressor(n_estimators = 800, learning_rate = 0.04, n_jobs = -1,  random_state=0)
model.fit(X_train, y_train)
errors(y_valid = y_valid, X_valid = X_valid)

Mean Absulute Error (MAE): 17608.678483518837
Mean Absulute Percentage Error (MAPE):9.62%
Mean Squared Error (MSE): 1093187787.4436297
Root Mean Squared Error (RMSE): 33063.390440843024


### <font color='green'> 4. Model (LightGBM)

In [22]:
lgbm_parameters = {
    'metric': 'rmse', 
    'n_jobs': -1,
    'n_estimators': 50000,
    'reg_alpha': 10.924491968127692,
    'reg_lambda': 17.396730654687218,
    'colsample_bytree': 0.21497646795452627,
    'subsample': 0.7582562557431147,
    'learning_rate': 0.009985133666265425,
    'max_depth': 18,
    'num_leaves': 63,
    'min_child_samples': 27,
    'max_bin': 523,
    'cat_l2': 0.025083670064082797
}
model = LGBMRegressor(**lgbm_parameters,  random_state=0)
model.fit(X_train, y_train)
errors(y_valid = y_valid, X_valid = X_valid)

Mean Absulute Error (MAE): 17424.15738651568
Mean Absulute Percentage Error (MAPE):9.53%
Mean Squared Error (MSE): 819353946.5073445
Root Mean Squared Error (RMSE): 28624.35932046942


### <font color='green'> 5. Model (Use Cross Validation on LightGBM)

In [23]:
SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [24]:
cv_scores_mae = -1 * cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')
print("Average MAE score:", cv_scores_mae.mean())

cv_scores_mape = -1 * cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_percentage_error')
print("Average MAPE score: {:.2%}" .format(cv_scores_mape.mean()))

cv_scores_mse = -1 * cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
print("Average MSE score:", cv_scores_mse.mean())

cv_scores_rmse = -1 * cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error')
print("Average RMSE score:", cv_scores_rmse.mean())

Average MAE score: 17584.64269246599
Average MAPE score: 10.25%
Average MSE score: 767597782.5945097
Average RMSE score: 27444.233648421585


### <font color='green'> 6. Model (Hyperparameter Random Search with XGBoost)

In [20]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold, KFold
from datetime import datetime

In [21]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

A total number of combinations for the set of parameters above is a product of options for each parameter (3 x 5 x 3 x 3 x 3 = 405). It also needs to be multiplied by 5 to calculate a total number of data-fitting runs as we will be doing 5-fold cross-validation. That gets to be a large number in a hurry if you are using many parameters and lots of options, which is why **brute-force grid search takes a long time**.

Next we set up our stratified folds and grid search parameters.
The param_comb parameter declares how many different combinations should be picked randomly out of our total (405, see above). 
Definitely use a bigger number for param_comb.

In [32]:
# A parameter grid for XGBoost
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.04, 0.05, 0.06, 0.07],
        'n_estimators': [500, 600, 700, 800, 900, 1000, 1100, 1200]
        }
xgb = XGBRegressor(objective='reg:squarederror',random_state=0, n_jobs = -1)
folds = 5
param_comb = 10
kf = KFold(n_splits=folds, shuffle = True, random_state = 0)
random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='neg_mean_absolute_percentage_error', n_jobs=-1, cv=kf.split(X,y), verbose=3, random_state=0 )

start_time = timer(None)
random_search.fit(X, y)
timer(start_time)

Fitting 5 folds for each of 10 candidates, totalling 50 fits

 Time taken: 0 hours 1 minutes and 59.35 seconds.


In [34]:
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best hyperparameters:')
print(random_search.best_params_)

# print('\n All results:')
# print(random_search.cv_results_)
# print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
# print(random_search.best_score_ * 2 - 1)
# results = pd.DataFrame(random_search.cv_results_)
# results.to_csv('xgb-random-grid-search-results-01.csv', index=False)


 Best estimator:
XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, gamma=2, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.04, max_delta_step=0, max_depth=4,
             min_child_weight=10, missing=nan, monotone_constraints=None,
             n_estimators=1100, n_jobs=-1, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1.0,
             tree_method=None, validate_parameters=False, verbosity=None)

 Best hyperparameters:
{'subsample': 1.0, 'n_estimators': 1100, 'min_child_weight': 10, 'max_depth': 4, 'learning_rate': 0.04, 'gamma': 2, 'colsample_bytree': 0.6}


In [35]:
XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, gamma=2, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.04, max_delta_step=0, max_depth=4,
             min_child_weight=10, missing=np.nan, monotone_constraints=None,
             n_estimators=1100, n_jobs=-1, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1.0,
             tree_method=None, validate_parameters=False, verbosity=None)
model.fit(X_train, y_train)
errors(y_valid = y_valid, X_valid = X_valid)

Mean Absulute Error (MAE): 17424.15738651568
Mean Absulute Percentage Error (MAPE):9.53%
Mean Squared Error (MSE): 819353946.5073445
Root Mean Squared Error (RMSE): 28624.35932046942


### <font color='green'> 7. Model (Hyperparameter Grid Search with XGBoost)

In [22]:
# A parameter grid for XGBoost
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.04, 0.05, 0.06, 0.07],
        'n_estimators': [500, 600, 700, 800, 900, 1000, 1100, 1200]
        }
xgb = XGBRegressor(objective='reg:squarederror',random_state=0, n_jobs = -1, tree_method='gpu_hist', predictor='gpu_predictor', gpu_id=0)
folds = 3
param_comb = 100
kf = KFold(n_splits=folds, shuffle = True, random_state = 0)
grid_search = GridSearchCV(estimator=xgb, param_grid=params, scoring='neg_mean_absolute_percentage_error', n_jobs=-1, cv=kf.split(X,y), verbose=3)

start_time = timer(None)
grid_search.fit(X, y)
timer(start_time)

Fitting 3 folds for each of 12960 candidates, totalling 38880 fits

 Time taken: 31 hours 18 minutes and 32.36 seconds.


In [25]:
print('\n Best estimator:')
print(grid_search.best_estimator_)
print('\n Best hyperparameters:')
print(grid_search.best_params_)


 Best estimator:
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, gamma=0.5, gpu_id=0,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=-1, num_parallel_tree=1,
             predictor='gpu_predictor', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.8,
             tree_method='gpu_hist', validate_parameters=1, verbosity=None)

 Best hyperparameters:
{'colsample_bytree': 0.6, 'gamma': 0.5, 'learning_rate': 0.05, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 500, 'subsample': 0.8}


In [26]:
model = grid_search.best_estimator_
model.fit(X_train, y_train)
errors(y_valid = y_valid, X_valid = X_valid)

Mean Absulute Error (MAE): 16006.847589362158
Mean Absulute Percentage Error (MAPE):8.95%
Mean Squared Error (MSE): 744670318.0019029
Root Mean Squared Error (RMSE): 27288.64815270084


In [27]:
params = {'colsample_bytree': 0.6, 
          'gamma': 0.5, 
          'learning_rate': 0.05, 
          'max_depth': 4, 
          'min_child_weight': 1, 
          'n_estimators': 500, 
          'subsample': 0.8}
model = XGBRegressor(random_state=0, n_jobs = -1, **params)
model.fit(X_train, y_train)
errors(y_valid = y_valid, X_valid = X_valid)

Mean Absulute Error (MAE): 15916.798333154966
Mean Absulute Percentage Error (MAPE):8.82%
Mean Squared Error (MSE): 818551200.8758787
Root Mean Squared Error (RMSE): 28610.33381273065


### <font color='green'> 8. Model (Hyperparameter Random Search with LightGBM)

In [62]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
# param_test ={'num_leaves': sp_randint(6, 50), 
#              'min_child_samples': sp_randint(100, 500), 
#              'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
#              'subsample': sp_uniform(loc=0.2, scale=0.8), 
#              'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
#              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
#              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}
param_dist = {
            'num_leaves': [27, 31, 61, 81, 127, 197, 231, 275, 302],
            'bagging_fraction': [0.5, 0.7, 0.8, 0.9],
            'learning_rate': [0.01, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.3, 0.5],
            'min_data': [300, 400, 450, 500, 550, 650],
            'max_bin': [3, 5, 10, 12, 18, 20, 22],
            'boosting_type' : ['gbdt', 'dart'],
            'bagging_freq': [3, 9, 11, 15, 17, 23, 31],
            'max_depth': [3, 4, 5, 6, 7, 9, 11],       
            'feature_fraction': [0.5, 0.7, 0.8, 0.9],
            'lambda_l1': [0, 10, 20, 30, 40],
               }

# lgbm_parameters = {
# #    'metric': 'mean_absolute_percentage_error', 
# #    'n_jobs': -1,
#     'n_estimators': [1000,10000, 20000, 30000, 40000, 50000, 60000, 70000],
#     'reg_alpha': 10.924491968127692,
#     'reg_lambda': 17.396730654687218,
#     'colsample_bytree': 0.21497646795452627,
#     'subsample': 0.7582562557431147,
#     'learning_rate': 0.009985133666265425,
#     'max_depth': 18,
#     'num_leaves': 63,
#     'min_child_samples': 27,
#     'max_bin': 523,
#     'cat_l2': 0.025083670064082797
# }


lgbm = LGBMRegressor(max_depth=-1, metric='root_mean_squared_error', n_estimators=5000, random_state=0, n_jobs = -1)
folds = 5
param_comb = 100
kf = KFold(n_splits=folds, shuffle = True, random_state = 0)
random_search = RandomizedSearchCV(estimator = lgbm, param_distributions = param_dist, n_iter=param_comb, 
                                   scoring=''neg_root_mean_squared_error', n_jobs=-1, cv=kf.split(X,y), 
                                   verbose=3, random_state=0 )

start_time = timer(None)
random_search.fit(X, y)
timer(start_time)

Fitting 5 folds for each of 100 candidates, totalling 500 fits

 Time taken: 0 hours 0 minutes and 56.72 seconds.


In [64]:
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best hyperparameters:')
print(random_search.best_params_)


 Best estimator:
LGBMRegressor(bagging_fraction=0.9, bagging_freq=23, boosting_type='dart',
              feature_fraction=0.8, lambda_l1=0, learning_rate=0.05, max_bin=22,
              max_depth=7, metric='mean_absolute_percentage_error',
              min_data=300, n_estimators=5000, num_leaves=127, random_state=0)

 Best hyperparameters:
{'num_leaves': 127, 'min_data': 300, 'max_depth': 7, 'max_bin': 22, 'learning_rate': 0.05, 'lambda_l1': 0, 'feature_fraction': 0.8, 'boosting_type': 'dart', 'bagging_freq': 23, 'bagging_fraction': 0.9}


In [65]:
model = LGBMRegressor(bagging_fraction=0.9, bagging_freq=23, boosting_type='dart',
              feature_fraction=0.8, lambda_l1=0, learning_rate=0.05, max_bin=22,
              max_depth=7, metric='mean_absolute_percentage_error',
              min_data=300, n_estimators=5000, num_leaves=127, random_state=0)
model.fit(X_train, y_train)
errors(y_valid = y_valid, X_valid = X_valid)

Mean Absulute Error (MAE): 24028.110263325598
Mean Absulute Percentage Error (MAPE):13.23%
Mean Squared Error (MSE): 1713255003.5894022
Root Mean Squared Error (RMSE): 41391.484674862804


In [None]:
import warnings
warnings.filterwarnings('ignore')

In [77]:
from bayes_opt import BayesianOptimization