# Team 3 - Merchant Category Recommendation
## XGBoost Model Parameter Tuning and Training

### Team 3
- Vinicio De Sola
- Kevin Hanna
- Pri Nonis
- Bradley Nott

In [98]:
import numpy               as np
import matplotlib.pyplot   as plt
import pandas              as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics       import mean_squared_error

np.random.seed(0)


In [127]:
# Load engineered (expanded) training data
#train = pd.read_pickle('./Elo_kaggle/input/engineered_train.pkl')
train = pd.read_pickle('./Elo_kaggle/input/engineered_train_new.pkl')


In [128]:
# Simple (baseline) linear regression model RMSE: 15.059
# - mean of training target: -0.392
# - mean of testing target: -0.399

# Create baseline with expanded data set

# Extract features and target
X, y = train.iloc[:,1:].values, train.iloc[:,0].values

# Create a train/test split for a single baseline model
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2)

# Convert to XGBoost DMatrices
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Training mean
mean_train = np.mean(y_train)

# Use mean value to make baseline predictions on the test set
baseline_predictions = np.ones(y_test.shape) * mean_train

# Compute RMSE
rmse_baseline = mean_squared_error(y_test, baseline_predictions)
print("Baseline RMSE is {:.2f}".format(rmse_baseline))


Baseline RMSE is 14.41


In [129]:
# Use some default parameters for a baseline XGBoost model
params = {
    'eval_metric': 'rmse',
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective':'reg:linear',
}

# num_boost_round: number of boosting rounds/trees/estimators to build (1 = a single decision tree)
# - XGBoost has a method to find optimal number of rounds while training
# - early_stopping_round stops training if performance has not increased for n boosting rounds

num_boost_round = 999

In [130]:
# No cross-validation; single thread
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10,
)

[0]	Test-rmse:3.76525
Will train until Test-rmse hasn't improved in 10 rounds.
[1]	Test-rmse:3.70075
[2]	Test-rmse:3.66402
[3]	Test-rmse:3.64829
[4]	Test-rmse:3.6375
[5]	Test-rmse:3.63642
[6]	Test-rmse:3.63471
[7]	Test-rmse:3.63187
[8]	Test-rmse:3.62936
[9]	Test-rmse:3.62917
[10]	Test-rmse:3.6284
[11]	Test-rmse:3.62863
[12]	Test-rmse:3.62878
[13]	Test-rmse:3.62948
[14]	Test-rmse:3.62905
[15]	Test-rmse:3.6278
[16]	Test-rmse:3.62833
[17]	Test-rmse:3.62794
[18]	Test-rmse:3.62907
[19]	Test-rmse:3.62998
[20]	Test-rmse:3.62851
[21]	Test-rmse:3.62894
[22]	Test-rmse:3.62766
[23]	Test-rmse:3.62784
[24]	Test-rmse:3.63194
[25]	Test-rmse:3.63176
[26]	Test-rmse:3.63209
[27]	Test-rmse:3.63164
[28]	Test-rmse:3.63269
[29]	Test-rmse:3.63195
[30]	Test-rmse:3.63247
[31]	Test-rmse:3.63355
[32]	Test-rmse:3.6345
Stopping. Best iteration:
[22]	Test-rmse:3.62766



In [131]:
print("Best RMSE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))


Best RMSE: 3.63 with 23 rounds


In [132]:
# Cross validation to find 
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=0,
    nfold=5,
    metrics={'rmse'},
    early_stopping_rounds=10
)
cv_results # current parameters

# Rows: number of boosting trees

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,3.824151,0.009559,3.845768,0.038694
1,3.743784,0.009105,3.783899,0.038037
2,3.694852,0.009345,3.75217,0.037258
3,3.659074,0.008756,3.733303,0.036987
4,3.633878,0.010136,3.724616,0.037409
5,3.614419,0.011418,3.719167,0.036381
6,3.59745,0.009206,3.716252,0.037217
7,3.582534,0.007625,3.715262,0.037262
8,3.569413,0.008879,3.714629,0.036281
9,3.558153,0.007342,3.713058,0.036246


In [133]:
# Best RMSE score
cv_results['test-rmse-mean'].min()

3.7124574000000004

In [134]:
# Tune max_depth and min_child_weight
# - Tune in conjunction to manage bias-variance tradeoff

# max_depth: constrain tree complexity; guard against overfitting
# min_child_weight: minimum number of samples to create a new node in the tree (smaller = more likely to overfit)

gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(4,12)
    for min_child_weight in range(10,50,10)
]

In [135]:
# Define initial best params and MAE
min_rmse = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=0,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth,min_child_weight)
        
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))

CV with max_depth=4, min_child_weight=10


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


	RMSE 3.7025404 for 39 rounds
CV with max_depth=4, min_child_weight=20
	RMSE 3.6983932000000004 for 32 rounds
CV with max_depth=4, min_child_weight=30
	RMSE 3.7013074 for 37 rounds
CV with max_depth=4, min_child_weight=40
	RMSE 3.698410199999999 for 52 rounds
CV with max_depth=5, min_child_weight=10
	RMSE 3.7046080000000003 for 19 rounds
CV with max_depth=5, min_child_weight=20
	RMSE 3.7037670000000005 for 18 rounds
CV with max_depth=5, min_child_weight=30
	RMSE 3.698261 for 33 rounds
CV with max_depth=5, min_child_weight=40
	RMSE 3.698092 for 20 rounds
CV with max_depth=6, min_child_weight=10
	RMSE 3.7108552 for 13 rounds
CV with max_depth=6, min_child_weight=20
	RMSE 3.7049390000000004 for 15 rounds
CV with max_depth=6, min_child_weight=30
	RMSE 3.7062508000000003 for 11 rounds
CV with max_depth=6, min_child_weight=40
	RMSE 3.7001102 for 19 rounds
CV with max_depth=7, min_child_weight=10
	RMSE 3.7163618 for 9 rounds
CV with max_depth=7, min_child_weight=20
	RMSE 3.7135614000000006 fo

In [136]:
#params['max_depth'] = 4
#params['min_child_weight'] = 36

# new
params['max_depth'] = 5
params['min_child_weight'] = 40

In [137]:
gridsearch_params = [
    (subsample, colsample)
    #for subsample in [i/10. for i in range(7,11)]
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(3,11)]
]

In [138]:
min_rmse = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=0,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (subsample,colsample)
print("Best params: {}, {}, RMSEE: {}".format(best_params[0], best_params[1], min_rmse))

CV with subsample=1.0, colsample=1.0


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


	RMSE 3.698092 for 20 rounds
CV with subsample=1.0, colsample=0.9
	RMSE 3.6965822000000004 for 23 rounds
CV with subsample=1.0, colsample=0.8
	RMSE 3.6994266000000002 for 25 rounds
CV with subsample=1.0, colsample=0.7
	RMSE 3.7002021999999997 for 23 rounds
CV with subsample=1.0, colsample=0.6
	RMSE 3.6997838 for 29 rounds
CV with subsample=1.0, colsample=0.5
	RMSE 3.6987906 for 28 rounds
CV with subsample=1.0, colsample=0.4
	RMSE 3.6991278000000003 for 26 rounds
CV with subsample=1.0, colsample=0.3
	RMSE 3.7015826000000005 for 39 rounds
CV with subsample=0.9, colsample=1.0
	RMSE 3.697797 for 22 rounds
CV with subsample=0.9, colsample=0.9
	RMSE 3.7007144 for 27 rounds
CV with subsample=0.9, colsample=0.8
	RMSE 3.7003408 for 28 rounds
CV with subsample=0.9, colsample=0.7
	RMSE 3.7003708000000004 for 17 rounds
CV with subsample=0.9, colsample=0.6
	RMSE 3.7000066000000005 for 29 rounds
CV with subsample=0.9, colsample=0.5
	RMSE 3.6987544 for 20 rounds
CV with subsample=0.9, colsample=0.4
	

In [139]:
#params['subsample'] = 1.0
#params['colsample_bytree'] = 0.6

# new
params['subsample'] = 0.8
params['colsample_bytree'] = 0.5

In [141]:
%time
# This can take some time…
min_rmse = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    
    %time cv_results = xgb.cv(params,dtrain,num_boost_round=num_boost_round,seed=0,nfold=5,metrics=['rmse'],early_stopping_rounds=10)
    
    # Update best score
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds\n".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = eta
print("Best params: {}, RMSE: {}".format(best_params, min_rmse))

Wall time: 0 ns
CV with eta=0.3
Wall time: 22.4 s
	RMSE 3.6959676000000004 for 28 rounds

CV with eta=0.2


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.
  from ipykernel import kernelapp as app


Wall time: 26.7 s
	RMSE 3.6949778 for 36 rounds

CV with eta=0.1
Wall time: 1min 9s
	RMSE 3.6887582 for 123 rounds

CV with eta=0.05
Wall time: 1min 34s
	RMSE 3.6867959999999997 for 172 rounds

CV with eta=0.01
Wall time: 8min 20s
	RMSE 3.6839033999999997 for 998 rounds

CV with eta=0.005


KeyboardInterrupt: 

	RMSE 3.6839033999999997 for 998 rounds

Best params: 0.01, RMSE: 3.6839033999999997


In [142]:
#params['eta'] = .1

params['eta'] = .05

In [143]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

print("Best RMSE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))

[0]	Test-rmse:3.87139
Will train until Test-rmse hasn't improved in 10 rounds.
[1]	Test-rmse:3.85024
[2]	Test-rmse:3.83121
[3]	Test-rmse:3.81382
[4]	Test-rmse:3.79818
[5]	Test-rmse:3.78465
[6]	Test-rmse:3.77148
[7]	Test-rmse:3.75926
[8]	Test-rmse:3.74824
[9]	Test-rmse:3.73835
[10]	Test-rmse:3.72907
[11]	Test-rmse:3.72103
[12]	Test-rmse:3.71281
[13]	Test-rmse:3.70554
[14]	Test-rmse:3.69887
[15]	Test-rmse:3.69329
[16]	Test-rmse:3.68799
[17]	Test-rmse:3.68289
[18]	Test-rmse:3.67814
[19]	Test-rmse:3.67359
[20]	Test-rmse:3.67001
[21]	Test-rmse:3.66628
[22]	Test-rmse:3.66297
[23]	Test-rmse:3.65964
[24]	Test-rmse:3.65635
[25]	Test-rmse:3.65383
[26]	Test-rmse:3.65117
[27]	Test-rmse:3.64926
[28]	Test-rmse:3.64711
[29]	Test-rmse:3.64532
[30]	Test-rmse:3.64371
[31]	Test-rmse:3.64181
[32]	Test-rmse:3.64004
[33]	Test-rmse:3.63876
[34]	Test-rmse:3.63722
[35]	Test-rmse:3.63612
[36]	Test-rmse:3.63512
[37]	Test-rmse:3.63426
[38]	Test-rmse:3.63317
[39]	Test-rmse:3.63204
[40]	Test-rmse:3.63101
[41]	Test-

In [120]:
model.best_iteration + 1

127

In [146]:
num_boost_round = model.best_iteration + 1

best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

[0]	Test-rmse:3.87139
[1]	Test-rmse:3.85024
[2]	Test-rmse:3.83121
[3]	Test-rmse:3.81382
[4]	Test-rmse:3.79818
[5]	Test-rmse:3.78465
[6]	Test-rmse:3.77148
[7]	Test-rmse:3.75926
[8]	Test-rmse:3.74824
[9]	Test-rmse:3.73835
[10]	Test-rmse:3.72907
[11]	Test-rmse:3.72103
[12]	Test-rmse:3.71281
[13]	Test-rmse:3.70554
[14]	Test-rmse:3.69887
[15]	Test-rmse:3.69329
[16]	Test-rmse:3.68799
[17]	Test-rmse:3.68289
[18]	Test-rmse:3.67814
[19]	Test-rmse:3.67359
[20]	Test-rmse:3.67001
[21]	Test-rmse:3.66628
[22]	Test-rmse:3.66297
[23]	Test-rmse:3.65964
[24]	Test-rmse:3.65635
[25]	Test-rmse:3.65383
[26]	Test-rmse:3.65117
[27]	Test-rmse:3.64926
[28]	Test-rmse:3.64711
[29]	Test-rmse:3.64532
[30]	Test-rmse:3.64371
[31]	Test-rmse:3.64181
[32]	Test-rmse:3.64004
[33]	Test-rmse:3.63876
[34]	Test-rmse:3.63722
[35]	Test-rmse:3.63612
[36]	Test-rmse:3.63512
[37]	Test-rmse:3.63426
[38]	Test-rmse:3.63317
[39]	Test-rmse:3.63204
[40]	Test-rmse:3.63101
[41]	Test-rmse:3.62985
[42]	Test-rmse:3.629
[43]	Test-rmse:3.62797


In [154]:
# Number of Trees (best model)
print('Best model used {} decision trees'.format(num_boost_round))

Best model used 181 decision trees


In [156]:
print('Trees: {}'.format(num_boost_round))
print()
print('Final parameters\n')
print(params)
print()
print('RMSE of predictions on test set: {}'.format(mean_squared_error(y_test, best_model.predict(dtest))**0.5))

Trees: 181

Final parameters

{'eval_metric': 'rmse', 'max_depth': 5, 'min_child_weight': 40, 'eta': 0.05, 'subsample': 0.8, 'colsample_bytree': 0.5, 'objective': 'reg:linear'}

RMSE of predictions on test set: 3.600634349650864


In [152]:
# save model
best_model.save_model("eng_train_new_model.model")

In [157]:
# Loading model for later use
loaded_model = xgb.Booster()
loaded_model.load_model("eng_train_new_model.model")

# Made predictions with loaded model
loaded_model.predict(dtest)

print(mean_squared_error(y_test, loaded_model.predict(dtest))**0.5)
print(mean_squared_error(y_test, loaded_model.predict(dtest)))

3.600634349595675
12.96456771948827
