In [44]:
import pandas as pd # load and manipulate data and for One-hot Encoding
import numpy as np # calculate the mean and standard deviation
import xgboost as xgb #XGBoost stuff
from sklearn.model_selection import train_test_split # split data into training and testing sets
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer # for scoring during..
from sklearn.model_selection import GridSearchCV # cross validation
from sklearn.metrics import confusion_matrix # creates a confusion matrix
from sklearn.metrics import plot_confusion_matrix # draws a confusion matrix
from xgboost import XGBRegressor

### Import the data

In [4]:
df = pd.read_csv('training_WeeklyAggregate.csv')

In [5]:
df.head()

Unnamed: 0,sourceid,dstid,dow,mean_travel_time
0,10,241,3,2334.43
1,10,612,5,1529.83
2,10,905,4,1390.04
3,10,407,7,157.91
4,10,603,4,1781.67


In [7]:
df.dtypes

sourceid              int64
dstid                 int64
dow                   int64
mean_travel_time    float64
dtype: object

In [8]:
len(df.loc[df['mean_travel_time'] == ' ']) 

0

In [11]:
len(df.loc[df['sourceid'] == ' ']) 

0

In [12]:
len(df.loc[df['dow'] == ' ']) 

0

In [14]:
X,y = df.iloc[:,:-1],df.iloc[:,-1]

In [15]:
X.head()

Unnamed: 0,sourceid,dstid,dow
0,10,241,3
1,10,612,5
2,10,905,4
3,10,407,7
4,10,603,4


### Training and testing round # 1

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
x_val = pd.read_csv('testing_dataset.csv')

In [50]:
# regressor with hyperparameter tuning

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

regressor = XGBRegressor()

params = {
    'n_estimators' : [1000,1500,2000],
    'max_depth': [1,2,3,None],
    'learning_rate': [0.1,0.2,0.3,0.4,0.5], #eta
    'reg_lambda': [0.1,0.5,1],
    }


#grid_regressor = GridSearchCV(estimator = regressor, param_grid = params, cv = 5, verbose = 1, n_jobs = -1)
regressor.fit(X_train,y_train,eval_metric='rmse', verbose=True,  ## 'auc' to evaluate how well the predictions are made
                   eval_set=[(X_test, y_test)])

[0]	validation_0-rmse:1366.88367
[1]	validation_0-rmse:1087.12647
[2]	validation_0-rmse:911.90649
[3]	validation_0-rmse:814.45917
[4]	validation_0-rmse:751.53375
[5]	validation_0-rmse:721.27545
[6]	validation_0-rmse:704.51917
[7]	validation_0-rmse:691.94452
[8]	validation_0-rmse:684.44141
[9]	validation_0-rmse:680.86908
[10]	validation_0-rmse:676.51910
[11]	validation_0-rmse:674.60699
[12]	validation_0-rmse:670.42548
[13]	validation_0-rmse:668.92847
[14]	validation_0-rmse:668.38403
[15]	validation_0-rmse:663.15448
[16]	validation_0-rmse:662.86700
[17]	validation_0-rmse:659.72583
[18]	validation_0-rmse:660.13306
[19]	validation_0-rmse:660.49304
[20]	validation_0-rmse:659.90485
[21]	validation_0-rmse:658.04376
[22]	validation_0-rmse:657.69513
[23]	validation_0-rmse:658.77057
[24]	validation_0-rmse:658.15210
[25]	validation_0-rmse:658.09058
[26]	validation_0-rmse:658.65533
[27]	validation_0-rmse:658.69849
[28]	validation_0-rmse:657.79358
[29]	validation_0-rmse:655.89844
[30]	validation_0-

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [51]:
y_pred = regressor.predict(X_test)

In [52]:
from sklearn.metrics import mean_squared_error

In [53]:
mse = mean_squared_error(y_test,y_pred)
print("MSE: %0.2f" % (mse))

MSE: 404395.71


In [54]:
rmse = np.sqrt(mse)
print("RMSE: %0.2f" % (rmse))

RMSE: 635.92


### Training and testing round # 2 - With hyperparameters optimization round # 1

In [56]:
param_grid = {
    'max_depth': [3,4,5],
    'learning_rate': [0.1, 0.01, 0.05],
    'gamma': [0,0.25,1.0],
    'reg_lambda': [0,1.0,10.0],
    'scale_pos_weight': [1,3,5]
}
optimal_params = GridSearchCV(
    estimator=xgb.XGBRegressor(objective='reg:squarederror',
                               seed=42,
                               subsample=0.9,
                               colsample_bytree=0.5),
    param_grid=param_grid,
    scoring='max_error', ##see scikitlearn website
    verbose=0, #NOTE: If you want to see what Grid Search is doing, set verbose=2
    n_jobs=10,
    cv=3 # cv stands for cross validation, if you want to increase the cross validation, increase cv
)

optimal_params.fit(X_train,
                  y_train,
                  early_stopping_rounds=10,
                  eval_metric='rmse',
                  eval_set=[(X_test,y_test)],
                  verbose=False)
print(optimal_params.best_params_)

{'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'reg_lambda': 0, 'scale_pos_weight': 1}


In [57]:
clf_xgb = xgb.XGBRegressor(seed=42,
                            objective='reg:squarederror',
                            gamma=0,
                            learn_rate=0.1,
                            max_depth=5,
                            reg_lambda=0,
                            scale_pos_weight=1,
                            subsample=0.9,
                            colsample_bytree=0.5)
clf_xgb.fit(X_train,
           y_train,
           verbose=True,
           early_stopping_rounds=10,
           eval_metric='rmse',
           eval_set=[(X_test,y_test)])

[0]	validation_0-rmse:1371.57739
[1]	validation_0-rmse:1094.23975
[2]	validation_0-rmse:925.10120
[3]	validation_0-rmse:825.24170
[4]	validation_0-rmse:771.97351
[5]	validation_0-rmse:739.46576
[6]	validation_0-rmse:723.90808
[7]	validation_0-rmse:715.58331
[8]	validation_0-rmse:710.54047
[9]	validation_0-rmse:707.65778
[10]	validation_0-rmse:704.53400
[11]	validation_0-rmse:703.40448
[12]	validation_0-rmse:700.66071
[13]	validation_0-rmse:699.41248
[14]	validation_0-rmse:699.81787
[15]	validation_0-rmse:699.53784
[16]	validation_0-rmse:698.91223
[17]	validation_0-rmse:698.84802
[18]	validation_0-rmse:698.31982
[19]	validation_0-rmse:696.45648
[20]	validation_0-rmse:696.42602
[21]	validation_0-rmse:695.25116
[22]	validation_0-rmse:694.93298
[23]	validation_0-rmse:694.66534
[24]	validation_0-rmse:694.54840
[25]	validation_0-rmse:693.82764
[26]	validation_0-rmse:694.13861
[27]	validation_0-rmse:692.81397
[28]	validation_0-rmse:692.58765
[29]	validation_0-rmse:691.73932
[30]	validation_0-

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='', learn_rate=0.1,
             learning_rate=0.300000012, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=0, scale_pos_weight=1, seed=42,
             subsample=0.9, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [59]:
y_pred = clf_xgb.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
print("MSE: %0.2f" % (mse))

MSE: 468355.80


### Training and testing round # 3 - With hyperparameters optimization round # 2

In [60]:
param_grid = {
    'n_estimators' : [1000,1500,2000],
    'max_depth': [3,4,5],
    'learning_rate': [0.1, 0.01, 0.05],
    'gamma': [0,0.25,1.0],
    'reg_lambda': [0.1,0.5,1],
    'scale_pos_weight': [1,3,5]
}
optimal_params = GridSearchCV(
    estimator=xgb.XGBRegressor(#objective='reg:squarederror',
                               #seed=42,
                               subsample=0.9,
                               colsample_bytree=0.5),
    param_grid=param_grid,
    #scoring='max_error', ##see scikitlearn website
    verbose=0, #NOTE: If you want to see what Grid Search is doing, set verbose=2
    n_jobs=10,
    cv=5 # cv stands for cross validation, if you want to increase the cross validation, increase cv
)

optimal_params.fit(X_train,
                  y_train,
                  early_stopping_rounds=10,
                  eval_metric='rmse',
                  eval_set=[(X_test,y_test)],
                  verbose=False)
print(optimal_params.best_params_)

{'gamma': 0, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 1000, 'reg_lambda': 1, 'scale_pos_weight': 1}


In [67]:
clf_xgb = xgb.XGBRegressor(n_estimators=1000,
                            #seed=42,
                            gamma=0,
                            learning_rate=0.05,
                            max_depth=5,
                            reg_lambda=1,
                            scale_pos_weight=1,
                            #subsample=0.9,
                            #colsample_bytree=0.5
                          )
clf_xgb.fit(X_train,
           y_train,
           verbose=True,
           #early_stopping_rounds=10,
           eval_metric='rmse',
           eval_set=[(X_test,y_test)])

[0]	validation_0-rmse:1728.78442
[1]	validation_0-rmse:1659.20398
[2]	validation_0-rmse:1593.38648
[3]	validation_0-rmse:1531.56482
[4]	validation_0-rmse:1473.37915
[5]	validation_0-rmse:1418.75293
[6]	validation_0-rmse:1367.44336
[7]	validation_0-rmse:1319.38245
[8]	validation_0-rmse:1274.42053
[9]	validation_0-rmse:1232.08411
[10]	validation_0-rmse:1192.51050
[11]	validation_0-rmse:1155.52441
[12]	validation_0-rmse:1120.61743
[13]	validation_0-rmse:1088.31665
[14]	validation_0-rmse:1058.35315
[15]	validation_0-rmse:1030.25854
[16]	validation_0-rmse:1004.47009
[17]	validation_0-rmse:980.38605
[18]	validation_0-rmse:957.69958
[19]	validation_0-rmse:937.01148
[20]	validation_0-rmse:917.90912
[21]	validation_0-rmse:899.16339
[22]	validation_0-rmse:882.50006
[23]	validation_0-rmse:867.60931
[24]	validation_0-rmse:852.94367
[25]	validation_0-rmse:840.31610
[26]	validation_0-rmse:827.75946
[27]	validation_0-rmse:816.63995
[28]	validation_0-rmse:806.65918
[29]	validation_0-rmse:796.78931
[30

[244]	validation_0-rmse:652.39209
[245]	validation_0-rmse:652.18219
[246]	validation_0-rmse:652.22461
[247]	validation_0-rmse:652.04596
[248]	validation_0-rmse:652.02612
[249]	validation_0-rmse:651.86755
[250]	validation_0-rmse:651.80017
[251]	validation_0-rmse:651.59930
[252]	validation_0-rmse:651.42523
[253]	validation_0-rmse:651.39813
[254]	validation_0-rmse:651.01862
[255]	validation_0-rmse:650.92822
[256]	validation_0-rmse:650.79804
[257]	validation_0-rmse:650.74750
[258]	validation_0-rmse:650.74371
[259]	validation_0-rmse:650.64087
[260]	validation_0-rmse:650.56189
[261]	validation_0-rmse:650.61969
[262]	validation_0-rmse:650.60486
[263]	validation_0-rmse:650.40936
[264]	validation_0-rmse:650.18854
[265]	validation_0-rmse:649.91363
[266]	validation_0-rmse:649.79395
[267]	validation_0-rmse:649.74628
[268]	validation_0-rmse:649.52435
[269]	validation_0-rmse:649.56970
[270]	validation_0-rmse:649.56775
[271]	validation_0-rmse:649.62390
[272]	validation_0-rmse:649.58112
[273]	validati

[485]	validation_0-rmse:639.82507
[486]	validation_0-rmse:639.72723
[487]	validation_0-rmse:639.75403
[488]	validation_0-rmse:639.78406
[489]	validation_0-rmse:639.77515
[490]	validation_0-rmse:639.57764
[491]	validation_0-rmse:639.49597
[492]	validation_0-rmse:639.45081
[493]	validation_0-rmse:639.23840
[494]	validation_0-rmse:639.13464
[495]	validation_0-rmse:639.14838
[496]	validation_0-rmse:639.03076
[497]	validation_0-rmse:638.96252
[498]	validation_0-rmse:638.92444
[499]	validation_0-rmse:638.73364
[500]	validation_0-rmse:638.75183
[501]	validation_0-rmse:638.77588
[502]	validation_0-rmse:638.78839
[503]	validation_0-rmse:638.73041
[504]	validation_0-rmse:638.65057
[505]	validation_0-rmse:638.74640
[506]	validation_0-rmse:638.72754
[507]	validation_0-rmse:638.71155
[508]	validation_0-rmse:638.72266
[509]	validation_0-rmse:638.75324
[510]	validation_0-rmse:638.74976
[511]	validation_0-rmse:638.65552
[512]	validation_0-rmse:638.63287
[513]	validation_0-rmse:638.67084
[514]	validati

[726]	validation_0-rmse:633.72339
[727]	validation_0-rmse:633.69977
[728]	validation_0-rmse:633.72485
[729]	validation_0-rmse:633.77441
[730]	validation_0-rmse:633.72162
[731]	validation_0-rmse:633.66577
[732]	validation_0-rmse:633.62695
[733]	validation_0-rmse:633.65063
[734]	validation_0-rmse:633.62976
[735]	validation_0-rmse:633.65924
[736]	validation_0-rmse:633.61273
[737]	validation_0-rmse:633.57953
[738]	validation_0-rmse:633.67792
[739]	validation_0-rmse:633.74805
[740]	validation_0-rmse:633.69299
[741]	validation_0-rmse:633.73651
[742]	validation_0-rmse:633.74463
[743]	validation_0-rmse:633.75330
[744]	validation_0-rmse:633.77325
[745]	validation_0-rmse:633.68634
[746]	validation_0-rmse:633.72290
[747]	validation_0-rmse:633.82581
[748]	validation_0-rmse:633.84424
[749]	validation_0-rmse:633.80518
[750]	validation_0-rmse:633.73724
[751]	validation_0-rmse:633.81598
[752]	validation_0-rmse:633.80695
[753]	validation_0-rmse:633.79565
[754]	validation_0-rmse:633.78772
[755]	validati

[966]	validation_0-rmse:631.53137
[967]	validation_0-rmse:631.56250
[968]	validation_0-rmse:631.55810
[969]	validation_0-rmse:631.54242
[970]	validation_0-rmse:631.56586
[971]	validation_0-rmse:631.62183
[972]	validation_0-rmse:631.65460
[973]	validation_0-rmse:631.67322
[974]	validation_0-rmse:631.68054
[975]	validation_0-rmse:631.69312
[976]	validation_0-rmse:631.70129
[977]	validation_0-rmse:631.59381
[978]	validation_0-rmse:631.54273
[979]	validation_0-rmse:631.54364
[980]	validation_0-rmse:631.58118
[981]	validation_0-rmse:631.64935
[982]	validation_0-rmse:631.56744
[983]	validation_0-rmse:631.59906
[984]	validation_0-rmse:631.52130
[985]	validation_0-rmse:631.55750
[986]	validation_0-rmse:631.59424
[987]	validation_0-rmse:631.53723
[988]	validation_0-rmse:631.54437
[989]	validation_0-rmse:631.64624
[990]	validation_0-rmse:631.64508
[991]	validation_0-rmse:631.66101
[992]	validation_0-rmse:631.67841
[993]	validation_0-rmse:631.65869
[994]	validation_0-rmse:631.65301
[995]	validati

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

#### Prediction of "testing_dataset " (provided by MEC)

In [69]:
y_new = regressor.predict(x_val)
y_new

array([1508.5464, 1157.1948, 1590.6996, ..., 1036.1084, 1382.0481,
       1411.7349], dtype=float32)

In [73]:
y_pred = pd.DataFrame(y_new)    
# y_test = pd.DataFrame(y_test)
y_pred.columns = ['Mean_travel_time']
valset_table = pd.concat([x_val,y_pred], axis=1)
valset_table
#testset_vs_pred.columns = ['Actual', 'Predicted']
#testset_vs_pred.to_csv('Universal_XGBoost_'+attribute+'_test_evaluation'+'.csv')
#testset_vs_pred

Unnamed: 0,sourceid,dstid,dow,Mean_travel_time
0,10,950,2,1508.546387
1,10,889,2,1157.194824
2,260,145,2,1590.699585
3,260,932,7,1491.782104
4,41,808,2,949.204285
...,...,...,...,...
1957,712,435,7,1311.291016
1958,200,356,1,1766.437622
1959,200,716,3,1036.108398
1960,657,549,5,1382.048096
