## Gradient Boosting Machines (XGBoost)

Notebook with implementation of the XGBoost algorithm to predict victory in Dota 2

-------------------------------------------------------------------------------------------------------------------------------

#### Note that this is my first version of the XGBoost implementation.

#### The version that I actually used can be found in the *prediction-explanation-SHAP* directory, together with the SHAP technique implementation!

#### If you are running this code on Google Colab, you need to first upload the following feature file: *dota2_regular_features.csv*

## Regular matches

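Useful pandas attributes and methods for exploring the data and deciding on preprocessing steps before feeding the data into the algorithm (`df` is the DataFrame):

* `df.columns`: the names of the columns (i.e., features)
* `df.dtypes`: the data type of each column
* `df.head()`: the first few rows of the data
* `df.info()`: a concise summary (number of rows and columns, dtypes, memory usage)
* `df.describe()`: summary statistics for each numeric column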

In [None]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, auc
import statistics as st

In [None]:
# NOTE: uncomment this cell if you are running this code on a local machine. Please adjust the following variables to correctly point to the feature file location on your machine

# # Set directory for the regular match group
# cwd = os.getcwd()
# root_directory = os.path.dirname(cwd)

# regular_data_dir = root_directory + "\\model_features_pre-match\\regular\\"
# path_to_features = regular_data_dir + "dota2_regular_features.csv"

In [None]:
# NOTE: this cell is meant for Google Colab runs.

# Location of the regular-match feature file. The file must already be uploaded to the current Colab session.
path_to_features = "/content/dota2_regular_features.csv"

In [None]:
# Load the model feature file (CSV) into a DataFrame
feature_regular_df = pd.read_csv(filepath_or_buffer=path_to_features)

//content/dota2_regular_features.csv


In [None]:
# Show the column names of the loaded feature table (displayed as the cell output)
feature_regular_df.columns

Index(['match_id', 'rad_hero_1', 'rad_hero_2', 'rad_hero_3', 'rad_hero_4',
       'rad_hero_5', 'rad_hero_6', 'rad_hero_7', 'rad_hero_8', 'rad_hero_9',
       ...
       'hero_damagem_hp_hero3_d', 'hero_damagem_hp_hero4_d',
       'hero_damagem_hp_hero5_d', 'healingm_hp_hero1_d', 'healingm_hp_hero2_d',
       'healingm_hp_hero3_d', 'healingm_hp_hero4_d', 'healingm_hp_hero5_d',
       'rad_first_pick', 'win_label'],
      dtype='object', length=459)

In [None]:
# Drop the match_id column so it is not used as a model feature.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises a KeyError.
feature_regular_df = feature_regular_df.drop(columns=['match_id'], errors='ignore')

In [None]:
# Show the dtype of every column that remains after match_id was dropped
feature_regular_df.dtypes

rad_hero_1               int64
rad_hero_2               int64
rad_hero_3               int64
rad_hero_4               int64
rad_hero_5               int64
                        ...   
healingm_hp_hero3_d    float64
healingm_hp_hero4_d    float64
healingm_hp_hero5_d    float64
rad_first_pick         float64
win_label                int64
Length: 458, dtype: object

In [None]:
# Preview the first rows of the feature table (note the NaN entries in the per-hero statistics)
feature_regular_df.head()

Unnamed: 0,rad_hero_1,rad_hero_2,rad_hero_3,rad_hero_4,rad_hero_5,rad_hero_6,rad_hero_7,rad_hero_8,rad_hero_9,rad_hero_10,rad_hero_11,rad_hero_12,rad_hero_13,rad_hero_14,rad_hero_15,rad_hero_16,rad_hero_17,rad_hero_18,rad_hero_19,rad_hero_20,rad_hero_21,rad_hero_22,rad_hero_23,rad_hero_25,rad_hero_26,rad_hero_27,rad_hero_28,rad_hero_29,rad_hero_30,rad_hero_31,rad_hero_32,rad_hero_33,rad_hero_34,rad_hero_35,rad_hero_36,rad_hero_37,rad_hero_38,rad_hero_39,rad_hero_40,rad_hero_41,...,xpm_hp_hero3_d,xpm_hp_hero4_d,xpm_hp_hero5_d,goldm_hp_hero1_d,goldm_hp_hero2_d,goldm_hp_hero3_d,goldm_hp_hero4_d,goldm_hp_hero5_d,deathsm_hp_hero1_d,deathsm_hp_hero2_d,deathsm_hp_hero3_d,deathsm_hp_hero4_d,deathsm_hp_hero5_d,damagem_hp_hero1_d,damagem_hp_hero2_d,damagem_hp_hero3_d,damagem_hp_hero4_d,damagem_hp_hero5_d,killsm_hp_hero1_d,killsm_hp_hero2_d,killsm_hp_hero3_d,killsm_hp_hero4_d,killsm_hp_hero5_d,assistsm_hp_hero1_d,assistsm_hp_hero2_d,assistsm_hp_hero3_d,assistsm_hp_hero4_d,assistsm_hp_hero5_d,hero_damagem_hp_hero1_d,hero_damagem_hp_hero2_d,hero_damagem_hp_hero3_d,hero_damagem_hp_hero4_d,hero_damagem_hp_hero5_d,healingm_hp_hero1_d,healingm_hp_hero2_d,healingm_hp_hero3_d,healingm_hp_hero4_d,healingm_hp_hero5_d,rad_first_pick,win_label
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,509.0,,,263.0,246.777778,381.0,,,0.16282,0.109091,,,,,,,,,0.148492,0.096481,0.152727,,,6.5e-05,0.322785,0.087273,,,0.0,0.0,0.0,,,0.0,0.0,0.0,,,0.0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,260.0,500.0,516.0,333.333333,498.0,189.0,624.333333,514.333333,0.120243,0.219178,0.020243,0.083682,0.236911,,,,0.0,,0.147123,0.191728,0.027397,0.160772,0.190687,7.9e-05,0.284324,0.109589,0.191673,0.358843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,,,,293.0,,,,,,,,,,,,,,,0.113841,,,,,7.4e-05,,,,,0.0,,,,,0.0,,,,,1.0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,...,337.0,354.5,312.0,,156.0,236.0,305.5,358.0,0.198741,0.119245,0.206938,0.274757,,,,,0.0,,,0.019874,0.158993,0.066858,0.137378,,0.139119,0.258364,0.246686,0.274757,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,1.0,1


In [None]:
# Summary statistics per column; a `count` below the total number of rows means the column has missing values
feature_regular_df.describe()

Unnamed: 0,rad_hero_1,rad_hero_2,rad_hero_3,rad_hero_4,rad_hero_5,rad_hero_6,rad_hero_7,rad_hero_8,rad_hero_9,rad_hero_10,rad_hero_11,rad_hero_12,rad_hero_13,rad_hero_14,rad_hero_15,rad_hero_16,rad_hero_17,rad_hero_18,rad_hero_19,rad_hero_20,rad_hero_21,rad_hero_22,rad_hero_23,rad_hero_25,rad_hero_26,rad_hero_27,rad_hero_28,rad_hero_29,rad_hero_30,rad_hero_31,rad_hero_32,rad_hero_33,rad_hero_34,rad_hero_35,rad_hero_36,rad_hero_37,rad_hero_38,rad_hero_39,rad_hero_40,rad_hero_41,...,xpm_hp_hero3_d,xpm_hp_hero4_d,xpm_hp_hero5_d,goldm_hp_hero1_d,goldm_hp_hero2_d,goldm_hp_hero3_d,goldm_hp_hero4_d,goldm_hp_hero5_d,deathsm_hp_hero1_d,deathsm_hp_hero2_d,deathsm_hp_hero3_d,deathsm_hp_hero4_d,deathsm_hp_hero5_d,damagem_hp_hero1_d,damagem_hp_hero2_d,damagem_hp_hero3_d,damagem_hp_hero4_d,damagem_hp_hero5_d,killsm_hp_hero1_d,killsm_hp_hero2_d,killsm_hp_hero3_d,killsm_hp_hero4_d,killsm_hp_hero5_d,assistsm_hp_hero1_d,assistsm_hp_hero2_d,assistsm_hp_hero3_d,assistsm_hp_hero4_d,assistsm_hp_hero5_d,hero_damagem_hp_hero1_d,hero_damagem_hp_hero2_d,hero_damagem_hp_hero3_d,hero_damagem_hp_hero4_d,hero_damagem_hp_hero5_d,healingm_hp_hero1_d,healingm_hp_hero2_d,healingm_hp_hero3_d,healingm_hp_hero4_d,healingm_hp_hero5_d,rad_first_pick,win_label
count,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,36348.0,...,15085.0,14937.0,14886.0,26166.0,14823.0,15085.0,14937.0,14886.0,14025.0,14302.0,14101.0,14112.0,14030.0,9193.0,9584.0,9792.0,9375.0,9216.0,26166.0,14823.0,15085.0,14937.0,14886.0,26166.0,14823.0,15085.0,14937.0,14886.0,26166.0,14823.0,15085.0,14937.0,14886.0,26166.0,14823.0,15085.0,14937.0,14886.0,35873.0,36348.0
mean,0.02996,0.040387,0.048476,0.0178,0.047238,0.032271,0.086937,0.075107,0.084269,0.045009,0.062754,0.031776,0.07464,0.014169,0.054528,0.077391,0.054666,0.036948,0.052217,0.080335,0.034362,0.024183,0.046578,0.054831,0.060636,0.05871,0.053896,0.058985,0.063965,0.053703,0.015874,0.027814,0.033124,0.01813,0.021872,0.027209,0.041268,0.056344,0.032739,0.066606,...,440.534214,436.354944,447.31983,401.451846,414.160639,403.16058,398.301713,413.2222,0.129736,0.130708,0.13019,0.128824,0.127985,0.127723,0.124177,0.117199,0.124984,0.127571,0.136876,0.145629,0.141165,0.137129,0.145324,8.2e-05,0.293149,0.29588,0.297344,0.294667,238.354814,222.174944,214.430981,213.72977,220.123572,9.794132,13.103732,14.093069,13.228779,13.37576,0.489449,0.504897
std,0.17048,0.196869,0.214772,0.132226,0.21215,0.176722,0.281747,0.263568,0.277794,0.207328,0.242524,0.175406,0.262813,0.118187,0.227061,0.267214,0.22733,0.188638,0.222468,0.271814,0.18216,0.153619,0.210735,0.227653,0.238665,0.235085,0.225815,0.235601,0.244694,0.225434,0.124991,0.164443,0.178963,0.133424,0.146267,0.162695,0.198912,0.230588,0.177955,0.249342,...,142.494085,144.290639,141.263959,105.38553,145.645941,142.141046,143.331676,143.755402,0.065387,0.065359,0.064716,0.066673,0.066158,0.08429,0.083445,0.084437,0.083298,0.082965,0.07738,0.100726,0.098473,0.096964,0.096963,3e-05,0.129176,0.128003,0.128388,0.13123,228.319073,248.50648,240.06813,241.346798,243.393764,29.75595,40.327858,41.77331,42.094062,42.210627,0.499896,0.499983
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,53.0,80.0,4.0,54.0,46.0,60.0,71.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,325.857143,320.0,332.142857,328.824405,290.333333,285.0,280.0,292.0,0.086815,0.087639,0.087808,0.085047,0.084304,0.082083,0.077757,0.071767,0.07979,0.081991,0.088495,0.07371,0.071239,0.070157,0.075781,6.3e-05,0.206947,0.212181,0.211268,0.208705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,434.0,423.0,443.0,391.541667,397.5,380.5,369.769231,395.0,0.122649,0.122037,0.122271,0.119326,0.119264,0.12321,0.120063,0.11445,0.119755,0.123986,0.123573,0.122475,0.119789,0.11393,0.12745,7.8e-05,0.27941,0.28093,0.283455,0.278509,229.269977,166.859454,167.989886,165.901639,171.018315,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,538.2,536.0,547.98913,461.333333,523.6,509.0,503.5,520.458333,0.163369,0.164609,0.164171,0.161435,0.162667,0.16684,0.163553,0.15792,0.164695,0.167737,0.168272,0.197021,0.187385,0.182768,0.196642,9.6e-05,0.359181,0.36024,0.365671,0.359104,374.495025,369.804394,344.350168,339.846932,360.84146,5.086512,2.498133,3.588686,2.166306,2.097971,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1100.0,1126.0,1119.0,1184.0,1125.0,1176.0,1227.0,1116.0,0.575946,0.587896,0.667539,0.688073,0.646651,0.90737,1.054639,1.131395,0.970282,1.072964,0.97767,0.912162,0.95791,0.950119,1.01531,0.000352,1.111846,1.121495,1.282895,1.428571,2073.971675,1553.005464,1733.408578,2013.219061,1764.290351,870.20202,847.880914,627.365702,807.933194,1102.591861,1.0,1.0


In [None]:
# Concise overview of the table: number of rows and columns, dtype breakdown and memory usage
feature_regular_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36348 entries, 0 to 36347
Columns: 458 entries, rad_hero_1 to win_label
dtypes: float64(161), int64(297)
memory usage: 127.0 MB


### Model building, training and evaluation

In [None]:
# Import xgboost libraries
import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
# Split data into features (X) and label (y).
# The label is selected by name rather than by position, so the split stays
# correct even if the column order of the feature file changes.
X = feature_regular_df.drop(columns=['win_label'])
y = feature_regular_df['win_label']

In [None]:
# Name of the label column, and the names of all remaining (feature) columns
target = 'win_label'
features = [column for column in feature_regular_df.columns if column != target]

In [None]:
# 10-fold cross-validation with shuffling.
# A fixed random_state makes the fold assignment, and therefore the reported
# AUC, reproducible across kernel restarts.
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# Booster parameters passed to xgb.train
param = {
    'objective': 'binary:logistic',  # binary classification, predictions are probabilities
    'eval_metric': 'auc',
    # The step size is set through a single key: 'eta' and 'learning_rate' are
    # aliases for the same XGBoost parameter, so giving both with different
    # values makes the effective value ambiguous.
    # NOTE(review): confirm that 0.1 (rather than 0.2) is the intended value.
    'learning_rate': 0.1,
    'colsample_bytree': 0.3,  # fraction of columns sampled when building each tree
    'max_depth': 10,
    'alpha': 10,  # L1 regularization term on the weights
}

# Number of boosting rounds
num_round = 100

In [None]:
# NOTE: the training process might take a while to execute

# Per-fold ROC AUC scores. The list is deliberately not called `auc`, so the
# `auc` function imported from sklearn.metrics is not shadowed.
auc_scores = []

for train_idx, test_idx in kfolds.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Work on a per-fold copy so the shared `param` dict is never mutated.
    # scale_pos_weight is the ratio of negative to positive labels in this
    # fold's training split.
    fold_param = dict(param)
    fold_param['scale_pos_weight'] = (y_train.size - y_train.sum()) / y_train.sum()

    xg_train = xgb.DMatrix(
        X_train.values, feature_names=features, label=y_train.values
    )
    xg_test = xgb.DMatrix(
        X_test.values, feature_names=features, label=y_test.values
    )

    # Both sets are evaluated during training, but the per-round log is silenced
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    bst = xgb.train(fold_param, xg_train, num_round, watchlist, verbose_eval=False)
    preds = bst.predict(xg_test)

    auc_scores.append(roc_auc_score(y_test, preds))


# Report the median AUC over all folds (displayed as the cell output)
'Median AUC: {:.04f}'.format(st.median(auc_scores))