## Gradient Boosting Machines (XGBoost)

Notebook that applies the XGBoost algorithm to predict victory in Dota 2 matches

-------------------------------------------------------------------------------------------------------------------------------

## Time blowout matches

Useful pandas calls for exploring the data, followed by the preprocessing steps to consider before feeding the data into the algorithm:

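* df.columns : to see the names of the columns (i.e., features)
* df.dtypes : to see the type of each column
* df.head() : to preview the first rows
* df.info() : to see the non-null count and type of each column
* df.describe() : to see summary statistics of the numeric columns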

Preprocessing steps:

* Try two methods for handling missing data: 'automatic xgboost handling' and 'imputing'

* Do we need to check for correlation between features? NO (for xgboost)

* Do we need to perform feature scaling? NO (for xgboost). If it were needed: `scaler = MinMaxScaler(feature_range=(0, 1)); X = scaler.fit_transform(X)`

In [45]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve
import statistics as st

In [36]:
# Directory for the time blowout group.
# The data folder is resolved relative to the parent of the current working
# directory.
cwd = os.getcwd()
root_directory = os.path.dirname(cwd)
print(root_directory)
# Build the path with os.path.join so the separators match the host OS
# (plain "/" concatenation yields mixed "\" and "/" paths on Windows).
# The trailing "" keeps a trailing separator, because the CSV file name is
# appended to this string directly when the data is loaded.
time_blowout_data_dir = os.path.join(
    root_directory, "model_features_pre-match", "time_blowout", ""
)
print(time_blowout_data_dir)

C:\Users\markos-ece\Desktop\Viggiato\PhD - UofA\Research\2-Dota2\git-repo-code\data-analysis\prediction-models
C:\Users\markos-ece\Desktop\Viggiato\PhD - UofA\Research\2-Dota2\git-repo-code\data-analysis\prediction-models/model_features_pre-match/time_blowout/


### Exploration and preprocessing of the data

In [4]:
# Load the time-blowout feature table from the data directory.
features_csv = time_blowout_data_dir + "dota2_time_blowout_features-used_features.csv"
feature_time_blowout_df = pd.read_csv(features_csv)

In [None]:
# Show the column names of the freshly loaded frame (before any column is dropped)
feature_time_blowout_df.columns

In [None]:
# Drop the match id column
feature_time_blowout_df = feature_time_blowout_df.drop(columns=['match_id'])

In [None]:
# Show the dtype of every remaining column
feature_time_blowout_df.dtypes

In [None]:
# Show the column names again, now that 'match_id' has been dropped
feature_time_blowout_df.columns

In [8]:
# Preview the first rows of the feature table
feature_time_blowout_df.head()


Unnamed: 0,role_carry_r,role_support_r,role_nuker_r,role_disabler_r,role_jungler_r,role_durable_r,role_escape_r,role_pusher_r,role_initiator_r,role_carry_d,...,winR_hp_md_d,xpm_hp_md_d,goldm_hp_md_d,deathsm_hp_md_d,killsm_hp_md_d,assistsm_hp_md_d,damagem_hp_md_d,healingm_hp_md_d,rad_first_pick,win_label
0,1,1,1,1,0,1,1,0,1,1,...,0.0,462.0,393.0,0.12605,0.084034,0.105042,0.0,0.0,1.0,1
1,0,1,1,1,1,1,1,1,1,1,...,0.333333,281.666667,264.833333,0.138787,0.046137,0.230105,0.0,0.0,1.0,0
2,1,1,1,1,1,1,1,1,1,1,...,0.666667,452.666667,408.666667,0.115375,0.228942,0.260469,0.0,0.0,0.0,1
3,1,1,1,1,1,1,1,1,1,1,...,1.0,365.666667,276.833333,0.131327,0.094068,0.211092,0.0,0.0,1.0,1
4,1,1,1,1,0,1,1,1,1,1,...,0.746667,433.853333,388.16,0.091623,0.193855,0.282416,0.0,0.0,0.0,0


In [None]:
# Summary statistics for the columns
feature_time_blowout_df.describe()

In [10]:
# Column count, non-null counts and dtypes
feature_time_blowout_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5528 entries, 0 to 5527
Data columns (total 58 columns):
role_carry_r        5528 non-null int64
role_support_r      5528 non-null int64
role_nuker_r        5528 non-null int64
role_disabler_r     5528 non-null int64
role_jungler_r      5528 non-null int64
role_durable_r      5528 non-null int64
role_escape_r       5528 non-null int64
role_pusher_r       5528 non-null int64
role_initiator_r    5528 non-null int64
role_carry_d        5528 non-null int64
role_support_d      5528 non-null int64
role_nuker_d        5528 non-null int64
role_disabler_d     5528 non-null int64
role_jungler_d      5528 non-null int64
role_durable_d      5528 non-null int64
role_escape_d       5528 non-null int64
role_pusher_d       5528 non-null int64
role_initiator_d    5528 non-null int64
bstr_md_r           5528 non-null int64
bagi_md_r           5528 non-null int64
bint_md_r           5528 non-null int64
strg_md_r           5528 non-null float64
agig_md_r  

In [11]:
# Drop the 'deathsm_hp_md_r' feature, then re-check the remaining columns
feature_time_blowout_df = feature_time_blowout_df.drop(columns=['deathsm_hp_md_r'])
feature_time_blowout_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5528 entries, 0 to 5527
Data columns (total 57 columns):
role_carry_r        5528 non-null int64
role_support_r      5528 non-null int64
role_nuker_r        5528 non-null int64
role_disabler_r     5528 non-null int64
role_jungler_r      5528 non-null int64
role_durable_r      5528 non-null int64
role_escape_r       5528 non-null int64
role_pusher_r       5528 non-null int64
role_initiator_r    5528 non-null int64
role_carry_d        5528 non-null int64
role_support_d      5528 non-null int64
role_nuker_d        5528 non-null int64
role_disabler_d     5528 non-null int64
role_jungler_d      5528 non-null int64
role_durable_d      5528 non-null int64
role_escape_d       5528 non-null int64
role_pusher_d       5528 non-null int64
role_initiator_d    5528 non-null int64
bstr_md_r           5528 non-null int64
bagi_md_r           5528 non-null int64
bint_md_r           5528 non-null int64
strg_md_r           5528 non-null float64
agig_md_r  

In [None]:
# Count the missing values in each column
print(feature_time_blowout_df.isnull().sum())

In [None]:
# Change type of 'rad_first_pick' from float to int
# feature_time_blowout_df["rad_first_pick"] = feature_time_blowout_df["rad_first_pick"].astype("int")

### Model building, training and evaluation

In [13]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import statistics as st

In [14]:
# Features are every column except the last; the label is the last column.
X = feature_time_blowout_df.iloc[:, :-1]
y = feature_time_blowout_df.iloc[:, -1]

In [None]:
# Preview the feature matrix
X.head()

In [None]:
# (rows, feature columns) of the feature matrix
X.shape

In [None]:
# Preview the label column
y.head()

In [None]:
# Number of labels; should match the row count of X
y.shape

In [15]:
# Wrap the full feature matrix and labels in an XGBoost DMatrix.
# NOTE(review): data_dmatrix is not referenced by any other visible cell — confirm it is still needed.
data_dmatrix = xgb.DMatrix(data=X,label=y)

In [16]:
# Hold out 20% of the rows as a test split; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [21]:
# Hyperparameters for the scikit-learn style XGBoost classifier.
xgb_classifier_params = dict(
    objective='binary:logistic',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=100,
    eval_metric='auc',
)
# Note: despite the "reg" in its name, xg_reg is a classifier.
xg_reg = xgb.XGBClassifier(**xgb_classifier_params)

### k-fold cv - 3 (best)

In [22]:
from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score

# Evaluation procedure: 10-fold CV repeated 3 times, i.e. 30 scores in total
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Score the classifier with ROC AUC on every fold
scores = cross_val_score(
    xg_reg,
    X,
    y,
    scoring='roc_auc',
    cv=cv,
)
# Report how many scores were collected, then their median
print(len(scores))
print('Median ROC AUC: %.5f' % st.median(scores))

30
Median ROC AUC: 0.84990


In [37]:
# Name of the label column; every other column is treated as a feature.
target = 'win_label'
features = [col for col in feature_time_blowout_df.columns if col != target]

In [38]:
# Shuffled 10-fold splitter used by the manual xgb.train evaluation loop.
# random_state is fixed so the fold assignment (and therefore the reported
# AUC) is reproducible across runs, like the other seeded splits in this
# notebook.
kfolds = KFold(n_splits=10, shuffle=True, random_state=1)

In [43]:
# Hyperparameters for the native xgb.train API.
# 'eta' and 'learning_rate' are two names for the same XGBoost parameter, so
# only one is set here; the previous dict set both (0.2 and 0.1), which left
# the effective step size ambiguous. 0.1 matches the XGBClassifier settings
# used for the cross_val_score evaluation.
param = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 10,
    'alpha': 10,
}

# Number of boosting rounds passed to xgb.train
num_round = 100
# Probability cut-off used to turn predictions into 0/1 classes
thres = 0.5

In [50]:
# Manual k-fold evaluation with the native xgb.train API.
# For each fold produced by `kfolds`: train on the training split, predict on
# the held-out split, then record the confusion matrix (at threshold `thres`)
# and the ROC AUC.
fold_cnf = []
# Named auc_scores (not `auc`) so that the `auc` function imported from
# sklearn.metrics is not overwritten by a list.
auc_scores = []

for train_idx, test_idx in kfolds.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Work on a per-fold copy so the shared `param` dict is not mutated.
    # scale_pos_weight = (#negatives / #positives) in the training split
    # (assumes the labels are 0/1).
    fold_param = dict(param)
    fold_param['scale_pos_weight'] = (y_train.size - y_train.sum()) / y_train.sum()

    xg_train = xgb.DMatrix(
        X_train.values, feature_names=features, label=y_train.values
    )
    xg_test = xgb.DMatrix(
        X_test.values, feature_names=features, label=y_test.values
    )

    # Both splits are passed as evaluation sets; verbose_eval=False keeps
    # the per-round metrics from being printed.
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    bst = xgb.train(fold_param, xg_train, num_round, watchlist, verbose_eval=False)
    preds = bst.predict(xg_test)

    fold_cnf.append(confusion_matrix(y_test, (preds > thres).astype(int)))
    auc_scores.append(roc_auc_score(y_test, preds))

# Element-wise sum of the per-fold confusion matrices
cnf = sum(fold_cnf)

'Median AUC: {:.04f}'.format(st.median(auc_scores))

# mean_auc = sum(auc_scores) / len(auc_scores)
# 'Average AUC: {:.04f}'.format(mean_auc)

'Median AUC: 0.8343'