## **SHAP Values Implementation**

## Explanation of XGBoost prediction models for Dota 2 team victory by means of feature importance

---

#### If you are running this code on Google Colab, you need to first upload the following feature files to run all the notebook sections below: 

*   *dota2_regular_features.csv*
*   *dota2_score_blowout_features.csv*
*   *dota2_time_blowout_features.csv*

  
  

###  **REGULAR MATCHES**

In [None]:
# Install shap for python
!pip install shap

In [None]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier

from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, auc

import statistics as st

import warnings
warnings.filterwarnings('ignore')
from datetime import datetime

import matplotlib.pylab as pl
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import shap

In [None]:
# Seed NumPy's global random number generator for reproducibility purposes
np.random.seed(3)

In [None]:
# Load SHAP's JavaScript visualization code into the notebook
shap.initjs()

In [None]:
# NOTE: uncomment this cell if you are running this code on a local machine. Please adjust the following variables to correctly point to the feature file location on your machine

# # Set directory for the regular match group
# cwd = os.getcwd()
# root_directory = os.path.dirname(cwd)

# regular_data_dir = root_directory + "\\model_features_pre-match\\regular\\"
# path_to_features = regular_data_dir + "dota2_regular_features.csv"

In [None]:
# NOTE: use this cell if you are running this code on Google Colab

# Path to the feature file of the regular match group.
# Make sure the feature file is uploaded to this Colab session
path_to_features = "/content/dota2_regular_features.csv"

In [None]:
# Read the model feature file of the regular match group into a DataFrame
feature_regular_df = pd.read_csv(path_to_features)

#### Data exploration

In [None]:
# Number of columns in the feature file
feature_regular_df.shape[1]

In [None]:
# Drop the match id column: it identifies a match and is not a model feature.
# errors='ignore' keeps this cell idempotent, so re-running it after the column
# has already been removed is a no-op instead of raising a KeyError.
feature_regular_df = feature_regular_df.drop(columns=['match_id'], errors='ignore')

In [None]:
# Preview the first rows of the feature table
feature_regular_df.head()

In [None]:
# Count the matches per value of the target label
feature_regular_df['win_label'].value_counts()

### **Model building, training, and evaluation**

In [None]:
# Separate the predictors from the target.
# X and y are selected by column NAME so that the columns of X always match
# `features` (which is later passed to xgboost as the feature names), instead of
# silently relying on 'win_label' being the last column of the file.
target = 'win_label'
features = [c for c in feature_regular_df.columns if c != target]
X, y = feature_regular_df[features], feature_regular_df[target]

#### Randomized search to tune hyperparameters

NOTE: This step takes a while to run. If you want to skip the hyperparameter tuning, you can use the latest best parameters found (listed at the beginning of the next section).

In [None]:
# Define a timer function
def timer(start_time=None):
    """Start a stopwatch or report the time elapsed since it was started.

    Called without a start mark (or with a falsy one), returns the current
    ``datetime`` to be used as the start mark. Called with a start mark,
    prints the elapsed time in hours, minutes and seconds and returns None.
    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        return None
    return datetime.now()

In [None]:
# Hyperparameter search space for XGBoost
params_search = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.3, 0.5, 0.7],
    max_depth=[4, 6, 8, 10],
    n_estimators=[10, 50, 100],
)

In [None]:
# Base estimator for the hyperparameter search.
# `verbosity` and `n_jobs` replace the deprecated `silent` and `nthread` arguments;
# verbosity=1 prints warnings only.
xgb = XGBClassifier(objective='binary:logistic', verbosity=1, n_jobs=4)

In [None]:
# Hold out 20% of the matches, keeping the label distribution equal in both parts
train, test, train_labels, test_labels = train_test_split(X, y, test_size=0.2, stratify=y)

In [None]:
# NOTE: the hyperparameter tuning (this cell) might take a while to execute

folds = 5
param_comb = 100

# Stratified k-fold keeps the class distribution of the original data in each fold
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)

# Randomized search over `params_search`, scored by ROC AUC
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=params_search,
    n_iter=param_comb,
    scoring='roc_auc',
    n_jobs=4,
    cv=skf.split(train, train_labels),
    verbose=True,
)

# Run the search; the first timer() call returns the start mark,
# the second one prints the time elapsed since that mark
start_time = timer(None)
random_search.fit(train, train_labels)
timer(start_time)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: ignored

In [None]:
# Report the best hyperparameter combination found by the randomized search
print('\n Best hyperparameters:')
print(random_search.best_params_)


 Best hyperparameters:


NameError: ignored

#### Use best hyperparameters to build and train the model

**Best hyperparameters regular:** {'subsample': 0.8, 'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1, 'gamma': 0.5, 'colsample_bytree': 0.5}

In [None]:
# Define the splitter for the 10-fold cross-validation (rows are shuffled before splitting)
kfolds = KFold(n_splits=10, shuffle=True)

In [None]:
# Booster parameters for the regular match model
param = dict(
    objective='binary:logistic',
    eval_metric='auc',
    subsample=0.8,
    colsample_bytree=0.5,
    learning_rate=0.1,
    max_depth=4,
    gamma=0.5,
)

# Number of boosting rounds; corresponds to 'n_estimators' (number of trees)
num_round = 100

In [None]:
# NOTE: the training process (this cell) might take a while to execute

# Import the package under its full name: `import xgboost as xgb` would rebind `xgb`,
# which already holds the XGBClassifier used by the hyperparameter search, and
# re-running the search cell afterwards would then fail.
import xgboost

# ROC AUC of each cross-validation fold.
# NOTE(review): this list shadows `auc` imported from sklearn.metrics; the name is
# kept because other cells may read it.
auc = list()
for train_idx, test_idx in kfolds.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Weight the positive class by the negative/positive ratio of this training fold
    param['scale_pos_weight'] = (y_train.size - y_train.sum()) / y_train.sum()

    xg_train = xgboost.DMatrix(
        X_train.values, feature_names=features, label=y_train.values
    )
    xg_test = xgboost.DMatrix(
        X_test.values, feature_names=features, label=y_test.values
    )

    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    bst = xgboost.train(param, xg_train, num_round, watchlist, verbose_eval=False)
    preds = bst.predict(xg_test)

    auc.append(roc_auc_score(y_test, preds))

# `bst`, `X_test` and `y_test` of the LAST fold stay defined after the loop
'Median AUC: {:.04f}'.format(st.median(auc))


### **SHAP values - Regular**

In [None]:
# Compute the SHAP values of the model (`bst`) for the test rows (`X_test`) that are
# left over from the last cross-validation fold of the training loop.
# Use only the last x_test as there is little variance between the folds
explainer = shap.TreeExplainer(bst)
shap_values = explainer.shap_values(X_test)

In [None]:
# np.abs(shap_values).mean(0) summarizes the global importance of each feature as the mean
# of the absolute change in the prediction caused by that feature over all explained rows.

np.abs(shap_values).mean(0)

In [None]:
# Check the shape of the shap_values matrix (the aggregation below indexes it as [row, column])
shap_values.shape

In [None]:
# Keep a backup in case it is necessary.
# .copy() makes it independent: a plain assignment would only alias the same
# array, so any in-place change to `shap_values` would also change the backup.
shap_values_backup = shap_values.copy()

In [None]:
# Names of the aggregated features, built from their naming pattern:
# every team-level feature exists once for Radiant ('_r') and once for Dire ('_d').
_role_names = ['carry', 'support', 'nuker', 'disabler', 'jungler',
               'durable', 'escape', 'pusher', 'initiator']
_hero_stat_names = ['strength', 'agility', 'intellig', 'strength_gain', 'agility_gain',
                    'intellig_gain', 'health', 'health_regeneration', 'move_speed']
_hero_player_stat_names = ['winrate', 'xp_min', 'gold_min', 'death_min', 'taken_damage_min',
                           'kill_min', 'assist_min', 'caused_damage_min', 'heal_min']

new_columns = ['heroes_r', 'heroes_d']
for _side in ('r', 'd'):
    new_columns += ['role_%s_%s' % (_name, _side) for _name in _role_names]
for _side in ('r', 'd'):
    new_columns += ['%s_%s' % (_name, _side) for _name in _hero_stat_names]
new_columns += ['hero_winrate_r', 'hero_winrate_d', 'player_winrate_r', 'player_winrate_d']
for _side in ('r', 'd'):
    new_columns += ['hp_%s_%s' % (_name, _side) for _name in _hero_player_stat_names]
new_columns.append('first_pick')

len(new_columns)

In [None]:
# Create new shap values matrix
shap_values_new = pd.DataFrame(columns=new_columns)
print(shap_values_new.shape)

for row in range(len(shap_values)):

    # hero binary array
    sum_hero_rad = 0
    for col in range(119):
      sum_hero_rad = sum_hero_rad + shap_values[row, col]
    
    sum_hero_dire = 0
    for col in range(119,238):
      sum_hero_dire = sum_hero_dire + shap_values[row, col]

    # base and gain stats
    
    # RADIANT
    index_base_str_rad = [18+238, 27+238, 36+238, 45+238, 54+238]
    index_base_agi_rad = [x + 1 for x in index_base_str_rad] 
    index_base_int_rad = [x + 1 for x in index_base_agi_rad] 
    index_str_gain_rad = [x + 1 for x in index_base_int_rad] 
    index_agi_gain_rad = [x + 1 for x in index_str_gain_rad] 
    index_int_gain_rad = [x + 1 for x in index_agi_gain_rad]
    index_base_health_rad = [x + 1 for x in index_int_gain_rad]
    index_base_health_reg_rad = [x + 1 for x in index_base_health_rad]
    index_move_speed_rad = [x + 1 for x in index_base_health_reg_rad]

    sum_base_str_rad = 0
    for col in index_base_str_rad:
      sum_base_str_rad = sum_base_str_rad + shap_values[row, col] 

    sum_base_agi_rad = 0
    for col in index_base_agi_rad:
      sum_base_agi_rad = sum_base_agi_rad + shap_values[row, col]

    sum_base_int_rad = 0
    for col in index_base_int_rad:
      sum_base_int_rad = sum_base_int_rad + shap_values[row, col]

    sum_str_gain_rad = 0
    for col in index_str_gain_rad:
      sum_str_gain_rad = sum_str_gain_rad + shap_values[row, col]

    sum_agi_gain_rad = 0
    for col in index_agi_gain_rad:
      sum_agi_gain_rad = sum_agi_gain_rad + shap_values[row, col]

    sum_int_gain_rad = 0
    for col in index_int_gain_rad:
      sum_int_gain_rad = sum_int_gain_rad + shap_values[row, col]

    sum_base_health_rad = 0
    for col in index_base_health_rad:
      sum_base_health_rad = sum_base_health_rad + shap_values[row, col]

    sum_base_health_reg_rad = 0
    for col in index_base_health_reg_rad:
      sum_base_health_reg_rad = sum_base_health_reg_rad + shap_values[row, col]

    sum_move_speed_rad = 0
    for col in index_move_speed_rad:
      sum_move_speed_rad = sum_move_speed_rad + shap_values[row, col]  

    # DIRE
    index_base_str_dire = [63+238, 72+238, 81+238, 90+238, 99+238]
    index_base_agi_dire = [x + 1 for x in index_base_str_dire] 
    index_base_int_dire = [x + 1 for x in index_base_agi_dire] 
    index_str_gain_dire = [x + 1 for x in index_base_int_dire] 
    index_agi_gain_dire = [x + 1 for x in index_str_gain_dire] 
    index_int_gain_dire = [x + 1 for x in index_agi_gain_dire]
    index_base_health_dire = [x + 1 for x in index_int_gain_dire]
    index_base_health_reg_dire = [x + 1 for x in index_base_health_dire]
    index_move_speed_dire = [x + 1 for x in index_base_health_reg_dire]

    sum_base_str_dire = 0
    for col in index_base_str_dire:
      sum_base_str_dire = sum_base_str_dire + shap_values[row, col] 

    sum_base_agi_dire = 0
    for col in index_base_agi_dire:
      sum_base_agi_dire = sum_base_agi_dire + shap_values[row, col]

    sum_base_int_dire = 0
    for col in index_base_int_dire:
      sum_base_int_dire = sum_base_int_dire + shap_values[row, col]

    sum_str_gain_dire = 0
    for col in index_str_gain_dire:
      sum_str_gain_dire = sum_str_gain_dire + shap_values[row, col]

    sum_agi_gain_dire = 0
    for col in index_agi_gain_dire:
      sum_agi_gain_dire = sum_agi_gain_dire + shap_values[row, col]

    sum_int_gain_dire = 0
    for col in index_int_gain_dire:
      sum_int_gain_dire = sum_int_gain_dire + shap_values[row, col]

    sum_base_health_dire = 0
    for col in index_base_health_dire:
      sum_base_health_dire = sum_base_health_dire + shap_values[row, col]

    sum_base_health_reg_dire = 0
    for col in index_base_health_reg_dire:
      sum_base_health_reg_dire = sum_base_health_reg_dire + shap_values[row, col]

    sum_move_speed_dire = 0
    for col in index_move_speed_dire:
      sum_move_speed_dire = sum_move_speed_dire + shap_values[row, col]


    # Win rate historical stats for heroes and players
    
    index_winrate_rad = list(range(108+238,113+238))
    sum_winrate_rad = 0
    for col in index_winrate_rad:
      sum_winrate_rad = sum_winrate_rad + shap_values[row, col]

    index_winrate_dire = list(range(113+238,118+238))
    sum_winrate_dire = 0
    for col in index_winrate_dire:
      sum_winrate_dire = sum_winrate_dire + shap_values[row, col]

    index_winrate_player_rad = list(range(118+238,123+238))
    winrate_player_rad = 0
    for col in index_winrate_player_rad:
      winrate_player_rad = winrate_player_rad + shap_values[row, col]

    index_winrate_player_dire = list(range(123+238,128+238))
    winrate_player_dire = 0
    for col in index_winrate_player_dire:
      winrate_player_dire = winrate_player_dire + shap_values[row, col]

    # Win rate historical stats for hero-player tuple
    
    # RADIANT
    index_winrate_hp_rad = list(range(128+238,133+238))
    winrate_hp_rad = 0
    for col in index_winrate_hp_rad:
      winrate_hp_rad = winrate_hp_rad + shap_values[row, col]

    index_xpm_hp_rad = list(range(133+238,138+238))
    xpm_hp_rad = 0
    for col in index_xpm_hp_rad:
      xpm_hp_rad = xpm_hp_rad + shap_values[row, col]

    index_goldm_hp_rad = list(range(138+238,143+238))
    goldm_hp_rad = 0
    for col in index_goldm_hp_rad:
      goldm_hp_rad = goldm_hp_rad + shap_values[row, col]

    index_deathsm_hp_rad = list(range(143+238,148+238))
    deathsm_hp_rad = 0
    for col in index_deathsm_hp_rad:
      deathsm_hp_rad = deathsm_hp_rad + shap_values[row, col]

    index_damagem_hp_rad = list(range(148+238,153+238))
    damagem_hp_rad = 0
    for col in index_damagem_hp_rad:
      damagem_hp_rad = damagem_hp_rad + shap_values[row, col]

    index_killm_hp_rad = list(range(153+238,158+238))
    killm_hp_rad = 0
    for col in index_killm_hp_rad:
      killm_hp_rad = killm_hp_rad + shap_values[row, col]

    index_assistm_hp_rad = list(range(158+238,163+238))
    assistm_hp_rad = 0
    for col in index_assistm_hp_rad:
      assistm_hp_rad = assistm_hp_rad + shap_values[row, col]

    index_hero_damagem_hp_rad = list(range(163+238,168+238))
    herodamagem_hp_rad = 0
    for col in index_hero_damagem_hp_rad:
      herodamagem_hp_rad = herodamagem_hp_rad + shap_values[row, col]

    index_healm_hp_rad = list(range(168+238,173+238))
    healm_hp_rad = 0
    for col in index_healm_hp_rad:
      healm_hp_rad = healm_hp_rad + shap_values[row, col]


    # DIRE
    index_winrate_hp_dire = list(range(173+238,178+238))
    winrate_hp_dire = 0
    for col in index_winrate_hp_dire:
      winrate_hp_dire = winrate_hp_dire + shap_values[row, col]

    index_xpm_hp_dire = list(range(178+238,183+238))
    xpm_hp_dire = 0
    for col in index_xpm_hp_dire:
      xpm_hp_dire = xpm_hp_dire + shap_values[row, col]

    index_goldm_hp_dire = list(range(183+238,188+238))
    goldm_hp_dire = 0
    for col in index_goldm_hp_dire:
      goldm_hp_dire = goldm_hp_dire + shap_values[row, col]

    index_deathsm_hp_dire = list(range(188+238,193+238))
    deathsm_hp_dire = 0
    for col in index_deathsm_hp_rad:
      deathsm_hp_dire = deathsm_hp_dire + shap_values[row, col]

    index_damagem_hp_dire = list(range(193+238,198+238))
    damagem_hp_dire = 0
    for col in index_damagem_hp_rad:
      damagem_hp_dire = damagem_hp_dire + shap_values[row, col]

    index_killm_hp_dire = list(range(198+238,203+238))
    killm_hp_dire = 0
    for col in index_killm_hp_dire:
      killm_hp_dire = killm_hp_dire + shap_values[row, col]

    index_assistm_hp_dire = list(range(203+238,208+238))
    assistm_hp_dire = 0
    for col in index_assistm_hp_dire:
      assistm_hp_dire = assistm_hp_dire + shap_values[row, col]

    index_damagem_hp_dire = list(range(208+238,213+238))
    herodamagem_hp_dire = 0
    for col in index_damagem_hp_dire:
      herodamagem_hp_dire = herodamagem_hp_dire + shap_values[row, col]

    index_healm_hp_dire = list(range(213+238,218+238))
    healm_hp_dire = 0
    for col in index_healm_hp_dire:
      healm_hp_dire = healm_hp_dire + shap_values[row, col]
  
    new_row = [sum_hero_rad] + [sum_hero_dire] + list(shap_values[row, 238:256]) + [sum_base_str_rad] + [sum_base_agi_rad] + [sum_base_int_rad] + [sum_str_gain_rad] + [sum_agi_gain_rad] + [sum_int_gain_rad] + [sum_base_health_rad] + [sum_base_health_reg_rad] + [sum_move_speed_rad] + [sum_base_str_dire] + [sum_base_agi_dire] + [sum_base_int_dire] + [sum_str_gain_dire] + [sum_agi_gain_dire] + [sum_int_gain_dire] + [sum_base_health_dire] + [sum_base_health_reg_dire] + [sum_move_speed_dire] + [sum_winrate_rad] + [sum_winrate_dire] + [winrate_player_rad] + [winrate_player_dire] + [winrate_hp_rad] + [xpm_hp_rad] + [goldm_hp_rad] + [deathsm_hp_rad] + [damagem_hp_rad] + [killm_hp_rad] + [assistm_hp_rad] + [herodamagem_hp_rad] + [healm_hp_rad] + [winrate_hp_dire] + [xpm_hp_dire] + [goldm_hp_dire] + [deathsm_hp_dire] + [damagem_hp_dire] + [killm_hp_dire] + [assistm_hp_dire] + [herodamagem_hp_dire] + [healm_hp_dire] + [shap_values[row,198]]

    shap_values_new.loc[row] = new_row

In [None]:
# Check the shape of the new shap values matrix
shap_values_new.shape

In [None]:
#Convert to numpy
shap_values_new = shap_values_new.to_numpy()

In [None]:
# Modify X_test accordingly

X_test_new = pd.DataFrame(columns=new_columns)
print(X_test_new.shape)
for row in range(len(X_test)):

    # hero binary array
    sum_hero_rad = 0
    for col in range(119):
      sum_hero_rad = sum_hero_rad + X_test.iloc[row, col]
    sum_hero_dire = 0
    for col in range(119,238):
      sum_hero_dire = sum_hero_dire + X_test.iloc[row, col]


    # base and gain stats
    
    # RADIANT
    index_base_str_rad = [18+238, 27+238, 36+238, 45+238, 54+238]
    index_base_agi_rad = [x + 1 for x in index_base_str_rad] 
    index_base_int_rad = [x + 1 for x in index_base_agi_rad] 
    index_str_gain_rad = [x + 1 for x in index_base_int_rad] 
    index_agi_gain_rad = [x + 1 for x in index_str_gain_rad] 
    index_int_gain_rad = [x + 1 for x in index_agi_gain_rad]
    index_base_health_rad = [x + 1 for x in index_int_gain_rad]
    index_base_health_reg_rad = [x + 1 for x in index_base_health_rad]
    index_move_speed_rad = [x + 1 for x in index_base_health_reg_rad]

    sum_base_str_rad = 0
    for col in index_base_str_rad:
      sum_base_str_rad = sum_base_str_rad + X_test.iloc[row, col] 

    sum_base_agi_rad = 0
    for col in index_base_agi_rad:
      sum_base_agi_rad = sum_base_agi_rad + X_test.iloc[row, col]

    sum_base_int_rad = 0
    for col in index_base_int_rad:
      sum_base_int_rad = sum_base_int_rad + X_test.iloc[row, col]

    sum_str_gain_rad = 0
    for col in index_str_gain_rad:
      sum_str_gain_rad = sum_str_gain_rad + X_test.iloc[row, col]

    sum_agi_gain_rad = 0
    for col in index_agi_gain_rad:
      sum_agi_gain_rad = sum_agi_gain_rad + X_test.iloc[row, col]

    sum_int_gain_rad = 0
    for col in index_int_gain_rad:
      sum_int_gain_rad = sum_int_gain_rad + X_test.iloc[row, col]

    sum_base_health_rad = 0
    for col in index_base_health_rad:
      sum_base_health_rad = sum_base_health_rad + X_test.iloc[row, col]

    sum_base_health_reg_rad = 0
    for col in index_base_health_reg_rad:
      sum_base_health_reg_rad = sum_base_health_reg_rad + X_test.iloc[row, col]

    sum_move_speed_rad = 0
    for col in index_move_speed_rad:
      sum_move_speed_rad = sum_move_speed_rad + X_test.iloc[row, col]  

    # DIRE
    index_base_str_dire = [63+238, 72+238, 81+238, 90+238, 99+238]
    index_base_agi_dire = [x + 1 for x in index_base_str_dire] 
    index_base_int_dire = [x + 1 for x in index_base_agi_dire] 
    index_str_gain_dire = [x + 1 for x in index_base_int_dire] 
    index_agi_gain_dire = [x + 1 for x in index_str_gain_dire] 
    index_int_gain_dire = [x + 1 for x in index_agi_gain_dire]
    index_base_health_dire = [x + 1 for x in index_int_gain_dire]
    index_base_health_reg_dire = [x + 1 for x in index_base_health_dire]
    index_move_speed_dire = [x + 1 for x in index_base_health_reg_dire]

    sum_base_str_dire = 0
    for col in index_base_str_dire:
      sum_base_str_dire = sum_base_str_dire + X_test.iloc[row, col] 

    sum_base_agi_dire = 0
    for col in index_base_agi_dire:
      sum_base_agi_dire = sum_base_agi_dire + X_test.iloc[row, col]

    sum_base_int_dire = 0
    for col in index_base_int_dire:
      sum_base_int_dire = sum_base_int_dire + X_test.iloc[row, col]

    sum_str_gain_dire = 0
    for col in index_str_gain_dire:
      sum_str_gain_dire = sum_str_gain_dire + X_test.iloc[row, col]

    sum_agi_gain_dire = 0
    for col in index_agi_gain_dire:
      sum_agi_gain_dire = sum_agi_gain_dire + X_test.iloc[row, col]

    sum_int_gain_dire = 0
    for col in index_int_gain_dire:
      sum_int_gain_dire = sum_int_gain_dire + X_test.iloc[row, col]

    sum_base_health_dire = 0
    for col in index_base_health_dire:
      sum_base_health_dire = sum_base_health_dire + X_test.iloc[row, col]

    sum_base_health_reg_dire = 0
    for col in index_base_health_reg_dire:
      sum_base_health_reg_dire = sum_base_health_reg_dire + X_test.iloc[row, col]

    sum_move_speed_dire = 0
    for col in index_move_speed_dire:
      sum_move_speed_dire = sum_move_speed_dire + X_test.iloc[row, col]

    # Win rate historical stats for heroes and players
    
    index_winrate_rad = list(range(108+238,113+238))
    sum_winrate_rad = 0
    for col in index_winrate_rad:
      sum_winrate_rad = sum_winrate_rad + X_test.iloc[row, col]

    index_winrate_dire = list(range(113+238,118+238))
    sum_winrate_dire = 0
    for col in index_winrate_dire:
      sum_winrate_dire = sum_winrate_dire + X_test.iloc[row, col]

    index_winrate_player_rad = list(range(118+238,123+238))
    winrate_player_rad = 0
    for col in index_winrate_player_rad:
      winrate_player_rad = winrate_player_rad + X_test.iloc[row, col]

    index_winrate_player_dire = list(range(123+238,128+238))
    winrate_player_dire = 0
    for col in index_winrate_player_dire:
      winrate_player_dire = winrate_player_dire + X_test.iloc[row, col]


    # Win rate historical stats for hero-player tuple
    
    # RADIANT
    index_winrate_hp_rad = list(range(128+238,133+238))
    winrate_hp_rad = 0
    for col in index_winrate_hp_rad:
      winrate_hp_rad = winrate_hp_rad + X_test.iloc[row, col]

    index_xpm_hp_rad = list(range(133+238,138+238))
    xpm_hp_rad = 0
    for col in index_xpm_hp_rad:
      xpm_hp_rad = xpm_hp_rad + X_test.iloc[row, col]

    index_goldm_hp_rad = list(range(138+238,143+238))
    goldm_hp_rad = 0
    for col in index_goldm_hp_rad:
      goldm_hp_rad = goldm_hp_rad + X_test.iloc[row, col]

    index_deathsm_hp_rad = list(range(143+238,148+238))
    deathsm_hp_rad = 0
    for col in index_deathsm_hp_rad:
      deathsm_hp_rad = deathsm_hp_rad + X_test.iloc[row, col]

    index_damagem_hp_rad = list(range(148+238,153+238))
    damagem_hp_rad = 0
    for col in index_damagem_hp_rad:
      damagem_hp_rad = damagem_hp_rad + X_test.iloc[row, col]

    index_killm_hp_rad = list(range(153+238,158+238))
    killm_hp_rad = 0
    for col in index_killm_hp_rad:
      killm_hp_rad = killm_hp_rad + X_test.iloc[row, col]

    index_assistm_hp_rad = list(range(158+238,163+238))
    assistm_hp_rad = 0
    for col in index_assistm_hp_rad:
      assistm_hp_rad = assistm_hp_rad + X_test.iloc[row, col]

    index_damagem_hp_rad = list(range(163+238,168+238))
    herodamagem_hp_rad = 0
    for col in index_damagem_hp_rad:
      herodamagem_hp_rad = herodamagem_hp_rad + X_test.iloc[row, col]

    index_healm_hp_rad = list(range(168+238,173+238))
    healm_hp_rad = 0
    for col in index_healm_hp_rad:
      healm_hp_rad = healm_hp_rad + X_test.iloc[row, col]

    # DIRE
    index_winrate_hp_dire = list(range(173+238,178+238))
    winrate_hp_dire = 0
    for col in index_winrate_hp_dire:
      winrate_hp_dire = winrate_hp_dire + X_test.iloc[row, col]

    index_xpm_hp_dire = list(range(178+238,183+238))
    xpm_hp_dire = 0
    for col in index_xpm_hp_dire:
      xpm_hp_dire = xpm_hp_dire + X_test.iloc[row, col]

    index_goldm_hp_dire = list(range(183+238,188+238))
    goldm_hp_dire = 0
    for col in index_goldm_hp_dire:
      goldm_hp_dire = goldm_hp_dire + X_test.iloc[row, col]

    index_deathsm_hp_dire = list(range(188+238,193+238))
    deathsm_hp_rad = 0
    for col in index_deathsm_hp_rad:
      deathsm_hp_rad = deathsm_hp_rad + X_test.iloc[row, col]

    index_damagem_hp_dire = list(range(193+238,198+238))
    damagem_hp_rad = 0
    for col in index_damagem_hp_rad:
      damagem_hp_rad = damagem_hp_rad + X_test.iloc[row, col]

    index_killm_hp_dire = list(range(198+238,203+238))
    killm_hp_dire = 0
    for col in index_killm_hp_dire:
      killm_hp_dire = killm_hp_dire + X_test.iloc[row, col]

    index_assistm_hp_dire = list(range(203+238,208+238))
    assistm_hp_dire = 0
    for col in index_assistm_hp_dire:
      assistm_hp_dire = assistm_hp_dire + X_test.iloc[row, col]

    index_damagem_hp_dire = list(range(208+238,213+238))
    herodamagem_hp_dire = 0
    for col in index_damagem_hp_dire:
      herodamagem_hp_dire = herodamagem_hp_dire + X_test.iloc[row, col]

    index_healm_hp_dire = list(range(213+238,218+238))
    healm_hp_dire = 0
    for col in index_healm_hp_dire:
      healm_hp_dire = healm_hp_dire + X_test.iloc[row, col]
  
    new_row = [sum_hero_rad] + [sum_hero_dire] + list(X_test.iloc[row, 238:256]) + [sum_base_str_rad] + [sum_base_agi_rad] + [sum_base_int_rad] + [sum_str_gain_rad] + [sum_agi_gain_rad] + [sum_int_gain_rad] + [sum_base_health_rad] + [sum_base_health_reg_rad] + [sum_move_speed_rad] + [sum_base_str_dire] + [sum_base_agi_dire] + [sum_base_int_dire] + [sum_str_gain_dire] + [sum_agi_gain_dire] + [sum_int_gain_dire] + [sum_base_health_dire] + [sum_base_health_reg_dire] + [sum_move_speed_dire] + [sum_winrate_rad] + [sum_winrate_dire] + [winrate_player_rad] + [winrate_player_dire] + [winrate_hp_rad] + [xpm_hp_rad] + [goldm_hp_rad] + [deathsm_hp_rad] + [damagem_hp_rad] + [killm_hp_rad] + [assistm_hp_rad] + [herodamagem_hp_rad] + [healm_hp_rad] + [winrate_hp_dire] + [xpm_hp_dire] + [goldm_hp_dire] + [deathsm_hp_dire] + [damagem_hp_dire] + [killm_hp_dire] + [assistm_hp_dire] + [herodamagem_hp_dire] + [healm_hp_dire] + [X_test.iloc[row,198]]

    X_test_new.loc[row] = new_row

In [None]:
# Replace missing values by the median of their column
column_medians = X_test_new.median()
X_test_new = X_test_new.fillna(column_medians)
X_test_new.shape

In [None]:
# Generate summary plot with proper plot configurations

# Reset matplotlib to its default settings, then render all text in bold
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams["font.weight"] = "bold"

# show=False: do not display the plot yet, so it can still be adjusted and saved below
# NOTE(review): `fig` is not used afterwards
fig = shap.summary_plot(shap_values_new, X_test_new, show=False)  
plt.xlabel('')  # remove the x-axis label
plt.xticks(fontsize=18)
plt.yticks(fontsize=22)
# Save the current figure as a PDF file in the working directory
plt.savefig('shapvalues_regular_newdata_bold.pdf', bbox_inches = "tight")

In [None]:
# Generate barplot with proper plot configurations

# Reset matplotlib to its default settings, then render all text in bold
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams["font.weight"] = "bold"

# show=False: do not display the plot yet, so it can still be adjusted and saved below
# NOTE(review): `fig` is not used afterwards
fig = shap.summary_plot(shap_values_new, X_test_new, plot_type="bar", show=False) 
# Print the figure size chosen by shap before overriding it
size = plt.gcf().get_size_inches()
print(size) 
plt.gcf().set_size_inches(9, 13)
plt.xlabel('')  # remove the x-axis label
plt.xticks(fontsize=21)
plt.yticks(fontsize=24)
# Save the current figure as a PDF file in the working directory
plt.savefig('shapvalues_regular_newdata_barplot_bold.pdf', bbox_inches = "tight")

In [None]:
# Largest mean absolute SHAP value among the aggregated features
mean_abs_shap = np.abs(shap_values_new).mean(0)
max(mean_abs_shap)

### **TIME BLOWOUT MATCHES**

In [None]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier

from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, auc

import statistics as st

import warnings
warnings.filterwarnings('ignore')
from datetime import datetime

import matplotlib.pylab as pl
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import shap

In [None]:
# Seed NumPy's global random number generator for reproducibility purposes
np.random.seed(3)

In [None]:
# Load SHAP's JavaScript visualization code into the notebook
shap.initjs()

In [None]:
# NOTE: uncomment this cell if you are running this code on a local machine. Please adjust the following variables to correctly point to the feature file location on your machine

# # Set directory for the time blowout match group
# cwd = os.getcwd()
# root_directory = os.path.dirname(cwd)

# time_blowout_data_dir = root_directory + "\\model_features_pre-match\\time_blowout\\"
# path_to_features = time_blowout_data_dir + "dota2_time_blowout_features.csv"

In [None]:
# NOTE: use this cell if you are running this code on Google Colab

# Path to the feature file of the time blowout match group.
# Make sure the feature file is uploaded to this Colab session
path_to_features = "/content/dota2_time_blowout_features.csv"

In [None]:
# Read the model feature file of the time blowout match group into a DataFrame
feature_time_blowout_df = pd.read_csv(path_to_features)

#### Data exploration

In [None]:
# Number of columns in the feature file
feature_time_blowout_df.shape[1]

In [None]:
# Drop the match id column: it identifies a match and is not a model feature.
# errors='ignore' keeps this cell idempotent, so re-running it after the column
# has already been removed is a no-op instead of raising a KeyError.
feature_time_blowout_df = feature_time_blowout_df.drop(columns=['match_id'], errors='ignore')

In [None]:
# Preview the first rows of the feature table
feature_time_blowout_df.head()

In [None]:
# Count the matches per value of the target label
feature_time_blowout_df['win_label'].value_counts()

### **Model building, training, and evaluation**

In [None]:
# Separate the predictors from the target.
# X and y are selected by column NAME so that the columns of X always match
# `features` (which is later passed to xgboost as the feature names), instead of
# silently relying on 'win_label' being the last column of the file.
target = 'win_label'
features = [c for c in feature_time_blowout_df.columns if c != target]
X, y = feature_time_blowout_df[features], feature_time_blowout_df[target]

#### Randomized search to tune hyperparameters

NOTE: This step takes a while to run. If you want to skip the hyperparameter tuning, you can use the latest best parameters found (listed right after the tuning cells below).

In [None]:
# Define a timer function
def timer(start_time=None):
    """Start a stopwatch or report the time elapsed since it was started.

    Called without a start mark (or with a falsy one), returns the current
    ``datetime`` to be used as the start mark. Called with a start mark,
    prints the elapsed time in hours, minutes and seconds and returns None.
    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        return None
    return datetime.now()

In [None]:
# Hyperparameter search space for XGBoost
params_search = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.3, 0.5, 0.7],
    max_depth=[4, 6, 8, 10],
    n_estimators=[10, 50, 100],
)

In [None]:
# Base estimator for the hyperparameter search.
# `verbosity` and `n_jobs` replace the deprecated `silent` and `nthread` arguments;
# verbosity=1 prints warnings only.
xgb = XGBClassifier(objective='binary:logistic', verbosity=1, n_jobs=4)

In [None]:
# Hold out 20% of the matches, keeping the label distribution equal in both parts
train, test, train_labels, test_labels = train_test_split(X, y, test_size=0.2, stratify=y)

In [None]:
# NOTE: the hyperparameter tuning (this cell) might take a while to execute

folds = 5
param_comb = 100

# Stratified k-fold keeps the class distribution of the original data in each fold
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)

# Randomized search over `params_search`, scored by ROC AUC
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=params_search,
    n_iter=param_comb,
    scoring='roc_auc',
    n_jobs=4,
    cv=skf.split(train, train_labels),
    verbose=True,
)

# Run the search; the first timer() call returns the start mark,
# the second one prints the time elapsed since that mark
start_time = timer(None)
random_search.fit(train, train_labels)
timer(start_time)

In [None]:
# Report the best hyperparameter combination found by the randomized search
print('\n Best hyperparameters:')
print(random_search.best_params_)

**Best hyperparameters time blowout:**
{'subsample': 1.0, 'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 1, 'colsample_bytree': 0.3}

#### Use best hyperparameters to build and train the model

In [None]:
# Define the splitter for the 10-fold cross-validation (rows are shuffled before splitting)
kfolds = KFold(n_splits=10, shuffle=True)

In [None]:
# Booster parameters for the time blowout match model
param = dict(
    objective='binary:logistic',
    eval_metric='auc',
    subsample=1.0,
    colsample_bytree=0.3,
    learning_rate=0.05,
    max_depth=4,
    gamma=1,
)

# Number of boosting rounds; corresponds to 'n_estimators' (number of trees)
num_round = 100

In [None]:
# NOTE: the training process (this cell) might take a while to execute

# Import the package under its full name: `import xgboost as xgb` would rebind `xgb`,
# which already holds the XGBClassifier used by the hyperparameter search, and
# re-running the search cell afterwards would then fail.
import xgboost

# ROC AUC of each cross-validation fold.
# NOTE(review): this list shadows `auc` imported from sklearn.metrics; the name is
# kept because other cells may read it.
auc = list()
for train_idx, test_idx in kfolds.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Weight the positive class by the negative/positive ratio of this training fold
    param['scale_pos_weight'] = (y_train.size - y_train.sum()) / y_train.sum()

    xg_train = xgboost.DMatrix(
        X_train.values, feature_names=features, label=y_train.values
    )
    xg_test = xgboost.DMatrix(
        X_test.values, feature_names=features, label=y_test.values
    )

    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    bst = xgboost.train(param, xg_train, num_round, watchlist, verbose_eval=False)
    preds = bst.predict(xg_test)

    auc.append(roc_auc_score(y_test, preds))

# `bst`, `X_test` and `y_test` of the LAST fold stay defined after the loop
'Median AUC: {:.04f}'.format(st.median(auc))

### **SHAP values - Time blowout**

In [None]:
# Compute the SHAP values for the original (ungrouped) feature columns.
# `bst` and `X_test` are the model and the held-out fold left over from the last
# cross-validation iteration; only that fold is explained, as there is little
# variance between the folds.
explainer = shap.TreeExplainer(bst)
shap_values = explainer.shap_values(X_test)

In [None]:
# np.abs(shap_values).mean(0) summarizes the global importance of each feature as the mean absolute change
# in the prediction caused by that feature, averaged over the rows of X_test that were explained.

np.abs(shap_values).mean(0)

In [None]:
# Check the shape of the shap_values matrix (explained rows x original feature columns)
shap_values.shape

In [None]:
# Keep a backup in case it is necessary. A plain assignment would only create a
# second name for the same array, so take an independent copy instead.
shap_values_backup = np.copy(shap_values)

In [None]:
# Names of the aggregated feature groups; the suffix marks the side (_r = Radiant, _d = Dire)
role_names = ['carry', 'support', 'nuker', 'disabler', 'jungler',
              'durable', 'escape', 'pusher', 'initiator']
hero_stat_names = ['strength', 'agility', 'intellig', 'strength_gain', 'agility_gain',
                   'intellig_gain', 'health', 'health_regeneration', 'move_speed']
hero_player_stat_names = ['winrate', 'xp_min', 'gold_min', 'death_min', 'taken_damage_min',
                          'kill_min', 'assist_min', 'caused_damage_min', 'heal_min']

# Within every family all Radiant columns come first, followed by all Dire columns
new_columns = (
    ['heroes_r', 'heroes_d']
    + [f'role_{role}_{side}' for side in 'rd' for role in role_names]
    + [f'{stat}_{side}' for side in 'rd' for stat in hero_stat_names]
    + ['hero_winrate_r', 'hero_winrate_d', 'player_winrate_r', 'player_winrate_d']
    + [f'hp_{stat}_{side}' for side in 'rd' for stat in hero_player_stat_names]
    + ['first_pick']
)

len(new_columns)

In [None]:
# Create new shap values matrix
shap_values_new = pd.DataFrame(columns=new_columns)
print(shap_values_new.shape)

for row in range(len(shap_values)):

    # hero binary array
    sum_hero_rad = 0
    for col in range(119):
      sum_hero_rad = sum_hero_rad + shap_values[row, col]
    
    sum_hero_dire = 0
    for col in range(119,238):
      sum_hero_dire = sum_hero_dire + shap_values[row, col]

    # base and gain stats
    
    # RADIANT
    index_base_str_rad = [18+238, 27+238, 36+238, 45+238, 54+238]
    index_base_agi_rad = [x + 1 for x in index_base_str_rad] 
    index_base_int_rad = [x + 1 for x in index_base_agi_rad] 
    index_str_gain_rad = [x + 1 for x in index_base_int_rad] 
    index_agi_gain_rad = [x + 1 for x in index_str_gain_rad] 
    index_int_gain_rad = [x + 1 for x in index_agi_gain_rad]
    index_base_health_rad = [x + 1 for x in index_int_gain_rad]
    index_base_health_reg_rad = [x + 1 for x in index_base_health_rad]
    index_move_speed_rad = [x + 1 for x in index_base_health_reg_rad]

    sum_base_str_rad = 0
    for col in index_base_str_rad:
      sum_base_str_rad = sum_base_str_rad + shap_values[row, col] 

    sum_base_agi_rad = 0
    for col in index_base_agi_rad:
      sum_base_agi_rad = sum_base_agi_rad + shap_values[row, col]

    sum_base_int_rad = 0
    for col in index_base_int_rad:
      sum_base_int_rad = sum_base_int_rad + shap_values[row, col]

    sum_str_gain_rad = 0
    for col in index_str_gain_rad:
      sum_str_gain_rad = sum_str_gain_rad + shap_values[row, col]

    sum_agi_gain_rad = 0
    for col in index_agi_gain_rad:
      sum_agi_gain_rad = sum_agi_gain_rad + shap_values[row, col]

    sum_int_gain_rad = 0
    for col in index_int_gain_rad:
      sum_int_gain_rad = sum_int_gain_rad + shap_values[row, col]

    sum_base_health_rad = 0
    for col in index_base_health_rad:
      sum_base_health_rad = sum_base_health_rad + shap_values[row, col]

    sum_base_health_reg_rad = 0
    for col in index_base_health_reg_rad:
      sum_base_health_reg_rad = sum_base_health_reg_rad + shap_values[row, col]

    sum_move_speed_rad = 0
    for col in index_move_speed_rad:
      sum_move_speed_rad = sum_move_speed_rad + shap_values[row, col]  

    # DIRE
    index_base_str_dire = [63+238, 72+238, 81+238, 90+238, 99+238]
    index_base_agi_dire = [x + 1 for x in index_base_str_dire] 
    index_base_int_dire = [x + 1 for x in index_base_agi_dire] 
    index_str_gain_dire = [x + 1 for x in index_base_int_dire] 
    index_agi_gain_dire = [x + 1 for x in index_str_gain_dire] 
    index_int_gain_dire = [x + 1 for x in index_agi_gain_dire]
    index_base_health_dire = [x + 1 for x in index_int_gain_dire]
    index_base_health_reg_dire = [x + 1 for x in index_base_health_dire]
    index_move_speed_dire = [x + 1 for x in index_base_health_reg_dire]

    sum_base_str_dire = 0
    for col in index_base_str_dire:
      sum_base_str_dire = sum_base_str_dire + shap_values[row, col] 

    sum_base_agi_dire = 0
    for col in index_base_agi_dire:
      sum_base_agi_dire = sum_base_agi_dire + shap_values[row, col]

    sum_base_int_dire = 0
    for col in index_base_int_dire:
      sum_base_int_dire = sum_base_int_dire + shap_values[row, col]

    sum_str_gain_dire = 0
    for col in index_str_gain_dire:
      sum_str_gain_dire = sum_str_gain_dire + shap_values[row, col]

    sum_agi_gain_dire = 0
    for col in index_agi_gain_dire:
      sum_agi_gain_dire = sum_agi_gain_dire + shap_values[row, col]

    sum_int_gain_dire = 0
    for col in index_int_gain_dire:
      sum_int_gain_dire = sum_int_gain_dire + shap_values[row, col]

    sum_base_health_dire = 0
    for col in index_base_health_dire:
      sum_base_health_dire = sum_base_health_dire + shap_values[row, col]

    sum_base_health_reg_dire = 0
    for col in index_base_health_reg_dire:
      sum_base_health_reg_dire = sum_base_health_reg_dire + shap_values[row, col]

    sum_move_speed_dire = 0
    for col in index_move_speed_dire:
      sum_move_speed_dire = sum_move_speed_dire + shap_values[row, col]


    # Win rate historical stats for heroes and players
    
    index_winrate_rad = list(range(108+238,113+238))
    sum_winrate_rad = 0
    for col in index_winrate_rad:
      sum_winrate_rad = sum_winrate_rad + shap_values[row, col]

    index_winrate_dire = list(range(113+238,118+238))
    sum_winrate_dire = 0
    for col in index_winrate_dire:
      sum_winrate_dire = sum_winrate_dire + shap_values[row, col]

    index_winrate_player_rad = list(range(118+238,123+238))
    winrate_player_rad = 0
    for col in index_winrate_player_rad:
      winrate_player_rad = winrate_player_rad + shap_values[row, col]

    index_winrate_player_dire = list(range(123+238,128+238))
    winrate_player_dire = 0
    for col in index_winrate_player_dire:
      winrate_player_dire = winrate_player_dire + shap_values[row, col]

    # Win rate historical stats for hero-player tuple
    
    # RADIANT
    index_winrate_hp_rad = list(range(128+238,133+238))
    winrate_hp_rad = 0
    for col in index_winrate_hp_rad:
      winrate_hp_rad = winrate_hp_rad + shap_values[row, col]

    index_xpm_hp_rad = list(range(133+238,138+238))
    xpm_hp_rad = 0
    for col in index_xpm_hp_rad:
      xpm_hp_rad = xpm_hp_rad + shap_values[row, col]

    index_goldm_hp_rad = list(range(138+238,143+238))
    goldm_hp_rad = 0
    for col in index_goldm_hp_rad:
      goldm_hp_rad = goldm_hp_rad + shap_values[row, col]

    index_deathsm_hp_rad = list(range(143+238,148+238))
    deathsm_hp_rad = 0
    for col in index_deathsm_hp_rad:
      deathsm_hp_rad = deathsm_hp_rad + shap_values[row, col]

    index_damagem_hp_rad = list(range(148+238,153+238))
    damagem_hp_rad = 0
    for col in index_damagem_hp_rad:
      damagem_hp_rad = damagem_hp_rad + shap_values[row, col]

    index_killm_hp_rad = list(range(153+238,158+238))
    killm_hp_rad = 0
    for col in index_killm_hp_rad:
      killm_hp_rad = killm_hp_rad + shap_values[row, col]

    index_assistm_hp_rad = list(range(158+238,163+238))
    assistm_hp_rad = 0
    for col in index_assistm_hp_rad:
      assistm_hp_rad = assistm_hp_rad + shap_values[row, col]

    index_hero_damagem_hp_rad = list(range(163+238,168+238))
    herodamagem_hp_rad = 0
    for col in index_hero_damagem_hp_rad:
      herodamagem_hp_rad = herodamagem_hp_rad + shap_values[row, col]

    index_healm_hp_rad = list(range(168+238,173+238))
    healm_hp_rad = 0
    for col in index_healm_hp_rad:
      healm_hp_rad = healm_hp_rad + shap_values[row, col]


    # DIRE
    index_winrate_hp_dire = list(range(173+238,178+238))
    winrate_hp_dire = 0
    for col in index_winrate_hp_dire:
      winrate_hp_dire = winrate_hp_dire + shap_values[row, col]

    index_xpm_hp_dire = list(range(178+238,183+238))
    xpm_hp_dire = 0
    for col in index_xpm_hp_dire:
      xpm_hp_dire = xpm_hp_dire + shap_values[row, col]

    index_goldm_hp_dire = list(range(183+238,188+238))
    goldm_hp_dire = 0
    for col in index_goldm_hp_dire:
      goldm_hp_dire = goldm_hp_dire + shap_values[row, col]

    index_deathsm_hp_dire = list(range(188+238,193+238))
    deathsm_hp_dire = 0
    for col in index_deathsm_hp_rad:
      deathsm_hp_dire = deathsm_hp_dire + shap_values[row, col]

    index_damagem_hp_dire = list(range(193+238,198+238))
    damagem_hp_dire = 0
    for col in index_damagem_hp_rad:
      damagem_hp_dire = damagem_hp_dire + shap_values[row, col]

    index_killm_hp_dire = list(range(198+238,203+238))
    killm_hp_dire = 0
    for col in index_killm_hp_dire:
      killm_hp_dire = killm_hp_dire + shap_values[row, col]

    index_assistm_hp_dire = list(range(203+238,208+238))
    assistm_hp_dire = 0
    for col in index_assistm_hp_dire:
      assistm_hp_dire = assistm_hp_dire + shap_values[row, col]

    index_damagem_hp_dire = list(range(208+238,213+238))
    herodamagem_hp_dire = 0
    for col in index_damagem_hp_dire:
      herodamagem_hp_dire = herodamagem_hp_dire + shap_values[row, col]

    index_healm_hp_dire = list(range(213+238,218+238))
    healm_hp_dire = 0
    for col in index_healm_hp_dire:
      healm_hp_dire = healm_hp_dire + shap_values[row, col]
  
    new_row = [sum_hero_rad] + [sum_hero_dire] + list(shap_values[row, 238:256]) + [sum_base_str_rad] + [sum_base_agi_rad] + [sum_base_int_rad] + [sum_str_gain_rad] + [sum_agi_gain_rad] + [sum_int_gain_rad] + [sum_base_health_rad] + [sum_base_health_reg_rad] + [sum_move_speed_rad] + [sum_base_str_dire] + [sum_base_agi_dire] + [sum_base_int_dire] + [sum_str_gain_dire] + [sum_agi_gain_dire] + [sum_int_gain_dire] + [sum_base_health_dire] + [sum_base_health_reg_dire] + [sum_move_speed_dire] + [sum_winrate_rad] + [sum_winrate_dire] + [winrate_player_rad] + [winrate_player_dire] + [winrate_hp_rad] + [xpm_hp_rad] + [goldm_hp_rad] + [deathsm_hp_rad] + [damagem_hp_rad] + [killm_hp_rad] + [assistm_hp_rad] + [herodamagem_hp_rad] + [healm_hp_rad] + [winrate_hp_dire] + [xpm_hp_dire] + [goldm_hp_dire] + [deathsm_hp_dire] + [damagem_hp_dire] + [killm_hp_dire] + [assistm_hp_dire] + [herodamagem_hp_dire] + [healm_hp_dire] + [shap_values[row,198]]

    shap_values_new.loc[row] = new_row

In [None]:
# Check the shape of the new shap values matrix
shap_values_new.shape

In [None]:
#Convert to numpy
shap_values_new = shap_values_new.to_numpy()

In [None]:
# Modify X_test accordingly

X_test_new = pd.DataFrame(columns=new_columns)
print(X_test_new.shape)
for row in range(len(X_test)):

    # hero binary array
    sum_hero_rad = 0
    for col in range(119):
      sum_hero_rad = sum_hero_rad + X_test.iloc[row, col]
    sum_hero_dire = 0
    for col in range(119,238):
      sum_hero_dire = sum_hero_dire + X_test.iloc[row, col]


    # base and gain stats
    
    # RADIANT
    index_base_str_rad = [18+238, 27+238, 36+238, 45+238, 54+238]
    index_base_agi_rad = [x + 1 for x in index_base_str_rad] 
    index_base_int_rad = [x + 1 for x in index_base_agi_rad] 
    index_str_gain_rad = [x + 1 for x in index_base_int_rad] 
    index_agi_gain_rad = [x + 1 for x in index_str_gain_rad] 
    index_int_gain_rad = [x + 1 for x in index_agi_gain_rad]
    index_base_health_rad = [x + 1 for x in index_int_gain_rad]
    index_base_health_reg_rad = [x + 1 for x in index_base_health_rad]
    index_move_speed_rad = [x + 1 for x in index_base_health_reg_rad]

    sum_base_str_rad = 0
    for col in index_base_str_rad:
      sum_base_str_rad = sum_base_str_rad + X_test.iloc[row, col] 

    sum_base_agi_rad = 0
    for col in index_base_agi_rad:
      sum_base_agi_rad = sum_base_agi_rad + X_test.iloc[row, col]

    sum_base_int_rad = 0
    for col in index_base_int_rad:
      sum_base_int_rad = sum_base_int_rad + X_test.iloc[row, col]

    sum_str_gain_rad = 0
    for col in index_str_gain_rad:
      sum_str_gain_rad = sum_str_gain_rad + X_test.iloc[row, col]

    sum_agi_gain_rad = 0
    for col in index_agi_gain_rad:
      sum_agi_gain_rad = sum_agi_gain_rad + X_test.iloc[row, col]

    sum_int_gain_rad = 0
    for col in index_int_gain_rad:
      sum_int_gain_rad = sum_int_gain_rad + X_test.iloc[row, col]

    sum_base_health_rad = 0
    for col in index_base_health_rad:
      sum_base_health_rad = sum_base_health_rad + X_test.iloc[row, col]

    sum_base_health_reg_rad = 0
    for col in index_base_health_reg_rad:
      sum_base_health_reg_rad = sum_base_health_reg_rad + X_test.iloc[row, col]

    sum_move_speed_rad = 0
    for col in index_move_speed_rad:
      sum_move_speed_rad = sum_move_speed_rad + X_test.iloc[row, col]  

    # DIRE
    index_base_str_dire = [63+238, 72+238, 81+238, 90+238, 99+238]
    index_base_agi_dire = [x + 1 for x in index_base_str_dire] 
    index_base_int_dire = [x + 1 for x in index_base_agi_dire] 
    index_str_gain_dire = [x + 1 for x in index_base_int_dire] 
    index_agi_gain_dire = [x + 1 for x in index_str_gain_dire] 
    index_int_gain_dire = [x + 1 for x in index_agi_gain_dire]
    index_base_health_dire = [x + 1 for x in index_int_gain_dire]
    index_base_health_reg_dire = [x + 1 for x in index_base_health_dire]
    index_move_speed_dire = [x + 1 for x in index_base_health_reg_dire]

    sum_base_str_dire = 0
    for col in index_base_str_dire:
      sum_base_str_dire = sum_base_str_dire + X_test.iloc[row, col] 

    sum_base_agi_dire = 0
    for col in index_base_agi_dire:
      sum_base_agi_dire = sum_base_agi_dire + X_test.iloc[row, col]

    sum_base_int_dire = 0
    for col in index_base_int_dire:
      sum_base_int_dire = sum_base_int_dire + X_test.iloc[row, col]

    sum_str_gain_dire = 0
    for col in index_str_gain_dire:
      sum_str_gain_dire = sum_str_gain_dire + X_test.iloc[row, col]

    sum_agi_gain_dire = 0
    for col in index_agi_gain_dire:
      sum_agi_gain_dire = sum_agi_gain_dire + X_test.iloc[row, col]

    sum_int_gain_dire = 0
    for col in index_int_gain_dire:
      sum_int_gain_dire = sum_int_gain_dire + X_test.iloc[row, col]

    sum_base_health_dire = 0
    for col in index_base_health_dire:
      sum_base_health_dire = sum_base_health_dire + X_test.iloc[row, col]

    sum_base_health_reg_dire = 0
    for col in index_base_health_reg_dire:
      sum_base_health_reg_dire = sum_base_health_reg_dire + X_test.iloc[row, col]

    sum_move_speed_dire = 0
    for col in index_move_speed_dire:
      sum_move_speed_dire = sum_move_speed_dire + X_test.iloc[row, col]

    # Win rate historical stats for heroes and players
    
    index_winrate_rad = list(range(108+238,113+238))
    sum_winrate_rad = 0
    for col in index_winrate_rad:
      sum_winrate_rad = sum_winrate_rad + X_test.iloc[row, col]

    index_winrate_dire = list(range(113+238,118+238))
    sum_winrate_dire = 0
    for col in index_winrate_dire:
      sum_winrate_dire = sum_winrate_dire + X_test.iloc[row, col]

    index_winrate_player_rad = list(range(118+238,123+238))
    winrate_player_rad = 0
    for col in index_winrate_player_rad:
      winrate_player_rad = winrate_player_rad + X_test.iloc[row, col]

    index_winrate_player_dire = list(range(123+238,128+238))
    winrate_player_dire = 0
    for col in index_winrate_player_dire:
      winrate_player_dire = winrate_player_dire + X_test.iloc[row, col]


    # Win rate historical stats for hero-player tuple
    
    # RADIANT
    index_winrate_hp_rad = list(range(128+238,133+238))
    winrate_hp_rad = 0
    for col in index_winrate_hp_rad:
      winrate_hp_rad = winrate_hp_rad + X_test.iloc[row, col]

    index_xpm_hp_rad = list(range(133+238,138+238))
    xpm_hp_rad = 0
    for col in index_xpm_hp_rad:
      xpm_hp_rad = xpm_hp_rad + X_test.iloc[row, col]

    index_goldm_hp_rad = list(range(138+238,143+238))
    goldm_hp_rad = 0
    for col in index_goldm_hp_rad:
      goldm_hp_rad = goldm_hp_rad + X_test.iloc[row, col]

    index_deathsm_hp_rad = list(range(143+238,148+238))
    deathsm_hp_rad = 0
    for col in index_deathsm_hp_rad:
      deathsm_hp_rad = deathsm_hp_rad + X_test.iloc[row, col]

    index_damagem_hp_rad = list(range(148+238,153+238))
    damagem_hp_rad = 0
    for col in index_damagem_hp_rad:
      damagem_hp_rad = damagem_hp_rad + X_test.iloc[row, col]

    index_killm_hp_rad = list(range(153+238,158+238))
    killm_hp_rad = 0
    for col in index_killm_hp_rad:
      killm_hp_rad = killm_hp_rad + X_test.iloc[row, col]

    index_assistm_hp_rad = list(range(158+238,163+238))
    assistm_hp_rad = 0
    for col in index_assistm_hp_rad:
      assistm_hp_rad = assistm_hp_rad + X_test.iloc[row, col]

    index_damagem_hp_rad = list(range(163+238,168+238))
    herodamagem_hp_rad = 0
    for col in index_damagem_hp_rad:
      herodamagem_hp_rad = herodamagem_hp_rad + X_test.iloc[row, col]

    index_healm_hp_rad = list(range(168+238,173+238))
    healm_hp_rad = 0
    for col in index_healm_hp_rad:
      healm_hp_rad = healm_hp_rad + X_test.iloc[row, col]

    # DIRE
    index_winrate_hp_dire = list(range(173+238,178+238))
    winrate_hp_dire = 0
    for col in index_winrate_hp_dire:
      winrate_hp_dire = winrate_hp_dire + X_test.iloc[row, col]

    index_xpm_hp_dire = list(range(178+238,183+238))
    xpm_hp_dire = 0
    for col in index_xpm_hp_dire:
      xpm_hp_dire = xpm_hp_dire + X_test.iloc[row, col]

    index_goldm_hp_dire = list(range(183+238,188+238))
    goldm_hp_dire = 0
    for col in index_goldm_hp_dire:
      goldm_hp_dire = goldm_hp_dire + X_test.iloc[row, col]

    index_deathsm_hp_dire = list(range(188+238,193+238))
    deathsm_hp_rad = 0
    for col in index_deathsm_hp_rad:
      deathsm_hp_rad = deathsm_hp_rad + X_test.iloc[row, col]

    index_damagem_hp_dire = list(range(193+238,198+238))
    damagem_hp_rad = 0
    for col in index_damagem_hp_rad:
      damagem_hp_rad = damagem_hp_rad + X_test.iloc[row, col]

    index_killm_hp_dire = list(range(198+238,203+238))
    killm_hp_dire = 0
    for col in index_killm_hp_dire:
      killm_hp_dire = killm_hp_dire + X_test.iloc[row, col]

    index_assistm_hp_dire = list(range(203+238,208+238))
    assistm_hp_dire = 0
    for col in index_assistm_hp_dire:
      assistm_hp_dire = assistm_hp_dire + X_test.iloc[row, col]

    index_damagem_hp_dire = list(range(208+238,213+238))
    herodamagem_hp_dire = 0
    for col in index_damagem_hp_dire:
      herodamagem_hp_dire = herodamagem_hp_dire + X_test.iloc[row, col]

    index_healm_hp_dire = list(range(213+238,218+238))
    healm_hp_dire = 0
    for col in index_healm_hp_dire:
      healm_hp_dire = healm_hp_dire + X_test.iloc[row, col]
  
    new_row = [sum_hero_rad] + [sum_hero_dire] + list(X_test.iloc[row, 238:256]) + [sum_base_str_rad] + [sum_base_agi_rad] + [sum_base_int_rad] + [sum_str_gain_rad] + [sum_agi_gain_rad] + [sum_int_gain_rad] + [sum_base_health_rad] + [sum_base_health_reg_rad] + [sum_move_speed_rad] + [sum_base_str_dire] + [sum_base_agi_dire] + [sum_base_int_dire] + [sum_str_gain_dire] + [sum_agi_gain_dire] + [sum_int_gain_dire] + [sum_base_health_dire] + [sum_base_health_reg_dire] + [sum_move_speed_dire] + [sum_winrate_rad] + [sum_winrate_dire] + [winrate_player_rad] + [winrate_player_dire] + [winrate_hp_rad] + [xpm_hp_rad] + [goldm_hp_rad] + [deathsm_hp_rad] + [damagem_hp_rad] + [killm_hp_rad] + [assistm_hp_rad] + [herodamagem_hp_rad] + [healm_hp_rad] + [winrate_hp_dire] + [xpm_hp_dire] + [goldm_hp_dire] + [deathsm_hp_dire] + [damagem_hp_dire] + [killm_hp_dire] + [assistm_hp_dire] + [herodamagem_hp_dire] + [healm_hp_dire] + [X_test.iloc[row,198]]

    X_test_new.loc[row] = new_row

In [None]:
# Replace missing values with the median of their column
column_medians = X_test_new.median()
X_test_new = X_test_new.fillna(column_medians)
X_test_new.shape

In [None]:
# Generate the SHAP summary plot of the grouped features and save it as a PDF

# Reset matplotlib to its default settings, then switch to bold text
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams["font.weight"] = "bold"

# show=False: the plot is not displayed yet, so the current figure can still be restyled and saved below
fig = shap.summary_plot(shap_values_new, X_test_new, show=False)  
plt.xlabel('')  # blank out the x-axis label
plt.xticks(fontsize=18)
plt.yticks(fontsize=22)
plt.savefig('shapvalues_time_bold.pdf', bbox_inches = "tight")

In [None]:
# Generate the SHAP bar plot of the grouped features and save it as a PDF

# Reset matplotlib to its default settings, then switch to bold text
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams["font.weight"] = "bold"

# show=False: the plot is not displayed yet, so the current figure can still be resized and saved below
fig = shap.summary_plot(shap_values_new, X_test_new, plot_type="bar", show=False) 
# Print the figure size chosen by shap before overriding it
size = plt.gcf().get_size_inches()
print(size) 
plt.gcf().set_size_inches(9, 13)
plt.xlabel('')  # blank out the x-axis label
plt.xticks(fontsize=21)
plt.yticks(fontsize=24)
plt.savefig('shapvalues_time_barplot_bold.pdf', bbox_inches = "tight")



---



### **SCORE BLOWOUT MATCHES**

In [None]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier

from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, auc

import statistics as st

import warnings
warnings.filterwarnings('ignore')
from datetime import datetime

import matplotlib.pylab as pl
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import shap

In [None]:
# Set the seed of NumPy's global random generator for reproducibility purposes
np.random.seed(3)

In [None]:
# Load the JS visualization code of shap into the notebook
shap.initjs()

In [None]:
# NOTE: uncomment this cell if you are running this code on a local machine. Please adjust the following variables to correctly point to the feature file location on your machine

# # Set directory for the score blowout match group
# cwd = os.getcwd()
# root_directory = os.path.dirname(cwd)

# score_blowout_data_dir = root_directory + "\\model_features_pre-match\\score_blowout\\"
# path_to_features = score_blowout_data_dir + "dota2_score_blowout_features.csv"

In [None]:
# NOTE: use this cell if you are running this code on Google Colab

# Path to the score blowout feature file inside the Colab session.
# The file must be uploaded to the session before the next cell reads it.
path_to_features = "/content/dota2_score_blowout_features.csv"

In [None]:
# Read the data (model feature file of the score blowout match group) from path_to_features
feature_score_blowout_df = pd.read_csv(path_to_features)

#### Data exploration

In [None]:
# Number of columns in the feature file
feature_score_blowout_df.shape[1]

In [None]:
# Drop the match id column before the features are selected.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
feature_score_blowout_df = feature_score_blowout_df.drop(columns=['match_id'], errors='ignore')

In [None]:
# Preview the first rows of the feature frame
feature_score_blowout_df.head()

In [None]:
# Number of matches per value of the target label
feature_score_blowout_df['win_label'].value_counts()

### **Model building, training, and evaluation**

In [None]:
# Split the frame into the feature matrix X and the target vector y.
# Both are selected by column name, so `features`, `X` and `y` always agree,
# wherever the target column sits in the file.
target = 'win_label'
features = [c for c in feature_score_blowout_df.columns if c != target]
X, y = feature_score_blowout_df[features], feature_score_blowout_df[target]

#### Randomized search to tune hyperparameters

NOTE: This step takes a while to run. If you want to skip the hyperparameter tuning, you can use the latest best parameters found (listed at the beginning of the next section).

In [None]:
# Stopwatch helper used to time the hyperparameter search
def timer(start_time=None):
    """Start a stopwatch or report the time elapsed since it was started.

    Called without a (truthy) `start_time`, returns the current datetime to be
    used as the starting point. Called with a starting datetime, prints the
    elapsed hours, minutes and seconds and returns None.
    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        thour, remainder = divmod(elapsed_seconds, 3600)
        tmin, tsec = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        return None
    return datetime.now()

In [None]:
# Candidate values per XGBoost hyperparameter; the search below samples combinations from these lists
params_search = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.3, 0.5, 0.7],
    max_depth=[4, 6, 8, 10],
    n_estimators=[10, 50, 100],
)

In [None]:
# Base estimator handed to the randomized hyperparameter search.
# `verbosity` and `n_jobs` are the current names of the deprecated `silent` and
# `nthread` arguments; verbosity=1 prints warnings only.
xgb = XGBClassifier(objective='binary:logistic', verbosity=1, n_jobs=4)

In [None]:
# Hold out a stratified 20% of the samples; the remaining 80% feed the hyperparameter search
train, test, train_labels, test_labels = train_test_split(
    X, y, test_size=0.2, stratify=y
)

In [None]:
# NOTE: the hyperparameter tuning (this cell) might take a while to execute

folds = 5
param_comb = 100

# Stratified folds keep the class distribution of the original data inside every split
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)

# Randomized search: evaluates `param_comb` candidates drawn from `params_search`, scored by ROC AUC
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=params_search,
    n_iter=param_comb,
    scoring='roc_auc',
    n_jobs=4,
    cv=skf.split(train, train_labels),
    verbose=True,
)

# Fit the search and report how long it took
start_time = timer(None)  # returns the starting timestamp
random_search.fit(train, train_labels)
timer(start_time)  # prints the elapsed time since start_time

In [None]:
# Report the best parameter combination found by the randomized search
print('\n Best hyperparameters:', random_search.best_params_, sep='\n')

**Best hyperparameters score blowout:**
{'subsample': 1.0, 'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0.5, 'colsample_bytree': 0.3}

#### Use best hyperparameters to build and train the model

In [None]:
# Splitter for the 10-fold cross-validation (rows are shuffled before being split)
kfolds = KFold(shuffle=True, n_splits=10)

In [None]:
# Booster settings: fixed objective and evaluation metric plus the tuned hyperparameter values
param = dict(
    objective='binary:logistic',
    eval_metric='auc',
    subsample=1.0,
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=6,
    gamma=0.5,
)

# Number of boosting rounds; corresponds to 'n_estimators' (number of trees)
num_round = 100

In [None]:
# NOTE: the training process (this cell) might take a while to execute

import xgboost as xgb

# One AUC score per cross-validation fold
auc = []
for train_idx, test_idx in kfolds.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Ratio of negatives to positives in this training fold (assuming 0/1 labels)
    param['scale_pos_weight'] = (y_train.size - y_train.sum()) / y_train.sum()

    xg_train = xgb.DMatrix(X_train.values, label=y_train.values, feature_names=features)
    xg_test = xgb.DMatrix(X_test.values, label=y_test.values, feature_names=features)

    # Both splits are evaluated during boosting; verbose_eval=False silences the per-round log
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    bst = xgb.train(param, xg_train, num_round, watchlist, verbose_eval=False)

    # Score the held-out fold
    preds = bst.predict(xg_test)
    auc.append(roc_auc_score(y_test, preds))

f'Median AUC: {st.median(auc):.04f}'

### **SHAP values - Score blowout**

In [None]:
# Compute the SHAP values used for the summary plots.
# Only the last fold is explained (`bst` and `X_test` as left by the training
# loop), as there is little variance between the folds.
explainer = shap.TreeExplainer(model=bst)
shap_values = explainer.shap_values(X=X_test)

In [None]:
# Global importance of each feature: the mean, over all explained rows, of the
# absolute change in the prediction attributed to that feature.

np.mean(np.abs(shap_values), axis=0)

In [None]:
# Dimensions of the shap_values matrix (rows x features)
np.shape(shap_values)

In [None]:
# Keep a backup in case it is necessary.
# np.copy gives the backup its own data; a plain assignment would only create
# a second name for the same array, so in-place edits would hit both.
shap_values_backup = np.copy(shap_values)

In [None]:
# Column names of the aggregated (one column per feature group) matrices.
# Suffix `_r` = radiant, `_d` = dire; within each family all radiant columns
# come first, followed by the dire columns in the same order.
_teams = ('r', 'd')
_roles = ['carry', 'support', 'nuker', 'disabler', 'jungler',
          'durable', 'escape', 'pusher', 'initiator']
_hero_stats = ['strength', 'agility', 'intellig',
               'strength_gain', 'agility_gain', 'intellig_gain',
               'health', 'health_regeneration', 'move_speed']
_hp_metrics = ['winrate', 'xp_min', 'gold_min', 'death_min', 'taken_damage_min',
               'kill_min', 'assist_min', 'caused_damage_min', 'heal_min']

new_columns = (
    ['heroes_r', 'heroes_d']
    + [f'role_{role}_{team}' for team in _teams for role in _roles]
    + [f'{stat}_{team}' for team in _teams for stat in _hero_stats]
    + ['hero_winrate_r', 'hero_winrate_d', 'player_winrate_r', 'player_winrate_d']
    + [f'hp_{metric}_{team}' for team in _teams for metric in _hp_metrics]
    + ['first_pick']
)

len(new_columns)

In [None]:
# Create new shap values matrix: collapse the per-hero / per-player SHAP
# columns into one column per feature group, in the order of `new_columns`.
#
# Column layout read from `shap_values`:
#     0-118   radiant hero one-hot columns          -> summed
#   119-237   dire hero one-hot columns             -> summed
#   238-255   18 role columns                       -> copied unchanged
#   256-345   hero stats: 2 teams x 5 heroes x 9    -> summed over the 5 heroes
#   346-455   22 groups of 5 consecutive columns    -> each group summed
#             (hero win rate r/d, player win rate r/d, then the nine
#              hero-player metrics for radiant followed by dire)
#
# Every dire group is summed over its own columns; in particular the dire
# deaths/min and damage-taken/min groups use columns 426-430 and 431-435.
n_hero_cols = 119                              # one-hot width per team
roles_start = 2 * n_hero_cols                  # 238
stats_start = roles_start + 18                 # 256
rates_start = stats_start + 2 * 5 * 9          # 346
rates_end = rates_start + 22 * 5               # 456
# NOTE(review): column 198 lies inside the dire hero one-hot range summed
# above; confirm that this is really the position of the first-pick feature.
first_pick_col = 198

shap_matrix = np.asarray(shap_values, dtype=float)
if shap_matrix.ndim != 2 or shap_matrix.shape[1] < rates_end:
    raise ValueError(
        'shap_values must be 2-D with at least {} columns, got shape {}'.format(
            rates_end, shap_matrix.shape
        )
    )
n_rows = shap_matrix.shape[0]

# (n_rows, 2): total hero one-hot contribution for radiant, then dire
heroes_part = shap_matrix[:, :roles_start].reshape(n_rows, 2, n_hero_cols).sum(axis=2)
# (n_rows, 18): role columns are already one per feature
roles_part = shap_matrix[:, roles_start:stats_start]
# (n_rows, 18): within a team, stat k of hero p sits at offset 9 * p + k, so
# summing over the hero axis leaves the nine stats per team
stats_part = (
    shap_matrix[:, stats_start:rates_start]
    .reshape(n_rows, 2, 5, 9)
    .sum(axis=2)
    .reshape(n_rows, 18)
)
# (n_rows, 22): each historical statistic occupies 5 consecutive columns
rates_part = shap_matrix[:, rates_start:rates_end].reshape(n_rows, 22, 5).sum(axis=2)
# (n_rows, 1)
first_pick_part = shap_matrix[:, [first_pick_col]]

# Built in one step (rows are numbered 0..n_rows-1) instead of growing the
# frame row by row.
shap_values_new = pd.DataFrame(
    np.hstack([heroes_part, roles_part, stats_part, rates_part, first_pick_part]),
    columns=new_columns,
)

In [None]:
# Dimensions of the aggregated shap values matrix (rows x grouped features)
np.shape(shap_values_new)

In [None]:
# Convert to a float NumPy array.
# np.asarray accepts both the DataFrame and an array, so re-running this cell
# is harmless (DataFrame.to_numpy would fail on the second run).
shap_values_new = np.asarray(shap_values_new, dtype=float)

In [None]:
# Modify X_test accordingly: collapse the per-hero / per-player feature
# columns into one column per feature group, in the order of `new_columns`,
# so the feature values line up with the aggregated SHAP matrix.
#
# Column layout read from `X_test`:
#     0-118   radiant hero one-hot columns          -> summed
#   119-237   dire hero one-hot columns             -> summed
#   238-255   18 role columns                       -> copied unchanged
#   256-345   hero stats: 2 teams x 5 heroes x 9    -> summed over the 5 heroes
#   346-455   22 groups of 5 consecutive columns    -> each group summed
#             (hero win rate r/d, player win rate r/d, then the nine
#              hero-player metrics for radiant followed by dire)
#
# Every group is computed from this cell's own data: the dire deaths/min and
# damage-taken/min groups use columns 426-430 and 431-435, and the radiant
# damage-taken/min group uses columns 386-390.
n_hero_cols = 119                              # one-hot width per team
roles_start = 2 * n_hero_cols                  # 238
stats_start = roles_start + 18                 # 256
rates_start = stats_start + 2 * 5 * 9          # 346
rates_end = rates_start + 22 * 5               # 456
# NOTE(review): column 198 lies inside the dire hero one-hot range summed
# above; confirm that this is really the position of the first-pick feature.
first_pick_col = 198

feature_matrix = np.asarray(X_test, dtype=float)
if feature_matrix.ndim != 2 or feature_matrix.shape[1] < rates_end:
    raise ValueError(
        'X_test must be 2-D with at least {} columns, got shape {}'.format(
            rates_end, feature_matrix.shape
        )
    )
n_rows = feature_matrix.shape[0]

# Missing values propagate through the sums as NaN, exactly as with plain
# addition; they are imputed in the next cell.
# (n_rows, 2): number of picked heroes flagged for radiant, then dire
heroes_part = feature_matrix[:, :roles_start].reshape(n_rows, 2, n_hero_cols).sum(axis=2)
# (n_rows, 18): role columns are already one per feature
roles_part = feature_matrix[:, roles_start:stats_start]
# (n_rows, 18): within a team, stat k of hero p sits at offset 9 * p + k, so
# summing over the hero axis leaves the nine stats per team
stats_part = (
    feature_matrix[:, stats_start:rates_start]
    .reshape(n_rows, 2, 5, 9)
    .sum(axis=2)
    .reshape(n_rows, 18)
)
# (n_rows, 22): each historical statistic occupies 5 consecutive columns
rates_part = feature_matrix[:, rates_start:rates_end].reshape(n_rows, 22, 5).sum(axis=2)
# (n_rows, 1)
first_pick_part = feature_matrix[:, [first_pick_col]]

# Built in one step (rows are numbered 0..n_rows-1) instead of growing the
# frame row by row.
X_test_new = pd.DataFrame(
    np.hstack([heroes_part, roles_part, stats_part, rates_part, first_pick_part]),
    columns=new_columns,
)

In [None]:
# Impute missing values column by column with that column's median
column_medians = X_test_new.median()
X_test_new = X_test_new.fillna(value=column_medians)
X_test_new.shape

In [None]:
# Generate summary plot with proper plot configurations

# Reset matplotlib to its defaults, then render all text in bold
plt.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams.update({"font.weight": "bold"})

# show=False leaves the figure open so it can be restyled and saved below
fig = shap.summary_plot(shap_values_new, features=X_test_new, show=False)
plt.xlabel('')
plt.xticks(fontsize=18)
plt.yticks(fontsize=22)
plt.savefig('shapvalues_score_bold.pdf', bbox_inches="tight")

In [None]:
# Generate barplot with proper plot configurations

# Reset matplotlib to its defaults, then render all text in bold
plt.rcParams.update(matplotlib.rcParamsDefault)
plt.rcParams.update({"font.weight": "bold"})

# show=False leaves the figure open so it can be resized, restyled and saved
fig = shap.summary_plot(shap_values_new, features=X_test_new, plot_type="bar", show=False)
current_fig = plt.gcf()
size = current_fig.get_size_inches()
print(size)  # size chosen by shap, shown before it is overridden
current_fig.set_size_inches(9, 13)
plt.xlabel('')
plt.xticks(fontsize=21)
plt.yticks(fontsize=24)
plt.savefig('shapvalues_score_barplot_bold.pdf', bbox_inches="tight")