## Compute basic data stats and perform Feature Engineering (Pre-match features)

This notebook computes basic statistics of the data and collects/computes the pre-match features used by the machine learning models.

Date: May 28th, 2020

In [5]:
#Import necessary libraries
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import percentile
import statistics as st
import csv
from time import gmtime, strftime
from datetime import datetime, timedelta

In [None]:
# Locate the repository root: it is the parent of the directory the notebook runs from.
# All data paths used further down are built relative to repo_root_directory.
cwd = os.getcwd()
repo_root_directory, _ = os.path.split(cwd)
print(repo_root_directory)

In [7]:
# Adjust cell width: inject a CSS rule so the notebook container uses 75% of the page width.
# display/HTML are imported from the public IPython.display module; importing display from
# IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

### Collect features for the ML models

In [None]:
# Build columns_df, the ordered list of feature column names, and an empty data frame
# (features_df) that uses it as its schema. The order in which names are added below is the
# column order of the data frame, so the groups must stay in this sequence.


def _hero_slot_columns(prefix, side):
    """Return the five per-hero column names prefix + <1..5> + '_' + side (side is 'r' or 'd')."""
    return [prefix + str(slot) + "_" + side for slot in range(1, 6)]


with open(repo_root_directory + "/data-analysis/patch.json", 'r') as patch_file:
    patches = json.load(patch_file)

with open(repo_root_directory + "/data-analysis/heroes.json", 'r') as hero_file:
    heroes = json.load(hero_file)  # 119 different heroes May 23rd

# Hero ids, in the order in which the loaded heroes object yields them
hero_id_list = list(heroes)

columns_df = ['match_id']

# Binary arrays to flag if hero is in radiant team or dire team (one column per hero and team)
columns_df.extend("rad_hero_" + str(hero_key) for hero_key in hero_id_list)
columns_df.extend("dire_hero_" + str(hero_key) for hero_key in hero_id_list)

# Hero role columns, radiant (_r) first, then dire (_d)
_role_names = ['carry', 'support', 'nuker', 'disabler', 'jungler',
               'durable', 'escape', 'pusher', 'initiator']

hero_roles_r_list = ['role_' + role_name + '_r' for role_name in _role_names]
hero_roles_d_list = ['role_' + role_name + '_d' for role_name in _role_names]

columns_df.extend(hero_roles_r_list)
columns_df.extend(hero_roles_d_list)

# Hero attribute columns: nine stats for each of the five heroes of a team (hero-major order),
# followed by the team mean (_mn_) and median (_md_) of every stat
_stat_long_names = ['base_str', 'base_agi', 'base_int', 'str_gain', 'agi_gain', 'int_gain',
                    'base_health', 'base_health_reg', 'move_speed']
_stat_short_names = ['bstr', 'bagi', 'bint', 'strg', 'agig', 'intg',
                     'bhealth', 'bhealth_reg', 'mspeed']

columns_stats_r = [stat + "_hero_" + str(slot) + "_r"
                   for slot in range(1, 6) for stat in _stat_long_names]
columns_stats_d = [stat + "_hero_" + str(slot) + "_d"
                   for slot in range(1, 6) for stat in _stat_long_names]

columns_stats_mn_r = [stat + '_mn_r' for stat in _stat_short_names]
columns_stats_md_r = [stat + '_md_r' for stat in _stat_short_names]
columns_stats_mn_d = [stat + '_mn_d' for stat in _stat_short_names]
columns_stats_md_d = [stat + '_md_d' for stat in _stat_short_names]

for _column_group in (columns_stats_r, columns_stats_mn_r, columns_stats_md_r,
                      columns_stats_d, columns_stats_mn_d, columns_stats_md_d):
    columns_df.extend(_column_group)

# Hero win rate columns (five heroes, then mean and median) for each team
win_rate_r = _hero_slot_columns('win_rate_hero', 'r')
win_rate_d = _hero_slot_columns('win_rate_hero', 'd')

columns_df.extend(win_rate_r + ['winR_mn_r', 'winR_md_r'])
columns_df.extend(win_rate_d + ['winR_mn_d', 'winR_md_d'])

# Player win rate columns (five players, then mean and median) for each team
win_rate_player_r = _hero_slot_columns('win_rate_player', 'r')
win_rate_player_d = _hero_slot_columns('win_rate_player', 'd')

columns_df.extend(win_rate_player_r + ['winR_plr_mn_r', 'winR_plr_md_r'])
columns_df.extend(win_rate_player_d + ['winR_plr_mn_d', 'winR_plr_md_d'])


# RADIANT - "_hp_" columns: five per-hero values followed by their mean and median
win_rate_hp_list_r = _hero_slot_columns('win_rate_hp_hero', 'r')
columns_df.extend(win_rate_hp_list_r + ['winR_hp_mn_r', 'winR_hp_md_r'])

xpm_hp_list_r = _hero_slot_columns('xpm_hp_hero', 'r')
columns_df.extend(xpm_hp_list_r + ['xpm_hp_mn_r', 'xpm_hp_md_r'])

goldm_hp_list_r = _hero_slot_columns('goldm_hp_hero', 'r')
columns_df.extend(goldm_hp_list_r + ['goldm_hp_mn_r', 'goldm_hp_md_r'])

deathsm_hp_list_r = _hero_slot_columns('deathsm_hp_hero', 'r')
columns_df.extend(deathsm_hp_list_r + ['deathsm_hp_mn_r', 'deathsm_hp_md_r'])

# DAMAGE THIS HERO RECEIVED FROM ALL CREATURES (OPPONENT HEROES AND CREEPS)
damagem_hp_list_r = _hero_slot_columns('damagem_hp_hero', 'r')
columns_df.extend(damagem_hp_list_r + ['damagem_hp_mn_r', 'damagem_hp_md_r'])

killsm_hp_list_r = _hero_slot_columns('killsm_hp_hero', 'r')
columns_df.extend(killsm_hp_list_r + ['killsm_hp_mn_r', 'killsm_hp_md_r'])

assistsm_hp_list_r = _hero_slot_columns('assistsm_hp_hero', 'r')
columns_df.extend(assistsm_hp_list_r + ['assistsm_hp_mn_r', 'assistsm_hp_md_r'])

# DAMAGE THIS HERO CAUSES IN OTHER HEROES - DAMAGE DEALT TO OTHER HEROES
# (rebinds damagem_hp_list_r; the "damage received" names are already in columns_df)
damagem_hp_list_r = _hero_slot_columns('hero_damagem_hp_hero', 'r')
columns_df.extend(damagem_hp_list_r + ['hero_damagem_hp_mn_r', 'hero_damagem_hp_md_r'])

healingm_hp_list_r = _hero_slot_columns('healingm_hp_hero', 'r')
columns_df.extend(healingm_hp_list_r + ['healingm_hp_mn_r', 'healingm_hp_md_r'])


# DIRE - same layout as the radiant "_hp_" columns
# NOTE: the dire win-rate aggregates are named 'win_rate_hp_mn_d'/'win_rate_hp_md_d' while the
# radiant ones are 'winR_hp_mn_r'/'winR_hp_md_r'; the names are kept as they are so that the
# column names of the produced feature files do not change.
win_rate_hp_list_d = _hero_slot_columns('win_rate_hp_hero', 'd')
columns_df.extend(win_rate_hp_list_d + ['win_rate_hp_mn_d', 'win_rate_hp_md_d'])

xpm_hp_list_d = _hero_slot_columns('xpm_hp_hero', 'd')
columns_df.extend(xpm_hp_list_d + ['xpm_hp_mn_d', 'xpm_hp_md_d'])

goldm_hp_list_d = _hero_slot_columns('goldm_hp_hero', 'd')
columns_df.extend(goldm_hp_list_d + ['goldm_hp_mn_d', 'goldm_hp_md_d'])

deathsm_hp_list_d = _hero_slot_columns('deathsm_hp_hero', 'd')
columns_df.extend(deathsm_hp_list_d + ['deathsm_hp_mn_d', 'deathsm_hp_md_d'])

# DAMAGE THIS HERO RECEIVED FROM ALL CREATURES (OPPONENT HEROES AND CREEPS)
damagem_hp_list_d = _hero_slot_columns('damagem_hp_hero', 'd')
columns_df.extend(damagem_hp_list_d + ['damagem_hp_mn_d', 'damagem_hp_md_d'])

killsm_hp_list_d = _hero_slot_columns('killsm_hp_hero', 'd')
columns_df.extend(killsm_hp_list_d + ['killsm_hp_mn_d', 'killsm_hp_md_d'])

assistsm_hp_list_d = _hero_slot_columns('assistsm_hp_hero', 'd')
columns_df.extend(assistsm_hp_list_d + ['assistsm_hp_mn_d', 'assistsm_hp_md_d'])

# DAMAGE THIS HERO CAUSES IN OTHER HEROES - DAMAGE DEALT TO OTHER HEROES
# (rebinds damagem_hp_list_d; the "damage received" names are already in columns_df)
damagem_hp_list_d = _hero_slot_columns('hero_damagem_hp_hero', 'd')
columns_df.extend(damagem_hp_list_d + ['hero_damagem_hp_mn_d', 'hero_damagem_hp_md_d'])

healingm_hp_list_d = _hero_slot_columns('healingm_hp_hero', 'd')
columns_df.extend(healingm_hp_list_d + ['healingm_hp_mn_d', 'healingm_hp_md_d'])

# Last two columns: the first-pick flag and the label column
columns_df.extend(['rad_first_pick', 'win_label'])
print(len(columns_df))

# Data frame to save model features
features_df = pd.DataFrame(columns=columns_df)
print(features_df)

In [None]:
# Run to build a dictionary that maps every match file name to the start date of that match.
# The date is derived from the 'start_time' field of the match JSON, converted with gmtime and
# written as "year-month-day" (numbers are not zero padded).
all_matches_dir = repo_root_directory + "/data-collection/matches-after-cleaning/"
all_matches_json_files = os.listdir(all_matches_dir)

matches_start_date_dict = {}
count_matches = 0
for file_position, each_match_file in enumerate(all_matches_json_files):
    print(file_position)  # progress output: number of files handled so far
    count_matches = file_position + 1
    with open(all_matches_dir + each_match_file, 'r') as match_handle:
        start_timestamp = json.load(match_handle)['start_time']
    start_struct = gmtime(start_timestamp)
    matches_start_date_dict[each_match_file] = '{}-{}-{}'.format(
        start_struct.tm_year, start_struct.tm_mon, start_struct.tm_mday)

### Load data from match historical stats dictionary

In [10]:
# The next parts of the code depend on the matches_dict files. These files can be obtained from
# the dataset hosted on Zenodo (http://doi.org/10.5281/zenodo.3890315).
# Adjust the directories below to the location where you stored those files.

_matches_dict_root = repo_root_directory + "/data-collection/matches_dict/"

# Directory with the match history files, and directory with the complementary ("new data") files
match_hist_dir = _matches_dict_root + "matches_dict/"
match_hist_newdata_dir = _matches_dict_root + "matches_dict_complement/"

print(match_hist_dir)
print(match_hist_newdata_dir)

C:\Users\markos-ece\Desktop\Viggiato\PhD - UofA\Research\2-Dota2\git-repo-code/data-collection/matches_dict/matches_dict_all/
C:\Users\markos-ece\Desktop\Viggiato\PhD - UofA\Research\2-Dota2\git-repo-code/data-collection/matches_dict/matches_dict_all-newdata/


In [11]:
def _load_match_hist_frames(directory):
    """Load every match history CSV found in *directory*.

    The match id is taken from the file name by removing "dict_" and ".csv" and converting the
    rest to int. The first column of every CSV is dropped before the frame is stored.
    Directory entries whose name does not yield an integer match id (for example hidden files
    or checkpoint folders) are reported and skipped instead of aborting the whole load.

    Returns a tuple (entry_names, frames) where entry_names is the raw os.listdir result and
    frames maps the integer match id to its pandas data frame.
    """
    entry_names = os.listdir(directory)
    print("Number of match hist files : " + str(len(entry_names)))

    frames = {}
    for entry_name in entry_names:
        try:
            match_id_key = int(entry_name.replace("dict_", "").replace(".csv", ""))
        except ValueError:
            print("Skipping unexpected entry : " + entry_name)
            continue
        frame = pd.read_csv(directory + entry_name)
        frames[match_id_key] = frame.drop(frame.columns[0], axis=1)

    print(len(frames))
    return entry_names, frames


# Build the match_hist_stats dictionary from the match dict files
all_match_hist_csv, match_hist_stats = _load_match_hist_frames(match_hist_dir)

# Build the match_hist_stats_newdata dictionary from the match dict new data files
all_match_hist_newdata_csv, match_hist_stats_newdata = _load_match_hist_frames(match_hist_newdata_dir)

Number of match hist files : 55287
55287
Number of match hist files : 55287
55287


In [27]:
# Collect the match ids (as int) of both history dictionaries and print how many there are.
# list_keys is rebound by the second statement, so it ends up holding the ids of
# match_hist_stats_newdata only.
list_keys = [int(match_key) for match_key in match_hist_stats]
print(len(list_keys))
list_keys = [int(match_key) for match_key in match_hist_stats_newdata]
print(len(list_keys))

55287
55287


In [None]:
# Compute the features. Adjust the following directories to the location of the match JSON files.
# Note that the features are computed for each group of matches separately, so this cell has to
# be run once for every group.

_data_analysis_dir = repo_root_directory + "/data-analysis/"

dataset_group_dir = _data_analysis_dir + "dataset_groups/regular-new/"
model_features_dir = _data_analysis_dir + "prediction-models/model_features_pre-match_newdata/regular-new/"
hero_historical_stats_dir = _data_analysis_dir + "hero-attributes/hero_historical_stats/"
all_matches_dir = repo_root_directory + "/data-collection/matches-after-cleaning/"

# Match files of the selected group and the counters used while iterating over them
all_json_files_get_features = os.listdir(dataset_group_dir)
index = index_add_data = count_not_included = 0

# Fresh, empty data frame to save model features for this group
features_df = pd.DataFrame(columns=columns_df)

for jsonfile in all_json_files_get_features:
    json_id = jsonfile.replace("match-", "")
    json_id_pure = int(json_id.replace(".json", ""))
    
    if json_id_pure not in list_keys:
        count_not_included += 1
        continue
    
    print("Collecting features for file : " + str(index))
    index += 1
    with open(dataset_group_dir + jsonfile , 'r') as f:
        match = json.load(f)
        match_id = match['match_id']
    
    ## MATCH ID
    print(match_id)
    
    ####################### TRUTH LABEL #######################
    
    win_label_bool = match['radiant_win']
    if win_label_bool:
        win_label = 1
    else:
        win_label = 0
    
    ####################### HERO BINARY ARRAY (BOTH TEAMS) #######################
    
    # radiant team
    hero_binary_array_rad = {}
    for hero_id in hero_id_list:
        hero_binary_array_rad[hero_id] = 0    

    # dire team
    hero_binary_array_dire = {}
    for hero_id in hero_id_list:
        hero_binary_array_dire[hero_id] = 0    
    
    players = match['players']
    for each_player in players:
        hero_id = each_player['hero_id']
        is_player_in_Radiant = each_player['isRadiant']
        if is_player_in_Radiant:
            hero_binary_array_rad[str(hero_id)] = 1
        else:
            hero_binary_array_dire[str(hero_id)] = 1
            
    # binary array to use - radiant
    hero_binary_array_rad_use = list()
    for hero_id in hero_id_list:
        element = hero_binary_array_rad[str(hero_id)]
        hero_binary_array_rad_use.append(int(element))
    
    # binary array to use - dire
    hero_binary_array_dire_use = list()
    for hero_id in hero_id_list:
        element = hero_binary_array_dire[str(hero_id)]
        hero_binary_array_dire_use.append(int(element))


    ####################### HERO ROLE BINARY ARRAY (BOTH TEAMS) #######################
    
    heroes_in_radiant_list = list()
    heroes_in_dire_list = list()
    players_array = match['players']
    for each_player in players_array:
        if each_player['isRadiant']:
            heroes_in_radiant_list.append(each_player['hero_id'])
        else:
            heroes_in_dire_list.append(each_player['hero_id'])

    role_carry_rad = 0
    role_support_rad = 0
    role_nuker_rad = 0
    role_disabler_rad = 0
    role_jungler_rad = 0
    role_durable_rad = 0
    role_escape_rad = 0
    role_pusher_rad = 0
    role_initiator_rad = 0
    
    role_carry_dire = 0
    role_support_dire = 0
    role_nuker_dire = 0
    role_disabler_dire = 0
    role_jungler_dire = 0
    role_durable_dire = 0
    role_escape_dire = 0
    role_pusher_dire = 0
    role_initiator_dire = 0
    
    
    # compute number of times each role appears (it can also be a binary array, just flagging whether the role is present or not)
    for each_hero_id_radiant in heroes_in_radiant_list:
        try:
            specific_hero_dict = heroes[str(each_hero_id_radiant)]
            role_array = specific_hero_dict['roles']
            for each_role in role_array:
                if each_role == 'Carry':
                    role_carry_rad = role_carry_rad + 1
                elif each_role == 'Support':
                    role_support_rad = role_support_rad + 1
                elif each_role == 'Nuker':
                    role_nuker_rad = role_nuker_rad + 1
                elif each_role == 'Disabler':
                    role_disabler_rad = role_disabler_rad + 1
                elif each_role == 'Jungler':
                    role_jungler_rad = role_jungler_rad + 1
                elif each_role == 'Durable':
                    role_durable_rad = role_durable_rad + 1
                elif each_role == 'Escape':
                    role_escape_rad = role_escape_rad + 1
                elif each_role == 'Pusher':
                    role_pusher_rad = role_pusher_rad + 1
                elif each_role == 'Initiator':
                    role_initiator_rad = role_initiator_rad + 1
        except:
            continue
            
    for each_hero_id_dire in heroes_in_dire_list:
        try:
            specific_hero_dict = heroes[str(each_hero_id_dire)]
            role_array = specific_hero_dict['roles']
            for each_role in role_array:
                if each_role == 'Carry':
                    role_carry_dire = role_carry_dire + 1
                elif each_role == 'Support':
                    role_support_dire = role_support_dire + 1
                elif each_role == 'Nuker':
                    role_nuker_dire = role_nuker_dire + 1
                elif each_role == 'Disabler':
                    role_disabler_dire = role_disabler_dire + 1
                elif each_role == 'Jungler':
                    role_jungler_dire = role_jungler_dire + 1
                elif each_role == 'Durable':
                    role_durable_dire = role_durable_dire + 1
                elif each_role == 'Escape':
                    role_escape_dire = role_escape_dire + 1
                elif each_role == 'Pusher':
                    role_pusher_dire = role_pusher_dire + 1
                elif each_role == 'Initiator':
                    role_initiator_dire = role_initiator_dire + 1
        except:
            continue
        
    role_array_list_rad = list()
    role_array_list_rad = [role_carry_rad, role_support_rad, role_nuker_rad, role_disabler_rad, role_jungler_rad,
                           role_durable_rad, role_escape_rad, role_pusher_rad, role_initiator_rad]
    
    role_array_list_dire = list()
    role_array_list_dire = [role_carry_dire, role_support_dire, role_nuker_dire, role_disabler_dire, role_jungler_dire,
                          role_durable_dire, role_escape_dire, role_pusher_dire, role_initiator_dire]


    ####################### HISTORICAL STATS OF HEROES #######################
    
    match_patch = match['patch']
    for each_patch_id in patches:
        if each_patch_id['id'] == match_patch:
            patch_name = each_patch_id['name']
            break
    
    base_str_list_rad = list()
    base_agi_list_rad = list()
    base_int_list_rad = list()
    str_gain_list_rad = list()
    agi_gain_list_rad = list()
    int_gain_list_rad = list()
    base_health_list_rad = list()
    base_health_reg_list_rad = list()
    move_speed_list_rad = list()
    
    base_str_list_dire = list()
    base_agi_list_dire = list()
    base_int_list_dire = list()
    str_gain_list_dire = list()
    agi_gain_list_dire = list()
    int_gain_list_dire = list()
    base_health_list_dire = list()
    base_health_reg_list_dire = list()
    move_speed_list_dire = list()
    
    for each_hero_id_radiant in heroes_in_radiant_list:
        try:
            specific_hero_dict = heroes[str(each_hero_id_radiant)]    
            hero_name = specific_hero_dict['localized_name']

            hero_hist_stats_df = pd.read_csv(hero_historical_stats_dir + hero_name + "_historical_stats.csv")
            hero_hist_stats_df.drop(hero_hist_stats_df.columns[0], axis=1, inplace=True)
            stats_for_version = hero_hist_stats_df.loc[hero_hist_stats_df['version'] == patch_name]
            if stats_for_version.empty:
                for vers in hero_hist_stats_df['version']:
                    if vers < patch_name:
                        stats_for_version = hero_hist_stats_df.loc[hero_hist_stats_df['version'] == vers]
                        break        
            base_str = float(stats_for_version['base_strength'])
            base_agi = float(stats_for_version['base_agility'])
            base_int = float(stats_for_version['base_intelligence'])
            str_gain = float(stats_for_version['strength_gain'])
            agi_gain = float(stats_for_version['agility_gain'])
            int_gain = float(stats_for_version['intelligence_gain'])
            base_health = float(stats_for_version['base_health'])
            base_health_regeneration = float(stats_for_version['base_health_regeneration'])
            move_speed = float(stats_for_version['move_speed'])

            base_str_list_rad.append(base_str)
            base_agi_list_rad.append(base_agi)
            base_int_list_rad.append(base_int)
            str_gain_list_rad.append(str_gain)
            agi_gain_list_rad.append(agi_gain)
            int_gain_list_rad.append(int_gain)
            base_health_list_rad.append(base_health)
            base_health_reg_list_rad.append(base_health_regeneration)
            move_speed_list_rad.append(move_speed)
        
        except:
            continue
        
    for each_hero_id_dire in heroes_in_dire_list:
        try:
            specific_hero_dict = heroes[str(each_hero_id_dire)]    
            hero_name = specific_hero_dict['localized_name']
            hero_hist_stats_df = pd.read_csv(hero_historical_stats_dir + hero_name + "_historical_stats.csv")
            hero_hist_stats_df.drop(hero_hist_stats_df.columns[0], axis=1, inplace=True)
            stats_for_version = hero_hist_stats_df.loc[hero_hist_stats_df['version'] == patch_name]
            if stats_for_version.empty:
                for vers in hero_hist_stats_df['version']:
                    if vers < patch_name:
                        stats_for_version = hero_hist_stats_df.loc[hero_hist_stats_df['version'] == vers]
                        break
            base_str = float(stats_for_version['base_strength'])
            base_agi = float(stats_for_version['base_agility'])
            base_int = float(stats_for_version['base_intelligence'])
            str_gain = float(stats_for_version['strength_gain'])
            agi_gain = float(stats_for_version['agility_gain'])
            int_gain = float(stats_for_version['intelligence_gain'])
            base_health = float(stats_for_version['base_health'])
            base_health_regeneration = float(stats_for_version['base_health_regeneration'])
            move_speed = float(stats_for_version['move_speed'])

            base_str_list_dire.append(base_str)
            base_agi_list_dire.append(base_agi)
            base_int_list_dire.append(base_int)
            str_gain_list_dire.append(str_gain)
            agi_gain_list_dire.append(agi_gain)
            int_gain_list_dire.append(int_gain)
            base_health_list_dire.append(base_health)
            base_health_reg_list_dire.append(base_health_regeneration)
            move_speed_list_dire.append(move_speed)
        
        except:
            continue
        
        
    # get mean and median of each stat
    base_str_mean_rad = np.nanmean(base_str_list_rad)
    base_agi_mean_rad = np.nanmean(base_agi_list_rad)
    base_int_mean_rad = np.nanmean(base_int_list_rad)
    str_gain_mean_rad = np.nanmean(str_gain_list_rad)
    agi_gain_mean_rad = np.nanmean(agi_gain_list_rad)
    int_gain_mean_rad = np.nanmean(int_gain_list_rad)
    base_health_mean_rad = np.nanmean(base_health_list_rad)
    base_health_reg_mean_rad = np.nanmean(base_health_reg_list_rad)
    move_speed_mean_rad = np.nanmean(move_speed_list_rad)
    
    base_str_median_rad = np.nanmedian(base_str_list_rad)
    base_agi_median_rad = np.nanmedian(base_agi_list_rad)
    base_int_median_rad = np.nanmedian(base_int_list_rad)
    str_gain_median_rad = np.nanmedian(str_gain_list_rad)
    agi_gain_median_rad = np.nanmedian(agi_gain_list_rad)
    int_gain_median_rad = np.nanmedian(int_gain_list_rad)
    base_health_median_rad = np.nanmedian(base_health_list_rad)
    base_health_reg_median_rad = np.nanmedian(base_health_reg_list_rad)
    move_speed_median_rad = np.nanmedian(move_speed_list_rad)
    
    base_str_mean_dire = np.nanmean(base_str_list_dire)
    base_agi_mean_dire = np.nanmean(base_agi_list_dire)
    base_int_mean_dire = np.nanmean(base_int_list_dire)
    str_gain_mean_dire = np.nanmean(str_gain_list_dire)
    agi_gain_mean_dire = np.nanmean(agi_gain_list_dire)
    int_gain_mean_dire = np.nanmean(int_gain_list_dire)
    base_health_mean_dire = np.nanmean(base_health_list_dire)
    base_health_reg_mean_dire = np.nanmean(base_health_reg_list_dire)
    move_speed_mean_dire = np.nanmean(move_speed_list_dire)
    
    base_str_median_dire = np.nanmedian(base_str_list_dire)
    base_agi_median_dire = np.nanmedian(base_agi_list_dire)
    base_int_median_dire = np.nanmedian(base_int_list_dire)
    str_gain_median_dire = np.nanmedian(str_gain_list_dire)
    agi_gain_median_dire = np.nanmedian(agi_gain_list_dire)
    int_gain_median_dire = np.nanmedian(int_gain_list_dire)
    base_health_median_dire = np.nanmedian(base_health_list_dire)
    base_health_reg_median_dire = np.nanmedian(base_health_reg_list_dire)
    move_speed_median_dire = np.nanmedian(move_speed_list_dire)   
    
    
    # build array with stats for each hero in radiant
    hist_stats_list_rad = list()
    for i in range(5):
        hist_stats_list_rad.append(base_str_list_rad[i])
        hist_stats_list_rad.append(base_agi_list_rad[i])
        hist_stats_list_rad.append(base_int_list_rad[i])
        hist_stats_list_rad.append(str_gain_list_rad[i])
        hist_stats_list_rad.append(agi_gain_list_rad[i])
        hist_stats_list_rad.append(int_gain_list_rad[i])
        hist_stats_list_rad.append(base_health_list_rad[i])
        hist_stats_list_rad.append(base_health_reg_list_rad[i])
        hist_stats_list_rad.append(move_speed_list_rad[i])
    
    # build array with MEAN stats for each hero in radiant
    hist_stats_mean_list_rad = list()
    hist_stats_mean_list_rad = [base_str_mean_rad, base_agi_mean_rad, base_int_mean_rad, str_gain_mean_rad, agi_gain_mean_rad,
                                int_gain_mean_rad, base_health_mean_rad, base_health_reg_mean_rad, move_speed_mean_rad]
    
   
    # build array with MEDIAN stats for each hero in radiant
    hist_stats_median_list_rad = list()
    hist_stats_median_list_rad = [base_str_median_rad, base_agi_median_rad, base_int_median_rad, str_gain_median_rad, agi_gain_median_rad,
                                int_gain_median_rad, base_health_median_rad, base_health_reg_median_rad, move_speed_median_rad]
    
    
    # build array with stats for each hero in dire
    hist_stats_list_dire = list()
    for i in range(5):
        hist_stats_list_dire.append(base_str_list_dire[i])
        hist_stats_list_dire.append(base_agi_list_dire[i])
        hist_stats_list_dire.append(base_int_list_dire[i])
        hist_stats_list_dire.append(str_gain_list_dire[i])
        hist_stats_list_dire.append(agi_gain_list_dire[i])
        hist_stats_list_dire.append(int_gain_list_dire[i])
        hist_stats_list_dire.append(base_health_list_dire[i])
        hist_stats_list_dire.append(base_health_reg_list_dire[i])
        hist_stats_list_dire.append(move_speed_list_dire[i])
    
    # build array with MEAN stats for each hero in dire
    hist_stats_mean_list_dire = list()
    hist_stats_mean_list_dire = [base_str_mean_dire, base_agi_mean_dire, base_int_mean_dire, str_gain_mean_dire, agi_gain_mean_dire,
                                int_gain_mean_dire, base_health_mean_dire, base_health_reg_mean_dire, move_speed_mean_dire]
    
    # build array with MEDIAN stats for each hero in dire
    hist_stats_median_list_dire = list()
    hist_stats_median_list_dire = [base_str_median_dire, base_agi_median_dire, base_int_median_dire, str_gain_median_dire, agi_gain_median_dire,
                                int_gain_median_dire, base_health_median_dire, base_health_reg_median_dire, move_speed_median_dire]


    ####################### HERO WIN RATE - MATCHES PRIOR TO THIS ONE #######################
    
    # retrieve data frame for our match id
    previous_match_info = match_hist_stats[int(match_id)]
    previous_match_info_newdata = match_hist_stats_newdata[int(match_id)]

    win_rate_list_rad = list()
    win_rate_list_dire = list()
    
    win_rate_list_rad = list(previous_match_info['win_rate'][:5])
    win_rate_mean_rad = previous_match_info['win_rate'][5]
    win_rate_median_rad = previous_match_info['win_rate'][6]
    
    win_rate_list_dire = list(previous_match_info['win_rate'][7:12])
    win_rate_mean_dire = previous_match_info['win_rate'][12]
    win_rate_median_dire = previous_match_info['win_rate'][13]
    
    
    ####################### PLAYER WIN RATE - MATCHES PRIOR TO THIS ONE #######################

    # The 'win_rate' column is read as: slice 14:19 -> radiant list, 19 -> mean,
    # 20 -> median; slice 21:26 -> dire list, 26 -> mean, 27 -> median.
    player_win_rate = previous_match_info['win_rate']

    win_rate_player_list_rad = list(player_win_rate[14:19])
    win_rate_player_mean_rad, win_rate_player_median_rad = player_win_rate[19], player_win_rate[20]

    win_rate_player_list_dire = list(player_win_rate[21:26])
    win_rate_player_mean_dire, win_rate_player_median_dire = player_win_rate[26], player_win_rate[27]

    
    ####################### PLAYER-HERO STATS- MATCHES PRIOR TO THIS ONE #######################

    # radiant
    # Every metric taken from the main stats object is read at the same places:
    # slice 28:33 -> per-player list, 33 -> mean, 34 -> median.
    hp_rows_rad, hp_mean_rad, hp_median_rad = slice(28, 33), 33, 34
    # Metrics taken from the "new data" object are read at :5, 5 and 6 instead.
    hp_rows_rad_new, hp_mean_rad_new, hp_median_rad_new = slice(None, 5), 5, 6

    win_rate_hero_player_list_rad = list(previous_match_info['win_rate'][hp_rows_rad])
    win_rate_mean_hero_player_list_rad = previous_match_info['win_rate'][hp_mean_rad]
    win_rate_median_hero_player_list_rad = previous_match_info['win_rate'][hp_median_rad]

    xp_min_hero_player_list_rad = list(previous_match_info['xp_min'][hp_rows_rad])
    xp_min_mean_hero_player_list_rad = previous_match_info['xp_min'][hp_mean_rad]
    xp_min_median_hero_player_list_rad = previous_match_info['xp_min'][hp_median_rad]

    gold_min_hero_player_list_rad = list(previous_match_info['gold_min'][hp_rows_rad])
    gold_min_mean_hero_player_list_rad = previous_match_info['gold_min'][hp_mean_rad]
    gold_min_median_hero_player_list_rad = previous_match_info['gold_min'][hp_median_rad]

    # from the "new data" object
    deaths_min_hero_player_list_rad = list(previous_match_info_newdata['deaths_min'][hp_rows_rad_new])
    deaths_min_mean_hero_player_list_rad = previous_match_info_newdata['deaths_min'][hp_mean_rad_new]
    deaths_min_median_hero_player_list_rad = previous_match_info_newdata['deaths_min'][hp_median_rad_new]

    # from the "new data" object
    health_damage_min_hero_player_list_rad = list(previous_match_info_newdata['health_damage_min'][hp_rows_rad_new])
    health_damage_min_mean_hero_player_list_rad = previous_match_info_newdata['health_damage_min'][hp_mean_rad_new]
    health_damage_min_median_hero_player_list_rad = previous_match_info_newdata['health_damage_min'][hp_median_rad_new]

    kills_min_hero_player_list_rad = list(previous_match_info['kills_min'][hp_rows_rad])
    kills_min_mean_hero_player_list_rad = previous_match_info['kills_min'][hp_mean_rad]
    kills_min_median_hero_player_list_rad = previous_match_info['kills_min'][hp_median_rad]

    assists_min_hero_player_list_rad = list(previous_match_info['assists_min'][hp_rows_rad])
    assists_min_mean_hero_player_list_rad = previous_match_info['assists_min'][hp_mean_rad]
    assists_min_median_hero_player_list_rad = previous_match_info['assists_min'][hp_median_rad]

    damage_min_hero_player_list_rad = list(previous_match_info['damage_min'][hp_rows_rad])
    damage_min_mean_hero_player_list_rad = previous_match_info['damage_min'][hp_mean_rad]
    damage_min_median_hero_player_list_rad = previous_match_info['damage_min'][hp_median_rad]

    healing_min_hero_player_list_rad = list(previous_match_info['healing_min'][hp_rows_rad])
    healing_min_mean_hero_player_list_rad = previous_match_info['healing_min'][hp_mean_rad]
    healing_min_median_hero_player_list_rad = previous_match_info['healing_min'][hp_median_rad]

    
    # dire
    # Every metric taken from the main stats object is read at the same places:
    # slice 34:39 -> per-player list, 39 -> mean, 40 -> median.
    # NOTE(review): the radiant section reads 28:33 / 33 / 34, so index 34 is
    # consumed both as the radiant median and as the first dire value here. If
    # the table stores seven entries per team (as the 0-6 / 7-13 / 14-20 / 21-27
    # reads elsewhere suggest), these should probably be 35:40 / 40 / 41 --
    # confirm against the code that builds match_hist_stats before changing.
    hp_rows_dire, hp_mean_dire, hp_median_dire = slice(34, 39), 39, 40
    # Metrics taken from the "new data" object are read at 7:12, 12 and 13 instead.
    hp_rows_dire_new, hp_mean_dire_new, hp_median_dire_new = slice(7, 12), 12, 13

    win_rate_hero_player_list_dire = list(previous_match_info['win_rate'][hp_rows_dire])
    win_rate_mean_hero_player_list_dire = previous_match_info['win_rate'][hp_mean_dire]
    win_rate_median_hero_player_list_dire = previous_match_info['win_rate'][hp_median_dire]

    xp_min_hero_player_list_dire = list(previous_match_info['xp_min'][hp_rows_dire])
    xp_min_mean_hero_player_list_dire = previous_match_info['xp_min'][hp_mean_dire]
    xp_min_median_hero_player_list_dire = previous_match_info['xp_min'][hp_median_dire]

    gold_min_hero_player_list_dire = list(previous_match_info['gold_min'][hp_rows_dire])
    gold_min_mean_hero_player_list_dire = previous_match_info['gold_min'][hp_mean_dire]
    gold_min_median_hero_player_list_dire = previous_match_info['gold_min'][hp_median_dire]

    # from the "new data" object
    deaths_min_hero_player_list_dire = list(previous_match_info_newdata['deaths_min'][hp_rows_dire_new])
    deaths_min_mean_hero_player_list_dire = previous_match_info_newdata['deaths_min'][hp_mean_dire_new]
    deaths_min_median_hero_player_list_dire = previous_match_info_newdata['deaths_min'][hp_median_dire_new]

    # from the "new data" object
    health_damage_min_hero_player_list_dire = list(previous_match_info_newdata['health_damage_min'][hp_rows_dire_new])
    health_damage_min_mean_hero_player_list_dire = previous_match_info_newdata['health_damage_min'][hp_mean_dire_new]
    health_damage_min_median_hero_player_list_dire = previous_match_info_newdata['health_damage_min'][hp_median_dire_new]

    kills_min_hero_player_list_dire = list(previous_match_info['kills_min'][hp_rows_dire])
    kills_min_mean_hero_player_list_dire = previous_match_info['kills_min'][hp_mean_dire]
    kills_min_median_hero_player_list_dire = previous_match_info['kills_min'][hp_median_dire]

    assists_min_hero_player_list_dire = list(previous_match_info['assists_min'][hp_rows_dire])
    assists_min_mean_hero_player_list_dire = previous_match_info['assists_min'][hp_mean_dire]
    assists_min_median_hero_player_list_dire = previous_match_info['assists_min'][hp_median_dire]

    damage_min_hero_player_list_dire = list(previous_match_info['damage_min'][hp_rows_dire])
    damage_min_mean_hero_player_list_dire = previous_match_info['damage_min'][hp_mean_dire]
    damage_min_median_hero_player_list_dire = previous_match_info['damage_min'][hp_median_dire]

    healing_min_hero_player_list_dire = list(previous_match_info['healing_min'][hp_rows_dire])
    healing_min_mean_hero_player_list_dire = previous_match_info['healing_min'][hp_mean_dire]
    healing_min_median_hero_player_list_dire = previous_match_info['healing_min'][hp_median_dire]
    
    
    #######################  RADIANT FIRST PICK? #######################

    # Determine which side made the first pick of the draft:
    #   1 -> the first-picked hero is played by a radiant player
    #   0 -> it is played by a dire player
    #   NaN -> neither draft source lets us identify the first picker
    # 'picks_bans' is tried first and 'draft_timings' is the fallback; the two
    # objects name their "this entry is a pick" flag differently.
    radiant_first_pick = np.nan
    for draft_key, pick_flag in (('picks_bans', 'is_pick'), ('draft_timings', 'pick')):
        try:
            # hero id of the first draft entry flagged as a pick
            hero_id_is_pick_true = next(
                entry['hero_id'] for entry in match[draft_key] if entry[pick_flag]
            )
            # side of the player who ended up on that hero
            is_player_in_Radiant = next(
                player['isRadiant'] for player in match['players']
                if player['hero_id'] == hero_id_is_pick_true
            )
        except (LookupError, TypeError, StopIteration):
            # Missing key, null / non-iterable draft object, no pick entry, or
            # no player on the picked hero: try the next source. Using next()
            # here (instead of variables assigned inside a loop) guarantees a
            # value left over from a previous match is never reused.
            continue

        radiant_first_pick = 1 if is_player_in_Radiant else 0
        break

        
    
    
    ### Build array with features to save to dataframe

    # The row is one left-to-right concatenation; its order is the order of the
    # values written into features_df, so groups must not be rearranged.
    list_to_add = (
        [match_id]
        # binary hero flags and role arrays, radiant then dire
        + hero_binary_array_rad_use + hero_binary_array_dire_use
        + role_array_list_rad + role_array_list_dire
        # hero attribute stats: values, then mean list, then median list
        + hist_stats_list_rad + hist_stats_mean_list_rad + hist_stats_median_list_rad
        + hist_stats_list_dire + hist_stats_mean_list_dire + hist_stats_median_list_dire
        # hero win rate: list, mean, median
        + win_rate_list_rad + [win_rate_mean_rad, win_rate_median_rad]
        + win_rate_list_dire + [win_rate_mean_dire, win_rate_median_dire]
        # player win rate: list, mean, median
        + win_rate_player_list_rad + [win_rate_player_mean_rad, win_rate_player_median_rad]
        + win_rate_player_list_dire + [win_rate_player_mean_dire, win_rate_player_median_dire]
        # player-hero stats, radiant: list, mean, median per metric
        + win_rate_hero_player_list_rad
        + [win_rate_mean_hero_player_list_rad, win_rate_median_hero_player_list_rad]
        + xp_min_hero_player_list_rad
        + [xp_min_mean_hero_player_list_rad, xp_min_median_hero_player_list_rad]
        + gold_min_hero_player_list_rad
        + [gold_min_mean_hero_player_list_rad, gold_min_median_hero_player_list_rad]
        + deaths_min_hero_player_list_rad
        + [deaths_min_mean_hero_player_list_rad, deaths_min_median_hero_player_list_rad]
        + health_damage_min_hero_player_list_rad
        + [health_damage_min_mean_hero_player_list_rad, health_damage_min_median_hero_player_list_rad]
        + kills_min_hero_player_list_rad
        + [kills_min_mean_hero_player_list_rad, kills_min_median_hero_player_list_rad]
        + assists_min_hero_player_list_rad
        + [assists_min_mean_hero_player_list_rad, assists_min_median_hero_player_list_rad]
        + damage_min_hero_player_list_rad
        + [damage_min_mean_hero_player_list_rad, damage_min_median_hero_player_list_rad]
        + healing_min_hero_player_list_rad
        + [healing_min_mean_hero_player_list_rad, healing_min_median_hero_player_list_rad]
        # player-hero stats, dire: same metric order as radiant
        + win_rate_hero_player_list_dire
        + [win_rate_mean_hero_player_list_dire, win_rate_median_hero_player_list_dire]
        + xp_min_hero_player_list_dire
        + [xp_min_mean_hero_player_list_dire, xp_min_median_hero_player_list_dire]
        + gold_min_hero_player_list_dire
        + [gold_min_mean_hero_player_list_dire, gold_min_median_hero_player_list_dire]
        + deaths_min_hero_player_list_dire
        + [deaths_min_mean_hero_player_list_dire, deaths_min_median_hero_player_list_dire]
        + health_damage_min_hero_player_list_dire
        + [health_damage_min_mean_hero_player_list_dire, health_damage_min_median_hero_player_list_dire]
        + kills_min_hero_player_list_dire
        + [kills_min_mean_hero_player_list_dire, kills_min_median_hero_player_list_dire]
        + assists_min_hero_player_list_dire
        + [assists_min_mean_hero_player_list_dire, assists_min_median_hero_player_list_dire]
        + damage_min_hero_player_list_dire
        + [damage_min_mean_hero_player_list_dire, damage_min_median_hero_player_list_dire]
        + healing_min_hero_player_list_dire
        + [healing_min_mean_hero_player_list_dire, healing_min_median_hero_player_list_dire]
        # draft flag, then the label as the final value
        + [radiant_first_pick, win_label]
    )

    # Store the row at the next free position and advance the row counter.
    features_df.loc[index_add_data] = list_to_add
    index_add_data += 1

# Printed once, after the loop above has finished
print("Finished!")

In [37]:
# Display the dimensions (rows, columns) of the features DataFrame
features_df.shape

(36348, 539)

In [39]:
# Write the assembled feature table to a CSV file under model_features_dir
# (to_csv defaults are used, so the DataFrame index is written as well)
features_csv_path = model_features_dir + 'dota2_regular-new_features.csv'
features_df.to_csv(features_csv_path)