## Data Integration, EDA

Some imports, settings

In [1]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

def set_options():
    # matplotlib params
    matplotlib.rc_params['figure.figsize'] = [12, 8]
    # seaborn styles
    sns.set_style()

Load the main dataset

In [2]:
df = pd.concat(
    [pd.read_csv('./data/train_features.csv'),
     pd.read_csv('./data/train_targets.csv')],
    axis=1)
df = df.loc[:,~df.columns.duplicated()]

Optimize the memory a bit

In [3]:
# memory usage reduction code from: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtypes
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df
df = reduce_mem_usage(df)

Memory usage of dataframe is 75.41 MB
Memory usage after optimization is: 15.97 MB
Decreased by 78.8%


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39675 entries, 0 to 39674
Columns: 250 entries, match_id_hash to next_roshan_team
dtypes: category(2), float16(31), int16(67), int32(10), int8(140)
memory usage: 16.0 MB


In [5]:
df.nunique()

match_id_hash                 39675
game_time                      3417
game_mode                         8
lobby_type                        2
objectives_len                   43
chat_len                        158
r1_hero_id                      115
r1_kills                         33
r1_deaths                        26
r1_assists                       40
r1_denies                        70
r1_gold                       16407
r1_lh                           504
r1_xp                         17368
r1_health                      2803
r1_max_health                   716
r1_max_mana                    1298
r1_level                         25
r1_x                             62
r1_y                             59
r1_stuns                       4597
r1_creeps_stacked                49
r1_camps_stacked                 14
r1_rune_pickups                  41
r1_firstblood_claimed             2
r1_teamfight_participation      787
r1_towers_killed                  9
r1_roshans_killed           

In [6]:
'The number of missing values is %d. That is roughly %3.2f%% of the values' % (df.isnull().sum().sum(), 
                                                                               df.isnull().sum().sum() / np.prod(df.shape)*100)

'The number of missing values is 14350. That is roughly 0.14% of the values'

#### Let's dive in the extended dataset – the .jsonl file – and other external data to build a dataset of heroes.

Make a mapping for hero names to hero id – inside initial dataset

In [7]:
from tqdm import tqdm_notebook
from collections import OrderedDict
try:
    import ujson as json
except ModuleNotFoundError:
    import json

# Define utility function to iterate through all heroes in a match
def iter_hero_cols(col_desc='hero_id'):
    for c in ['r','d']:
        for i in range(1,6):
            yield f'{c}{i}_{col_desc}'

# First take a look at how many unique hero ID's we can find
unique_heroes = []
for col in iter_hero_cols():
    unique_heroes += df[col].unique().tolist()
hero_count = len(set(unique_heroes))

# Loop through the large JSON until we retrieved 115 hero names!
hero_to_id = {}
id_to_hero = {}
retrieved = 0
done = False
with open('data/train_matches.jsonl') as raw:
    for game in tqdm_notebook(raw, total=df.shape[0]):
        game = json.loads(game)
        for player in game['players']:
            hero_id = player['hero_id']
            hero_name = player['hero_name'][14:].replace('_',' ')
            if not hero_id in id_to_hero:
                retrieved += 1
                id_to_hero[hero_id] = hero_name
                hero_to_id[hero_name] = hero_id
            if retrieved == hero_count:
                done = True
                break
        if done:
            break
            
# Sort the hero mappings
hero_to_id = OrderedDict((key, hero_to_id[key]) for key in sorted(hero_to_id))
id_to_hero = OrderedDict((key, id_to_hero[key]) for key in sorted(id_to_hero))

# Correct the typos in both mappings
hero_to_id['zeus'] = hero_to_id.get('zuus', 'zeus')
if hero_to_id.get('zuus'): del hero_to_id['zuus']

zuus_key = [hero_id for hero_id in id_to_hero.keys() 
                    if id_to_hero[hero_id] == 'zuus']
if zuus_key: id_to_hero[zuus_key[0]] = 'zeus'
    
print('Done')

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))


Done


Initialize the hero dataset.

In [8]:
hero_df = pd.DataFrame(index=id_to_hero.keys())
hero_df['name'] = id_to_hero.values()
hero_df['name_stripped'] = hero_df['name'].str.replace(' ','')
hero_df['stat'] = None
hero_df['types'] = [[] for _ in range(hero_count)]

The ```./data/hero_classes.json``` file was manually scrapped from https://dota2.gamepedia.com/Role on 26 March 2019. <br>
Merge it with our hero data frame.

In [9]:
# Read the hero classes file
f = open('./data/hero_classes.json')
hero_classes = json.load(f)
f.close()

# Define a function to match names between datasets.
alt_names = {'nevermore': 'shadowfiend',
             'windrunner': 'windranger',
             'necrolyte': 'necrophos',
             'skeletonking': 'wraithking',
             'rattletrap': 'clockwerk',
             'furion': 'naturesprophet',
             'obsidiandestroyer': 'outworlddevourer',
             'wisp': 'io',
             'magnataur': 'magnus',
             'shredder': 'timbersaw'}
def is_match(name, out_name):
    out_name = out_name.replace(' ','')\
                       .replace('-','')\
                       .replace("'",'').lower()
    direct_match = (name == out_name) or \
                    name.startswith(out_name) or \
                    name.endswith(out_name) or \
                    out_name.startswith(name) or \
                    out_name.endswith(name)
    alt_name = alt_names.get(name, False)
    
    return direct_match or (alt_name and is_match(alt_name, 
                                                  out_name))

# Get a list of all unique names of the 'external' dataset
external_names = []
for h_class, h_by_stat in hero_classes.items():
    for h_stat, h_list in h_by_stat.items():
        external_names += h_list
external_names = set(external_names)

# Create a mapping between hero names as found in our initial dataset and hero names from the secondary dataset 
external_name_map = {}
for e_name in external_names:
    for _, row in hero_df[['name', 'name_stripped']].iterrows():
        if is_match(row['name_stripped'], e_name):
            external_name_map[e_name] = row['name']
            break

# Check if we cover all the heroes in the 'internal' dataset
assert len(external_name_map) == hero_count

# Lastly, merge the stats in the 'external' dataset with the stats of the 'internal' dataset
hero_df['types'] = [[] for _ in range(hero_count)] # reinitialize column when before filling it up, as we use append to fill it
for h_class, h_by_stat in hero_classes.items():
    for h_stat, h_list in h_by_stat.items():
        for hero in h_list:
            i_name = external_name_map.get(hero)
            if i_name:
                hero_df.loc[hero_to_id[i_name], 'types'].append(h_class)
                hero_df.loc[hero_to_id[i_name], 'stat'] = h_stat
                
hero_df.sample(10) 

Unnamed: 0,name,name_stripped,stat,types
73,alchemist,alchemist,str,"[Carry, Support, Nuker, Disabler, Durable, Ini..."
20,vengefulspirit,vengefulspirit,agi,"[Support, Nuker, Disabler, Escape, Initiator]"
71,spirit breaker,spiritbreaker,str,"[Carry, Disabler, Durable, Escape, Initiator]"
66,chen,chen,int,"[Support, Jungler, Pusher]"
49,dragon knight,dragonknight,str,"[Carry, Nuker, Disabler, Durable, Pusher, Init..."
107,earth spirit,earthspirit,str,"[Nuker, Disabler, Durable, Escape, Initiator]"
103,elder titan,eldertitan,str,"[Nuker, Disabler, Durable, Initiator]"
120,pangolier,pangolier,agi,"[Carry, Nuker, Disabler, Durable, Escape, Init..."
97,magnataur,magnataur,str,"[Nuker, Disabler, Escape, Initiator]"
85,undying,undying,str,"[Support, Nuker, Disabler, Durable]"


We can also derive a new feature, ```num_types``` as the number of roles the hero can take.

In [10]:
hero_df['num_types'] = hero_df['types'].map(set).map(len)
hero_df.sample(10)

Unnamed: 0,name,name_stripped,stat,types,num_types
65,batrider,batrider,int,"[Disabler, Jungler, Escape, Initiator]",4
7,earthshaker,earthshaker,str,"[Support, Nuker, Disabler, Initiator]",4
23,kunkka,kunkka,str,"[Carry, Support, Nuker, Disabler, Durable, Ini...",6
61,broodmother,broodmother,agi,"[Carry, Nuker, Escape, Pusher]",4
62,bounty hunter,bountyhunter,agi,"[Nuker, Escape]",2
21,windrunner,windrunner,int,"[Carry, Support, Nuker, Disabler, Escape]",5
67,spectre,spectre,agi,"[Carry, Durable, Escape]",3
32,riki,riki,agi,"[Carry, Disabler, Escape]",3
85,undying,undying,str,"[Support, Nuker, Disabler, Durable]",4
17,storm spirit,stormspirit,int,"[Carry, Nuker, Disabler, Escape, Initiator]",5


Try to also merge with actual hero stat values from https://devilesk.com/dota2/heroes/herodata/

In [11]:
# Load the spreadsheet and preview
hero_stats = pd.read_csv('data/hero_stats.csv')
hero_stats.info()
hero_stats.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 33 columns):
name                         116 non-null object
agi                          116 non-null int64
agi_gain                     116 non-null float64
int                          116 non-null int64
int_gain                     116 non-null float64
str                          116 non-null int64
str_gain                     116 non-null float64
hp                           116 non-null int64
hp_regen                     116 non-null float64
mana                         116 non-null int64
mana_regen                   116 non-null float64
armor                        116 non-null float64
damage_reduction             116 non-null float64
movespeed                    116 non-null float64
turn_rate                    116 non-null float64
mean_base_damage             116 non-null float64
min_base_damage              116 non-null int64
max_base_damage              116 non-null int64
status_res

Unnamed: 0,name,agi,agi_gain,int,int_gain,str,str_gain,hp,hp_regen,mana,...,atk_time,atk_per_sec,atk_point,projectile_speed,physical_effective_health,magical_effective_health,range,vision_day,vision_night,dps
4,Anti-Mage,22,2.8,12,1.8,23,1.3,660,2.04,219,...,1.15,0.87,0.3,0,748.0,880.0,150,1800,800,46.11
63,Nyx Assassin,19,2.2,18,2.1,19,2.3,580,5.39,291,...,1.43,0.7,0.46,900,700.83,773.33,150,1800,800,33.6
1,Underlord,12,1.3,17,2.6,25,2.9,700,1.77,279,...,1.52,0.66,0.45,900,805.0,933.33,150,1800,800,42.9
80,Sand King,19,2.1,16,1.8,22,2.9,640,1.74,267,...,1.43,0.7,0.53,900,709.33,853.33,150,1800,800,36.4
113,Io,14,1.6,23,1.7,17,2.2,540,1.68,351,...,1.49,0.67,0.15,1200,549.0,720.0,575,1800,800,29.15


In [12]:
# Make a quick check to see if we can match all heroes based on the previous function
for i_name in hero_df['name_stripped']:
    for e_name in hero_stats['name']:
        if is_match(i_name, e_name):
            break
    else:
        print(f'{i_name} unmatched :(')

All hero names can be matched, great.

In [13]:
# Given hero name, get corresponding index of original dataframe
def get_index(e_name):
    matcher = lambda i_name: is_match(i_name, e_name)
    try:
        return hero_df[hero_df['name_stripped'].apply(matcher)].index.item()
    except ValueError:
        return None
    
# Reindex the stats dataframe for easy merging
hero_stats['other_index'] = hero_stats['name'].map(get_index)
hero_stats = hero_stats.dropna()
hero_stats['other_index'] = hero_stats['other_index'].astype(int)
hero_stats = hero_stats.set_index('other_index').sort_index()

Finally, merge all the hero information together.

In [14]:
heroes = hero_df.join(hero_stats.drop(columns='name'))

In [15]:
heroes.sort_values('dps', ascending=False).head()

Unnamed: 0,name,name_stripped,stat,types,num_types,agi,agi_gain,int,int_gain,str,...,atk_time,atk_per_sec,atk_point,projectile_speed,physical_effective_health,magical_effective_health,range,vision_day,vision_night,dps
83,treant,treant,str,"[Support, Disabler, Durable, Escape, Initiator]",5,15,2.0,20,1.8,25,...,1.65,0.61,0.6,900,717.5,933.33,150,1800,800,55.51
27,shadow shaman,shadowshaman,int,"[Support, Nuker, Disabler, Pusher, Initiator]",5,16,1.6,21,3.0,23,...,1.47,0.68,0.3,900,781.0,853.13,400,1800,800,52.02
12,phantom lancer,phantomlancer,agi,"[Carry, Nuker, Escape, Pusher]",4,29,2.8,19,2.0,19,...,1.32,0.76,0.5,900,720.17,773.33,150,1800,800,47.12
1,antimage,antimage,agi,"[Carry, Nuker, Escape]",3,22,2.8,12,1.8,23,...,1.15,0.87,0.3,0,748.0,880.0,150,1800,800,46.11
38,beastmaster,beastmaster,str,"[Nuker, Disabler, Durable, Initiator]",4,18,1.6,16,1.9,23,...,1.44,0.69,0.3,0,726.0,880.0,150,1800,800,45.54


### Generating some hero statistics from our data

In [16]:
hero_statistics = pd.DataFrame(heroes['name'])
hero_statistics['hero_id'] = hero_statistics['name'].map(hero_to_id.get)
hero_statistics.head()

Unnamed: 0,name,hero_id
1,antimage,1
2,axe,2
3,bane,3
4,bloodseeker,4
5,crystal maiden,5


In [17]:
def get_stat_data(hero_id, stat):
    all_data = pd.Series()
    for col_head in iter_hero_cols(col_desc=''):
        hero_id_col = col_head + 'hero_id'
        hero_filter = (df[hero_id_col] == hero_id)
        hero_stat = col_head + stat
        all_data = pd.concat([all_data, df.loc[hero_filter, hero_stat]],
                             ignore_index=True)
    return all_data

In [18]:
get_stat_data(hero_to_id['sven'], 'gold').describe()  ## pd.Series

count     3293.000000
mean      9193.671121
std       7932.136180
min         26.000000
25%       2894.000000
50%       7166.000000
75%      13435.000000
max      53878.000000
dtype: float64

In [19]:
def get_player_features():
    with open('data/train_matches.jsonl') as raw:
        for game in tqdm_notebook(raw, total=df.shape[0]):
            game = json.loads(game)
            return game['players'][0].keys()

sentinel = ["sen"+str(i) for i in range(1,6)]
scourge = ["srg"+str(i) for i in range(1,6)]
        
def construct_frame(features = ['hero_name','kills','deaths','assists','gold_t']):
    clipped_jsonl = []
    with open('data/train_matches.jsonl') as raw:
        for game in tqdm_notebook(raw, total=df.shape[0]):
            gameson = {}
            game = json.loads(game)
            gameson['win'] = game['targets']['radiant_win']
            sentinel_p = []
            scourge_p = []
            for j in range(0,len(game['players'])):
                p = game['players'][j]
                if j<5:
                    sentinel_p.append({k : p[k] for k in features})
                else:
                    scourge_p.append({k : p[k] for k in features})
            for j in range(0,len(sentinel)):
                gameson[scourge[j]] = scourge_p[j]  
                gameson[sentinel[j]] = sentinel_p[j]
            clipped_jsonl.append(gameson)
        return clipped_jsonl

jsonl = construct_frame()

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))




In [24]:
get_player_features()

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))




dict_keys(['player_slot', 'hero_id', 'hero_name', 'account_id_hash', 'ability_upgrades', 'obs_placed', 'sen_placed', 'creeps_stacked', 'camps_stacked', 'rune_pickups', 'firstblood_claimed', 'teamfight_participation', 'towers_killed', 'roshans_killed', 'observers_placed', 'stuns', 'max_hero_hit', 'times', 'gold_t', 'lh_t', 'dn_t', 'xp_t', 'obs_log', 'sen_log', 'obs_left_log', 'sen_left_log', 'purchase_log', 'kills_log', 'buyback_log', 'runes_log', 'obs', 'sen', 'actions', 'pings', 'purchase', 'gold_reasons', 'xp_reasons', 'killed', 'item_uses', 'ability_uses', 'hero_hits', 'damage', 'damage_taken', 'damage_inflictor', 'runes', 'killed_by', 'kill_streaks', 'multi_kills', 'life_state', 'healing', 'damage_inflictor_received', 'randomed', 'pred_vict', 'gold', 'lh', 'xp', 'x', 'y', 'hero_inventory', 'hero_stash', 'health', 'max_health', 'max_mana', 'level', 'kills', 'deaths', 'assists', 'denies', 'nearby_creep_death_count'])

In [25]:
match_df = pd.read_json(json.dumps(jsonl))
match_df.head(10)

Unnamed: 0,sen1,sen2,sen3,sen4,sen5,srg1,srg2,srg3,srg4,srg5,win
0,"{'hero_name': 'npc_dota_hero_nevermore', 'kill...","{'hero_name': 'npc_dota_hero_brewmaster', 'kil...","{'hero_name': 'npc_dota_hero_pudge', 'kills': ...","{'hero_name': 'npc_dota_hero_huskar', 'kills':...","{'hero_name': 'npc_dota_hero_lycan', 'kills': ...","{'hero_name': 'npc_dota_hero_phantom_lancer', ...","{'hero_name': 'npc_dota_hero_windrunner', 'kil...","{'hero_name': 'npc_dota_hero_night_stalker', '...","{'hero_name': 'npc_dota_hero_ogre_magi', 'kill...","{'hero_name': 'npc_dota_hero_tinker', 'kills':...",False
1,"{'hero_name': 'npc_dota_hero_razor', 'kills': ...","{'hero_name': 'npc_dota_hero_centaur', 'kills'...","{'hero_name': 'npc_dota_hero_shadow_shaman', '...","{'hero_name': 'npc_dota_hero_weaver', 'kills':...","{'hero_name': 'npc_dota_hero_naga_siren', 'kil...","{'hero_name': 'npc_dota_hero_enchantress', 'ki...","{'hero_name': 'npc_dota_hero_pudge', 'kills': ...","{'hero_name': 'npc_dota_hero_antimage', 'kills...","{'hero_name': 'npc_dota_hero_clinkz', 'kills':...","{'hero_name': 'npc_dota_hero_visage', 'kills':...",True
2,"{'hero_name': 'npc_dota_hero_skywrath_mage', '...","{'hero_name': 'npc_dota_hero_rattletrap', 'kil...",{'hero_name': 'npc_dota_hero_phantom_assassin'...,"{'hero_name': 'npc_dota_hero_dragon_knight', '...","{'hero_name': 'npc_dota_hero_furion', 'kills':...","{'hero_name': 'npc_dota_hero_sven', 'kills': 0...","{'hero_name': 'npc_dota_hero_spectre', 'kills'...","{'hero_name': 'npc_dota_hero_viper', 'kills': ...","{'hero_name': 'npc_dota_hero_venomancer', 'kil...","{'hero_name': 'npc_dota_hero_storm_spirit', 'k...",True
3,"{'hero_name': 'npc_dota_hero_pudge', 'kills': ...","{'hero_name': 'npc_dota_hero_bristleback', 'ki...","{'hero_name': 'npc_dota_hero_skywrath_mage', '...","{'hero_name': 'npc_dota_hero_lion', 'kills': 1...","{'hero_name': 'npc_dota_hero_faceless_void', '...","{'hero_name': 'npc_dota_hero_sven', 'kills': 0...","{'hero_name': 'npc_dota_hero_shredder', 'kills...","{'hero_name': 'npc_dota_hero_juggernaut', 'kil...","{'hero_name': 'npc_dota_hero_doom_bringer', 'k...","{'hero_name': 'npc_dota_hero_rubick', 'kills':...",True
4,"{'hero_name': 'npc_dota_hero_skeleton_king', '...","{'hero_name': 'npc_dota_hero_doom_bringer', 'k...","{'hero_name': 'npc_dota_hero_shadow_shaman', '...",{'hero_name': 'npc_dota_hero_legion_commander'...,"{'hero_name': 'npc_dota_hero_batrider', 'kills...","{'hero_name': 'npc_dota_hero_kunkka', 'kills':...","{'hero_name': 'npc_dota_hero_zuus', 'kills': 1...","{'hero_name': 'npc_dota_hero_sniper', 'kills':...","{'hero_name': 'npc_dota_hero_gyrocopter', 'kil...","{'hero_name': 'npc_dota_hero_antimage', 'kills...",False
5,"{'hero_name': 'npc_dota_hero_omniknight', 'kil...","{'hero_name': 'npc_dota_hero_morphling', 'kill...","{'hero_name': 'npc_dota_hero_chaos_knight', 'k...","{'hero_name': 'npc_dota_hero_shredder', 'kills...","{'hero_name': 'npc_dota_hero_dark_willow', 'ki...","{'hero_name': 'npc_dota_hero_luna', 'kills': 0...",{'hero_name': 'npc_dota_hero_ancient_apparitio...,"{'hero_name': 'npc_dota_hero_night_stalker', '...","{'hero_name': 'npc_dota_hero_abaddon', 'kills'...","{'hero_name': 'npc_dota_hero_juggernaut', 'kil...",False
6,"{'hero_name': 'npc_dota_hero_dark_willow', 'ki...","{'hero_name': 'npc_dota_hero_spirit_breaker', ...",{'hero_name': 'npc_dota_hero_phantom_assassin'...,"{'hero_name': 'npc_dota_hero_sniper', 'kills':...",{'hero_name': 'npc_dota_hero_abyssal_underlord...,"{'hero_name': 'npc_dota_hero_viper', 'kills': ...","{'hero_name': 'npc_dota_hero_shredder', 'kills...","{'hero_name': 'npc_dota_hero_lion', 'kills': 2...",{'hero_name': 'npc_dota_hero_legion_commander'...,{'hero_name': 'npc_dota_hero_keeper_of_the_lig...,True
7,"{'hero_name': 'npc_dota_hero_phantom_lancer', ...","{'hero_name': 'npc_dota_hero_dragon_knight', '...","{'hero_name': 'npc_dota_hero_wisp', 'kills': 2...","{'hero_name': 'npc_dota_hero_monkey_king', 'ki...","{'hero_name': 'npc_dota_hero_dazzle', 'kills':...","{'hero_name': 'npc_dota_hero_rattletrap', 'kil...","{'hero_name': 'npc_dota_hero_viper', 'kills': ...","{'hero_name': 'npc_dota_hero_luna', 'kills': 2...","{'hero_name': 'npc_dota_hero_dark_willow', 'ki...",{'hero_name': 'npc_dota_hero_ancient_apparitio...,False
8,"{'hero_name': 'npc_dota_hero_invoker', 'kills'...","{'hero_name': 'npc_dota_hero_pudge', 'kills': ...","{'hero_name': 'npc_dota_hero_kunkka', 'kills':...","{'hero_name': 'npc_dota_hero_dragon_knight', '...","{'hero_name': 'npc_dota_hero_slark', 'kills': ...","{'hero_name': 'npc_dota_hero_silencer', 'kills...","{'hero_name': 'npc_dota_hero_centaur', 'kills'...","{'hero_name': 'npc_dota_hero_shadow_shaman', '...","{'hero_name': 'npc_dota_hero_gyrocopter', 'kil...",{'hero_name': 'npc_dota_hero_obsidian_destroye...,False
9,"{'hero_name': 'npc_dota_hero_spirit_breaker', ...","{'hero_name': 'npc_dota_hero_ogre_magi', 'kill...","{'hero_name': 'npc_dota_hero_nevermore', 'kill...","{'hero_name': 'npc_dota_hero_bane', 'kills': 1...","{'hero_name': 'npc_dota_hero_skeleton_king', '...","{'hero_name': 'npc_dota_hero_rattletrap', 'kil...","{'hero_name': 'npc_dota_hero_bounty_hunter', '...","{'hero_name': 'npc_dota_hero_witch_doctor', 'k...","{'hero_name': 'npc_dota_hero_lone_druid', 'kil...","{'hero_name': 'npc_dota_hero_storm_spirit', 'k...",False


Clean hero names, team lists for comparison

In [29]:
def rem_front(x):
    x['hero_name'] = x['hero_name'][x['hero_name'].rfind("_")+1:]

def clean_hero_names():
    match_df.applymap(rem_front)

def create_team_lists():
    match_df['radiant'] = match_df.apply(lambda row: [row[s]['hero_name'] for s in sentinel] ,axis=1)
    match_df['dire'] = match_df.apply(lambda row: [row[s]['hero_name'] for s in scourge] ,axis=1)
    
    match_df['radiant'].apply(lambda l: l.sort())
    match_df['dire'].apply(lambda l: l.sort())

    match_df['radiant'] = match_df['radiant'].apply(lambda l: ";".join(l))
    match_df['dire'] = match_df['dire'].apply(lambda l: ";".join(l))

# clean_hero_names()
# create_team_lists()
match_df.head(10)
# len(match_df)

Unnamed: 0,sen1,sen2,sen3,sen4,sen5,srg1,srg2,srg3,srg4,srg5,win
0,"{'hero_name': 'nevermore', 'kills': 0, 'deaths...","{'hero_name': 'brewmaster', 'kills': 0, 'death...","{'hero_name': 'pudge', 'kills': 0, 'deaths': 1...","{'hero_name': 'huskar', 'kills': 0, 'deaths': ...","{'hero_name': 'lycan', 'kills': 0, 'deaths': 0...","{'hero_name': 'lancer', 'kills': 0, 'deaths': ...","{'hero_name': 'windrunner', 'kills': 0, 'death...","{'hero_name': 'stalker', 'kills': 0, 'deaths':...","{'hero_name': 'magi', 'kills': 1, 'deaths': 0,...","{'hero_name': 'tinker', 'kills': 0, 'deaths': ...",False
1,"{'hero_name': 'razor', 'kills': 7, 'deaths': 2...","{'hero_name': 'centaur', 'kills': 3, 'deaths':...","{'hero_name': 'shaman', 'kills': 1, 'deaths': ...","{'hero_name': 'weaver', 'kills': 4, 'deaths': ...","{'hero_name': 'siren', 'kills': 1, 'deaths': 0...","{'hero_name': 'enchantress', 'kills': 1, 'deat...","{'hero_name': 'pudge', 'kills': 1, 'deaths': 6...","{'hero_name': 'antimage', 'kills': 1, 'deaths'...","{'hero_name': 'clinkz', 'kills': 0, 'deaths': ...","{'hero_name': 'visage', 'kills': 0, 'deaths': ...",True
2,"{'hero_name': 'mage', 'kills': 0, 'deaths': 0,...","{'hero_name': 'rattletrap', 'kills': 0, 'death...","{'hero_name': 'assassin', 'kills': 0, 'deaths'...","{'hero_name': 'knight', 'kills': 0, 'deaths': ...","{'hero_name': 'furion', 'kills': 0, 'deaths': ...","{'hero_name': 'sven', 'kills': 0, 'deaths': 0,...","{'hero_name': 'spectre', 'kills': 0, 'deaths':...","{'hero_name': 'viper', 'kills': 0, 'deaths': 0...","{'hero_name': 'venomancer', 'kills': 0, 'death...","{'hero_name': 'spirit', 'kills': 0, 'deaths': ...",True
3,"{'hero_name': 'pudge', 'kills': 1, 'deaths': 0...","{'hero_name': 'bristleback', 'kills': 1, 'deat...","{'hero_name': 'mage', 'kills': 3, 'deaths': 1,...","{'hero_name': 'lion', 'kills': 1, 'deaths': 1,...","{'hero_name': 'void', 'kills': 0, 'deaths': 0,...","{'hero_name': 'sven', 'kills': 0, 'deaths': 0,...","{'hero_name': 'shredder', 'kills': 1, 'deaths'...","{'hero_name': 'juggernaut', 'kills': 0, 'death...","{'hero_name': 'bringer', 'kills': 0, 'deaths':...","{'hero_name': 'rubick', 'kills': 0, 'deaths': ...",True
4,"{'hero_name': 'king', 'kills': 0, 'deaths': 1,...","{'hero_name': 'bringer', 'kills': 1, 'deaths':...","{'hero_name': 'shaman', 'kills': 0, 'deaths': ...","{'hero_name': 'commander', 'kills': 0, 'deaths...","{'hero_name': 'batrider', 'kills': 1, 'deaths'...","{'hero_name': 'kunkka', 'kills': 1, 'deaths': ...","{'hero_name': 'zuus', 'kills': 1, 'deaths': 0,...","{'hero_name': 'sniper', 'kills': 0, 'deaths': ...","{'hero_name': 'gyrocopter', 'kills': 2, 'death...","{'hero_name': 'antimage', 'kills': 0, 'deaths'...",False
5,"{'hero_name': 'omniknight', 'kills': 0, 'death...","{'hero_name': 'morphling', 'kills': 0, 'deaths...","{'hero_name': 'knight', 'kills': 0, 'deaths': ...","{'hero_name': 'shredder', 'kills': 0, 'deaths'...","{'hero_name': 'willow', 'kills': 0, 'deaths': ...","{'hero_name': 'luna', 'kills': 0, 'deaths': 0,...","{'hero_name': 'apparition', 'kills': 0, 'death...","{'hero_name': 'stalker', 'kills': 0, 'deaths':...","{'hero_name': 'abaddon', 'kills': 0, 'deaths':...","{'hero_name': 'juggernaut', 'kills': 0, 'death...",False
6,"{'hero_name': 'willow', 'kills': 0, 'deaths': ...","{'hero_name': 'breaker', 'kills': 2, 'deaths':...","{'hero_name': 'assassin', 'kills': 2, 'deaths'...","{'hero_name': 'sniper', 'kills': 1, 'deaths': ...","{'hero_name': 'underlord', 'kills': 4, 'deaths...","{'hero_name': 'viper', 'kills': 3, 'deaths': 4...","{'hero_name': 'shredder', 'kills': 2, 'deaths'...","{'hero_name': 'lion', 'kills': 2, 'deaths': 2,...","{'hero_name': 'commander', 'kills': 4, 'deaths...","{'hero_name': 'light', 'kills': 0, 'deaths': 1...",True
7,"{'hero_name': 'lancer', 'kills': 3, 'deaths': ...","{'hero_name': 'knight', 'kills': 0, 'deaths': ...","{'hero_name': 'wisp', 'kills': 2, 'deaths': 1,...","{'hero_name': 'king', 'kills': 1, 'deaths': 1,...","{'hero_name': 'dazzle', 'kills': 0, 'deaths': ...","{'hero_name': 'rattletrap', 'kills': 2, 'death...","{'hero_name': 'viper', 'kills': 2, 'deaths': 2...","{'hero_name': 'luna', 'kills': 2, 'deaths': 0,...","{'hero_name': 'willow', 'kills': 2, 'deaths': ...","{'hero_name': 'apparition', 'kills': 2, 'death...",False
8,"{'hero_name': 'invoker', 'kills': 1, 'deaths':...","{'hero_name': 'pudge', 'kills': 1, 'deaths': 2...","{'hero_name': 'kunkka', 'kills': 0, 'deaths': ...","{'hero_name': 'knight', 'kills': 0, 'deaths': ...","{'hero_name': 'slark', 'kills': 0, 'deaths': 1...","{'hero_name': 'silencer', 'kills': 3, 'deaths'...","{'hero_name': 'centaur', 'kills': 0, 'deaths':...","{'hero_name': 'shaman', 'kills': 1, 'deaths': ...","{'hero_name': 'gyrocopter', 'kills': 0, 'death...","{'hero_name': 'destroyer', 'kills': 3, 'deaths...",False
9,"{'hero_name': 'breaker', 'kills': 1, 'deaths':...","{'hero_name': 'magi', 'kills': 4, 'deaths': 1,...","{'hero_name': 'nevermore', 'kills': 5, 'deaths...","{'hero_name': 'bane', 'kills': 1, 'deaths': 3,...","{'hero_name': 'king', 'kills': 3, 'deaths': 3,...","{'hero_name': 'rattletrap', 'kills': 1, 'death...","{'hero_name': 'hunter', 'kills': 2, 'deaths': ...","{'hero_name': 'doctor', 'kills': 1, 'deaths': ...","{'hero_name': 'druid', 'kills': 6, 'deaths': 3...","{'hero_name': 'spirit', 'kills': 1, 'deaths': ...",False
