## Data Integration, EDA

Some imports, settings

In [35]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

def set_options():
    # matplotlib params
    matplotlib.rc_params['figure.figsize'] = [12, 8]
    # seaborn styles
    sns.set_style()

Load the main dataset

In [36]:
df = pd.concat(
    [pd.read_csv('./data/train_features.csv'),
     pd.read_csv('./data/train_targets.csv')],
    axis=1)
df = df.loc[:,~df.columns.duplicated()]

Optimize the memory a bit

In [37]:
# memory usage reduction code from: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtypes
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df
df = reduce_mem_usage(df)

Memory usage of dataframe is 75.41 MB
Memory usage after optimization is: 15.97 MB
Decreased by 78.8%


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39675 entries, 0 to 39674
Columns: 250 entries, match_id_hash to next_roshan_team
dtypes: category(2), float16(31), int16(67), int32(10), int8(140)
memory usage: 16.0 MB


In [4]:
df.nunique()

match_id_hash                 39675
game_time                      3417
game_mode                         8
lobby_type                        2
objectives_len                   43
chat_len                        158
r1_hero_id                      115
r1_kills                         33
r1_deaths                        26
r1_assists                       40
r1_denies                        70
r1_gold                       16407
r1_lh                           504
r1_xp                         17368
r1_health                      2803
r1_max_health                   716
r1_max_mana                    1298
r1_level                         25
r1_x                             62
r1_y                             59
r1_stuns                       4597
r1_creeps_stacked                49
r1_camps_stacked                 14
r1_rune_pickups                  41
r1_firstblood_claimed             2
r1_teamfight_participation      787
r1_towers_killed                  9
r1_roshans_killed           

In [52]:
'The number of missing values is %d. That is roughly %3.2f%% of the values' % (df.isnull().sum().sum(), 
                                                                               df.isnull().sum().sum() / np.prod(df.shape)*100)

'The number of missing values is 14350. That is roughly 0.14% of the values'

#### Let's dive in the extended dataset – the .jsonl file – and other external data to build a dataset of heroes.

Make a mapping for hero names to hero id – inside initial dataset

In [265]:
from tqdm import tqdm_notebook
from collections import OrderedDict
try:
    import ujson as json
except ModuleNotFoundError:
    import json

# Define utility function to iterate through all heroes in a match
def iter_hero_cols(col_desc='hero_id'):
    for c in ['r','d']:
        for i in range(1,6):
            yield f'{c}{i}_{col_desc}'

# First take a look at how many unique hero ID's we can find
unique_heroes = []
for col in iter_hero_cols():
    unique_heroes += df[col].unique().tolist()
hero_count = len(set(unique_heroes))

# Loop through the large JSON until we retrieved 115 hero names!
hero_to_id = {}
id_to_hero = {}
retrieved = 0
done = False
with open('data/train_matches.jsonl') as raw:
    for game in tqdm_notebook(raw, total=df.shape[0]):
        game = json.loads(game)
        for player in game['players']:
            hero_id = player['hero_id']
            hero_name = player['hero_name'][14:].replace('_',' ')
            if not hero_id in id_to_hero:
                retrieved += 1
                id_to_hero[hero_id] = hero_name
                hero_to_id[hero_name] = hero_id
            if retrieved == hero_count:
                done = True
                break
        if done:
            break
            
# Sort the hero mappings
hero_to_id = OrderedDict((key, hero_to_id[key]) for key in sorted(hero_to_id))
id_to_hero = OrderedDict((key, id_to_hero[key]) for key in sorted(id_to_hero))

# Correct the typos in both mappings
hero_to_id['zeus'] = hero_to_id.get('zuus', 'zeus')
if hero_to_id.get('zuus'): del hero_to_id['zuus']

zuus_key = [hero_id for hero_id in id_to_hero.keys() 
                    if id_to_hero[hero_id] == 'zuus']
if zuus_key: id_to_hero[zuus_key[0]] = 'zeus'
    
print('Done')

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))


Done


Initialize the hero dataset.

In [226]:
hero_df = pd.DataFrame(index=id_to_hero.keys())
hero_df['name'] = id_to_hero.values()
hero_df['name_stripped'] = hero_df['name'].str.replace(' ','')
hero_df['stat'] = None
hero_df['types'] = [[] for _ in range(hero_count)]

The ```./data/hero_classes.json``` file was manually scrapped from https://dota2.gamepedia.com/Role on 26 March 2019. <br>
Merge it with our hero data frame.

In [227]:
# Read the hero classes file
f = open('./data/hero_classes.json')
hero_classes = json.load(f)
f.close()

# Define a function to match names between datasets.
alt_names = {'nevermore': 'shadowfiend',
             'windrunner': 'windranger',
             'necrolyte': 'necrophos',
             'skeletonking': 'wraithking',
             'rattletrap': 'clockwerk',
             'furion': 'naturesprophet',
             'obsidiandestroyer': 'outworlddevourer',
             'wisp': 'io',
             'magnataur': 'magnus',
             'shredder': 'timbersaw'}
def is_match(name, out_name):
    out_name = out_name.replace(' ','')\
                       .replace('-','')\
                       .replace("'",'').lower()
    direct_match = (name == out_name) or \
                    name.startswith(out_name) or \
                    name.endswith(out_name) or \
                    out_name.startswith(name) or \
                    out_name.endswith(name)
    alt_name = alt_names.get(name, False)
    
    return direct_match or (alt_name and is_match(alt_name, 
                                                  out_name))

# Get a list of all unique names of the 'external' dataset
external_names = []
for h_class, h_by_stat in hero_classes.items():
    for h_stat, h_list in h_by_stat.items():
        external_names += h_list
external_names = set(external_names)

# Create a mapping between hero names as found in our initial dataset and hero names from the secondary dataset 
external_name_map = {}
for e_name in external_names:
    for _, row in hero_df[['name', 'name_stripped']].iterrows():
        if is_match(row['name_stripped'], e_name):
            external_name_map[e_name] = row['name']
            break

# Check if we cover all the heroes in the 'internal' dataset
assert len(external_name_map) == hero_count

# Lastly, merge the stats in the 'external' dataset with the stats of the 'internal' dataset
hero_df['types'] = [[] for _ in range(hero_count)] # reinitialize column when before filling it up, as we use append to fill it
for h_class, h_by_stat in hero_classes.items():
    for h_stat, h_list in h_by_stat.items():
        for hero in h_list:
            i_name = external_name_map.get(hero)
            if i_name:
                hero_df.loc[hero_to_id[i_name], 'types'].append(h_class)
                hero_df.loc[hero_to_id[i_name], 'stat'] = h_stat
                
hero_df.sample(10) 

Unnamed: 0,name,name_stripped,stat,types
1,antimage,antimage,agi,"[Carry, Nuker, Escape]"
44,phantom assassin,phantomassassin,agi,"[Carry, Escape]"
63,weaver,weaver,agi,"[Carry, Escape]"
38,beastmaster,beastmaster,str,"[Nuker, Disabler, Durable, Initiator]"
22,zeus,zeus,int,[Nuker]
87,disruptor,disruptor,int,"[Support, Nuker, Disabler, Initiator]"
94,medusa,medusa,agi,"[Carry, Disabler, Durable]"
25,lina,lina,int,"[Carry, Support, Nuker, Disabler]"
53,furion,furion,int,"[Carry, Nuker, Jungler, Escape, Pusher]"
45,pugna,pugna,int,"[Nuker, Pusher]"


We can also derive a new feature, ```num_types``` as the number of roles the hero can take.

In [228]:
hero_df['num_types'] = hero_df['types'].map(set).map(len)
hero_df.sample(10)

Unnamed: 0,name,name_stripped,stat,types,num_types
53,furion,furion,int,"[Carry, Nuker, Jungler, Escape, Pusher]",5
45,pugna,pugna,int,"[Nuker, Pusher]",2
39,queenofpain,queenofpain,int,"[Carry, Nuker, Escape]",3
50,dazzle,dazzle,int,"[Support, Nuker, Disabler]",3
56,clinkz,clinkz,agi,"[Carry, Escape, Pusher]",3
99,bristleback,bristleback,str,"[Carry, Nuker, Durable, Initiator]",4
34,tinker,tinker,int,"[Carry, Nuker, Pusher]",3
2,axe,axe,str,"[Disabler, Jungler, Durable, Initiator]",4
85,undying,undying,str,"[Support, Nuker, Disabler, Durable]",4
101,skywrath mage,skywrathmage,int,"[Support, Nuker, Disabler]",3


Try to also merge with actual hero stat values from https://devilesk.com/dota2/heroes/herodata/

In [229]:
# Load the spreadsheet and preview
hero_stats = pd.read_csv('data/hero_stats.csv')
hero_stats.info()
hero_stats.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 33 columns):
name                         116 non-null object
agi                          116 non-null int64
agi_gain                     116 non-null float64
int                          116 non-null int64
int_gain                     116 non-null float64
str                          116 non-null int64
str_gain                     116 non-null float64
hp                           116 non-null int64
hp_regen                     116 non-null float64
mana                         116 non-null int64
mana_regen                   116 non-null float64
armor                        116 non-null float64
damage_reduction             116 non-null float64
movespeed                    116 non-null float64
turn_rate                    116 non-null float64
mean_base_damage             116 non-null float64
min_base_damage              116 non-null int64
max_base_damage              116 non-null int64
status_res

Unnamed: 0,name,agi,agi_gain,int,int_gain,str,str_gain,hp,hp_regen,mana,...,atk_time,atk_per_sec,atk_point,projectile_speed,physical_effective_health,magical_effective_health,range,vision_day,vision_night,dps
77,Razor,22,1.8,21,1.8,22,2.6,640,1.74,327,...,1.39,0.72,0.3,2000,725.33,853.33,475,1800,800,33.12
7,Bane,23,2.4,23,2.4,23,2.4,660,1.75,351,...,1.38,0.72,0.3,900,819.5,850.65,400,1800,800,43.92
90,Spectre,23,1.8,16,1.9,21,2.3,620,1.72,267,...,1.38,0.72,0.3,900,738.83,826.67,150,1800,800,34.56
58,Morphling,24,3.7,13,1.1,20,2.3,600,1.71,231,...,1.21,0.83,0.5,1300,660.0,800.0,350,1800,800,31.13
22,Dazzle,21,1.7,25,3.4,18,2.3,560,1.69,375,...,1.4,0.71,0.3,1200,630.0,719.68,550,1800,800,35.5


In [230]:
# Make a quick check to see if we can match all heroes based on the previous function
for i_name in hero_df['name_stripped']:
    for e_name in hero_stats['name']:
        if is_match(i_name, e_name):
            break
    else:
        print(f'{i_name} unmatched :(')

All hero names can be matched, great.

In [231]:
# Given hero name, get corresponding index of original dataframe
def get_index(e_name):
    matcher = lambda i_name: is_match(i_name, e_name)
    try:
        return hero_df[hero_df['name_stripped'].apply(matcher)].index.item()
    except ValueError:
        return None
    
# Reindex the stats dataframe for easy merging
hero_stats['other_index'] = hero_stats['name'].map(get_index)
hero_stats = hero_stats.dropna()
hero_stats['other_index'] = hero_stats['other_index'].astype(int)
hero_stats = hero_stats.set_index('other_index').sort_index()

Finally, merge all the hero information together.

In [250]:
heroes = hero_df.join(hero_stats.drop(columns='name'))

In [252]:
heroes.sort_values('dps', ascending=False).head()

Unnamed: 0,name,name_stripped,stat,types,num_types,agi,agi_gain,int,int_gain,str,...,atk_time,atk_per_sec,atk_point,projectile_speed,physical_effective_health,magical_effective_health,range,vision_day,vision_night,dps
83,treant,treant,str,"[Support, Disabler, Durable, Escape, Initiator]",5,15,2.0,20,1.8,25,...,1.65,0.61,0.6,900,717.5,933.33,150,1800,800,55.51
27,shadow shaman,shadowshaman,int,"[Support, Nuker, Disabler, Pusher, Initiator]",5,16,1.6,21,3.0,23,...,1.47,0.68,0.3,900,781.0,853.13,400,1800,800,52.02
12,phantom lancer,phantomlancer,agi,"[Carry, Nuker, Escape, Pusher]",4,29,2.8,19,2.0,19,...,1.32,0.76,0.5,900,720.17,773.33,150,1800,800,47.12
1,antimage,antimage,agi,"[Carry, Nuker, Escape]",3,22,2.8,12,1.8,23,...,1.15,0.87,0.3,0,748.0,880.0,150,1800,800,46.11
38,beastmaster,beastmaster,str,"[Nuker, Disabler, Durable, Initiator]",4,18,1.6,16,1.9,23,...,1.44,0.69,0.3,0,726.0,880.0,150,1800,800,45.54


### Generating some hero statistics from our data

In [267]:
hero_statistics = pd.DataFrame(heroes['name'])
hero_statistics['hero_id'] = hero_statistics['name'].map(hero_to_id.get)
hero_statistics.head()

Unnamed: 0,name,hero_id
1,antimage,1
2,axe,2
3,bane,3
4,bloodseeker,4
5,crystal maiden,5


In [269]:
def get_stat_data(hero_id, stat):
    all_data = pd.Series()
    for col_head in iter_hero_cols(col_desc=''):
        hero_id_col = col_head + 'hero_id'
        hero_filter = (df[hero_id_col] == hero_id)
        hero_stat = col_head + stat
        all_data = pd.concat([all_data, df.loc[hero_filter, hero_stat]],
                             ignore_index=True)
    return all_data

In [271]:
get_stat_data(hero_to_id['sven'], 'gold').describe()  ## pd.Series

count     3293.000000
mean      9193.671121
std       7932.136180
min         26.000000
25%       2894.000000
50%       7166.000000
75%      13435.000000
max      53878.000000
dtype: float64