### Data Integration, EDA

In [1]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

def set_options():
    # matplotlib params
    matplotlib.rc_params['figure.figsize'] = [12, 8]
    # seaborn styles
    sns.set_style()

In [2]:
df = pd.concat(
    [pd.read_csv('./data/train_features.csv'),
     pd.read_csv('./data/train_targets.csv')],
    axis=1)
df = df.loc[:,~df.columns.duplicated()]

In [3]:
# Use memory usage reducing code from: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtypes
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df
df = reduce_mem_usage(df)

Memory usage of dataframe is 75.41 MB
Memory usage after optimization is: 15.97 MB
Decreased by 78.8%


In [4]:
df.nunique()

match_id_hash                 39675
game_time                      3417
game_mode                         8
lobby_type                        2
objectives_len                   43
chat_len                        158
r1_hero_id                      115
r1_kills                         33
r1_deaths                        26
r1_assists                       40
r1_denies                        70
r1_gold                       16407
r1_lh                           504
r1_xp                         17368
r1_health                      2803
r1_max_health                   716
r1_max_mana                    1298
r1_level                         25
r1_x                             62
r1_y                             59
r1_stuns                       4597
r1_creeps_stacked                49
r1_camps_stacked                 14
r1_rune_pickups                  41
r1_firstblood_claimed             2
r1_teamfight_participation      787
r1_towers_killed                  9
r1_roshans_killed           

#### Let's dive in the extended dataset - the jsonl file.

##### Make a mapping for hero names

In [123]:
from tqdm import tqdm_notebook
import ujson

# First take a look at how many unique hero ID's we can find
unique_heroes = []
for c in ['r','d']:
    for i in range(1,6):
        unique_heroes += df[f'{c}{i}_hero_id'].unique().tolist()
hero_count = len(set(unique_heroes))

# Loop through the large JSON until we retrieved 115 hero names!
hero_mapping = {}
retrieved = 0
done = False
with open('data/train_matches.jsonl') as raw:
    for game in tqdm_notebook(raw, total=df.shape[0]):
        game = ujson.loads(game)
        for player in game['players']:
            hero_id = player['hero_id']
            if not hero_id in hero_mapping:
                retrieved += 1
                hero_mapping[hero_id] = player['hero_name'].split('_')[-1]
            if retrieved == hero_count:
                done = True
                break
        if done:
            break
hero_mapping      
# game1 = game
# # game1 = ujson.loads(next(open('data/train_matches.jsonl')))
# print(game1.keys())


HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))

{11: 'nevermore',
 78: 'brewmaster',
 14: 'pudge',
 59: 'huskar',
 77: 'lycan',
 12: 'lancer',
 21: 'windrunner',
 60: 'stalker',
 84: 'magi',
 34: 'tinker',
 15: 'razor',
 96: 'centaur',
 27: 'shaman',
 63: 'weaver',
 89: 'siren',
 58: 'enchantress',
 1: 'antimage',
 56: 'clinkz',
 92: 'visage',
 101: 'mage',
 51: 'rattletrap',
 44: 'assassin',
 49: 'knight',
 53: 'furion',
 18: 'sven',
 67: 'spectre',
 47: 'viper',
 40: 'venomancer',
 17: 'spirit',
 99: 'bristleback',
 26: 'lion',
 41: 'void',
 98: 'shredder',
 8: 'juggernaut',
 69: 'bringer',
 86: 'rubick',
 42: 'king',
 104: 'commander',
 65: 'batrider',
 23: 'kunkka',
 22: 'zuus',
 35: 'sniper',
 72: 'gyrocopter',
 57: 'omniknight',
 10: 'morphling',
 81: 'knight',
 119: 'willow',
 48: 'luna',
 68: 'apparition',
 102: 'abaddon',
 71: 'breaker',
 108: 'underlord',
 90: 'light',
 91: 'wisp',
 114: 'king',
 50: 'dazzle',
 74: 'invoker',
 93: 'slark',
 75: 'silencer',
 76: 'destroyer',
 3: 'bane',
 62: 'hunter',
 30: 'doctor',
 80: 'd

In [34]:
game1['players'][0].keys()

dict_keys(['player_slot', 'hero_id', 'hero_name', 'account_id_hash', 'ability_upgrades', 'obs_placed', 'sen_placed', 'creeps_stacked', 'camps_stacked', 'rune_pickups', 'firstblood_claimed', 'teamfight_participation', 'towers_killed', 'roshans_killed', 'observers_placed', 'stuns', 'max_hero_hit', 'times', 'gold_t', 'lh_t', 'dn_t', 'xp_t', 'obs_log', 'sen_log', 'obs_left_log', 'sen_left_log', 'purchase_log', 'kills_log', 'buyback_log', 'runes_log', 'obs', 'sen', 'actions', 'pings', 'purchase', 'gold_reasons', 'xp_reasons', 'killed', 'item_uses', 'ability_uses', 'hero_hits', 'damage', 'damage_taken', 'damage_inflictor', 'runes', 'killed_by', 'kill_streaks', 'multi_kills', 'life_state', 'healing', 'damage_inflictor_received', 'randomed', 'pred_vict', 'gold', 'lh', 'xp', 'x', 'y', 'hero_inventory', 'hero_stash', 'health', 'max_health', 'max_mana', 'level', 'kills', 'deaths', 'assists', 'denies', 'nearby_creep_death_count'])

In [78]:
game1['game_mode']

22