In [5]:
import os
import gzip
import json
import numpy as np
import pandas as pd
import tqdm
pd.set_option('display.max_rows', 100)

## Heroes information and stats

In [6]:
with open("../data/heroes.json", "r") as fp: 
    heroes = json.load(fp)
    
df_heroes = pd.DataFrame(heroes)
df_heroes

Unnamed: 0,id,name,localized_name,primary_attr,attack_type,roles,legs
0,1,npc_dota_hero_antimage,Anti-Mage,agi,Melee,"[Carry, Escape, Nuker]",2
1,2,npc_dota_hero_axe,Axe,str,Melee,"[Initiator, Durable, Disabler, Jungler, Carry]",2
2,3,npc_dota_hero_bane,Bane,int,Ranged,"[Support, Disabler, Nuker, Durable]",4
3,4,npc_dota_hero_bloodseeker,Bloodseeker,agi,Melee,"[Carry, Disabler, Jungler, Nuker, Initiator]",2
4,5,npc_dota_hero_crystal_maiden,Crystal Maiden,int,Ranged,"[Support, Disabler, Nuker, Jungler]",2
...,...,...,...,...,...,...,...
116,123,npc_dota_hero_hoodwink,Hoodwink,agi,Ranged,"[Support, Nuker, Escape, Disabler]",4
117,126,npc_dota_hero_void_spirit,Void Spirit,int,Melee,"[Carry, Escape, Nuker, Disabler]",2
118,128,npc_dota_hero_snapfire,Snapfire,str,Ranged,"[Support, Nuker, Disabler, Escape]",2
119,129,npc_dota_hero_mars,Mars,str,Melee,"[Carry, Initiator, Disabler, Durable]",2


In [7]:
df_heroes_dict = df_heroes[["id", "localized_name"]]
df_heroes_dict

Unnamed: 0,id,localized_name
0,1,Anti-Mage
1,2,Axe
2,3,Bane
3,4,Bloodseeker
4,5,Crystal Maiden
...,...,...
116,123,Hoodwink
117,126,Void Spirit
118,128,Snapfire
119,129,Mars


As seen in the table above, the heroids in dota are not incremental, but have gaps. We have to make a `id to heroid` map and vice versa for lookup.

### Notes:

* We will look from the alliance side
* There are 121 heroes as of 7.30 patch
* Newest hero is Dawnbreaker
* 63 ranged, 58 melee
* 43 int, 40 str, 38 str

## Preprocessing loop

In [8]:
def get_comp_by_training_sample(sample):
    radiant_win = sample[-1]
    radiant = []
    dire = []
    for i, h in enumerate(sample[:-1]):
        if h == 1:
            radiant.append(_get_hero_name_by_id(i))
        elif h == -1:
            dire.append(_get_hero_name_by_id(i))
    print(radiant, dire, radiant_win)

def _get_hero_name_by_id(idx: int):
    return df_heroes[df_heroes["id"].index == idx]["localized_name"].values[0]

In [10]:
id_to_hid = df_heroes[["id"]].to_dict()["id"]
hid_to_id = {v:k for k,v in id_to_hid.items()}

results = []
seen_matches = set()
dup_counter = 0
invalid = 0
# [[1, 0, -1, ... , n_heroes_121, winner], ... ]
count = 0
with gzip.open("../data/raw/matches_5148330922-5148330922.gz", "r") as fp:
    for line in tqdm.tqdm(fp):
        training_sample = np.zeros((122,))
        match = json.loads(line)
        players = match["players"]
        match_id = match["match_id"]
        invalid_index = False
        
        if not "match_id" in match:
            # invalid += 1 
            continue
        
        if len(players) != 10:
            # invalid += 1
            continue
        
        if match_id in seen_matches:
            # dup_counter += 1
            continue
        
        for p in players:
            if p["hero_id"] not in hid_to_id.keys():
                invalid_index = True
                # invalid += 1
                break

            hero_i = hid_to_id[p["hero_id"]]

            is_radiant = True if p["player_slot"] <= 7 else False # DETTE ER RIKTIG!!! 

            if is_radiant: # radiant slot
                training_sample[hero_i] = 1
            else: 
                training_sample[hero_i] = -1
        
        if invalid_index:
            continue

        training_sample[-1] = match["radiant_win"]
        results.append(training_sample)
        seen_matches.add(match_id)

        
# print("# duplicates", dup_counter)
print("seen matches: {}, len samples: {}".format(len(seen_matches), len(results)))
# print("# invalid", invalid)

with open("../data/preprocessed/matches_5148330922-5148330922.npy", "wb") as fp:
    np.save(fp, np.array(results, dtype=np.int8), allow_pickle=True)

905547it [03:40, 4103.05it/s]


seen matches: 905536, len samples: 905536


In [28]:
pp_matches = np.load("../data/matches_5148330922-5148330922.npy")

print(len(pp_matches))

401
