In [41]:
import json
import os
import copy
from zipfile import ZipFile, ZipInfo

import scipy as sp
import scipy.sparse
import numpy as np
import pandas as pd
import tqdm

In [42]:
# Hero-id constants. `max_hero_id` is the exclusive upper bound that
# is_match_valid() applies to every player's hero_id.
n_heroes, max_hero_id = 119, 128

# JSON lookup that a later cell indexes with the hero id as a string.
heroes_path = "../data/heroes_clean.json"

with open(heroes_path, 'r') as fp:
    heroes = json.loads(fp.read())

In [43]:
def is_match_valid(match):
    """Return True when `match` passes every filter used to build the dataset.

    A match is accepted only if all of the following hold:

    * every player's ``hero_id`` is strictly between 0 and the module-level
      ``max_hero_id``;
    * ``game_mode`` is 1, 3 or 22 (per the original comment: all pick,
      random draft and ranked matchmaking -- TODO confirm the id mapping);
    * ``duration`` is greater than 600 (presumably seconds, i.e. 10 minutes);
    * the match has exactly 10 players;
    * every player's ``leaver_status`` is 0 or 1.

    Parameters
    ----------
    match : dict
        Match record with "players", "game_mode" and "duration" entries;
        each player needs "hero_id" and "leaver_status".

    Returns
    -------
    bool
        True if the match passes all checks. Malformed records (for example
        ``{'error': 'Match ID not found'}``) are printed and yield False.
    """
    valid_game_modes = {1, 3, 22}

    try:
        players = match["players"]

        # every hero id must fall inside the open interval (0, max_hero_id)
        if not all(0 < p["hero_id"] < max_hero_id for p in players):
            return False

        # keep only the accepted game modes
        if match["game_mode"] not in valid_game_modes:
            return False

        # drop matches that lasted 600 or less
        if match["duration"] <= 600:
            return False

        # a full lobby is exactly 10 players
        if len(players) != 10:
            return False

        # any leaver_status other than 0 or 1 disqualifies the match
        if not all(p["leaver_status"] in (0, 1) for p in players):
            return False
    except Exception:
        # Only ordinary errors from malformed records are treated as
        # "invalid"; KeyboardInterrupt / SystemExit must still propagate so
        # a long-running loop can be stopped.
        print("Exception: ", match)
        return False

    return True

In [44]:
# Build the training set from the zipped match dumps: one sparse row per
# accepted match (radiant one-hot block followed by dire one-hot block) plus
# a parallel list of winners.
data_path = "../data/dota_games.zip"
out_path = "../data/hero_selection_onehot.npz"

hero_selection = []  # one 1 x (2 * max_hero_id) sparse row per accepted match
match_winner = []    # int(radiant_win) for the same matches, same order

# Refuse to overwrite an existing export.
assert not os.path.isfile(out_path), out_path + " already exists"

with ZipFile(data_path) as z:
    for item in tqdm.tqdm(z.infolist()):
        if not item.filename.endswith(".json"):
            continue

        raw_match = z.read(item.filename)
        match = json.loads(raw_match)["result"]

        # check if match req is fulfilled
        if not is_match_valid(match):
            continue

        # Signed pick vector: +1 for player_slot < 7 (the side exported as
        # "radiant" below), -1 otherwise, 0 for heroes nobody picked.
        # Deliberately not named `heroes`, so the hero-name lookup loaded
        # at the top of the notebook is not overwritten.
        hero_sides = np.zeros(max_hero_id, dtype=float)
        for player in match["players"]:
            hero_sides[player["hero_id"]] = 1 if player["player_slot"] < 7 else -1

        # Split the signed vector into two independent 0/1 vectors.
        radiant = (hero_sides == 1).astype(float)
        dire = (hero_sides == -1).astype(float)
        heroes_onehot = np.concatenate((radiant, dire))

        # stack X
        hero_selection.append(sp.sparse.csr_matrix(heroes_onehot))

        # y
        match_winner.append(int(match["radiant_win"]))

sparse_hero_selection = sp.sparse.vstack(hero_selection)

# save (the original passed `sparse_training`, a name this cell never defines)
sp.sparse.save_npz(out_path, sparse_hero_selection)
np.save("../data/match_winner_onehot.npy", np.array(match_winner))

print("done")

 11%|█▏        | 267916/2338044 [04:12<34:56, 987.50it/s]  

Exception:  {'error': 'Match ID not found'}


 31%|███       | 723718/2338044 [09:22<17:03, 1577.92it/s]

Exception:  {'error': 'Match ID not found'}


 32%|███▏      | 750730/2338044 [09:35<13:10, 2007.20it/s]

Exception:  {'error': 'Match ID not found'}


 45%|████▍     | 1040809/2338044 [12:09<15:09, 1426.02it/s]


KeyboardInterrupt: 

### test code

# Rebuild the cleaned hero lookup (hero id -> hero name) from the raw list.

heroes_path = "../data/heroes.json"
with open(heroes_path, 'r') as fp:
    heroes = json.loads(fp.read())

# Keys are ints here; JSON object keys are strings, so they are written out
# as strings.
new_heroes_d = {int(entry["id"]): entry["name"] for entry in heroes}

with open("../data/heroes_clean.json", "w") as fp:
    json.dump(new_heroes_d, fp)

# extract test match
data_path = "../data/dota_games.zip"
test_match_path = "../data/test_match_5607724594.json"

# ZipFile.extract() treats its second argument as a target *directory* and
# recreates the member's own path underneath it, so it never produced
# `test_match_path`. Copy the member's bytes to that exact file instead.
with ZipFile(data_path) as z:
    raw_test_match = z.read('dota_games/5607724594.json')

with open(test_match_path, 'wb') as fp:
    fp.write(raw_test_match)

# Read the file back so `match` reflects what is actually on disk.
with open(test_match_path, 'r') as fp:
    match = json.load(fp)

training_examples = []
matches = [match]

for match in matches:
    # Both the validity checks and the player list live under "result";
    # passing the outer wrapper to is_match_valid() rejects every match.
    result = match["result"]

    # check if match req is fulfilled
    if not is_match_valid(result):
        continue

    # Signed encoding: +1 for player_slot < 7, -1 otherwise, 0 elsewhere.
    heroes_onehot = np.zeros(max_hero_id, dtype=float)

    for player in result["players"]:
        hero_id = player["hero_id"]
        heroes_onehot[hero_id] = 1 if player["player_slot"] < 7 else -1

    # stack X
    s_x = sp.sparse.csr_matrix(heroes_onehot)
    training_examples.append(s_x)

# vstack() on an empty list fails with an unhelpful message; say what happened.
if not training_examples:
    raise ValueError("no test match passed is_match_valid; nothing to save")

sparse_training = sp.sparse.vstack(training_examples)

# save
sp.sparse.save_npz("../data/test_match.npz", sparse_training)

print(sparse_training)

In [40]:
# Encode the single extracted test match with the same radiant/dire one-hot
# layout as the full export, and print the result for inspection.
test_match_path = "../data/test_match_5607724594.json"
out_path = "../data/hero_selection_test.npz"

training_examples = []
match_winner = []

with open(test_match_path, 'r') as fp:
    match = json.load(fp)["result"]

# check if match req is fulfilled
if not is_match_valid(match):
    # `raise False` is itself a TypeError; raise a real exception instead.
    raise ValueError("test match was rejected by is_match_valid")

# Signed pick vector: +1 for player_slot < 7 (exported as "radiant" below),
# -1 otherwise. Not named `heroes`, so the hero-name lookup stays intact.
hero_sides = np.zeros(max_hero_id, dtype=float)
for player in match["players"]:
    hero_sides[player["hero_id"]] = 1 if player["player_slot"] < 7 else -1

# create 0/1 vectors for radiant and dire
radiant = (hero_sides == 1).astype(float)
dire = (hero_sides == -1).astype(float)

heroes_onehot = np.concatenate((radiant, dire))
print(heroes_onehot)

# stack X
s_x = sp.sparse.csr_matrix(heroes_onehot)
training_examples.append(s_x)

# y
match_winner.append(int(match["radiant_win"]))

sparse_training = sp.sparse.vstack(training_examples)

print(sparse_training)
# save
# sp.sparse.save_npz(out_path, sparse_training)
# np.save("../data/test_match_winner.npy", np.array(match_winner))


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

### Load npz and npy

In [19]:
# Reload the saved sparse matrix and the saved array, then show both shapes.
a = sp.sparse.load_npz("../data/hero_selection.npz")
b = np.load("../data/match_winner.npy")
print(a.shape, b.shape, sep="\n")

(1635712, 128)
(1635712,)


In [108]:
# check if hero_id matches the match info
row, col = sparse_training.nonzero()
    
for i in col:
    print(heroes[str(i)])

Mirana
Sand King
Shadow Shaman
Lich
Sniper
Necrophos
Faceless Void
Spectre
Timbersaw
Skywrath Mage
Elder Titan
