In [8]:
import json
import os
from zipfile import ZipFile, ZipInfo

import scipy as sp
import scipy.sparse
import numpy as np
import pandas as pd
import tqdm

In [9]:
n_heroes = 119        # hero count; not referenced by the other visible cells
max_hero_id = 128     # exclusive upper bound applied to hero ids in is_match_valid

# Cleaned hero lookup file; a later cell indexes the loaded mapping with str(column index).
heroes_path = "../data/heroes_clean.json"

with open(heroes_path) as fp:
    heroes = json.loads(fp.read())

In [15]:
def is_match_valid(match, hero_id_limit=None):
    """Return True when ``match`` passes every filter used to build the dataset.

    A match is kept only if all of the following hold:

    * every player's ``hero_id`` satisfies ``0 < hero_id < hero_id_limit``;
    * ``game_mode`` is one of 1, 3 or 22 (all pick, random draft and ranked
      matchmaking, going by the original author's comment -- TODO confirm
      against the API reference);
    * ``duration`` is greater than 600 (10 minutes, assuming the field is in
      seconds -- TODO confirm);
    * there are exactly 10 players;
    * every player's ``leaver_status`` is 0 or 1.

    Parameters
    ----------
    match : dict
        The ``"result"`` payload of one match JSON document.
    hero_id_limit : int, optional
        Exclusive upper bound for hero ids. When omitted, the notebook-level
        ``max_hero_id`` is used.

    Returns
    -------
    bool
        ``True`` if the match passes all filters. ``False`` if it is filtered
        out or if the payload is malformed (for example
        ``{'error': 'Match ID not found'}``); malformed payloads are also
        printed so they can be inspected.
    """
    if hero_id_limit is None:
        hero_id_limit = max_hero_id

    # Only these game modes are kept; everything else is dropped.
    valid_game_modes = {1, 3, 22}

    try:
        players = match["players"]

        # hero ids must fall strictly inside (0, hero_id_limit)
        if not all(0 < p["hero_id"] < hero_id_limit for p in players):
            return False

        if match["game_mode"] not in valid_game_modes:
            return False

        # drop matches with duration of 600 or less
        if match["duration"] <= 600:
            return False

        # a full match has exactly 10 players
        if len(players) != 10:
            return False

        # drop matches where any player has a leaver_status other than 0 or 1
        if not all(p["leaver_status"] in (0, 1) for p in players):
            return False
    except (LookupError, TypeError):
        # Missing keys / wrong payload shape only. A bare ``except`` would
        # also swallow KeyboardInterrupt and hide unrelated programming errors.
        print("Exception: ", match)
        return False

    return True

In [16]:
data_path = "../data/dota_games.zip"
out_path = "../data/hero_selection.npz"
training_examples = []   # one 1 x max_hero_id sparse row per accepted match
match_winner = []        # int(radiant_win) per accepted match, same order as the rows

# Refuse to overwrite an existing export. An explicit raise (rather than
# ``assert``) keeps the guard active even when assertions are disabled.
if os.path.isfile(out_path):
    raise FileExistsError(out_path + " already exists; remove it to rebuild the dataset")

with ZipFile(data_path) as z:
    for item in tqdm.tqdm(z.infolist()):
        if not item.filename.endswith(".json"):
            continue

        raw_match = z.read(item.filename)
        match = json.loads(raw_match)["result"]

        # check if match req is fulfilled
        if not is_match_valid(match):
            continue

        players = match["players"]
        # ``np.float`` was removed from NumPy; ``np.float64`` is the same dtype.
        # The row width matches the hero-id bound enforced by is_match_valid.
        heroes_onehot = np.zeros(max_hero_id, dtype=np.float64)

        for player in players:
            hero_id = player["hero_id"]
            # +1 for player_slot < 7, -1 otherwise. NOTE(review): in the
            # Dota 2 Web API slots below 128 are Radiant, so +1 presumably
            # marks Radiant picks (the old name ``is_dire`` said the
            # opposite) -- confirm against the API reference.
            team_sign = 1 if player["player_slot"] < 7 else -1
            heroes_onehot[hero_id] = team_sign

        # stack X
        training_examples.append(sp.sparse.csr_matrix(heroes_onehot))

        # y
        match_winner.append(int(match["radiant_win"]))

# vstack() on an empty list fails with an unhelpful message; be explicit.
if not training_examples:
    raise ValueError("no valid matches found in " + data_path)

sparse_training = sp.sparse.vstack(training_examples)

# save
sp.sparse.save_npz(out_path, sparse_training)
np.save("../data/match_winner.npy", np.array(match_winner))

print("done")

 11%|█▏        | 268074/2338044 [02:16<16:25, 2099.87it/s] 

Exception:  {'error': 'Match ID not found'}


 31%|███       | 723837/2338044 [06:03<13:13, 2034.10it/s]

Exception:  {'error': 'Match ID not found'}


 32%|███▏      | 750742/2338044 [06:17<13:02, 2027.61it/s]

Exception:  {'error': 'Match ID not found'}


 58%|█████▊    | 1353853/2338044 [11:39<09:11, 1783.04it/s]

Exception:  {'error': 'Match ID not found'}


 65%|██████▍   | 1509508/2338044 [13:01<06:55, 1995.12it/s]

Exception:  {'error': 'Match ID not found'}


 65%|██████▌   | 1520564/2338044 [13:06<06:48, 2001.83it/s]

Exception:  {'error': 'Match ID not found'}


 99%|█████████▊| 2304913/2338044 [19:22<00:16, 2019.80it/s]

Exception:  {'error': 'Match ID not found'}


100%|██████████| 2338044/2338044 [19:39<00:00, 1982.50it/s]


  (0, 9)	-1.0
  (0, 16)	-1.0
  (0, 27)	1.0
  (0, 31)	1.0
  (0, 35)	-1.0
  (0, 36)	1.0
  (0, 41)	1.0
  (0, 67)	-1.0
  (0, 98)	1.0
  (0, 103)	-1.0
  (1, 10)	1.0
  (1, 14)	1.0
  (1, 18)	1.0
  (1, 28)	-1.0
  (1, 40)	1.0
  (1, 63)	-1.0
  (1, 67)	-1.0
  (1, 75)	-1.0
  (1, 97)	1.0
  (1, 100)	-1.0
  (2, 14)	-1.0
  (2, 22)	1.0
  (2, 41)	-1.0
  (2, 49)	1.0
  (2, 63)	-1.0
  :	:
  (1635709, 99)	-1.0
  (1635709, 101)	1.0
  (1635709, 104)	1.0
  (1635709, 119)	1.0
  (1635709, 121)	-1.0
  (1635710, 6)	1.0
  (1635710, 11)	1.0
  (1635710, 14)	-1.0
  (1635710, 26)	-1.0
  (1635710, 31)	1.0
  (1635710, 35)	-1.0
  (1635710, 67)	-1.0
  (1635710, 104)	-1.0
  (1635710, 105)	1.0
  (1635710, 126)	1.0
  (1635711, 7)	-1.0
  (1635711, 10)	-1.0
  (1635711, 26)	1.0
  (1635711, 44)	1.0
  (1635711, 47)	1.0
  (1635711, 54)	-1.0
  (1635711, 64)	-1.0
  (1635711, 71)	1.0
  (1635711, 83)	1.0
  (1635711, 84)	-1.0


### Test code: rebuild the hero lookup and encode a single match

# reconstruct heroes data

# The raw file is read under its own names so that this cell does not replace
# ``heroes`` / ``heroes_path`` (the cleaned id -> name lookup that other cells
# index with ``heroes[str(i)]``) with the raw list and the raw path.
raw_heroes_path = "../data/heroes.json"
with open(raw_heroes_path, 'r') as fp:
    raw_heroes = json.load(fp)

# Map integer hero id -> hero name. JSON object keys are always strings, so
# the ids are written out as strings in the cleaned file.
new_heroes_d = {int(hero["id"]): hero["name"] for hero in raw_heroes}

with open("../data/heroes_clean.json", "w") as fp:
    json.dump(new_heroes_d, fp)

# extract test match
data_path = "../data/dota_games.zip"
test_match_path = "../data/test_match_5607724594.json"

# ZipFile.extract() interprets its second argument as a target *directory*
# and keeps the member's own path underneath it, so it cannot write the member
# to ``test_match_path``. Read the bytes and write them to the intended file.
with ZipFile(data_path) as z:
    raw_match = z.read('dota_games/5607724594.json')

with open(test_match_path, 'wb') as fp:
    fp.write(raw_match)

# Load the file that was just written; ``match`` is the full document,
# including the top-level "result" key.
with open(test_match_path, 'r') as fp:
    match = json.load(fp)

training_examples = []
matches = [match]

for match in matches:
    # Each document wraps the match under "result"; is_match_valid reads
    # "players", "game_mode" and "duration" directly, so it must be given the
    # inner payload rather than the wrapper.
    result = match["result"]

    # check if match req is fulfilled
    if not is_match_valid(result):
        continue

    players = result["players"]
    # ``np.float`` was removed from NumPy; ``np.float64`` is the same dtype.
    heroes_onehot = np.zeros(max_hero_id, dtype=np.float64)

    for player in players:
        hero_id = player["hero_id"]
        # +1 for player_slot < 7, -1 otherwise. NOTE(review): slots below 128
        # are presumably Radiant, so +1 marks Radiant picks -- confirm.
        team_sign = 1 if player["player_slot"] < 7 else -1
        heroes_onehot[hero_id] = team_sign

    # stack X
    training_examples.append(sp.sparse.csr_matrix(heroes_onehot))

# vstack() on an empty list fails with an unhelpful message; be explicit.
if not training_examples:
    raise ValueError("test match was rejected by is_match_valid; nothing to save")

sparse_training = sp.sparse.vstack(training_examples)

# save
sp.sparse.save_npz("../data/test_match.npz", sparse_training)

print(sparse_training)

### Load the saved feature matrix (.npz) and labels (.npy)

In [19]:
# Read back both saved files and print their shapes, one per line.
a = sp.sparse.load_npz("../data/hero_selection.npz")
b = np.load("../data/match_winner.npy")
print(a.shape, b.shape, sep="\n")

(1635712, 128)
(1635712,)


In [108]:
# Sanity check: print the hero name for every nonzero column of
# ``sparse_training`` so it can be compared with the match by eye.
row, col = sparse_training.nonzero()

for hero_col in col:
    # the lookup is keyed by the id rendered as a string
    hero_name = heroes[str(hero_col)]
    print(hero_name)

Mirana
Sand King
Shadow Shaman
Lich
Sniper
Necrophos
Faceless Void
Spectre
Timbersaw
Skywrath Mage
Elder Titan
