In [20]:
import json
import os
from zipfile import ZipFile, ZipInfo

import scipy as sp
import scipy.sparse
import numpy as np
import pandas as pd
import tqdm

In [21]:
def is_match_valid(match):
    """Return True when ``match`` passes every filter used to build the dataset.

    A match is accepted only if all of the following hold:

    * every player's ``hero_id`` lies in the range 1-129 (inclusive),
    * ``game_mode`` is one of 1, 3 or 22,
    * ``duration`` is strictly greater than 600,
    * the match has exactly 10 players,
    * every player's ``leaver_status`` is 0 or 1.

    Args:
        match: Mapping with the keys ``"players"``, ``"game_mode"`` and
            ``"duration"``; each entry of ``"players"`` is a mapping with
            ``"hero_id"`` and ``"leaver_status"``.

    Returns:
        bool: True if the match passes all filters. False if any filter
        fails or if the record is malformed (missing key, wrong type, ...);
        in the malformed case the offending record is printed.
    """
    max_hero_id = 129 + 1  # exclusive upper bound: hero id range is 1-129
    try:
        players = match["players"]

        # Every hero id must be within the known range.
        if not all(0 < p["hero_id"] < max_hero_id for p in players):
            return False

        # Only keep the accepted game modes (presumably all pick, random
        # draft and ranked matchmaking -- confirm against the API docs).
        valid_game_modes = {1, 3, 22}
        if match["game_mode"] not in valid_game_modes:
            return False

        # Reject matches of 600 or less (10 minutes if the unit is seconds).
        if match["duration"] <= 600:
            return False

        # A complete match has exactly 10 players.
        if len(players) != 10:
            return False

        # Reject matches where any player has a leaver status other than 0/1.
        if not all(p["leaver_status"] in (0, 1) for p in players):
            return False
    except Exception:
        # Malformed record: report it and treat it as invalid. Deliberately
        # not a bare ``except`` so that KeyboardInterrupt / SystemExit still
        # propagate and a long-running loop can be interrupted.
        print("Exception: ", match)
        return False

    return True

In [22]:
# Cache of parsed mapping files, keyed by path, so the JSON file is read and
# inverted only once instead of on every lookup.
_RID_TO_HID_CACHE = {}


def rid_to_hid(rid):
    """Translate the hero id ``rid`` into its ``hid`` index.

    The mapping is read from ``../data/hid_to_rid_dict.json``, whose keys are
    hids and whose values are rids; it is inverted here so it can be looked
    up by rid. The file is parsed on the first call only and the inverted
    mapping is reused afterwards (changes to the file made after the first
    call are not picked up until this definition is re-run).

    Args:
        rid: Integer hero id to look up.

    Returns:
        int: The hid associated with ``rid``.

    Raises:
        KeyError: If ``rid`` is not present in the mapping.
        OSError: If the mapping file cannot be opened.
    """
    d_heroes_path = "../data/hid_to_rid_dict.json"

    d_heroes = _RID_TO_HID_CACHE.get(d_heroes_path)
    if d_heroes is None:
        with open(d_heroes_path, 'r') as fp:
            hid_to_rid = json.load(fp)
        # Invert hid -> rid into rid -> hid, normalising both sides to int.
        d_heroes = {int(v): int(k) for k, v in hid_to_rid.items()}
        _RID_TO_HID_CACHE[d_heroes_path] = d_heroes

    return d_heroes[rid]

In [23]:
# Build the hero-selection dataset: one sparse row per valid match (X) and
# the corresponding radiant_win flag (y), then persist both to disk.
data_path = "../data/dota_games.zip"
training_examples = []
match_winner = []
seen_matches = set()
n_heroes = 118 + 1
draft_out_path = "../data/ds4_hero_selection.npz"
match_winner_out = "../data/ds4_match_winner.npy"
# Refuse to run if the output already exists so a previous (long) run is not
# silently overwritten.
assert not os.path.isfile(draft_out_path), \
    "output file already exists: " + draft_out_path

with ZipFile(data_path) as z:
    for item in tqdm.tqdm(z.filelist):
        # Only JSON members of the archive are match records.
        if not item.filename.endswith(".json"):
            continue

        raw_match = z.read(item.filename)
        match = json.loads(raw_match)["result"]
        if "error" in match:
            continue

        match_id = match["match_id"]

        # Skip duplicates: each match id contributes at most one example.
        if match_id in seen_matches:
            continue

        seen_matches.add(match_id)

        # Skip matches that do not fulfil the dataset requirements.
        if not is_match_valid(match):
            continue

        players = match["players"]
        # ``np.float`` was removed from NumPy (1.24); ``np.float64`` is the
        # same dtype and works on every NumPy version.
        heroes_onehot = np.zeros(n_heroes, dtype=np.float64)

        for player in players:
            rid = player["hero_id"]
            hid = rid_to_hid(rid)

            # Team encoding: -1 when player_slot < 7, +1 otherwise.
            # NOTE(review): despite its name, ``is_radiant`` is -1 for the low
            # slots (presumably the Radiant side) -- confirm this sign
            # convention is what downstream code expects.
            is_radiant = -1 if player["player_slot"] < 7 else 1
            heroes_onehot[hid] = is_radiant

        # X: store the row as a sparse matrix (only 10 of n_heroes are set).
        s_x = sp.sparse.csr_matrix(heroes_onehot)
        training_examples.append(s_x)

        # y: 1 if radiant won, 0 otherwise.
        match_winner.append(int(match["radiant_win"]))

    # Stack all rows into a single (n_matches, n_heroes) sparse matrix.
    sparse_training = sp.sparse.vstack(training_examples)

    # save
    sp.sparse.save_npz(draft_out_path, sparse_training)
    np.save(match_winner_out, np.array(match_winner))

    print("done")

100%|██████████| 2338044/2338044 [37:52<00:00, 1028.85it/s] 


done
