In [1]:
import json
import os
import random
import copy
from zipfile import ZipFile, ZipInfo

import scipy as sp
import scipy.sparse
import numpy as np
import pandas as pd
import tqdm

In [2]:
heroes_path = "../data/heroes.json"

# Read the raw hero table exactly as stored on disk.
with open(heroes_path, 'r') as fp:
    heroes_raw = json.load(fp)

# JSON object keys are always strings; re-key the table by integer id so it can
# be looked up with the integer hero ids used by the rest of this notebook.
heroes = {int(raw_id): hero for raw_id, hero in heroes_raw.items()}

def is_match_valid(match):
    """Return True when ``match`` passes every filter used to build the dataset.

    A match is accepted only if all of the following hold:

    * every player's ``hero_id`` satisfies ``0 < hero_id < 129``;
    * ``game_mode`` is one of 1, 3 or 22 (all pick, random draft and ranked
      matchmaking according to the original note in this notebook);
    * ``duration`` is greater than 600 (matches of 10 minutes or less are dropped);
    * there are exactly 10 players;
    * every player's ``leaver_status`` is 0 or 1.

    Args:
        match: The decoded match record (a mapping with ``players``,
            ``game_mode`` and ``duration`` entries).

    Returns:
        bool: True if the match is usable, False otherwise. Malformed records
        (missing keys, wrong types, ``None``) are reported as invalid instead
        of raising.
    """
    try:
        players = match["players"]

        # Hero ids must fall inside the accepted id range.
        if not all(0 < player["hero_id"] < 129 for player in players):
            return False

        # Keep only the accepted game modes.
        if match["game_mode"] not in {1, 3, 22}:
            return False

        # Drop matches that lasted 10 minutes (600 s) or less.
        if match["duration"] <= 600:
            return False

        # A complete match has exactly 10 players.
        if len(players) != 10:
            return False

        # Drop matches in which any player has a leaver_status other than 0 or 1.
        if not all(player["leaver_status"] in (0, 1) for player in players):
            return False
    except Exception:
        # Any structural problem with the record means it cannot be used.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate, so a long extraction run can still be stopped.
        return False

    return True



def create_feature_index(hero_ids=None):
    """Build the lookup tables between features and column indices.

    Three kinds of feature are generated:

    * ``("hero", hero_id)`` for every hero;
    * ``("pair_opp", (low_id, high_id))`` for every unordered pair of distinct heroes;
    * ``("pair_same", (low_id, high_id))`` for every unordered pair of distinct heroes.

    Features are numbered in a fixed order: all ``hero`` features by ascending
    id, then all ``pair_opp`` features, then all ``pair_same`` features, pairs
    in ascending ``(low_id, high_id)`` order. The numbering therefore depends
    only on the hero ids, not on set iteration order, which can differ between
    interpreter sessions because string hashing is randomised. Matrices saved
    in one session can be decoded in another.

    Args:
        hero_ids: Optional iterable of hero ids. Defaults to the keys of the
            module-level ``heroes`` mapping. Duplicates are ignored.

    Returns:
        tuple: ``(features_to_index, index_to_features)``, two dicts that are
        inverses of each other.
    """
    if hero_ids is None:
        hero_ids = heroes.keys()

    ordered_ids = sorted(set(hero_ids))

    # Every unordered pair of distinct heroes, smaller id first.
    pairs = [
        (low, high)
        for position, low in enumerate(ordered_ids)
        for high in ordered_ids[position + 1:]
    ]

    features = [("hero", hero_id) for hero_id in ordered_ids]
    features += [("pair_opp", pair) for pair in pairs]
    features += [("pair_same", pair) for pair in pairs]

    print("#feature_set:", len(features))

    features_to_index = {feature: index for index, feature in enumerate(features)}
    index_to_features = dict(enumerate(features))

    return features_to_index, index_to_features
    
# Build both lookup tables once, when this cell runs; create_training_example
# reads `features_to_index` as a module-level global.
features_to_index, index_to_features = create_feature_index()

def create_training_example(match, feature_index=None, rng=None):
    """Encode one match as a sparse feature row plus a binary label.

    Players with ``player_slot < 7`` form the first team and all others the
    second team. With probability 0.5 the two teams are swapped and the label
    is inverted.
    NOTE(review): this presumes slots below 7 are the Radiant side, so that the
    label derived from ``radiant_win`` means "first team won" — confirm against
    the match data.

    Encoding (``t0`` is the first team after the optional swap, ``t1`` the other):

    * ``("hero", id)`` is +1 for heroes in ``t0`` and -1 for heroes in ``t1``;
    * ``("pair_same", (low, high))`` is +1 for pairs inside ``t0`` and -1 for
      pairs inside ``t1``;
    * ``("pair_opp", (low, high))`` is +1 when the lower-id hero of the opposing
      pair is in ``t0`` and -1 when it is in ``t1``.

    The opposing-pair sign is what makes the feature informative. A constant
    +1 would take the same value whichever side each hero plays on, and the
    random swap would then make it independent of the label. With the sign,
    swapping the teams negates the whole row and inverts the label.

    Args:
        match: Decoded match record with ``radiant_win`` and ``players``
            (each player providing ``hero_id`` and ``player_slot``).
        feature_index: Optional mapping from feature tuple to column index.
            Defaults to the module-level ``features_to_index``.
        rng: Optional object with a ``random()`` method returning a float in
            [0, 1). Defaults to the ``random`` module. Pass a seeded
            ``random.Random`` for reproducible output.

    Returns:
        tuple: ``(row, winner)`` where ``row`` is a ``1 x n_features`` int8 CSR
        matrix and ``winner`` is 1 if ``t0`` corresponds to the winning side of
        ``radiant_win`` after the optional swap, else 0.

    Raises:
        KeyError: If the match is missing a required field or contains a hero
            (or hero pair) that is absent from the feature index.
    """
    if feature_index is None:
        feature_index = features_to_index
    if rng is None:
        rng = random

    winner = int(match["radiant_win"])

    t0, t1 = [], []
    for player in match["players"]:
        on_first_team = player["player_slot"] < 7
        (t0 if on_first_team else t1).append(player["hero_id"])

    # Random flip, so the label is not tied to which side is listed first.
    if rng.random() > 0.5:
        t0, t1 = t1, t0
        winner = 1 - winner

    # int8 scratch vector: the values are only ever -1, 0 or +1.
    example = np.zeros(len(feature_index), dtype=np.int8)

    for team, marker in ((t0, 1), (t1, -1)):
        for hero_id in team:
            example[feature_index[("hero", hero_id)]] = marker

            # Same-team pairs carry the team's marker.
            for hero_id_same in team:
                if hero_id_same != hero_id:
                    key = tuple(sorted((hero_id, hero_id_same)))
                    example[feature_index[("pair_same", key)]] = marker

    # Opposing pairs: signed by which side holds the lower-id hero of the pair.
    for hero_id in t0:
        for hero_id_opp in t1:
            key = tuple(sorted((hero_id, hero_id_opp)))
            orientation = 1 if hero_id < hero_id_opp else -1
            example[feature_index[("pair_opp", key)]] = orientation

    return sp.sparse.csr_matrix(example, dtype=np.int8), winner


#feature_set: 14161


In [None]:
data_path = "../data/dota_games.zip"
out_path = "../data/game_features.npz"
winners_path = "../data/match_winners.npy"

# Refuse to overwrite previously generated artifacts. An explicit exception is
# used instead of `assert` so the guard still runs under `python -O`, and both
# output files are protected rather than only the feature matrix.
for existing_path in (out_path, winners_path):
    if os.path.isfile(existing_path):
        raise FileExistsError(
            f"{existing_path} already exists; move or delete it before regenerating"
        )

# create_training_example randomly swaps the two teams; seed the generator so
# a rerun over the same archive produces the same dataset.
random.seed(0)

hero_selection = []
match_winner = []

with ZipFile(data_path) as z:
    for item in tqdm.tqdm(z.infolist()):
        if not item.filename.endswith(".json"):
            continue

        # One corrupt or unexpected file must not abort the whole run:
        # skip entries that are not valid JSON or that lack a "result" object.
        try:
            match = json.loads(z.read(item.filename))["result"]
        except (ValueError, KeyError, TypeError):
            continue

        # check if match req is fulfilled
        if not is_match_valid(match):
            continue

        # stack X
        x, y = create_training_example(match)
        hero_selection.append(x)

        # y
        match_winner.append(int(y))

# The archive is no longer needed once every match has been encoded.
if not hero_selection:
    raise ValueError(f"no valid matches found in {data_path}")

sparse_hero_selection = sp.sparse.vstack(hero_selection)

# save
sp.sparse.save_npz(out_path, sparse_hero_selection)
np.save(winners_path, np.array(match_winner))

print("done")

 13%|█▎        | 303144/2338044 [05:06<1214:11:59,  2.15s/it]

In [21]:
# test_match_path = "../data/test_match_5607724594.json"
# out_path = "../data/hero_selection_test.npz"

# training_examples = []
# match_winner = []
# with open(test_match_path, 'r') as fp:
#     match = json.load(fp)["result"]

# # check that the match meets the validity requirements
# if not is_match_valid(match):
#     raise ValueError("test match failed validation")

# x, y = create_training_example(match)
# x, y

(<1x14161 sparse matrix of type '<class 'numpy.int8'>'
 	with 55 stored elements in Compressed Sparse Column format>,
 0)

### Load npz and npy

In [19]:
# Reload the sparse matrix and the label vector from disk and report their shapes.
# NOTE(review): the feature-extraction cell in this notebook writes
# "../data/game_features.npz" and "../data/match_winners.npy"; the paths read
# here are different files — confirm which artifacts are intended.
a = sp.sparse.load_npz("../data/hero_selection.npz")
b = np.load("../data/match_winner.npy")
for loaded in (a, b):
    print(loaded.shape)

(1635712, 128)
(1635712,)


In [108]:
# check if hero_id matches the match info
# NOTE(review): `sparse_training` is not defined in any visible cell — confirm
# where it comes from. This check also presumes that column indices are hero ids.
row, col = sparse_training.nonzero()

for i in col:
    # `heroes` is re-keyed by int when heroes.json is loaded, so look names up
    # with an int key; a str key raises KeyError.
    print(heroes[int(i)])

Mirana
Sand King
Shadow Shaman
Lich
Sniper
Necrophos
Faceless Void
Spectre
Timbersaw
Skywrath Mage
Elder Titan
