In [5]:
import json
import os
import random
import copy
import joblib
from zipfile import ZipFile, ZipInfo

import scipy as sp
import scipy.sparse
import numpy as np
import pandas as pd
import tqdm

In [6]:
def is_match_valid(match):
    """Return True when ``match`` passes every filter used to build the dataset.

    A match is kept only if all of the following hold:

      * every player's ``hero_id`` lies in the range 1-129,
      * ``game_mode`` is one of 1, 2, 16 or 22,
      * ``duration`` is greater than 600 (i.e. longer than 10 minutes),
      * there are exactly 10 players,
      * every player's ``leaver_status`` is 0 or 1.

    Args:
        match: dict holding at least the keys ``players``, ``game_mode`` and
            ``duration``; each player is a dict with ``hero_id`` and
            ``leaver_status``.

    Returns:
        bool: True if the match should be used, False otherwise. A match that
        is missing keys or is otherwise malformed is printed and rejected
        instead of raising.
    """
    max_hero_id = 129 + 1  # exclusive upper bound; hero ids run 1-129
    # NOTE(review): the original comment described these as modes to exclude,
    # but the code has always *kept* exactly these modes - confirm the list.
    valid_game_modes = {1, 2, 16, 22}

    try:
        players = match["players"]

        # every hero id must be inside the known range
        if not all(0 < p["hero_id"] < max_hero_id for p in players):
            return False

        # only whitelisted game modes are used
        if match["game_mode"] not in valid_game_modes:
            return False

        # drop matches that lasted 10 minutes or less
        if match["duration"] <= 600:
            return False

        # a full match has exactly 10 players
        if len(players) != 10:
            return False

        # reject matches with leavers (any status other than 0 or 1)
        if not all(p["leaver_status"] in (0, 1) for p in players):
            return False
    except Exception:
        # Catch ordinary errors only (missing keys, wrong types, ...) so that
        # KeyboardInterrupt / SystemExit can still stop a long-running loop.
        print("Exception: ", match)
        return False

    return True

# Inverse mapping (second id -> first id), keyed by the JSON path it was loaded
# from. Re-running this cell resets the cache, which forces a reload.
_RID_TO_HID_CACHE = {}


def rid_to_hid(rid):
    """Translate an id stored as a *value* in the hero JSON into its *key*.

    ``../data/hid_to_rid_dict.json`` is read as a dict; the function looks
    ``rid`` up in the inverse of that dict (values become keys, both converted
    to ``int``). The file is parsed on the first call only and the inverse
    mapping is cached, because this function is called once per player of
    every match.

    Args:
        rid: integer id to translate.

    Returns:
        int: the key of the JSON dict whose value equals ``rid``.

    Raises:
        KeyError: if ``rid`` does not occur among the JSON values.
        OSError: if the JSON file cannot be opened on the first call.
    """
    d_heroes_path = "../data/hid_to_rid_dict.json"

    d_heroes = _RID_TO_HID_CACHE.get(d_heroes_path)
    if d_heroes is None:
        with open(d_heroes_path, 'r') as fp:
            hid_to_rid = json.load(fp)
        d_heroes = {int(v): int(k) for k, v in hid_to_rid.items()}
        # store only after a successful load so a failed read is retried
        _RID_TO_HID_CACHE[d_heroes_path] = d_heroes

    return d_heroes[rid]

In [7]:
def create_feature_index():
    """Build the two-way lookup between draft features and column indices.

    Hero ids are the integer keys of ``../data/hid_to_rid_dict.json``. Three
    kinds of feature are generated:

      * ``("hero", h)`` for every hero id ``h``,
      * ``("pair_opp", (a, b))`` for every pair of distinct heroes, ``a < b``,
      * ``("pair_same", (a, b))`` for every pair of distinct heroes, ``a < b``.

    Indices are assigned in a fixed order (all heroes, then all ``pair_opp``,
    then all ``pair_same``, each sorted by id). Enumerating a ``set`` of these
    tuples instead would make the column order depend on per-process string
    hashing, so a matrix saved by one kernel could not be decoded by another.

    Returns:
        tuple: ``(features_to_index, index_to_features)`` - a dict from
        feature tuple to column index and its inverse.
    """
    heroes_path = "../data/hid_to_rid_dict.json"

    with open(heroes_path, 'r') as fp:
        heroes = json.load(fp)
        heroes = {int(k): int(v) for k, v in heroes.items()}

    hero_ids = sorted(heroes)

    features = [("hero", hero_id) for hero_id in hero_ids]
    for kind in ("pair_opp", "pair_same"):
        for pos, first in enumerate(hero_ids):
            # hero_ids is sorted, so (first, second) is already the sorted key
            for second in hero_ids[pos + 1:]:
                features.append((kind, (first, second)))

    print("#feature_set:", len(features))

    features_to_index = {}
    index_to_features = {}
    for index, feature in enumerate(features):
        features_to_index[feature] = index
        index_to_features[index] = feature

    return features_to_index, index_to_features
    
# Build the lookup tables once at cell execution time; create_training_example
# reads the module-level features_to_index when it encodes a match.
features_to_index, index_to_features = create_feature_index()

def create_training_example(match):
    """Encode one match as a sparse feature row plus a winner label.

    Players are split into two teams. ``t0`` starts as Radiant and ``t1`` as
    Dire; with probability 0.5 the teams are swapped and the label is flipped
    with them, so the label never depends on which side a team played.

    Encoding (columns come from the module-level ``features_to_index``):

      * ``("hero", h)``: +1 if ``h`` is on ``t0``, -1 if on ``t1``,
      * ``("pair_same", (a, b))``: +1 if both are on ``t0``, -1 if both on ``t1``,
      * ``("pair_opp", (a, b))``: 1 if ``a`` and ``b`` are on opposite teams.

    Args:
        match: dict with ``radiant_win`` and ``players``; each player has
            ``hero_id`` and ``player_slot``.

    Returns:
        tuple: ``(row, winner)`` where ``row`` is a 1 x n_features
        ``csr_matrix`` of dtype int8 and ``winner`` is 1 when ``t0`` (the +1
        team) won, else 0.

    Raises:
        KeyError: if a hero id is unknown to ``rid_to_hid`` or a feature is
            missing from ``features_to_index``.
    """
    winner = int(match["radiant_win"])
    t0 = []
    t1 = []

    for player in match["players"]:
        hid = rid_to_hid(player["hero_id"])

        # Per the Steam Web API, the top bit of player_slot is the team bit:
        # Radiant uses slots 0-4, Dire uses 128-132. The previous test
        # (`False if slot < 7 else True`) labelled the sides the wrong way
        # round, so `winner` described the -1 team instead of the +1 team.
        is_radiant = player["player_slot"] < 128

        if is_radiant:
            t0.append(hid)
        else:
            t1.append(hid)

    # random flip teams and match winner
    if random.random() > 0.5:
        t0, t1 = t1, t0
        winner = (winner + 1) % 2

    example = np.zeros(len(features_to_index))

    for team, marker in ((t0, 1), (t1, -1)):
        for hero_id in team:
            example[features_to_index[("hero", hero_id)]] = marker

            # add same team pair (each pair is visited twice; assignment is idempotent)
            for hero_id_same in team:
                if hero_id_same != hero_id:
                    key = tuple(sorted((hero_id, hero_id_same)))
                    example[features_to_index[("pair_same", key)]] = marker

    # add opp pair
    # NOTE(review): the key is sorted and the value is always 1, so this
    # feature does not record which side each hero is on - confirm that a
    # direction-free matchup feature is what the model is meant to see.
    for hero_id in t0:
        for hero_id_opp in t1:
            key = tuple(sorted((hero_id, hero_id_opp)))
            example[features_to_index[("pair_opp", key)]] = 1

    return sp.sparse.csr_matrix(example, dtype=np.int8), winner

#feature_set: 14161


In [9]:
# Build the training set: one sparse feature row and one winner label for each
# valid, previously unseen match in the zipped JSON dump, then save both.
data_path = "../data/dota_games.zip"
training_examples = []
seen_matches = set()
match_winner = []
n_heroes = 118 + 1  # not used in this cell

draft_out_path = "../data/big1_game_features.npz"
match_winner_out = "../data/big1_match_winners.npy"

# Refuse to overwrite a feature matrix written by an earlier run.
assert not os.path.isfile(draft_out_path), draft_out_path + " already exists"

with ZipFile(data_path) as z:
    for item in tqdm.tqdm(z.infolist()):
        if not item.filename.endswith(".json"):
            continue

        raw_match = z.read(item.filename)
        match = json.loads(raw_match)["result"]

        # error responses carry an "error" key instead of match data
        if "error" in match:
            continue

        match_id = match["match_id"]

        # skip matches that occur more than once in the archive
        if match_id in seen_matches:
            continue

        seen_matches.add(match_id)

        # check if match req is fulfilled
        if not is_match_valid(match):
            continue

        # NOTE(review): create_training_ex_with_rules is not defined in any
        # cell visible here (cell numbering jumps from 7 to 9), and the recorded
        # output of this cell ends in "AttributeError: 'list' object has no
        # attribute 'items'". The traceback is truncated, so the failing call
        # cannot be identified from this notebook - confirm the helper exists
        # and runs on a fresh kernel.
        x, y = create_training_ex_with_rules(match)
        training_examples.append(x)
        match_winner.append(int(y))

# The archive is closed before the memory-heavy stacking and the writes.
if not training_examples:
    raise ValueError("no valid matches found in " + data_path)

sparse_training = sp.sparse.vstack(training_examples)

# save
sp.sparse.save_npz(draft_out_path, sparse_training)
np.save(match_winner_out, np.array(match_winner))

print("done")

  0%|          | 1/2338044 [00:00<3:02:48, 213.16it/s]


AttributeError: 'list' object has no attribute 'items'