In [1]:
import json
import os
import random
import copy
import joblib
from zipfile import ZipFile, ZipInfo

import scipy as sp
import scipy.sparse
import numpy as np
import pandas as pd
import tqdm

In [2]:
def is_match_valid(match):
    """Return True when a raw match record passes every quality filter.

    A match is rejected (returns False) when any of the following holds:

    * a player's ``hero_id`` is not strictly between 0 and 130;
    * ``game_mode`` is not one of ``{1, 3, 22}``;
    * ``duration`` is 600 or less (presumably seconds, i.e. 10 minutes);
    * the match does not list exactly 10 players;
    * any player's ``leaver_status`` is something other than 0 or 1.

    Malformed records (missing keys, wrong types, ...) are printed and
    rejected as well.

    Args:
        match: mapping with ``"players"``, ``"game_mode"`` and ``"duration"``
            entries; every player is a mapping with ``"hero_id"`` and
            ``"leaver_status"``.

    Returns:
        bool: True if the match should be kept, False otherwise.
    """
    max_hero_id = 129 + 1  # hero id range 1-129
    try:
        players = match["players"]

        # every hero id must be inside the known range
        if not all(0 < p["hero_id"] < max_hero_id for p in players):
            return False

        # keep only the allowed game modes (the original note names them as
        # all pick, random draft and ranked matchmaking)
        valid_game_modes = {1, 3, 22}
        if match["game_mode"] not in valid_game_modes:
            return False

        # drop matches that lasted 10 minutes or less
        if match["duration"] <= 600:
            return False

        # a full lobby has exactly 10 players
        if len(players) != 10:
            return False

        # drop matches with leavers (only statuses 0 and 1 are accepted)
        if not all(p["leaver_status"] in (0, 1) for p in players):
            return False
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop a long-running loop
        # instead of being reported as an invalid match.
        print("Exception: ", match)
        return False

    return True

def rid_to_hid(rid):
    """Look up ``rid`` in the inverse of the hid -> rid JSON mapping.

    The mapping is read from ``../data/hid_to_rid_dict.json`` on every call;
    its keys and values are cast to ``int`` and the dictionary is inverted
    (value -> key) before the lookup.

    Presumably "hid" is the original hero id and "rid" a re-indexed id, going
    by the file name -- confirm against the script that writes the file.

    Args:
        rid: integer that appears as a value in the JSON file.

    Returns:
        int: the key whose value equals ``rid``.

    Raises:
        KeyError: if ``rid`` is not a value in the mapping.
    """
    with open("../data/hid_to_rid_dict.json", 'r') as handle:
        hid_to_rid = json.load(handle)

    # Invert the mapping: value (rid) -> key (hid).
    inverse = {}
    for hid, mapped in hid_to_rid.items():
        mapped_id = int(mapped)
        inverse[mapped_id] = int(hid)

    return inverse[rid]

def create_feature_index(heroes_path="../data/hid_to_rid_dict.json"):
    """Build the feature <-> column-index mappings for all hero features.

    Three kinds of features are generated from the hero ids found as keys of
    the JSON file:

    * ``("hero", h)`` for every hero ``h``;
    * ``("pair_opp", (a, b))`` for every unordered pair with ``a < b``;
    * ``("pair_same", (a, b))`` for every unordered pair with ``a < b``.

    Indices are assigned in a fixed order: heroes ascending, then all
    ``pair_opp`` pairs, then all ``pair_same`` pairs (pairs in lexicographic
    order). Numbering the elements of a ``set`` instead would make the layout
    depend on string hashing, which can differ between interpreter sessions,
    so a positional mask fitted in one session would not line up in another.

    NOTE(review): artifacts fitted against an older, hash-order-dependent
    layout cannot be assumed to match this ordering -- refit them or persist
    the mapping next to the model.

    Args:
        heroes_path: path to a JSON object whose keys are hero ids.

    Returns:
        tuple: ``(features_to_index, index_to_features)``, two dicts that are
        inverses of each other.
    """
    with open(heroes_path, 'r') as fp:
        # Only the keys are used; a set collapses ids that are equal as ints.
        hero_ids = sorted({int(k) for k in json.load(fp)})

    # Every unordered pair of distinct heroes, smaller id first.
    pairs = [
        (first, second)
        for pos, first in enumerate(hero_ids)
        for second in hero_ids[pos + 1:]
    ]

    features = [("hero", hero_id) for hero_id in hero_ids]
    features.extend(("pair_opp", pair) for pair in pairs)
    features.extend(("pair_same", pair) for pair in pairs)

    print("#feature_set:", len(features))

    features_to_index = {feature: idx for idx, feature in enumerate(features)}
    index_to_features = dict(enumerate(features))

    return features_to_index, index_to_features

# new rules
# Build the full feature index, then keep only the features selected by a
# previously fitted recursive-feature-elimination object.
features_to_index, index_to_features = create_feature_index()

# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load
# model files that come from a trusted source.
rfe = joblib.load("../models/rfe_logreg_7080.joblib")

# The selection mask is applied by position, so it has to cover exactly the
# features in the index. NOTE(review): this checks the length only; it is
# assumed that the index order matches the one used when `rfe` was fitted.
assert len(rfe.support_) == len(index_to_features), (
    "RFE mask length does not match the feature index"
)

# Column positions (in the full index) of the top-ranked features and the
# features they stand for.
chosen_feats = np.where(rfe.ranking_ == 1)[0]
rules = [index_to_features[x] for x in chosen_feats]

# Re-number the surviving features 0..n-1 in their original relative order.
new_index_to_features = {}
new_i = 0

for k, v in index_to_features.items():
    if rfe.support_[k]:
        new_index_to_features[new_i] = v
        new_i += 1
new_features_to_index = {v: k for (k, v) in new_index_to_features.items()}


#feature_set: 14161


In [3]:
# save new feature dicts
# import pickle
# rule_dicts = {
#     "index_to_features": new_index_to_features,
#     "features_to_index": new_features_to_index
# }

# with open("../data/etc/rules_7080.pkl", "wb") as fp:
#     pickle.dump(rule_dicts, fp)

In [4]:
def rules_to_training_example(rules, t0, t1):
    """Encode one draft as a signed feature vector.

    Each rule contributes ``+1`` when it fires for team ``t0``, ``-1`` when it
    fires for team ``t1`` and ``0`` otherwise:

    * ``("hero", h)``: ``h`` is on the team;
    * ``("pair_same", (a, b))``: both ``a`` and ``b`` are on the team;
    * ``("pair_opp", (a, b))``: ``a`` is on the team and ``b`` is on the
      opposing team.

    Cross-team pairs are tested with plain set membership. Testing a set
    against an array built by ``np.meshgrid`` from Python sets never matches,
    which would leave every ``pair_opp`` column at zero.

    Args:
        rules: dict mapping a feature tuple ``(type, heroes)`` to its column
            index in the output vector.
        t0: iterable of hero ids of the first team.
        t1: iterable of hero ids of the second team.

    Returns:
        numpy.ndarray: float vector of length ``len(rules)`` with entries in
        ``{-1., 0., 1.}``.

    Raises:
        Exception: if a rule has a type other than ``"hero"``,
            ``"pair_same"`` or ``"pair_opp"``.
    """
    t0 = set(t0)
    t1 = set(t1)

    training_example = np.zeros(len(rules))

    for rule, rule_i in rules.items():
        r_type = rule[0]
        r_heroes = rule[1]

        # When a rule matches both sides, the t1 assignment comes last and wins.
        if r_type == "pair_same":
            first, second = r_heroes
            if first in t0 and second in t0:
                training_example[rule_i] = 1.
            if first in t1 and second in t1:
                training_example[rule_i] = -1.

        elif r_type == "pair_opp":
            first, second = r_heroes
            if first in t0 and second in t1:
                training_example[rule_i] = 1.
            if first in t1 and second in t0:
                training_example[rule_i] = -1.

        elif r_type == "hero":
            if r_heroes in t0:
                training_example[rule_i] = 1.
            if r_heroes in t1:
                training_example[rule_i] = -1.
        else:
            raise Exception(f"No rule with type {r_type}")

    return training_example


def create_training_ex_with_rules(match):
    """Turn one match record into a sparse feature row and its label.

    The two teams are swapped with probability ~0.5 (and the label flipped
    accordingly). The draft is encoded with ``rules_to_training_example``
    against the module-level ``new_features_to_index`` mapping.

    Args:
        match: mapping with ``"radiant"`` and ``"dire"`` (hero ids of each
            team) and ``"radiant_win"``.

    Returns:
        tuple: ``(row, winner)`` where ``row`` is a single-row
        ``scipy.sparse.csr_matrix`` of dtype int8 and ``winner`` is 1 when
        the team placed first won, 0 otherwise.
    """
    winner = int(match["radiant_win"])
    t0 = match["radiant"]
    t1 = match["dire"]

    # random flip teams and match winner
    if random.random() > 0.5:
        t0, t1 = t1, t0
        winner = (winner + 1) % 2

    example = rules_to_training_example(new_features_to_index, t0, t1)

    return sp.sparse.csr_matrix(example, dtype=np.int8), winner



In [5]:
# Number of features kept after the RFE selection (size of the reduced index).
len(new_features_to_index)

7080

In [6]:
# Encode every match as one sparse feature row plus a winner label, then
# write both to disk.
training_examples = []
seen_matches = set()  # NOTE(review): not used in this cell
match_winner = []
n_heroes = 118 + 1  # NOTE(review): not used in this cell
draft_out_path = "../data/big4_7080_game_features.npz"
match_winner_out = "../data/big4_7080_match_winners.npy"

# Fail fast: refuse to overwrite an existing feature file before spending
# time on loading and encoding the matches.
assert not os.path.isfile(draft_out_path), (
    f"{draft_out_path} already exists; move or delete it first"
)

# The encoder flips teams at random; seed the generator so a re-run of this
# cell produces the same rows and labels.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# NOTE(review): read_pickle unpickles arbitrary objects -- only use files
# from a trusted source.
with open("../data/dota_games_pickles.pkl", "rb") as fp:
    d_games = pd.read_pickle(fp)

for match in tqdm.tqdm(d_games):
    # stack X
    x, y = create_training_ex_with_rules(match)
    training_examples.append(x)

    # y
    match_winner.append(int(y))

# One row per match.
sparse_training = sp.sparse.vstack(training_examples)

# save
sp.sparse.save_npz(draft_out_path, sparse_training)
np.save(match_winner_out, np.array(match_winner))

print("done")

  3%|▎         | 71547/2162047 [48:45<23:44:52, 24.45it/s]


KeyboardInterrupt: 

In [23]:
# Shape of the stacked feature matrix.
# NOTE(review): the execution count jumps from 6 to 23, and the recorded
# output shows 14161 columns while the reduced index above has 7080 entries.
# This output presumably comes from a different kernel state; re-run on a
# fresh kernel to confirm.
sparse_training.shape

(2162047, 14161)

In [26]:
# Shape after keeping the first 7080 columns.
# NOTE(review): the recorded output reports a single column, which is not
# what this slice would give on the 14161-column matrix reported above.
# Treat the saved output as stale until the cell is re-run.
sparse_training[:,:7080].shape

(2162047, 1)