In [11]:
import json
import os
import random
import copy
import joblib
import tqdm
import pickle
import numpy as np
from zipfile import ZipFile, ZipInfo


In [17]:
def is_match_valid(match):
    # check if hero ids is within range
    max_hero_id = 129 + 1 # hero id range 1-129
    try:
        players = match["players"]
        hero_ids = [True if 0 < x["hero_id"] and x["hero_id"] < max_hero_id else False for x in players]
        if not all(hero_ids):
            return False

        # check matchtype is not all pick, random draft and ranked matchmaking
        valid_game_modes = {1, 3, 22}
        if not match["game_mode"] in valid_game_modes:
            return False

        # check match duration less than 10 minutes
        if match["duration"] <= 600: 
            return False

        # check if match is not 10 players
        if not len(match["players"]) == 10:
            return False

        # check for leavers
        no_dc = [True if x["leaver_status"] in [0, 1] else False for x in players]
        if not all(no_dc):
            return False
    except:
        print("Exception: ", match)
        return False
    
    return True

def rid_to_hid(rid):
    d_heroes_path = "../data/hid_to_rid_dict.json"
    with open(d_heroes_path, 'r') as fp:
        d_heroes = json.load(fp)
        d_heroes = {int(v): int(k) for k, v in d_heroes.items()}    
    
    return d_heroes[rid]

def get_match_info(match) -> dict:
    match_info = {}
    winner = int(match["radiant_win"])
    players = match["players"]
    t0 = []
    t1 = []
    
    for player in players:
        rid = player["hero_id"]
        hid = rid_to_hid(rid)

        is_radiant = False if player["player_slot"] < 7 else True
        
        if is_radiant:
            t0.append(hid)
        else:
            t1.append(hid)
    
    match_info["radiant"] = t0
    match_info["dire"] = t1
    match_info["radiant_win"] = winner
    
    return match_info

In [18]:
data_path = "../data/dota_games.zip"
seen_matches = set()
training_examples = []

with ZipFile(data_path) as z:
    for item in tqdm.tqdm(z.filelist):
        if not item.filename.endswith(".json"):
            continue
            
        raw_match = z.read(item.filename)
        match = json.loads(raw_match)["result"]
        
        if "error" in match:
            continue
            
        match_id = match["match_id"]
        
        # check if match has been seen
        if match_id in seen_matches:
            continue
        
        seen_matches.add(match_id)
        
        # check if match req is fulfilled
        if not is_match_valid(match):
            continue
            
        match_info = get_match_info(match)
        training_examples.append(match_info)
    
with open("../data/dota_games_pickles.pkl", "wb") as fp:
    pickle.dump(training_examples, fp, pickle.HIGHEST_PROTOCOL)

100%|██████████| 2338044/2338044 [32:50<00:00, 1186.25it/s] 


In [20]:
import pandas as pd



with open ("../data/dota_games_pickles.pkl", "rb") as fp:
    d_games = pd.read_pickle(fp)
    


In [23]:
df = pd.DataFrame(d_games)

In [24]:
df.head(5)

Unnamed: 0,radiant,dire,radiant_win
0,"[33, 101, 15, 65, 8]","[34, 25, 39, 29, 96]",0
1,"[98, 65, 26, 61, 73]","[13, 38, 95, 9, 17]",1
2,"[72, 13, 61, 39, 105]","[104, 21, 83, 89, 47]",0
3,"[6, 104, 66, 118, 42]","[60, 102, 82, 68, 25]",1
4,"[102, 45, 85, 8, 65]","[42, 77, 58, 21, 110]",0
