In [1]:
import pandas as pd
import numpy as np
import os

from data_processing.GameFeaturesGenerator import GameFeaturesGenerator
from data_processing.utils.download_functions import *

In [228]:
# Write some utility functions
def get_game_data_full(games_data):
    # Iterate through t events of the match (could consist of one or many games) 
    # Look in the ['games'][t]['id'] field to get the game ID
    # Look in the ['games'][t]['state'] field to see if the game is 'completed'
    # Look in the ['games'][t]['teams'] field to get the team IDs
    # Look in the ['games'][t]['teams'][x]['result']['outcome'] field to get the result of the game for each team
    # We technically only need the 'state' to verify completion and 'id' to fetch details of the game, but load in other fields for verification
    match_id = games_data['id']  # ID for the full match
    game_tables = []
    for game in games_data['games']:
        game_state = game['state']
        if game_state == 'completed':
            game_id = game['id']  # ID for the specific games in the match 
            team_ids, team_outcomes = [], []
            for team in game['teams']:
                team_ids.append(team['id'])
                team_outcome = 1 if team['result']['outcome']=='win' else 0
                team_outcomes.append(team_outcome)
            game_tables.append(pd.DataFrame({'match_id': match_id, 'esportsGameId': game_id, 
                                             'team_id_1': team_ids[0], 'outcome_1': team_outcomes[0],
                                             'team_id_2': team_ids[1], 'outcome_2': team_outcomes[1]}, index=[0]))
    return pd.concat(game_tables, ignore_index=True)

def get_game_data_ids(games_data):
    # Only returns the necessary information for lookup in the mapping table
    game_ids = []
    for game in games_data:
        if game['state']=='completed':
            game_ids.append(game['id'])
    return pd.DataFrame({'esportsGameId': game_ids})

In [230]:
os.chdir('esports-data')
os.listdir()

['games',
 'leagues.json',
 'mapping_data.json',
 'players.json',
 'teams.json',
 'tournaments.json']

In [4]:
# Read in mappings data
with open("mapping_data.json", "r") as json_file:
   mappings_data = json.load(json_file)
   
mappings = {
   esports_game["esportsGameId"]: esports_game for esports_game in mappings_data
}

In [323]:
# Read in tournament data
with open("tournaments.json", "r") as json_file:
   tournament_data_all = json.load(json_file)

# Only retrieve LCS tournament data ['lcs_summer_2023']
tournament_data = [x for x in tournament_data_all if x['slug']=='lec_spring_2023'][0]

# Delete tournament data to save memory
del tournament_data_all

In [324]:
# See what rounds are featured in the tournament 
def get_tournament_stages(tournament_data):
    return [[x['name'], len(x['sections'])] for x in tournament_data['stages']]

get_tournament_stages(tournament_data)

[['Regular Season', 1], ['Groups', 2], ['Playoffs', 1]]

In [328]:
tournament_data['stages'][1]['sections'][1]['matches'][1]

{'id': '109919226378791856',
 'type': 'normal',
 'state': 'completed',
 'mode': 'classic',
 'strategy': {'type': 'bestOf', 'count': 3},
 'teams': [{'id': '98767991926151025',
   'side': 'blue',
   'record': {'wins': 2, 'losses': 1, 'ties': 0},
   'result': {'outcome': 'loss', 'gameWins': 1},
   'players': [{'id': '105554439426664062', 'role': 'mid'},
    {'id': '98767975968177297', 'role': 'mid'},
    {'id': '99566406047571736', 'role': 'top'},
    {'id': '105537190986692036', 'role': 'jungle'},
    {'id': '98767975961872793', 'role': 'bottom'},
    {'id': '99322214629661297', 'role': 'support'}]},
  {'id': '101383793574360315',
   'side': 'red',
   'record': {'wins': 1, 'losses': 2, 'ties': 0},
   'result': {'outcome': 'win', 'gameWins': 2},
   'players': [{'id': '100482247959137902', 'role': 'mid'},
    {'id': '105519724699493915', 'role': 'support'},
    {'id': '99566406053904433', 'role': 'jungle'},
    {'id': '103536921420956640', 'role': 'bottom'},
    {'id': '102181576087728793'

In [231]:
# Read in regular season games as training data 
reg_season_games = []
for game_data in tournament_data['stages'][0]['sections'][0]['matches']:
    reg_season_games.append(get_game_data_full(game_data))
reg_season_games = pd.concat(reg_season_games)

# Read in playoff games as testing data 
playoff_games = []
for game_data in tournament_data['stages'][1]['sections'][0]['matches']:
    playoff_games.append(get_game_data_full(game_data))
playoff_games = pd.concat(playoff_games)

In [232]:
reg_season_games.head()

Unnamed: 0,match_id,esportsGameId,team_id_1,outcome_1,team_id_2,outcome_2
0,110303581088069312,110303581088134849,99294153828264740,0,98767991877340524,1
0,110303581088331458,110303581088331459,98767991877340524,1,103461966951059521,0
0,110303581088331460,110303581088331461,98767991877340524,1,99294153824386385,0
0,110303581088331462,110303581088331463,98926509885559666,1,98767991877340524,0
0,110303581088331464,110303581088331465,98767991877340524,1,98926509892121852,0


In [233]:
playoff_games.head()

Unnamed: 0,match_id,esportsGameId,team_id_1,outcome_1,team_id_2,outcome_2
0,110767955468214620,110767955468280157,98926509885559666,0,106972778172351142,1
1,110767955468214620,110767955468411230,98926509885559666,0,106972778172351142,1
2,110767955468214620,110767955468411231,98926509885559666,1,106972778172351142,0
3,110767955468214620,110767955468411232,98926509885559666,0,106972778172351142,1
0,110767955468411234,110767955468411235,103461966951059521,0,98767991860392497,1


In [234]:
directory = "games"
if not os.path.exists(directory):
   os.makedirs(directory)

# Load each game and process them
game_rows = []
for game_id in reg_season_games['esportsGameId']:
    try:
        game_mapping_data = mappings[game_id]
        platform_game_id = game_mapping_data['platformGameId']
        # download_gzip_and_write_to_json(f"{directory}/{platform_game_id}")
        with open(f"games/{platform_game_id}.json", "r") as json_file:
            game_data = json.load(json_file)
        game_features = GameFeaturesGenerator(game_data, game_mapping_data).process_game()
        game_rows.append(game_features)
    except KeyError:
        print(f"Match {game_id} was not found")


Match 110303581088462659 was not found


In [274]:
game_data_featurized = pd.concat(game_rows)
# If the column name contains ['_ind'] then impute the missing values with 0
for col in game_data_featurized.columns:
    if '_ind' in col:
        game_data_featurized[col] = game_data_featurized[col].fillna(0)

game_data_featurized.head()

Unnamed: 0,platformGameId,esportsGameId,team_id,start_time,outcome,first_riftHerald_ind,first_riftHerald_time,num_riftHerald,first_dragon_ind,first_dragon_time,...,first_turret_ind,first_turret_time,num_turret,first_inhibitor_ind,first_inhibitor_time,num_inhibitor,first_kill_ind,first_kill_time,num_kills,game_end_time
0,ESPORTSTMNT02:3207804,110303581088134849,99294153828264740,2023-06-22 22:42:47.452,0,0,,1,0,,...,0,,2,0,,0,0,,3,1612.09
1,ESPORTSTMNT02:3207804,110303581088134849,98767991877340524,2023-06-22 22:42:47.452,1,1,503.584,1,1,503.584,...,1,1041.249,9,1,1041.249,1,1,710.596,12,1612.09
0,ESPORTSTMNT02:3214865,110303581088331459,98767991877340524,2023-06-30 21:08:08.783,1,1,602.508,2,1,602.508,...,1,815.009,11,1,815.009,2,1,185.993,25,1337.233
1,ESPORTSTMNT02:3214865,110303581088331459,103461966951059521,2023-06-30 21:08:08.783,0,0,,0,0,,...,0,,2,0,,0,0,,3,1337.233
0,ESPORTSTMNT01:3373466,110303581088331461,98767991877340524,2023-06-14 21:13:07.907,1,1,428.612,2,1,428.612,...,1,868.25,7,1,868.25,1,0,,9,1604.087


In [275]:
"""
1. Sort the game_rows by team_id and start_time 
2. Create features for each team by averaging stats of the last 5 games that they played (or the most recent statistics if possible, with a "num_prev_games) column
    This yields a row of data where the features are the average of the last 5 games that the team played BEFORE the game of ['platformGameId', 'esportsGameId', 'team_id', 'start_time']
3. Later join to the reg_season_games data for training the model 
4. Use the trained model to predict on the playoff_games data for testing 
"""

non_game_features = ['platformGameId', 'esportsGameId', 'team_id', 'start_time']
game_features = [x for x in game_data_featurized.columns if x not in non_game_features]

all_team_ids = np.unique(game_data_featurized['team_id'])
processed_game_data = []
for team in all_team_ids:
    team_data = game_data_featurized[game_data_featurized['team_id']==team]
    team_data = team_data.sort_values(by=['start_time'])
    team_data['num_prev_games'] = np.arange(len(team_data))
    team_data = team_data.set_index('start_time')
    # First lag by 1 game so that the current game is not included in the average
    team_data_features = team_data[game_features].shift(1).rolling(window=50, min_periods=1).mean()
    team_data[game_features] = team_data_features
    # Drop rows where num_prev_games == 0 since we can't average over 0 games
    team_data = team_data[team_data['num_prev_games']!=0]
    # Add one more column to indicate if the row is the last of the team's games
    team_data['last_game'] = team_data['num_prev_games']==team_data['num_prev_games'].max()
    processed_game_data.append(team_data)


In [276]:
game_data_featurized = pd.concat(processed_game_data)
del processed_game_data

In [277]:
game_data_featurized

Unnamed: 0_level_0,platformGameId,esportsGameId,team_id,outcome,first_riftHerald_ind,first_riftHerald_time,num_riftHerald,first_dragon_ind,first_dragon_time,num_dragon,...,num_turret,first_inhibitor_ind,first_inhibitor_time,num_inhibitor,first_kill_ind,first_kill_time,num_kills,game_end_time,num_prev_games,last_game
start_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-06-16 01:27:24.238,ESPORTSTMNT04:2685321,110303581088331491,103461966951059521,1.000000,1.000000,465.809000,2.000000,1.000000,465.809000,3.000000,...,8.000000,1.000000,899.621000,1.000000,1.000000,200.948000,16.000000,1492.375000,1,False
2023-06-16 22:52:41.066,ESPORTSTMNT01:3378516,110303581088331497,103461966951059521,1.000000,0.500000,465.809000,1.000000,0.500000,465.809000,3.500000,...,8.000000,0.500000,899.621000,1.000000,0.500000,200.948000,16.000000,1690.629000,2,False
2023-06-22 00:52:03.902,ESPORTSTMNT02:3209665,110303581088331475,103461966951059521,0.666667,0.666667,427.855500,1.000000,0.666667,427.855500,3.000000,...,6.666667,0.333333,899.621000,0.666667,0.666667,217.018500,14.000000,1843.170667,3,False
2023-06-23 00:38:49.113,ESPORTSTMNT02:3207834,110303581088331495,103461966951059521,0.750000,0.750000,527.912333,1.250000,0.750000,527.912333,2.750000,...,7.250000,0.500000,949.078000,0.750000,0.750000,327.077333,15.500000,1845.182750,4,False
2023-06-23 23:04:00.293,ESPORTSTMNT02:3211039,110303581088331493,103461966951059521,0.800000,0.600000,527.912333,1.400000,0.600000,527.912333,2.800000,...,7.600000,0.600000,988.545000,0.800000,0.800000,402.995500,15.600000,1935.653400,5,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-07-14 21:59:47.496,ESPORTSTMNT03:3187377,110303581088462651,99294153828264740,0.357143,0.285714,475.252250,0.642857,0.285714,475.252250,2.071429,...,4.714286,0.285714,893.369000,0.642857,0.428571,416.782500,8.357143,1904.102571,14,False
2023-07-20 00:43:48.333,ESPORTSTMNT03:3192430,110303581088462645,99294153828264740,0.400000,0.333333,466.117000,0.733333,0.333333,466.117000,2.133333,...,5.133333,0.333333,904.307000,0.733333,0.466667,391.692000,9.133333,1873.603467,15,False
2023-07-20 22:53:03.586,ESPORTSTMNT03:3192666,110303581088462647,99294153828264740,0.375000,0.375000,465.743500,0.687500,0.375000,465.743500,2.125000,...,5.125000,0.312500,904.307000,0.687500,0.437500,391.692000,9.000000,1916.555250,16,False
2023-07-21 22:52:27.546,ESPORTSTMNT03:3194810,110303581088462639,99294153828264740,0.411765,0.352941,465.743500,0.705882,0.352941,465.743500,2.176471,...,5.352941,0.294118,904.307000,0.764706,0.470588,363.483625,9.411765,1928.149765,17,False


In [278]:
# Join reg_season_games with game_data_featurized based on reg_season_games['esportsGameId'] == game_data_featurized['esportsGameId']
# For the team features, join on reg_season_games['team_id_1'] == game_data_featurized['team_id'] and reg_season_games['team_id_2'] == game_data_featurized['team_id']
# When doing so, rename the features of game_data_featurized to be team_1_feature and team_2_feature
def rename_features(df, features, prefix):
    return df.rename(columns=dict(zip(features, [f'{prefix}_{x}' for x in features])))

def get_last_game(df):
    return df[df['last_game']==True]

reg_season_games.head()

Unnamed: 0,match_id,esportsGameId,team_id_1,outcome_1,team_id_2,outcome_2
0,110303581088069312,110303581088134849,99294153828264740,0,98767991877340524,1
0,110303581088331458,110303581088331459,98767991877340524,1,103461966951059521,0
0,110303581088331460,110303581088331461,98767991877340524,1,99294153824386385,0
0,110303581088331462,110303581088331463,98926509885559666,1,98767991877340524,0
0,110303581088331464,110303581088331465,98767991877340524,1,98926509892121852,0


In [279]:
# First process reg_season_games data by flipping the team_id_1 and team_id_2 and outcome_1 and outcome_2
def swap_columns(df, col1, col2):
    colvals = df[col1].copy()
    df[col1] = df[col2]
    df[col2] = colvals
    return df

reg_season_games_flipped = reg_season_games.copy(deep=True)
reg_season_games_flipped = swap_columns(reg_season_games_flipped, 'team_id_1', 'team_id_2')
reg_season_games_flipped = swap_columns(reg_season_games_flipped, 'outcome_1', 'outcome_2')

reg_season_games_full = pd.concat([reg_season_games, reg_season_games_flipped])
# Join reg_season_games with game_data_featurized based on reg_season_games['esportsGameId', 'team_id_1'] == game_data_featurized['esportsGameId', 'team_id']
reg_season_games_full = reg_season_games_full.merge(rename_features(game_data_featurized, game_features, "team_1"), 
                                               how='left', 
                                               left_on=['esportsGameId', 'team_id_1'], 
                                               right_on=['esportsGameId', 'team_id'])
reg_season_games_full = reg_season_games_full.merge(rename_features(game_data_featurized, game_features, "team_2"), 
                                                    how='left', 
                                                    left_on=['esportsGameId', 'team_id_2'], 
                                                    right_on=['esportsGameId', 'team_id'])
# drop the games where the outcome is NaN, those games are when one team has not had any games yet
reg_season_games_full = reg_season_games_full.dropna(subset=['team_1_outcome', 'team_2_outcome'])

reg_season_games_full.shape

(172, 54)

In [307]:
with open("teams.json", "r") as json_file:
   teams_data = json.load(json_file)
   
# Create a table for the teams data with team_id as one column and team_name as another column
teams_table = pd.DataFrame({'team_id': [x['team_id'] for x in teams_data], 'team_name': [x['name'] for x in teams_data]})

del teams_data

In [310]:
get_last_game(game_data_featurized).merge(teams_table, how='inner', left_on='team_id', right_on='team_id')[['team_name', 'team_id', 'outcome']].sort_values(by=['outcome'], ascending=False)

Unnamed: 0,team_name,team_id,outcome
4,Cloud9,98767991877340524,0.722222
9,Golden Guardians,99294153824386385,0.722222
0,Evil Geniuses LG,103461966951059521,0.6
1,NRG,106972778172351142,0.529412
7,Team Liquid Honda,98926509885559666,0.529412
2,TSM,98767991860392497,0.470588
3,TSM,98767991860392497,0.470588
6,Dignitas,98926509883054987,0.411765
10,100 Thieves,99294153828264740,0.388889
8,FlyQuest,98926509892121852,0.352941


In [281]:
# Join playoff_games with game_data_featurized based on playoff_games['team_id_1'] == game_data_featurized['team_id']
# Since we only want the last game of each team, we can use the get_last_game function to get the last game of each team (don't need esportGameId as unique identifier anymore)
playoff_games_full = playoff_games.merge(rename_features(get_last_game(game_data_featurized), game_features, "team_1"), 
                                         how='left', 
                                         left_on=['team_id_1'], 
                                         right_on=['team_id'])
playoff_games_full = playoff_games_full.merge(rename_features(get_last_game(game_data_featurized), game_features, "team_2"), 
                                              how='left', 
                                              left_on=['team_id_2'], 
                                              right_on=['team_id'])

In [318]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

# Use a xgboost model to predict the ['outcome_1'] of each game
model_features = ['team_1_'+x for x in game_features] + ['team_2_'+x for x in game_features]

# Fit a 5-fold cross validated xgboost model on the reg_season_games_full data
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [5, 10, 15],
    'n_estimators': [50, 100, 200],
    'objective': ['binary:logistic'],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.5, 0.75, 1],
    'colsample_bytree': [1],
    'reg_alpha': [0, 0.1, 0.2]
}

model = RandomizedSearchCV(XGBClassifier(), param_grid, n_iter=30, scoring='accuracy', n_jobs=-1, cv=5, verbose=3)
model.fit(reg_season_games_full[model_features], reg_season_games_full['outcome_1'])


Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [319]:
model.best_params_

{'subsample': 0.5,
 'reg_alpha': 0,
 'objective': 'binary:logistic',
 'n_estimators': 200,
 'max_depth': 5,
 'learning_rate': 0.01,
 'gamma': 0.1,
 'colsample_bytree': 1}

In [320]:
# Check the accuracy of the model predictions on the training set
np.mean(model.predict(reg_season_games_full[model_features])==reg_season_games_full['outcome_1'])

0.9476744186046512

In [321]:
np.mean(model.predict(playoff_games_full[model_features])==playoff_games_full['outcome_1'])

0.5510204081632653

In [322]:
# Compare to the baseline where we just predict the team with the higher win rate to win
np.mean(np.array(reg_season_games_full['team_1_outcome']>=reg_season_games_full['team_2_outcome']) == reg_season_games_full['outcome_1'])

0.563953488372093

In [317]:
# Check feature importances
pd.DataFrame({'feature': model_features, 'importance': model.best_estimator_.feature_importances_}).sort_values(by=['importance'], ascending=False)

Unnamed: 0,feature,importance
3,team_1_num_riftHerald,0.051983
23,team_2_num_riftHerald,0.050823
9,team_1_num_baron,0.047792
29,team_2_num_baron,0.043079
31,team_2_first_turret_time,0.042204
32,team_2_num_turret,0.041375
6,team_1_num_dragon,0.04122
16,team_1_first_kill_ind,0.041034
30,team_2_first_turret_ind,0.039002
10,team_1_first_turret_ind,0.038764
