In [1]:
import pandas as pd
import numpy as np
import os

from data_processing.GameFeaturesGenerator import GameFeaturesGenerator
from data_processing.utils.download_functions import *

In [2]:
# Write some utility functions
def get_game_data_full(games_data):
    # Iterate through t events of the match (could consist of one or many games) 
    # Look in the ['games'][t]['id'] field to get the game ID
    # Look in the ['games'][t]['state'] field to see if the game is 'completed'
    # Look in the ['games'][t]['teams'] field to get the team IDs
    # Look in the ['games'][t]['teams'][x]['result']['outcome'] field to get the result of the game for each team
    # We technically only need the 'state' to verify completion and 'id' to fetch details of the game, but load in other fields for verification
    game_tables = []
    for game in games_data['games']:
        game_state = game['state']
        if game_state == 'completed':
            game_id = game['id']
            team_ids, team_outcomes = [], []
            for team in game['teams']:
                team_ids.append(team['id'])
                team_outcome = 1 if team['result']['outcome']=='win' else 0
                team_outcomes.append(team_outcome)
            game_tables.append(pd.DataFrame({'esportsGameId': game_id, 
                                             'team_id_1': team_ids[0], 'outcome_1': team_outcomes[0],
                                             'team_id_2': team_ids[1], 'outcome_2': team_outcomes[1]}, index=[0]))
    return pd.concat(game_tables, ignore_index=True)

def get_game_data_ids(games_data):
    # Only returns the necessary information for lookup in the mapping table
    game_ids = []
    for game in games_data:
        if game['state']=='completed':
            game_ids.append(game['id'])
    return pd.DataFrame({'esportsGameId': game_ids})

In [3]:
os.chdir('esports-data')
os.listdir()

['games',
 'leagues.json',
 'mapping_data.json',
 'players.json',
 'teams.json',
 'tournaments.json']

In [4]:
# Read in mappings data
with open("mapping_data.json", "r") as json_file:
   mappings_data = json.load(json_file)
   
mappings = {
   esports_game["esportsGameId"]: esports_game for esports_game in mappings_data
}

In [5]:
# Read in tournament data
with open("tournaments.json", "r") as json_file:
   tournament_data_all = json.load(json_file)

# Only retrieve LCS tournament data ['lcs_summer_2023']
tournament_data = [x for x in tournament_data_all if x['slug']=='lcs_summer_2023'][0]

# Delete tournament data to save memory
del tournament_data_all

In [6]:
# See what rounds are featured in the tournament 
def get_tournament_stages(tournament_data):
    return [[x['name'], len(x['sections'])] for x in tournament_data['stages']]

get_tournament_stages(tournament_data)

[['Regular Season', 1], ['Playoffs', 1]]

In [7]:
# Read in regular season games as training data 
reg_season_games = []
for game_data in tournament_data['stages'][0]['sections'][0]['matches']:
    reg_season_games.append(get_game_data_full(game_data))
reg_season_games = pd.concat(reg_season_games)

# Read in playoff games as testing data 
playoff_games = []
for game_data in tournament_data['stages'][1]['sections'][0]['matches']:
    playoff_games.append(get_game_data_full(game_data))
playoff_games = pd.concat(playoff_games)

In [8]:
reg_season_games.head()

Unnamed: 0,esportsGameId,team_id_1,outcome_1,team_id_2,outcome_2
0,110303581088134849,99294153828264740,0,98767991877340524,1
0,110303581088331459,98767991877340524,1,103461966951059521,0
0,110303581088331461,98767991877340524,1,99294153824386385,0
0,110303581088331463,98926509885559666,1,98767991877340524,0
0,110303581088331465,98767991877340524,1,98926509892121852,0


In [9]:
playoff_games.head()

Unnamed: 0,esportsGameId,team_id_1,outcome_1,team_id_2,outcome_2
0,110767955468280157,98926509885559666,0,106972778172351142,1
1,110767955468411230,98926509885559666,0,106972778172351142,1
2,110767955468411231,98926509885559666,1,106972778172351142,0
3,110767955468411232,98926509885559666,0,106972778172351142,1
0,110767955468411235,103461966951059521,0,98767991860392497,1


In [24]:
directory = "games"
if not os.path.exists(directory):
   os.makedirs(directory)

# Load each game and process them
game_rows = []
for game_id in reg_season_games['esportsGameId']:
    try:
        game_mapping_data = mappings[game_id]
        platform_game_id = game_mapping_data['platformGameId']
        download_gzip_and_write_to_json(f"{directory}/{platform_game_id}")
        with open(f"games/{platform_game_id}.json", "r") as json_file:
            game_data = json.load(json_file)
        game_features = GameFeaturesGenerator(game_data, game_mapping_data).process_game()
        game_rows.append(game_features)
    except KeyError:
        print(f"Match {game_id} was not found")


games/ESPORTSTMNT02:3216144.json written
games/ESPORTSTMNT04:2684346.json written
games/ESPORTSTMNT01:3373576.json written
games/ESPORTSTMNT02:3214306.json written
games/ESPORTSTMNT01:3376629.json written
games/ESPORTSTMNT02:3212009.json written
games/ESPORTSTMNT02:3214927.json written
games/ESPORTSTMNT04:2685321.json written
games/ESPORTSTMNT02:3211039.json written
games/ESPORTSTMNT02:3207834.json written
games/ESPORTSTMNT01:3378516.json written
games/ESPORTSTMNT01:3374524.json written
games/ESPORTSTMNT02:3214601.json written
games/ESPORTSTMNT02:3215249.json written
games/ESPORTSTMNT02:3213927.json written
games/ESPORTSTMNT01:3378493.json written
games/ESPORTSTMNT02:3208561.json written
games/ESPORTSTMNT02:3214284.json written
games/ESPORTSTMNT02:3209828.json written
games/ESPORTSTMNT02:3208994.json written
games/ESPORTSTMNT02:3214268.json written
games/ESPORTSTMNT02:3208818.json written
games/ESPORTSTMNT02:3209624.json written
games/ESPORTSTMNT01:3376571.json written
games/ESPORTSTMN

In [80]:
game_data_featurized = pd.concat(game_rows)
game_data_featurized.head()

Unnamed: 0,platformGameId,esportsGameId,team_id,start_time,outcome,first_riftHerald_ind,first_riftHerald_time,num_riftHerald,first_dragon_ind,first_dragon_time,...,first_turret_ind,first_turret_time,num_turret,first_inhibitor_ind,first_inhibitor_time,num_inhibitor,first_kill_ind,first_kill_time,num_kills,game_end_time
0,ESPORTSTMNT02:3207804,110303581088134849,99294153828264740,2023-06-22 22:42:47.452,0,,,1,,,...,,,2,,,0,,,3,1612.09
1,ESPORTSTMNT02:3207804,110303581088134849,98767991877340524,2023-06-22 22:42:47.452,1,1.0,503.584,1,1.0,503.584,...,1.0,1041.249,9,1.0,1041.249,1,1.0,710.596,12,1612.09
0,ESPORTSTMNT02:3214865,110303581088331459,98767991877340524,2023-06-30 21:08:08.783,1,1.0,602.508,2,1.0,602.508,...,1.0,815.009,11,1.0,815.009,2,1.0,185.993,25,1337.233
1,ESPORTSTMNT02:3214865,110303581088331459,103461966951059521,2023-06-30 21:08:08.783,0,,,0,,,...,,,2,,,0,,,3,1337.233
0,ESPORTSTMNT01:3373466,110303581088331461,98767991877340524,2023-06-14 21:13:07.907,1,1.0,428.612,2,1.0,428.612,...,1.0,868.25,7,1.0,868.25,1,,,9,1604.087


In [81]:
"""
1. Sort the game_rows by team_id and start_time 
2. Create features for each team by averaging stats of the last 5 games that they played (or the most recent statistics if possible, with a "num_prev_games) column
    This yields a row of data where the features are the average of the last 5 games that the team played BEFORE the game of ['platformGameId', 'esportsGameId', 'team_id', 'start_time']
3. Later join to the reg_season_games data for training the model 
4. Use the trained model to predict on the playoff_games data for testing 
"""

non_game_features = ['platformGameId', 'esportsGameId', 'team_id', 'start_time']
game_features = [x for x in game_data_featurized.columns if x not in non_game_features]

all_team_ids = np.unique(game_data_featurized['team_id'])
processed_game_data = []
for team in all_team_ids:
    team_data = game_data_featurized[game_data_featurized['team_id']==team]
    team_data = team_data.sort_values(by=['start_time'])
    team_data['num_prev_games'] = np.arange(len(team_data))
    team_data = team_data.set_index('start_time')
    # First lag by 1 game so that the current game is not included in the average
    team_data_features = team_data[game_features].shift(1).rolling(window=5, min_periods=1).mean()
    team_data[game_features] = team_data_features
    # Drop rows where num_prev_games == 0 since we can't average over 0 games
    team_data = team_data[team_data['num_prev_games']!=0]
    # Add one more column to indicate if the row is the last of the team's games
    team_data['last_game'] = team_data['num_prev_games']==team_data['num_prev_games'].max()
    processed_game_data.append(team_data)


In [82]:
game_data_featurized = pd.concat(processed_game_data)
del processed_game_data

In [83]:
# Join reg_season_games with game_data_featurized based on reg_season_games['esportsGameId'] == game_data_featurized['esportsGameId']
# For the team features, join on reg_season_games['team_id_1'] == game_data_featurized['team_id'] and reg_season_games['team_id_2'] == game_data_featurized['team_id']
# When doing so, rename the features of game_data_featurized to be team_1_feature and team_2_feature
def rename_features(df, features, prefix):
    return df.rename(columns=dict(zip(features, [f'{prefix}_{x}' for x in features])))

def get_last_game(df):
    return df[df['last_game']==True]

reg_season_games.head()

Unnamed: 0,esportsGameId,team_id_1,outcome_1,team_id_2,outcome_2
0,110303581088134849,99294153828264740,0,98767991877340524,1
0,110303581088331459,98767991877340524,1,103461966951059521,0
0,110303581088331461,98767991877340524,1,99294153824386385,0
0,110303581088331463,98926509885559666,1,98767991877340524,0
0,110303581088331465,98767991877340524,1,98926509892121852,0


In [84]:
# Join reg_season_games with game_data_featurized based on reg_season_games['esportsGameId', 'team_id_1'] == game_data_featurized['esportsGameId', 'team_id']
reg_season_games_full = reg_season_games.merge(rename_features(game_data_featurized, game_features, "team_1"), how='left', left_on=['esportsGameId', 'team_id_1'], right_on=['esportsGameId', 'team_id'])
reg_season_games_full = reg_season_games_full.merge(rename_features(game_data_featurized, game_features, "team_2"), how='left', left_on=['esportsGameId', 'team_id_2'], right_on=['esportsGameId', 'team_id'])
# drop the games where the outcome is NaN, those games are when one team has not had any games yet
reg_season_games_full = reg_season_games_full.dropna(subset=['team_1_outcome', 'team_2_outcome'])


# Join playoff_games with game_data_featurized based on playoff_games['team_id_1'] == game_data_featurized['team_id']
# Since we only want the last game of each team, we can use the get_last_game function to get the last game of each team (don't need esportGameId as unique identifier anymore)
playoff_games_full = playoff_games.merge(rename_features(get_last_game(game_data_featurized), game_features, "team_1"), how='left', left_on=['team_id_1'], right_on=['team_id'])
playoff_games_full = playoff_games_full.merge(rename_features(get_last_game(game_data_featurized), game_features, "team_2"), how='left', left_on=['team_id_2'], right_on=['team_id'])


In [115]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

# Use a xgboost model to predict the ['outcome_1'] of each game
model_features = ['team_1_'+x for x in game_features] + ['team_2_'+x for x in game_features]

# Fit a 5-fold cross validated xgboost model on the reg_season_games_full data
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [5, 10, 15],
    'n_estimators': [50, 100, 200],
    'objective': ['binary:logistic'],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.5, 0.75, 1],
    'colsample_bytree': [1],
    'reg_alpha': [0, 0.1, 0.2]
}

model = RandomizedSearchCV(XGBClassifier(), param_grid, n_iter=30, scoring='accuracy', n_jobs=-1, cv=5, verbose=3)
model.fit(reg_season_games_full[model_features], reg_season_games_full['outcome_1'])


Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [116]:
model.best_params_

{'subsample': 0.5,
 'reg_alpha': 0,
 'objective': 'binary:logistic',
 'n_estimators': 100,
 'max_depth': 15,
 'learning_rate': 0.01,
 'gamma': 0,
 'colsample_bytree': 1}

In [117]:
# Check the accuracy of the model predictions
np.mean(model.predict(playoff_games_full[model_features])==playoff_games_full['outcome_1'])

0.4897959183673469