In [None]:
import pandas as pd
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
import numpy as np
import os
from tqdm.notebook import tqdm
from data_processing.utils.download_functions import *
from copy import deepcopy
import pickle 

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import optuna

os.chdir('esports-data')
os.listdir()

In [None]:
class DataAggregator:
    """
    Class to aggregate the data from the different years into one dataframe.

    The pipeline (see ``get_featurized_data``) loads the per-tournament CSV files,
    replaces every game feature by a lagged trailing average per team, collapses the
    one-hot league indicator columns into a single categorical column and finally
    builds one row per game holding the feature difference (team 1 minus team 2).
    """
    def __init__(self):
        # Imported locally so the class does not rely on `json` leaking in through a star import
        import json

        # SET CONSTANTS FOR DATA PROCESSING
        self._window_size = 20    # window of the rolling mean ('mean' averaging)
        self._ewm_alpha = 0.05    # smoothing factor of the exponentially weighted mean ('ewm' averaging)
        self._ma_min_periods = 1  # minimum number of observations needed to emit an average

        self.league_indicators_to_drop = ['League_TFT Rising Legends', 'League_All-Star Event', 'League_MSI', 'League_Worlds', 'League_EMEA Masters']
        self.non_game_features = ['platformGameId', 'esportsGameId', 'team_id', 'start_time', 'tournament_name']
        self.league_indicators = ['League_Arabian League','League_CBLOL','League_CBLOL Academy','League_College Championship','League_Elite Series','League_Esports Balkan League',
                         'League_Greek Legends League','League_Hitpoint Masters','League_LCK','League_LCK Academy','League_LCK Challengers','League_LCL','League_LCO','League_LCS',
                         'League_LCS Challengers','League_LCS Challengers Qualifiers','League_LEC','League_LJL','League_LJL Academy','League_LLA','League_LPL','League_La Ligue FranÃ§aise',
                         'League_Liga Portuguesa','League_NLC','League_North Regional League','League_PCS','League_PG Nationals','League_Prime League',
                         'League_South Regional League','League_SuperLiga','League_TCL','League_Ultraliga','League_VCS']
        # Include a set of features that are known to be important for the model
        self.mandatory_features = ['outcome', 'outcome_domestic', 'outcome_international', 'domestic_game_ind',
                                   'eSportsLeague_1', 'eSportsLeague_2', 'eliteLeague_1', 'eliteLeague_2', 'majorLeague_1', 'majorLeague_2', 'year']
        # Mark special features that require a different style of processing
        self.special_features = ['outcome_domestic', 'outcome_international']

        self.important_features = ['team_1', 'team_2', 'outcome', 'outcome_domestic', 'outcome_international', 'eSportsLeague_1', 'eSportsLeague_2',
                      'team_share_of_totalGold_at_20', 'team_share_of_totalGold_at_game_end',
                      'team_share_of_towerKills_at_20', 'team_share_of_towerKills_at_game_end', 'team_share_of_VISION_SCORE_at_game_end']

        # Maintain a manual dictionary of team_id to league_indicator. This is necessary to mark the regions for teams in international tournaments (MSI/worlds since LPL teams don't have data)
        # refine_league_indicator_data loops through this and sets the league_indicator to 1 on every row of the given team_id
        self.league_indicator_dict = {
            98767991954244555: 'League_VCS', # GAM
            107251245690956393: 'League_VCS', # SAIGON BUFFALOS
            98767991892579754: 'League_LPL',  # RNG
            104367068120825486: 'League_PCS',  # PSG Talon
            98767991882270868: 'League_LPL',  # EDG
            99566404850008779: 'League_LPL',  # LNG
            99566404855553726: 'League_LPL',  # FPX
            99566404852189289: 'League_LPL',  # JDG
            99566404854685458: 'League_LPL',  # TES
            105520788833075738: 'League_Elite Series', # KV Mechelen
            105520824521753126: 'League_NLC', # PSV Esports
            105543843212923183: 'League_Ultraliga', # Goskilla
            105548000936710641: 'League_Ultraliga', # Method2Madness
            103935642731826448: 'League_Elite Series', # Sector One
            104710682193583854: 'League_Ultraliga', # Topo Centras Iron Wolves
            105520822049210915: 'League_Elite Series', # Team mCon
            106334794714373670: 'League_Ultraliga', # Goexanimo
        }

        # Read in teams data and keep a team_id -> team name lookup
        with open("teams.json", "r") as json_file:
            teams_data = json.load(json_file)
        self.teams_dict = {team['team_id']: team['name'] for team in teams_data}

    def get_featurized_data(self, folder_paths, years):
        """
        We do the following steps to process the game data
        1) Read in the tournament rows data, which specifies the match ID, the participating teams, and the winner of the match
        2) Read in the game rows data, which contains all the granular information about each game
        3) Additionally process the game rows data
            i) Sort the game rows by team_id and start_time
            ii) Create features based on the stats of the team over historical games
            iii) Handle the league region indicators for each team (as 'eSportLeague')
        4) Keep only the tournament rows for which both teams have processed game data
        5) Build the per-game difference (team 1 - team 2) of the features
        :param folder_paths: list of strings specifying the folder paths
        :param years: list of strings specifying the years
        :return:
            model_data - dataframe containing the diff between the two teams for each game used for training
            processed_game_data - dataframe containing the processed game data for each individual team used for inference
        """
        game_suffix = '_game_rows.csv'
        tournament_suffix = '_tournament_rows.csv'

        tournament_rows = pd.DataFrame()
        game_rows = pd.DataFrame()
        for (folder_path, year) in zip(folder_paths, years):
            # Get the unique tournament names by stripping out '_game_rows.csv' and '_tournament_rows.csv'.
            # Files without one of these suffixes (e.g. notebook checkpoints) are ignored, and the names
            # are sorted so that the row order is reproducible between runs.
            tournament_names = set()
            for file_name in os.listdir(folder_path):
                for suffix in (game_suffix, tournament_suffix):
                    if file_name.endswith(suffix):
                        tournament_names.add(file_name[:-len(suffix)])
            unique_tournament_names = sorted(tournament_names)

            # Collect the frames of one folder and concatenate them in a single call
            tournament_frames = []
            for tournament_name in tqdm(unique_tournament_names):
                df_tmp = pd.read_csv(f'{folder_path}/' + tournament_name + tournament_suffix)
                # Add a column to indicate the tournament name
                df_tmp['tournament_name'] = tournament_name
                tournament_frames.append(df_tmp)
            tournament_rows = pd.concat([tournament_rows] + tournament_frames)
            print("Tournament rows shape: ", tournament_rows.shape)

            game_frames = []
            for tournament_name in tqdm(unique_tournament_names):
                df_tmp = pd.read_csv(f'{folder_path}/' + tournament_name + game_suffix, index_col=0)
                # Add columns to indicate the tournament name and the year
                df_tmp['tournament_name'] = tournament_name
                df_tmp['year'] = year
                game_frames.append(df_tmp)
            game_rows = pd.concat([game_rows] + game_frames)
            print("Game rows shape: ", game_rows.shape)

        print("Completed data loading")
        print("Tourament rows shape: ", tournament_rows.shape)
        print("Game rows shape: ", game_rows.shape)

        game_rows = game_rows.drop(columns=self.league_indicators_to_drop)
        excluded_columns = set(self.non_game_features + self.league_indicators + self.special_features + ['year'])
        game_features = [x for x in game_rows.columns if x not in excluded_columns]
        # featurize_game_rows expects game_features WITHOUT the special features (it creates them itself)
        self.game_features = game_features

        # Get a set of all team IDs as we will iterate through them to generate the row data for each team
        all_team_ids = np.unique(game_rows['team_id'])
        processed_game_data = self.featurize_game_rows(game_rows, all_team_ids)
        processed_game_data = self.refine_league_indicator_data(processed_game_data)
        # From here on the special features are regular columns of the processed data
        self.game_features = game_features + self.special_features

        valid_games = self.get_valid_game_rows(tournament_rows, processed_game_data)
        model_data = self.get_model_data(valid_games, processed_game_data)
        return model_data, processed_game_data

    def featurize_game_rows(self, game_rows, all_team_ids, averaging_method='ewm'):
        """
        Replace the per-game stats of every team by a trailing average over its previous games.

        :param game_rows: dataframe with one row per (game, team)
        :param all_team_ids: iterable of team ids to process
        :param averaging_method: must be either 'ewm' or 'mean'
        :return: dataframe with the averaged features; the first game of each team is dropped
        :raises ValueError: if averaging_method is not supported
        """
        if averaging_method not in ('ewm', 'mean'):
            raise ValueError('averaging_method must be either "ewm" or "mean"')

        feature_columns = self.game_features + self.special_features
        processed_game_data = []
        for team in tqdm(all_team_ids):
            team_data = game_rows[game_rows['team_id']==team].reset_index()
            team_data = team_data.sort_values(by=['start_time'])
            team_data['num_prev_games'] = np.arange(len(team_data))

            # Split the outcome into an international (worlds / msi) and a domestic (everything else) column
            is_international = team_data['tournament_name'].str.contains('worlds|msi')
            team_data['outcome_domestic'] = np.nan
            team_data['outcome_international'] = np.nan
            team_data.loc[is_international, 'outcome_international'] = team_data['outcome']
            team_data.loc[~is_international, 'outcome_domestic'] = team_data['outcome']

            # First lag by 1 game so that the current game is not included in the average. Then take the mean as the trailing average
            lagged_features = team_data[feature_columns].shift(1)
            if averaging_method == 'ewm':
                team_data_features = lagged_features.ewm(alpha=self._ewm_alpha, min_periods=self._ma_min_periods, ignore_na=False).mean()
            else:
                team_data_features = lagged_features.rolling(window=self._window_size, min_periods=self._ma_min_periods).mean()
            team_data[feature_columns] = team_data_features

            # Drop rows where num_prev_games == 0 as this indicates that it's the team's first game
            team_data = team_data[team_data['num_prev_games']!=0]
            if team_data.empty:
                # The team played a single game, so it has no history; nothing to add
                # (the league lookup below cannot work on an empty frame)
                continue

            # Add a column for the team name ("NULL" when the team is unknown)
            team_data['team_name'] = self.teams_dict.get(str(team), "NULL")

            # Determine the team's primary league
            team_league = team_data[self.league_indicators].mean(axis=0).idxmax()
            # Determine if it's a valid league (otherwise idxmax just marks the first one)
            team_league_check = team_data[team_league].sum() > 0 # If false, then do not mark based on history, have to manually mark

            # Rows without any league_indicator come from international tournaments;
            # mark them with the team's primary league
            if team_league_check:
                team_data.loc[team_data[self.league_indicators].sum(axis=1)==0, team_league] = 1

            processed_game_data.append(team_data)

        processed_game_data = pd.concat(processed_game_data)
        # 'index' is the leftover of reset_index() above
        processed_game_data.drop('index', axis=1, inplace=True)
        return processed_game_data

    def refine_league_indicator_data(self, processed_game_data):
        """
        Finalize the league information of every row.

        Applies the manual team -> league overrides, adds the 'eliteLeague' / 'majorLeague'
        flags and collapses the one-hot league columns into the categorical 'eSportLeague' column.

        :param processed_game_data: output of featurize_game_rows (modified in place)
        :return: dataframe without the one-hot league columns and with 'eSportLeague' added
        :raises ValueError: if some row does not have exactly one league marked
        """
        # Next deal with marking specific team's leagues
        for team_id, league_indicator in self.league_indicator_dict.items():
            processed_game_data.loc[processed_game_data['team_id']==team_id, league_indicator] = 1

        # Create two additional features related to the esport league
        # If the team is an LPL or LCK team, mark indicator 'eliteLeague' as 1
        # If the team is an LEC, LCS, LPL, or LCK team, mark indicator 'majorLeague' as 1
        processed_game_data['eliteLeague'] = (processed_game_data['League_LPL'] == 1) | (processed_game_data['League_LCK'] == 1)
        processed_game_data['majorLeague'] = (processed_game_data['League_LPL'] == 1) | (processed_game_data['League_LCK'] == 1) | \
                                             (processed_game_data['League_LCS'] == 1) | (processed_game_data['League_LEC'] == 1)

        # Check that the one-hot encoding worked correctly
        if not (processed_game_data[self.league_indicators].sum(axis=1) == 1).all():
            raise ValueError('One-hot encoding of league_indicators did not work correctly')

        # Convert the league_indicator one-hot encoded columns to their league name (or '' when not set)
        for league in [x.replace('League_', '') for x in self.league_indicators]:
            processed_game_data['League_' + league] = processed_game_data['League_' + league].apply(lambda x: league if x==1 else '')
        # Combine it into a single column
        processed_game_data['eSportLeague'] = processed_game_data[self.league_indicators].apply(lambda x: ''.join(x), axis=1)
        processed_game_data = processed_game_data.drop(columns=self.league_indicators)
        # Convert it to a categorical variable
        processed_game_data['eSportLeague'] = processed_game_data['eSportLeague'].astype('category')
        return processed_game_data

    def get_valid_game_rows(self, tournament_rows, processed_game_data):
        """
        Keep the tournament rows for which BOTH teams have a row in processed_game_data.

        :param tournament_rows: dataframe with 'esportsGameId', 'team_id_1' and 'team_id_2'
        :param processed_game_data: dataframe with 'esportsGameId' and 'team_id'
        :return: the filtered tournament rows (helper join columns removed)
        """
        team_keys = processed_game_data[['esportsGameId', 'team_id']]
        valid_games = tournament_rows
        # An inner join per side filters out the games where that side has no processed data
        for team_column in ('team_id_1', 'team_id_2'):
            valid_games = valid_games.merge(team_keys, how='inner',
                                            left_on=['esportsGameId', team_column],
                                            right_on=['esportsGameId', 'team_id'],
                                            suffixes=['_to_drop','_to_drop'])
        valid_games.drop([x for x in valid_games.columns if '_to_drop' in x], axis=1, inplace=True)
        return valid_games

    def get_model_data(self, valid_games, processed_game_data):
        """
        Build the training rows: one row per game with the feature difference team 1 - team 2.

        :param valid_games: output of get_valid_game_rows
        :param processed_game_data: per-team processed game data
        :return: dataframe with the game columns, the feature differences and the per-team league / name columns
        :raises Exception: if the two per-team joins are not aligned row by row
        """
        # Merge processed_game_data with tournament_rows for team 1
        team_1_data = valid_games.merge(processed_game_data, how='inner',
                                            left_on=['esportsGameId', 'team_id_1'],
                                            right_on=['esportsGameId', 'team_id'],
                                            suffixes=['_to_drop','_to_drop'])

        # Merge processed_game_data with tournament_rows for team 2
        team_2_data = valid_games.merge(processed_game_data, how='inner',
                                            left_on=['esportsGameId', 'team_id_2'],
                                            right_on=['esportsGameId', 'team_id'],
                                            suffixes=['_to_drop','_to_drop'])

        # Make sure that both joins describe the same game on every row before subtracting them
        check_esportsGameId = np.all(team_1_data['esportsGameId'] == team_2_data['esportsGameId'])
        check_team1_id = np.all(team_1_data['team_id_1'] == team_2_data['team_id_1'])
        check_team2_id = np.all(team_1_data['team_id_2'] == team_2_data['team_id_2'])
        if not (check_esportsGameId and check_team1_id and check_team2_id):
            raise Exception('esportsGameId is not the same for the two teams')

        # Calculate the difference between the two teams for each feature
        difference_data = team_1_data[self.game_features].subtract(team_2_data[self.game_features])
        # Apply special logic for computing the difference for the "outcome_domestic" and "outcome_international" features
        for feature in self.special_features:
            # A missing value on one side only is treated as 0
            diff = team_1_data[feature].fillna(0).sub(team_2_data[feature].fillna(0))
            # If both columns are nan then mark the difference as nan
            diff[(team_1_data[feature].isna()) & (team_2_data[feature].isna())] = np.nan
            # Store the result (it used to be discarded), matching what is done at inference time
            difference_data[feature] = diff

        self.difference_data = difference_data.columns
        # Add the difference data to the tournament_rows dataframe as well as the league data for each team
        training_data = deepcopy(valid_games)
        training_data = pd.concat([training_data.reset_index(), difference_data], axis=1)
        training_data['eSportsLeague_1'] = team_1_data['eSportLeague']
        training_data['eSportsLeague_2'] = team_2_data['eSportLeague']
        training_data['domestic_game_ind'] = training_data['eSportsLeague_1'] == training_data['eSportsLeague_2']
        training_data['eliteLeague_1'] = team_1_data['eliteLeague']
        training_data['eliteLeague_2'] = team_2_data['eliteLeague']
        training_data['majorLeague_1'] = team_1_data['majorLeague']
        training_data['majorLeague_2'] = team_2_data['majorLeague']
        training_data['team_1'] = team_1_data['team_name']
        training_data['team_2'] = team_2_data['team_name']
        training_data['start_time'] = team_1_data['start_time']
        training_data['year'] = team_1_data['year']

        # Drop the columns that were used for joining (have '_to_drop' suffix) and the reset_index() leftover
        training_data.drop([x for x in training_data.columns if '_to_drop' in x] + ['index'], axis=1, inplace=True)

        # drop the games where the outcome is NaN, those games are when one team has not had any games yet
        training_data.dropna(subset=['outcome'], inplace=True)
        return training_data

In [None]:
# Featurize the three seasons of raw game data (one folder per year)
data_aggregator = DataAggregator()
raw_data_years = ['2021', '2022', '2023']
raw_data_folders = [f'{season}_raw_game_data' for season in raw_data_years]
model_data, game_data = data_aggregator.get_featurized_data(raw_data_folders, raw_data_years)

In [None]:
# Feature selection was done once in a separate notebook. The snippet that produced it is kept
# below as an inert string for reference; this cell only loads the resulting 200 selected features.
"""
# First do some feature selection on X_train using catboost algorithms, use default parameters 
from catboost import EShapCalcType, EFeaturesSelectionAlgorithm
model = CatBoostClassifier(iterations=10, learning_rate=0.05, depth=10, l2_leaf_reg=5, random_strength= 1.5, task_type="GPU", devices='0:1', silent=True)
summary = model.select_features(
        train_data,
        eval_set=val_data,
        features_for_select=list(range(X_train.shape[1])),     # we will select from all features
        num_features_to_select=200, 
        steps=1,                                     # more steps - more accurate selection
        algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
        shap_calc_type=EShapCalcType.Approximate,            # can be Approximate, Regular and Exact
        train_final_model=True,                          # to train model with selected features
        logging_level='Verbose',
        plot=False
)
"""

# Unpickle the list of selected feature names
with open("selected_features_RSV_Approx_200.txt", "rb") as features_file:
    selected_features = pickle.load(features_file)

In [None]:
# The model uses the union of the selected, mandatory and special features plus the two team
# names; 'outcome' itself is removed from the list of inputs.
combined_features = selected_features + data_aggregator.mandatory_features + data_aggregator.special_features
model_features = list(set(combined_features))
model_features.extend(['team_1', 'team_2'])
model_features.remove('outcome')

# Pickle the feature list so that it can be reloaded at inference time
with open("model_features.txt", "wb") as features_file:
    pickle.dump(model_features, features_file)

# Separate the target from the predictors; identifier and label columns are not predictors
target_col = 'outcome_1'
non_predictor_columns = ['match_id', 'esportsGameId', 'league', 'team_id_1', 'outcome_1', 'team_id_2', 'outcome_2', 'start_time']
X = model_data.drop(columns=non_predictor_columns)
y = model_data[target_col]

# Hold out 10% of the games for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=102)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.10, random_state=22)

# Boolean masks of the international (worlds / msi) games in each split
international_pattern = 'worlds|msi'
international_ix_train = X_train['tournament_name'].str.contains(international_pattern)
international_ix_test = X_val['tournament_name'].str.contains(international_pattern)

# Sample weight per tournament-name pattern, reflecting a prior on which regions play the game "well"
tournamnent_weights = dict([
    ('lcs', 5),
    ('lec', 10),
    ('lck', 20),
    ('lpl', 20),
    ('worlds|msi', 50),
])

def set_tournament_weights(X_train, tournament_weights):
    """
    Compute one sample weight per training row from its tournament name.

    Every row starts with weight 1. The patterns are applied in dictionary order, so when
    several patterns match the same row the last matching one wins.
    :param X_train: pandas dataframe containing the training data (needs a 'tournament_name' column)
    :param tournament_weights: dictionary mapping a tournament-name pattern to its weight
    :return: numpy array containing the weights for the training data
    """
    tournament_names = X_train['tournament_name']
    weights_train = np.ones(len(X_train))
    for pattern, pattern_weight in tournament_weights.items():
        matching_rows = tournament_names.str.contains(pattern)
        weights_train[matching_rows] = pattern_weight
    return weights_train

# The sample weights need 'tournament_name', so compute them before that column is dropped
weights_train = set_tournament_weights(X_train, tournament_weights=tournamnent_weights)

categorical_columns = ['eSportsLeague_1', 'eSportsLeague_2', 'team_1', 'team_2']

# Training pool: model features only, weighted per tournament
X_train = X_train.drop(columns='tournament_name')[model_features]
train_data = Pool(data=X_train, label=y_train, weight=weights_train, cat_features=categorical_columns)

# Validation pool: same columns, unweighted
X_val = X_val.drop(columns='tournament_name')[model_features]
val_data = Pool(data=X_val, label=y_val, cat_features=categorical_columns)

In [None]:
# Prior on the feature importances: the features below get a hand-picked weight
important_features = ['outcome_domestic', 'outcome_international', 'eSportsLeague_1', 'eSportsLeague_2', 
                      'team_share_of_totalGold_at_game_end', 'team_share_of_towerKills_at_game_end', 'team_share_of_VISION_SCORE_at_game_end']
important_features_weights = dict(
    outcome_domestic=10,
    outcome_international=0,
    eSportsLeague_1=3,
    eSportsLeague_2=3,
    team_share_of_totalGold_at_game_end=8,
    team_share_of_towerKills_at_game_end=8,
    team_share_of_VISION_SCORE_at_game_end=8,
)
train_columns = X_train.columns.tolist()
# Position of every important feature among the training columns
important_features_ix = [train_columns.index(name) for name in important_features]

# Every feature starts with weight 1; the important features are overridden with their own weight
feature_weights = [1] * len(train_columns)
for feature, weight in important_features_weights.items():
    feature_ix = train_columns.index(feature)
    feature_weights[feature_ix] = weight

In [None]:
# Hyperparameter tuning - tune over common boosting parameters 

def objective(trial):
    """
    Optuna objective: fit a CatBoost classifier with the sampled hyperparameters on
    `train_data` and return its accuracy on the validation split (`X_val`, `y_val`).
    """
    sampled_params = dict(
        iterations=trial.suggest_categorical("iterations", [25, 50, 100, 200]),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        depth=trial.suggest_int("depth", 5, 10),
        min_data_in_leaf=trial.suggest_int("min_data_in_leaf", 1, 30),
        l2_leaf_reg=trial.suggest_int("l2_leaf_reg", 1, 30),
        random_strength=trial.suggest_categorical("random_strength", [0.5, 1, 1.5, 2, 3, 4]),
    )

    candidate = CatBoostClassifier(
        **sampled_params,
        cat_features=['eSportsLeague_1', 'eSportsLeague_2', 'team_1', 'team_2'],
        feature_weights=feature_weights,
        task_type="GPU",
        devices='0:1',
        silent=True,
    )
    candidate.fit(train_data)
    return accuracy_score(y_val, candidate.predict(X_val))

# Search for the hyperparameters that maximize the validation accuracy (20 trials)
study = optuna.create_study(direction='maximize')
study.optimize(func=objective, n_trials=20)

In [None]:
# Show the best hyperparameters found by the study and build the final model from them
best_params = study.best_params
print(best_params)
model = CatBoostClassifier(
    **best_params,
    cat_features=['eSportsLeague_1', 'eSportsLeague_2', 'team_1', 'team_2'],
    feature_weights=feature_weights,
    task_type="GPU",
    devices='0:1',
    silent=True,
)

# Train on the FULL data (training and validation rows together)
X_full = X.drop(columns='tournament_name')[model_features]
model.fit(X_full, y)

In [None]:
# The fitted model was saved once with the (now disabled) call below; this cell reloads it from disk.
# model.save_model('catboost_model.cbm', pool = Pool(X.drop('tournament_name', axis=1)[model_features], y, cat_features=['eSportsLeague_1', 'eSportsLeague_2', 'team_1', 'team_2']))
model = CatBoostClassifier()
model.load_model(fname="catboost_model.cbm")

In [None]:
# Show the 10 features with the highest importance in the model
importance_by_feature = dict(zip(X_train.columns, model.get_feature_importance()))
pd.DataFrame(importance_by_feature, index=[0]).T.sort_values(by=0, ascending=False).head(10)

In [None]:
# Select the rows of a single tournament to run the model on
is_selected_tournament = model_data['tournament_name'].eq('lcs_spring_2023')
tournament_data = model_data[is_selected_tournament]

In [None]:
# Function for helping to find the closest key in a dictionary 
from difflib import SequenceMatcher

def find_closest_key(string, dictionary):
    """
    Return the value stored under the key that most resembles `string`.

    The resemblance is the `SequenceMatcher` ratio (0 = nothing in common, 1 = identical);
    on ties the first key in iteration order wins. The matched key is printed so that the
    fuzzy match can be checked by eye.
    :param string: text to look up
    :param dictionary: dictionary with string keys
    :return: the value of the closest key
    :raises KeyError: if the dictionary is empty or no key has anything in common with `string`
    """
    closest_key = None
    best_ratio = 0

    for key in dictionary:
        ratio = SequenceMatcher(None, string, key).ratio()
        if ratio > best_ratio:
            closest_key = key
            best_ratio = ratio

    if closest_key is None:
        # Fail with an explicit message instead of the opaque `KeyError: None`
        raise KeyError(f'no key resembling {string!r} was found')

    print(closest_key)
    return dictionary[closest_key]

In [None]:
# Reverse lookup: team name -> team id
teams_dict_rev = {team_name: team_id for team_id, team_name in data_aggregator.teams_dict.items()}

In [None]:
# Approximate names of the teams taking part in worlds; they are fuzzy-matched against the known team names
worlds_teams = [
    'Gen.G', 'T1', 'KT Rolster', 'Dplus',
    'Cloud9', 'NRG', 'Team Liquid',
    'WeiboGaming FAW Audi', 'Beijing JDG Intel', 'bilibili', 'suzhou LNG',
    'g2 esports', 'fnatic', 'MAD lions',
    'golden guardians', 'team BDS', 'LOUD',
    'GAM esports', 'TEAM WHALES', 'psg talon',
    'ctbc flying oyster', 'detonation focusme',
    'Movistar R7',
]

# Resolve every name to its team id (stored as a string key, hence the int conversion)
worlds_team_ids = [int(find_closest_key(team_name, teams_dict_rev)) for team_name in worlds_teams]

In [None]:
class InferenceDataGenerator:
    """
    Builds the model inputs for an imaginary tournament in which every given team
    meets every given team (used for Tournament, Team, and Global rankings).
    """
    def __init__(self, game_data, model_features) -> None:
        self.game_data = game_data
        self.model_features = model_features
        self.special_features = ['outcome_domestic', 'outcome_international']
        # Model features that are numeric columns of game_data; only those can be subtracted
        numeric_columns = game_data._get_numeric_data().columns
        self.numeric_model_features = np.intersect1d(numeric_columns, model_features)
        # It turns out that adding boost factors is important for maintaining the general ordering of regions (based on previous international tournaments)
        # NOTE(review): despite the name, these factors are applied as multipliers below, and
        # leagues without an entry are multiplied by 0 - confirm that this is intended.
        self.additive_boost_factors = {
            # Major regions
            'LCK': 0.3,
            'LPL': 0.3,
            'LEC': 0.20,
            'LCS': 0.15,
            # Minor regions below
            # 'PCS': 0.05,
            # 'LLA': 0.04,
            # 'CBLOL': 0.03,
            # 'VCS': 0.02,
            # 'LJL': 0.01,
        }
        # Mark important features to boost in international competitions
        self.columns_to_boost = ['outcome_domestic', 'team_share_of_totalGold_at_game_end', 'team_share_of_towerKills_at_game_end', 'team_share_of_VISION_SCORE_at_game_end']

    def get_inference_data_by_team_id(self, team_ids):
        """
        Return the model inputs for all ordered pairs of the given teams.
        """
        latest_games = self.get_game_data_by_team_id(team_ids)
        # One factor per team, looked up from its league (0 when the league has no factor)
        league_factor = latest_games['eSportLeague'].map(self.additive_boost_factors).fillna(0)
        for boosted_column in self.columns_to_boost:
            latest_games[boosted_column] *= league_factor
        pairings = self.get_tournament_rows_by_team_id(team_ids)
        return self.get_inference_data(pairings, latest_games)

    def get_game_data_by_team_id(self, team_ids):
        """
        Gets the last game played for each team_id 
        """
        team_games = self.game_data[self.game_data['team_id'].isin(team_ids)]
        # After sorting by time, the last row of every team is its most recent game
        return (
            team_games
            .sort_values(by=['team_id', 'start_time'])
            .drop_duplicates(subset=['team_id'], keep='last')
        )

    def get_tournament_rows_by_team_id(self, team_ids):
        """
        Creates an imaginary round-robin tournament
        """
        # All ordered pairs, including a team against itself, so that the table has len(team_ids)**2 rows
        pairings = [[first, second] for first in team_ids for second in team_ids]
        return pd.DataFrame(pairings, columns=['team_id_1', 'team_id_2'])

    def get_inference_data(self, tournament_rows, game_data):
        """
        Attach the features of both teams to every pairing and return their difference
        (team 1 - team 2) together with the per-team league and name columns.
        """
        side_1 = tournament_rows.merge(game_data, how='left', left_on=['team_id_1'], right_on=['team_id'])
        side_2 = tournament_rows.merge(game_data, how='left', left_on=['team_id_2'], right_on=['team_id'])

        # Additional checks to ensure that no shuffling occurred during the joins
        joins_aligned = (
            np.all(side_1['team_id_1'] == side_2['team_id_1'])
            and np.all(side_1['team_id_2'] == side_2['team_id_2'])
            and np.all(tournament_rows['team_id_1'] == side_1['team_id_1'])
            and np.all(tournament_rows['team_id_2'] == side_1['team_id_2'])
        )
        if not joins_aligned:
            raise Exception('esportsGameId is not the same for the two teams')

        # Calculate the difference between the two teams for each numeric feature
        difference_data = side_1[self.numeric_model_features].subtract(side_2[self.numeric_model_features])
        for feature in self.special_features:
            both_missing = side_1[feature].isna() & side_2[feature].isna()
            # A value missing on one side only counts as 0
            diff = side_1[feature].fillna(0).sub(side_2[feature].fillna(0))
            # If both columns are nan then mark the difference as nan
            diff[both_missing] = np.nan
            difference_data[feature] = diff

        # Add the difference data to the tournament_rows dataframe as well as the league data for each team
        inference_rows = pd.concat([deepcopy(tournament_rows).reset_index(), difference_data], axis=1)
        inference_rows['eSportsLeague_1'] = side_1['eSportLeague']
        inference_rows['eSportsLeague_2'] = side_2['eSportLeague']
        inference_rows['domestic_game_ind'] = inference_rows['eSportsLeague_1'] == inference_rows['eSportsLeague_2']
        for flag in ('eliteLeague', 'majorLeague'):
            inference_rows[flag + '_1'] = side_1[flag]
            inference_rows[flag + '_2'] = side_2[flag]
        inference_rows['team_1'] = side_1['team_name']
        inference_rows['team_2'] = side_2['team_name']
        inference_rows['start_time'] = side_1['start_time']
        inference_rows['year'] = side_1['year']

        # Drop the columns that were used for joining (have '_to_drop' suffix) and the reset_index() leftover
        helper_columns = [x for x in inference_rows.columns if '_to_drop' in x] + ['index']
        inference_rows = inference_rows.drop(helper_columns, axis=1)
        inference_rows = inference_rows.drop(['team_id_1', 'team_id_2', 'start_time'], axis=1)

        return inference_rows[self.model_features]
    
    
class InferenceModel:
    """Placeholder wrapper around a trained model used at inference time."""

    def __init__(self, model=None) -> None:
        """Store the model that inference will be run with.

        Args:
            model: Trained model object. Optional, so that a bare
                ``InferenceModel()`` call keeps working.
        """
        # The signature used to be ``__init__(model)``: with no ``self`` the
        # instance itself was bound to ``model`` and passing a real model
        # raised TypeError.
        self.model = model

In [None]:
# Assemble the trimmed-down frame that is persisted for inference

# Model features that are also numeric columns of game_data
# (np.intersect1d returns the sorted, de-duplicated overlap)
numeric_model_features = np.intersect1d(
    game_data._get_numeric_data().columns,
    model_features,
)

# Model features without a numeric column in game_data, i.e. the ones created during processing.
# NOTE: this result is not stored; it is only displayed when it is the last expression of a cell.
set(model_features) - set(numeric_model_features)

# These features are regenerated during processing, so they do not need to be saved
excluded_features = {
    'eSportsLeague_1', 'eSportsLeague_2',
    'eliteLeague_1', 'eliteLeague_2',
    'majorLeague_1', 'majorLeague_2',
    'team_1', 'team_2',
    'domestic_game_ind',
}
model_features_save = list(set(model_features).difference(excluded_features))

# Keep the identifier columns and the team/league columns (which are handled outside the
# regular feature processing) alongside the saved model features
game_data_inf = game_data[
    ['platformGameId', 'esportsGameId', 'team_id', 'start_time']
    + ['team_name', 'eSportLeague', 'eliteLeague', 'majorLeague']
    + model_features_save
]

In [None]:
# Save the formatted data to run inference on later
# game_data_inf.to_csv('game_data.csv', index=False)

# Read in data and artifacts required for inference.
# (game_data.csv used to be read twice in this cell; a single read is enough.)
game_data_inf = pd.read_csv('game_data.csv')

# model_features.txt holds a pickled object despite its extension, hence the binary mode.
# SECURITY: pickle.load can execute arbitrary code - only load files produced by this project.
with open("model_features.txt", "rb") as fp:   # Unpickling
    model_features = pickle.load(fp)

# Restore the trained CatBoost classifier from disk
model = CatBoostClassifier()
model.load_model("catboost_model.cbm")

In [None]:
# With the trained model in hand, score every ordered pairing of teams;
# the resulting matrix is what the ranking steps consume

team_ids = worlds_team_ids

inference_data_generator = InferenceDataGenerator(game_data_inf, model_features)
X_inference = inference_data_generator.get_inference_data_by_team_id(team_ids)

## Probabilistic prediction
n_teams = len(team_ids)
# Column 1 of predict_proba: one value per row of X_inference
# (presumably the probability that the first team of the pairing wins - confirm against the training label)
pairwise_win_probs = model.predict_proba(X_inference)[:, 1]
# Lay the flat vector out as an N x N matrix, filled row by row, where N = len(team_ids)
tournament_preds = pairwise_win_probs.reshape(n_teams, n_teams)
# Flip the sign of every entry below 0.5 (e.g. 0.3 becomes -0.3)
# NOTE(review): this maps 0.1 to -0.1 and 0.4 to -0.4, so a near-certain loss gets the smaller
# magnitude - confirm this is the intended scoring
below_half = tournament_preds < 0.5
tournament_preds[below_half] *= -1
# A team never plays itself: zero out the diagonal
np.fill_diagonal(tournament_preds, 0.0)

## Deterministic prediction (alternative, kept for reference)
# One hard prediction per pairing, reshaped into the same N x N layout
# tournament_preds = model.predict(X_inference)
# # Format the predictions into an N x N matrix (fill by row)
# tournament_preds = tournament_preds.reshape(len(team_ids), len(team_ids))
# # Convert the 0's into -1 (means they lost)
# tournament_preds[tournament_preds==0] = -1
# # Mark the diagonal as 0's
# np.fill_diagonal(tournament_preds, 0.0)


In [None]:
from data_processing.utils.serialrank import SerialRank

# Optional: rank with SerialRank, a spectral ranking algorithm.
# The idea behind spectral ranking is that teams that win/lose against similar opponents
# should end up with similar ranks.
# Empirically, this tends to not work as well as the probabilistic predictions.
serial_rank = SerialRank(tournament_preds)
serial_rank.fit()
team_scores = serial_rank.r.squeeze()

# Attach team names to the scores and order the table by ascending score
team_names = [data_aggregator.teams_dict[str(team_id)] for team_id in team_ids]
team_ranks = pd.DataFrame({'team_name': team_names, 'score': team_scores})
team_ranks = team_ranks.sort_values(by='score', ascending=True)
team_ranks

In [None]:
# Alternative ranking: add up each team's row of pairwise predictions.
# This ignores who beats whom and simply puts the winningest teams at the top.
team_scores = tournament_preds.sum(axis=1)
team_names = [data_aggregator.teams_dict[str(team_id)] for team_id in team_ids]
team_ranks = pd.DataFrame({'team_name': team_names, 'score': team_scores})
# Highest total first
team_ranks = team_ranks.sort_values(by='score', ascending=False)
team_ranks

In [None]:
# Below is the logic for computing per-team Shapley values

# shap_values is consumed in consecutive blocks of N_teams rows, one block per team
# (block k = the pairings of team k against every team). Each block is averaged
# column-wise into a single row of feature attributions for that team.
#
# The team's pairing with itself is left out of the average. Within block k that pairing
# is row k (the diagonal of the N x N layout), not row 0: skipping row 0 of every block
# would drop the game against the first team and keep the self-pairing for every k >= 1.
# NOTE(review): this assumes shap_values rows follow the same row-major (team, opponent)
# order as X_inference, which is the order the prediction cell relies on when it reshapes
# to N x N and zeroes the diagonal - confirm where shap_values is computed.
N_teams = len(team_ids)
block_means = []
for team_idx, block_start in enumerate(range(0, len(shap_values), N_teams)):
    block = np.asarray(shap_values[block_start:block_start + N_teams].values)
    # Boolean mask that keeps every row of the block except the self-pairing
    is_opponent = np.arange(len(block)) != team_idx
    block_means.append(block[is_opponent].mean(axis=0))
# Stack once at the end: avoids re-copying the growing array on every iteration and
# always yields a 2-D (teams x features) result, even for a single block
team_shap_values = np.vstack(block_means)

# Label the shap values with the team names and feature names
# team_shap_values = pd.DataFrame(team_shap_values, columns=model_features, index=team_names)
team_shap_values = pd.DataFrame(team_shap_values, columns=X_inference.columns, index=team_names)

# shap_features_to_exclude = ['team_1', 'team_2', 'outcome_domestic', 'outcome_international', 'domestic_game_ind', 'eliteLeague_1', 'eliteLeague_2', 
#     'majorLeague_1', 'majorLeague_2', 'eSportsLeague_1', 'eSportsLeague_2', 'year', 'team_share_of_totalGold_at_20', 'team_share_of_totalGold_at_game_end',
#     'team_share_of_towerKills_at_20', 'team_share_of_towerKills_at_game_end', 'team_share_of_VISION_SCORE_at_game_end', 'support_NEUTRAL_MINIONS_KILLED_at_30']
# shap_features_to_exclude = ['team_1', 'team_2']
shap_features_to_exclude = []
# Drop the features that are in the shap_features_to_exclude list (not interpretable)
team_shap_values = team_shap_values.drop(shap_features_to_exclude, axis=1)

# For each team, collect the names of its most positive and most negative features into the
# columns ['top_1_pos', ..., 'top_5_pos', 'top_1_neg', ..., 'top_5_neg'] (for the default n_top=5)
def get_top_features(row, n_top=5):
    """Return the labels of the ``n_top`` largest and ``n_top`` smallest entries of ``row``.

    Args:
        row: pandas Series of numeric values indexed by feature name.
        n_top: How many features to report on each side. Defaults to 5.

    Returns:
        pandas Series indexed ``top_1_pos`` .. ``top_{n_top}_pos`` followed by
        ``top_1_neg`` .. ``top_{n_top}_neg``. The ``pos`` entries are ordered from the
        largest value down, the ``neg`` entries from the smallest value up. When ``row``
        has fewer than ``n_top`` entries the remaining slots are filled with ``None``.
    """
    # Slice the sorted index (not the Series) so the cut is always positional
    top_pos = row.sort_values(ascending=False).index[:n_top].tolist()
    top_neg = row.sort_values(ascending=True).index[:n_top].tolist()

    # Pad short results so every row produces the same set of output columns
    # instead of failing on a length mismatch
    top_pos += [None] * (n_top - len(top_pos))
    top_neg += [None] * (n_top - len(top_neg))

    slots = range(1, n_top + 1)
    labels = [f'top_{i}_pos' for i in slots] + [f'top_{i}_neg' for i in slots]
    return pd.Series(top_pos + top_neg, index=labels)

# Run get_top_features on every row of team_shap_values (axis=1 hands each row to the
# function as a Series), giving one row of feature names per team
team_top_features = team_shap_values.apply(get_top_features, axis=1)

# Bare expression: as the last statement of the notebook cell it displays the table
team_top_features