In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve
import pickle

In [64]:
# Colab-only step: mount Google Drive at /content/drive so the project folder
# referenced by later cells can be read from and written to.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [65]:
# Folder on the mounted Drive that holds the input and output files of this notebook.
BASE_FILE_PATH = '/content/drive/MyDrive/Colab Notebooks/Final Project'

# List the folder contents to confirm the expected files are available.
project_files = os.listdir(BASE_FILE_PATH)
print(project_files)

['Keys', 'Setting up connection with Git-Hub.ipynb', 'entries.csv', 'summoner_details.csv', 'updated_dataset.csv', 'filtered_dataset.csv', 'filtered_dataset.gsheet', 'filtered_+30_matches_dataset.csv', 'EDA.ipynb', 'merged_dataset.csv', 'Merging match with entries and classifying accounts.ipynb', 'merged_dataset.gsheet', 'Feature engineering and cleaning vol 1.ipynb', 'dataset_after_normalization.csv', 'keepign only win feature .ipynb', 'Models without data normalization.ipynb', 'Normalization_Parameters.csv', 'Normalization_Parameters_by_Game_Mode.csv', 'Normalization_Parameters_by_Team_Position.csv', 'summoner_ids_used_in_model.csv', 'gbm_model.pkl', 'train_df.csv', 'test_df.csv', 'validation_df.csv']


In [66]:
# Load the training and test splits stored in the project folder.
train_csv_path = os.path.join(BASE_FILE_PATH, 'train_df.csv')
test_csv_path = os.path.join(BASE_FILE_PATH, 'test_df.csv')

df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [67]:
# Inspect column names, dtypes and non-null counts of the freshly loaded training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56434 entries, 0 to 56433
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   game_duration         56434 non-null  int64 
 1   game_mode             56434 non-null  object
 2   summoner_id           56434 non-null  object
 3   kills                 56434 non-null  int64 
 4   deaths                56434 non-null  int64 
 5   assists               56434 non-null  int64 
 6   total_damage_dealt    56434 non-null  int64 
 7   total_damage_taken    56434 non-null  int64 
 8   gold_earned           56434 non-null  int64 
 9   total_minions_killed  56434 non-null  int64 
 10  game_creation_dt      56434 non-null  object
 11  binary_time_group     56434 non-null  object
dtypes: int64(8), object(4)
memory usage: 5.2+ MB


In [68]:
# One-hot encode game_mode into one indicator column per mode.
# dtype=bool is pinned explicitly because later cells use these columns directly
# as row masks (e.g. df.loc[df['game_mode_CHERRY'], ...]). With pandas < 2.0 the
# default dummy dtype is uint8, and .loc would interpret 0/1 integers as index
# labels instead of a boolean mask.
dummies = pd.get_dummies(df['game_mode'], prefix='game_mode', dtype=bool)

# Append the indicator columns and remove the original categorical column.
df = pd.concat([df, dummies], axis=1).drop(columns='game_mode')

In [69]:
# Winsorize the features of CHERRY matches only, then z-score every feature
# separately within each game mode.
features_to_winsorize = [
    'kills', 'deaths', 'assists', 'gold_earned', 'game_duration',
    'total_damage_dealt', 'total_damage_taken', 'total_minions_killed'
]
game_modes = ['game_mode_ARAM', 'game_mode_CHERRY', 'game_mode_CLASSIC']

# --- Step 1: clip the lowest and highest 1% of each feature, CHERRY rows only ---
# Guarded so a dataset that contains no CHERRY matches (no indicator column, or
# an all-False one) skips this step instead of failing.
if 'game_mode_CHERRY' in df.columns:
    cherry_mask = df['game_mode_CHERRY'].astype(bool)
    if cherry_mask.any():
        for feature in features_to_winsorize:
            clipped = winsorize(df.loc[cherry_mask, feature].to_numpy(), limits=[0.01, 0.01])
            # winsorize returns a masked array; write the plain values back in
            # place so no temporary "<feature>_winsorized" columns are needed.
            df.loc[cherry_mask, feature] = np.asarray(clipped)

# --- Step 2: standardize every feature within each game mode ---
for mode in game_modes:
    if mode not in df.columns:
        continue  # this mode never occurs in the data
    # astype(bool) guarantees .loc receives a boolean mask, never 0/1 labels.
    mode_mask = df[mode].astype(bool)
    if not mode_mask.any():
        continue  # no rows for this mode, nothing to fit

    for feature in features_to_winsorize:
        scaled = StandardScaler().fit_transform(df.loc[mode_mask, [feature]])
        # fit_transform returns an (n, 1) array; flatten it to match the
        # single-column selection on the left-hand side.
        df.loc[mode_mask, f'{feature}_normalized'] = scaled.ravel()

# Check the dataframe info and head to confirm the changes
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56434 entries, 0 to 56433
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   game_duration                    56434 non-null  int64  
 1   summoner_id                      56434 non-null  object 
 2   kills                            56434 non-null  int64  
 3   deaths                           56434 non-null  int64  
 4   assists                          56434 non-null  int64  
 5   total_damage_dealt               56434 non-null  int64  
 6   total_damage_taken               56434 non-null  int64  
 7   gold_earned                      56434 non-null  int64  
 8   total_minions_killed             56434 non-null  int64  
 9   game_creation_dt                 56434 non-null  object 
 10  binary_time_group                56434 non-null  object 
 11  game_mode_ARAM                   56434 non-null  bool   
 12  game_mode_CHERRY  

Unnamed: 0,game_duration,summoner_id,kills,deaths,assists,total_damage_dealt,total_damage_taken,gold_earned,total_minions_killed,game_creation_dt,...,game_mode_CHERRY,game_mode_CLASSIC,kills_normalized,deaths_normalized,assists_normalized,gold_earned_normalized,game_duration_normalized,total_damage_dealt_normalized,total_damage_taken_normalized,total_minions_killed_normalized
0,2059,-1ddPTjEGbdaQR2HcKWdP0qkBfSjsyJcFot0WGcyGxkjSy4,2,7,6,270069,29008,12840,24,2023-05-04 22:29:09.942,...,False,True,-0.871619,0.378477,-0.250063,0.297558,0.695226,1.235736,0.156215,-1.171882
1,916,-1ddPTjEGbdaQR2HcKWdP0qkBfSjsyJcFot0WGcyGxkjSy4,5,0,0,108790,10054,8147,24,2023-05-04 23:10:10.125,...,False,True,-0.304148,-1.672292,-1.302698,-0.763869,-1.566604,-0.427659,-1.133083,-1.171882
2,1779,-1ddPTjEGbdaQR2HcKWdP0qkBfSjsyJcFot0WGcyGxkjSy4,16,5,15,280417,32597,16849,19,2023-05-05 01:12:30.961,...,False,True,1.776578,-0.207457,1.32889,1.204283,0.141147,1.342463,0.400348,-1.229911
3,918,-1ddPTjEGbdaQR2HcKWdP0qkBfSjsyJcFot0WGcyGxkjSy4,3,0,1,115663,11195,6475,8,2023-05-06 06:05:16.903,...,False,True,-0.682462,-1.672292,-1.127259,-1.142029,-1.562646,-0.356772,-1.05547,-1.357575
4,1880,-1ddPTjEGbdaQR2HcKWdP0qkBfSjsyJcFot0WGcyGxkjSy4,2,9,12,249933,34650,12840,102,2023-05-06 06:28:50.651,...,False,True,-0.871619,0.964411,0.802572,0.297558,0.341011,1.028058,0.539998,-0.266629


In [70]:
# Export the normalization parameters per game mode so that matches of new
# players can be scaled with the same statistics at prediction time.

features = ['kills', 'deaths', 'assists', 'gold_earned', 'game_duration',
            'total_damage_dealt', 'total_damage_taken', 'total_minions_killed']

game_modes = ['game_mode_ARAM', 'game_mode_CHERRY', 'game_mode_CLASSIC']

# One record per (game mode, feature). Records are collected in a list and
# converted to a DataFrame once, which avoids re-copying a growing DataFrame
# with pd.concat on every iteration.
param_records = []

for mode in game_modes:
    df_mode = df[df[mode] == 1]  # rows played in the current game mode
    if df_mode.shape[0] == 0:
        continue  # no matches in this mode, nothing to fit

    for feature in features:
        # Only the fitted statistics are needed here, so fit() is enough.
        scaler = StandardScaler().fit(df_mode[[feature]])
        param_records.append({
            'Game_Mode': mode,
            'Feature': feature,
            'Mean': scaler.mean_[0],     # mean of the feature within this mode
            'StdDev': scaler.scale_[0],  # divisor StandardScaler applies for this feature
        })

# Passing `columns` fixes the column order and keeps the header intact even
# when no record was produced.
norm_params = pd.DataFrame(param_records, columns=['Game_Mode', 'Feature', 'Mean', 'StdDev'])

# Export to CSV
norm_params.to_csv(os.path.join(BASE_FILE_PATH, 'Normalization_Parameters_by_Game_Mode.csv'), index=False)

In [71]:
# Inspect the training frame again (columns, dtypes, non-null counts).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56434 entries, 0 to 56433
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   game_duration                    56434 non-null  int64  
 1   summoner_id                      56434 non-null  object 
 2   kills                            56434 non-null  int64  
 3   deaths                           56434 non-null  int64  
 4   assists                          56434 non-null  int64  
 5   total_damage_dealt               56434 non-null  int64  
 6   total_damage_taken               56434 non-null  int64  
 7   gold_earned                      56434 non-null  int64  
 8   total_minions_killed             56434 non-null  int64  
 9   game_creation_dt                 56434 non-null  object 
 10  binary_time_group                56434 non-null  object 
 11  game_mode_ARAM                   56434 non-null  bool   
 12  game_mode_CHERRY  

In [72]:
#create the time_segment feature that I will use then to split all the normalized features into 3 different groups initial, mid and late
df['game_creation_dt'] = pd.to_datetime(df['game_creation_dt'])

# Define bins based on quantiles
quantiles = df['game_creation_dt'].quantile([0, 0.33, 0.67, 1]).to_list()
df['time_segment'] = pd.cut(df['game_creation_dt'], bins=quantiles, labels=['initial', 'mid', 'late'], include_lowest=True)

In [73]:
# Inspect the frame after adding the time_segment column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56434 entries, 0 to 56433
Data columns (total 23 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   game_duration                    56434 non-null  int64         
 1   summoner_id                      56434 non-null  object        
 2   kills                            56434 non-null  int64         
 3   deaths                           56434 non-null  int64         
 4   assists                          56434 non-null  int64         
 5   total_damage_dealt               56434 non-null  int64         
 6   total_damage_taken               56434 non-null  int64         
 7   gold_earned                      56434 non-null  int64         
 8   total_minions_killed             56434 non-null  int64         
 9   game_creation_dt                 56434 non-null  datetime64[ns]
 10  binary_time_group                56434 non-null  object   

In [74]:
# Remove the raw match statistics, the timestamp and the game-mode indicators;
# the normalized features and time_segment remain for the aggregation below.
raw_stat_columns = [
    'game_duration', 'kills', 'deaths', 'assists',
    'total_damage_dealt', 'total_damage_taken',
    'gold_earned', 'total_minions_killed',
]
game_mode_columns = ['game_mode_ARAM', 'game_mode_CHERRY', 'game_mode_CLASSIC']
columns_to_drop = ['game_creation_dt'] + raw_stat_columns + game_mode_columns

df.drop(columns=columns_to_drop, inplace=True)

In [75]:
# Verify which columns remain after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56434 entries, 0 to 56433
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype   
---  ------                           --------------  -----   
 0   summoner_id                      56434 non-null  object  
 1   binary_time_group                56434 non-null  object  
 2   kills_normalized                 56434 non-null  float64 
 3   deaths_normalized                56434 non-null  float64 
 4   assists_normalized               56434 non-null  float64 
 5   gold_earned_normalized           56434 non-null  float64 
 6   game_duration_normalized         56434 non-null  float64 
 7   total_damage_dealt_normalized    56434 non-null  float64 
 8   total_damage_taken_normalized    56434 non-null  float64 
 9   total_minions_killed_normalized  56434 non-null  float64 
 10  time_segment                     56434 non-null  category
dtypes: category(1), float64(8), object(2)
memory usage: 4.4+ MB


In [76]:
# Aggregate to one row per player: the mean of every numeric feature is taken
# per (summoner_id, time_segment) and the segments are then spread into columns.
numeric_feature_names = df.select_dtypes(include=[np.number]).columns.tolist()
df_numeric = df[['summoner_id', 'time_segment'] + numeric_feature_names]

# observed=True restricts the result to segment categories that actually occur
# for a player (and avoids the pandas FutureWarning for categorical keys).
grouped_df = df_numeric.groupby(['summoner_id', 'time_segment'], observed=True).mean()

# Move the time_segment index level into the columns -> (feature, segment) pairs.
pivot_df = grouped_df.unstack(level='time_segment')

# Flatten the two-level column index into "<feature>_<segment>" names.
pivot_df.columns = ['_'.join(pair).strip() for pair in pivot_df.columns.to_flat_index()]

# Turn summoner_id back into a regular column, and replace the NaNs left by
# segments in which a player has no matches with 0.
pivot_df = pivot_df.reset_index().fillna(0)

# Display the first few rows to verify
print(pivot_df.head())

                                         summoner_id  \
0    -1ddPTjEGbdaQR2HcKWdP0qkBfSjsyJcFot0WGcyGxkjSy4   
1  -2ksdxdFEy1AnyqNsmO-RI_2n8ic7mAzN5-LYGvttHTwY2...   
2   -3octyr43CfmenR8kI_NBmo2L1Z7EuzW7TyGxq23Gn4yvB6x   
3   -48H0zwIE1AchxYRFBcFn7fSEEr6OplutJMpZ4QFhSWflnQA   
4   -4le7Ki3BnzKo2nGERmm8V8LIzkJuUM3yK_6CmDQoFHC3nXo   

   kills_normalized_initial  kills_normalized_mid  kills_normalized_late  \
0                  0.652763              0.310612               0.168744   
1                  0.357901              0.091707              -0.049193   
2                 -0.769496             -0.136008              -0.309265   
3                  0.074166             -0.259615              -0.648333   
4                 -0.314281              0.452480               0.000000   

   deaths_normalized_initial  deaths_normalized_mid  deaths_normalized_late  \
0                   0.068276              -0.134215                1.110894   
1                   1.110894              -0.432

In [77]:
# Build the summoner_id -> binary_time_group lookup from the match-level frame.
binary_time_mapping = df[['summoner_id', 'binary_time_group']].drop_duplicates()

# Attach the target to the per-player feature table.
# validate='one_to_one' makes the merge raise if a summoner_id appears more than
# once on either side. Without it, a player carrying two different
# binary_time_group values would silently be duplicated in the training table.
pivot_df = pivot_df.merge(
    binary_time_mapping,
    on='summoner_id',
    how='left',
    validate='one_to_one',
)

# Display the first few rows to verify the merge
print(pivot_df.head())

                                         summoner_id  \
0    -1ddPTjEGbdaQR2HcKWdP0qkBfSjsyJcFot0WGcyGxkjSy4   
1  -2ksdxdFEy1AnyqNsmO-RI_2n8ic7mAzN5-LYGvttHTwY2...   
2   -3octyr43CfmenR8kI_NBmo2L1Z7EuzW7TyGxq23Gn4yvB6x   
3   -48H0zwIE1AchxYRFBcFn7fSEEr6OplutJMpZ4QFhSWflnQA   
4   -4le7Ki3BnzKo2nGERmm8V8LIzkJuUM3yK_6CmDQoFHC3nXo   

   kills_normalized_initial  kills_normalized_mid  kills_normalized_late  \
0                  0.652763              0.310612               0.168744   
1                  0.357901              0.091707              -0.049193   
2                 -0.769496             -0.136008              -0.309265   
3                  0.074166             -0.259615              -0.648333   
4                 -0.314281              0.452480               0.000000   

   deaths_normalized_initial  deaths_normalized_mid  deaths_normalized_late  \
0                   0.068276              -0.134215                1.110894   
1                   1.110894              -0.432

In [78]:
# The 20 selected features, listed per time segment.
late_features = [
    'deaths_normalized_late', 'kills_normalized_late', 'assists_normalized_late',
    'game_duration_normalized_late', 'total_minions_killed_normalized_late',
    'gold_earned_normalized_late', 'total_damage_dealt_normalized_late',
    'total_damage_taken_normalized_late',
]
mid_features = [
    'deaths_normalized_mid', 'kills_normalized_mid', 'assists_normalized_mid',
    'game_duration_normalized_mid', 'total_minions_killed_normalized_mid',
    'gold_earned_normalized_mid', 'total_damage_dealt_normalized_mid',
    'total_damage_taken_normalized_mid',
]
initial_features = [
    'deaths_normalized_initial', 'kills_normalized_initial',
    'assists_normalized_initial', 'game_duration_normalized_initial',
]
top_features = late_features + mid_features + initial_features

# Keep only the selected features plus the target variable.
selected_columns = top_features + ['binary_time_group']
top_features_df = pivot_df[selected_columns]

In [79]:
# Summarize the selected-feature frame (columns, dtypes, non-null counts).
top_features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   deaths_normalized_late                1000 non-null   float64
 1   kills_normalized_late                 1000 non-null   float64
 2   assists_normalized_late               1000 non-null   float64
 3   game_duration_normalized_late         1000 non-null   float64
 4   total_minions_killed_normalized_late  1000 non-null   float64
 5   gold_earned_normalized_late           1000 non-null   float64
 6   total_damage_dealt_normalized_late    1000 non-null   float64
 7   total_damage_taken_normalized_late    1000 non-null   float64
 8   deaths_normalized_mid                 1000 non-null   float64
 9   kills_normalized_mid                  1000 non-null   float64
 10  assists_normalized_mid                1000 non-null   float64
 11  game_duration_norm

In [80]:
# Write the selected features and target to the project folder, without the index column.
processed_train_path = os.path.join(BASE_FILE_PATH, 'processed_train_df.csv')
top_features_df.to_csv(processed_train_path, index=False)