In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve
import pickle

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Mount Google Drive at /content/drive so files stored there can be read and written from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Project folder on the mounted Drive; the CSV paths used in this notebook are built from it.
BASE_FILE_PATH = '/content/drive/MyDrive/Colab Notebooks/Final Project'
# Print the folder contents as a quick check that the mount worked and the expected files are there.
print(os.listdir(BASE_FILE_PATH))

['Keys', 'Setting up connection with Git-Hub.ipynb', 'entries.csv', 'summoner_details.csv', 'updated_dataset.csv', 'filtered_dataset.csv', 'filtered_dataset.gsheet', 'filtered_+30_matches_dataset.csv', 'EDA.ipynb', 'merged_dataset.csv', 'Merging match with entries and classifying accounts.ipynb', 'merged_dataset.gsheet', 'Feature engineering and cleaning vol 1.ipynb', 'dataset_after_normalization.csv', 'keepign only win feature .ipynb', 'Models without data normalization.ipynb', 'Normalization_Parameters.csv', 'Normalization_Parameters_by_Team_Position.csv', 'summoner_ids_used_in_model.csv', 'gbm_model.pkl', 'train_df.csv', 'test_df.csv', 'validation_df.csv', 'processed_train_df.csv', 'Normalization_Parameters_by_Game_Mode.csv']


In [4]:
# Resolve both input files inside the project folder, then load them:
#   - test_df.csv: the rows processed by this notebook
#   - Normalization_Parameters_by_Game_Mode.csv: statistics consumed by the normalization cell
test_split_path = os.path.join(BASE_FILE_PATH, 'test_df.csv')
normalization_params_path = os.path.join(BASE_FILE_PATH, 'Normalization_Parameters_by_Game_Mode.csv')

df = pd.read_csv(test_split_path)
normalization_by_game_df = pd.read_csv(normalization_params_path)

In [5]:
# Show column names, dtypes and non-null counts of the freshly loaded data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11232 entries, 0 to 11231
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   game_duration         11232 non-null  int64 
 1   game_mode             11232 non-null  object
 2   summoner_id           11232 non-null  object
 3   kills                 11232 non-null  int64 
 4   deaths                11232 non-null  int64 
 5   assists               11232 non-null  int64 
 6   total_damage_dealt    11232 non-null  int64 
 7   total_damage_taken    11232 non-null  int64 
 8   gold_earned           11232 non-null  int64 
 9   total_minions_killed  11232 non-null  int64 
 10  game_creation_dt      11232 non-null  object
 11  binary_time_group     11232 non-null  object
dtypes: int64(8), object(4)
memory usage: 1.0+ MB


In [6]:
# One-hot encode game_mode into game_mode_<MODE> indicator columns.
# dtype=bool is requested explicitly because later cells use these columns
# directly as boolean row masks inside df.loc[...]; pandas versions before 2.0
# default to uint8 indicators, which .loc would interpret as labels, not a mask.
dummies = pd.get_dummies(df['game_mode'], prefix='game_mode', dtype=bool)

# Append the indicator columns and remove the original categorical column.
df = pd.concat([df, dummies], axis=1).drop(columns='game_mode')

In [7]:
# Winsorize the raw features for CHERRY game-mode rows only (the lowest and
# highest 1% of values are replaced by the nearest retained value), before all
# features are normalized per game mode.
# NOTE(review): the clipping bounds are computed from the rows of this
# DataFrame itself - confirm this matches how the training split was processed.
features_to_winsorize = [
    'kills', 'deaths', 'assists', 'gold_earned', 'game_duration',
    'total_damage_dealt', 'total_damage_taken', 'total_minions_killed'
]

# Boolean mask of CHERRY rows; all-False when this DataFrame has no CHERRY
# indicator column, so the cell does not fail with a KeyError.
if 'game_mode_CHERRY' in df.columns:
    cherry_mask = df['game_mode_CHERRY'].astype(bool)
else:
    cherry_mask = pd.Series(False, index=df.index)

df_cherry = df[cherry_mask].copy()

# There is nothing to clip without CHERRY rows, so skip rather than calling
# winsorize on an empty selection.
if not df_cherry.empty:
    for feature in features_to_winsorize:
        # winsorize returns a masked array; keep only the plain values.
        winsorized_values = np.asarray(
            winsorize(df_cherry[feature].to_numpy(), limits=[0.01, 0.01])
        )
        df_cherry[f'{feature}_winsorized'] = winsorized_values
        # Positional write-back is safe: df_cherry holds exactly the masked
        # rows of df, in the same order.
        df.loc[cherry_mask, feature] = winsorized_values

In [8]:
# Normalize each feature separately per game mode: (value - Mean) / StdDev.
# Every row of normalization_by_game_df supplies 'Feature' (the raw column to
# normalize), 'Game_Mode' (the name of the indicator column in df that selects
# the rows) and the 'Mean' / 'StdDev' to apply to those rows.
for feature, game_mode, mean_val, stddev_val in zip(
    normalization_by_game_df['Feature'],
    normalization_by_game_df['Game_Mode'],
    normalization_by_game_df['Mean'],
    normalization_by_game_df['StdDev'],
):
    # Only features listed in features_to_winsorize are normalized. Game modes
    # without an indicator column in df are skipped instead of raising KeyError.
    if feature not in features_to_winsorize or game_mode not in df.columns:
        continue

    # Cast so the indicator always acts as a boolean mask in .loc, whatever
    # dtype it is stored as.
    mode_mask = df[game_mode].astype(bool)
    if mode_mask.any():  # Only write when this game mode actually has rows
        df.loc[mode_mask, f'{feature}_normalized'] = (
            (df.loc[mode_mask, feature] - mean_val) / stddev_val
        )

In [9]:
# Check the DataFrame after normalization: the *_normalized columns should now be present.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11232 entries, 0 to 11231
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   game_duration                    11232 non-null  int64  
 1   summoner_id                      11232 non-null  object 
 2   kills                            11232 non-null  int64  
 3   deaths                           11232 non-null  int64  
 4   assists                          11232 non-null  int64  
 5   total_damage_dealt               11232 non-null  int64  
 6   total_damage_taken               11232 non-null  int64  
 7   gold_earned                      11232 non-null  int64  
 8   total_minions_killed             11232 non-null  int64  
 9   game_creation_dt                 11232 non-null  object 
 10  binary_time_group                11232 non-null  object 
 11  game_mode_ARAM                   11232 non-null  bool   
 12  game_mode_CHERRY  

In [10]:
# Build the time_segment feature, later used to split every normalized feature
# into three groups: initial, mid and late.
creation_ts = pd.to_datetime(df['game_creation_dt'])
df['game_creation_dt'] = creation_ts

# Bin edges are the 0 / 33 / 67 / 100 % quantiles of all creation timestamps in df.
quantiles = creation_ts.quantile([0, 0.33, 0.67, 1]).to_list()

# include_lowest=True keeps the earliest timestamp inside the first bin.
df['time_segment'] = pd.cut(
    creation_ts,
    bins=quantiles,
    labels=['initial', 'mid', 'late'],
    include_lowest=True,
)

In [11]:
# Check the DataFrame after adding time_segment and converting game_creation_dt to datetime.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11232 entries, 0 to 11231
Data columns (total 23 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   game_duration                    11232 non-null  int64         
 1   summoner_id                      11232 non-null  object        
 2   kills                            11232 non-null  int64         
 3   deaths                           11232 non-null  int64         
 4   assists                          11232 non-null  int64         
 5   total_damage_dealt               11232 non-null  int64         
 6   total_damage_taken               11232 non-null  int64         
 7   gold_earned                      11232 non-null  int64         
 8   total_minions_killed             11232 non-null  int64         
 9   game_creation_dt                 11232 non-null  datetime64[ns]
 10  binary_time_group                11232 non-null  object   

In [12]:
# Remove the creation timestamp, the un-normalized feature columns and the
# game-mode indicator columns.
df = df.drop(columns=[
    'game_creation_dt',
    'game_duration', 'kills', 'deaths', 'assists',
    'total_damage_dealt', 'total_damage_taken',
    'gold_earned', 'total_minions_killed',
    'game_mode_ARAM', 'game_mode_CHERRY', 'game_mode_CLASSIC',
])

In [13]:
# Confirm which columns remain after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11232 entries, 0 to 11231
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype   
---  ------                           --------------  -----   
 0   summoner_id                      11232 non-null  object  
 1   binary_time_group                11232 non-null  object  
 2   kills_normalized                 11232 non-null  float64 
 3   deaths_normalized                11232 non-null  float64 
 4   assists_normalized               11232 non-null  float64 
 5   gold_earned_normalized           11232 non-null  float64 
 6   game_duration_normalized         11232 non-null  float64 
 7   total_damage_dealt_normalized    11232 non-null  float64 
 8   total_damage_taken_normalized    11232 non-null  float64 
 9   total_minions_killed_normalized  11232 non-null  float64 
 10  time_segment                     11232 non-null  category
dtypes: category(1), float64(8), object(2)
memory usage: 888.7+ KB


In [14]:
# Numeric columns are the only ones that can be averaged; the two grouping keys
# are added back explicitly.
feature_columns = df.select_dtypes(include=[np.number]).columns.tolist()

# Mean of every numeric feature per (summoner_id, time_segment).
# observed=True limits the result to segment categories that actually occur
# (and avoids the pandas FutureWarning for categorical group keys).
grouped_df = (
    df[['summoner_id', 'time_segment', *feature_columns]]
    .groupby(['summoner_id', 'time_segment'], observed=True)
    .mean()
)

# Move time_segment from the row index into the columns: one row per summoner.
pivot_df = grouped_df.unstack(level='time_segment')

# Collapse the two-level column labels into single '<feature>_<segment>' names.
pivot_df.columns = [
    '_'.join(level_values).strip()
    for level_values in pivot_df.columns.to_flat_index()
]

# Bring summoner_id back as a regular column, then replace the NaNs left by
# summoners that have no rows in a segment with 0.
pivot_df = pivot_df.reset_index().fillna(0)

# Quick look at the result
print(pivot_df.head())

                                        summoner_id  kills_normalized_initial  \
0   -57abaqo0rKW30x3I3SoJuDtkU0-vqFOXNNhomuF3lbHj-g                  0.151674   
1  -fT4ReY0JD9vVgvXLmQTpu3a1m3hjNbB4QTzNyJw-7D-N5KN                 -0.686245   
2   -p1Bu6m1y9w2cFV4Y_gAylvJQlF_JeyOyZfDBAbe5SI9uzc                  0.720623   
3  00FwXWjDCBKIRCcSI1h8fPgZQMpV75zcXt9SmKtBjKtorUQU                 -0.068594   
4  0O87AgmIgdMyZ91rVXS7x1jW-rymZsK-lte5yjinB3Fg9L6a                  0.036279   

   kills_normalized_mid  kills_normalized_late  deaths_normalized_initial  \
0             -0.098102              -0.178043                  -0.314800   
1              0.000000              -0.850601                  -0.441831   
2              0.057717               0.007476                  -0.120132   
3             -0.430253               0.000000                   0.649333   
4              0.704689               0.000000                  -0.027053   

   deaths_normalized_mid  deaths_normalized_late  

In [15]:
# Distinct (summoner_id, binary_time_group) pairs taken from the row-level data.
binary_time_mapping = df[['summoner_id', 'binary_time_group']].drop_duplicates()

# Attach binary_time_group to the per-summoner features.
# validate='one_to_one' makes pandas raise a MergeError when a summoner_id is
# not unique on either side (for example a summoner carrying two different
# binary_time_group values) instead of silently duplicating feature rows.
pivot_df = pivot_df.merge(
    binary_time_mapping,
    on='summoner_id',
    how='left',
    validate='one_to_one',
)

# Display the first few rows to verify the merge
print(pivot_df.head())

                                        summoner_id  kills_normalized_initial  \
0   -57abaqo0rKW30x3I3SoJuDtkU0-vqFOXNNhomuF3lbHj-g                  0.151674   
1  -fT4ReY0JD9vVgvXLmQTpu3a1m3hjNbB4QTzNyJw-7D-N5KN                 -0.686245   
2   -p1Bu6m1y9w2cFV4Y_gAylvJQlF_JeyOyZfDBAbe5SI9uzc                  0.720623   
3  00FwXWjDCBKIRCcSI1h8fPgZQMpV75zcXt9SmKtBjKtorUQU                 -0.068594   
4  0O87AgmIgdMyZ91rVXS7x1jW-rymZsK-lte5yjinB3Fg9L6a                  0.036279   

   kills_normalized_mid  kills_normalized_late  deaths_normalized_initial  \
0             -0.098102              -0.178043                  -0.314800   
1              0.000000              -0.850601                  -0.441831   
2              0.057717               0.007476                  -0.120132   
3             -0.430253               0.000000                   0.649333   
4              0.704689               0.000000                  -0.027053   

   deaths_normalized_mid  deaths_normalized_late  

In [16]:
# The 20 features selected as most important, grouped here by time segment.
top_features = [
    # late segment
    'deaths_normalized_late', 'kills_normalized_late', 'assists_normalized_late',
    'game_duration_normalized_late', 'total_minions_killed_normalized_late',
    'gold_earned_normalized_late', 'total_damage_dealt_normalized_late',
    'total_damage_taken_normalized_late',
    # mid segment
    'deaths_normalized_mid', 'kills_normalized_mid', 'assists_normalized_mid',
    'game_duration_normalized_mid', 'total_minions_killed_normalized_mid',
    'gold_earned_normalized_mid', 'total_damage_dealt_normalized_mid',
    'total_damage_taken_normalized_mid',
    # initial segment
    'deaths_normalized_initial', 'kills_normalized_initial', 'assists_normalized_initial',
    'game_duration_normalized_initial',
]

# New DataFrame holding only the selected features plus the target variable.
selected_columns = top_features + ['binary_time_group']
top_features_df = pivot_df.loc[:, selected_columns]

In [17]:
# Verify the selected columns and that none of them contain nulls.
top_features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 21 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   deaths_normalized_late                200 non-null    float64
 1   kills_normalized_late                 200 non-null    float64
 2   assists_normalized_late               200 non-null    float64
 3   game_duration_normalized_late         200 non-null    float64
 4   total_minions_killed_normalized_late  200 non-null    float64
 5   gold_earned_normalized_late           200 non-null    float64
 6   total_damage_dealt_normalized_late    200 non-null    float64
 7   total_damage_taken_normalized_late    200 non-null    float64
 8   deaths_normalized_mid                 200 non-null    float64
 9   kills_normalized_mid                  200 non-null    float64
 10  assists_normalized_mid                200 non-null    float64
 11  game_duration_norma

In [18]:
# Write the selected features and target to processed_test_df.csv in the project folder (row index not written).
top_features_df.to_csv(os.path.join(BASE_FILE_PATH, 'processed_test_df.csv'), index=False)