In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve
import pickle

In [13]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# project files stored there can be reached through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
# Folder on the mounted Drive that every path in this notebook is built from.
BASE_FILE_PATH = '/content/drive/MyDrive/Colab Notebooks/Final Project'

# List the folder contents so the file names used below can be checked by eye.
project_files = os.listdir(BASE_FILE_PATH)
print(project_files)

['Keys', 'Setting up connection with Git-Hub.ipynb', 'entries.csv', 'summoner_details.csv', 'updated_dataset.csv', 'filtered_dataset.csv', 'filtered_dataset.gsheet', 'filtered_+30_matches_dataset.csv', 'EDA.ipynb', 'merged_dataset.csv', 'Merging match with entries and classifying accounts.ipynb', 'merged_dataset.gsheet', 'Feature engineering and cleaning vol 1.ipynb', 'dataset_after_normalization.csv', 'keepign only win feature .ipynb', 'Models without data normalization.ipynb', 'Normalization_Parameters.csv', 'Normalization_Parameters_by_Game_Mode.csv', 'Normalization_Parameters_by_Team_Position.csv', 'summoner_ids_used_in_model.csv', 'gbm_model.pkl']


In [15]:
# Read merged_dataset.csv from the project folder into the working frame `df`.
merged_dataset_path = os.path.join(BASE_FILE_PATH, 'merged_dataset.csv')
df = pd.read_csv(merged_dataset_path)

In [16]:
# Summarise the freshly loaded frame: row count, column names, non-null counts
# and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128021 entries, 0 to 128020
Data columns (total 27 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   match_id              128021 non-null  object 
 1   game_creation         128021 non-null  int64  
 2   game_duration         128021 non-null  int64  
 3   game_mode             128021 non-null  object 
 4   game_version          128021 non-null  object 
 5   map_id                128021 non-null  int64  
 6   summoner_id           128021 non-null  object 
 7   puuid                 128021 non-null  object 
 8   champion_id           128021 non-null  int64  
 9   kills                 128021 non-null  int64  
 10  deaths                128021 non-null  int64  
 11  assists               128021 non-null  int64  
 12  total_damage_dealt    128021 non-null  int64  
 13  total_damage_taken    128021 non-null  int64  
 14  gold_earned           128021 non-null  int64  
 15  

In [17]:
# Remove the columns this notebook does not need.
# NOTE: the drop is done in place on `df`, so running this cell a second time
# without reloading the CSV raises a KeyError (the columns are already gone).
columns_to_drop = [
    'match_id',
    'game_creation',
    'days_since_last_game',
    'game_duration_min',
    'win',
    'puuid',
    'map_id',
    'champion_id',
    'vision_score',
    'game_version',
    'tier',
    'rank',
    'time_group',
    'team_position',
    'time_segment',
]
df.drop(columns=columns_to_drop, inplace=True)

In [18]:
# Re-inspect `df` after the column drop to confirm which columns remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128021 entries, 0 to 128020
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   game_duration         128021 non-null  int64 
 1   game_mode             128021 non-null  object
 2   summoner_id           128021 non-null  object
 3   kills                 128021 non-null  int64 
 4   deaths                128021 non-null  int64 
 5   assists               128021 non-null  int64 
 6   total_damage_dealt    128021 non-null  int64 
 7   total_damage_taken    128021 non-null  int64 
 8   gold_earned           128021 non-null  int64 
 9   total_minions_killed  128021 non-null  int64 
 10  game_creation_dt      128021 non-null  object
 11  binary_time_group     128021 non-null  object
dtypes: int64(8), object(4)
memory usage: 11.7+ MB


In [19]:
# One-hot encode `game_mode`: build one indicator column per game mode, each
# named with the `game_mode` prefix.
dummies = pd.get_dummies(df['game_mode'], prefix='game_mode')

# Assemble the encoded frame: every column of `df` except the original
# `game_mode`, followed by the indicator columns. `df` itself is left untouched.
# NOTE(review): the cells that follow keep reading `df`, not `filtered_df`, so
# the encoded columns are not part of the split/exported data -- confirm that
# this is intended.
filtered_df = pd.concat([df.drop(columns='game_mode'), dummies], axis=1)

In [20]:
# Summarise `df` again (column names, non-null counts, dtypes).
# NOTE(review): this inspects `df`, which the one-hot encoding cell above does
# not modify; the encoded result lives in `filtered_df`. Presumably
# `filtered_df.info()` was meant here -- confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128021 entries, 0 to 128020
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   game_duration         128021 non-null  int64 
 1   game_mode             128021 non-null  object
 2   summoner_id           128021 non-null  object
 3   kills                 128021 non-null  int64 
 4   deaths                128021 non-null  int64 
 5   assists               128021 non-null  int64 
 6   total_damage_dealt    128021 non-null  int64 
 7   total_damage_taken    128021 non-null  int64 
 8   gold_earned           128021 non-null  int64 
 9   total_minions_killed  128021 non-null  int64 
 10  game_creation_dt      128021 non-null  object
 11  binary_time_group     128021 non-null  object
dtypes: int64(8), object(4)
memory usage: 11.7+ MB


In [21]:
# Split the whole dataset into train, test and validation sets.
# The split is made on summoners rather than on rows: summoners are sampled
# per class, then every row of a sampled summoner goes to that summoner's set.

# Sampling configuration: summoners drawn per class for each set, and the seed
# that makes the draw reproducible.
N_TRAIN_PER_CLASS = 500
N_TEST_PER_CLASS = 100
SPLIT_SEED = 42


def sample_train_test_ids(class_ids, n_train=N_TRAIN_PER_CLASS,
                          n_test=N_TEST_PER_CLASS, seed=SPLIT_SEED):
    """Draw non-overlapping train and test samples from one class of summoners.

    Parameters
    ----------
    class_ids : pd.DataFrame
        Unique summoner rows belonging to a single class.
    n_train : int
        Number of rows to draw for training.
    n_test : int
        Number of rows to draw for testing, taken from what is left after the
        training draw.
    seed : int
        ``random_state`` passed to both draws.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        The training sample and the testing sample, in that order.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when fewer rows are available
        than requested.
    """
    train_part = class_ids.sample(n=n_train, random_state=seed)
    # Remove the training rows first so the test draw cannot pick them again.
    remaining = class_ids.drop(train_part.index)
    test_part = remaining.sample(n=n_test, random_state=seed)
    return train_part, test_part


# Get unique summoner_ids with their binary_time_group
unique_ids = df[['summoner_id', 'binary_time_group']].drop_duplicates()

# Separate active and inactive summoners
active_ids = unique_ids[unique_ids['binary_time_group'] == 'Active']
inactive_ids = unique_ids[unique_ids['binary_time_group'] == 'Inactive']

# Same number of summoners from each class for training and for testing
train_active, test_active = sample_train_test_ids(active_ids)
train_inactive, test_inactive = sample_train_test_ids(inactive_ids)

# Combine samples
train_ids = pd.concat([train_active, train_inactive])
test_ids = pd.concat([test_active, test_inactive])

# Everything that was not sampled becomes the validation set
validation_ids = unique_ids.drop(train_ids.index).drop(test_ids.index)

# Leakage guard: rows are selected below by summoner_id alone, so a summoner
# that appears under more than one binary_time_group could otherwise end up
# in two sets at once. Fail loudly instead of exporting overlapping sets.
split_summoners = {
    'train': set(train_ids['summoner_id']),
    'test': set(test_ids['summoner_id']),
    'validation': set(validation_ids['summoner_id']),
}
for first, second in (('train', 'test'), ('train', 'validation'), ('test', 'validation')):
    shared = split_summoners[first] & split_summoners[second]
    if shared:
        raise ValueError(
            f"{len(shared)} summoner_id(s) appear in both the {first} and "
            f"{second} sets; check for summoners with more than one "
            "binary_time_group value."
        )

# Select the full rows of the main dataframe for each set of summoners
train_df = df[df['summoner_id'].isin(train_ids['summoner_id'])]
test_df = df[df['summoner_id'].isin(test_ids['summoner_id'])]
validation_df = df[df['summoner_id'].isin(validation_ids['summoner_id'])]

# Check the sizes
print("Training Set Size:", train_df.shape)
print("Testing Set Size:", test_df.shape)
print("Validation Set Size:", validation_df.shape)

Training Set Size: (56434, 12)
Testing Set Size: (11232, 12)
Validation Set Size: (60355, 12)


In [22]:
# Write the three splits to CSV files in BASE_FILE_PATH, without the index.
split_exports = {
    'train_df.csv': train_df,
    'test_df.csv': test_df,
    'validation_df.csv': validation_df,
}
for csv_name, split_frame in split_exports.items():
    split_frame.to_csv(os.path.join(BASE_FILE_PATH, csv_name), index=False)
