In [169]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve
import pickle

In [170]:
# Mount Google Drive (Colab-only API) at /content/drive; the project folder used by
# the cells below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [171]:
# Root folder of the project on the mounted drive; the CSV inputs and the pickled
# model loaded further down are all resolved relative to this path.
BASE_FILE_PATH = '/content/drive/MyDrive/Colab Notebooks/Final Project'
# Listing the folder shows which input files are available before loading them.
print(os.listdir(BASE_FILE_PATH))

['Keys', 'Setting up connection with Git-Hub.ipynb', 'entries.csv', 'summoner_details.csv', 'updated_dataset.csv', 'filtered_dataset.csv', 'filtered_dataset.gsheet', 'filtered_+30_matches_dataset.csv', 'EDA.ipynb', 'merged_dataset.csv', 'Merging match with entries and classifying accounts.ipynb', 'merged_dataset.gsheet', 'Feature engineering and cleaning vol 1.ipynb', 'dataset_after_normalization.csv', 'keepign only win feature .ipynb', 'Models without data normalization.ipynb', 'Normalization_Parameters.csv', 'Normalization_Parameters_by_Game_Mode.csv', 'summoner_ids_used_in_model.csv', 'Normalization_Parameters_by_Team_Position.csv', 'gbm_model.pkl']


In [172]:
# Load the input tables from the project folder
def _read_project_csv(file_name):
    """Read ``file_name`` located in BASE_FILE_PATH into a DataFrame."""
    return pd.read_csv(os.path.join(BASE_FILE_PATH, file_name))


# Per-match rows for every summoner
all_values_df = _read_project_csv('merged_dataset.csv')
# Summoner ids that must be excluded below because the model was trained on them
summoners_used_in_model = _read_project_csv('summoner_ids_used_in_model.csv')
# Mean / StdDev per (Feature, Game_Mode), applied in the normalization cell
normalization_by_game_df = _read_project_csv('Normalization_Parameters_by_Game_Mode.csv')
normalization_by_team_position_df = _read_project_csv('Normalization_Parameters_by_Team_Position.csv')


In [173]:
# Load the previously trained model stored as gbm_model.pkl in BASE_FILE_PATH.
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so this
# file must come from a trusted source. Unpickling presumably also requires the
# library that defined the model class to be importable — confirm which one.

with open(os.path.join(BASE_FILE_PATH, 'gbm_model.pkl'), 'rb') as f:
    gbm_model = pickle.load(f)


In [174]:
# Inspect columns, non-null counts and dtypes of the full merged dataset.
all_values_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128021 entries, 0 to 128020
Data columns (total 27 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   match_id              128021 non-null  object 
 1   game_creation         128021 non-null  int64  
 2   game_duration         128021 non-null  int64  
 3   game_mode             128021 non-null  object 
 4   game_version          128021 non-null  object 
 5   map_id                128021 non-null  int64  
 6   summoner_id           128021 non-null  object 
 7   puuid                 128021 non-null  object 
 8   champion_id           128021 non-null  int64  
 9   kills                 128021 non-null  int64  
 10  deaths                128021 non-null  int64  
 11  assists               128021 non-null  int64  
 12  total_damage_dealt    128021 non-null  int64  
 13  total_damage_taken    128021 non-null  int64  
 14  gold_earned           128021 non-null  int64  
 15  

In [175]:
# Remove from the main dataset every row belonging to a summoner that was used to
# train the model, so the evaluation below only sees unseen summoners.
is_training_summoner = all_values_df['summoner_id'].isin(summoners_used_in_model['summoner_id'])

# .copy() makes filtered_df an independent frame; without it, the in-place edits in
# later cells operate on a slice of all_values_df and pandas emits
# SettingWithCopyWarning.
filtered_df = all_values_df[~is_training_summoner].copy()

In [176]:
# Drop columns that are not needed for building the model features.
# The result is reassigned instead of using inplace=True: filtered_df was produced by
# boolean-mask slicing, and an in-place drop on such a frame triggers pandas'
# SettingWithCopyWarning.
unneeded_columns = ['match_id', 'game_creation','days_since_last_game',
                    'game_duration_min', 'win','puuid','map_id','champion_id',
                    'vision_score', 'game_version', 'tier', 'rank',
                    'time_group', 'team_position', 'time_segment']
filtered_df = filtered_df.drop(columns=unneeded_columns)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.drop(columns=['match_id', 'game_creation','days_since_last_game',


In [177]:
# Check which columns remain after the drop above.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71587 entries, 0 to 127961
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   game_duration         71587 non-null  int64 
 1   game_mode             71587 non-null  object
 2   summoner_id           71587 non-null  object
 3   kills                 71587 non-null  int64 
 4   deaths                71587 non-null  int64 
 5   assists               71587 non-null  int64 
 6   total_damage_dealt    71587 non-null  int64 
 7   total_damage_taken    71587 non-null  int64 
 8   gold_earned           71587 non-null  int64 
 9   total_minions_killed  71587 non-null  int64 
 10  game_creation_dt      71587 non-null  object
 11  binary_time_group     71587 non-null  object
dtypes: int64(8), object(4)
memory usage: 7.1+ MB


In [178]:
# One-hot encode game_mode: one indicator column per mode, prefixed with 'game_mode'
dummies = pd.get_dummies(filtered_df['game_mode'], prefix='game_mode')

# Swap the original game_mode column for its indicator columns in a single step
filtered_df = pd.concat(
    [filtered_df.drop(columns=['game_mode']), dummies],
    axis=1,
)

In [179]:
# Confirm game_mode was replaced by the game_mode_* indicator columns.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71587 entries, 0 to 127961
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   game_duration         71587 non-null  int64 
 1   summoner_id           71587 non-null  object
 2   kills                 71587 non-null  int64 
 3   deaths                71587 non-null  int64 
 4   assists               71587 non-null  int64 
 5   total_damage_dealt    71587 non-null  int64 
 6   total_damage_taken    71587 non-null  int64 
 7   gold_earned           71587 non-null  int64 
 8   total_minions_killed  71587 non-null  int64 
 9   game_creation_dt      71587 non-null  object
 10  binary_time_group     71587 non-null  object
 11  game_mode_ARAM        71587 non-null  bool  
 12  game_mode_CHERRY      71587 non-null  bool  
 13  game_mode_CLASSIC     71587 non-null  bool  
dtypes: bool(3), int64(8), object(3)
memory usage: 6.8+ MB


In [180]:
# Winsorize (clip the lowest and highest 1%) the raw stats of CHERRY game-mode rows.
# NOTE(review): the clipping limits are derived from this dataset's own CHERRY rows —
# confirm this matches how the training data was prepared.
is_cherry = filtered_df['game_mode_CHERRY']
df_cherry = filtered_df[is_cherry.eq(True)].copy()

features_to_winsorize = [
    'kills',
    'deaths',
    'assists',
    'gold_earned',
    'game_duration',
    'total_damage_dealt',
    'total_damage_taken',
    'total_minions_killed',
]

for feature in features_to_winsorize:
    winsorized_column = f'{feature}_winsorized'
    # Clip on the independent CHERRY copy ...
    df_cherry[winsorized_column] = winsorize(df_cherry[feature], limits=[0.01, 0.01])
    # ... then write the clipped values back to the CHERRY rows only (index-aligned).
    filtered_df.loc[filtered_df['game_mode_CHERRY'], feature] = df_cherry[winsorized_column]

In [181]:
# Normalize the features for each game mode separately.
# Each row of normalization_by_game_df supplies the Mean and StdDev for one
# (Feature, Game_Mode) pair; Game_Mode is used directly as the name of the boolean
# indicator column in filtered_df.
for _, row in normalization_by_game_df.iterrows():
    feature = row['Feature']
    game_mode = row['Game_Mode']
    mean_val = row['Mean']
    stddev_val = row['StdDev']

    # Only the features listed for winsorization are normalized
    if feature not in features_to_winsorize:
        continue

    # The indicator columns come from get_dummies on this dataset, so a game mode that
    # has parameters but no rows here has no column at all. Skip it instead of raising
    # KeyError.
    if game_mode not in filtered_df.columns:
        continue

    mode_mask = filtered_df[game_mode].astype(bool)
    if not mode_mask.any():  # nothing to normalize for this game mode
        continue

    normalized_feature_column = f'{feature}_normalized'
    # Z-score the rows of this game mode; rows of other modes are filled in by their
    # own parameter rows.
    filtered_df.loc[mode_mask, normalized_feature_column] = \
        (filtered_df.loc[mode_mask, feature] - mean_val) / stddev_val


In [182]:
# Confirm the *_normalized columns were added and check their non-null counts.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71587 entries, 0 to 127961
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   game_duration                    71587 non-null  int64  
 1   summoner_id                      71587 non-null  object 
 2   kills                            71587 non-null  int64  
 3   deaths                           71587 non-null  int64  
 4   assists                          71587 non-null  int64  
 5   total_damage_dealt               71587 non-null  int64  
 6   total_damage_taken               71587 non-null  int64  
 7   gold_earned                      71587 non-null  int64  
 8   total_minions_killed             71587 non-null  int64  
 9   game_creation_dt                 71587 non-null  object 
 10  binary_time_group                71587 non-null  object 
 11  game_mode_ARAM                   71587 non-null  bool   
 12  game_mode_CHERRY      

In [183]:
# Parse the creation timestamps once and store them back as datetimes
creation_times = pd.to_datetime(filtered_df['game_creation_dt'])
filtered_df['game_creation_dt'] = creation_times

# Bin edges at the 0 / 33 / 67 / 100 % quantiles of the creation time
quantiles = creation_times.quantile([0, 0.33, 0.67, 1]).to_list()

# Label each match by the third of the observed time range it falls into;
# include_lowest keeps the earliest timestamp inside the first bin.
filtered_df['time_segment'] = pd.cut(
    creation_times,
    bins=quantiles,
    labels=['initial', 'mid', 'late'],
    include_lowest=True,
)

In [184]:
# Confirm game_creation_dt is now a datetime column and time_segment was added.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71587 entries, 0 to 127961
Data columns (total 23 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   game_duration                    71587 non-null  int64         
 1   summoner_id                      71587 non-null  object        
 2   kills                            71587 non-null  int64         
 3   deaths                           71587 non-null  int64         
 4   assists                          71587 non-null  int64         
 5   total_damage_dealt               71587 non-null  int64         
 6   total_damage_taken               71587 non-null  int64         
 7   gold_earned                      71587 non-null  int64         
 8   total_minions_killed             71587 non-null  int64         
 9   game_creation_dt                 71587 non-null  datetime64[ns]
 10  binary_time_group                71587 non-null  object       

In [185]:
# The raw stats, the timestamp and the game-mode indicators have served their purpose
# (normalized columns and time_segment were derived from them), so remove them.
raw_and_helper_columns = [
    'game_creation_dt', 'game_duration', 'kills', 'deaths',
    'assists', 'total_damage_dealt', 'total_damage_taken',
    'gold_earned', 'total_minions_killed', 'game_mode_ARAM',
    'game_mode_CHERRY', 'game_mode_CLASSIC',
]
filtered_df = filtered_df.drop(columns=raw_and_helper_columns)

In [186]:
# Check the remaining columns after removing the raw stats and indicator columns.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71587 entries, 0 to 127961
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype   
---  ------                           --------------  -----   
 0   summoner_id                      71587 non-null  object  
 1   binary_time_group                71587 non-null  object  
 2   kills_normalized                 71587 non-null  float64 
 3   deaths_normalized                71587 non-null  float64 
 4   assists_normalized               71587 non-null  float64 
 5   gold_earned_normalized           71587 non-null  float64 
 6   game_duration_normalized         71587 non-null  float64 
 7   total_damage_dealt_normalized    71587 non-null  float64 
 8   total_damage_taken_normalized    71587 non-null  float64 
 9   total_minions_killed_normalized  71587 non-null  float64 
 10  time_segment                     71587 non-null  category
dtypes: category(1), float64(8), object(2)
memory usage: 6.1+ MB


In [187]:
# Keep only the grouping keys plus the numeric columns, so the mean is well defined
group_keys = ['summoner_id', 'time_segment']
numeric_cols = filtered_df.select_dtypes(include=[np.number])
df_numeric = filtered_df[group_keys + numeric_cols.columns.tolist()]

# Average every numeric column per summoner and time segment; observed=True restricts
# the categorical key to combinations that actually occur (and avoids the FutureWarning)
grouped_df = df_numeric.groupby(group_keys, observed=True).mean()

# Move 'time_segment' from the row index into the columns (one column per segment)
pivot_df = grouped_df.unstack(level='time_segment')

# Collapse the (feature, segment) column pairs into flat 'feature_segment' names
pivot_df.columns = [f'{name}_{segment}'.strip() for name, segment in pivot_df.columns]

# Turn 'summoner_id' back into a regular column and replace NaN from segments in
# which a summoner has no matches with 0
pivot_df = pivot_df.reset_index().fillna(0)

# Display the first few rows to verify
print(pivot_df.head())

                                        summoner_id  kills_normalized_initial  \
0  --yDuIIktIqjdme32tV29wcBIFPE08CY8WhiZhL4Gd6p60aE                 -0.678147   
1  -0FAl-E5gzHW35MsMHI7ZBv4jCme6S8TT2W7az4-G7x7Y-Yy                 -0.203322   
2   -3ntNAfsqQ8f8pHSCh805dnhlQvmv2_GYLWxhwDfn6I1h2g                  0.340567   
3   -57abaqo0rKW30x3I3SoJuDtkU0-vqFOXNNhomuF3lbHj-g                  0.165148   
4  -D7G5Q31odyA5MB6rDTvKJwiCLr-iWHmV55cAr9jOLVyEh60                  0.000000   

   kills_normalized_mid  kills_normalized_late  deaths_normalized_initial  \
0             -0.265878              -0.471296                  -1.383072   
1             -0.583182              -0.062466                  -0.511505   
2              1.019350               0.000000                  -0.049310   
3             -0.091400              -0.171668                  -0.313471   
4              0.105556               0.480615                   0.000000   

   deaths_normalized_mid  deaths_normalized_late  

In [188]:
# Build the summoner_id -> binary_time_group lookup (one row per distinct pair)
binary_time_mapping = filtered_df[['summoner_id', 'binary_time_group']].drop_duplicates()

# Attach the label to the per-summoner feature table.
# - Any binary_time_group column left over from a previous run of this cell is dropped
#   first, so re-running does not create binary_time_group_x / _y columns.
# - validate='one_to_one' makes pandas raise MergeError if a summoner maps to more than
#   one label, instead of silently duplicating that summoner's row.
pivot_df = (
    pivot_df
    .drop(columns=['binary_time_group'], errors='ignore')
    .merge(binary_time_mapping, on='summoner_id', how='left', validate='one_to_one')
)

# Display the first few rows to verify the merge
print(pivot_df.head())

                                        summoner_id  kills_normalized_initial  \
0  --yDuIIktIqjdme32tV29wcBIFPE08CY8WhiZhL4Gd6p60aE                 -0.678147   
1  -0FAl-E5gzHW35MsMHI7ZBv4jCme6S8TT2W7az4-G7x7Y-Yy                 -0.203322   
2   -3ntNAfsqQ8f8pHSCh805dnhlQvmv2_GYLWxhwDfn6I1h2g                  0.340567   
3   -57abaqo0rKW30x3I3SoJuDtkU0-vqFOXNNhomuF3lbHj-g                  0.165148   
4  -D7G5Q31odyA5MB6rDTvKJwiCLr-iWHmV55cAr9jOLVyEh60                  0.000000   

   kills_normalized_mid  kills_normalized_late  deaths_normalized_initial  \
0             -0.265878              -0.471296                  -1.383072   
1             -0.583182              -0.062466                  -0.511505   
2              1.019350               0.000000                  -0.049310   
3             -0.091400              -0.171668                  -0.313471   
4              0.105556               0.480615                   0.000000   

   deaths_normalized_mid  deaths_normalized_late  

In [189]:
# Inspect the schema of the pivoted per-summoner table after the label merge.
pivot_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1274 entries, 0 to 1273
Data columns (total 26 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   summoner_id                              1274 non-null   object 
 1   kills_normalized_initial                 1274 non-null   float64
 2   kills_normalized_mid                     1274 non-null   float64
 3   kills_normalized_late                    1274 non-null   float64
 4   deaths_normalized_initial                1274 non-null   float64
 5   deaths_normalized_mid                    1274 non-null   float64
 6   deaths_normalized_late                   1274 non-null   float64
 7   assists_normalized_initial               1274 non-null   float64
 8   assists_normalized_mid                   1274 non-null   float64
 9   assists_normalized_late                  1274 non-null   float64
 10  gold_earned_normalized_initial           1274 no

In [190]:
# The 20 most important features, listed in the same order as before
top_features = [
    # late segment
    'deaths_normalized_late',
    'kills_normalized_late',
    'assists_normalized_late',
    'game_duration_normalized_late',
    'total_minions_killed_normalized_late',
    'gold_earned_normalized_late',
    'total_damage_dealt_normalized_late',
    'total_damage_taken_normalized_late',
    # mid segment
    'deaths_normalized_mid',
    'kills_normalized_mid',
    'assists_normalized_mid',
    'game_duration_normalized_mid',
    'total_minions_killed_normalized_mid',
    'gold_earned_normalized_mid',
    'total_damage_dealt_normalized_mid',
    'total_damage_taken_normalized_mid',
    # initial segment
    'deaths_normalized_initial',
    'kills_normalized_initial',
    'assists_normalized_initial',
    'game_duration_normalized_initial',
]

# Keep the selected features together with the target label and the summoner identifier
selected_columns = [*top_features, 'binary_time_group', 'summoner_id']
top_features_df = pivot_df[selected_columns]

In [191]:
# Verify the selected feature columns plus binary_time_group and summoner_id are present.
top_features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1274 entries, 0 to 1273
Data columns (total 22 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   deaths_normalized_late                1274 non-null   float64
 1   kills_normalized_late                 1274 non-null   float64
 2   assists_normalized_late               1274 non-null   float64
 3   game_duration_normalized_late         1274 non-null   float64
 4   total_minions_killed_normalized_late  1274 non-null   float64
 5   gold_earned_normalized_late           1274 non-null   float64
 6   total_damage_dealt_normalized_late    1274 non-null   float64
 7   total_damage_taken_normalized_late    1274 non-null   float64
 8   deaths_normalized_mid                 1274 non-null   float64
 9   kills_normalized_mid                  1274 non-null   float64
 10  assists_normalized_mid                1274 non-null   float64
 11  game_duration_nor

In [192]:
def evaluate_predictions(df, model, features_col, actual_col='binary_time_group', max_rows=None):
    """
    Compare a model's predictions with the actual labels stored in a DataFrame.

    Prints a tab-separated table (row position, actual label, predicted label,
    Yes/No) followed by the number of correct predictions and the accuracy.

    Args:
    df (DataFrame): The DataFrame containing the features and actual labels.
    model: Trained model exposing ``predict(X)``.
    features_col (list-like): Columns of ``df`` passed to ``model.predict``.
    actual_col (str): Column name of the actual labels.
    max_rows (int or None): Maximum number of per-row lines to print. ``None``
        (the default) prints every row; the summary always covers all rows.

    Returns:
    float: Fraction of rows whose prediction equals the actual label, or NaN when
    ``df`` has no rows (the model is not called in that case).
    """
    header = "Index\tActual\tPredicted\tCorrect"

    # Guard the empty case explicitly: there is nothing to predict, and the accuracy
    # ratio below would otherwise divide by zero.
    if len(df) == 0:
        print(header)
        print("\nTotal Correct Predictions: 0/0")
        print("Accuracy: n/a (no rows to evaluate)")
        return float('nan')

    # Extracting the features and actual labels
    X = df[features_col]
    y_actual = df[actual_col]

    # Predicting using the model
    predictions = model.predict(X)

    # Pair every actual label with its prediction and whether they agree
    outcomes = [
        (actual, pred, bool(actual == pred))
        for actual, pred in zip(y_actual, predictions)
    ]
    total = len(predictions)
    correct = sum(is_match for _, _, is_match in outcomes)

    print(header)
    for i, (actual, pred, is_match) in enumerate(outcomes):
        if max_rows is not None and i >= max_rows:
            print(f"... ({total - i} more rows not shown)")
            break
        correct_pred = 'Yes' if is_match else 'No'
        print(f"{i}\t{actual}\t{pred}\t{correct_pred}")

    accuracy = correct / total
    print(f"\nTotal Correct Predictions: {correct}/{total}")
    print(f"Accuracy: {accuracy:.2f}")
    return accuracy

# Columns that identify or label a row and must not be fed to the model
non_feature_columns = ['binary_time_group', 'summoner_id']

# Index.difference returns the labels sorted alphabetically, so on its own it fixes the
# feature order by name rather than by what the model was fitted with. When the fitted
# model records its feature names (scikit-learn: feature_names_in_, LightGBM:
# feature_name_) and all of them exist in top_features_df, use that order instead.
model_feature_names = getattr(gbm_model, 'feature_names_in_', None)
if model_feature_names is None:
    model_feature_names = getattr(gbm_model, 'feature_name_', None)

if model_feature_names is not None and set(model_feature_names).issubset(top_features_df.columns):
    features_for_prediction = list(model_feature_names)
else:
    # No usable names on the model: keep the previous (alphabetical) selection
    features_for_prediction = top_features_df.columns.difference(non_feature_columns)

evaluate_predictions(top_features_df, gbm_model, features_for_prediction)

Index	Actual	Predicted	Correct
0	Active	Active	Yes
1	Active	Active	Yes
2	Inactive	Active	No
3	Active	Active	Yes
4	Active	Active	Yes
5	Active	Active	Yes
6	Inactive	Active	No
7	Active	Active	Yes
8	Active	Active	Yes
9	Active	Inactive	No
10	Active	Active	Yes
11	Active	Active	Yes
12	Active	Active	Yes
13	Active	Active	Yes
14	Active	Active	Yes
15	Inactive	Active	No
16	Active	Active	Yes
17	Active	Active	Yes
18	Active	Active	Yes
19	Inactive	Active	No
20	Inactive	Active	No
21	Active	Active	Yes
22	Inactive	Active	No
23	Active	Active	Yes
24	Active	Active	Yes
25	Inactive	Active	No
26	Inactive	Inactive	Yes
27	Inactive	Active	No
28	Active	Active	Yes
29	Active	Active	Yes
30	Active	Active	Yes
31	Active	Active	Yes
32	Active	Active	Yes
33	Inactive	Active	No
34	Active	Active	Yes
35	Active	Active	Yes
36	Inactive	Active	No
37	Inactive	Active	No
38	Inactive	Active	No
39	Active	Active	Yes
40	Inactive	Active	No
41	Active	Active	Yes
42	Inactive	Active	No
43	Active	Active	Yes
44	Inactive	Active	No
45	Inactive	Ac