In [1]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project files stored on Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Folder on the mounted Drive that holds this project's notebooks and CSV files.
BASE_FILE_PATH = '/content/drive/MyDrive/Colab Notebooks/Final Project'

# List the folder contents to confirm the mount worked and the data files exist.
project_files = os.listdir(BASE_FILE_PATH)
print(project_files)

['Keys', 'Setting up connection with Git-Hub.ipynb', 'entries.csv', 'summoner_details.csv', 'updated_dataset.csv', 'filtered_dataset.csv', 'filtered_dataset.gsheet', 'filtered_+30_matches_dataset.csv', 'EDA.ipynb', 'merged_dataset.csv', 'Merging match with entries and classifying accounts.ipynb', 'merged_dataset.gsheet', 'Feature engineering and cleaning vol 1.ipynb', 'Models without data normalization.ipynb', 'dataset_after_normalization.csv', 'Normalization_Parameters.csv', 'keepign only win feature .ipynb']


In [4]:
# Load the merged dataset from the project folder into a DataFrame.
merged_dataset_path = os.path.join(BASE_FILE_PATH, 'merged_dataset.csv')
df = pd.read_csv(merged_dataset_path)


In [5]:
# Summarise the loaded frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128021 entries, 0 to 128020
Data columns (total 27 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   match_id              128021 non-null  object 
 1   game_creation         128021 non-null  int64  
 2   game_duration         128021 non-null  int64  
 3   game_mode             128021 non-null  object 
 4   game_version          128021 non-null  object 
 5   map_id                128021 non-null  int64  
 6   summoner_id           128021 non-null  object 
 7   puuid                 128021 non-null  object 
 8   champion_id           128021 non-null  int64  
 9   kills                 128021 non-null  int64  
 10  deaths                128021 non-null  int64  
 11  assists               128021 non-null  int64  
 12  total_damage_dealt    128021 non-null  int64  
 13  total_damage_taken    128021 non-null  int64  
 14  gold_earned           128021 non-null  int64  
 15  

In [6]:
# Keep only the player identifier, the win flag and the two time-grouping
# columns. The explicit .copy() makes df_model an independent frame, so later
# column assignments on it do not trigger pandas' SettingWithCopyWarning and
# can never be confused with writes to df.
df_model = df[['summoner_id', 'win', 'time_segment', 'binary_time_group']].copy()

# Preview the first rows of the reduced frame.
print(df_model.head())

                                        summoner_id    win  time_segment  \
0  --yDuIIktIqjdme32tV29wcBIFPE08CY8WhiZhL4Gd6p60aE   True  First Period   
1  --yDuIIktIqjdme32tV29wcBIFPE08CY8WhiZhL4Gd6p60aE   True  First Period   
2  --yDuIIktIqjdme32tV29wcBIFPE08CY8WhiZhL4Gd6p60aE  False  First Period   
3  --yDuIIktIqjdme32tV29wcBIFPE08CY8WhiZhL4Gd6p60aE  False  First Period   
4  --yDuIIktIqjdme32tV29wcBIFPE08CY8WhiZhL4Gd6p60aE   True  First Period   

  binary_time_group  
0            Active  
1            Active  
2            Active  
3            Active  
4            Active  


In [7]:
# Convert 'win' from boolean to integer (1 for True, 0 for False).
# .assign() returns a new frame with the replaced column instead of writing
# into df_model in place, so no SettingWithCopyWarning is emitted even when
# df_model was created as a plain column slice of df.
df_model = df_model.assign(win=df_model['win'].astype(int))

# Display the head to verify the conversion.
print(df_model.head())

                                        summoner_id  win  time_segment  \
0  --yDuIIktIqjdme32tV29wcBIFPE08CY8WhiZhL4Gd6p60aE    1  First Period   
1  --yDuIIktIqjdme32tV29wcBIFPE08CY8WhiZhL4Gd6p60aE    1  First Period   
2  --yDuIIktIqjdme32tV29wcBIFPE08CY8WhiZhL4Gd6p60aE    0  First Period   
3  --yDuIIktIqjdme32tV29wcBIFPE08CY8WhiZhL4Gd6p60aE    0  First Period   
4  --yDuIIktIqjdme32tV29wcBIFPE08CY8WhiZhL4Gd6p60aE    1  First Period   

  binary_time_group  
0            Active  
1            Active  
2            Active  
3            Active  
4            Active  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model['win'] = df_model['win'].astype(int)


In [8]:
# Check the reduced frame's columns and dtypes after the 'win' conversion.
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128021 entries, 0 to 128020
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   summoner_id        128021 non-null  object
 1   win                128021 non-null  int64 
 2   time_segment       128021 non-null  object
 3   binary_time_group  128021 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.9+ MB


In [9]:
# Average win rate per summoner for each time segment: one row per summoner,
# one column per segment. 'win' is boolean in df, so its mean is the win rate.

# Explicit mapping from each time_segment label to its output column name.
# Renaming by label (rather than by column position) means a label can never
# be silently attached to the wrong period if the pivot's column order changes.
SEGMENT_COLUMN_NAMES = {
    'First Period': 'win_avg_first_period',
    'Second Period': 'win_avg_second_period',
    'Third Period': 'win_avg_third_period',
}

segment_win_rates = df.pivot_table(
    index='summoner_id', columns='time_segment', values='win', aggfunc='mean'
)

# Fail loudly if the data contains a segment label we do not expect, or lacks
# one that we do, instead of producing mislabelled or missing feature columns.
mismatched_labels = set(segment_win_rates.columns) ^ set(SEGMENT_COLUMN_NAMES)
if mismatched_labels:
    raise ValueError(
        f"Unexpected or missing time_segment labels: {sorted(map(str, mismatched_labels))}"
    )

# Fix the column order, apply the descriptive names, drop the leftover
# 'time_segment' axis name and turn summoner_id back into a regular column.
win_avg_per_segment = (
    segment_win_rates[list(SEGMENT_COLUMN_NAMES)]
    .rename(columns=SEGMENT_COLUMN_NAMES)
    .rename_axis(columns=None)
    .reset_index()
)

# Display the transformed DataFrame.
print(win_avg_per_segment.head(50))

                                          summoner_id  win_avg_first_period  \
0    --yDuIIktIqjdme32tV29wcBIFPE08CY8WhiZhL4Gd6p60aE              0.600000   
1    -0FAl-E5gzHW35MsMHI7ZBv4jCme6S8TT2W7az4-G7x7Y-Yy              0.500000   
2     -1ddPTjEGbdaQR2HcKWdP0qkBfSjsyJcFot0WGcyGxkjSy4              0.550000   
3   -2ksdxdFEy1AnyqNsmO-RI_2n8ic7mAzN5-LYGvttHTwY2...              0.391304   
4     -3ntNAfsqQ8f8pHSCh805dnhlQvmv2_GYLWxhwDfn6I1h2g              0.476190   
5    -3octyr43CfmenR8kI_NBmo2L1Z7EuzW7TyGxq23Gn4yvB6x              0.315789   
6    -48H0zwIE1AchxYRFBcFn7fSEEr6OplutJMpZ4QFhSWflnQA              0.619048   
7    -4le7Ki3BnzKo2nGERmm8V8LIzkJuUM3yK_6CmDQoFHC3nXo              0.500000   
8     -57abaqo0rKW30x3I3SoJuDtkU0-vqFOXNNhomuF3lbHj-g              0.333333   
9    -BuxGQBn0sGfDUFdpPX7WjtCO82yg4x1PjiWjjiBVua9AZc2              0.333333   
10   -Cqsn93hMv_QFJDvMgj0e6ZrGIxQwqQ9A62e-fKdlc0ixyOK              0.409091   
11   -D7G5Q31odyA5MB6rDTvKJwiCLr-iWHmV55cAr9jOLVyEh6

In [10]:
# Cross-check the pivot: compute the same per-summoner, per-segment mean of
# 'win' in long format so the values can be compared by eye.
wins_by_summoner_segment = df.groupby(['summoner_id', 'time_segment'])['win']
check_agg = wins_by_summoner_segment.mean().reset_index()
print(check_agg.head(20))

                                          summoner_id   time_segment       win
0    --yDuIIktIqjdme32tV29wcBIFPE08CY8WhiZhL4Gd6p60aE   First Period  0.600000
1    --yDuIIktIqjdme32tV29wcBIFPE08CY8WhiZhL4Gd6p60aE  Second Period  0.473684
2    --yDuIIktIqjdme32tV29wcBIFPE08CY8WhiZhL4Gd6p60aE   Third Period  0.473684
3    -0FAl-E5gzHW35MsMHI7ZBv4jCme6S8TT2W7az4-G7x7Y-Yy   First Period  0.500000
4    -0FAl-E5gzHW35MsMHI7ZBv4jCme6S8TT2W7az4-G7x7Y-Yy  Second Period  0.529412
5    -0FAl-E5gzHW35MsMHI7ZBv4jCme6S8TT2W7az4-G7x7Y-Yy   Third Period  0.500000
6     -1ddPTjEGbdaQR2HcKWdP0qkBfSjsyJcFot0WGcyGxkjSy4   First Period  0.550000
7     -1ddPTjEGbdaQR2HcKWdP0qkBfSjsyJcFot0WGcyGxkjSy4  Second Period  0.600000
8     -1ddPTjEGbdaQR2HcKWdP0qkBfSjsyJcFot0WGcyGxkjSy4   Third Period  0.526316
9   -2ksdxdFEy1AnyqNsmO-RI_2n8ic7mAzN5-LYGvttHTwY2...   First Period  0.391304
10  -2ksdxdFEy1AnyqNsmO-RI_2n8ic7mAzN5-LYGvttHTwY2...  Second Period  0.636364
11  -2ksdxdFEy1AnyqNsmO-RI_2n8ic7mAzN5-LYGvttHTwY2..

In [11]:
# Distinct (summoner, activity label) pairs taken from the match-level data.
summoner_labels = df[['summoner_id', 'binary_time_group']].drop_duplicates()

# Attach the activity label to each summoner's win-rate features, then drop
# summoner_id because it is no longer needed after the join.
# validate='one_to_one' raises pandas.errors.MergeError if any summoner_id
# appears more than once on either side (for example a summoner carrying two
# different labels), which would otherwise silently duplicate feature rows.
win_avg_per_segment = (
    win_avg_per_segment
    .merge(summoner_labels, on='summoner_id', how='left', validate='one_to_one')
    .drop(columns='summoner_id')
)

# Display the resulting DataFrame.
print(win_avg_per_segment.head())

   win_avg_first_period  win_avg_second_period  win_avg_third_period  \
0              0.600000               0.473684              0.473684   
1              0.500000               0.529412              0.500000   
2              0.550000               0.600000              0.526316   
3              0.391304               0.636364              0.272727   
4              0.476190               0.350000              0.526316   

  binary_time_group  
0            Active  
1            Active  
2          Inactive  
3            Active  
4          Inactive  


In [12]:
# Build a class-balanced modelling set: draw 200 rows per activity label with
# a fixed seed, stack the two draws, then shuffle the stacked rows.
# NOTE: this cell rebinds win_avg_per_segment to the sampled frame and encodes
# its labels as integers, so the string filters below would match no rows if
# the cell were run a second time without re-running the merge cell first.
group_labels = win_avg_per_segment['binary_time_group']
active_samples = win_avg_per_segment.loc[group_labels == 'Active'].sample(n=200, random_state=42)
inactive_samples = win_avg_per_segment.loc[group_labels == 'Inactive'].sample(n=200, random_state=42)
filtered_df = pd.concat([active_samples, inactive_samples])
win_avg_per_segment = filtered_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Encode the target numerically: Active -> 1, Inactive -> 0.
label_codes = {'Active': 1, 'Inactive': 0}
win_avg_per_segment['binary_time_group'] = win_avg_per_segment['binary_time_group'].map(label_codes)

# Feature matrix (the three per-segment win rates) and target vector.
feature_columns = ['win_avg_first_period', 'win_avg_second_period', 'win_avg_third_period']
X = win_avg_per_segment[feature_columns]
y = win_avg_per_segment['binary_time_group']

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a default-configured logistic regression and predict the held-out rows.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report per-class precision/recall/F1 and the confusion matrix.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.55      0.54        38
           1       0.57      0.55      0.56        42

    accuracy                           0.55        80
   macro avg       0.55      0.55      0.55        80
weighted avg       0.55      0.55      0.55        80

[[21 17]
 [19 23]]


In [13]:
# Compare two tree-based classifiers on the per-segment win-rate features.
# RandomForestClassifier, LGBMClassifier and the metric helpers are imported
# in the notebook's import cell rather than here, so the notebook's
# dependencies are all visible in one place.

# Splitting the data: every column except the target is a feature.
X = win_avg_per_segment.drop(columns=['binary_time_group'])
y = win_avg_per_segment['binary_time_group']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Gradient Boosting Machine
gbm_model = LGBMClassifier(n_estimators=100, random_state=42)
gbm_model.fit(X_train, y_train)
y_pred_gbm = gbm_model.predict(X_test)

# Evaluating the models on the same held-out rows.
print("Random Forest Performance:")
print(classification_report(y_test, y_pred_rf))
print("Gradient Boosting Machine Performance:")
print(classification_report(y_test, y_pred_gbm))

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Number of positive: 158, number of negative: 162
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006992 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 112
[LightGBM] [Info] Number of data points in the train set: 320, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493750 -> initscore=-0.025001
[LightGBM] [Info] Start training from score -0.025001
Random Forest Performance:
              precision    recall  f1-score   support

           0       0.55      0.68      0.61        38
           1       0.64      0.50      0.56        42

    accuracy                           0.59        80
   macro avg       0.59      0.59      0.59        80
weighted avg       0.60      0.59      0.58        80

Gradient Boosting Machine Performance:
              precision    recall  f1-score   support

           0       0.52      0.63      0.57        38
           1