In [101]:
# Import the required libraries and dependencies
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import ClusterCentroids
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

In [102]:
# Load the online gaming behavior dataset; the path is relative to this notebook's folder
gaming_df = pd.read_csv(
    filepath_or_buffer="../Resources/online_gaming_behavior_dataset.csv",
)

# Show the freshly loaded frame as the cell's output
gaming_df

Unnamed: 0,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
0,9000,43,Male,Other,Strategy,16.271119,0,Medium,6,108,79,25,Medium
1,9001,29,Female,USA,Strategy,5.525961,0,Medium,5,144,11,10,Medium
2,9002,22,Female,USA,Sports,8.223755,0,Easy,16,142,35,41,High
3,9003,35,Male,USA,Action,5.265351,1,Easy,9,85,57,47,Medium
4,9004,33,Male,Europe,Action,15.531945,0,Medium,2,131,95,37,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40029,49029,32,Male,USA,Strategy,20.619662,0,Easy,4,75,85,14,Medium
40030,49030,44,Female,Other,Simulation,13.539280,0,Hard,19,114,71,27,High
40031,49031,15,Female,USA,RPG,0.240057,1,Easy,10,176,29,1,High
40032,49032,34,Male,USA,Sports,14.017818,1,Medium,3,128,70,10,Medium


In [103]:
# Print a structural summary of the frame: column names, non-null counts and dtypes
gaming_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40034 entries, 0 to 40033
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PlayerID                   40034 non-null  int64  
 1   Age                        40034 non-null  int64  
 2   Gender                     40034 non-null  object 
 3   Location                   40034 non-null  object 
 4   GameGenre                  40034 non-null  object 
 5   PlayTimeHours              40034 non-null  float64
 6   InGamePurchases            40034 non-null  int64  
 7   GameDifficulty             40034 non-null  object 
 8   SessionsPerWeek            40034 non-null  int64  
 9   AvgSessionDurationMinutes  40034 non-null  int64  
 10  PlayerLevel                40034 non-null  int64  
 11  AchievementsUnlocked       40034 non-null  int64  
 12  EngagementLevel            40034 non-null  object 
dtypes: float64(1), int64(7), object(5)
memory usag

In [104]:
# Separate the target (InGamePurchases) from the remaining columns.
# NOTE(review): this split is taken before the encoding / feature-engineering
# cells further down, so X still contains PlayerID and the raw string columns
# and will not pick up the encoded columns. If X and y are meant to feed a
# model, re-derive them after the data-cleaning cells — confirm against the
# later cells of the notebook.
y = gaming_df['InGamePurchases']
X = gaming_df.drop(columns=['InGamePurchases'])

In [105]:
# Per-column count of missing values (every column reports 0 for this dataset)
gaming_df.isna().sum()

PlayerID                     0
Age                          0
Gender                       0
Location                     0
GameGenre                    0
PlayTimeHours                0
InGamePurchases              0
GameDifficulty               0
SessionsPerWeek              0
AvgSessionDurationMinutes    0
PlayerLevel                  0
AchievementsUnlocked         0
EngagementLevel              0
dtype: int64

In [106]:
### DATA CLEANING
# GameDifficulty and EngagementLevel are categories with a natural order, so
# each one is mapped to a single rank column (0, 1, 2) instead of being one-hot
# encoded. This is NOT equivalent to one-hot encoding: ordinal codes are read
# by a model as ordered numeric values, which is only appropriate because these
# levels really are ordered.
ordinal_columns = ['GameDifficulty', 'EngagementLevel']

# One category list per column, in the same order as `ordinal_columns`;
# a label's position inside its list becomes its encoded value.
custom_mapping = [
    ['Easy', 'Medium', 'Hard'],  # GameDifficulty: Easy=0, Medium=1, Hard=2
    ['Low', 'Medium', 'High'],   # EngagementLevel: Low=0, Medium=1, High=2
]

ordinal_encoder = OrdinalEncoder(categories=custom_mapping)
# Backward-compatible alias: the encoder was originally called `oe_gender`
# even though it never touches the Gender column.
oe_gender = ordinal_encoder

encodings = ordinal_encoder.fit_transform(gaming_df[ordinal_columns])

# Overwrite the string labels with their codes (stored as floats, e.g. 1.0)
gaming_df[ordinal_columns] = encodings

In [107]:
# One-hot encode the unordered categorical columns. The encoded array is
# wrapped in a DataFrame that reuses gaming_df's index, so the column-wise
# concat in the next cell lines up row-for-row even if gaming_df no longer
# carries a default 0..n-1 index (e.g. after filtering rows).
nominal_columns = ['Gender', 'Location', 'GameGenre']
ohe = OneHotEncoder(sparse_output=False, dtype='int')

ohe_df = pd.DataFrame(
    data=ohe.fit_transform(gaming_df[nominal_columns]),
    columns=ohe.get_feature_names_out(),  # e.g. Location_USA, GameGenre_RPG
    index=gaming_df.index,
)

In [108]:
# Build the cleaned frame in a single chain:
#   1. append the one-hot columns alongside the existing columns,
#   2. drop PlayerID, PlayTimeHours and the raw categorical columns that the
#      one-hot columns replace,
#   3. add AvgMinutesPerWeek = SessionsPerWeek * AvgSessionDurationMinutes.
# NOTE: gaming_df is rebuilt from itself here, so run this cell once per fresh
# kernel; a second run would concat the one-hot columns again and the drop
# would fail on the already-removed columns.
gaming_df = (
    pd.concat([gaming_df, ohe_df], axis=1)
    .drop(columns=['PlayerID', 'PlayTimeHours', 'GameGenre', 'Location', 'Gender'])
    .assign(
        AvgMinutesPerWeek=lambda frame: frame['SessionsPerWeek'] * frame['AvgSessionDurationMinutes']
    )
)
### END DATA CLEANING

In [109]:
gaming_df

Unnamed: 0,Age,Gender,Location,GameGenre,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,...,Location_Asia,Location_Europe,Location_Other,Location_USA,GameGenre_Action,GameGenre_RPG,GameGenre_Simulation,GameGenre_Sports,GameGenre_Strategy,AvgMinutesPerWeek
0,43,Male,Other,Strategy,0,1.0,6,108,79,25,...,0,0,1,0,0,0,0,0,1,648
1,29,Female,USA,Strategy,0,1.0,5,144,11,10,...,0,0,0,1,0,0,0,0,1,720
2,22,Female,USA,Sports,0,0.0,16,142,35,41,...,0,0,0,1,0,0,0,1,0,2272
3,35,Male,USA,Action,1,0.0,9,85,57,47,...,0,0,0,1,1,0,0,0,0,765
4,33,Male,Europe,Action,0,1.0,2,131,95,37,...,0,1,0,0,1,0,0,0,0,262
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40029,32,Male,USA,Strategy,0,0.0,4,75,85,14,...,0,0,0,1,0,0,0,0,1,300
40030,44,Female,Other,Simulation,0,2.0,19,114,71,27,...,0,0,1,0,0,0,1,0,0,2166
40031,15,Female,USA,RPG,1,0.0,10,176,29,1,...,0,0,0,1,0,1,0,0,0,1760
40032,34,Male,USA,Sports,1,1.0,3,128,70,10,...,0,0,0,1,0,0,0,1,0,384
