In [2]:
# Import the required libraries and dependencies
import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.svm import SVC

In [3]:
# Load the online gaming behaviour dataset from its CSV file into a DataFrame
gaming_df = pd.read_csv(
    "../Resources/online_gaming_behavior_dataset.csv",
)
# Show the frame as the cell output (pandas elides the middle rows when rendering)
gaming_df

Unnamed: 0,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
0,9000,43,Male,Other,Strategy,16.271119,0,Medium,6,108,79,25,Medium
1,9001,29,Female,USA,Strategy,5.525961,0,Medium,5,144,11,10,Medium
2,9002,22,Female,USA,Sports,8.223755,0,Easy,16,142,35,41,High
3,9003,35,Male,USA,Action,5.265351,1,Easy,9,85,57,47,Medium
4,9004,33,Male,Europe,Action,15.531945,0,Medium,2,131,95,37,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40029,49029,32,Male,USA,Strategy,20.619662,0,Easy,4,75,85,14,Medium
40030,49030,44,Female,Other,Simulation,13.539280,0,Hard,19,114,71,27,High
40031,49031,15,Female,USA,RPG,0.240057,1,Easy,10,176,29,1,High
40032,49032,34,Male,USA,Sports,14.017818,1,Medium,3,128,70,10,Medium


In [4]:
# Summarise the frame's structure: row count, per-column non-null counts and dtypes.
# The object-dtype columns listed in the output are the categorical ones that
# need encoding before modelling.
gaming_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40034 entries, 0 to 40033
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PlayerID                   40034 non-null  int64  
 1   Age                        40034 non-null  int64  
 2   Gender                     40034 non-null  object 
 3   Location                   40034 non-null  object 
 4   GameGenre                  40034 non-null  object 
 5   PlayTimeHours              40034 non-null  float64
 6   InGamePurchases            40034 non-null  int64  
 7   GameDifficulty             40034 non-null  object 
 8   SessionsPerWeek            40034 non-null  int64  
 9   AvgSessionDurationMinutes  40034 non-null  int64  
 10  PlayerLevel                40034 non-null  int64  
 11  AchievementsUnlocked       40034 non-null  int64  
 12  EngagementLevel            40034 non-null  object 
dtypes: float64(1), int64(7), object(5)
memory usag

In [5]:
## Count missing values per column -- the output confirms there are none
gaming_df.isna().sum()

PlayerID                     0
Age                          0
Gender                       0
Location                     0
GameGenre                    0
PlayTimeHours                0
InGamePurchases              0
GameDifficulty               0
SessionsPerWeek              0
AvgSessionDurationMinutes    0
PlayerLevel                  0
AchievementsUnlocked         0
EngagementLevel              0
dtype: int64

In [6]:
### DATA CLEANING
# NOTE: this cell overwrites columns of gaming_df in place, so it must be run
# exactly once after loading the CSV; re-running it on the already-encoded
# frame will not work because the ordinal columns are no longer strings.

# Ordered categoricals: each level is replaced by its position in the lists
# below, preserving Easy < Medium < Hard and Low < Medium < High.
ordinal_columns = ['GameDifficulty', 'EngagementLevel']
custom_mapping = [
    ['Easy', 'Medium', 'Hard'],  # Custom order for 'GameDifficulty'
    ['Low', 'Medium', 'High']  # Custom order for 'EngagementLevel'
]
# Despite its name, this encoder handles the two ordinal columns above (not
# 'Gender'); the name is kept so any existing references keep working.
oe_gender = OrdinalEncoder(categories=custom_mapping)
encodings = oe_gender.fit_transform(gaming_df[ordinal_columns])
gaming_df[ordinal_columns] = encodings

# Unordered categoricals: expand into one 0/1 indicator column per category.
nominal_columns = ['Gender', 'Location', 'GameGenre']
ohe = OneHotEncoder(sparse_output=False, dtype='int')
ohe_df = pd.DataFrame(
    data=ohe.fit_transform(gaming_df[nominal_columns]),
    columns=ohe.get_feature_names_out(),
    # Reuse the source index: pd.concat(axis=1) aligns rows by index label, so
    # a fresh RangeIndex here would misalign rows (and introduce NaNs) if
    # gaming_df were ever filtered or re-indexed before this cell.
    index=gaming_df.index,
)
# The original string columns are still present after this step; they are
# dropped in the next cell.
gaming_df = pd.concat([gaming_df, ohe_df], axis=1)


In [7]:
# Remove the identifier, the raw play-time column and the original categorical
# text columns, then derive the total minutes played per week in one chain
columns_to_drop = ['PlayerID', 'PlayTimeHours', 'GameGenre', 'Location', 'Gender']
gaming_df = gaming_df.drop(columns=columns_to_drop).assign(
    # sessions per week x average minutes per session
    AvgMinutesPerWeek=lambda frame: frame['SessionsPerWeek'] * frame['AvgSessionDurationMinutes'],
)

### END DATA CLEANING ###

In [8]:
### START OF ML
# Define features and target: predict whether a player makes in-game purchases
X = gaming_df.drop(columns='InGamePurchases')
y = gaming_df['InGamePurchases']

# Split the data into training and testing sets.
# stratify=y keeps the purchase / no-purchase ratio identical in both splits,
# so the test metrics are not skewed by an unlucky draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Standardise the features before they reach the SVC. SVM kernels are not
# scale invariant: without this, large-valued columns such as
# 'AvgMinutesPerWeek' dominate the kernel distance and the 0/1 indicator
# columns are effectively ignored (and fitting becomes much slower).
# The scaler is fit on the training split only, so no test-set statistics
# leak into the model; set_output keeps X_train / X_test as DataFrames.
scaler = StandardScaler().set_output(transform="pandas")
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The hyperparameter search space is defined in the search cell that follows,
# next to the search object that consumes it.

In [None]:
# Hyperparameter search space for the SVC (4 x 4 x 3 = 48 combinations)
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

# Randomised search: evaluate n_iter=10 sampled combinations with 3-fold CV
# instead of exhaustively fitting all 48. random_state pins which candidates
# are sampled so the selected model is reproducible across kernel restarts.
# (The variable keeps the name `grid_search` because later cells refer to it.)
grid_search = RandomizedSearchCV(
    SVC(),
    param_distributions=param_grid,
    n_iter=10,
    refit=True,      # refit the best candidate on the full training split
    verbose=2,
    cv=3,
    n_jobs=-1,       # use all available cores
    random_state=42,
)
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [1]:
# Predict with the optimized model
# `best_model` is created by the hyperparameter-search cell; if that cell has
# not been run in the current kernel session this line raises NameError, so
# run the notebook top to bottom after a kernel restart.
y_pred_optimized = best_model.predict(X_test)


NameError: name 'best_model' is not defined

In [None]:
# Confusion matrix of true vs. predicted purchase labels on the test split
conf_matrix = confusion_matrix(y_test, y_pred_optimized)

# Render the matrix as an annotated heatmap on an explicit Axes
class_names = ['No Purchase', 'Purchase']
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

In [None]:
# Visualizing the cross-validated accuracy over the sampled (C, gamma) values
# cv_results_ may hold the sampled parameters as object dtype; casting to float
# makes the heatmap axes sort numerically.
results = pd.DataFrame(grid_search.cv_results_).astype(
    {"param_C": float, "param_gamma": float}
)

# pivot_table (rather than pivot) is required here:
#   * the search also varies `kernel`, so the same (C, gamma) pair can occur
#     more than once and DataFrame.pivot would raise on the duplicate entries;
#   * DataFrame.pivot no longer accepts positional arguments in pandas >= 2.0.
# aggfunc="max" shows the best kernel's score for each (C, gamma) pair.
# Pairs that the randomised search never sampled are left blank in the plot.
scores_matrix = results.pivot_table(
    index="param_C",
    columns="param_gamma",
    values="mean_test_score",
    aggfunc="max",
)

plt.figure(figsize=(12, 8))
sns.heatmap(scores_matrix, annot=True, cmap="YlGnBu")
plt.title('RandomizedSearchCV Results: Best Mean CV Accuracy per (C, Gamma)')
plt.xlabel('Gamma')
plt.ylabel('C')
plt.show()

In [None]:
# Score the optimized model's test-set predictions. These two values were
# previously printed without ever being computed, which raised NameError.
optimized_accuracy = accuracy_score(y_test, y_pred_optimized)
optimized_classification_report = classification_report(y_test, y_pred_optimized)

# Output the results
print("Optimized Accuracy:", optimized_accuracy)
print("\nOptimized Classification Report:\n", optimized_classification_report)
print("\nBest Parameters Found:\n", best_params)