In [None]:
import pandas as pds
import numpy as np
import os
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import sklearn.inspection as skli
import seaborn as sns

In [None]:
# Read and join tables
# The data directory is assembled from separate path components so that the
# separator is chosen by the operating system; a literal backslash-separated
# path would only resolve on Windows.
data_dir = os.path.join(os.getcwd(), "data", "v1")

leaderboards = pds.read_csv(os.path.join(data_dir, "leaderboards-data.csv"), parse_dates=['date', 'verifiedDate'])
games = pds.read_csv(os.path.join(data_dir, "games-data.csv"), parse_dates=['releaseDate', 'createdDate'])
users = pds.read_csv(os.path.join(data_dir, "users-data.csv"), parse_dates=['signupDate'])
categories = pds.read_csv(os.path.join(data_dir, "categories-data.csv"))

# Left joins keep every leaderboard row even when no game/user/category matches.
# The suffixes disambiguate column names that occur on both sides of a join.
data_frame = leaderboards.merge(games, left_on="gameID", right_on="ID", how="left", suffixes=('_df1', '_gme'))
# NOTE(review): 'players' is split on commas in the following step, so rows
# listing several players presumably find no match in users.ID here - confirm
# that this is intended.
data_frame = data_frame.merge(users, left_on="players", right_on="ID", how="left", suffixes=('_df2', '_usr'))
data_frame = data_frame.merge(categories, left_on="categoryID", right_on="ID", how="left", suffixes=('_df3', '_cat'))

# Remove the join keys and the name/URL/rules columns that are not kept as features.
data_frame = data_frame.drop(columns=['ID_df2', 'name_df2', 'URL', 'name_usr', 'parentGameID', 'ID_usr', 'name', 'rules', 'variablesAndValues', 'ID'])

display(data_frame)
# Split rows whose 'players' value holds several comma-separated players into
# one row per player.
# Every column except 'players' is moved into the index so that it is carried
# along unchanged while the 'players' strings are reshaped.
cols = list(data_frame.columns.drop('players' ))
data_frame = (data_frame
   .set_index(cols)
   # Only 'players' is left as a column; stacking turns the frame into a Series
   # whose innermost index level is that column name.
   .stack()
   # One output column per comma-separated piece.
   .str.split(',', expand=True)
   # The piece columns become a new innermost index level.
   .stack()
   # Pivot the column-name level (second from last) back into a 'players' column.
   .unstack(-2)
   # Discard the piece-position level.
   .reset_index(-1, drop=True)
   # Turn the remaining index levels back into ordinary columns.
   .reset_index()
)

# Store every object-typed column as a pandas categorical.
for col in data_frame.columns[data_frame.dtypes == 'object']:
    data_frame[col] = data_frame[col].astype('category')

In [None]:
# Print Table

# Remove the cap on displayed columns so wide frames are rendered in full.
pds.options.display.max_columns = None

display(data_frame)
print(data_frame.dtypes)

In [None]:
# Preprocessing

def replaceNaNsWithMostFrequent(value):
    """Fill the missing entries of a Series with its most frequent value.

    Parameters
    ----------
    value : pandas.Series
        Column to fill (each column arrives as a Series when this function is
        used with ``DataFrame.apply``).

    Returns
    -------
    pandas.Series
        New Series with missing entries replaced by the first mode. When the
        column has no non-missing values there is no mode to fill with, and an
        unchanged copy is returned instead of raising.
    """
    modes = value.mode()
    if modes.empty:
        # All entries are missing: nothing sensible to fill with.
        return value.copy()
    # Positional access, so the lookup does not depend on the index labels.
    return value.fillna(modes.iloc[0])

def replaceNaNsWithMean(value):
    """Return ``value`` with its missing entries replaced by the column mean.

    ``value`` is expected to be a numeric pandas Series; the mean is computed
    by ``Series.mean`` and passed to ``Series.fillna``.
    """
    column_mean = value.mean()
    filled = value.fillna(column_mean)
    return filled

def mapValuesToIntegers(value):
    """Return the integer category codes of a categorical Series.

    ``value`` must have a categorical dtype, because the codes are read through
    the ``.cat`` accessor.
    """
    categorical = value.cat
    return categorical.codes

def convertToUNIXTimestamp(date):
    """Convert a datetime Series to UNIX timestamps in whole seconds.

    Missing dates are temporarily filled with the epoch ("1970-01-01") so the
    column can be cast to int64. The integer view is divided by 10**9, i.e. it
    is treated as nanoseconds. Every resulting 0 is then mapped back to NaN, so
    missing dates - and any real timestamp within the first second of the
    epoch - come out as NaN.

    Parameters
    ----------
    date : pandas.Series
        Datetime column.

    Returns
    -------
    pandas.Series
        Seconds since the epoch, with NaN where the input was missing.
    """
    seconds = date.fillna("1970-01-01").astype('int64') // (10**9)
    # np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
    return seconds.replace(0, np.nan)

# Count missing values per column (runID, levelID and emulated are left out)
# and keep only the columns that actually have some.
vals = data_frame.drop(columns=['runID', 'levelID', 'emulated']).isna().sum()
vals = vals[vals != 0]
cols = vals.index

plt.figure(figsize=(15, 6))
plt.bar(cols, vals, width=0.3)
plt.xlabel("Column label (Columns with 0 NAs not shown)")
plt.ylabel("Amount of NAs in the dataset")
plt.show()

# Date columns are turned into UNIX timestamps (seconds) first ...
date_columns = ['date', 'verifiedDate', 'releaseDate', 'createdDate', 'signupDate']
data_frame[date_columns] = data_frame[date_columns].apply(convertToUNIXTimestamp)

# ... then they and numRuns get their missing values replaced by the column mean.
mean_filled_columns = date_columns + ['numRuns']
data_frame[mean_filled_columns] = data_frame[mean_filled_columns].apply(replaceNaNsWithMean)

# Categorical columns: fill gaps with the most frequent value, then replace the
# categories by their integer codes.
categorical_columns = data_frame.columns[data_frame.dtypes == 'category']
for col in categorical_columns:
    data_frame[[col]] = (
        data_frame[[col]]
        .apply(replaceNaNsWithMostFrequent)
        .apply(mapValuesToIntegers)
    )

display(data_frame)
print(data_frame.dtypes)

In [None]:
# Normalization

# Min-max scale every non-boolean column into [0, 1] on a deep copy, leaving
# data_frame itself untouched.
minmax = data_frame.copy(deep=True)

for col in data_frame.columns[data_frame.dtypes != 'bool']:
    minimum = data_frame[col].min()
    value_range = data_frame[col].max() - minimum
    shifted = data_frame[col] - minimum
    if value_range == 0:
        # Constant column: dividing would give 0/0 = NaN for every row. The
        # shifted values are already 0 (missing entries stay NaN), so they are
        # used directly.
        minmax[col] = shifted.astype('float64')
    else:
        minmax[col] = shifted / value_range

# Show both frames with their remaining missing-value counts for comparison.
display(data_frame)
print(data_frame.isnull().sum())
display(minmax)
print(minmax.isnull().sum())

In [None]:
# Test-Train Split + Downsampling
rdm_seed = 404

# Rows with emulated == False are treated as the majority class and rows with
# emulated == True as the minority class.
minmax_majority = minmax.loc[minmax['emulated'] == False]
minmax_minority = minmax.loc[minmax['emulated'] == True]

# Draw, without replacement, as many majority rows as there are minority rows.
minority_count = len(minmax_minority)
mm_downsampled = resample(
    minmax_majority,
    replace=False,
    n_samples=minority_count,
    random_state=rdm_seed,
)

# From here on minmax holds the balanced data set.
minmax = pds.concat([mm_downsampled, minmax_minority])

# Display new class counts
class_counts = np.unique(minmax['emulated'], return_counts=True)
print(class_counts)

# The label column is kept separately before it is removed from the features.
target = minmax['emulated']

# Number of rows (entries) per class before and after downsampling. len() is
# used because DataFrame.size is rows * columns, which would overstate every
# bar by the number of columns.
class_sizes = [
    len(minmax_majority),
    len(minmax_minority),
    len(minmax[minmax['emulated'] == False]),
    len(minmax[minmax['emulated'] == True]),
]

plt.figure(figsize= (15, 6))
plt.bar(["Non-Emulated Pre-Downsampling", "Emulated Pre-Downsampling", "Non-Emulated Post-Downsampling", "Emulated Post-Downsampling"],
        class_sizes,
        width=0.5, color=['lightblue', 'orange', 'lightblue', 'orange'])
plt.ylabel("Amount of Entries in Dataset")
plt.show()

# Remove runID, levelID and the label column so only features remain.
minmax = minmax.drop(columns=['runID', 'levelID', 'emulated'])

# Shuffled 67/33 split, seeded for repeatability.
split_parts = train_test_split(
    minmax,
    target,
    test_size=0.33,
    random_state=rdm_seed,
    shuffle=True,
)
minmax_train, minmax_test, mm_target_train, mm_target_test = split_parts

# Show the four parts in the order: train features, test features, train
# labels, test labels.
for part in split_parts:
    display(part)

In [None]:
# Display names of the models; position i matches slot i of accuracyArr.
modelNames = [
    "KNN",
    "Multinomial NB",
    "RandomForest",
    "Logistic Regression",
    "Decision Tree",
    "AdaBoost",
    "Multi-Layered Perceptron",
    "Bagging",
]
# One accuracy slot per model, zero until the model's statistics cell has run.
accuracyArr = [0] * len(modelNames)

In [None]:
# KNN
k = 5
# fit() returns the fitted estimator, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=k).fit(minmax_train, mm_target_train)
target_pred = knn.predict(minmax_test)

In [None]:
# KNN Post-Run Statistics

# Percentage of correctly classified test rows, stored in the KNN slot (0) of
# accuracyArr for the final model comparison.
accuracy = 100 * accuracy_score(mm_target_test, target_pred)
accuracyArr[0] = accuracy

print(f'Accuracy of model with k = {k}: {accuracy}%\n')
report = classification_report(mm_target_test, target_pred)
print(f'Classification Report:\n {report}')

# Confusion matrix of true vs. predicted labels, drawn with readable class names.
conMatrix = confusion_matrix(mm_target_test, target_pred)
matrix_display = ConfusionMatrixDisplay(conMatrix, display_labels=["Not Emulated", "Emulated"])
matrix_display.plot()
plt.show()

In [None]:
# Multinomial Naive Bayes

# fit() returns the fitted estimator, so construction and training are chained.
MNNB = MultinomialNB().fit(minmax_train, mm_target_train)
target_pred = MNNB.predict(minmax_test)

In [None]:
# Multinomial NB Post-Run Statistics

# Percentage of correctly classified test rows, stored in the Multinomial NB
# slot (1) of accuracyArr for the final model comparison.
accuracy = 100 * accuracy_score(mm_target_test, target_pred)
accuracyArr[1] = accuracy

print(f'Accuracy of model: {accuracy}%\n')
report = classification_report(mm_target_test, target_pred)
print(f'Classification Report:\n {report}')

# Confusion matrix of true vs. predicted labels, drawn with readable class names.
conMatrix = confusion_matrix(mm_target_test, target_pred)
matrix_display = ConfusionMatrixDisplay(conMatrix, display_labels=["Not Emulated", "Emulated"])
matrix_display.plot()
plt.show()

In [None]:
# Random Forest

# 200 trees, seeded, with class weights balanced by class frequency.
# fit() returns the fitted estimator, so construction and training are chained.
randomForest = RandomForestClassifier(
    n_estimators=200,
    random_state=rdm_seed,
    class_weight='balanced',
).fit(minmax_train, mm_target_train)
target_pred = randomForest.predict(minmax_test)

In [None]:
# Random Forest Post-Run Statistics

# Percentage of correctly classified test rows, stored in the RandomForest
# slot (2) of accuracyArr for the final model comparison.
accuracy = 100 * accuracy_score(mm_target_test, target_pred)
accuracyArr[2] = accuracy

print(f'Accuracy of model: {accuracy}%\n')
report = classification_report(mm_target_test, target_pred)
print(f'Classification Report:\n {report}')

# Confusion matrix of true vs. predicted labels, drawn with readable class names.
conMatrix = confusion_matrix(mm_target_test, target_pred)
matrix_display = ConfusionMatrixDisplay(conMatrix, display_labels=["Not Emulated", "Emulated"])
matrix_display.plot()
plt.show()

In [None]:
# Logistic Regression

# Seeded, with class weights balanced by class frequency.
# fit() returns the fitted estimator, so construction and training are chained.
logReg = LogisticRegression(
    random_state=rdm_seed,
    class_weight='balanced',
).fit(minmax_train, mm_target_train)
target_pred = logReg.predict(minmax_test)

In [None]:
# Logistic Regression Post-Run Statistics

# Percentage of correctly classified test rows, stored in the Logistic
# Regression slot (3) of accuracyArr for the final model comparison.
accuracy = 100 * accuracy_score(mm_target_test, target_pred)
accuracyArr[3] = accuracy

print(f'Accuracy of model: {accuracy}%\n')
report = classification_report(mm_target_test, target_pred)
print(f'Classification Report:\n {report}')

# Confusion matrix of true vs. predicted labels, drawn with readable class names.
conMatrix = confusion_matrix(mm_target_test, target_pred)
matrix_display = ConfusionMatrixDisplay(conMatrix, display_labels=["Not Emulated", "Emulated"])
matrix_display.plot()
plt.show()

In [None]:
# Decision Tree

# Seeded, with class weights balanced by class frequency.
# fit() returns the fitted estimator, so construction and training are chained.
dTree = DecisionTreeClassifier(
    random_state=rdm_seed,
    class_weight='balanced',
).fit(minmax_train, mm_target_train)
target_pred = dTree.predict(minmax_test)

In [None]:
# Decision Tree Post-Run Statistics

# Percentage of correctly classified test rows, stored in the Decision Tree
# slot (4) of accuracyArr for the final model comparison.
accuracy = 100 * accuracy_score(mm_target_test, target_pred)
accuracyArr[4] = accuracy

print(f'Accuracy of model: {accuracy}%\n')
report = classification_report(mm_target_test, target_pred)
print(f'Classification Report:\n {report}')

# Confusion matrix of true vs. predicted labels, drawn with readable class names.
conMatrix = confusion_matrix(mm_target_test, target_pred)
matrix_display = ConfusionMatrixDisplay(conMatrix, display_labels=["Not Emulated", "Emulated"])
matrix_display.plot()
plt.show()

In [None]:
# AdaBoost

# Seeded AdaBoost using the SAMME boosting algorithm.
# fit() returns the fitted estimator, so construction and training are chained.
adaBoost = AdaBoostClassifier(
    algorithm='SAMME',
    random_state=rdm_seed,
).fit(minmax_train, mm_target_train)
target_pred = adaBoost.predict(minmax_test)

In [None]:
# AdaBoost Post-Run Statistics

# Percentage of correctly classified test rows, stored in the AdaBoost slot (5)
# of accuracyArr for the final model comparison.
accuracy = 100 * accuracy_score(mm_target_test, target_pred)
accuracyArr[5] = accuracy

print(f'Accuracy of model: {accuracy}%\n')
report = classification_report(mm_target_test, target_pred)
print(f'Classification Report:\n {report}')

# Confusion matrix of true vs. predicted labels, drawn with readable class names.
conMatrix = confusion_matrix(mm_target_test, target_pred)
matrix_display = ConfusionMatrixDisplay(conMatrix, display_labels=["Not Emulated", "Emulated"])
matrix_display.plot()
plt.show()

In [None]:
# Multi-Layered Perceptron

# Seeded network that may train for up to 1000 iterations.
# fit() returns the fitted estimator, so construction and training are chained.
MLP = MLPClassifier(
    random_state=rdm_seed,
    max_iter=1000,
).fit(minmax_train, mm_target_train)
target_pred = MLP.predict(minmax_test)

In [None]:
# Multi-Layered Perceptron Post-Run Statistics

# Percentage of correctly classified test rows, stored in the Multi-Layered
# Perceptron slot (6) of accuracyArr for the final model comparison.
accuracy = 100 * accuracy_score(mm_target_test, target_pred)
accuracyArr[6] = accuracy

print(f'Accuracy of model: {accuracy}%\n')
report = classification_report(mm_target_test, target_pred)
print(f'Classification Report:\n {report}')

# Confusion matrix of true vs. predicted labels, drawn with readable class names.
conMatrix = confusion_matrix(mm_target_test, target_pred)
matrix_display = ConfusionMatrixDisplay(conMatrix, display_labels=["Not Emulated", "Emulated"])
matrix_display.plot()
plt.show()

In [None]:
#Bagging Classifier

# Seeded like the other models: without random_state the bootstrap samples
# differ on every run, so neither the accuracy nor the permutation importances
# computed from this classifier would be reproducible.
bgc = BaggingClassifier(random_state=rdm_seed)
bgc.fit(minmax_train, mm_target_train)
target_pred = bgc.predict(minmax_test)

In [None]:
#Bagging Classifier Statistics

# Percentage of correctly classified test rows, stored in the Bagging slot (7)
# of accuracyArr for the final model comparison.
accuracy = accuracy_score(mm_target_test,target_pred) * 100
accuracyArr[7] = accuracy
conMatrix = confusion_matrix(mm_target_test, target_pred)

print(f'Accuracy of model: {accuracy}%\n')
print(f'Classification Report:\n {classification_report(mm_target_test, target_pred)}')
ConfusionMatrixDisplay(conMatrix, display_labels=["Not Emulated", "Emulated"]).plot()
# Render the figure explicitly, as the other statistics cells do; otherwise the
# display object returned by plot() is the cell's last expression and its repr
# is echoed as cell output.
plt.show()

In [None]:
# Permutation importance of every feature for the bagging classifier on the
# test split (30 shuffles per feature, seeded).
importance = skli.permutation_importance(
    bgc,
    minmax_test,
    mm_target_test,
    n_repeats=30,
    random_state=rdm_seed,
)

# Feature positions ordered by ascending mean importance, so the most important
# feature ends up at the top of the horizontal bar chart.
mean_scores = importance.importances_mean
sorted_importances = mean_scores.argsort()
feature_names = minmax_train.columns[sorted_importances]

plt.barh(feature_names, mean_scores[sorted_importances])
plt.xlabel("Permutation Importance Score")
plt.show()

In [None]:
# Pairwise correlations between the columns of the preprocessed frame.
corr_matrix = data_frame.corr()

# True on and above the diagonal: those cells are hidden in the heatmap, so
# every column pair is shown once.
all_cells = np.ones(corr_matrix.shape, dtype=bool)
mask = np.triu(all_cells)

plt.figure(figsize=(10, 6))
heatmap = sns.heatmap(
    corr_matrix,
    mask=mask,
    vmin=-1,
    vmax=1,
    cmap='coolwarm',
)
heatmap.set_title('Correlation Matrix Heatmap')
plt.show()

In [None]:
print(accuracyArr)

# Order the models by ascending accuracy so the best one is drawn at the top of
# the horizontal bar chart.
accuracy_values = np.array(accuracyArr)
model_labels = np.array(modelNames)
sorted_accuracies = accuracy_values.argsort()

plt.barh(model_labels[sorted_accuracies], accuracy_values[sorted_accuracies], height=0.5)
plt.ylabel("Name of Model")
plt.xlabel("Model Accuracy (%)")
plt.show()