<h3>Dependency Installation</h3>

In [2615]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import SMOTE

<h3>Data Cleaning</h3>

<p>I condense a lot of the data within columns which on one hand means losing some data, but on the other hand means the data becomes significantly more balanced. I also combined related columns and converted everything into numerical data for fitting the model.</p>

In [2616]:
applications = pd.read_csv('Application Data.csv')
applications.dropna(subset=['Username'], inplace=True)

In [2617]:
def ConvertFound(found: str) -> str:
    if found == 'Shouts':
        return 'Shout'
    elif found == 'Friend' or found == 'Other':
        return 'Other'
    elif found == 'Forums' or found == 'Wynn Discord':
        return 'External'
    elif found == 'In-game Party' or found == 'Map':
        return 'In-Game'
    else:
        return np.NaN

In [2618]:
def ConvertActivity(activity: str) -> str:
    if activity == 'No life' or activity == 'Daily':
        return activity
    elif activity == 'Sometimes' or activity == 'Weekly':
        return 'Other'
    else:
        return np.NaN

In [2619]:
def ConvertPreviousGuild(previousGuild: str) -> bool:
    if previousGuild == 'Rando' or previousGuild == 'Allies' or previousGuild == 'Attacker' or previousGuild == 'Adonis':
        return True
    elif np.isnan(previousGuild):
        return False
    else:
        return np.NaN

In [2620]:
def GuildGrindContributions() -> pd.Series:
    guildGrind = ['Warring', 'Grinding', 'Guild Objective']
    cont_1 = applications['Contribute 1'].apply(lambda x: 1 if x in guildGrind else 0)
    cont_2 = applications['Contribute 2'].apply(lambda x: 1 if x in guildGrind else 0)
    cont_3 = applications['Contribute 3'].apply(lambda x: 1 if x in guildGrind else 0)
    cont_4 = applications['Contribute 4'].apply(lambda x: 1 if x in guildGrind else 0)
    
    return (cont_1 + cont_2 + cont_3 + cont_4).astype(bool)

In [2621]:
def CommunityContributions() -> pd.Series:
    community = ['Helping Others', 'Chatting']
    cont_1 = applications['Contribute 1'].apply(lambda x: 1 if x in community else 0)
    cont_2 = applications['Contribute 2'].apply(lambda x: 1 if x in community else 0)
    cont_3 = applications['Contribute 3'].apply(lambda x: 1 if x in community else 0)
    cont_4 = applications['Contribute 4'].apply(lambda x: 1 if x in community else 0)

    return (cont_1 + cont_2 + cont_3 + cont_4).astype(bool)

In [2622]:
def CountActivities() -> pd.Series:
    activity_1 = applications['Enjoy 1'].isna().astype(int)
    activity_2 = applications['Enjoy 2'].isna().astype(int)
    activity_3 = applications['Enjoy 3'].isna().astype(int)

    return activity_1 + activity_2 + activity_3

In [2623]:
def CountHobbies() -> pd.Series:
    hobby_1 = applications['Hobbies 1'].isna().astype(int)
    hobby_2 = applications['Hobbies 2'].isna().astype(int)
    hobby_3 = applications['Hobbies 3'].isna().astype(int)
    hobby_4 = applications['Hobbies 4'].isna().astype(int)

    return hobby_1 + hobby_2 + hobby_3 + hobby_4

In [2624]:
applications_ = pd.DataFrame()

applications_['Age'] = applications['Age'].fillna(round(applications['Age'].mean()))
applications_['Level'] = applications['Level']
applications_['Activity'] = applications['Activity'].apply(ConvertActivity).map({'Other': 0, 'Daily': 1, 'No life': 2})
applications_['Found'] = applications['Found'].apply(ConvertFound).map({'Other': 0, 'In-Game': 1, 'Shout': 2, 'External': 3})
applications_['Past Guild'] = applications['Prev guilds'].apply(ConvertPreviousGuild).map({False:0, True:1})

applications_['Guild Grind'] = GuildGrindContributions().map({False:0, True:1})
applications_['Community'] = CommunityContributions().map({False:0, True:1})

applications_['Num Enjoy'] = CountActivities()
applications_['Num Hobbies'] = CountHobbies()

applications_['Successful'] = applications['Successful'].map({'Yes': True, 'No': False, 'Trouble': False, 'Unknown': np.NaN}).map({False:0, True:1})

applications_.dropna(inplace=True)

In [2625]:
applications_.sample(n=20)

Unnamed: 0,Age,Level,Activity,Found,Past Guild,Guild Grind,Community,Num Enjoy,Num Hobbies,Successful
121,19.0,81.0,2,2,0,0,0,2,3,0
360,16.0,106.0,2,3,1,1,0,2,3,0
342,19.0,90.0,0,1,1,0,1,1,3,0
325,18.0,105.0,1,1,1,1,1,1,3,0
496,18.0,106.0,1,2,0,0,0,0,3,0
21,18.0,105.0,1,1,1,0,0,1,1,0
48,18.0,77.0,1,2,0,0,0,1,2,1
340,18.0,105.0,0,2,1,1,0,1,3,0
35,16.0,103.0,2,2,1,0,0,2,3,0
260,18.0,55.0,0,2,0,0,0,2,0,0


<h3> Model Creation </h3>

<p> I split my data, and because it was so imbalanced, I created a separate set of training data using SMOTE (creates new values of under-represented data using a KNN algorithm). I then created two models, one using the resampled data, and one using the original data. I then build a confusion matrix to get an idea of the predictions rather than basing everything off of accuracy due to the imbalanced nature of the dataset. </p>

In [2626]:
data = applications_[['Age', 'Level', 'Activity', 'Found', 'Past Guild', 'Guild Grind', 'Community', 'Num Enjoy', 'Num Hobbies']]
labels = applications_['Successful']
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.35)

train_data_rs, train_labels_rs = SMOTE().fit_resample(train_data, train_labels)

In [2627]:
model_rs = make_pipeline(StandardScaler(), SVC(class_weight='balanced', cache_size=2000))
model_rs.fit(train_data_rs, train_labels_rs)

tn, fp, fn, tp = confusion_matrix(y_pred=model_rs.predict(test_data), y_true=test_labels).ravel()
print(tn, fp, fn, tp)

print(model_rs.score(test_data, test_labels))

127 27 13 8
0.7714285714285715


In [2628]:
model = make_pipeline(StandardScaler(), SVC(class_weight='balanced', cache_size=2000))
model.fit(train_data, train_labels)

tn, fp, fn, tp = confusion_matrix(y_pred=model.predict(test_data), y_true=test_labels).ravel()
print(tn, fp, fn, tp)

print(model.score(test_data, test_labels))

108 46 13 8
0.6628571428571428
