In [2]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

### Steam Dataset

In [3]:
#https://data.world/craigkelly/steam-game-data
steam_df = pd.read_csv('games-features.csv')
steam_df.head()

Unnamed: 0,QueryID,ResponseID,QueryName,ResponseName,ReleaseDate,RequiredAge,DemoCount,DeveloperCount,DLCCount,Metacritic,...,LegalNotice,Reviews,SupportedLanguages,Website,PCMinReqsText,PCRecReqsText,LinuxMinReqsText,LinuxRecReqsText,MacMinReqsText,MacRecReqsText
0,10,10,Counter-Strike,Counter-Strike,Nov 1 2000,0,0,1,0,88,...,,,English French German Italian Spanish Simplifi...,,Minimum: 500 mhz processor 96mb ram 16mb video...,,Minimum: Linux Ubuntu 12.04 Dual-core from Int...,,Minimum: OS X Snow Leopard 10.6.3 1GB RAM 4GB...,
1,20,20,Team Fortress Classic,Team Fortress Classic,Apr 1 1999,0,0,1,0,0,...,,,English French German Italian Spanish,,Minimum: 500 mhz processor 96mb ram 16mb video...,,Minimum: Linux Ubuntu 12.04 Dual-core from Int...,,Minimum: OS X Snow Leopard 10.6.3 1GB RAM 4GB...,
2,30,30,Day of Defeat,Day of Defeat,May 1 2003,0,0,1,0,79,...,,,English French German Italian Spanish,http://www.dayofdefeat.com/,Minimum: 500 mhz processor 96mb ram 16mb video...,,Minimum: Linux Ubuntu 12.04 Dual-core from Int...,,Minimum: OS X Snow Leopard 10.6.3 1GB RAM 4GB...,
3,40,40,Deathmatch Classic,Deathmatch Classic,Jun 1 2001,0,0,1,0,0,...,,,English French German Italian Spanish,,Minimum: 500 mhz processor 96mb ram 16mb video...,,Minimum: Linux Ubuntu 12.04 Dual-core from Int...,,Minimum: OS X Snow Leopard 10.6.3 1GB RAM 4GB...,
4,50,50,Half-Life: Opposing Force,Half-Life: Opposing Force,Nov 1 1999,0,0,1,0,0,...,,,English French German Korean,,Minimum: 500 mhz processor 96mb ram 16mb video...,,Minimum: Linux Ubuntu 12.04 Dual-core from Int...,,Minimum: OS X Snow Leopard 10.6.3 1GB RAM 4GB...,


In [4]:
steam_df = steam_df[['Metacritic', 'GenreIsNonGame', 'GenreIsIndie', 'GenreIsAction',
                     'GenreIsAdventure', 'GenreIsCasual', 'GenreIsStrategy', 'GenreIsRPG',
                     'GenreIsSimulation', 'GenreIsEarlyAccess', 'GenreIsFreeToPlay', 'GenreIsSports',
                     'GenreIsRacing', 'GenreIsMassivelyMultiplayer', 'RecommendationCount', 'IsFree',
                     'FreeVerAvail', 'SteamSpyOwners', 'PlatformWindows', 'PlatformLinux',
                     'PlatformMac', 'CategorySinglePlayer', 'CategoryMultiplayer', 'CategoryCoop',
                     'CategoryMMO', 'CategoryInAppPurchase', 'CategoryIncludeSrcSDK', 'CategoryIncludeLevelEditor',
                     'CategoryVRSupport', 'MovieCount', 'ScreenshotCount', 'PriceFinal']]

#Drop games with no owners
steam_df = steam_df[steam_df['SteamSpyOwners'] > 0]

#Drop games not on windows (only 1 game)
steam_df = steam_df[steam_df['PlatformWindows'] == True]

In [5]:
steam_df.head()

Unnamed: 0,Metacritic,GenreIsNonGame,GenreIsIndie,GenreIsAction,GenreIsAdventure,GenreIsCasual,GenreIsStrategy,GenreIsRPG,GenreIsSimulation,GenreIsEarlyAccess,...,CategoryMultiplayer,CategoryCoop,CategoryMMO,CategoryInAppPurchase,CategoryIncludeSrcSDK,CategoryIncludeLevelEditor,CategoryVRSupport,MovieCount,ScreenshotCount,PriceFinal
0,88,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,0,13,9.99
1,0,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,0,5,4.99
2,79,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,0,5,4.99
3,0,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,0,4,4.99
4,0,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,0,5,4.99


In [6]:
steam_df['is_popular'] = steam_df['SteamSpyOwners'] > steam_df['SteamSpyOwners'].median()

In [7]:
steam_df_X = steam_df.drop(columns=['is_popular'])
steam_df_y = steam_df['is_popular']

In [8]:
steam_cat_X_col = ['GenreIsNonGame', 'GenreIsIndie', 'GenreIsAction',
       'GenreIsAdventure', 'GenreIsCasual', 'GenreIsStrategy', 'GenreIsRPG',
       'GenreIsSimulation', 'GenreIsEarlyAccess', 'GenreIsFreeToPlay',
       'GenreIsSports', 'GenreIsRacing', 'GenreIsMassivelyMultiplayer',
       'IsFree', 'FreeVerAvail',
       'PlatformWindows', 'PlatformLinux', 'PlatformMac',
       'CategorySinglePlayer', 'CategoryMultiplayer', 'CategoryCoop',
       'CategoryMMO', 'CategoryInAppPurchase', 'CategoryIncludeSrcSDK',
       'CategoryIncludeLevelEditor', 'CategoryVRSupport']

steam_col_transform = ColumnTransformer([('one-hot', OneHotEncoder(), steam_cat_X_col)], remainder='passthrough')
steam_X = steam_col_transform.fit_transform(steam_df_X)

steam_label_encoder = LabelEncoder()
steam_y = steam_label_encoder.fit_transform(steam_df_y)

### Mushrooms Dataset

class: edible(e), poisonous(p)

cap-shape: bell(b), conical(c), convex(x), flat(f), knobbed(k), sunken(s)

cap-surface: fibrous(f), grooves(g), scaly(y), smooth(s)

cap-color: brown(n), buff(b), cinnamon(c), gray(g), green(r), pink(p), purple(u), red(e), white(w), yellow(y)

bruises: bruises(t), no(f)

odor: almond(a), anise(l), creosote(c), fishy(y), foul(f), musty(m), none(n), pungent(p), spicy(s)

gill-attachment: attached(a), descending(d), free(f), notched(n)

gill-spacing: close(c), crowded(w), distant(d)

gill-size: broad(b), narrow(n)

gill-color: black(k), brown(n), buff(b), chocolate(h), gray(g), green(r), orange(o), pink(p), purple(u), red(e), white(w), yellow(y)

stalk-shape: enlarging(e), tapering(t)

stalk-root: bulbous(b), club(c), cup(u), equal(e), rhizomorphs(z), rooted(r), missing(?)

stalk-surface-above-ring: fibrous(f), scaly(y), silky(k), smooth(s)

stalk-surface-below-ring: fibrous(f), scaly(y), silky(k), smooth(s)

stalk-color-above-ring: brown(n), buff(b), cinnamon(c), gray(g), orange(o), pink(p), red(e), white(w), yellow(y)

stalk-color-below-ring: brown(n), buff(b), cinnamon(c), gray(g), orange(o), pink(p), red(e), white(w), yellow(y)

veil-type: partial(p), universal(u)

veil-color: brown(n), orange(o), white(w), yellow(y)

ring-number: none(n), one(o), two(t)

ring-type: cobwebby(c), evanescent(e), flaring(f), large(l), none(n), pendant(p), sheathing(s), zone(z)

spore-print-color: black(k), brown(n), buff(b), chocolate(h), green(r), orange(o), purple(u), white(w), yellow(y)

population: abundant(a), clustered(c), numerous(n), scattered(s), several(v), solitary(y)

habitat: grasses(g), leaves(l), meadows(m), paths(p), urban(u), waste(w), woods(d)

In [9]:
#https://www.kaggle.com/uciml/mushroom-classification
shrooms = pd.read_csv('mushrooms.csv')
shrooms.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [10]:
shrooms_df_X = shrooms.drop(columns=['class'])
shrooms_df_y = shrooms['class']

In [11]:
cat_X_col = ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
             'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
             'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
             'stalk-surface-below-ring', 'stalk-color-above-ring',
             'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
             'ring-type', 'spore-print-color', 'population', 'habitat']

col_transform = ColumnTransformer([('one-hot', OneHotEncoder(), cat_X_col)])
shrooms_X = col_transform.fit_transform(shrooms_df_X)

label_encoder = LabelEncoder()
shrooms_y = label_encoder.fit_transform(shrooms_df_y)

### Perform Trials

In [12]:
tree_params = {}

log_reg_params = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': [1, 10]},
]

perceptron_params = {}

svc_params = [
    {'kernel': ['linear'], 'C': [1, 10]},
    {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]}
]

knn_params = {}

forest_params = {}

models_params_grid = {
#     DecisionTreeClassifier(): tree_params,
    LogisticRegression(): log_reg_params,
#     Perceptron(): perceptron_params,
    SVC(): svc_params,
#     KNeighborsClassifier(): knn_params,
#     RandomForestClassifier(), forest_params
}

In [13]:
results_columns = ['dataset', 'trial', 'model', 'best_params', 'accuracy', 'precision', 'recall', 'specificity', 'f1', 'auc', 'logloss']
performance = pd.DataFrame(columns=results_columns)

In [14]:
def perform_trial(model, params, X_train, X_test, y_train, y_test):
    model_performance = pd.DataFrame(columns=results_columns)
    
    grid = GridSearchCV(model, params, cv=5, verbose=True, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)

    data = {
        'best_params': grid.best_params_,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'specificity': recall_score(y_test,y_pred, pos_label=0),
        'f1': f1_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_pred),
        'logloss': log_loss(y_test, y_pred)
    }
    model_performance = model_performance.append(data, ignore_index=True)
    
    return model_performance

In [14]:
def perform_trials_on_data(models_params_grid, data_X, data_y, num_trials):
    dataset_performance = pd.DataFrame(columns=results_columns)
    
    for i in range(0, num_trials):
        X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.2, random_state=i)
        
        for model in models_params_grid.keys():
            model_performance = perform_trial(model, models_params_grid[model], X_train, X_test, y_train, y_test)
            model_performance['trial'] = i;
            model_performance['model'] = str(model)
            dataset_performance = dataset_performance.append(model_performance, ignore_index=True)
            
    return dataset_performance

In [None]:
steam_performance = perform_trials_on_data(models_params_grid, steam_X, steam_y, 5)
steam_performance['dataset'] = 'steam'
performance.append(steam_performance, ignore_index=True)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.6min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


   dataset trial                 model  \
0      NaN     0  LogisticRegression()   

                                        best_params  accuracy  precision  \
0  {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}  0.818267   0.910891   

     recall  specificity       f1       auc   logloss  
0  0.700952     0.932961  0.79225  0.816957  6.276848  
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   52.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


   dataset trial                 model  \
0      NaN     0  LogisticRegression()   
1      NaN     0                 SVC()   

                                        best_params  accuracy  precision  \
0  {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}  0.818267   0.910891   
1                      {'C': 1, 'kernel': 'linear'}  1.000000   1.000000   

     recall  specificity       f1       auc       logloss  
0  0.700952     0.932961  0.79225  0.816957  6.276848e+00  
1  1.000000     1.000000  1.00000  1.000000  9.992007e-16  
Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [None]:
shrooms_performance = perform_trials_on_data(models_params_grid, shrooms_X, shrooms_y, 5)
shrooms_performance['dataset'] = 'shrooms'
performance.append(shrooms_performance, ignore_index=True)

In [None]:
performance