In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

### Clean Chess Dataset

In [2]:
# Data source: https://www.kaggle.com/datasnaek/chess
# Read the raw game records from disk and preview the first five rows.
chess_df = pd.read_csv(filepath_or_buffer='Data/games.csv')
chess_df.head(n=5)

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4
2,mIICvQHh,True,1504130000000.0,1504130000000.0,61,mate,white,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3
3,kWKvrqYL,True,1504110000000.0,1504110000000.0,61,mate,white,20+0,daniamurashov,1439,adivanov2009,1454,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,D02,Queen's Pawn Game: Zukertort Variation,3
4,9tXo1AUZ,True,1504030000000.0,1504030000000.0,95,mate,white,30+3,nik221107,1523,adivanov2009,1469,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,C41,Philidor Defense,5


In [3]:
# Columns retained for modelling, with the derived target last.
chess_keep_cols = [
    'rated', 'created_at', 'last_move_at', 'turns', 'victory_status',
    'white_rating', 'black_rating', 'opening_eco', 'opening_ply', 'winner_white',
]

# Binary target: True when the recorded winner is 'white', False for any other value.
chess_df['winner_white'] = chess_df['winner'].eq('white')
chess_df = chess_df.loc[:, chess_keep_cols]
chess_df.head()

Unnamed: 0,rated,created_at,last_move_at,turns,victory_status,white_rating,black_rating,opening_eco,opening_ply,winner_white
0,False,1504210000000.0,1504210000000.0,13,outoftime,1500,1191,D10,5,True
1,True,1504130000000.0,1504130000000.0,16,resign,1322,1261,B00,4,False
2,True,1504130000000.0,1504130000000.0,61,mate,1496,1500,C20,3,True
3,True,1504110000000.0,1504110000000.0,61,mate,1439,1454,D02,3,True
4,True,1504030000000.0,1504030000000.0,95,mate,1523,1469,C41,5,True


In [4]:
# Separate the feature columns from the 'winner_white' target column.
chess_df_X = chess_df.drop('winner_white', axis='columns')
chess_df_y = chess_df.loc[:, 'winner_white']

In [5]:
# One-hot encode the categorical chess features; every other column is
# passed through to the output unchanged.
chess_X_cat_col = ['rated', 'victory_status', 'opening_eco']
chess_one_hot_step = ('one-hot', OneHotEncoder(drop='if_binary'), chess_X_cat_col)
chess_X_col_transform = ColumnTransformer(
    transformers=[chess_one_hot_step],
    remainder='passthrough',
)
chess_X = chess_X_col_transform.fit_transform(chess_df_X)

# Encode the target column as integer class labels.
chess_y_label_encoder = LabelEncoder()
chess_y = chess_y_label_encoder.fit_transform(chess_df_y)

### Clean Mushrooms Dataset

class: edible(e), poisonous(p)

cap-shape: bell(b), conical(c), convex(x), flat(f), knobbed(k), sunken(s)

cap-surface: fibrous(f), grooves(g), scaly(y), smooth(s)

cap-color: brown(n), buff(b), cinnamon(c), gray(g), green(r), pink(p), purple(u), red(e), white(w), yellow(y)

bruises: bruises(t), no(f)

odor: almond(a), anise(l), creosote(c), fishy(y), foul(f), musty(m), none(n), pungent(p), spicy(s)

gill-attachment: attached(a), descending(d), free(f), notched(n)

gill-spacing: close(c), crowded(w), distant(d)

gill-size: broad(b), narrow(n)

gill-color: black(k), brown(n), buff(b), chocolate(h), gray(g), green(r), orange(o), pink(p), purple(u), red(e), white(w), yellow(y)

stalk-shape: enlarging(e), tapering(t)

stalk-root: bulbous(b), club(c), cup(u), equal(e), rhizomorphs(z), rooted(r), missing(?)

stalk-surface-above-ring: fibrous(f), scaly(y), silky(k), smooth(s)

stalk-surface-below-ring: fibrous(f), scaly(y), silky(k), smooth(s)

stalk-color-above-ring: brown(n), buff(b), cinnamon(c), gray(g), orange(o), pink(p), red(e), white(w), yellow(y)

stalk-color-below-ring: brown(n), buff(b), cinnamon(c), gray(g), orange(o), pink(p), red(e), white(w), yellow(y)

veil-type: partial(p), universal(u)

veil-color: brown(n), orange(o), white(w), yellow(y)

ring-number: none(n), one(o), two(t)

ring-type: cobwebby(c), evanescent(e), flaring(f), large(l), none(n), pendant(p), sheathing(s), zone(z)

spore-print-color: black(k), brown(n), buff(b), chocolate(h), green(r), orange(o), purple(u), white(w), yellow(y)

population: abundant(a), clustered(c), numerous(n), scattered(s), several(v), solitary(y)

habitat: grasses(g), leaves(l), meadows(m), paths(p), urban(u), waste(w), woods(d)

In [6]:
#https://www.kaggle.com/uciml/mushroom-classification
# Read the raw mushroom records from disk and preview the first five rows.
shrooms = pd.read_csv(filepath_or_buffer='Data/mushrooms.csv')
shrooms.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [7]:
# Separate the feature columns from the 'class' target column.
shrooms_df_X = shrooms.drop('class', axis='columns')
shrooms_df_y = shrooms.loc[:, 'class']

In [8]:
# All mushroom feature columns are treated as categorical and one-hot encoded.
shrooms_df_X_cat_col = shrooms_df_X.columns
shrooms_one_hot_step = ('one-hot', OneHotEncoder(), shrooms_df_X_cat_col)
shrooms_X_col_transform = ColumnTransformer(transformers=[shrooms_one_hot_step])
shrooms_X = shrooms_X_col_transform.fit_transform(shrooms_df_X)

# Encode the target column as integer class labels.
shrooms_y_label_encoder = LabelEncoder()
shrooms_y = shrooms_y_label_encoder.fit_transform(shrooms_df_y)

### Perform Trials

In [9]:
# Schema of the per-trial results table: identifying fields first, then the
# evaluation metrics recorded for each (dataset, model, trial) combination.
results_columns = [
    'dataset', 'model', 'trial', 'best_params',
    'accuracy', 'precision', 'recall', 'specificity',
    'f1', 'auc', 'logloss',
]

# Start from an empty frame that already carries the expected columns.
results = pd.DataFrame(data=None, columns=results_columns)

In [10]:
# Hyperparameter grids searched for each model. Each grid is a list of dicts
# whose keys are estimator parameter names and whose values are the candidate
# settings. Grids for models that are currently disabled are left empty.
tree_params = []
perceptron_params = []
forest_params = []

log_reg_params = [
    dict(solver=['liblinear'], penalty=['l1', 'l2'], C=[1, 10]),
]

svc_params = [
    dict(kernel=['linear'], C=[1, 10]),
    dict(kernel=['rbf'], C=[1, 10], gamma=[0.001, 0.0001]),
]

knn_params = [
    dict(
        n_neighbors=[1, 2, 3, 4, 5, 6, 7, 8, 9],
        weights=['uniform', 'distance'],
        algorithm=['auto'],
        p=[1, 2, 3],
    ),
]

# Registry of models to evaluate: name -> (unfitted estimator, parameter grid).
# Entries that are commented out are excluded from the trials.
models = {
#     'tree': (DecisionTreeClassifier(), tree_params),
    'log_reg': (LogisticRegression(), log_reg_params),
#     'perceptron': (Perceptron(), perceptron_params),
    'svm': (SVC(), svc_params),
    'knn': (KNeighborsClassifier(), knn_params),
#     'forest': (RandomForestClassifier(), forest_params)
}

In [11]:
# First batch of datasets to evaluate: name -> (feature matrix, label vector).
datasets0 = dict(
    chess=(chess_X, chess_y),
    shrooms=(shrooms_X, shrooms_y),
)

# datasets1 = {
#     'data2': (data2_X, data2_y),
#     'data3': (data3_X, data3_y)
# }

# datasets2 = {
#     'data4': (data4_X, data4_y),
#     'data5': (data5_X, data5_y)
# }

In [None]:
datasets = datasets0

# Per-trial result rows are collected in a plain list and converted to a
# DataFrame once, after the loops. The previous `results.append(...)` call
# returned a new frame that was discarded (so nothing was ever stored), and
# DataFrame.append no longer exists in pandas >= 2.0. If the run is
# interrupted, the rows gathered so far remain available in `trial_rows`.
trial_rows = []

# for each dataset
for dataset_name, (data_X, data_y) in datasets.items():

    # perform 5 trials on each dataset
    for trial_count in range(5):

        # draw 5000 samples (without replacement) for the training set; all
        # remaining samples form the test set. Using the trial number as the
        # seed gives every trial a different but reproducible split.
        X_train, X_test, y_train, y_test = train_test_split(
            data_X, data_y, train_size=5000, random_state=trial_count)

        # perform trials on each model
        for model_name, (model, model_params) in models.items():

            # grid search with 5-fold cross-validation on the training set
            search = GridSearchCV(model, model_params, cv=5, verbose=True, n_jobs=-1)

            # find the best parameters for the model
            # grid search automatically refits a model on the entire training
            # set using the best parameters (refit=True is the default)
            search.fit(X_train, y_train)

            # hard class predictions for the threshold-based metrics
            y_pred = search.predict(X_test)

            # AUC and log loss are defined on continuous scores, not on hard
            # 0/1 predictions. Use positive-class probabilities when the
            # fitted model exposes them; otherwise rank by the decision
            # function for AUC and record log loss as missing, because it
            # requires calibrated probabilities.
            if hasattr(search, 'predict_proba'):
                y_score = search.predict_proba(X_test)[:, 1]
                logloss = log_loss(y_test, y_score, labels=[0, 1])
            else:
                y_score = search.decision_function(X_test)
                logloss = np.nan

            # use metrics to evaluate model performance on the test set
            trial_rows.append({
                'dataset': dataset_name,
                'model': model_name,
                'trial': trial_count,
                'best_params': search.best_params_,
                'accuracy': accuracy_score(y_test, y_pred),
                'precision': precision_score(y_test, y_pred),
                'recall': recall_score(y_test, y_pred),
                # specificity is the recall of the negative class
                'specificity': recall_score(y_test, y_pred, pos_label=0),
                'f1': f1_score(y_test, y_pred),
                'auc': roc_auc_score(y_test, y_score),
                'logloss': logloss
            })

# add the new rows to the results dataframe, keeping any rows that earlier
# runs of this cell (e.g. on another dataset batch) already stored
new_results = pd.DataFrame(trial_rows, columns=results.columns)
if results.empty:
    results = new_results
else:
    results = pd.concat([results, new_results], ignore_index=True)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   13.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [None]:
# combine datasets

In [None]:
# compute averages

In [None]:
# create table 1

In [None]:
# create table 2