In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

import warnings
# warnings.filterwarnings('ignore')

### Clean Chess Dataset

In [13]:
# https://www.kaggle.com/datasnaek/chess
chess_df = pd.read_csv('Data/games.csv')
chess_df.head()

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4
2,mIICvQHh,True,1504130000000.0,1504130000000.0,61,mate,white,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3
3,kWKvrqYL,True,1504110000000.0,1504110000000.0,61,mate,white,20+0,daniamurashov,1439,adivanov2009,1454,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,D02,Queen's Pawn Game: Zukertort Variation,3
4,9tXo1AUZ,True,1504030000000.0,1504030000000.0,95,mate,white,30+3,nik221107,1523,adivanov2009,1469,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,C41,Philidor Defense,5


In [14]:
chess_df['winner_white'] = chess_df['winner'] == 'white'
# chess_df = chess_df[['rated', 'created_at', 'last_move_at', 'turns', 'victory_status',
#                      'white_rating', 'black_rating', 'opening_eco', 'opening_ply', 'winner_white']]
chess_df = chess_df[['rated', 'turns', 'victory_status',
                     'white_rating', 'black_rating', 'opening_eco', 'opening_ply', 'winner_white']]
chess_df.head()

Unnamed: 0,rated,turns,victory_status,white_rating,black_rating,opening_eco,opening_ply,winner_white
0,False,13,outoftime,1500,1191,D10,5,True
1,True,16,resign,1322,1261,B00,4,False
2,True,61,mate,1496,1500,C20,3,True
3,True,61,mate,1439,1454,D02,3,True
4,True,95,mate,1523,1469,C41,5,True


In [15]:
chess_df_X = chess_df.drop(columns=['winner_white'])
chess_df_y = chess_df['winner_white']

In [16]:
chess_X_cat_col = ['rated', 'victory_status', 'opening_eco']
chess_X = pd.get_dummies(columns=chess_X_cat_col, data=chess_df_X)

chess_y = chess_df_y.replace({True: 1, False: 0})

### Clean Mushrooms Dataset

class: edible(e), poisonous(p)

cap-shape: bell(b), conical(c), convex(x), flat(f), knobbed(k), sunken(s)

cap-surface: fibrous(f), grooves(g), scaly(y), smooth(s)

cap-color: brown(n), buff(b), cinnamon(c), gray(g), green(r), pink(p), purple(u), red(e), white(w), yellow(y)

bruises: bruises(t), no(f)

odor: almond(a), anise(l), creosote(c), fishy(y), foul(f), musty(m), none(n), pungent(p), spicy(s)

gill-attachment: attached(a), descending(d), free(f), notched(n)

gill-spacing: close(c), crowded(w), distant(d)

gill-size: broad(b), narrow(n)

gill-color: black(k), brown(n), buff(b), chocolate(h), gray(g), green(r), orange(o), pink(p), purple(u), red(e), white(w), yellow(y)

stalk-shape: enlarging(e), tapering(t)

stalk-root: bulbous(b), club(c), cup(u), equal(e), rhizomorphs(z), rooted(r), missing(?)

stalk-surface-above-ring: fibrous(f), scaly(y), silky(k), smooth(s)

stalk-surface-below-ring: fibrous(f), scaly(y), silky(k), smooth(s)

stalk-color-above-ring: brown(n), buff(b), cinnamon(c), gray(g), orange(o), pink(p), red(e), white(w), yellow(y)

stalk-color-below-ring: brown(n), buff(b), cinnamon(c), gray(g), orange(o), pink(p), red(e), white(w), yellow(y)

veil-type: partial(p), universal(u)

veil-color: brown(n), orange(o), white(w), yellow(y)

ring-number: none(n), one(o), two(t)

ring-type: cobwebby(c), evanescent(e), flaring(f), large(l), none(n), pendant(p), sheathing(s), zone(z)

spore-print-color: black(k), brown(n), buff(b), chocolate(h), green(r), orange(o), purple(u), white(w), yellow(y)

population: abundant(a), clustered(c), numerous(n), scattered(s), several(v), solitary(y)

habitat: grasses(g), leaves(l), meadows(m), paths(p), urban(u), waste(w), woods(d)

In [17]:
# https://www.kaggle.com/uciml/mushroom-classification
shrooms = pd.read_csv('Data/mushrooms.csv')
shrooms.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [18]:
shrooms_df_X = shrooms.drop(columns=['class'])
shrooms_df_y = shrooms['class']

In [19]:
shrooms_X = pd.get_dummies(data=shrooms_df_X)
shrooms_y = shrooms_df_y.replace({'e': 0, 'p': 1})

### Clean Cardio Dataset

Retrieved from the kaggle site https://www.kaggle.com/sulianova/cardiovascular-disease-dataset, this cardio dataset has 70000 samples and 12 variables, which were collected at the moment of medical examination. It contains a target variable that indicates the presence or absence of cardiovascular disease, as well as 11 features that might be associated with the presence of cardiovascular disease, such as age, gender, and blood pressure. There are 3 types of 11 input features:
- objective feature: factual information
- examination feature: results of medical examination
- subjective feature: information given by the patient

A more detailed description of 12 variables are shown below:

- age: objective feature, int (days)
- height: objective feature, int (cm)
- weight: objective feature, float (kg)
- gender: objective feature, categorical code, 1: male, 2:female
- ap_hi: systolic blood pressure, examination feature, int
- ap_lo: diastolic blood pressure, examination feature, int
- cholesterol: examination feature, categorical code, 1: normal, 2: above normal, 3: well above normal
- gluc: glucose, examination feature, categorical code, 1: normal, 2: above normal, 3: well above normal
- smoke: subjective feature, binary, 0: do not smoke, 1: smoke
- alco: alcohol intake, subjective feature, binary, 0: do not drink alcohol, 1: drink alcohol
- active: physical activity, subjective feature, binary, 0: not physically active, 1: physically active
- cardio: presence or absence of cardiovascular disease, target variable, binary, 0: disease not present, 1: disease present

For this dataset, we want use those 11 input features and apply machine learning algorithms to predict whether a person has cardiovascular disease or not.

In [19]:
# load the cardio dataset
cardio = pd.read_csv('data/cardio.csv', delimiter = ';')
# drop unnecessary column "id"
cardio = cardio.drop(columns = ['id'])
# convert age in days to age in years
cardio['age'] = cardio['age'].apply(lambda x: int(x/365))

In [20]:
# one hot encoding categorical input features stored in cate_cols
cate_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
cardio = pd.get_dummies(columns = cate_cols, data = cardio)

In [25]:
# a look at cleaned dataset
cardio.head()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cardio,gender_1,gender_2,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3,smoke_0,smoke_1,alco_0,alco_1,active_0,active_1
0,50,168,62.0,110,80,0,0,1,1,0,0,1,0,0,1,0,1,0,0,1
1,55,156,85.0,140,90,1,1,0,0,0,1,1,0,0,1,0,1,0,0,1
2,51,165,64.0,130,70,1,1,0,0,0,1,1,0,0,1,0,1,0,1,0
3,48,169,82.0,150,100,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1
4,47,156,56.0,100,60,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0


In [26]:
# split the cardio dataset into input features and labels 
cardio_X = cardio.drop(columns=['cardio']) # input features
cardio_y = cardio['cardio'] # true lables

### Clean Rain Dataset

### Clean BnB Dataset

### Clean Olympic Dataset

### Perform Trials

In [34]:
# Parameters for the model
tree_params = [
    {
        'max_depth': [2,3,4,5,7,10,13,15,18,None], 
        'min_samples_split':[2,3,5,7,10,15,20],
        'min_samples_leaf':[2,3,5,7,10,15,20]
    }
]

log_reg_params = [
    {
        'solver': ['saga'],
        'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'C': 10 **np.array(np.arange(-8, 5, 1), dtype='float32')
    }
]

perceptron_params = [
    {
        'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1]
    }
]

svc_params = [
    {
        'kernel': ['linear'],
        'C': 10 **np.array(np.arange(-3, 2, 2), dtype='float32')
    },
    {
        'kernel': ['poly'],
        'degree': [2, 3],
        'C': 10 **np.array(np.arange(-3, 2, 2), dtype='float32'),
    },
    {
        'kernel': ['rbf'],
        'C': 10 **np.array(np.arange(-3, 2, 2), dtype='float32'),
        'gamma': [0.001,0.01,0.1,1,2]
    }
]

knn_params = [
    {
        'n_neighbors': np.arange(1, 106, 4),
        'metric': ["euclidean", "manhattan", "minkowski"]
    }
]

forest_params = [
    {
        'n_estimators': [1024],
        'min_samples_split': [1, 2, 4, 6, 8, 12, 16, 20]
    }
]

models_without_svm = {
    'tree': (DecisionTreeClassifier(), tree_params),
    'log_reg': (LogisticRegression(), log_reg_params),
    'perceptron': (Perceptron(), perceptron_params),
    'knn': (KNeighborsClassifier(), knn_params),
    'forest': (RandomForestClassifier(), forest_params)
}

models_only_svm = {
    'svm': (SVC(), svc_params)
}

In [24]:
# perform trials on dataset
def perform_trials(dataset_name, models, data_X, data_y):
    results_columns = ['dataset', 'model', 'trial',
                       'train_accuracy', 'train_precision', 'train_recall', 'train_specificity',
                       'train_f1', 'train_auc', 'train_logloss',
                       'test_accuracy', 'test_precision', 'test_recall', 'test_specificity',
                       'test_f1', 'test_auc', 'test_logloss']
    num_trials = 5
    
    data_results = pd.DataFrame(columns=results_columns)

    # perform trials using each model
    for model_name in models.keys():
        
        model = models[model_name][0]
        model_params = models[model_name][1]
        
        train_metrics = np.zeros(7)
        test_metrics =  np.zeros(7)
        
        model_results = pd.DataFrame(columns=results_columns)
        
        # perform 5 trials on each dataset
        for trial_count in range(num_trials):
            # pick 5000 samples with replacement to be in the training set
            X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, train_size=5000, random_state=trial_count)
            
            # grid search with 5 k-folds
            search = GridSearchCV(model, model_params, cv=5, verbose=3, n_jobs=-1)
            
            # find the best parameters for the model
            # grid search automatically refits a model on the entire validation set using the best parameters
            search.fit(X_train, y_train)
            
            # use metrics to evaluate model performance on the test set
            y_train_pred = search.predict(X_train)
            y_test_pred = search.predict(X_test)
            
            # compute metrics
            model_result = {
                'dataset': dataset_name,
                'model': model_name,
                'trial': trial_count + 1,

                'train_accuracy': accuracy_score(y_train, y_train_pred),
                'train_precision': precision_score(y_train, y_train_pred),
                'train_recall': recall_score(y_train, y_train_pred),
                'train_specificity': recall_score(y_train, y_train_pred, pos_label=0),
                'train_f1': f1_score(y_train, y_train_pred),
                'train_auc': roc_auc_score(y_train, y_train_pred),
                'train_logloss': log_loss(y_train, y_train_pred),

                'test_accuracy': accuracy_score(y_test, y_test_pred),
                'test_precision': precision_score(y_test, y_test_pred),
                'test_recall': recall_score(y_test, y_test_pred),
                'test_specificity': recall_score(y_test, y_test_pred, pos_label=0),
                'test_f1': f1_score(y_test, y_test_pred),
                'test_auc': roc_auc_score(y_test, y_test_pred),
                'test_logloss': log_loss(y_test, y_test_pred)
            }
            
            # append model_result to the model_results dataframe
            model_results = model_results.append(model_result, ignore_index=True)
        
        # append model_results to data_results
        data_results = data_results.append(model_results, ignore_index=True)
        
        avg_result = {
            'dataset': dataset_name,
            'model': model_name,
            'trial': 'avg',
            
            'train_accuracy': model_results.train_accuracy.mean(),
            'train_precision': model_results.train_precision.mean(),
            'train_recall': model_results.train_recall.mean(),
            'train_specificity': model_results.train_specificity.mean(),
            'train_f1': model_results.train_f1.mean(),
            'train_auc': model_results.train_auc.mean(),
            'train_logloss': model_results.train_logloss.mean(),
            
            'test_accuracy': model_results.test_accuracy.mean(),
            'test_precision': model_results.test_precision.mean(),
            'test_recall': model_results.test_recall.mean(),
            'test_specificity': model_results.test_specificity.mean(),
            'test_f1': model_results.test_f1.mean(),
            'test_auc': model_results.test_auc.mean(),
            'test_logloss': model_results.test_logloss.mean()
        }
        
        # append avg_result to the data_results dataframe
        data_results = data_results.append(avg_result, ignore_index=True)
    
    return data_results

### Duy Results

In [None]:
chess_results_no_svm = perform_trials('chess', models_without_svm, chess_X, chess_y)
chess_results_no_svm.to_csv('results/chess_no_svm')
chess_results_no_svm

Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 928 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done 1504 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done 2208 tasks      | elapsed:   40.8s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   46.2s finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:   39.0s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   43.2s finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   45.6s finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:   43.6s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   48.3s finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:   43.7s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   48.4s finished


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:   24.3s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:   24.0s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:   24.4s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:   24.5s finished


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:   23.9s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.1s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.0s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.9s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.9s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.9s finished


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  5.7min


In [None]:
shrooms_results_no_svm = perform_trials('shrooms', models_without_svm, shrooms_X, shrooms_y)
shrooms_results_no_svm.to_csv('results/shrooms_no_svm')
shrooms_results_no_svm

In [None]:
chess_results_svm = perform_trials('chess', models_only_svm, chess_X, chess_y)
chess_results_svm.to_csv('results/chess_svm')
chess_results_svm

In [None]:
shrooms_results_svm = perform_trials('shrooms', models_only_svm, shrooms_X, shrooms_y)
shrooms_results_svm.to_csv('results/shrooms_svm')
shrooms_results_svm

### Results of Cardio Dataset

In [33]:
cardio_results_no_svm = perform_trials('cardio', models_without_svm, cardio_X, cardio_y)
cardio_results_no_svm

Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.7178    nan 0.7202    nan 0.7178    nan 0.7276    nan 0.7178
    nan 0.7138    nan 0.7178    nan 0.7162    nan 0.7178    nan 0.7166
    nan 0.7178    nan 0.7182    nan 0.7178    nan 0.7182    nan 0.7178
    nan 0.715     nan 0.7178    nan 0.7184    nan 0.7178    nan 0.7186
    nan 0.7178]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.6582    nan 0.6512    nan 0.6582    nan 0.6518    nan 0.6582
    nan 0.6478    nan 0.6582    nan 0.6546    nan 0.6582    nan 0.6552
    nan 0.6582    nan 0.654     nan 0.6582    nan 0.655     nan 0.6582
    nan 0.6578    nan 0.6582    nan 0.6562    nan 0.6582    nan 0.6568
    nan 0.6582]
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.7142    nan 0.716     nan 0.7142    nan 0.719     nan 0.7142
    nan 0.71      nan 0.7142    nan 0.7122    nan 0.7142    nan 0.7126
    nan 0.7142    nan 0.7126    nan 0.7142    nan 0.7142    nan 0.7142
    nan 0.7148    nan 0.7142    nan 0.7126    nan 0.7142    nan 0.7152
    nan 0.7142]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.6604    nan 0.6426    nan 0.6604    nan 0.6452    nan 0.6604
    nan 0.6462    nan 0.6604    nan 0.6546    nan 0.6604    nan 0.661
    nan 0.6604    nan 0.659     nan 0.6604    nan 0.6578    nan 0.6604
    nan 0.6594    nan 0.6604    nan 0.659     nan 0.6604    nan 0.6574
    nan 0.6604]


Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.7094    nan 0.7144    nan 0.7094    nan 0.7168    nan 0.7094
    nan 0.708     nan 0.7094    nan 0.714     nan 0.7094    nan 0.7106
    nan 0.7094    nan 0.7108    nan 0.7094    nan 0.709     nan 0.7094
    nan 0.7092    nan 0.7094    nan 0.7092    nan 0.7094    nan 0.709
    nan 0.7094]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,cardio,tree,1,0.7272,0.760951,0.657407,0.796105,0.7054,0.726756,9.42226,0.718092,0.751375,0.651762,0.784402,0.698032,0.718082,9.736833
1,cardio,tree,2,0.7308,0.735317,0.737049,0.724307,0.736182,0.730678,9.297947,0.716569,0.713967,0.720638,0.712518,0.717287,0.716578,9.789467
2,cardio,tree,3,0.7276,0.77355,0.656041,0.801545,0.709966,0.728793,9.408441,0.723985,0.753861,0.663574,0.784166,0.705842,0.72387,9.53332
3,cardio,tree,4,0.7372,0.768923,0.684774,0.790557,0.724413,0.737665,9.076873,0.724569,0.741616,0.688172,0.76087,0.713895,0.724521,9.513137
4,cardio,tree,5,0.735,0.787234,0.647831,0.82308,0.710762,0.735456,9.152846,0.726877,0.773141,0.641379,0.812196,0.701123,0.726788,9.433412
5,cardio,tree,avg,0.73156,0.765195,0.67662,0.787119,0.717344,0.73187,9.271673,0.722018,0.746792,0.673105,0.770831,0.707236,0.721968,9.601234
6,cardio,log_reg,1,0.7278,0.767,0.649356,0.805246,0.703292,0.727301,9.401533,0.719323,0.754983,0.649269,0.789355,0.698147,0.719312,9.694322
7,cardio,log_reg,2,0.649,0.661771,0.636578,0.661909,0.64893,0.649243,12.123243,0.647708,0.652171,0.629829,0.66551,0.640806,0.64767,12.167879
8,cardio,log_reg,3,0.7178,0.755425,0.657615,0.779992,0.703135,0.718803,9.746929,0.72,0.741978,0.672945,0.766875,0.705778,0.71991,9.670951
9,cardio,log_reg,4,0.6752,0.661395,0.72958,0.619855,0.693816,0.674717,11.218345,0.676892,0.653113,0.752719,0.601266,0.699387,0.676992,11.159904


In [35]:
perform_trials('cardio', models_without_svm, cardio_X, cardio_y)

Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.7066 0.5032 0.71      nan 0.7066 0.498  0.7086    nan 0.7066
 0.7098 0.7068    nan 0.7066 0.7068 0.7066    nan 0.7066 0.7064 0.7066
    nan 0.7066 0.7066 0.7066    nan 0.7066 0.7066 0.7066    nan 0.7066
 0.7066 0.7066    nan 0.7066 0.7066 0.7066    nan 0.7066 0.7066 0.7066
    nan 0.7066]


Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.647  0.5096 0.6476    nan 0.6472 0.5094 0.6472    nan 0.6476
 0.6324 0.6474    nan 0.6476 0.6478 0.6472    nan 0.6476 0.6472 0.6474
    nan 0.6474 0.6476 0.647     nan 0.6474 0.6476 0.6478    nan 0.6468
 0.6474 0.6476    nan 0.6476 0.6476 0.6474    nan 0.6474 0.647  0.6476
    nan 0.6472]


Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.6794 0.5082 0.6804    nan 0.6798 0.5082 0.6804    nan 0.6798
 0.6744 0.6798    nan 0.6798 0.6794 0.6802    nan 0.6794 0.6794 0.68
    nan 0.68   0.6796 0.6796    nan 0.6798 0.6796 0.6802    nan 0.6796
 0.6792 0.6798    nan 0.6802 0.6798 0.6796    nan 0.6802 0.6796 0.6802
    nan 0.68  ]


Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.6592 0.5044 0.6602    nan 0.6596 0.5046 0.6594    nan 0.6598
 0.6598 0.6596    nan 0.6596 0.6596 0.6596    nan 0.6598 0.6596 0.66
    nan 0.6596 0.6596 0.6598    nan 0.6598 0.6596 0.6592    nan 0.6596
 0.6596 0.66      nan 0.6596 0.6598 0.6596    nan 0.6592 0.6596 0.6598
    nan 0.6598]


Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.6966 0.5018 0.6964    nan 0.6966 0.5026 0.6966    nan 0.6966
 0.6898 0.6966    nan 0.6966 0.6964 0.6968    nan 0.6966 0.6968 0.6966
    nan 0.6966 0.6966 0.6966    nan 0.6968 0.6966 0.6966    nan 0.6968
 0.6968 0.6966    nan 0.6966 0.6966 0.6966    nan 0.6966 0.6966 0.6968
    nan 0.6968]


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,cardio,log_reg,1,0.712,0.745763,0.637681,0.785374,0.6875,0.711527,9.947254,0.704323,0.737496,0.634344,0.774281,0.682042,0.704312,10.212409
1,cardio,log_reg,2,0.6464,0.624125,0.769623,0.518352,0.689279,0.643988,12.2131,0.644646,0.613317,0.778792,0.511069,0.68622,0.64493,12.273683
2,cardio,log_reg,3,0.6808,0.676109,0.713892,0.646604,0.694487,0.680248,11.024916,0.686585,0.672717,0.724397,0.648916,0.697601,0.686657,10.825125
3,cardio,log_reg,4,0.6516,0.649197,0.672879,0.629944,0.660826,0.651411,12.033456,0.660677,0.650499,0.692547,0.628891,0.670865,0.660719,11.719952
4,cardio,log_reg,5,0.6974,0.716638,0.658177,0.737033,0.686165,0.697605,10.451538,0.702138,0.713903,0.673597,0.73062,0.693165,0.702109,10.287881
5,cardio,log_reg,avg,0.67764,0.682366,0.690451,0.663461,0.683651,0.676956,11.134053,0.679674,0.677586,0.700735,0.658755,0.685978,0.679745,11.06381


In [None]:
# combine datasets

In [None]:
# create table 1

In [None]:
# create table 2

In [None]:
# create table 3