In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

import warnings
# warnings.filterwarnings('ignore')

### Clean Chess Dataset

In [13]:
# https://www.kaggle.com/datasnaek/chess
chess_df = pd.read_csv('Data/games.csv')
chess_df.head()

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4
2,mIICvQHh,True,1504130000000.0,1504130000000.0,61,mate,white,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3
3,kWKvrqYL,True,1504110000000.0,1504110000000.0,61,mate,white,20+0,daniamurashov,1439,adivanov2009,1454,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,D02,Queen's Pawn Game: Zukertort Variation,3
4,9tXo1AUZ,True,1504030000000.0,1504030000000.0,95,mate,white,30+3,nik221107,1523,adivanov2009,1469,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,C41,Philidor Defense,5


In [14]:
chess_df['winner_white'] = chess_df['winner'] == 'white'
# chess_df = chess_df[['rated', 'created_at', 'last_move_at', 'turns', 'victory_status',
#                      'white_rating', 'black_rating', 'opening_eco', 'opening_ply', 'winner_white']]
chess_df = chess_df[['rated', 'turns', 'victory_status',
                     'white_rating', 'black_rating', 'opening_eco', 'opening_ply', 'winner_white']]
chess_df.head()

Unnamed: 0,rated,turns,victory_status,white_rating,black_rating,opening_eco,opening_ply,winner_white
0,False,13,outoftime,1500,1191,D10,5,True
1,True,16,resign,1322,1261,B00,4,False
2,True,61,mate,1496,1500,C20,3,True
3,True,61,mate,1439,1454,D02,3,True
4,True,95,mate,1523,1469,C41,5,True


In [15]:
chess_df_X = chess_df.drop(columns=['winner_white'])
chess_df_y = chess_df['winner_white']

In [16]:
chess_X_cat_col = ['rated', 'victory_status', 'opening_eco']
chess_X = pd.get_dummies(columns=chess_X_cat_col, data=chess_df_X)

chess_y = chess_df_y.replace({True: 1, False: 0})

### Clean Mushrooms Dataset

class: edible(e), poisonous(p)

cap-shape: bell(b), conical(c), convex(x), flat(f), knobbed(k), sunken(s)

cap-surface: fibrous(f), grooves(g), scaly(y), smooth(s)

cap-color: brown(n), buff(b), cinnamon(c), gray(g), green(r), pink(p), purple(u), red(e), white(w), yellow(y)

bruises: bruises(t), no(f)

odor: almond(a), anise(l), creosote(c), fishy(y), foul(f), musty(m), none(n), pungent(p), spicy(s)

gill-attachment: attached(a), descending(d), free(f), notched(n)

gill-spacing: close(c), crowded(w), distant(d)

gill-size: broad(b), narrow(n)

gill-color: black(k), brown(n), buff(b), chocolate(h), gray(g), green(r), orange(o), pink(p), purple(u), red(e), white(w), yellow(y)

stalk-shape: enlarging(e), tapering(t)

stalk-root: bulbous(b), club(c), cup(u), equal(e), rhizomorphs(z), rooted(r), missing(?)

stalk-surface-above-ring: fibrous(f), scaly(y), silky(k), smooth(s)

stalk-surface-below-ring: fibrous(f), scaly(y), silky(k), smooth(s)

stalk-color-above-ring: brown(n), buff(b), cinnamon(c), gray(g), orange(o), pink(p), red(e), white(w), yellow(y)

stalk-color-below-ring: brown(n), buff(b), cinnamon(c), gray(g), orange(o), pink(p), red(e), white(w), yellow(y)

veil-type: partial(p), universal(u)

veil-color: brown(n), orange(o), white(w), yellow(y)

ring-number: none(n), one(o), two(t)

ring-type: cobwebby(c), evanescent(e), flaring(f), large(l), none(n), pendant(p), sheathing(s), zone(z)

spore-print-color: black(k), brown(n), buff(b), chocolate(h), green(r), orange(o), purple(u), white(w), yellow(y)

population: abundant(a), clustered(c), numerous(n), scattered(s), several(v), solitary(y)

habitat: grasses(g), leaves(l), meadows(m), paths(p), urban(u), waste(w), woods(d)

In [17]:
# https://www.kaggle.com/uciml/mushroom-classification
shrooms = pd.read_csv('Data/mushrooms.csv')
shrooms.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [18]:
shrooms_df_X = shrooms.drop(columns=['class'])
shrooms_df_y = shrooms['class']

In [19]:
shrooms_X = pd.get_dummies(data=shrooms_df_X)
shrooms_y = shrooms_df_y.replace({'e': 0, 'p': 1})

### Clean Cardio Dataset

Retrieved from the kaggle site https://www.kaggle.com/sulianova/cardiovascular-disease-dataset, this cardio dataset has 70000 samples and 12 variables, which were collected at the moment of medical examination. It contains a target variable that indicates the presence or absence of cardiovascular disease, as well as 11 features that might be associated with the presence of cardiovascular disease, such as age, gender, and blood pressure. There are 3 types of 11 input features:
- objective feature: factual information
- examination feature: results of medical examination
- subjective feature: information given by the patient

A more detailed description of 11 features are shown below:

- age: objective feature, int (days)
- height: objective feature, int (cm)
- weight: objective feature, float (kg)
- gender: objective feature, categorical code, 1: male, 2:female
- ap_hi: systolic blood pressure, examination feature, int
- ap_lo: diastolic blood pressure, examination feature, int
- cholesterol: examination feature, categorical code, 1: normal, 2: above normal, 3: well above normal
- gluc: glucose, examination feature, categorical code, 1: normal, 2: above normal, 3: well above normal
- smoke: subjective feature, binary, 0: do not smoke, 1: smoke
- alco: alcohol intake, subjective feature, binary, 0: do not drink alcohol, 1: drink alcohol
- active: physical activity, subjective feature, binary, 0: not physically active, 1: physically active

A detailed description of the target variable is shown below: 

- cardio: presence or absence of cardiovascular disease, binary, 0: disease not present, 1: disease present

For this dataset, we want use those 11 input features and apply machine learning algorithms to predict whether a person has cardiovascular disease or not.

In [2]:
# load the cardio dataset
cardio = pd.read_csv('data/cardio.csv', delimiter = ';')
cardio.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
# no missing values in cardio dataset
cardio.isnull().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [4]:
# drop unnecessary column "id"
cardio = cardio.drop(columns = ['id'])
# convert age in days to age in years
cardio['age'] = cardio['age'].apply(lambda x: int(x/365))

In [5]:
# one hot encoding categorical input features stored in cate_cols
cate_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
cardio = pd.get_dummies(columns = cate_cols, data = cardio)

In [6]:
# a look at cleaned dataset
cardio.head()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cardio,gender_1,gender_2,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3,smoke_0,smoke_1,alco_0,alco_1,active_0,active_1
0,50,168,62.0,110,80,0,0,1,1,0,0,1,0,0,1,0,1,0,0,1
1,55,156,85.0,140,90,1,1,0,0,0,1,1,0,0,1,0,1,0,0,1
2,51,165,64.0,130,70,1,1,0,0,0,1,1,0,0,1,0,1,0,1,0
3,48,169,82.0,150,100,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1
4,47,156,56.0,100,60,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0


In [7]:
# split the cardio dataset into input features and labels 
cardio_X = cardio.drop(columns=['cardio']) # input features
cardio_y = cardio['cardio'] # true lables

### Clean Rain Dataset

Retrieved from the kaggle site https://www.kaggle.com/jsphyg/weather-dataset-rattle-package, this Rain in Australia dataset contains about 10 years of daily weather observations from many locations across Australia. There are 145460 samples and 23 variables in this dataset. It contains a target variable that indicates whether it rained the next day, as well as 22 features that might be associated with the target variable, such as minimum temperature, maximum temperature, rainfall of the day.

A more detailed description of 22 features are shown below:

- Date: the date of observation
- Location: the common name of the location of the weather station
- MinTemp: the minimum temperature in degrees celsius
- MaxTemp: the maximum temperature in degrees celsius
- Rainfall: the amount of rainfall recorded for the day in mm
- Evaporation: the so-called Class A pan evaporation (mm) in the 24 hours to 9am
- Sunshine: the number of hours of bright sunshine in the day
- WindGustDir: the direction of the strongest wind gust in the 24 hours to midnight
- WindGustSpeed: the speed (km/h) of the strongest wind gust in the 24 hours to midnight
- WindDir9am: direction of the wind at 9am
- WindDir3pm: direction of the wind at 3pm
- WindSpeed9am: wind speed (km/hr) averaged over 10 minutes prior to 9am
- WindSpeed3pm: wind speed (km/hr) averaged over 10 minutes prior to 3pm
- Humidity9am: humidity (percent) at 9am
- Humidity3pm: humidity (percent) at 3pm
- Pressure9am: atmospheric pressure (hpa) reduced to mean sea level at 9am
- Pressure3pm: atmospheric pressure (hpa) reduced to mean sea level at 3pm
- Cloud9am: fraction of sky obscured by cloud at 9am. This is measured in "oktas", which are a unit of eigths. It records how many eigths of the sky are obscured by cloud. A 0 measure indicates completely clear sky whilst an 8 indicates that it is completely overcast
- Cloud3pm: fraction of sky obscured by cloud (in "oktas": eighths) at 3pm. See Cload9am for a description of the values
- Temp9am: temperature (degrees C) at 9am
- Temp3pm: temperature (degrees C) at 3pm
- RainToday: whether the precipitation (mm) in the 24 hours to 9am exceeded 1mm, Yes: the precipitation exceeded 1mm, No: it did not exceed 1mm

A detailed description of the target variable is shown below: 

- RainTomorrow: whether amount of next day rain exceeded 1mm, Yes: next day precipitation exceeded 1mm, No: it did not exceed 1mm

For this dataset, we want use those 22 input features and apply machine learning algorithms to predict whether it rained the next day or not.

Data source: http://www.bom.gov.au/climate/dwo/ and http://www.bom.gov.au/climate/data.

In [8]:
# load Australian rain dataset
aus = pd.read_csv('Data/weatherAUS.csv')
aus.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [9]:
# display the number of missing values in each column
aus.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [10]:
# fill missing values in categorical columns with the mode 
cate_cols = aus.dtypes.index[aus.dtypes == "object"].tolist()
for cate_col in cate_cols:
    aus[cate_col] = aus[cate_col].fillna(aus[cate_col].mode()[0])

In [11]:
# fill missing values in numerical columns with the mean
num_cols = aus.dtypes.index[aus.dtypes == "float64"].tolist()
for num_col in num_cols:
    aus[num_col] = aus[num_col].fillna(aus[num_col].mean())

In [12]:
# all missing values are filled
aus.isnull().sum()

Date             0
Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

In [13]:
# split the date of each observation into year, month, and day
splitted_date = aus['Date'].str.split('-')

# create 'Year', 'Month', 'Day' columns using splitted results of the date
aus['Year'] = splitted_date.str[0].astype(int)
aus['Month'] = splitted_date.str[1].astype(int)
aus['Day'] = splitted_date.str[2].astype(int)

# drop original 'Date' column
aus = aus.drop(columns = ['Date'])

In [14]:
# use 0 and 1 to indicate whether it rained or not
# 0: it rained, 1: it did not rain
aus['RainToday'] = aus['RainToday'].replace({'No': 0, 'Yes': 1})
aus['RainTomorrow'] = aus['RainTomorrow'].replace({'No': 0, 'Yes': 1})

In [15]:
# one hot encoding all categorical columns
cate_cols = aus.dtypes.index[aus.dtypes == "object"].tolist()
aus = pd.get_dummies(columns = cate_cols, data = aus)
aus.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,13.4,22.9,0.6,5.468232,7.611178,44.0,20.0,24.0,71.0,22.0,...,0,0,0,0,0,0,0,0,1,0
1,7.4,25.1,0.0,5.468232,7.611178,44.0,4.0,22.0,44.0,25.0,...,0,0,0,0,0,0,0,0,0,1
2,12.9,25.7,0.0,5.468232,7.611178,46.0,19.0,26.0,38.0,30.0,...,0,0,0,0,0,0,0,0,0,1
3,9.2,28.0,0.0,5.468232,7.611178,24.0,11.0,9.0,45.0,16.0,...,0,0,0,0,0,0,0,0,0,0
4,17.5,32.3,1.0,5.468232,7.611178,41.0,7.0,20.0,82.0,33.0,...,0,1,0,0,0,0,0,0,0,0


In [16]:
# split the rain dataset into input features and labels 
aus_X = aus.drop(columns=['RainTomorrow']) # input features
aus_y = aus['RainTomorrow'] # true lables

### Clean BnB Dataset

### Clean Olympic Dataset

### Perform Trials

In [17]:
# Parameters for the model
tree_params = [
    {
        'max_depth': [2,3,4,5,7,10,13,15,18,None], 
        'min_samples_split':[2,3,5,7,10,15,20],
        'min_samples_leaf':[2,3,5,7,10,15,20]
    }
]

log_reg_params = [
    {
        'solver': ['saga'],
        'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'C': 10 **np.array(np.arange(-8, 5, 1), dtype='float32')
    }
]

perceptron_params = [
    {
        'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1]
    }
]

svc_params = [
    {
        'kernel': ['linear'],
        'C': 10 **np.array(np.arange(-3, 2, 2), dtype='float32')
    },
    {
        'kernel': ['poly'],
        'degree': [2, 3],
        'C': 10 **np.array(np.arange(-3, 2, 2), dtype='float32'),
    },
    {
        'kernel': ['rbf'],
        'C': 10 **np.array(np.arange(-3, 2, 2), dtype='float32'),
        'gamma': [0.001,0.01,0.1,1,2]
    }
]

knn_params = [
    {
        'n_neighbors': np.arange(1, 106, 4),
        'metric': ["euclidean", "manhattan", "minkowski"]
    }
]

forest_params = [
    {
        'n_estimators': [1024],
        'min_samples_split': [1, 2, 4, 6, 8, 12, 16, 20]
    }
]

models_without_svm = {
    'tree': (DecisionTreeClassifier(), tree_params),
    'log_reg': (LogisticRegression(), log_reg_params),
    'perceptron': (Perceptron(), perceptron_params),
    'knn': (KNeighborsClassifier(), knn_params),
    'forest': (RandomForestClassifier(), forest_params)
}

models_only_svm = {
    'svm': (SVC(), svc_params)
}

In [18]:
# perform trials on dataset
def perform_trials(dataset_name, models, data_X, data_y):
    results_columns = ['dataset', 'model', 'trial',
                       'train_accuracy', 'train_precision', 'train_recall', 'train_specificity',
                       'train_f1', 'train_auc', 'train_logloss',
                       'test_accuracy', 'test_precision', 'test_recall', 'test_specificity',
                       'test_f1', 'test_auc', 'test_logloss']
    num_trials = 5
    
    data_results = pd.DataFrame(columns=results_columns)

    # perform trials using each model
    for model_name in models.keys():
        
        model = models[model_name][0]
        model_params = models[model_name][1]
        
        train_metrics = np.zeros(7)
        test_metrics =  np.zeros(7)
        
        model_results = pd.DataFrame(columns=results_columns)
        
        # perform 5 trials on each dataset
        for trial_count in range(num_trials):
            # pick 5000 samples with replacement to be in the training set
            X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, train_size=5000, random_state=trial_count)
            
            # grid search with 5 k-folds
            search = GridSearchCV(model, model_params, cv=5, verbose=3, n_jobs=-1)
            
            # find the best parameters for the model
            # grid search automatically refits a model on the entire validation set using the best parameters
            search.fit(X_train, y_train)
            
            # use metrics to evaluate model performance on the test set
            y_train_pred = search.predict(X_train)
            y_test_pred = search.predict(X_test)
            
            # compute metrics
            model_result = {
                'dataset': dataset_name,
                'model': model_name,
                'trial': trial_count + 1,

                'train_accuracy': accuracy_score(y_train, y_train_pred),
                'train_precision': precision_score(y_train, y_train_pred),
                'train_recall': recall_score(y_train, y_train_pred),
                'train_specificity': recall_score(y_train, y_train_pred, pos_label=0),
                'train_f1': f1_score(y_train, y_train_pred),
                'train_auc': roc_auc_score(y_train, y_train_pred),
                'train_logloss': log_loss(y_train, y_train_pred),

                'test_accuracy': accuracy_score(y_test, y_test_pred),
                'test_precision': precision_score(y_test, y_test_pred),
                'test_recall': recall_score(y_test, y_test_pred),
                'test_specificity': recall_score(y_test, y_test_pred, pos_label=0),
                'test_f1': f1_score(y_test, y_test_pred),
                'test_auc': roc_auc_score(y_test, y_test_pred),
                'test_logloss': log_loss(y_test, y_test_pred)
            }
            
            # append model_result to the model_results dataframe
            model_results = model_results.append(model_result, ignore_index=True)
        
        # append model_results to data_results
        data_results = data_results.append(model_results, ignore_index=True)
        
        avg_result = {
            'dataset': dataset_name,
            'model': model_name,
            'trial': 'avg',
            
            'train_accuracy': model_results.train_accuracy.mean(),
            'train_precision': model_results.train_precision.mean(),
            'train_recall': model_results.train_recall.mean(),
            'train_specificity': model_results.train_specificity.mean(),
            'train_f1': model_results.train_f1.mean(),
            'train_auc': model_results.train_auc.mean(),
            'train_logloss': model_results.train_logloss.mean(),
            
            'test_accuracy': model_results.test_accuracy.mean(),
            'test_precision': model_results.test_precision.mean(),
            'test_recall': model_results.test_recall.mean(),
            'test_specificity': model_results.test_specificity.mean(),
            'test_f1': model_results.test_f1.mean(),
            'test_auc': model_results.test_auc.mean(),
            'test_logloss': model_results.test_logloss.mean()
        }
        
        # append avg_result to the data_results dataframe
        data_results = data_results.append(avg_result, ignore_index=True)
    
    return data_results

### Duy Results

In [None]:
chess_results_no_svm = perform_trials('chess', models_without_svm, chess_X, chess_y)
chess_results_no_svm.to_csv('results/chess_no_svm')
chess_results_no_svm

Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 928 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done 1504 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done 2208 tasks      | elapsed:   40.8s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   46.2s finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:   39.0s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   43.2s finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   45.6s finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:   43.6s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   48.3s finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:   43.7s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   48.4s finished


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:   24.3s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:   24.0s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:   24.4s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:   24.5s finished


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:   23.9s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.1s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.0s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.9s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.9s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.9s finished


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  5.7min


In [None]:
shrooms_results_no_svm = perform_trials('shrooms', models_without_svm, shrooms_X, shrooms_y)
shrooms_results_no_svm.to_csv('results/shrooms_no_svm')
shrooms_results_no_svm

In [None]:
chess_results_svm = perform_trials('chess', models_only_svm, chess_X, chess_y)
chess_results_svm.to_csv('results/chess_svm')
chess_results_svm

In [None]:
shrooms_results_svm = perform_trials('shrooms', models_only_svm, shrooms_X, shrooms_y)
shrooms_results_svm.to_csv('results/shrooms_svm')
shrooms_results_svm

### Results of Cardio Dataset

In [19]:
# running algorithms except SVM on cardio dataset
cardio_results_no_svm = perform_trials('cardio', models_without_svm, cardio_X, cardio_y)
cardio_results_no_svm

Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.7066 0.5032 0.71      nan 0.7066 0.498  0.7086    nan 0.7066
 0.7098 0.707     nan 0.7066 0.7068 0.7066    nan 0.7066 0.7064 0.7066
    nan 0.7066 0.7066 0.7066    nan 0.7066 0.7066 0.7066    nan 0.7066
 0.7066 0.7066    nan 0.7066 0.7066 0.7066    nan 0.7066 0.7066 0.7066
    nan 0.7066]


Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.647  0.5096 0.6476    nan 0.6476 0.5094 0.647     nan 0.6472
 0.6328 0.6472    nan 0.6476 0.6476 0.6476    nan 0.6474 0.647  0.6474
    nan 0.6474 0.6474 0.6472    nan 0.6474 0.6472 0.6474    nan 0.6474
 0.6476 0.6472    nan 0.6474 0.6472 0.6472    nan 0.647  0.6474 0.6472
    nan 0.6474]


Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.68   0.5082 0.6804    nan 0.6796 0.5082 0.6802    nan 0.6792
 0.6746 0.6802    nan 0.6794 0.6796 0.68      nan 0.6796 0.6796 0.6792
    nan 0.6796 0.6798 0.6796    nan 0.6798 0.6802 0.68      nan 0.6798
 0.6796 0.68      nan 0.68   0.6794 0.6802    nan 0.6802 0.68   0.6798
    nan 0.6798]


Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.6596 0.5024 0.66      nan 0.66   0.5046 0.6594    nan 0.6596
 0.66   0.6592    nan 0.6592 0.6596 0.6592    nan 0.6594 0.659  0.6598
    nan 0.6596 0.6592 0.6598    nan 0.6598 0.66   0.6594    nan 0.6598
 0.6596 0.6594    nan 0.6596 0.6596 0.6602    nan 0.6594 0.6598 0.6598
    nan 0.6596]


Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.6968 0.5014 0.6964    nan 0.6966 0.5026 0.6966    nan 0.697
 0.6896 0.6968    nan 0.6968 0.6964 0.6966    nan 0.6968 0.6968 0.6966
    nan 0.6966 0.6968 0.6968    nan 0.6966 0.6966 0.6966    nan 0.6966
 0.6966 0.6964    nan 0.6968 0.6968 0.6966    nan 0.6966 0.6968 0.6966
    nan 0.6966]
  "Setting penalty='none' will ignore the C and l1_ratio "


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,cardio,tree,1,0.7272,0.760951,0.657407,0.796105,0.7054,0.726756,9.42226,0.718092,0.751375,0.651762,0.784402,0.698032,0.718082,9.736833
1,cardio,tree,2,0.7308,0.735317,0.737049,0.724307,0.736182,0.730678,9.297947,0.716569,0.713967,0.720638,0.712518,0.717287,0.716578,9.789467
2,cardio,tree,3,0.7276,0.77355,0.656041,0.801545,0.709966,0.728793,9.408441,0.723985,0.753861,0.663574,0.784166,0.705842,0.72387,9.53332
3,cardio,tree,4,0.7372,0.768923,0.684774,0.790557,0.724413,0.737665,9.076873,0.724569,0.741616,0.688172,0.76087,0.713895,0.724521,9.513137
4,cardio,tree,5,0.735,0.787234,0.647831,0.82308,0.710762,0.735456,9.152846,0.726877,0.773141,0.641379,0.812196,0.701123,0.726788,9.433412
5,cardio,tree,avg,0.73156,0.765195,0.67662,0.787119,0.717344,0.73187,9.271673,0.722018,0.746792,0.673105,0.770831,0.707236,0.721968,9.601234
6,cardio,log_reg,1,0.712,0.745763,0.637681,0.785374,0.6875,0.711527,9.947254,0.704323,0.737496,0.634344,0.774281,0.682042,0.704312,10.212409
7,cardio,log_reg,2,0.6464,0.622257,0.779042,0.508564,0.691879,0.643803,12.213104,0.644538,0.611887,0.786316,0.503362,0.688221,0.644839,12.277406
8,cardio,log_reg,3,0.6804,0.675999,0.712712,0.647011,0.69387,0.679861,11.038732,0.686446,0.672761,0.723719,0.649315,0.69731,0.686517,10.829907
9,cardio,log_reg,4,0.6508,0.650504,0.664948,0.6364,0.657647,0.650674,12.061085,0.660569,0.652315,0.685738,0.635467,0.668609,0.660602,11.723669


In [22]:
# running SVM algorithm on cardio dataset, generally take longer time to run than other algorithms combined
cardio_results_svm = perform_trials('cardio', models_only_svm, cardio_X, cardio_y)
cardio_results_svm

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,cardio,svm,1,0.7472,0.778793,0.68599,0.807631,0.729452,0.746811,8.73148,0.722677,0.752822,0.662933,0.782403,0.705024,0.722668,9.578487
1,cardio,svm,2,0.7352,0.774933,0.677002,0.795677,0.722664,0.736339,9.145948,0.728338,0.757845,0.669421,0.787006,0.710894,0.728214,9.382942
2,cardio,svm,3,0.744,0.792575,0.672176,0.818219,0.727428,0.745198,8.841998,0.727431,0.752808,0.675689,0.778975,0.712167,0.727332,9.414296
3,cardio,svm,4,0.7444,0.779928,0.687153,0.802663,0.730607,0.744908,8.828189,0.726185,0.749125,0.679052,0.773192,0.71237,0.726122,9.457339
4,cardio,svm,5,0.738,0.768887,0.684441,0.792119,0.724211,0.73828,9.049242,0.728062,0.748655,0.685794,0.77024,0.715847,0.728017,9.392514
5,cardio,svm,avg,0.74176,0.779023,0.681352,0.803262,0.726872,0.742307,8.919372,0.726538,0.752251,0.674578,0.778363,0.71126,0.726471,9.445116


In [25]:
# combine results of svm and non-svm algorithms and save as a csv file
cardio_final_results = cardio_results_no_svm.append(cardio_results_svm, ignore_index=True)
cardio_final_results.to_csv('results/cardio_results.csv', index = False)

In [26]:
# display performance
pd.read_csv('results/cardio_results.csv')

Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,cardio,tree,1,0.7272,0.760951,0.657407,0.796105,0.7054,0.726756,9.42226,0.718092,0.751375,0.651762,0.784402,0.698032,0.718082,9.736833
1,cardio,tree,2,0.7308,0.735317,0.737049,0.724307,0.736182,0.730678,9.297947,0.716569,0.713967,0.720638,0.712518,0.717287,0.716578,9.789467
2,cardio,tree,3,0.7276,0.77355,0.656041,0.801545,0.709966,0.728793,9.408441,0.723985,0.753861,0.663574,0.784166,0.705842,0.72387,9.53332
3,cardio,tree,4,0.7372,0.768923,0.684774,0.790557,0.724413,0.737665,9.076873,0.724569,0.741616,0.688172,0.76087,0.713895,0.724521,9.513137
4,cardio,tree,5,0.735,0.787234,0.647831,0.82308,0.710762,0.735456,9.152846,0.726877,0.773141,0.641379,0.812196,0.701123,0.726788,9.433412
5,cardio,tree,avg,0.73156,0.765195,0.67662,0.787119,0.717344,0.73187,9.271673,0.722018,0.746792,0.673105,0.770831,0.707236,0.721968,9.601234
6,cardio,log_reg,1,0.712,0.745763,0.637681,0.785374,0.6875,0.711527,9.947254,0.704323,0.737496,0.634344,0.774281,0.682042,0.704312,10.212409
7,cardio,log_reg,2,0.6464,0.622257,0.779042,0.508564,0.691879,0.643803,12.213104,0.644538,0.611887,0.786316,0.503362,0.688221,0.644839,12.277406
8,cardio,log_reg,3,0.6804,0.675999,0.712712,0.647011,0.69387,0.679861,11.038732,0.686446,0.672761,0.723719,0.649315,0.69731,0.686517,10.829907
9,cardio,log_reg,4,0.6508,0.650504,0.664948,0.6364,0.657647,0.650674,12.061085,0.660569,0.652315,0.685738,0.635467,0.668609,0.660602,11.723669


### Results of Australian Rain Dataset

In [21]:
# running algorithms except SVM on Australian rain dataset
aus_results_no_svm = perform_trials('aus', models_without_svm, aus_X, aus_y)
aus_results_no_svm

Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.842  0.7874 0.8228    nan 0.8418 0.7874 0.8408    nan 0.8418
 0.8338 0.8414    nan 0.8418 0.8404 0.8418    nan 0.8418 0.8418 0.8416
    nan 0.8418 0.8418 0.8418    nan 0.8416 0.8418 0.8418    nan 0.8422
 0.8418 0.8418    nan 0.8418 0.8418 0.8418    nan 0.8418 0.842  0.8418
    nan 0.8418]
  "Setting penalty='none' will ignore the C and l1_ratio "


Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.8416 0.78   0.8304    nan 0.8412 0.78   0.8412    nan 0.8416
 0.8382 0.8414    nan 0.8416 0.8408 0.8414    nan 0.8414 0.8408 0.8414
    nan 0.8416 0.8412 0.8414    nan 0.8416 0.8416 0.8416    nan 0.8418
 0.8414 0.8416    nan 0.8416 0.8414 0.841     nan 0.8416 0.8416 0.8412
    nan 0.8412]
  "Setting penalty='none' will ignore the C and l1_ratio "


Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.8366 0.7848 0.8252    nan 0.8366 0.7848 0.8354    nan 0.8368
 0.8336 0.8366    nan 0.8368 0.8348 0.8368    nan 0.8364 0.8366 0.8368
    nan 0.8366 0.8366 0.837     nan 0.8368 0.8366 0.837     nan 0.8368
 0.8368 0.8368    nan 0.837  0.8368 0.8366    nan 0.8366 0.8368 0.8368
    nan 0.8368]


Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.8366 0.7742 0.825     nan 0.8366 0.7742 0.8356    nan 0.8368
 0.8292 0.8368    nan 0.8366 0.8364 0.8368    nan 0.8368 0.8366 0.8368
    nan 0.8366 0.8368 0.8368    nan 0.8368 0.8368 0.8368    nan 0.8368
 0.8366 0.8368    nan 0.8366 0.8368 0.8366    nan 0.8368 0.8366 0.8368
    nan 0.8368]
  "Setting penalty='none' will ignore the C and l1_ratio "


Fitting 5 folds for each of 52 candidates, totalling 260 fits


    nan 0.8382 0.7768 0.8306    nan 0.8382 0.7768 0.839     nan 0.8382
 0.8344 0.838     nan 0.8384 0.8382 0.8386    nan 0.8386 0.8382 0.8382
    nan 0.8382 0.8382 0.8382    nan 0.8386 0.8382 0.8384    nan 0.8382
 0.838  0.8382    nan 0.8384 0.8384 0.8384    nan 0.8384 0.8382 0.8382
    nan 0.8382]


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 5 folds for each of 20 candidates, totalling 100 fits


  _warn_prf(average, modifier, msg_start, len(result))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 8 candidates, totalling 40 fits




Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,aus,tree,1,0.8492,0.79206,0.394167,0.97206,0.526382,0.683114,5.208465,0.83213,0.738826,0.363179,0.96392,0.486978,0.66355,5.798042
1,aus,tree,2,0.8538,0.760962,0.489091,0.956667,0.595462,0.722879,5.049596,0.829467,0.670925,0.435163,0.940109,0.527917,0.687636,5.890023
2,aus,tree,3,0.8428,0.776718,0.378253,0.970183,0.50875,0.674218,5.429514,0.829012,0.724368,0.35554,0.962,0.47697,0.65877,5.905746
3,aus,tree,4,0.8538,0.745074,0.535872,0.946525,0.62339,0.741199,5.049602,0.828855,0.656716,0.457168,0.933025,0.539068,0.695096,5.911174
4,aus,tree,5,0.8622,0.787349,0.524194,0.95932,0.629371,0.741757,4.759469,0.830016,0.663578,0.453984,0.93546,0.539127,0.694722,5.871091
5,aus,tree,avg,0.85236,0.772432,0.464315,0.960951,0.576671,0.712633,5.099329,0.829896,0.690883,0.413007,0.946903,0.514012,0.679955,5.875215
6,aus,log_reg,1,0.8436,0.727715,0.422389,0.957328,0.534524,0.689859,5.401891,0.838167,0.742602,0.401473,0.960892,0.52118,0.681183,5.589523
7,aus,log_reg,2,0.843,0.727931,0.457273,0.951795,0.561697,0.704534,5.422618,0.839228,0.714293,0.443773,0.950193,0.547437,0.696983,5.552891
8,aus,log_reg,3,0.8362,0.712397,0.400558,0.955657,0.51279,0.678108,5.657479,0.839784,0.730639,0.426674,0.955818,0.538739,0.691246,5.533707
9,aus,log_reg,4,0.8384,0.728959,0.452613,0.950917,0.55847,0.701765,5.581497,0.839193,0.722869,0.430434,0.953752,0.539576,0.692093,5.554118


In [27]:
# running SVM algorithm on Australian rain dataset, generally take longer time to run than other algorithms combined
aus_results_svm = perform_trials('aus', models_only_svm, aus_X, aus_y)
aus_results_svm

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,aus,svm,1,0.8552,0.746725,0.482596,0.955804,0.586286,0.7192,5.001243,0.843778,0.734472,0.450899,0.954189,0.558766,0.702544,5.39576
1,aus,svm,2,0.8562,0.750988,0.518182,0.951538,0.613233,0.73486,4.966706,0.843671,0.709647,0.484973,0.944321,0.576182,0.714647,5.399454
2,aus,svm,3,0.8512,0.744838,0.469331,0.955912,0.575827,0.712622,5.139398,0.842937,0.72149,0.46216,0.94989,0.563417,0.706025,5.424778
3,aus,svm,4,0.848,0.729763,0.519043,0.943942,0.606625,0.731493,5.249929,0.840823,0.694537,0.487089,0.939961,0.572603,0.713525,5.497816
4,aus,svm,5,0.8544,0.758667,0.509857,0.953399,0.609861,0.731628,5.028875,0.844297,0.715122,0.480414,0.946335,0.574729,0.713374,5.377814
5,aus,svm,avg,0.853,0.746196,0.499802,0.952119,0.598366,0.72596,5.07723,0.843101,0.715054,0.473107,0.946939,0.569139,0.710023,5.419124


In [28]:
# combine results of svm and non-svm algorithms and save as a csv file
aus_final_results = aus_results_no_svm.append(aus_results_svm, ignore_index=True)
aus_final_results.to_csv('results/aus_results.csv', index = False)

In [29]:
# display performance
pd.read_csv('results/aus_results.csv')

Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,aus,tree,1,0.8492,0.79206,0.394167,0.97206,0.526382,0.683114,5.208465,0.83213,0.738826,0.363179,0.96392,0.486978,0.66355,5.798042
1,aus,tree,2,0.8538,0.760962,0.489091,0.956667,0.595462,0.722879,5.049596,0.829467,0.670925,0.435163,0.940109,0.527917,0.687636,5.890023
2,aus,tree,3,0.8428,0.776718,0.378253,0.970183,0.50875,0.674218,5.429514,0.829012,0.724368,0.35554,0.962,0.47697,0.65877,5.905746
3,aus,tree,4,0.8538,0.745074,0.535872,0.946525,0.62339,0.741199,5.049602,0.828855,0.656716,0.457168,0.933025,0.539068,0.695096,5.911174
4,aus,tree,5,0.8622,0.787349,0.524194,0.95932,0.629371,0.741757,4.759469,0.830016,0.663578,0.453984,0.93546,0.539127,0.694722,5.871091
5,aus,tree,avg,0.85236,0.772432,0.464315,0.960951,0.576671,0.712633,5.099329,0.829896,0.690883,0.413007,0.946903,0.514012,0.679955,5.875215
6,aus,log_reg,1,0.8436,0.727715,0.422389,0.957328,0.534524,0.689859,5.401891,0.838167,0.742602,0.401473,0.960892,0.52118,0.681183,5.589523
7,aus,log_reg,2,0.843,0.727931,0.457273,0.951795,0.561697,0.704534,5.422618,0.839228,0.714293,0.443773,0.950193,0.547437,0.696983,5.552891
8,aus,log_reg,3,0.8362,0.712397,0.400558,0.955657,0.51279,0.678108,5.657479,0.839784,0.730639,0.426674,0.955818,0.538739,0.691246,5.533707
9,aus,log_reg,4,0.8384,0.728959,0.452613,0.950917,0.55847,0.701765,5.581497,0.839193,0.722869,0.430434,0.953752,0.539576,0.692093,5.554118


In [None]:
# combine datasets

In [None]:
# create table 1

In [None]:
# create table 2

In [None]:
# create table 3