In [64]:
import pandas as pd
import numpy as np
from scipy import stats

from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

### Clean Chess Dataset

In [5]:
# https://www.kaggle.com/datasnaek/chess
chess_df = pd.read_csv('Data/games.csv')
chess_df.head()

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4
2,mIICvQHh,True,1504130000000.0,1504130000000.0,61,mate,white,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3
3,kWKvrqYL,True,1504110000000.0,1504110000000.0,61,mate,white,20+0,daniamurashov,1439,adivanov2009,1454,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,D02,Queen's Pawn Game: Zukertort Variation,3
4,9tXo1AUZ,True,1504030000000.0,1504030000000.0,95,mate,white,30+3,nik221107,1523,adivanov2009,1469,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,C41,Philidor Defense,5


In [6]:
chess_num_cols = ['created_at', 'last_move_at', 'turns', 'white_rating', 'black_rating', 'opening_ply']
for num_col in chess_num_cols:
    chess_df[num_col] = (chess_df[num_col] - chess_df[num_col].mean()) /chess_df[num_col].std()

chess_df['winner_white'] = chess_df['winner'] == 'white'
chess_df = chess_df[['rated', 'created_at', 'last_move_at', 'turns', 'victory_status',
                     'white_rating', 'black_rating', 'opening_eco', 'opening_ply', 'winner_white']]

# max_openings = 30
# popular_opennings = chess_df['opening_eco'].value_counts()[:max_openings].index.tolist()
# def replace_opening(x):
#     if (x in popular_opennings):
#         return x
#     else:
#         return 'other'

# chess_df['opening_eco'] = chess_df['opening_eco'].apply(replace_opening)

chess_df.head()

Unnamed: 0,rated,created_at,last_move_at,turns,victory_status,white_rating,black_rating,opening_eco,opening_ply,winner_white
0,False,0.722528,0.722501,-1.413916,outoftime,-0.331779,-1.366951,D10,0.065431,True
1,True,0.719721,0.719694,-1.324552,resign,-0.942931,-1.126431,B00,-0.292076,False
2,True,0.719721,0.719694,0.015907,mate,-0.345513,-0.305227,C20,-0.649582,True
3,True,0.71902,0.718992,0.015907,mate,-0.541219,-0.463283,D02,-0.649582,True
4,True,0.716213,0.716185,1.028698,mate,-0.25281,-0.411743,C41,0.065431,True


In [7]:
chess_df_X = chess_df.drop(columns=['winner_white'])
chess_df_y = chess_df['winner_white']

In [8]:
chess_X_cat_col = ['rated', 'victory_status', 'opening_eco']
chess_X = pd.get_dummies(columns=chess_X_cat_col, data=chess_df_X)

chess_y = chess_df_y.replace({True: 1, False: 0})

In [9]:
chess_y.value_counts()

0    10057
1    10001
Name: winner_white, dtype: int64

### Clean Mushrooms Dataset

class: edible(e), poisonous(p)

cap-shape: bell(b), conical(c), convex(x), flat(f), knobbed(k), sunken(s)

cap-surface: fibrous(f), grooves(g), scaly(y), smooth(s)

cap-color: brown(n), buff(b), cinnamon(c), gray(g), green(r), pink(p), purple(u), red(e), white(w), yellow(y)

bruises: bruises(t), no(f)

odor: almond(a), anise(l), creosote(c), fishy(y), foul(f), musty(m), none(n), pungent(p), spicy(s)

gill-attachment: attached(a), descending(d), free(f), notched(n)

gill-spacing: close(c), crowded(w), distant(d)

gill-size: broad(b), narrow(n)

gill-color: black(k), brown(n), buff(b), chocolate(h), gray(g), green(r), orange(o), pink(p), purple(u), red(e), white(w), yellow(y)

stalk-shape: enlarging(e), tapering(t)

stalk-root: bulbous(b), club(c), cup(u), equal(e), rhizomorphs(z), rooted(r), missing(?)

stalk-surface-above-ring: fibrous(f), scaly(y), silky(k), smooth(s)

stalk-surface-below-ring: fibrous(f), scaly(y), silky(k), smooth(s)

stalk-color-above-ring: brown(n), buff(b), cinnamon(c), gray(g), orange(o), pink(p), red(e), white(w), yellow(y)

stalk-color-below-ring: brown(n), buff(b), cinnamon(c), gray(g), orange(o), pink(p), red(e), white(w), yellow(y)

veil-type: partial(p), universal(u)

veil-color: brown(n), orange(o), white(w), yellow(y)

ring-number: none(n), one(o), two(t)

ring-type: cobwebby(c), evanescent(e), flaring(f), large(l), none(n), pendant(p), sheathing(s), zone(z)

spore-print-color: black(k), brown(n), buff(b), chocolate(h), green(r), orange(o), purple(u), white(w), yellow(y)

population: abundant(a), clustered(c), numerous(n), scattered(s), several(v), solitary(y)

habitat: grasses(g), leaves(l), meadows(m), paths(p), urban(u), waste(w), woods(d)

In [16]:
# https://www.kaggle.com/uciml/mushroom-classification
shrooms = pd.read_csv('Data/mushrooms.csv')
shrooms.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [17]:
shrooms_df_X = shrooms.drop(columns=['class'])
shrooms_df_y = shrooms['class']

In [18]:
shrooms_X = pd.get_dummies(data=shrooms_df_X)
shrooms_y = shrooms_df_y.replace({'e': 0, 'p': 1})

In [21]:
shrooms_y.value_counts()

0    4208
1    3916
Name: class, dtype: int64

### Clean Cardio Dataset

Retrieved from the kaggle site https://www.kaggle.com/sulianova/cardiovascular-disease-dataset, this cardio dataset has 70000 samples and 12 variables, which were collected at the moment of medical examination. It contains a target variable that indicates the presence or absence of cardiovascular disease, as well as 11 features that might be associated with the presence of cardiovascular disease, such as age, gender, and blood pressure. There are 3 types of 11 input features:
- objective feature: factual information
- examination feature: results of medical examination
- subjective feature: information given by the patient

A more detailed description of 11 features are shown below:

- age: objective feature, int (days)
- height: objective feature, int (cm)
- weight: objective feature, float (kg)
- gender: objective feature, categorical code, 1: male, 2:female
- ap_hi: systolic blood pressure, examination feature, int
- ap_lo: diastolic blood pressure, examination feature, int
- cholesterol: examination feature, categorical code, 1: normal, 2: above normal, 3: well above normal
- gluc: glucose, examination feature, categorical code, 1: normal, 2: above normal, 3: well above normal
- smoke: subjective feature, binary, 0: do not smoke, 1: smoke
- alco: alcohol intake, subjective feature, binary, 0: do not drink alcohol, 1: drink alcohol
- active: physical activity, subjective feature, binary, 0: not physically active, 1: physically active

A detailed description of the target variable is shown below: 

- cardio: presence or absence of cardiovascular disease, binary, 0: disease not present, 1: disease present

For this dataset, we want use those 11 input features and apply machine learning algorithms to predict whether a person has cardiovascular disease or not.

In [3]:
# load the cardio dataset
cardio = pd.read_csv('data/cardio.csv', delimiter = ';')
cardio.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
# no missing values in cardio dataset
cardio.isnull().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [5]:
# drop unnecessary column "id"
cardio = cardio.drop(columns = ['id'])
# convert age in days to age in years
cardio['age'] = cardio['age'].apply(lambda x: int(x/365))

In [6]:
# one hot encoding categorical input features stored in cate_cols
cardio_cate_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
cardio = pd.get_dummies(columns = cardio_cate_cols, data = cardio)

In [7]:
# scale numerical attributes to 0 mean 1 std
cardio_num_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
for num_col in cardio_num_cols:
    cardio[num_col] = (cardio[num_col] - cardio[num_col].mean()) / cardio[num_col].std()

In [8]:
# a look at cleaned dataset
cardio.head()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cardio,gender_1,gender_2,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3,smoke_0,smoke_1,alco_0,alco_1,active_0,active_1
0,-0.419797,0.443449,-0.847867,-0.122181,-0.088238,0,0,1,1,0,0,1,0,0,1,0,1,0,0,1
1,0.319108,-1.018161,0.749826,0.07261,-0.03518,1,1,0,0,0,1,1,0,0,1,0,1,0,0,1
2,-0.272016,0.078046,-0.708937,0.007679,-0.141296,1,1,0,0,0,1,1,0,0,1,0,1,0,1,0
3,-0.715359,0.56525,0.541431,0.13754,0.017878,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1
4,-0.86314,-1.018161,-1.264657,-0.187111,-0.194354,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0


In [9]:
# 50.03% negative labels, 49.97% positive labels
cardio['cardio'].value_counts(normalize = True)

0    0.5003
1    0.4997
Name: cardio, dtype: float64

In [10]:
# split the cardio dataset into input features and labels 
cardio_X = cardio.drop(columns=['cardio']) # input features
cardio_y = cardio['cardio'] # true lables

### Clean Rain Dataset

Retrieved from the kaggle site https://www.kaggle.com/jsphyg/weather-dataset-rattle-package, this Rain in Australia dataset contains about 10 years of daily weather observations from many locations across Australia. There are 145460 samples and 23 variables in this dataset. It contains a target variable that indicates whether it rained the next day, as well as 22 features that might be associated with the target variable, such as minimum temperature, maximum temperature, rainfall of the day.

A more detailed description of 22 features are shown below:

- Date: the date of observation
- Location: the common name of the location of the weather station
- MinTemp: the minimum temperature in degrees celsius
- MaxTemp: the maximum temperature in degrees celsius
- Rainfall: the amount of rainfall recorded for the day in mm
- Evaporation: the so-called Class A pan evaporation (mm) in the 24 hours to 9am
- Sunshine: the number of hours of bright sunshine in the day
- WindGustDir: the direction of the strongest wind gust in the 24 hours to midnight
- WindGustSpeed: the speed (km/h) of the strongest wind gust in the 24 hours to midnight
- WindDir9am: direction of the wind at 9am
- WindDir3pm: direction of the wind at 3pm
- WindSpeed9am: wind speed (km/hr) averaged over 10 minutes prior to 9am
- WindSpeed3pm: wind speed (km/hr) averaged over 10 minutes prior to 3pm
- Humidity9am: humidity (percent) at 9am
- Humidity3pm: humidity (percent) at 3pm
- Pressure9am: atmospheric pressure (hpa) reduced to mean sea level at 9am
- Pressure3pm: atmospheric pressure (hpa) reduced to mean sea level at 3pm
- Cloud9am: fraction of sky obscured by cloud at 9am. This is measured in "oktas", which are a unit of eigths. It records how many eigths of the sky are obscured by cloud. A 0 measure indicates completely clear sky whilst an 8 indicates that it is completely overcast
- Cloud3pm: fraction of sky obscured by cloud (in "oktas": eighths) at 3pm. See Cload9am for a description of the values
- Temp9am: temperature (degrees C) at 9am
- Temp3pm: temperature (degrees C) at 3pm
- RainToday: whether the precipitation (mm) in the 24 hours to 9am exceeded 1mm, Yes: the precipitation exceeded 1mm, No: it did not exceed 1mm

A detailed description of the target variable is shown below: 

- RainTomorrow: whether amount of next day rain exceeded 1mm, Yes: next day precipitation exceeded 1mm, No: it did not exceed 1mm

For this dataset, we want use those 22 input features and apply machine learning algorithms to predict whether it rained the next day or not.

Data source: http://www.bom.gov.au/climate/dwo/ and http://www.bom.gov.au/climate/data.

In [11]:
# load Australian rain dataset
aus = pd.read_csv('Data/weatherAUS.csv')
aus.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [12]:
# display the number of missing values in each column
aus.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [13]:
# fill missing values in categorical columns with the mode 
aus_cate_cols = aus.dtypes.index[aus.dtypes == "object"].tolist()
for cate_col in aus_cate_cols:
    aus[cate_col] = aus[cate_col].fillna(aus[cate_col].mode()[0])

In [14]:
# fill missing values in numerical columns with the mean
aus_num_cols = aus.dtypes.index[aus.dtypes == "float64"].tolist()
for num_col in aus_num_cols:
    aus[num_col] = aus[num_col].fillna(aus[num_col].mean())
    # scale numerical attributes to 0 mean 1 std
    aus[num_col] = (aus[num_col] - aus[num_col].mean()) / aus[num_col].std()

In [15]:
# all missing values are filled
aus.isnull().sum()

Date             0
Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

In [16]:
# split the date of each observation into year, month, and day
splitted_date = aus['Date'].str.split('-')

# create 'Year', 'Month', 'Day' columns using splitted results of the date
aus['Year'] = splitted_date.str[0].astype(int)
aus['Month'] = splitted_date.str[1].astype(int)
aus['Day'] = splitted_date.str[2].astype(int)

# drop original 'Date' column
aus = aus.drop(columns = ['Date'])

In [17]:
# use 0 and 1 to indicate whether it rained or not
# 0: it rained, 1: it did not rain
aus['RainToday'] = aus['RainToday'].replace({'No': 0, 'Yes': 1})
aus['RainTomorrow'] = aus['RainTomorrow'].replace({'No': 0, 'Yes': 1})

In [18]:
# one hot encoding all categorical columns
cate_cols = aus.dtypes.index[aus.dtypes == "object"].tolist()
aus = pd.get_dummies(columns = cate_cols, data = aus)
aus.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,0.189446,-0.045336,-0.210071,-1.737284e-12,-1.385234e-12,0.302233,0.672219,0.612321,0.112394,-1.44296,...,0,0,0,0,0,0,0,0,1,0
1,-0.753098,0.265042,-0.281649,-1.737284e-12,-1.385234e-12,0.302233,-1.133434,0.382873,-1.319604,-1.296413,...,0,0,0,0,0,0,0,0,0,1
2,0.110901,0.349691,-0.281649,-1.737284e-12,-1.385234e-12,0.454692,0.559366,0.841768,-1.637826,-1.052167,...,0,0,0,0,0,0,0,0,0,1
3,-0.470335,0.674177,-0.281649,-1.737284e-12,-1.385234e-12,-1.22236,-0.343461,-1.108537,-1.266567,-1.736055,...,0,0,0,0,0,0,0,0,0,0
4,0.833518,1.280826,-0.162353,-1.737284e-12,-1.385234e-12,0.073544,-0.794874,0.153425,0.695801,-0.90562,...,0,1,0,0,0,0,0,0,0,0


In [19]:
# 78.0854% negative labels, 21.9146% positive labels
aus['RainTomorrow'].value_counts(normalize = True)

0    0.780854
1    0.219146
Name: RainTomorrow, dtype: float64

In [20]:
# split the rain dataset into input features and labels 
aus_X = aus.drop(columns=['RainTomorrow']) # input features
aus_y = aus['RainTomorrow'] # true lables

### Clean BnB Dataset

The dataset is from https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data?select=AB_NYC_2019.csv, and contains data from AirBnB listing and metrics in New York City, New York for the year 2019. There are 47900 unique values in this dataset, in which we will be using different features to predict whether an AirBnB pricing is expensive or not.

In [3]:
# load the AirBnb dataset
airbnb = pd.read_csv('data/AB_NYC_2019.csv')
airbnb.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [4]:
# We will be looking at these features of the AirBnB for our predictive model
airbnb = airbnb[['neighbourhood_group','latitude','longitude','room_type', 'price',
                       'minimum_nights','number_of_reviews','availability_365']]

In [5]:
# The price median will be the threshold for our expensive classifier
airbnb['price'].median()

106.0

In [6]:
# Creating a new column with prices greater than the median as True, lesser than as False
airbnb = airbnb.assign(
    is_expensive = airbnb.get('price') > 106.0
)
airbnb.head()

Unnamed: 0,neighbourhood_group,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,availability_365,is_expensive
0,Brooklyn,40.64749,-73.97237,Private room,149,1,9,365,True
1,Manhattan,40.75362,-73.98377,Entire home/apt,225,1,45,355,True
2,Manhattan,40.80902,-73.9419,Private room,150,3,0,365,True
3,Brooklyn,40.68514,-73.95976,Entire home/apt,89,1,270,194,False
4,Manhattan,40.79851,-73.94399,Entire home/apt,80,10,9,0,False


In [7]:
# split data into input features and labels
airbnb_df_X = airbnb.drop(columns=['is_expensive'])
airbnb_df_y = airbnb['is_expensive']

In [8]:
# one hot encoding for the categorical columns
airbnb_X_cat_col = ['neighbourhood_group', 'room_type']
airbnb_X = pd.get_dummies(columns=airbnb_X_cat_col, data=airbnb_df_X)

airbnb_y = airbnb_df_y.replace({True: 1, False: 0})

### Clean Olympic Dataset

The dataset is from https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results?select=athlete_events.csv, and includes historic data of all participants from the Olympic Games, from Athens 1896 to Rio 2016. There are 271116 unique values/observations. We want to see whether or not we can predict a gold medalist just by participant features.

In [10]:
# load the Olympic dataset
olympic = pd.read_csv('data/athlete_events.csv')
olympic.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [11]:
# These are the features we will be looking at for our classifier
olympic = olympic[['Sex', 'Age','Height','Weight','NOC','Sport','Medal']]
olympic.head()

Unnamed: 0,Sex,Age,Height,Weight,NOC,Sport,Medal
0,M,24.0,180.0,80.0,CHN,Basketball,
1,M,23.0,170.0,60.0,CHN,Judo,
2,M,24.0,,,DEN,Football,
3,M,34.0,,,DEN,Tug-Of-War,Gold
4,F,21.0,185.0,82.0,NED,Speed Skating,


In [12]:
# There are some NaN values in this dataset, so we want to remove those
olympic.shape

(271116, 7)

In [13]:
# This removes the NaN values from the numerical columns
olympic = olympic[olympic['Height'].notna()]
olympic = olympic[olympic['Weight'].notna()]
olympic = olympic[olympic['Age'].notna()]
olympic.head()

Unnamed: 0,Sex,Age,Height,Weight,NOC,Sport,Medal
0,M,24.0,180.0,80.0,CHN,Basketball,
1,M,23.0,170.0,60.0,CHN,Judo,
4,F,21.0,185.0,82.0,NED,Speed Skating,
5,F,21.0,185.0,82.0,NED,Speed Skating,
6,F,25.0,185.0,82.0,NED,Speed Skating,


In [14]:
# After that cleaning, we are left with 206165 participants to work with
olympic.shape

(206165, 7)

In [15]:
# We want to make a column to see if the participant is a Gold Medalist
olympic['Gold'] = olympic['Medal']=='Gold'
olympic.head()

Unnamed: 0,Sex,Age,Height,Weight,NOC,Sport,Medal,Gold
0,M,24.0,180.0,80.0,CHN,Basketball,,False
1,M,23.0,170.0,60.0,CHN,Judo,,False
4,F,21.0,185.0,82.0,NED,Speed Skating,,False
5,F,21.0,185.0,82.0,NED,Speed Skating,,False
6,F,25.0,185.0,82.0,NED,Speed Skating,,False


In [16]:
# We no longer need the Medal column
olympic = olympic.drop(columns=['Medal'])
olympic.head()

Unnamed: 0,Sex,Age,Height,Weight,NOC,Sport,Gold
0,M,24.0,180.0,80.0,CHN,Basketball,False
1,M,23.0,170.0,60.0,CHN,Judo,False
4,F,21.0,185.0,82.0,NED,Speed Skating,False
5,F,21.0,185.0,82.0,NED,Speed Skating,False
6,F,25.0,185.0,82.0,NED,Speed Skating,False


In [17]:
# split the olympic dataset into input features and labels
olympic_df_X = olympic.drop(columns=['Gold'])
olympic_df_y = olympic['Gold']

In [18]:
# one hot encode all the categorical columns
olympic_X_cat_col = ['Sex','NOC','Sport']
olympic_X = pd.get_dummies(columns=olympic_X_cat_col, data=olympic_df_X)
olympic_y = olympic_df_y.replace({True: 0, False: 1})

### Perform Trials

In [27]:
# Parameters for the model

tree_params = [
    {
        'max_depth': [2,3,4,5,7,10,13,15,18,None], 
        'min_samples_split':[2,3,5,7,10,15,20],
        'min_samples_leaf':[2,3,5,7,10,15,20]
    }
]

log_reg_params = [        
    {
        'solver': ['lbfgs'],
        'max_iter': [5000],
        'penalty': ['l2'],
        'C': 10**np.arange(-4, 5, 1, dtype='float32')
    },
    {
        'solver': ['saga'],
        'max_iter': [5000],
        'penalty': ['l1', 'l2'],
        'C': 10**np.arange(-4, 5, 1, dtype='float32')
    },
    {
        'solver': ['saga', 'lbfgs'],
        'max_iter': [5000],
        'penalty': ['none']
    }
]

perceptron_params = [
    {
        'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1]
    }
]

svc_params = [
    {
        'kernel': ['linear'],
        'C': 10 **np.array(np.arange(-3, 2, 2), dtype='float32')
    },
    {
        'kernel': ['poly'],
        'degree': [2, 3],
        'C': 10 **np.array(np.arange(-3, 2, 2), dtype='float32'),
    },
    {
        'kernel': ['rbf'],
        'C': 10 **np.array(np.arange(-3, 2, 2), dtype='float32'),
        'gamma': [0.001,0.01,0.1,1,2]
    }
]

knn_params = [
    {
        'n_neighbors': np.arange(1, 106, 4),
        'metric': ["euclidean", "manhattan", "minkowski"]
    }
]

forest_params = [
    {
        'n_estimators': [1024],
        'min_samples_split': [1, 2, 4, 6, 8, 12, 16, 20]
    }
]

# models that do not include SVM classifier
models_without_svm = {
    'tree': (DecisionTreeClassifier(), tree_params),
    'log_reg': (LogisticRegression(), log_reg_params),
    'perceptron': (Perceptron(), perceptron_params),
    'knn': (KNeighborsClassifier(), knn_params),
    'forest': (RandomForestClassifier(), forest_params)
}

# SVM model
models_only_svm = {
    'svm': (SVC(), svc_params)
}

In [248]:
(list(models_without_svm.keys()) + list(models_only_svm.keys()))

['tree', 'log_reg', 'perceptron', 'knn', 'forest', 'svm']

In [103]:
models = list(models_without_svm.keys()) + list(models_only_svm.keys())

results_columns = ['dataset', 'model', 'trial',
                       'train_accuracy', 'train_precision',
                       'train_recall', 'train_specificity',
                       'train_f1', 'train_roc_auc',
                       'test_accuracy', 'test_precision', 
                       'test_recall', 'test_specificity',
                       'test_f1', 'test_roc_auc']
scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score),
        'recall': make_scorer(recall_score),
        'specificity': make_scorer(recall_score, pos_label=0),
        'f1': make_scorer(f1_score),
        'roc_auc': make_scorer(roc_auc_score),
    }

In [29]:
# perform 7 trials using each of 6 algorithms on one dataset
def perform_trials(dataset_name, models, data_X, data_y):
    
    num_trials = 7
    
    data_results = pd.DataFrame(columns=results_columns)

    for model_name in models.keys():
        model = models[model_name][0]        
        model_params_grid = models[model_name][1]
        model_results = pd.DataFrame(columns=results_columns)
        
        # perform 7 trials using each model on the dataset
        for trial_count in range(num_trials):
            # pick 5000 samples with replacement to be in the training set
            X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, 
                                                                train_size=5000, 
                                                                random_state=trial_count)
            
            # grid search with 5 k-folds
            search = GridSearchCV(model, model_params_grid, cv=5, verbose=3,
                                  n_jobs=-1, refit=False, scoring=scoring)
            
            # fit grid search model with training set
            search.fit(X_train, y_train)
            
            # store 7 metrics calculated in one trial
            model_result = {
                'dataset': dataset_name,
                'model': model_name,
                'trial': trial_count + 1
            }
            
            for score_name in scoring.keys():
                # find the best parameters that make model achieves best score of the metric
                best_params = search.cv_results_['params'][np.argmin(search.cv_results_['rank_test_' + score_name])]
                # use best parameters to create the optimal model for the metric
                best_model = clone(model).set_params(**best_params)
                # train the optimal model
                best_model.fit(X_train, y_train)
                
                # compute metrics
                train_score = scoring[score_name](best_model, X_train, y_train)
                test_score = scoring[score_name](best_model, X_test, y_test)

                # append scores
                model_result['train_' + score_name] = train_score
                model_result['test_' + score_name] = test_score
            
            # append scores of one trial to the model_results dataframe
            model_results = model_results.append(model_result, ignore_index=True)
        
        # append model_results to data_results
        data_results = data_results.append(model_results, ignore_index=True)
        
        # store scores averaged over 7 trials
        avg_result = {
            'dataset': dataset_name,
            'model': model_name,
            'trial': 'avg',
            
            'train_accuracy': model_results.train_accuracy.mean(),
            'train_precision': model_results.train_precision.mean(),
            'train_recall': model_results.train_recall.mean(),
            'train_specificity': model_results.train_specificity.mean(),
            'train_f1': model_results.train_f1.mean(),
            'train_roc_auc': model_results.train_roc_auc.mean(),
            
            'test_accuracy': model_results.test_accuracy.mean(),
            'test_precision': model_results.test_precision.mean(),
            'test_recall': model_results.test_recall.mean(),
            'test_specificity': model_results.test_specificity.mean(),
            'test_f1': model_results.test_f1.mean(),
            'test_roc_auc': model_results.test_roc_auc.mean()
        }
        
        # append avg_result to the data_results dataframe
        data_results = data_results.append(avg_result, ignore_index=True)
    
    return data_results

### Chess Results

In [None]:
chess_results_no_svm = perform_trials('chess', models_without_svm, chess_X, chess_y)
chess_results_no_svm.to_csv('results/chess_no_svm.csv')
chess_results_no_svm

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   46.1s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.2min finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   32.1s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:   54.1s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.0min finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.0min finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:   27.9s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:   55.6s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.1min finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.2min finished


Fitting 5 folds for each of 29 candidates, totalling 145 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.9s


In [None]:
chess_results_svm = perform_trials('chess', models_only_svm, chess_X, chess_y)
chess_results_svm.to_csv('results/chess_svm.csv')
chess_results_svm

In [35]:
chess_results_no_svm = pd.read_csv('results/chess_no_svm.csv')
chess_results_svm = pd.read_csv('results/chess_svm.csv')

In [36]:
chess_results = chess_results_no_svm.append(chess_results_svm, ignore_index=True)
chess_results.to_csv('results/chess.csv')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


### Shrooms Results

In [59]:
shrooms_results_no_svm = perform_trials('shrooms', models_without_svm, shrooms_X, shrooms_y)
shrooms_results_no_svm.to_csv('results/shrooms_no_svm.csv')
shrooms_results_no_svm

Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 368 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 1008 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 1904 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   16.6s finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 368 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 1008 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 1904 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   16.5s finished


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_roc_auc,train_log_loss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_roc_auc,test_log_loss
0,shrooms,tree,1,1.0,1.0,1.0,1.0,1.0,1.0,1.554272,1.0,1.0,1.0,1.0,1.0,1.0,1.625254
1,shrooms,tree,2,0.9994,1.0,0.99875,1.0,0.999375,0.999375,1.574997,0.998399,1.0,0.996702,1.0,0.998348,0.998351,1.592083
2,shrooms,tree,avg,0.9997,1.0,0.999375,1.0,0.999687,0.999688,1.564635,0.9992,1.0,0.998351,1.0,0.999174,0.999175,1.608669


In [None]:
shrooms_results_svm = perform_trials('shrooms', models_only_svm, shrooms_X, shrooms_y)
shrooms_results_svm.to_csv('results/shrooms_svm.csv')
shrooms_results_svm

In [None]:
shrooms_results_no_svm = pd.read_csv('results/shrooms_no_svm.csv')
shrooms_results_svm = pd.read_csv('results/shrooms_svm.csv')

In [None]:
shrooms_results = shrooms_results_no_svm.append(shrooms_results_svm, ignore_index=True)
shrooms_results.to_csv('results/shrooms.csv')

### Results of Cardio Dataset

In [24]:
# running algorithms except SVM on cardio dataset
cardio_results_no_svm = perform_trials('cardio', models_without_svm, cardio_X, cardio_y)
cardio_results_no_svm

Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100

 0.7441335  0.74557375]
 0.72058804 0.71776871]
 0.75516031 0.75794045]
 0.7316989  0.73092221]
 0.73787418 0.73785458]
 9.04925793 9.04925681]


Fitting 5 folds for each of 8 candidates, totalling 40 fits


 0.73224623 0.73332237]
 0.70172734 0.70133749]
 0.73286088 0.73449603]
 0.71645241 0.71672658]
 0.71729411 0.71791676]
  9.77457847  9.75385456]


Fitting 5 folds for each of 8 candidates, totalling 40 fits


 0.74891148 0.75257985]
 0.69775072 0.70090033]
 0.75802411 0.7616843 ]
 0.72230869 0.72570451]
 0.72788742 0.73129232]
 9.4153656  9.29793232]


Fitting 5 folds for each of 8 candidates, totalling 40 fits


 0.73634522 0.73776768]
 0.69348421 0.69626434]
 0.74737211 0.74818263]
 0.71416158 0.71636865]
 0.72042816 0.72222349]
  9.66404974  9.60187963]


Fitting 5 folds for each of 8 candidates, totalling 40 fits


 0.72766677 0.72878402]
 0.71351017 0.71589824]
 0.73019806 0.73099642]
 0.72035244 0.72220084]
 0.72185411 0.72344733]
  9.6087949   9.55353254]


Fitting 5 folds for each of 8 candidates, totalling 40 fits


 0.73774584 0.74218078]
 0.69114843 0.69277692]
 0.76383066 0.76853654]
 0.7136309  0.71652045]
 0.72748954 0.73065673]
  9.3877357   9.27720969]


Fitting 5 folds for each of 8 candidates, totalling 40 fits


 0.74367672 0.74668793]
 0.69864192 0.69904112]
 0.75861082 0.76221082]
 0.72033562 0.72194811]
 0.72862637 0.73062597]
  9.37392035  9.30484135]


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_roc_auc,train_log_loss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_roc_auc,test_log_loss
0,cardio,tree,1,0.7272,0.76961,0.657407,0.814388,0.7054,0.726756,3.329554,0.718092,0.760048,0.651762,0.803292,0.698032,0.718082,12.5021
1,cardio,tree,2,0.7308,0.825455,0.771586,0.884584,0.736182,0.730678,3.377911,0.716569,0.823521,0.757578,0.888329,0.717287,0.716578,12.549928
2,cardio,tree,3,0.7276,0.77355,0.721763,0.817405,0.738914,0.728793,3.143042,0.723985,0.753861,0.691226,0.803083,0.711474,0.72387,12.463843
3,cardio,tree,4,0.7372,0.787893,0.746233,0.823245,0.724413,0.737665,3.426264,0.724569,0.761494,0.689836,0.797068,0.713895,0.724521,12.273081
4,cardio,tree,5,0.735,0.797519,0.734182,0.807801,0.743664,0.735456,2.977256,0.726877,0.775533,0.706,0.803805,0.711856,0.726788,12.826238
5,cardio,tree,6,0.7348,0.747525,0.731538,0.799922,0.720253,0.734063,3.315736,0.731446,0.761724,0.672682,0.804416,0.720963,0.731479,12.584985
6,cardio,tree,7,0.7414,0.784314,0.714229,0.819456,0.748012,0.741457,3.101595,0.722185,0.772991,0.658558,0.811395,0.695597,0.722577,12.45002
7,cardio,tree,avg,0.733429,0.783695,0.725277,0.823829,0.730977,0.733553,3.238766,0.723389,0.772739,0.689663,0.815913,0.709872,0.723414,12.521457
8,cardio,log_reg,1,0.7292,0.760739,0.663446,1.0,0.708817,0.728782,17.158864,0.726646,0.757817,0.665702,1.0,0.708874,0.726637,17.266731
9,cardio,log_reg,2,0.652,0.661729,1.0,0.658646,0.688273,0.652064,16.938208,0.647785,0.649235,1.0,0.661212,0.681556,0.647767,17.306453


In [25]:
# running SVM algorithm on cardio dataset, generally take longer time to run than other algorithms combined
cardio_results_svm = perform_trials('cardio', models_only_svm, cardio_X, cardio_y)
cardio_results_svm

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_roc_auc,train_log_loss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_roc_auc,test_log_loss
0,cardio,svm,1,0.7372,0.805755,0.814412,1.0,0.719471,0.736826,17.158864,0.7276,0.808387,0.800369,1.0,0.710771,0.727591,17.266731
1,cardio,svm,2,0.7332,0.748567,1.0,0.760604,0.732665,0.733509,16.9313,0.726415,0.733427,0.999784,0.757929,0.7213,0.72638,17.309641
2,cardio,svm,3,0.737,0.785714,1.0,0.824319,0.726668,0.737818,16.979656,0.727262,0.773072,0.99963,0.813709,0.716606,0.727192,17.304327
3,cardio,svm,4,0.735,0.777726,1.0,0.80226,0.741597,0.735625,17.110906,0.727154,0.752055,0.999969,0.786744,0.710793,0.727087,17.2937
4,cardio,svm,5,0.7344,0.770219,1.0,0.80579,0.725506,0.734588,17.179985,0.725523,0.765986,1.0,0.802022,0.715822,0.725488,17.287855
5,cardio,svm,6,0.7284,0.776371,0.821297,1.0,0.704654,0.727103,16.924001,0.727862,0.784359,0.813484,1.0,0.709144,0.727918,17.280547
6,cardio,svm,7,0.7386,0.903614,1.0,0.996797,0.724668,0.738641,17.255972,0.728046,0.831545,1.0,0.995173,0.712434,0.728008,17.28201
7,cardio,svm,avg,0.734829,0.795424,0.947959,0.884253,0.725033,0.734873,17.077241,0.727123,0.778404,0.944748,0.879368,0.713838,0.727095,17.289259


In [26]:
# combine results of svm and non-svm algorithms and save as a csv file
cardio_final_results = cardio_results_no_svm.append(cardio_results_svm, ignore_index=True)
cardio_final_results.to_csv('results/cardio_results.csv', index = False)

In [27]:
# display performance
pd.read_csv('results/cardio_results.csv')

Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_roc_auc,train_log_loss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_roc_auc,test_log_loss
0,cardio,tree,1,0.7272,0.76961,0.657407,0.814388,0.7054,0.726756,3.329554,0.718092,0.760048,0.651762,0.803292,0.698032,0.718082,12.5021
1,cardio,tree,2,0.7308,0.825455,0.771586,0.884584,0.736182,0.730678,3.377911,0.716569,0.823521,0.757578,0.888329,0.717287,0.716578,12.549928
2,cardio,tree,3,0.7276,0.77355,0.721763,0.817405,0.738914,0.728793,3.143042,0.723985,0.753861,0.691226,0.803083,0.711474,0.72387,12.463843
3,cardio,tree,4,0.7372,0.787893,0.746233,0.823245,0.724413,0.737665,3.426264,0.724569,0.761494,0.689836,0.797068,0.713895,0.724521,12.273081
4,cardio,tree,5,0.735,0.797519,0.734182,0.807801,0.743664,0.735456,2.977256,0.726877,0.775533,0.706,0.803805,0.711856,0.726788,12.826238
5,cardio,tree,6,0.7348,0.747525,0.731538,0.799922,0.720253,0.734063,3.315736,0.731446,0.761724,0.672682,0.804416,0.720963,0.731479,12.584985
6,cardio,tree,7,0.7414,0.784314,0.714229,0.819456,0.748012,0.741457,3.101595,0.722185,0.772991,0.658558,0.811395,0.695597,0.722577,12.45002
7,cardio,tree,avg,0.733429,0.783695,0.725277,0.823829,0.730977,0.733553,3.238766,0.723389,0.772739,0.689663,0.815913,0.709872,0.723414,12.521457
8,cardio,log_reg,1,0.7292,0.760739,0.663446,1.0,0.708817,0.728782,17.158864,0.726646,0.757817,0.665702,1.0,0.708874,0.726637,17.266731
9,cardio,log_reg,2,0.652,0.661729,1.0,0.658646,0.688273,0.652064,16.938208,0.647785,0.649235,1.0,0.661212,0.681556,0.647767,17.306453


### Results of Australian Rain Dataset

In [30]:
# running algorithms except SVM on Australian rain dataset
aus_results_no_svm = perform_trials('aus', models_without_svm, aus_X, aus_y)
aus_results_no_svm

Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 5 folds for each of 20 candidates, totalling 100 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 5 folds for each of 20 candidates, totalling 100 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits


 0.77282843 0.77207898]
 0.40921694 0.4045221 ]
 0.96748818 0.96774199]
 0.53495496 0.5307424 ]
 0.68835256 0.68613204]
 5.22228346 5.24991432]


Fitting 5 folds for each of 8 candidates, totalling 40 fits


 0.77497655 0.78039805]
 0.44636364 0.43818182]
 0.96307692 0.96487179]
 0.5659159  0.56063584]
 0.70472028 0.70152681]
 5.20156275 5.21537714]


Fitting 5 folds for each of 8 candidates, totalling 40 fits


 0.74481347 0.75098865]
 0.39597761 0.39411714]
 0.9627928  0.96432179]
 0.51566134 0.51539945]
 0.6793852  0.67921946]
 5.49859655 5.47096457]


Fitting 5 folds for each of 8 candidates, totalling 40 fits


 0.77640566 0.77869872]
 0.45081219 0.4472645 ]
 0.96099525 0.96151238]
 0.56853344 0.56619252]
 0.70590372 0.70438844]
 5.32590347 5.33971866]


Fitting 5 folds for each of 8 candidates, totalling 40 fits


 0.77434258 0.78001521]
 0.44978379 0.45337524]
 0.96241127 0.96318314]
 0.56859129 0.5728229 ]
 0.70609753 0.70827919]
 5.24991736 5.20156259]


Fitting 5 folds for each of 8 candidates, totalling 40 fits


 0.76996873 0.76524748]
 0.44018692 0.4317757 ]
 0.96412214 0.96386768]
 0.55988356 0.55158775]
 0.70215453 0.69782169]
 5.11176146 5.18083917]


Fitting 5 folds for each of 8 candidates, totalling 40 fits


 0.7402196  0.7397975 ]
 0.40777046 0.40323735]
 0.95973339 0.95999013]
 0.52520984 0.52145675]
 0.68375193 0.68161374]
 5.58839913 5.61602999]


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_roc_auc,train_log_loss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_roc_auc,test_log_loss
0,aus,tree,1,0.8492,0.8,0.634995,0.98222,0.577857,0.793885,1.768402,0.83213,0.793374,0.454306,0.981741,0.514792,0.680295,7.417087
1,aus,tree,2,0.8534,0.777567,0.864545,0.97,0.595251,0.722949,1.733863,0.829382,0.734551,0.487182,0.965026,0.528443,0.68799,7.434552
2,aus,tree,3,0.8424,0.784884,0.860595,0.981142,0.559152,0.799772,1.733862,0.828926,0.780864,0.467777,0.978989,0.524279,0.680138,7.590942
3,aus,tree,4,0.8538,0.823529,0.713906,0.9845,0.62339,0.741199,1.125968,0.828435,0.801478,0.492878,0.98384,0.539103,0.695147,7.276675
4,aus,tree,5,0.8622,0.838608,0.879928,0.986869,0.703435,0.796881,1.637154,0.829895,0.801005,0.503917,0.983756,0.541967,0.702007,7.460127
5,aus,tree,6,0.8568,0.772182,0.715888,0.975827,0.656448,0.764614,1.699321,0.823978,0.76804,0.463466,0.976079,0.544421,0.699465,7.465532
6,aus,tree,7,0.8388,0.775701,0.849228,0.969223,0.697723,0.79317,1.678599,0.833198,0.73595,0.481284,0.96252,0.517844,0.690402,7.350698
7,aus,tree,avg,0.850943,0.796067,0.788441,0.97854,0.630465,0.77321,1.62531,0.829421,0.773609,0.478687,0.975993,0.530121,0.690778,7.427945
8,aus,log_reg,1,0.8536,0.788591,0.493885,1.0,0.592217,0.723448,7.336036,0.844789,0.822835,0.466541,0.999991,0.568289,0.708628,7.570941
9,aus,log_reg,2,0.8546,0.852665,0.532727,1.0,0.614256,0.737902,7.584715,0.842987,0.79805,0.504143,0.999991,0.584528,0.721091,7.559876


In [31]:
# running SVM algorithm on Australian rain dataset, generally take longer time to run than other algorithms combined
aus_results_svm = perform_trials('aus', models_only_svm, aus_X, aus_y)
aus_results_svm

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_roc_auc,train_log_loss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_roc_auc,test_log_loss
0,aus,svm,1,0.8576,0.808743,0.484478,1.0,0.588915,0.719694,7.342944,0.845016,0.842434,0.458103,1.0,0.552106,0.697773,7.577089
1,aus,svm,2,0.859,0.942529,0.7,1.0,0.619383,0.737564,7.598531,0.84251,0.879286,0.472528,1.0,0.574432,0.713079,7.56799
2,aus,svm,3,0.8534,0.861272,0.47119,1.0,0.572881,0.711767,7.432745,0.843343,0.860588,0.46755,1.0,0.561399,0.705952,7.573892
3,aus,svm,4,0.8562,0.895161,0.719221,1.0,0.811189,0.851731,7.798856,0.843429,0.86999,0.478763,1.0,0.561484,0.70763,7.560859
4,aus,svm,5,0.8552,0.886667,0.511649,1.0,0.602321,0.728919,7.709055,0.844867,0.854671,0.480836,1.0,0.569531,0.711297,7.564056
5,aus,svm,6,0.856,0.946667,0.490654,1.0,0.590551,0.722045,7.391298,0.84518,0.902718,0.46061,1.0,0.55775,0.703466,7.575367
6,aus,svm,7,0.8472,0.85446,0.506812,1.0,0.588918,0.723142,7.605439,0.844724,0.844099,0.493956,1.0,0.574148,0.715173,7.567744
7,aus,svm,avg,0.854943,0.885071,0.554858,1.0,0.62488,0.742123,7.554124,0.844153,0.864827,0.473192,1.0,0.564407,0.707767,7.569571


In [32]:
# combine results of svm and non-svm algorithms and save as a csv file
aus_final_results = aus_results_no_svm.append(aus_results_svm, ignore_index=True)
aus_final_results.to_csv('results/aus_results.csv', index = False)

In [33]:
# display performance
pd.read_csv('results/aus_results.csv')

Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_roc_auc,train_log_loss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_roc_auc,test_log_loss
0,aus,tree,1,0.8492,0.8,0.634995,0.98222,0.577857,0.793885,1.768402,0.83213,0.793374,0.454306,0.981741,0.514792,0.680295,7.417087
1,aus,tree,2,0.8534,0.777567,0.864545,0.97,0.595251,0.722949,1.733863,0.829382,0.734551,0.487182,0.965026,0.528443,0.68799,7.434552
2,aus,tree,3,0.8424,0.784884,0.860595,0.981142,0.559152,0.799772,1.733862,0.828926,0.780864,0.467777,0.978989,0.524279,0.680138,7.590942
3,aus,tree,4,0.8538,0.823529,0.713906,0.9845,0.62339,0.741199,1.125968,0.828435,0.801478,0.492878,0.98384,0.539103,0.695147,7.276675
4,aus,tree,5,0.8622,0.838608,0.879928,0.986869,0.703435,0.796881,1.637154,0.829895,0.801005,0.503917,0.983756,0.541967,0.702007,7.460127
5,aus,tree,6,0.8568,0.772182,0.715888,0.975827,0.656448,0.764614,1.699321,0.823978,0.76804,0.463466,0.976079,0.544421,0.699465,7.465532
6,aus,tree,7,0.8388,0.775701,0.849228,0.969223,0.697723,0.79317,1.678599,0.833198,0.73595,0.481284,0.96252,0.517844,0.690402,7.350698
7,aus,tree,avg,0.850943,0.796067,0.788441,0.97854,0.630465,0.77321,1.62531,0.829421,0.773609,0.478687,0.975993,0.530121,0.690778,7.427945
8,aus,log_reg,1,0.8536,0.788591,0.493885,1.0,0.592217,0.723448,7.336036,0.844789,0.822835,0.466541,0.999991,0.568289,0.708628,7.570941
9,aus,log_reg,2,0.8546,0.852665,0.532727,1.0,0.614256,0.737902,7.584715,0.842987,0.79805,0.504143,0.999991,0.584528,0.721091,7.559876


### Results of AirBnB Price Dataset

In [22]:
# running algorithms except SVM on airbnb dataset
airbnb_results_no_svm = perform_trials('airbnb', models_without_svm, airbnb_X, airbnb_y)
airbnb_results_no_svm

Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 656 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 1936 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   15.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1528 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:    9.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1528 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   10.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1528 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   10.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 2104 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   13.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 253 out of 260 | elapsed:    9.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:    9.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 253 out of 260 | elapsed:    8.8s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:    8.9s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 253 out of 260 | elapsed:    8.7s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:    9.1s finished
  "Setting penalty='none' will ignore the C and l1_ratio "


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:    9.8s finished
  "Setting penalty='none' will ignore the C and l1_ratio "


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 253 out of 260 | elapsed:    9.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:    9.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 398 out of 405 | elapsed:   14.7s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   14.9s finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 398 out of 405 | elapsed:   14.5s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   14.8s finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 398 out of 405 | elapsed:   14.7s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   15.0s finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   15.5s finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 398 out of 405 | elapsed:   15.0s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   15.3s finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   56.0s finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   31.6s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   55.0s finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   55.6s finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   55.5s finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   55.5s finished


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,airbnb,tree,1,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
1,airbnb,tree,2,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
2,airbnb,tree,3,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
3,airbnb,tree,4,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
4,airbnb,tree,5,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
5,airbnb,tree,avg,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
6,airbnb,log_reg,1,0.9878,1.0,0.975373,1.0,0.987533,0.987687,0.4213731,0.98804,1.0,0.976078,1.0,0.987894,0.988039,0.4130962
7,airbnb,log_reg,2,0.9906,1.0,0.981033,1.0,0.990426,0.990517,0.3246645,0.987721,0.99958,0.975849,0.99959,0.987572,0.987719,0.4241123
8,airbnb,log_reg,3,0.9876,1.0,0.974684,1.0,0.987179,0.987342,0.4282808,0.989065,1.0,0.978156,1.0,0.988957,0.989078,0.377688
9,airbnb,log_reg,4,0.9884,1.0,0.976585,1.0,0.988154,0.988292,0.4006498,0.989725,1.0,0.97945,1.0,0.989618,0.989725,0.3548693


In [23]:
airbnb_results_svm = perform_trials('airbnb', models_only_svm, airbnb_X, airbnb_y)
airbnb_results_svm

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  39 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   27.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   25.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   26.3s finished


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   25.7s finished


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   26.2s finished


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,airbnb,svm,1,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,0.999795,0.999727,0.999863,0.999727,0.999795,0.999795,0.007082
1,airbnb,svm,2,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,0.999704,0.999408,1.0,0.999408,0.999704,0.999704,0.010229
2,airbnb,svm,3,0.9994,0.998777,1.0,0.998824,0.999388,0.999412,0.02072375,0.999271,0.998546,1.0,0.99854,0.999272,0.99927,0.02518
3,airbnb,svm,4,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,0.999863,0.999863,0.999863,0.999863,0.999863,0.999863,0.004721
4,airbnb,svm,5,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,0.999954,1.0,0.999909,1.0,0.999954,0.999954,0.001574
5,airbnb,svm,avg,0.99988,0.999755,1.0,0.999765,0.999878,0.999882,0.004144749,0.999718,0.999509,0.999927,0.999508,0.999718,0.999717,0.009757


In [24]:
airbnb_final_results = airbnb_results_no_svm.append(airbnb_results_svm, ignore_index=True)
airbnb_final_results.to_csv('results/airbnb_results.csv', index = False)

In [25]:
# display performance
pd.read_csv('results/airbnb_results.csv')

Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,airbnb,tree,1,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
1,airbnb,tree,2,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
2,airbnb,tree,3,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
3,airbnb,tree,4,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
4,airbnb,tree,5,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
5,airbnb,tree,avg,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
6,airbnb,log_reg,1,0.9878,1.0,0.975373,1.0,0.987533,0.987687,0.4213731,0.98804,1.0,0.976078,1.0,0.987894,0.988039,0.4130962
7,airbnb,log_reg,2,0.9906,1.0,0.981033,1.0,0.990426,0.990517,0.3246645,0.987721,0.99958,0.975849,0.99959,0.987572,0.987719,0.4241123
8,airbnb,log_reg,3,0.9876,1.0,0.974684,1.0,0.987179,0.987342,0.4282808,0.989065,1.0,0.978156,1.0,0.988957,0.989078,0.377688
9,airbnb,log_reg,4,0.9884,1.0,0.976585,1.0,0.988154,0.988292,0.4006498,0.989725,1.0,0.97945,1.0,0.989618,0.989725,0.3548693


### Results of Olympic Gold Medal Dataset


In [26]:
# running algorithms except SVM on Olympic dataset
olympic_results_no_svm = perform_trials('olympic', models_without_svm, olympic_X, olympic_y)
olympic_results_no_svm

Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 440 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 1080 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 1892 tasks      | elapsed:   45.4s
[Parallel(n_jobs=-1)]: Done 2180 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.4min finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 552 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 1000 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:   36.2s
[Parallel(n_jobs=-1)]: Done 2280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.2min finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 552 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 1000 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done 2280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.3min finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 552 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 1000 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:   40.2s
[Parallel(n_jobs=-1)]: Done 2280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.3min finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 552 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 1000 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done 2280 tasks      | elapsed:   58.8s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.1min finished


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:   41.7s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:  1.8min finished


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:   39.8s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:  1.7min finished


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:   39.9s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:  1.7min finished


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:   40.1s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:  1.7min finished


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:   40.1s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:  1.7min finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  93 out of 100 | elapsed:    2.9s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.0s finished
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  93 out of 100 | elapsed:    2.9s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.0s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  93 out of 100 | elapsed:    2.9s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.0s finished
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  93 out of 100 | elapsed:    2.7s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.8s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  93 out of 100 | elapsed:    2.9s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.0s finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  1.5min finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  1.6min finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  1.6min finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  1.5min finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  1.5min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.6min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.6min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.7min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.6min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.6min finished


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,olympic,tree,1,0.9522,0.954655,0.997053,0.096386,0.975394,0.546719,1.650989,0.949827,0.953788,0.995456,0.069974,0.974177,0.532715,1.732942
1,olympic,tree,2,0.9568,0.958232,0.998108,0.144628,0.977764,0.571368,1.492108,0.950081,0.95434,0.9951,0.08262,0.974294,0.53886,1.724185
2,olympic,tree,3,0.954,0.954911,0.998737,0.104,0.976333,0.551368,1.58882,0.951,0.953734,0.996816,0.06746,0.974799,0.532138,1.692422
3,olympic,tree,4,0.95,0.952678,0.996839,0.074803,0.974259,0.535821,1.726976,0.948853,0.952527,0.995833,0.042469,0.973699,0.519151,1.766595
4,olympic,tree,5,0.9604,0.961701,0.998326,0.140271,0.979671,0.569299,1.367766,0.950568,0.953834,0.996214,0.072994,0.974563,0.534604,1.707359
5,olympic,tree,avg,0.95468,0.956436,0.997813,0.112018,0.976684,0.554915,1.565332,0.950066,0.953645,0.995884,0.067103,0.974306,0.531494,1.724701
6,olympic,log_reg,1,0.9502,0.9502,1.0,0.0,0.974464,0.5,1.720071,0.950697,0.950697,1.0,0.0,0.974726,0.5,1.702898
7,olympic,log_reg,2,0.9516,0.9516,1.0,0.0,0.9752,0.5,1.671715,0.950662,0.950662,1.0,0.0,0.974707,0.5,1.7041
8,olympic,log_reg,3,0.95,0.95,1.0,0.0,0.974359,0.5,1.726979,0.950702,0.950702,1.0,0.0,0.974728,0.5,1.702726
9,olympic,log_reg,4,0.9492,0.9492,1.0,0.0,0.973938,0.5,1.75461,0.950722,0.950722,1.0,0.0,0.974739,0.5,1.70204


In [27]:
olympic_results_svm = perform_trials('olympic', models_only_svm, olympic_X, olympic_y)
olympic_results_svm

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  2.2min finished


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  2.1min finished


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  2.1min finished


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  2.1min finished


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  2.0min finished


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,olympic,svm,1,0.9502,0.9502,1.0,0.0,0.974464,0.5,1.720071,0.950697,0.950697,1.0,0.0,0.974726,0.5,1.702898
1,olympic,svm,2,0.9516,0.9516,1.0,0.0,0.9752,0.5,1.671715,0.950662,0.950662,1.0,0.0,0.974707,0.5,1.7041
2,olympic,svm,3,0.95,0.95,1.0,0.0,0.974359,0.5,1.726979,0.950702,0.950702,1.0,0.0,0.974728,0.5,1.702726
3,olympic,svm,4,0.9492,0.9492,1.0,0.0,0.973938,0.5,1.75461,0.950722,0.950722,1.0,0.0,0.974739,0.5,1.70204
4,olympic,svm,5,0.9558,0.9558,1.0,0.0,0.977401,0.5,1.526649,0.950558,0.950558,1.0,0.0,0.974652,0.5,1.707706
5,olympic,svm,avg,0.95136,0.95136,1.0,0.0,0.975072,0.5,1.680005,0.950668,0.950668,1.0,0.0,0.97471,0.5,1.703894


In [28]:
olympic_final_results = olympic_results_no_svm.append(olympic_results_svm, ignore_index=True)
olympic_final_results.to_csv('results/olympic_results.csv', index = False)

In [29]:
# display performance
pd.read_csv('results/olympic_results.csv')

Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,olympic,tree,1,0.9522,0.954655,0.997053,0.096386,0.975394,0.546719,1.650989,0.949827,0.953788,0.995456,0.069974,0.974177,0.532715,1.732942
1,olympic,tree,2,0.9568,0.958232,0.998108,0.144628,0.977764,0.571368,1.492108,0.950081,0.95434,0.9951,0.08262,0.974294,0.53886,1.724185
2,olympic,tree,3,0.954,0.954911,0.998737,0.104,0.976333,0.551368,1.58882,0.951,0.953734,0.996816,0.06746,0.974799,0.532138,1.692422
3,olympic,tree,4,0.95,0.952678,0.996839,0.074803,0.974259,0.535821,1.726976,0.948853,0.952527,0.995833,0.042469,0.973699,0.519151,1.766595
4,olympic,tree,5,0.9604,0.961701,0.998326,0.140271,0.979671,0.569299,1.367766,0.950568,0.953834,0.996214,0.072994,0.974563,0.534604,1.707359
5,olympic,tree,avg,0.95468,0.956436,0.997813,0.112018,0.976684,0.554915,1.565332,0.950066,0.953645,0.995884,0.067103,0.974306,0.531494,1.724701
6,olympic,log_reg,1,0.9502,0.9502,1.0,0.0,0.974464,0.5,1.720071,0.950697,0.950697,1.0,0.0,0.974726,0.5,1.702898
7,olympic,log_reg,2,0.9516,0.9516,1.0,0.0,0.9752,0.5,1.671715,0.950662,0.950662,1.0,0.0,0.974707,0.5,1.7041
8,olympic,log_reg,3,0.95,0.95,1.0,0.0,0.974359,0.5,1.726979,0.950702,0.950702,1.0,0.0,0.974728,0.5,1.702726
9,olympic,log_reg,4,0.9492,0.9492,1.0,0.0,0.973938,0.5,1.75461,0.950722,0.950722,1.0,0.0,0.974739,0.5,1.70204


## output tables

In [49]:
# combine datasets
cardio_results = pd.read_csv('results/cardio_results.csv')
olympic_results = pd.read_csv('results/olympic_results.csv')
olympic_results.columns = cardio_results.columns
airbnb_results = pd.read_csv('results/airbnb_results.csv')
airbnb_results.columns = cardio_results.columns
aus_results = pd.read_csv('results/aus_results.csv')
results = cardio_results.append([olympic_results, airbnb_results, aus_results], ignore_index = True)

In [50]:
train_metrics = ["train_" + x for x in list(scoring.keys())]
test_metrics = ["test_" + x for x in list(scoring.keys())]

In [230]:
test_avg_results = results.loc[results['trial'] == 'avg', ['dataset', 'model', 'trial'] + test_metrics]
test_avg_results = test_avg_results.reset_index(drop = True)

In [231]:
test_results = results.loc[results['trial'] != 'avg', ['dataset', 'model', 'trial'] + test_metrics]
test_results = test_results.reset_index(drop = True)

In [232]:
train_avg_results = results.loc[results['trial'] == 'avg', ['dataset', 'model', 'trial'] + train_metrics]
train_avg_results = train_avg_results.reset_index(drop = True)

In [233]:
train_results = results.loc[results['trial'] != 'avg', ['dataset', 'model', 'trial'] + train_metrics]
train_results = train_results.reset_index(drop = True)

In [241]:
dataset = train_avg_results['dataset'].unique().tolist() 

#### Table 2

In [235]:
table2 = test_avg_results.groupby(by='model').mean()
table2['mean'] = table2.apply(np.mean, axis = 1)
table2 = table2.reset_index()
table2

Unnamed: 0,model,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_roc_auc,mean
0,forest,0.880654,0.86087,0.784385,0.687425,0.81093,0.735518,0.793297
1,knn,0.855209,0.874959,0.755615,0.681039,0.764354,0.700043,0.77187
2,log_reg,0.869327,0.867512,0.807338,0.698772,0.785246,0.72506,0.792209
3,perceptron,0.697309,0.640654,0.646051,0.714123,0.604598,0.677625,0.663393
4,svm,0.880415,0.898352,0.854467,0.719719,0.813168,0.733645,0.816628
5,tree,0.875719,0.874998,0.791059,0.714752,0.803575,0.736421,0.799421


#### Table 2 p value appendix 

In [236]:
best_models = table2.set_index('model').idxmax()

In [237]:
tl2_pvals = pd.DataFrame(columns = ['model'] + test_metrics)
tl2_pvals['model'] = table2['model']

In [238]:
for metric in test_metrics:
    best_model = best_models[metric]
    grp = test_results.groupby(by='model')
    for model in models:
        pval = stats.ttest_ind(grp.get_group(best_model)[metric].tolist(), grp.get_group(model)[metric].tolist()).pvalue
        tl2_pvals.loc[tl2_pvals['model'] == model, metric] = pval
tl2_pvals

Unnamed: 0,model,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_roc_auc
0,forest,1.0,0.149951,0.247245,0.714857,0.961355,0.999837
1,knn,0.408413,0.387342,0.114356,0.696504,0.359515,0.410973
2,log_reg,0.709716,0.238554,0.487489,0.834681,0.611499,0.836367
3,perceptron,0.00721976,0.00190464,0.0532257,0.751263,0.0171629,0.200692
4,svm,0.994095,1.0,1.0,1.0,1.0,0.989963
5,tree,0.85645,0.331502,0.278635,0.9184,0.840714,1.0


#### Table 3

In [246]:
models

['tree', 'log_reg', 'perceptron', 'knn', 'forest', 'svm']

In [242]:
table3 = pd.DataFrame(columns = ['model'] + dataset + ['mean'])
table3['model'] = 

Unnamed: 0,model,cardio,olympic,airbnb,aus,mean


In [244]:
train_avg_results.head()

Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_roc_auc
0,cardio,tree,avg,0.733429,0.783695,0.725277,0.823829,0.730977,0.733553
1,cardio,log_reg,avg,0.695571,0.718427,0.758997,0.800589,0.59916,0.695531
2,cardio,perceptron,avg,0.604514,0.797079,0.617288,0.791935,0.523746,0.6022
3,cardio,knn,avg,0.679857,0.699976,0.662977,0.737672,0.667303,0.679873
4,cardio,forest,avg,0.826057,0.833526,0.882582,0.839613,0.822529,0.825942


In [None]:
for i in range

In [208]:
results.loc[results['trial'] == 'avg', test_metrics].apply(np.mean, axis = 1)

7     0.739165
15    0.710081
23    0.655749
31    0.666772
39    0.729092
dtype: float64

#### Appendix 1

In [200]:
appendix1 = results.loc[results['trial'] == 'avg', ['dataset', 'model', 'trial'] + train_metrics]
appendix1 = appendix1.reset_index(drop = True)
appendix1 = appendix1.groupby(by='model').mean()
appendix1['mean'] = appendix1.apply(np.mean, axis = 1)
appendix1 = appendix1.reset_index()
appendix1

Unnamed: 0,model,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_roc_auc,mean
0,forest,0.944053,0.944727,0.961435,0.803177,0.946819,0.874239,0.912408
1,knn,0.866976,0.883886,0.913128,0.685751,0.909122,0.794556,0.842236
2,log_reg,0.871769,0.870004,0.8128,0.699749,0.791637,0.728624,0.795764
3,perceptron,0.698393,0.641897,0.646874,0.715925,0.603835,0.677603,0.664088
4,svm,0.885253,0.907903,0.875704,0.721004,0.831216,0.74422,0.82755
5,tree,0.884763,0.884049,0.877883,0.728597,0.834532,0.765419,0.829207


#### Appendix 1 p value appendix

In [203]:
best_models = appendix1.set_index('model').idxmax()

train_results = results.loc[results['trial'] != 'avg', ['dataset', 'model', 'trial'] + train_metrics]
train_results = train_results.reset_index(drop=True)

appendix1_pvals = pd.DataFrame(columns = ['model'] + train_metrics)
appendix1_pvals['model'] = appendix1['model']

for metric in train_metrics:
    best_model = best_models[metric]
    grp = train_results.groupby(by='model')
    for model in models:
        pval = stats.ttest_ind(grp.get_group(best_model)[metric].tolist(), grp.get_group(model)[metric].tolist()).pvalue
        appendix1_pvals.loc[appendix1_pvals['model'] == model, metric] = pval
appendix1_pvals

Unnamed: 0,model,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_roc_auc
0,forest,1.0,1.0,1.0,1.0,1.0,1.0
1,knn,0.0049323,0.0179431,0.117362,0.285982,0.225815,0.14457
2,log_reg,0.00636608,0.00342604,0.00508212,0.379869,0.000899723,0.00125152
3,perceptron,0.000200617,0.000261449,0.000980005,0.315693,2.20055e-05,5.46066e-05
4,svm,0.0136206,0.0747235,0.0320331,0.52745,0.000937803,0.00415101
5,tree,0.0133099,0.0068008,0.00302404,0.482672,0.000966603,0.00889826


In [None]:
# create table 1

In [None]:
# create table 2

In [None]:
# create table 3