In [3]:
import pandas as pd
import numpy as np
from scipy import stats

from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

### Clean Chess Dataset

rated: whether or not the game was a rated game

created_at: time at game start

turns: how many turns the game took

victory_status: whether the game ended with a timeout, resignation, or checkmate

white_rating: rating of the white player

black_rating: rating of the black player

opening_eco: code corresponding to the opening moves played

opening_ply: the length of the opening

In [4]:
# https://www.kaggle.com/datasnaek/chess
chess = pd.read_csv('Data/games.csv')
chess.head()

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4
2,mIICvQHh,True,1504130000000.0,1504130000000.0,61,mate,white,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3
3,kWKvrqYL,True,1504110000000.0,1504110000000.0,61,mate,white,20+0,daniamurashov,1439,adivanov2009,1454,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,D02,Queen's Pawn Game: Zukertort Variation,3
4,9tXo1AUZ,True,1504030000000.0,1504030000000.0,95,mate,white,30+3,nik221107,1523,adivanov2009,1469,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,C41,Philidor Defense,5


In [5]:
# standarize numerical columns
chess_num_cols = ['created_at', 'last_move_at', 'turns', 'white_rating', 'black_rating', 'opening_ply']
for num_col in chess_num_cols:
    chess[num_col] = (chess[num_col] - chess[num_col].mean()) /chess[num_col].std()

# create a new column called winner_white that is true if white wins, false otherwise
chess['winner_white'] = chess['winner'] == 'white'
chess = chess[['rated', 'created_at', 'last_move_at', 'turns', 'victory_status',
                     'white_rating', 'black_rating', 'opening_eco', 'opening_ply', 'winner_white']]

In [6]:
chess_df_X = chess.drop(columns=['winner_white'])
chess_df_y = chess['winner_white']

In [7]:
# one-hot encode categorical columns
chess_X_cat_col = ['rated', 'victory_status', 'opening_eco']
chess_X = pd.get_dummies(columns=chess_X_cat_col, data=chess_df_X)

chess_y = chess_df_y.replace({True: 1, False: 0})

In [8]:
# 50.1396% negative instances, 49.8604% positive instances
chess_y.value_counts(normalize = True)

0    0.501396
1    0.498604
Name: winner_white, dtype: float64

### Clean Mushrooms Dataset

class: edible(e), poisonous(p)

cap-shape: bell(b), conical(c), convex(x), flat(f), knobbed(k), sunken(s)

cap-surface: fibrous(f), grooves(g), scaly(y), smooth(s)

cap-color: brown(n), buff(b), cinnamon(c), gray(g), green(r), pink(p), purple(u), red(e), white(w), yellow(y)

bruises: bruises(t), no(f)

odor: almond(a), anise(l), creosote(c), fishy(y), foul(f), musty(m), none(n), pungent(p), spicy(s)

gill-attachment: attached(a), descending(d), free(f), notched(n)

gill-spacing: close(c), crowded(w), distant(d)

gill-size: broad(b), narrow(n)

gill-color: black(k), brown(n), buff(b), chocolate(h), gray(g), green(r), orange(o), pink(p), purple(u), red(e), white(w), yellow(y)

stalk-shape: enlarging(e), tapering(t)

stalk-root: bulbous(b), club(c), cup(u), equal(e), rhizomorphs(z), rooted(r), missing(?)

stalk-surface-above-ring: fibrous(f), scaly(y), silky(k), smooth(s)

stalk-surface-below-ring: fibrous(f), scaly(y), silky(k), smooth(s)

stalk-color-above-ring: brown(n), buff(b), cinnamon(c), gray(g), orange(o), pink(p), red(e), white(w), yellow(y)

stalk-color-below-ring: brown(n), buff(b), cinnamon(c), gray(g), orange(o), pink(p), red(e), white(w), yellow(y)

veil-type: partial(p), universal(u)

veil-color: brown(n), orange(o), white(w), yellow(y)

ring-number: none(n), one(o), two(t)

ring-type: cobwebby(c), evanescent(e), flaring(f), large(l), none(n), pendant(p), sheathing(s), zone(z)

spore-print-color: black(k), brown(n), buff(b), chocolate(h), green(r), orange(o), purple(u), white(w), yellow(y)

population: abundant(a), clustered(c), numerous(n), scattered(s), several(v), solitary(y)

habitat: grasses(g), leaves(l), meadows(m), paths(p), urban(u), waste(w), woods(d)

In [9]:
# https://www.kaggle.com/uciml/mushroom-classification
shrooms = pd.read_csv('Data/mushrooms.csv')
shrooms.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [10]:
shrooms_df_X = shrooms.drop(columns=['class'])
shrooms_df_y = shrooms['class']

In [11]:
shrooms_X = pd.get_dummies(data=shrooms_df_X)
shrooms_y = shrooms_df_y.replace({'e': 0, 'p': 1})

In [12]:
# 51.8% negative instances, 48.2% positive instances
shrooms_y.value_counts(normalize = True)

0    0.517971
1    0.482029
Name: class, dtype: float64

### Clean Cardio Dataset

Retrieved from the kaggle site https://www.kaggle.com/sulianova/cardiovascular-disease-dataset, this cardio dataset has 70000 samples and 12 variables, which were collected at the moment of medical examination. It contains a target variable that indicates the presence or absence of cardiovascular disease, as well as 11 features that might be associated with the presence of cardiovascular disease, such as age, gender, and blood pressure. There are 3 types of 11 input features:
- objective feature: factual information
- examination feature: results of medical examination
- subjective feature: information given by the patient

A more detailed description of 11 features are shown below:

- age: objective feature, int (days)
- height: objective feature, int (cm)
- weight: objective feature, float (kg)
- gender: objective feature, categorical code, 1: male, 2:female
- ap_hi: systolic blood pressure, examination feature, int
- ap_lo: diastolic blood pressure, examination feature, int
- cholesterol: examination feature, categorical code, 1: normal, 2: above normal, 3: well above normal
- gluc: glucose, examination feature, categorical code, 1: normal, 2: above normal, 3: well above normal
- smoke: subjective feature, binary, 0: do not smoke, 1: smoke
- alco: alcohol intake, subjective feature, binary, 0: do not drink alcohol, 1: drink alcohol
- active: physical activity, subjective feature, binary, 0: not physically active, 1: physically active

A detailed description of the target variable is shown below: 

- cardio: presence or absence of cardiovascular disease, binary, 0: disease not present, 1: disease present

For this dataset, we want use those 11 input features and apply machine learning algorithms to predict whether a person has cardiovascular disease or not.

In [13]:
# load the cardio dataset
cardio = pd.read_csv('data/cardio.csv', delimiter = ';')
cardio.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [14]:
# no missing values in cardio dataset
cardio.isnull().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [15]:
# drop unnecessary column "id"
cardio = cardio.drop(columns = ['id'])
# convert age in days to age in years
cardio['age'] = cardio['age'].apply(lambda x: int(x/365))

In [16]:
# one hot encoding categorical input features stored in cate_cols
cardio_cate_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
cardio = pd.get_dummies(columns = cardio_cate_cols, data = cardio)

In [17]:
# scale numerical attributes to 0 mean 1 std
cardio_num_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
for num_col in cardio_num_cols:
    cardio[num_col] = (cardio[num_col] - cardio[num_col].mean()) / cardio[num_col].std()

In [18]:
# 50.03% negative labels, 49.97% positive labels
cardio['cardio'].value_counts(normalize = True)

0    0.5003
1    0.4997
Name: cardio, dtype: float64

In [19]:
# split the cardio dataset into input features and labels 
cardio_X = cardio.drop(columns=['cardio']) # input features
cardio_y = cardio['cardio'] # true lables

### Clean Rain Dataset

Retrieved from the kaggle site https://www.kaggle.com/jsphyg/weather-dataset-rattle-package, this Rain in Australia dataset contains about 10 years of daily weather observations from many locations across Australia. There are 145460 samples and 23 variables in this dataset. It contains a target variable that indicates whether it rained the next day, as well as 22 features that might be associated with the target variable, such as minimum temperature, maximum temperature, rainfall of the day.

A more detailed description of 22 features are shown below:

- Date: the date of observation
- Location: the common name of the location of the weather station
- MinTemp: the minimum temperature in degrees celsius
- MaxTemp: the maximum temperature in degrees celsius
- Rainfall: the amount of rainfall recorded for the day in mm
- Evaporation: the so-called Class A pan evaporation (mm) in the 24 hours to 9am
- Sunshine: the number of hours of bright sunshine in the day
- WindGustDir: the direction of the strongest wind gust in the 24 hours to midnight
- WindGustSpeed: the speed (km/h) of the strongest wind gust in the 24 hours to midnight
- WindDir9am: direction of the wind at 9am
- WindDir3pm: direction of the wind at 3pm
- WindSpeed9am: wind speed (km/hr) averaged over 10 minutes prior to 9am
- WindSpeed3pm: wind speed (km/hr) averaged over 10 minutes prior to 3pm
- Humidity9am: humidity (percent) at 9am
- Humidity3pm: humidity (percent) at 3pm
- Pressure9am: atmospheric pressure (hpa) reduced to mean sea level at 9am
- Pressure3pm: atmospheric pressure (hpa) reduced to mean sea level at 3pm
- Cloud9am: fraction of sky obscured by cloud at 9am. This is measured in "oktas", which are a unit of eigths. It records how many eigths of the sky are obscured by cloud. A 0 measure indicates completely clear sky whilst an 8 indicates that it is completely overcast
- Cloud3pm: fraction of sky obscured by cloud (in "oktas": eighths) at 3pm. See Cload9am for a description of the values
- Temp9am: temperature (degrees C) at 9am
- Temp3pm: temperature (degrees C) at 3pm
- RainToday: whether the precipitation (mm) in the 24 hours to 9am exceeded 1mm, Yes: the precipitation exceeded 1mm, No: it did not exceed 1mm

A detailed description of the target variable is shown below: 

- RainTomorrow: whether amount of next day rain exceeded 1mm, Yes: next day precipitation exceeded 1mm, No: it did not exceed 1mm

For this dataset, we want use those 22 input features and apply machine learning algorithms to predict whether it rained the next day or not.

Data source: http://www.bom.gov.au/climate/dwo/ and http://www.bom.gov.au/climate/data.

In [20]:
# load Australian rain dataset
aus = pd.read_csv('Data/weatherAUS.csv')
aus.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [21]:
# display the number of missing values in each column
aus.isnull().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [22]:
# fill missing values in categorical columns with the mode 
aus_cate_cols = aus.dtypes.index[aus.dtypes == "object"].tolist()
for cate_col in aus_cate_cols:
    aus[cate_col] = aus[cate_col].fillna(aus[cate_col].mode()[0])

In [23]:
# fill missing values in numerical columns with the mean
aus_num_cols = aus.dtypes.index[aus.dtypes == "float64"].tolist()
for num_col in aus_num_cols:
    aus[num_col] = aus[num_col].fillna(aus[num_col].mean())
    # scale numerical attributes to 0 mean 1 std
    aus[num_col] = (aus[num_col] - aus[num_col].mean()) / aus[num_col].std()

In [24]:
# all missing values are filled
aus.isnull().sum()

Date             0
Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

In [25]:
# split the date of each observation into year, month, and day
splitted_date = aus['Date'].str.split('-')

# create 'Year', 'Month', 'Day' columns using splitted results of the date
aus['Year'] = splitted_date.str[0].astype(int)
aus['Month'] = splitted_date.str[1].astype(int)
aus['Day'] = splitted_date.str[2].astype(int)

# drop original 'Date' column
aus = aus.drop(columns = ['Date'])

In [26]:
# use 0 and 1 to indicate whether it rained or not
# 0: it rained, 1: it did not rain
aus['RainToday'] = aus['RainToday'].replace({'No': 0, 'Yes': 1})
aus['RainTomorrow'] = aus['RainTomorrow'].replace({'No': 0, 'Yes': 1})

In [27]:
# one hot encoding all categorical columns
cate_cols = aus.dtypes.index[aus.dtypes == "object"].tolist()
aus = pd.get_dummies(columns = cate_cols, data = aus)

In [28]:
# 78.0854% negative labels, 21.9146% positive labels
aus['RainTomorrow'].value_counts(normalize = True)

0    0.780854
1    0.219146
Name: RainTomorrow, dtype: float64

In [29]:
# split the rain dataset into input features and labels 
aus_X = aus.drop(columns=['RainTomorrow']) # input features
aus_y = aus['RainTomorrow'] # true lables

### Clean BnB Dataset

The dataset is from https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data?select=AB_NYC_2019.csv, and contains data from AirBnB listing and metrics in New York City, New York for the year 2019. There are 47900 unique values and 8 predictive variables in this dataset, in which we will be using different features to predict whether an AirBnB pricing is expensive or not. This makes the price variable the target variable of our data. 

A more detailed description of 7 features are shown below:

- neighbourhood_group: location, str
- latitude: latitude coordinates, int
- longitude: longitude coordinates, int
- room_type: listing space type, str
- minimum_nights: amount of nights minimum, int
- number_of_reviews: number of reviews, int
- availability_365: number of days when listing is available for booking, int

A detailed description of the target variable is shown below:

- price: price in USD, int

For this dataset, we want use these 7 input features and apply machine learning algorithms to predict whether the AirBnb price is expensive or not.

In [30]:
# load the AirBnb dataset
airbnb = pd.read_csv('data/AB_NYC_2019.csv')
airbnb.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [31]:
# We will be looking at these features of the AirBnB for our predictive model
airbnb = airbnb[['neighbourhood_group','latitude','longitude','room_type', 'price',
                       'minimum_nights','number_of_reviews','availability_365']]

In [32]:
# The price median will be the threshold for our expensive classifier
airbnb['price'].median()

106.0

In [33]:
# Creating a new variable called 'is_expensive' that labels prices greater than the median as True, 
# lesser than the median as False
airbnb = airbnb.assign(
    is_expensive = airbnb.get('price') > 106.0
)
airbnb.head()

Unnamed: 0,neighbourhood_group,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,availability_365,is_expensive
0,Brooklyn,40.64749,-73.97237,Private room,149,1,9,365,True
1,Manhattan,40.75362,-73.98377,Entire home/apt,225,1,45,355,True
2,Manhattan,40.80902,-73.9419,Private room,150,3,0,365,True
3,Brooklyn,40.68514,-73.95976,Entire home/apt,89,1,270,194,False
4,Manhattan,40.79851,-73.94399,Entire home/apt,80,10,9,0,False


In [34]:
# Drop the price column
airbnb = airbnb.drop(columns = ['price'])

In [35]:
# Scale numerical attributes to 0 mean 1 std
airbnb_num_cols = ['latitude', 'longitude', 'minimum_nights', 'number_of_reviews','availability_365']
for num_col in airbnb_num_cols:
    airbnb[num_col] = (airbnb[num_col] - airbnb[num_col].mean()) / airbnb[num_col].std()

In [36]:
# 50.05% negative labels, 49.95% positive labels
airbnb['is_expensive'].value_counts(normalize = True)

False    0.500501
True     0.499499
Name: is_expensive, dtype: float64

In [37]:
# split data into input features and labels
airbnb_df_X = airbnb.drop(columns=['is_expensive'])
airbnb_df_y = airbnb['is_expensive']

In [38]:
# one hot encoding for the categorical columns
airbnb_X_cat_col = ['neighbourhood_group', 'room_type']
airbnb_X = pd.get_dummies(columns=airbnb_X_cat_col, data=airbnb_df_X)

airbnb_y = airbnb_df_y.replace({True: 1, False: 0})

### Clean Olympic Dataset

The dataset is from https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results?select=athlete_events.csv, and includes historic data of all participants from the Olympic Games, from Athens 1896 to Rio 2016. There are 271116 unique values/observations. We want to see whether or not we can predict a gold medalist from all the medalists just by using participants' features. We use the 'Medal' column as our target variable for our dataset.

A more detailed description of 6 features are shown below:

- Sex: M or F, str 
- Age: age, int
- Height: in centimeters, int
- Weight: in kilgrams, int
- NOC: National Olympic Committee 3-letter code, str
- Sport: sport, str

A detailed description of the target variable is shown below:

- Medal: Gold, Silver, Bronze, or NA, str

For this dataset, we want use these 6 input features and apply machine learning algorithms to predict whether an Olympian is a gold medalist or not.

In [39]:
# load the Olympic dataset
olympic = pd.read_csv('data/athlete_events.csv')
olympic.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [40]:
# We only care about the medalists out of all Olympians, so we want to drop the NaN values in 'Medal'
olympic[olympic['Medal'].isnull()].shape

(231333, 15)

In [41]:
# drop null medal values from dataset
olympic = olympic[olympic['Medal'].notnull()]
olympic.shape

(39783, 15)

In [42]:
# These are the features we will be looking at for our classifier
olympic = olympic[['Sex', 'Age','Height','Weight','NOC','Sport','Medal']]

In [43]:
# This removes the NaN values from the numerical columns
olympic = olympic[olympic['Height'].notna()]
olympic = olympic[olympic['Weight'].notna()]
olympic = olympic[olympic['Age'].notna()]
olympic.head()

Unnamed: 0,Sex,Age,Height,Weight,NOC,Sport,Medal
40,M,28.0,184.0,85.0,FIN,Ice Hockey,Bronze
41,M,28.0,175.0,64.0,FIN,Gymnastics,Bronze
42,M,28.0,175.0,64.0,FIN,Gymnastics,Gold
44,M,28.0,175.0,64.0,FIN,Gymnastics,Gold
48,M,28.0,175.0,64.0,FIN,Gymnastics,Gold


In [44]:
# After that cleaning, we are left with 30181 participants to work with
olympic.shape

(30181, 7)

In [45]:
# We want to make a column to see if the participant is a Gold Medalist
olympic['Gold'] = olympic['Medal']=='Gold'

In [46]:
# We no longer need the Medal column
olympic = olympic.drop(columns=['Medal'])

In [47]:
# Scale numerical attributes to 0 mean 1 std
olympic_num_cols = ['Age', 'Height', 'Weight']
for num_col in olympic_num_cols:
    olympic[num_col] = (olympic[num_col] - olympic[num_col].mean()) / olympic[num_col].std()

In [48]:
# 66.31% negative labels, 33.68% positive labels
olympic['Gold'].value_counts(normalize = True)

False    0.663132
True     0.336868
Name: Gold, dtype: float64

In [49]:
# split the olympic dataset into input features and labels
olympic_df_X = olympic.drop(columns=['Gold'])
olympic_df_y = olympic['Gold']

In [50]:
# one hot encode all the categorical columns
olympic_X_cat_col = ['Sex','NOC','Sport']
olympic_X = pd.get_dummies(columns=olympic_X_cat_col, data=olympic_df_X)
olympic_y = olympic_df_y.replace({True: 1, False: 0})

### Perform Trials

In [51]:
# Parameters for the model
tree_params = [
    {
        'max_depth': [2,3,4,5,7,10,13,15,18,None], 
        'min_samples_split':[2,3,5,7,10,15,20],
        'min_samples_leaf':[2,3,5,7,10,15,20]
    }
]

log_reg_params = [        
    {
        'solver': ['lbfgs'],
        'max_iter': [5000],
        'penalty': ['l2'],
        'C': 10**np.arange(-4, 5, 1, dtype='float32')
    },
    {
        'solver': ['saga'],
        'max_iter': [5000],
        'penalty': ['l1', 'l2'],
        'C': 10**np.arange(-4, 5, 1, dtype='float32')
    },
    {
        'solver': ['saga', 'lbfgs'],
        'max_iter': [5000],
        'penalty': ['none']
    }
]

perceptron_params = [
    {
        'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1]
    }
]

svc_params = [
    {
        'kernel': ['linear'],
        'C': 10 **np.array(np.arange(-3, 2, 2), dtype='float32')
    },
    {
        'kernel': ['poly'],
        'degree': [2, 3],
        'C': 10 **np.array(np.arange(-3, 2, 2), dtype='float32'),
    },
    {
        'kernel': ['rbf'],
        'C': 10 **np.array(np.arange(-3, 2, 2), dtype='float32'),
        'gamma': [0.001,0.01,0.1,1,2]
    }
]

knn_params = [
    {
        'n_neighbors': np.arange(1, 106, 4),
        'metric': ["euclidean", "manhattan", "minkowski"]
    }
]

forest_params = [
    {
        'n_estimators': [1024],
        'min_samples_split': [1, 2, 4, 6, 8, 12, 16, 20]
    }
]

# models that do not include SVM classifier
models_without_svm = {
    'DT': (DecisionTreeClassifier(), tree_params),
    'LOGREG': (LogisticRegression(), log_reg_params),
    'PERC': (Perceptron(), perceptron_params),
    'KNN': (KNeighborsClassifier(), knn_params),
    'RF': (RandomForestClassifier(), forest_params)
}

# SVM model
models_only_svm = {
    'SVM': (SVC(), svc_params)
}

In [52]:
models = np.sort(list(models_without_svm.keys()) + list(models_only_svm.keys())).tolist()

scoring = {
        'ACC': make_scorer(accuracy_score),
        'PREC': make_scorer(precision_score),
        'REC': make_scorer(recall_score),
        'SPEC': make_scorer(recall_score, pos_label=0),
        'F1': make_scorer(f1_score),
        'ROC': make_scorer(roc_auc_score),
    }

results_columns = ['DATASET', 'MODEL', 'TRIAL'] + ["TRAIN_" + x for x 
                                                   in list(scoring.keys())] + ["TEST_" + x for x in list(scoring.keys())]

In [52]:
# perform 7 trials using each of 6 algorithms on one dataset
def perform_trials(dataset_name, models, data_X, data_y):
    
    num_trials = 7
    
    data_results = pd.DataFrame(columns=results_columns)

    for model_name in models.keys():
        model = models[model_name][0]        
        model_params_grid = models[model_name][1]
        model_results = pd.DataFrame(columns=results_columns)
        
        # perform 7 trials using each model on the dataset
        for trial_count in range(num_trials):
            # pick 5000 samples with replacement to be in the training set
            X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, 
                                                                train_size=5000, 
                                                                random_state=trial_count)
            
            # grid search with 5 k-folds
            search = GridSearchCV(model, model_params_grid, cv=5, verbose=3,
                                  n_jobs=-1, refit=False, scoring=scoring)
            
            # fit grid search model with training set
            search.fit(X_train, y_train)
            
            # store 7 metrics calculated in one trial
            model_result = {
                'DATASET': dataset_name,
                'MODEL': model_name,
                'TRIAL': trial_count + 1
            }
            
            for score_name in scoring.keys():
                # find the best parameters that make model achieves best score of the metric
                best_params = search.cv_results_['params'][np.argmin(search.cv_results_['rank_test_' + score_name])]
                # use best parameters to create the optimal model for the metric
                best_model = clone(model).set_params(**best_params)
                # train the optimal model
                best_model.fit(X_train, y_train)
                
                # compute metrics
                train_score = scoring[score_name](best_model, X_train, y_train)
                test_score = scoring[score_name](best_model, X_test, y_test)

                # append scores
                model_result['TRAIN_' + score_name] = train_score
                model_result['TEST_' + score_name] = test_score
            
            # append scores of one trial to the model_results dataframe
            model_results = model_results.append(model_result, ignore_index=True)
        
        # append model_results to data_results
        data_results = data_results.append(model_results, ignore_index=True)
        
        # store scores averaged over 7 trials
        avg_result = {
            'DATASET': dataset_name,
            'MODEL': model_name,
            'TRIAL': 'avg',
            
            'TRAIN_ACC': model_results.train_accuracy.mean(),
            'TRAIN_PREC': model_results.train_precision.mean(),
            'TRAIN_REC': model_results.train_recall.mean(),
            'TRAIN_SPEC': model_results.train_specificity.mean(),
            'TRAIN_F1': model_results.train_f1.mean(),
            'TRAIN_ROC': model_results.train_roc_auc.mean(),
            
            'TEST_ACC': model_results.test_accuracy.mean(),
            'TEST_PREC': model_results.test_precision.mean(),
            'TEST_REC': model_results.test_recall.mean(),
            'TEST_SPEC': model_results.test_specificity.mean(),
            'TEST_F1': model_results.test_f1.mean(),
            'TEST_ROC': model_results.test_roc_auc.mean()
        }
        
        # append avg_result to the data_results dataframe
        data_results = data_results.append(avg_result, ignore_index=True)
    
    return data_results

### Chess Results

In [None]:
chess_results_no_svm = perform_trials('chess', models_without_svm, chess_X, chess_y)
chess_results_no_svm.to_csv('results/chess_no_svm.csv', index = False)
chess_results_no_svm

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   46.1s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.2min finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   32.1s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:   54.1s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.0min finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.0min finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:   27.9s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:   55.6s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.1min finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 976 tasks      | elapsed:   20.6s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-1)]: Done 2256 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.2min finished


Fitting 5 folds for each of 29 candidates, totalling 145 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.9s


In [None]:
chess_results_svm = perform_trials('chess', models_only_svm, chess_X, chess_y)
chess_results_svm.to_csv('results/chess_svm.csv', index = False)
chess_results_svm

In [78]:
chess_results_no_svm = pd.read_csv('results/chess_no_svm.csv')
chess_results_svm = pd.read_csv('results/chess_svm.csv')

In [79]:
chess_results = chess_results_no_svm.append(chess_results_svm, ignore_index=True)
chess_results.to_csv('results/chess.csv', index = False)

### Shrooms Results

In [59]:
shrooms_results_no_svm = perform_trials('shrooms', models_without_svm, shrooms_X, shrooms_y)
shrooms_results_no_svm.to_csv('results/shrooms_no_svm.csv', index = False)
shrooms_results_no_svm

Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 368 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 1008 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 1904 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   16.6s finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 368 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 1008 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 1904 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   16.5s finished


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_roc_auc,train_log_loss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_roc_auc,test_log_loss
0,shrooms,tree,1,1.0,1.0,1.0,1.0,1.0,1.0,1.554272,1.0,1.0,1.0,1.0,1.0,1.0,1.625254
1,shrooms,tree,2,0.9994,1.0,0.99875,1.0,0.999375,0.999375,1.574997,0.998399,1.0,0.996702,1.0,0.998348,0.998351,1.592083
2,shrooms,tree,avg,0.9997,1.0,0.999375,1.0,0.999687,0.999688,1.564635,0.9992,1.0,0.998351,1.0,0.999174,0.999175,1.608669


In [None]:
shrooms_results_svm = perform_trials('shrooms', models_only_svm, shrooms_X, shrooms_y)
shrooms_results_svm.to_csv('results/shrooms_svm.csv', index = False)
shrooms_results_svm

In [81]:
shrooms_results_no_svm = pd.read_csv('results/shrooms_no_svm.csv')
shrooms_results_svm = pd.read_csv('results/shrooms_svm.csv')

In [82]:
shrooms_results = shrooms_results_no_svm.append(shrooms_results_svm, ignore_index=True)
shrooms_results.to_csv('results/shrooms.csv', index = False)

### Results of Cardio Dataset

In [51]:
# running algorithms except SVM on cardio dataset
cardio_results_no_svm = perform_trials('cardio', models_without_svm, cardio_X, cardio_y)
cardio_results_no_svm

Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100

Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_roc_auc,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_roc_auc
0,cardio,tree,1,0.7272,0.76961,0.657407,0.814388,0.7054,0.726756,0.718092,0.760048,0.651762,0.803292,0.698032,0.718082
1,cardio,tree,2,0.7308,0.825455,0.771586,0.884584,0.736182,0.730678,0.716569,0.823521,0.757578,0.888329,0.717287,0.716578
2,cardio,tree,3,0.7276,0.77355,0.721763,0.817405,0.738914,0.728793,0.723985,0.753861,0.691874,0.803083,0.711474,0.72387
3,cardio,tree,4,0.7372,0.787893,0.746233,0.823245,0.724413,0.737665,0.724569,0.761494,0.689312,0.797068,0.713895,0.724521
4,cardio,tree,5,0.735,0.797519,0.734182,0.807801,0.743664,0.735456,0.726877,0.775533,0.706,0.803805,0.711856,0.726788
5,cardio,tree,6,0.7348,0.747525,0.732354,0.799922,0.720253,0.734063,0.731446,0.761724,0.67302,0.804416,0.720963,0.731479
6,cardio,tree,7,0.7414,0.784314,0.714229,0.819456,0.748012,0.741457,0.722631,0.772991,0.659143,0.811395,0.695081,0.722577
7,cardio,tree,avg,0.733429,0.783695,0.725393,0.823829,0.730977,0.733553,0.723453,0.772739,0.689813,0.815913,0.709798,0.723414
8,cardio,log_reg,1,0.7292,0.760739,0.0,1.0,0.708817,0.728782,0.726646,0.757817,0.0,1.0,0.708874,0.726637
9,cardio,log_reg,2,0.652,0.661729,1.0,0.658646,0.688273,0.652064,0.647785,0.649235,1.0,0.661212,0.681556,0.647767


In [52]:
# running SVM algorithm on cardio dataset, generally take longer time to run than other algorithms combined
cardio_results_svm = perform_trials('cardio', models_only_svm, cardio_X, cardio_y)
cardio_results_svm

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_roc_auc,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_roc_auc
0,cardio,svm,1,0.7372,0.805755,0.814412,1.0,0.719471,0.736826,0.7276,0.808387,0.800369,1.0,0.710771,0.727591
1,cardio,svm,2,0.7332,0.748567,1.0,0.760604,0.732665,0.733509,0.726415,0.733427,0.999784,0.757929,0.7213,0.72638
2,cardio,svm,3,0.737,0.785714,1.0,0.824319,0.726668,0.737818,0.727262,0.773072,0.99963,0.813709,0.716606,0.727192
3,cardio,svm,4,0.735,0.777726,1.0,0.80226,0.741597,0.735625,0.727154,0.752055,0.999969,0.786744,0.710793,0.727087
4,cardio,svm,5,0.7344,0.770219,1.0,0.80579,0.725506,0.734588,0.725523,0.765986,1.0,0.802022,0.715822,0.725488
5,cardio,svm,6,0.7284,0.776371,0.821297,1.0,0.704654,0.727103,0.727862,0.784359,0.813484,1.0,0.709144,0.727918
6,cardio,svm,7,0.7386,0.903614,1.0,0.996797,0.724668,0.738641,0.728046,0.831545,1.0,0.995173,0.712434,0.728008
7,cardio,svm,avg,0.734829,0.795424,0.947959,0.884253,0.725033,0.734873,0.727123,0.778404,0.944748,0.879368,0.713838,0.727095


In [53]:
# combine results of svm and non-svm algorithms and save as a csv file
cardio_final_results = cardio_results_no_svm.append(cardio_results_svm, ignore_index=True)
cardio_final_results.to_csv('results/cardio_results.csv', index = False)

### Results of Australian Rain Dataset

In [55]:
# running algorithms except SVM on Australian rain dataset
aus_results_no_svm = perform_trials('aus', models_without_svm, aus_X, aus_y)
aus_results_no_svm

Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 29 candidates, totalling 145 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100

Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_roc_auc,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_roc_auc
0,aus,tree,1,0.8492,0.8,0.62841,0.98222,0.577857,0.770974,0.83213,0.793374,0.449633,0.981741,0.514832,0.681811
1,aus,tree,2,0.855,0.777567,0.853636,0.97,0.598783,0.724627,0.829432,0.734551,0.492186,0.965026,0.526989,0.687517
2,aus,tree,3,0.8428,0.784884,0.85316,0.981142,0.559332,0.710808,0.829012,0.780864,0.476153,0.978989,0.514916,0.692722
3,aus,tree,4,0.8538,0.823529,0.729849,0.9845,0.62339,0.741199,0.828421,0.801478,0.453656,0.98384,0.539103,0.694818
4,aus,tree,5,0.8622,0.838608,0.867384,0.986869,0.699805,0.793621,0.82998,0.801005,0.497741,0.983756,0.542231,0.700476
5,aus,tree,6,0.8568,0.772182,0.716822,0.975827,0.656448,0.764614,0.823978,0.76804,0.461583,0.976079,0.545628,0.699465
6,aus,tree,7,0.8388,0.775701,0.702997,0.969223,0.696734,0.786406,0.833198,0.73595,0.476605,0.96252,0.515878,0.686018
7,aus,tree,avg,0.851229,0.796067,0.764608,0.97854,0.630336,0.756036,0.82945,0.773609,0.472508,0.975993,0.528511,0.691832
8,aus,log_reg,1,0.8536,0.788591,0.493885,1.0,0.592217,0.723448,0.844789,0.822835,0.466541,0.999991,0.568289,0.708628
9,aus,log_reg,2,0.8546,0.852665,0.532727,1.0,0.614256,0.737902,0.842987,0.798091,0.504143,0.999991,0.584528,0.721091


In [56]:
# running SVM algorithm on Australian rain dataset, generally take longer time to run than other algorithms combined
aus_results_svm = perform_trials('aus', models_only_svm, aus_X, aus_y)
aus_results_svm

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_roc_auc,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_roc_auc
0,aus,svm,1,0.8576,0.808743,0.484478,1.0,0.588915,0.719694,0.845016,0.842434,0.458103,1.0,0.552106,0.697773
1,aus,svm,2,0.859,0.942529,0.7,1.0,0.619383,0.737564,0.84251,0.879286,0.472528,1.0,0.574432,0.713079
2,aus,svm,3,0.8534,0.861272,0.47119,1.0,0.572881,0.711767,0.843343,0.860588,0.46755,1.0,0.561399,0.705952
3,aus,svm,4,0.8562,0.895161,0.719221,1.0,0.811189,0.851731,0.843429,0.86999,0.478763,1.0,0.561484,0.70763
4,aus,svm,5,0.8552,0.886667,0.511649,1.0,0.602321,0.728919,0.844867,0.854671,0.480836,1.0,0.569531,0.711297
5,aus,svm,6,0.856,0.946667,0.490654,1.0,0.590551,0.722045,0.84518,0.902718,0.46061,1.0,0.55775,0.703466
6,aus,svm,7,0.8472,0.85446,0.506812,1.0,0.588918,0.723142,0.844724,0.844099,0.493956,1.0,0.574148,0.715173
7,aus,svm,avg,0.854943,0.885071,0.554858,1.0,0.62488,0.742123,0.844153,0.864827,0.473192,1.0,0.564407,0.707767


In [57]:
# combine results of svm and non-svm algorithms and save as a csv file
aus_final_results = aus_results_no_svm.append(aus_results_svm, ignore_index=True)
aus_final_results.to_csv('results/aus_results.csv', index = False)

### Results of AirBnB Price Dataset

In [22]:
# running algorithms except SVM on airbnb dataset
airbnb_results_no_svm = perform_trials('airbnb', models_without_svm, airbnb_X, airbnb_y)
airbnb_results_no_svm

Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 656 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 1936 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   15.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1528 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:    9.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1528 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   10.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1528 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   10.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 824 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 2104 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:   13.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 253 out of 260 | elapsed:    9.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:    9.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 253 out of 260 | elapsed:    8.8s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:    8.9s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 253 out of 260 | elapsed:    8.7s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:    9.1s finished
  "Setting penalty='none' will ignore the C and l1_ratio "


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:    9.8s finished
  "Setting penalty='none' will ignore the C and l1_ratio "


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 253 out of 260 | elapsed:    9.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:    9.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 398 out of 405 | elapsed:   14.7s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   14.9s finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 398 out of 405 | elapsed:   14.5s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   14.8s finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 398 out of 405 | elapsed:   14.7s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   15.0s finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   15.5s finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 398 out of 405 | elapsed:   15.0s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   15.3s finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   56.0s finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   31.6s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   55.0s finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   55.6s finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   55.5s finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   55.5s finished


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,airbnb,tree,1,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
1,airbnb,tree,2,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
2,airbnb,tree,3,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
3,airbnb,tree,4,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
4,airbnb,tree,5,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
5,airbnb,tree,avg,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16
6,airbnb,log_reg,1,0.9878,1.0,0.975373,1.0,0.987533,0.987687,0.4213731,0.98804,1.0,0.976078,1.0,0.987894,0.988039,0.4130962
7,airbnb,log_reg,2,0.9906,1.0,0.981033,1.0,0.990426,0.990517,0.3246645,0.987721,0.99958,0.975849,0.99959,0.987572,0.987719,0.4241123
8,airbnb,log_reg,3,0.9876,1.0,0.974684,1.0,0.987179,0.987342,0.4282808,0.989065,1.0,0.978156,1.0,0.988957,0.989078,0.377688
9,airbnb,log_reg,4,0.9884,1.0,0.976585,1.0,0.988154,0.988292,0.4006498,0.989725,1.0,0.97945,1.0,0.989618,0.989725,0.3548693


In [23]:
# running SVM algorithm on AirBnB dataset, generally take longer time to run than other algorithms combined
airbnb_results_svm = perform_trials('airbnb', models_only_svm, airbnb_X, airbnb_y)
airbnb_results_svm

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  39 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   27.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   25.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   26.3s finished


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   25.7s finished


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   26.2s finished


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,airbnb,svm,1,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,0.999795,0.999727,0.999863,0.999727,0.999795,0.999795,0.007082
1,airbnb,svm,2,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,0.999704,0.999408,1.0,0.999408,0.999704,0.999704,0.010229
2,airbnb,svm,3,0.9994,0.998777,1.0,0.998824,0.999388,0.999412,0.02072375,0.999271,0.998546,1.0,0.99854,0.999272,0.99927,0.02518
3,airbnb,svm,4,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,0.999863,0.999863,0.999863,0.999863,0.999863,0.999863,0.004721
4,airbnb,svm,5,1.0,1.0,1.0,1.0,1.0,1.0,9.992007e-16,0.999954,1.0,0.999909,1.0,0.999954,0.999954,0.001574
5,airbnb,svm,avg,0.99988,0.999755,1.0,0.999765,0.999878,0.999882,0.004144749,0.999718,0.999509,0.999927,0.999508,0.999718,0.999717,0.009757


In [24]:
airbnb_final_results = airbnb_results_no_svm.append(airbnb_results_svm, ignore_index=True)
airbnb_final_results.to_csv('results/airbnb_results.csv', index = False)

### Results of Olympic Gold Medal Dataset


In [26]:
# running algorithms except SVM on Olympic dataset
olympic_results_no_svm = perform_trials('olympic', models_without_svm, olympic_X, olympic_y)
olympic_results_no_svm

Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 440 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 1080 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 1892 tasks      | elapsed:   45.4s
[Parallel(n_jobs=-1)]: Done 2180 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.4min finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 552 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 1000 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:   36.2s
[Parallel(n_jobs=-1)]: Done 2280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.2min finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 552 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 1000 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done 2280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.3min finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 552 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 1000 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:   40.2s
[Parallel(n_jobs=-1)]: Done 2280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.3min finished


Fitting 5 folds for each of 490 candidates, totalling 2450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 552 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 1000 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done 2280 tasks      | elapsed:   58.8s
[Parallel(n_jobs=-1)]: Done 2450 out of 2450 | elapsed:  1.1min finished


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:   41.7s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:  1.8min finished


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:   39.8s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:  1.7min finished


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:   39.9s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:  1.7min finished


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:   40.1s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:  1.7min finished


Fitting 5 folds for each of 52 candidates, totalling 260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:   40.1s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:  1.7min finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  93 out of 100 | elapsed:    2.9s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.0s finished
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  93 out of 100 | elapsed:    2.9s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.0s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  93 out of 100 | elapsed:    2.9s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.0s finished
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  93 out of 100 | elapsed:    2.7s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.8s finished


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  93 out of 100 | elapsed:    2.9s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.0s finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  1.5min finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  1.6min finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  1.6min finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  1.5min finished


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  1.5min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.6min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.6min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.7min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.6min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.6min finished


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,olympic,tree,1,0.9522,0.954655,0.997053,0.096386,0.975394,0.546719,1.650989,0.949827,0.953788,0.995456,0.069974,0.974177,0.532715,1.732942
1,olympic,tree,2,0.9568,0.958232,0.998108,0.144628,0.977764,0.571368,1.492108,0.950081,0.95434,0.9951,0.08262,0.974294,0.53886,1.724185
2,olympic,tree,3,0.954,0.954911,0.998737,0.104,0.976333,0.551368,1.58882,0.951,0.953734,0.996816,0.06746,0.974799,0.532138,1.692422
3,olympic,tree,4,0.95,0.952678,0.996839,0.074803,0.974259,0.535821,1.726976,0.948853,0.952527,0.995833,0.042469,0.973699,0.519151,1.766595
4,olympic,tree,5,0.9604,0.961701,0.998326,0.140271,0.979671,0.569299,1.367766,0.950568,0.953834,0.996214,0.072994,0.974563,0.534604,1.707359
5,olympic,tree,avg,0.95468,0.956436,0.997813,0.112018,0.976684,0.554915,1.565332,0.950066,0.953645,0.995884,0.067103,0.974306,0.531494,1.724701
6,olympic,log_reg,1,0.9502,0.9502,1.0,0.0,0.974464,0.5,1.720071,0.950697,0.950697,1.0,0.0,0.974726,0.5,1.702898
7,olympic,log_reg,2,0.9516,0.9516,1.0,0.0,0.9752,0.5,1.671715,0.950662,0.950662,1.0,0.0,0.974707,0.5,1.7041
8,olympic,log_reg,3,0.95,0.95,1.0,0.0,0.974359,0.5,1.726979,0.950702,0.950702,1.0,0.0,0.974728,0.5,1.702726
9,olympic,log_reg,4,0.9492,0.9492,1.0,0.0,0.973938,0.5,1.75461,0.950722,0.950722,1.0,0.0,0.974739,0.5,1.70204


In [27]:
# running SVM algorithm on the Olympic dataset, generally take longer time to run than other algorithms combined
olympic_results_svm = perform_trials('olympic', models_only_svm, olympic_X, olympic_y)
olympic_results_svm

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  2.2min finished


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  2.1min finished


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  2.1min finished


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  2.1min finished


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  2.0min finished


Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,olympic,svm,1,0.9502,0.9502,1.0,0.0,0.974464,0.5,1.720071,0.950697,0.950697,1.0,0.0,0.974726,0.5,1.702898
1,olympic,svm,2,0.9516,0.9516,1.0,0.0,0.9752,0.5,1.671715,0.950662,0.950662,1.0,0.0,0.974707,0.5,1.7041
2,olympic,svm,3,0.95,0.95,1.0,0.0,0.974359,0.5,1.726979,0.950702,0.950702,1.0,0.0,0.974728,0.5,1.702726
3,olympic,svm,4,0.9492,0.9492,1.0,0.0,0.973938,0.5,1.75461,0.950722,0.950722,1.0,0.0,0.974739,0.5,1.70204
4,olympic,svm,5,0.9558,0.9558,1.0,0.0,0.977401,0.5,1.526649,0.950558,0.950558,1.0,0.0,0.974652,0.5,1.707706
5,olympic,svm,avg,0.95136,0.95136,1.0,0.0,0.975072,0.5,1.680005,0.950668,0.950668,1.0,0.0,0.97471,0.5,1.703894


In [28]:
olympic_final_results = olympic_results_no_svm.append(olympic_results_svm, ignore_index=True)
olympic_final_results.to_csv('results/olympic_results.csv', index = False)

In [29]:
# display performance
pd.read_csv('results/olympic_results.csv')

Unnamed: 0,dataset,model,trial,train_accuracy,train_precision,train_recall,train_specificity,train_f1,train_auc,train_logloss,test_accuracy,test_precision,test_recall,test_specificity,test_f1,test_auc,test_logloss
0,olympic,tree,1,0.9522,0.954655,0.997053,0.096386,0.975394,0.546719,1.650989,0.949827,0.953788,0.995456,0.069974,0.974177,0.532715,1.732942
1,olympic,tree,2,0.9568,0.958232,0.998108,0.144628,0.977764,0.571368,1.492108,0.950081,0.95434,0.9951,0.08262,0.974294,0.53886,1.724185
2,olympic,tree,3,0.954,0.954911,0.998737,0.104,0.976333,0.551368,1.58882,0.951,0.953734,0.996816,0.06746,0.974799,0.532138,1.692422
3,olympic,tree,4,0.95,0.952678,0.996839,0.074803,0.974259,0.535821,1.726976,0.948853,0.952527,0.995833,0.042469,0.973699,0.519151,1.766595
4,olympic,tree,5,0.9604,0.961701,0.998326,0.140271,0.979671,0.569299,1.367766,0.950568,0.953834,0.996214,0.072994,0.974563,0.534604,1.707359
5,olympic,tree,avg,0.95468,0.956436,0.997813,0.112018,0.976684,0.554915,1.565332,0.950066,0.953645,0.995884,0.067103,0.974306,0.531494,1.724701
6,olympic,log_reg,1,0.9502,0.9502,1.0,0.0,0.974464,0.5,1.720071,0.950697,0.950697,1.0,0.0,0.974726,0.5,1.702898
7,olympic,log_reg,2,0.9516,0.9516,1.0,0.0,0.9752,0.5,1.671715,0.950662,0.950662,1.0,0.0,0.974707,0.5,1.7041
8,olympic,log_reg,3,0.95,0.95,1.0,0.0,0.974359,0.5,1.726979,0.950702,0.950702,1.0,0.0,0.974728,0.5,1.702726
9,olympic,log_reg,4,0.9492,0.9492,1.0,0.0,0.973938,0.5,1.75461,0.950722,0.950722,1.0,0.0,0.974739,0.5,1.70204


## output tables

In [125]:
datasets = ['CARDIO', 'AUS', 'CHESS', 'SHROOMS', 'AIRBNB', 'OLYMPIC']

In [126]:
# combine datasets
cardio_results = pd.read_csv('results/cardio_results.csv')

aus_results = pd.read_csv('results/aus_results.csv')

olympic_results = pd.read_csv('results/olympic_results.csv')

airbnb_results = pd.read_csv('results/airbnb_results.csv')

shrooms_results = pd.read_csv('results/shrooms_results.csv')

chess_results = pd.read_csv('results/chess_results.csv')


results = cardio_results.append([aus_results, airbnb_results, shrooms_results,
                                 chess_results, olympic_results], ignore_index = True)

In [129]:
train_metrics = ["TRAIN_" + x for x in list(scoring.keys())]
test_metrics = ["TEST_" + x for x in list(scoring.keys())]

In [130]:
test_avg_results = results.loc[results['TRIAL'] == 'avg', ['DATASET', 'MODEL', 'TRIAL'] + test_metrics]
test_avg_results = test_avg_results.reset_index(drop = True)

In [131]:
test_results = results.loc[results['TRIAL'] != 'avg', ['DATASET', 'MODEL', 'TRIAL'] + test_metrics]
test_results = test_results.reset_index(drop = True)

In [132]:
train_avg_results = results.loc[results['TRIAL'] == 'avg', ['DATASET', 'MODEL', 'TRIAL'] + train_metrics]
train_avg_results = train_avg_results.reset_index(drop = True)

In [133]:
train_results = results.loc[results['TRIAL'] != 'avg', ['DATASET', 'MODEL', 'TRIAL'] + train_metrics]
train_results = train_results.reset_index(drop = True)

#### Table 1 - Dataset Description

In [134]:
table1 = pd.DataFrame(columns = ['Name', '#ATTR', 'TRAIN SIZE', 'TEST SIZE', '%POS'])
table1['Name'] = np.array(datasets)
table1['#ATTR'] = np.array(['11', '22', '9/15', '22', '7/15', '6/14'])
table1['TRAIN SIZE'] = np.array([5000] * 6)
table1['TEST SIZE'] = ['65000', '140460', '15058', '3124', '43895', '25181']
table1['%POS'] = ['49.97%', '21.9146%', '49.8604%', '48.2%', '49.95%', '33.69%']

In [135]:
table1

Unnamed: 0,Name,#ATTR,TRAIN SIZE,TEST SIZE,%POS
0,CARDIO,11,5000,65000,49.97%
1,AUS,22,5000,140460,21.9146%
2,CHESS,9/15,5000,15058,49.8604%
3,SHROOMS,22,5000,3124,48.2%
4,AIRBNB,7/15,5000,43895,49.95%
5,OLYMPIC,6/14,5000,25181,33.69%


In [136]:
table1.to_csv('results/table1.csv', index = False)
pd.read_csv('results/table1.csv')

Unnamed: 0,Name,#ATTR,TRAIN SIZE,TEST SIZE,%POS
0,CARDIO,11,5000,65000,49.97%
1,AUS,22,5000,140460,21.9146%
2,CHESS,9/15,5000,15058,49.8604%
3,SHROOMS,22,5000,3124,48.2%
4,AIRBNB,7/15,5000,43895,49.95%
5,OLYMPIC,6/14,5000,25181,33.69%


#### Table 2 - Model & Metrics (test avg trials & datasets)

In [137]:
table2 = test_avg_results.groupby(by='MODEL').mean()
table2['MEAN'] = table2.mean(axis = 1)
table2 = table2.reset_index()
table2

Unnamed: 0,MODEL,TEST_ACC,TEST_PREC,TEST_REC,TEST_SPEC,TEST_F1,TEST_ROC,MEAN
0,DT,0.783277,0.800428,0.698014,0.883093,0.690094,0.74225,0.766192
1,KNN,0.768266,0.759028,0.678289,0.850243,0.673057,0.72648,0.74256
2,LOGREG,0.782066,0.752993,0.632982,0.886029,0.657502,0.736934,0.741418
3,PERC,0.701554,0.613989,0.615241,0.761425,0.579136,0.67861,0.658326
4,RF,0.793308,0.764779,0.669086,0.85039,0.699296,0.752849,0.754951
5,SVM,0.790895,0.806914,0.748617,0.963586,0.706468,0.754641,0.795187


In [138]:
table2.to_csv('results/table2.csv', index = False)
pd.read_csv('results/table2.csv')

Unnamed: 0,MODEL,TEST_ACC,TEST_PREC,TEST_REC,TEST_SPEC,TEST_F1,TEST_ROC,MEAN
0,DT,0.783277,0.800428,0.698014,0.883093,0.690094,0.74225,0.766192
1,KNN,0.768266,0.759028,0.678289,0.850243,0.673057,0.72648,0.74256
2,LOGREG,0.782066,0.752993,0.632982,0.886029,0.657502,0.736934,0.741418
3,PERC,0.701554,0.613989,0.615241,0.761425,0.579136,0.67861,0.658326
4,RF,0.793308,0.764779,0.669086,0.85039,0.699296,0.752849,0.754951
5,SVM,0.790895,0.806914,0.748617,0.963586,0.706468,0.754641,0.795187


#### Appendix 3 - P values of Table 2 ( Model & Metrics (test avg trials & datasets) )

In [139]:
best_models = table2.set_index('MODEL').idxmax()

In [140]:
tl2_pvals = pd.DataFrame(columns = ['MODEL'] + test_metrics + ['MEAN'])
tl2_pvals['MODEL'] = table2['MODEL']

In [141]:
for metric in test_metrics:
    best_model = best_models[metric]
    grp = test_results.groupby(by='MODEL')
    for model in models:
        dist1 = grp.get_group(best_model)[metric].tolist()
        dist2 = grp.get_group(model)[metric].tolist()
        pval = stats.ttest_ind(dist1, dist2).pvalue
        tl2_pvals.loc[tl2_pvals['MODEL'] == model, metric] = pval
    tl2_pvals[metric] = tl2_pvals[metric].astype('float64')

In [142]:
for model in models:
    best_model = best_models['MEAN']
    dist1 = table2.loc[table2['MODEL'] == best_model, test_metrics].iloc[0].tolist()
    dist2 = table2.loc[table2['MODEL'] == model, test_metrics].iloc[0].tolist()
    pval = stats.ttest_ind(dist1, dist2).pvalue
    tl2_pvals.loc[tl2_pvals['MODEL'] == model, 'MEAN'] = pval
tl2_pvals['MEAN'] = tl2_pvals['MEAN'].astype('float64')

In [143]:
tl2_pvals

Unnamed: 0,MODEL,TEST_ACC,TEST_PREC,TEST_REC,TEST_SPEC,TEST_F1,TEST_ROC,MEAN
0,DT,0.696592,0.800221,0.315647,0.002014,0.687457,0.674781,0.551291
1,KNN,0.354139,0.117555,0.153632,0.000137,0.416928,0.35134,0.273719
2,LOGREG,0.667217,0.089935,0.074958,0.051738,0.316641,0.565281,0.32717
3,PERC,0.007785,0.000115,0.050094,7.8e-05,0.017409,0.024818,0.013765
4,RF,1.0,0.12536,0.123967,1.5e-05,0.85879,0.950381,0.394504
5,SVM,0.924287,1.0,1.0,1.0,1.0,1.0,1.0


In [144]:
tl2_pvals.to_csv('results/tl2_pvals.csv', index = False)
pd.read_csv('results/tl2_pvals.csv')

Unnamed: 0,MODEL,TEST_ACC,TEST_PREC,TEST_REC,TEST_SPEC,TEST_F1,TEST_ROC,MEAN
0,DT,0.696592,0.800221,0.315647,0.002014,0.687457,0.674781,0.551291
1,KNN,0.354139,0.117555,0.153632,0.000137,0.416928,0.35134,0.273719
2,LOGREG,0.667217,0.089935,0.074958,0.051738,0.316641,0.565281,0.32717
3,PERC,0.007785,0.000115,0.050094,7.8e-05,0.017409,0.024818,0.013765
4,RF,1.0,0.12536,0.123967,1.5e-05,0.85879,0.950381,0.394504
5,SVM,0.924287,1.0,1.0,1.0,1.0,1.0,1.0


#### Table 3 - Model & Dataset (test avg trials & metrics)

In [145]:
table3 = pd.DataFrame(columns = ['MODEL'] + datasets + ['MEAN'])
table3['MODEL'] = models

In [146]:
for data in datasets:
    for model in models:
        score = test_avg_results.loc[(test_avg_results['MODEL'] == model) &
                                     (test_avg_results['DATASET'] == data)][test_metrics].mean(axis = 1).tolist()[0]
        table3.loc[table3['MODEL'] == model, data] = score
    table3[data] = table3[data].astype('float64')
table3['MEAN'] = table3[datasets].mean(axis = 1)
table3

Unnamed: 0,MODEL,CARDIO,AUS,CHESS,SHROOMS,AIRBNB,OLYMPIC,MEAN
0,DT,0.739188,0.711984,0.679259,0.999774,0.825712,0.641237,0.766192
1,KNN,0.666772,0.699582,0.634543,1.0,0.829761,0.624705,0.74256
2,LOGREG,0.714271,0.739579,0.675994,1.0,0.827292,0.491369,0.741418
3,PERC,0.655749,0.485895,0.582336,0.999883,0.684399,0.541694,0.658326
4,RF,0.728808,0.704868,0.663366,1.0,0.832898,0.599768,0.754951
5,SVM,0.795096,0.742391,0.729506,1.0,0.868146,0.635981,0.795187


In [147]:
table3.to_csv('results/table3.csv', index = False)
pd.read_csv('results/table3.csv')

Unnamed: 0,MODEL,CARDIO,AUS,CHESS,SHROOMS,AIRBNB,OLYMPIC,MEAN
0,DT,0.739188,0.711984,0.679259,0.999774,0.825712,0.641237,0.766192
1,KNN,0.666772,0.699582,0.634543,1.0,0.829761,0.624705,0.74256
2,LOGREG,0.714271,0.739579,0.675994,1.0,0.827292,0.491369,0.741418
3,PERC,0.655749,0.485895,0.582336,0.999883,0.684399,0.541694,0.658326
4,RF,0.728808,0.704868,0.663366,1.0,0.832898,0.599768,0.754951
5,SVM,0.795096,0.742391,0.729506,1.0,0.868146,0.635981,0.795187


#### Appendix 3 - P values of Table 3 ( Model & Dataset (test avg trials & metrics) )

In [148]:
best_models = table3.set_index('MODEL').idxmax()

In [149]:
tl3_pvals = pd.DataFrame(columns = ['MODEL'] + datasets + ['MEAN'])
tl3_pvals['MODEL'] = models

In [150]:
grp = test_results.groupby(by=['MODEL', 'DATASET'])
for data in datasets:
    best_model = best_models[data]
    for model in models:
        dist1 = grp.get_group((best_model, data))[test_metrics].values.flatten()
        dist2 = grp.get_group((model, data))[test_metrics].values.flatten()
        pval = stats.ttest_ind(dist1, dist2).pvalue
        tl3_pvals.loc[tl3_pvals['MODEL'] == model, data] = pval
    tl3_pvals[data] = tl3_pvals[data].astype('float64')

In [151]:
for model in models:
    best_model = best_models['MEAN']
    dist1 = table3.loc[table3['MODEL'] == best_model, datasets].iloc[0].tolist()
    dist2 = table3.loc[table3['MODEL'] == model, datasets].iloc[0].tolist()
    pval = stats.ttest_ind(dist1, dist2).pvalue
    tl3_pvals.loc[tl3_pvals['MODEL'] == model, 'MEAN'] = pval
tl3_pvals['MEAN'] = tl3_pvals['MEAN'].astype('float64')

In [152]:
tl3_pvals

Unnamed: 0,MODEL,CARDIO,AUS,CHESS,SHROOMS,AIRBNB,OLYMPIC,MEAN
0,DT,0.002417368,0.439951,0.040054,0.028161,0.0001426333,1.0,0.703738
1,KNN,7.96578e-11,0.331824,1.2e-05,,0.0004928899,0.699824,0.519854
2,LOGREG,0.0233169,0.942313,0.053629,,0.09782886,0.011458,0.545491
3,PERC,0.0009214726,0.000102,3e-05,0.007619,2.208079e-09,0.046378,0.161766
4,RF,0.000119121,0.346393,0.001136,,0.001167387,0.354726,0.616209
5,SVM,1.0,1.0,1.0,,1.0,0.907375,1.0


In [153]:
tl3_pvals.to_csv('results/tl3_pvals.csv', index = False)
pd.read_csv('results/tl3_pvals.csv')

Unnamed: 0,MODEL,CARDIO,AUS,CHESS,SHROOMS,AIRBNB,OLYMPIC,MEAN
0,DT,0.002417368,0.439951,0.040054,0.028161,0.0001426333,1.0,0.703738
1,KNN,7.96578e-11,0.331824,1.2e-05,,0.0004928899,0.699824,0.519854
2,LOGREG,0.0233169,0.942313,0.053629,,0.09782886,0.011458,0.545491
3,PERC,0.0009214726,0.000102,3e-05,0.007619,2.208079e-09,0.046378,0.161766
4,RF,0.000119121,0.346393,0.001136,,0.001167387,0.354726,0.616209
5,SVM,1.0,1.0,1.0,,1.0,0.907375,1.0


#### Appendix 1 - Model & Metrics (train avg trials & datasets)

In [154]:
appendix1 = train_avg_results.groupby(by='MODEL').mean()
appendix1['MEAN'] = appendix1.mean(axis = 1)
appendix1 = appendix1.reset_index()
appendix1

Unnamed: 0,MODEL,TRAIN_ACC,TRAIN_PREC,TRAIN_REC,TRAIN_SPEC,TRAIN_F1,TRAIN_ROC,MEAN
0,DT,0.80141,0.821938,0.816756,0.906471,0.773649,0.800073,0.820049
1,KNN,0.786957,0.781165,0.872633,0.879235,0.861624,0.842512,0.837354
2,LOGREG,0.786262,0.770186,0.639267,0.887305,0.667421,0.744131,0.749095
3,PERC,0.70589,0.622589,0.619338,0.764205,0.583095,0.68313,0.663041
4,RF,0.92529,0.93405,0.95525,0.955693,0.948712,0.949518,0.944752
5,SVM,0.813976,0.845974,0.813235,0.980515,0.77145,0.799586,0.837456


In [155]:
appendix1.to_csv('results/appendix1.csv', index = False)
pd.read_csv('results/appendix1.csv')

Unnamed: 0,MODEL,TRAIN_ACC,TRAIN_PREC,TRAIN_REC,TRAIN_SPEC,TRAIN_F1,TRAIN_ROC,MEAN
0,DT,0.80141,0.821938,0.816756,0.906471,0.773649,0.800073,0.820049
1,KNN,0.786957,0.781165,0.872633,0.879235,0.861624,0.842512,0.837354
2,LOGREG,0.786262,0.770186,0.639267,0.887305,0.667421,0.744131,0.749095
3,PERC,0.70589,0.622589,0.619338,0.764205,0.583095,0.68313,0.663041
4,RF,0.92529,0.93405,0.95525,0.955693,0.948712,0.949518,0.944752
5,SVM,0.813976,0.845974,0.813235,0.980515,0.77145,0.799586,0.837456


#### Appendix 3 -  P value of appendix 1 ( Model & Metrics (train avg trials & datasets) )

In [156]:
best_models = appendix1.set_index('MODEL').idxmax()

appendix1_pvals = pd.DataFrame(columns = ['MODEL'] + train_metrics + ['MEAN'])
appendix1_pvals['MODEL'] = appendix1['MODEL']

for metric in train_metrics:
    best_model = best_models[metric]
    grp = train_results.groupby(by='MODEL')
    for model in models:
        dist1 = grp.get_group(best_model)[metric].tolist()
        dist2 = grp.get_group(model)[metric].tolist()
        pval = stats.ttest_ind(dist1, dist2).pvalue
        appendix1_pvals.loc[appendix1_pvals['MODEL'] == model, metric] = pval
    appendix1_pvals[metric] = appendix1_pvals[metric].astype('float64')

In [157]:
for model in models:
    best_model = best_models['MEAN']
    dist1 = appendix1.loc[appendix1['MODEL'] == best_model, train_metrics].iloc[0].tolist()
    dist2 = appendix1.loc[appendix1['MODEL'] == model, train_metrics].iloc[0].tolist()
    pval = stats.ttest_ind(dist1, dist2).pvalue
    appendix1_pvals.loc[appendix1_pvals['MODEL'] == model, 'MEAN'] = pval
appendix1_pvals['MEAN'] = appendix1_pvals['MEAN'].astype('float64')
appendix1_pvals

Unnamed: 0,MODEL,TRAIN_ACC,TRAIN_PREC,TRAIN_REC,TRAIN_SPEC,TRAIN_F1,TRAIN_ROC,MEAN
0,DT,8.287021e-08,1.701569e-08,1.853618e-10,0.000322,5.985106e-12,1.361701e-11,7.2e-05
1,KNN,2.040673e-08,6.789643e-09,0.0005382399,1.8e-05,0.0006427847,4.332876e-05,0.000159
2,LOGREG,1.854892e-08,3.371046e-10,3.695047e-08,0.015201,3.854539e-10,1.250001e-12,0.000334
3,PERC,3.25716e-10,4.448309e-10,6.357903e-08,1.5e-05,6.634743e-12,6.718829e-15,1e-06
4,RF,1.0,1.0,1.0,0.054564,1.0,1.0,1.0
5,SVM,2.8427e-07,1.015314e-05,4.506495e-06,1.0,8.881075e-12,7.155815e-11,0.005747


In [158]:
appendix1_pvals.to_csv('results/appendix1_pvals.csv', index = False)
pd.read_csv('results/appendix1_pvals.csv')

Unnamed: 0,MODEL,TRAIN_ACC,TRAIN_PREC,TRAIN_REC,TRAIN_SPEC,TRAIN_F1,TRAIN_ROC,MEAN
0,DT,8.287021e-08,1.701569e-08,1.853618e-10,0.000322,5.985106e-12,1.361701e-11,7.2e-05
1,KNN,2.040673e-08,6.789643e-09,0.0005382399,1.8e-05,0.0006427847,4.332876e-05,0.000159
2,LOGREG,1.854892e-08,3.371046e-10,3.695047e-08,0.015201,3.854539e-10,1.250001e-12,0.000334
3,PERC,3.25716e-10,4.448309e-10,6.357903e-08,1.5e-05,6.634743e-12,6.718829e-15,1e-06
4,RF,1.0,1.0,1.0,0.054564,1.0,1.0,1.0
5,SVM,2.8427e-07,1.015314e-05,4.506495e-06,1.0,8.881075e-12,7.155815e-11,0.005747


#### Appendix 2

In [160]:
test_results.to_csv('results/appendix2.csv', index = False)
pd.read_csv('results/appendix2.csv')

Unnamed: 0,DATASET,MODEL,TRIAL,TEST_ACC,TEST_PREC,TEST_REC,TEST_SPEC,TEST_F1,TEST_ROC
0,CARDIO,DT,1,0.718092,0.760048,0.651762,0.803292,0.698032,0.718082
1,CARDIO,DT,2,0.716569,0.823521,0.757578,0.888329,0.717287,0.716578
2,CARDIO,DT,3,0.723985,0.753861,0.691874,0.803083,0.711474,0.723870
3,CARDIO,DT,4,0.724569,0.761494,0.689312,0.797068,0.713895,0.724521
4,CARDIO,DT,5,0.726877,0.775533,0.706000,0.803805,0.711856,0.726788
5,CARDIO,DT,6,0.731446,0.761724,0.673020,0.804416,0.720963,0.731479
6,CARDIO,DT,7,0.722631,0.772991,0.659143,0.811395,0.695081,0.722577
7,CARDIO,LOGREG,1,0.726646,0.757817,0.000000,1.000000,0.708874,0.726637
8,CARDIO,LOGREG,2,0.647785,0.649235,1.000000,0.661212,0.681556,0.647767
9,CARDIO,LOGREG,3,0.726769,0.747150,1.000000,0.769117,0.714479,0.726690
