In [1]:
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as mplt
from keras.utils import np_utils

from sklearn.model_selection import train_test_split
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

Using TensorFlow backend.


Here we'll try out different models on both feature selected and complete dataset. 

In [6]:
dataset = pd.read_csv ('dataset/Dataset_heart rate_retail stores.csv')
print("dataset shape: ({}, {})\n".format(*dataset.shape))
dataset.info()

dataset shape: (165, 23)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165 entries, 0 to 164
Data columns (total 23 columns):
Id                                165 non-null int64
age                               165 non-null int64
gender                            165 non-null object
BMI                               165 non-null float64
sleeping_time                     165 non-null float64
sporting_activity_(h/week)        165 non-null float64
daytime                           165 non-null object
outdoor_temperature               165 non-null int64
shopping_cart                     165 non-null object
attendants                        165 non-null object
shopping_frequency                165 non-null object
cash_point                        165 non-null object
shopping_amount                   165 non-null object
weekday                           165 non-null object
smoking                           165 non-null int64
ex_max                            165 non-null int64
ex_min 

## DATASET

Here we'll work on two datasets. The first is the complete dataset which contains all the features, the second one contains only the feature selected based on our statistical test on shopping amount.


### FEATURE SELECTION

Here we'll do initial feature selection based on the statistical test results we found on the response variable and create a new dataset.

Overview of what we'll do:

* Categorical: 
    * reaching HRmax
    * shopping_cart
    * attendants: divide into two groups, no and the rest.
    
* Continious
    * shopping_duration
    * hr_max
    * max_time
    * min_time
    
* Drop
    * The remaining features found to be not important on capturing information about shopping duration, this we'll drop them.

In [7]:
# include our model. here we'll use it for removing outliers.

from modules.StatUtils import *

statUtils = StatUtils()

In [8]:
# remove id and rename shopping duration
dataset = dataset.drop('Id', axis=1) 
dataset = dataset.rename(columns = {'shopping duration (in minutes)':'shop_dur'})

#### Balance of classification: 
check wherer the number of examples in each class is unequally distributed.

In [9]:
class_count = pd.value_counts(dataset['shopping_amount'])

for c in range(len(class_count)):
    print('{}\t\t{}\t{:.2f}%'.format(class_count.index[c], 
                            class_count.values[c], 
                            100*class_count.values[c]/sum(class_count)))

less		61	36.97%
normal		58	35.15%
heavy		28	16.97%
bag full		18	10.91%


Now we can seet that balance of classes are valid for classifications.

#### Create normal and feature selected dataset

In [10]:
# divide attendants into two new groups
dataset.loc[dataset.attendants != "no", "attendants"] = "yes"

# get selected feautes
df_selected = dataset[['shopping_amount', 'shop_dur', 'hr_max', 'max_time', \
                       'min_time', 'shopping_cart', 'attendants', 'reaching HRmax' ]]

# get all the features
df_normal = dataset

In [11]:
# remove outliers in continious vars for feature selected df:
for feature in ['shop_dur', 'hr_max', 'min_time', 'max_time']:
    df_selected = statUtils.remove_outliers(df_selected, feature)
    
# do the same for the normal df
for feature in ['shop_dur', 'hr_max', 'min_time', 'max_time', 'ex_min', \
               'age', 'BMI', 'sleeping_time', 'outdoor_temperature', 'ex_max']:
    df_normal = statUtils.remove_outliers(df_normal, feature)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [12]:
# after removing outliers now we can get traget vars

y_selected = df_selected['shopping_amount'].values
df_selected = df_selected.drop('shopping_amount', axis=1)

y_normal = df_normal['shopping_amount'].values
df_normal = df_normal.drop('shopping_amount', axis=1)

## Preprocessing

We can see that ***daytime***, ***shopping_chart***, ***shopping_frequency***, ***cahs_point***, ***weekday*** has the sense of ordering. So we'll use orinal encoding to have a natural ordered relationship between each other values so the model may be able to understand and harness this relationship. Although the sense of order doesn't exist both for ***attendants*** and ***gender***, will use binary encoding as they only contain two categories

#### for feature selected df

In [13]:
for feature in ['attendants', 'shopping_cart']:
    df_selected[feature] = OrdinalEncoder().fit_transform(df_selected[feature].values.reshape(-1, 1))
    
df_selected.head()

Unnamed: 0,shop_dur,hr_max,max_time,min_time,shopping_cart,attendants,reaching HRmax
0,12,95,8,4,0.0,0.0,0
1,26,112,24,18,1.0,0.0,0
2,24,115,18,20,0.0,0.0,0
3,46,122,26,41,1.0,1.0,0
4,41,120,39,20,1.0,1.0,1


#### for normal df

In [14]:
# do Binary Encoder
for feature in ['attendants', 'gender', 'shopping_frequency', 'shopping_cart']:
    df_normal[feature] = OrdinalEncoder().fit_transform(df_normal[feature].values.reshape(-1, 1))

In [15]:
# get the list of categorical values, aslo include
categorical_values = list(df_normal.select_dtypes(include=['object']))

for f in categorical_values:
    print('{}= {}'.format(f, np.unique(df_normal[f].values)))

daytime= ['evening' 'morning' 'noon']
cash_point= ['free' 'full' 'normal']
weekday= ['friday' 'monday' 'saturday' 'thursday' 'tuesday' 'wednesday']


In [16]:
# do Ordinal Encoder based on category order

# for daytime
df_normal.loc[df_normal.daytime == "morning", "daytime"] = 0
df_normal.loc[df_normal.daytime == "noon", "daytime"] = 1
df_normal.loc[df_normal.daytime == "evening", "daytime"] = 2

# for cash_point
df_normal.loc[df_normal.cash_point == "free", "cash_point"] = 0
df_normal.loc[df_normal.cash_point == "normal", "cash_point"] = 1
df_normal.loc[df_normal.cash_point == "full", "cash_point"] = 2


# for weekday
df_normal.loc[df_normal.weekday == "monday", "weekday"] = 0
df_normal.loc[df_normal.weekday == "tuesday", "weekday"] = 1
df_normal.loc[df_normal.weekday == "wednesday", "weekday"] = 2
df_normal.loc[df_normal.weekday == "thursday", "weekday"] = 3
df_normal.loc[df_normal.weekday == "friday", "weekday"] = 4
df_normal.loc[df_normal.weekday == "saturday", "weekday"] = 5


df_normal.head()

Unnamed: 0,age,gender,BMI,sleeping_time,sporting_activity_(h/week),daytime,outdoor_temperature,shopping_cart,attendants,shopping_frequency,...,weekday,smoking,ex_max,ex_min,hr_max,max_time,hr_min,min_time,shop_dur,reaching HRmax
0,55,0.0,30.0,7.0,2.0,1,23,0.0,0.0,1.0,...,0,0,115,71,95,8,72,4,12,0
1,25,0.0,18.4,9.0,1.0,1,23,1.0,0.0,0.0,...,0,1,110,82,112,24,70,18,26,0
2,38,1.0,25.1,5.0,3.0,1,23,0.0,0.0,0.0,...,0,0,116,74,115,18,75,20,24,0
3,21,0.0,22.6,8.0,0.0,2,23,1.0,1.0,0.0,...,0,1,146,55,122,26,62,41,46,0
5,51,0.0,26.8,8.0,0.0,2,23,1.0,0.0,0.0,...,0,0,114,65,120,40,50,10,42,1


### Scaling

Only scale continuous values or ordinal values with large size. Keep the binaries untouched.

In [17]:
X_selected = df_selected.copy()
X_normal = df_normal.copy()

# for normal
for f in ['BMI', 'sleeping_time', 'sporting_activity_(h/week)', 'daytime', 'outdoor_temperature', \
         'cash_point', 'ex_max', 'ex_min', 'max_time', 'min_time', 'hr_min', 'min_time', 'shop_dur', 'weekday']:
        scaler = StandardScaler()
        scaler.fit(X_normal[f].values.reshape(-1, 1))
        X_normal[f] = scaler.transform(X_normal[f].values.reshape(-1, 1))

# for selcted
for f in ['shop_dur', 'hr_max', 'max_time', 'min_time']:
        scaler = StandardScaler()
        scaler.fit(X_selected[f].values.reshape(-1, 1))
        X_selected[f] = scaler.transform(X_selected[f].values.reshape(-1, 1))

### Train/Test split

As the data size is quite small, we won't do train/test split, rather evaluate the model with k-fold cross-validation.

## NEURAL NETWORKS TRAIN

### define model structure

In [18]:
def init_model(optimizer='adam', activation='relu', kernel_init='uniform'):
    
    # create model
    model = Sequential()
    # add input later
    model.add(Dense(input_dim, input_dim=input_dim, activation=activation, kernel_initializer=kernel_init))
    # add droput regularizer
    model.add(Dropout(.2))
    # add hidden layer
    model.add(Dense(input_dim, activation=activation, kernel_initializer=kernel_init))
    # add output layer
    model.add(Dense(4, activation='softmax'))
    # compile the keras model
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [19]:
def ecode_cetegorical(y):
    '''
    fist encode class values and then conver to categorical
    
    input:
        y: traget value
    '''
    # encode class values as integers
    encoder = LabelEncoder()
    encoder.fit(y)
    encoded_Y = encoder.transform(y)
    # convert integers to dummy variables (i.e. one hot encoded)
    return np_utils.to_categorical(encoded_Y)

#### prepare traget data

In [20]:
# for feature selected
cat_y_selected = ecode_cetegorical(y_selected)

# for normal
cat_y_normal = ecode_cetegorical(y_normal)

### Define girdsearch space

In [21]:
### define girds search with k-fold 5
optimizer = ['RMSprop', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
activation = ['relu', 'hard_sigmoid', 'tanh', 'linear']
kernel_init = ['glorot_normal', 'normal', 'uniform']
param_grid = dict(optimizer = optimizer, activation = activation, kernel_init=kernel_init)

#### TRAIN MODEL ON FEATURE SELECTED DATA

In [18]:
# create model
input_dim=X_selected.shape[1]
model = KerasClassifier(build_fn=init_model, epochs=100, batch_size=10, verbose=0)

# init gird search with k-fold 5
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5)
# fit model
grid_result = grid.fit(X_selected, cat_y_selected)

# summarize results
print("The best mean acc: {:.4f}% achieved with {}\n".format(grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score'] # get mean score
stds = grid_result.cv_results_['std_test_score'] # get std score
params = grid_result.cv_results_['params'] # get params

# show 10 best results
for mean, stdev, param in sorted(list(zip(means, stds, params)), key=lambda t: t[0], reverse=True)[:10]:
    print("Mean acc: {:.4f}%, std: ({:.4f}%), params: {}".format(mean, stdev, param))



The best mean acc: 0.5455% achieved with {'activation': 'tanh', 'kernel_init': 'glorot_normal', 'optimizer': 'Adamax'}

Mean acc: 0.5455%, std: (0.1236%), params: {'activation': 'tanh', 'kernel_init': 'glorot_normal', 'optimizer': 'Adamax'}
Mean acc: 0.5245%, std: (0.0845%), params: {'activation': 'relu', 'kernel_init': 'uniform', 'optimizer': 'Adamax'}
Mean acc: 0.5245%, std: (0.0981%), params: {'activation': 'linear', 'kernel_init': 'glorot_normal', 'optimizer': 'Adam'}
Mean acc: 0.5175%, std: (0.1061%), params: {'activation': 'tanh', 'kernel_init': 'uniform', 'optimizer': 'Adam'}
Mean acc: 0.5175%, std: (0.0838%), params: {'activation': 'relu', 'kernel_init': 'normal', 'optimizer': 'Adamax'}
Mean acc: 0.5175%, std: (0.1084%), params: {'activation': 'tanh', 'kernel_init': 'glorot_normal', 'optimizer': 'Nadam'}
Mean acc: 0.5175%, std: (0.1084%), params: {'activation': 'linear', 'kernel_init': 'uniform', 'optimizer': 'Adam'}
Mean acc: 0.5175%, std: (0.1035%), params: {'activation': 'li

#### TRAIN MODEL ON NORMAL DATA

In [19]:
# # create model
input_dim=X_normal.shape[1]
model = KerasClassifier(build_fn=init_model, epochs=100, batch_size=10, verbose=0)

# init gird search with k-fold 5
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5)
# fit model
grid_result = grid.fit(X_normal, cat_y_normal)

# summarize results
print("The best mean acc: {:.4f}% achieved with {}\n".format(grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score'] # get mean score
stds = grid_result.cv_results_['std_test_score'] # get std score
params = grid_result.cv_results_['params'] # get params

# show 10 best result
for mean, stdev, param in sorted(list(zip(means, stds, params)), key=lambda t: t[0], reverse=True)[:10]:
    print("Mean acc: {:.4f}%, std: ({:.4f}%), params: {}".format(mean, stdev, param))



The best mean acc: 0.5489% achieved with {'activation': 'tanh', 'kernel_init': 'glorot_normal', 'optimizer': 'RMSprop'}

Mean acc: 0.5489%, std: (0.1135%), params: {'activation': 'tanh', 'kernel_init': 'glorot_normal', 'optimizer': 'RMSprop'}
Mean acc: 0.5338%, std: (0.1695%), params: {'activation': 'linear', 'kernel_init': 'glorot_normal', 'optimizer': 'Adamax'}
Mean acc: 0.5263%, std: (0.1036%), params: {'activation': 'linear', 'kernel_init': 'normal', 'optimizer': 'Adam'}
Mean acc: 0.5263%, std: (0.1265%), params: {'activation': 'tanh', 'kernel_init': 'normal', 'optimizer': 'Adamax'}
Mean acc: 0.5188%, std: (0.0877%), params: {'activation': 'linear', 'kernel_init': 'normal', 'optimizer': 'Adadelta'}
Mean acc: 0.5188%, std: (0.1083%), params: {'activation': 'relu', 'kernel_init': 'normal', 'optimizer': 'RMSprop'}
Mean acc: 0.5188%, std: (0.0704%), params: {'activation': 'linear', 'kernel_init': 'normal', 'optimizer': 'RMSprop'}
Mean acc: 0.5188%, std: (0.1224%), params: {'activation'

# OTER CLASSIFIERS

## RidgeClassifier

In [20]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import RidgeClassifier

#### Search space

In [21]:
# define grid search
reg_alpha = [0.001, 0.001, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
grid = dict(alpha=reg_alpha)

#### TRAIN MODEL ON FEATURE SELECTED DATA

In [22]:
# define models with k-fold 10
model = RidgeClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
# fit model
grid_result = grid_search.fit(X_selected, y_selected)

# summarize results
print("The best mean acc: {:.4f}% achieved with {}\n".format(grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score'] # get mean score
stds = grid_result.cv_results_['std_test_score'] # get std score
params = grid_result.cv_results_['params'] # get params

# show the best results
for mean, stdev, param in sorted(list(zip(means, stds, params)), key=lambda t: t[0], reverse=True):
    print("Mean acc: {:.4f}%, std: ({:.4f}%), params: {}".format(mean, stdev, param))

The best mean acc: 0.5105% achieved with {'alpha': 0.5}

Mean acc: 0.5105%, std: (0.1035%), params: {'alpha': 0.5}
Mean acc: 0.5105%, std: (0.1035%), params: {'alpha': 0.6}
Mean acc: 0.5105%, std: (0.1035%), params: {'alpha': 0.7}
Mean acc: 0.5105%, std: (0.1035%), params: {'alpha': 0.8}
Mean acc: 0.5105%, std: (0.1035%), params: {'alpha': 0.9}
Mean acc: 0.5105%, std: (0.1035%), params: {'alpha': 1.0}
Mean acc: 0.5082%, std: (0.1052%), params: {'alpha': 0.001}
Mean acc: 0.5082%, std: (0.1052%), params: {'alpha': 0.001}
Mean acc: 0.5082%, std: (0.1052%), params: {'alpha': 0.05}
Mean acc: 0.5082%, std: (0.1052%), params: {'alpha': 0.1}
Mean acc: 0.5082%, std: (0.1052%), params: {'alpha': 0.2}
Mean acc: 0.5082%, std: (0.1052%), params: {'alpha': 0.3}
Mean acc: 0.5082%, std: (0.1052%), params: {'alpha': 0.4}




#### TRAIN MODEL ON NORMAL DATA

In [23]:
# define models with k-fold 10
model = RidgeClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
# fit model
grid_result = grid_search.fit(X_normal, y_normal)

# summarize results
print("The best mean acc: {:.4f}% achieved with {}\n".format(grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score'] # get mean score
stds = grid_result.cv_results_['std_test_score'] # get std score
params = grid_result.cv_results_['params'] # get params

# show the best results
for mean, stdev, param in sorted(list(zip(means, stds, params)), key=lambda t: t[0], reverse=True):
    print("Mean acc: {:.4f}%, std: ({:.4f}%), params: {}".format(mean, stdev, param))

The best mean acc: 0.5464% achieved with {'alpha': 0.05}

Mean acc: 0.5464%, std: (0.1073%), params: {'alpha': 0.05}
Mean acc: 0.5464%, std: (0.1073%), params: {'alpha': 0.1}
Mean acc: 0.5464%, std: (0.1073%), params: {'alpha': 0.2}
Mean acc: 0.5464%, std: (0.1057%), params: {'alpha': 0.5}
Mean acc: 0.5439%, std: (0.1058%), params: {'alpha': 0.001}
Mean acc: 0.5439%, std: (0.1058%), params: {'alpha': 0.001}
Mean acc: 0.5439%, std: (0.1058%), params: {'alpha': 0.3}
Mean acc: 0.5439%, std: (0.1058%), params: {'alpha': 0.4}
Mean acc: 0.5439%, std: (0.1058%), params: {'alpha': 0.6}
Mean acc: 0.5414%, std: (0.1053%), params: {'alpha': 0.7}
Mean acc: 0.5414%, std: (0.1053%), params: {'alpha': 0.8}
Mean acc: 0.5414%, std: (0.1053%), params: {'alpha': 0.9}
Mean acc: 0.5414%, std: (0.1053%), params: {'alpha': 1.0}




## K-Nearest Neighbors (KNN)

In [24]:
from sklearn.neighbors import KNeighborsClassifier

#### Search Space

In [25]:
n_neighbors = range(1, 24, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
grid = dict(n_neighbors = n_neighbors, weights = weights, metric = metric)

#### TRAIN MODEL ON FEATURE SELECTED DATA

In [26]:
model = KNeighborsClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
# fit model
grid_result = grid_search.fit(X_selected, y_selected)


# summarize results
print("The best mean acc: {:.4f}% achieved with {}\n".format(grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score'] # get mean score
stds = grid_result.cv_results_['std_test_score'] # get std score
params = grid_result.cv_results_['params'] # get params

# show the best results
for mean, stdev, param in sorted(list(zip(means, stds, params)), key=lambda t: t[0], reverse=True)[:10]:
    print("Mean acc: {:.4f}%, std: ({:.4f}%), params: {}".format(mean, stdev, param))

The best mean acc: 0.5128% achieved with {'metric': 'manhattan', 'n_neighbors': 13, 'weights': 'distance'}

Mean acc: 0.5128%, std: (0.1276%), params: {'metric': 'manhattan', 'n_neighbors': 13, 'weights': 'distance'}
Mean acc: 0.5105%, std: (0.1146%), params: {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'distance'}
Mean acc: 0.5035%, std: (0.1156%), params: {'metric': 'manhattan', 'n_neighbors': 13, 'weights': 'uniform'}
Mean acc: 0.5035%, std: (0.1104%), params: {'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'distance'}
Mean acc: 0.5035%, std: (0.1117%), params: {'metric': 'manhattan', 'n_neighbors': 23, 'weights': 'distance'}
Mean acc: 0.5012%, std: (0.1023%), params: {'metric': 'manhattan', 'n_neighbors': 23, 'weights': 'uniform'}
Mean acc: 0.4988%, std: (0.1128%), params: {'metric': 'euclidean', 'n_neighbors': 19, 'weights': 'distance'}
Mean acc: 0.4988%, std: (0.0989%), params: {'metric': 'euclidean', 'n_neighbors': 23, 'weights': 'uniform'}
Mean acc: 0.4988%, std:



#### TRAIN MODEL ON NORMAL DATA

In [27]:
# define models with k-fold 10
model = KNeighborsClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
# fit model
grid_result = grid_search.fit(X_normal, y_normal)


# summarize results
print("The best mean acc: {:.4f}% achieved with {}\n".format(grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score'] # get mean score
stds = grid_result.cv_results_['std_test_score'] # get std score
params = grid_result.cv_results_['params'] # get params

# show the best results
for mean, stdev, param in sorted(list(zip(means, stds, params)), key=lambda t: t[0], reverse=True)[:10]:
    print("Mean acc: {:.4f}%, std: ({:.4f}%), params: {}".format(mean, stdev, param))

The best mean acc: 0.4687% achieved with {'metric': 'manhattan', 'n_neighbors': 23, 'weights': 'distance'}

Mean acc: 0.4687%, std: (0.1149%), params: {'metric': 'manhattan', 'n_neighbors': 23, 'weights': 'distance'}
Mean acc: 0.4637%, std: (0.1306%), params: {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'uniform'}
Mean acc: 0.4637%, std: (0.1116%), params: {'metric': 'manhattan', 'n_neighbors': 13, 'weights': 'distance'}
Mean acc: 0.4612%, std: (0.0984%), params: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Mean acc: 0.4612%, std: (0.1108%), params: {'metric': 'manhattan', 'n_neighbors': 21, 'weights': 'distance'}
Mean acc: 0.4561%, std: (0.1098%), params: {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}
Mean acc: 0.4561%, std: (0.0957%), params: {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'distance'}
Mean acc: 0.4536%, std: (0.0940%), params: {'metric': 'manhattan', 'n_neighbors': 13, 'weights': 'uniform'}
Mean acc: 0.4486%, std: (



# Support Vector Machine (SVM)

In [28]:
from sklearn.svm import SVC

#### Define search space

In [29]:
# define grid search
kernel = ['poly', 'rbf', 'sigmoid'] # kernel
reg_pram = [1.0, 0.1, 0.05, 0.01] # regression penalty
grid = dict(kernel = kernel, C = reg_pram)

#### TRAIN MODEL ON FEATURE SELECTED DATA

In [30]:
# define models with k-fold 10
model = SVC()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
# fit model
grid_result = grid_search.fit(X_selected, y_selected)

# summarize results
print("The best mean acc: {:.4f}% achieved with {}\n".format(grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score'] # get mean score
stds = grid_result.cv_results_['std_test_score'] # get std score
params = grid_result.cv_results_['params'] # get params

# show the best results
for mean, stdev, param in sorted(list(zip(means, stds, params)), key=lambda t: t[0], reverse=True)[:10]:
    print("Mean acc: {:.4f}%, std: ({:.4f}%), params: {}".format(mean, stdev, param))

The best mean acc: 0.5105% achieved with {'C': 0.1, 'kernel': 'sigmoid'}

Mean acc: 0.5105%, std: (0.1001%), params: {'C': 0.1, 'kernel': 'sigmoid'}
Mean acc: 0.5058%, std: (0.1026%), params: {'C': 1.0, 'kernel': 'sigmoid'}
Mean acc: 0.4895%, std: (0.0924%), params: {'C': 1.0, 'kernel': 'rbf'}
Mean acc: 0.4872%, std: (0.0913%), params: {'C': 0.1, 'kernel': 'rbf'}
Mean acc: 0.4848%, std: (0.0925%), params: {'C': 0.05, 'kernel': 'sigmoid'}
Mean acc: 0.4545%, std: (0.1077%), params: {'C': 1.0, 'kernel': 'poly'}
Mean acc: 0.4452%, std: (0.0527%), params: {'C': 0.1, 'kernel': 'poly'}
Mean acc: 0.4336%, std: (0.0481%), params: {'C': 0.05, 'kernel': 'poly'}
Mean acc: 0.3916%, std: (0.0197%), params: {'C': 0.05, 'kernel': 'rbf'}
Mean acc: 0.3916%, std: (0.0197%), params: {'C': 0.01, 'kernel': 'poly'}




#### TRAIN MODEL ON NORMAL DATA

In [31]:
# define models with k-fold 10
model = SVC()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
# fit model
grid_result = grid_search.fit(X_normal, y_normal)

# summarize results
print("The best mean acc: {:.4f}% achieved with {}\n".format(grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score'] # get mean score
stds = grid_result.cv_results_['std_test_score'] # get std score
params = grid_result.cv_results_['params'] # get params

# show the best results
for mean, stdev, param in sorted(list(zip(means, stds, params)), key=lambda t: t[0], reverse=True)[:10]:
    print("Mean acc: {:.4f}%, std: ({:.4f}%), params: {}".format(mean, stdev, param))

The best mean acc: 0.3985% achieved with {'C': 1.0, 'kernel': 'sigmoid'}

Mean acc: 0.3985%, std: (0.0226%), params: {'C': 1.0, 'kernel': 'sigmoid'}
Mean acc: 0.3985%, std: (0.0226%), params: {'C': 0.1, 'kernel': 'rbf'}
Mean acc: 0.3985%, std: (0.0226%), params: {'C': 0.1, 'kernel': 'sigmoid'}
Mean acc: 0.3985%, std: (0.0226%), params: {'C': 0.05, 'kernel': 'rbf'}
Mean acc: 0.3985%, std: (0.0226%), params: {'C': 0.05, 'kernel': 'sigmoid'}
Mean acc: 0.3985%, std: (0.0226%), params: {'C': 0.01, 'kernel': 'rbf'}
Mean acc: 0.3985%, std: (0.0226%), params: {'C': 0.01, 'kernel': 'sigmoid'}
Mean acc: 0.3835%, std: (0.1044%), params: {'C': 1.0, 'kernel': 'rbf'}
Mean acc: 0.3709%, std: (0.1364%), params: {'C': 1.0, 'kernel': 'poly'}
Mean acc: 0.3709%, std: (0.1364%), params: {'C': 0.1, 'kernel': 'poly'}


