In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import neurokit2 as nk
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.ensemble import AdaBoostClassifier
import pyhrv.time_domain as td
import pywt
import matplotlib.pyplot as plt
import pickle
import pyhrv
import ipynb
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# functions for features extraction
from ipynb.fs.full.ECG_features import get_ecgfeatures
from ipynb.fs.full.EDA import get_edaindex, get_edafeatures
from ipynb.fs.full.BVP import get_bvpfeatures, bvp_prep

# **Data Collection & Feature extraction**

In [2]:
# Dataset reading
class read_data_of_one_subject:
            """Read data from WESAD dataset"""
            def __init__(self, path, subject):
                self.keys = ['label', 'subject', 'signal']
                self.signal_keys = ['wrist', 'chest']
                self.chest_sensor_keys = ['ACC', 'ECG', 'EDA', 'EMG', 'Resp', 'Temp']
                self.wrist_sensor_keys = ['ACC', 'BVP', 'EDA', 'TEMP']
                #os.chdir(path)
                #os.chdir(subject)
                with open(path + subject +'/'+subject + '.pkl', 'rb') as file:
                    data = pickle.load(file, encoding='latin1')
                self.data = data

            def get_labels(self):
                return self.data[self.keys[0]]

            def get_wrist_data(self):
                """"""
                #label = self.data[self.keys[0]]
                #assert subject == self.data[self.keys[1]]
                signal = self.data[self.keys[2]]
                wrist_data = signal[self.signal_keys[0]]
                #wrist_ACC = wrist_data[self.wrist_sensor_keys[0]]
                #wrist_ECG = wrist_data[self.wrist_sensor_keys[1]]
                return wrist_data

            def get_chest_data(self):
                """"""
                signal = self.data[self.keys[2]]
                chest_data = signal[self.signal_keys[1]]
                return chest_data

In [3]:
# Set up empty dataframes for the features
columns_ecg =['index', 'ecg_HR_mean', 'ecg_HR_min', 'ecg_HR_max', 'ecg_HR_std', 'ecg_SDNN', 'ecg_SDANN', 'ecg_RMSSD', 
                                           'ecg_SDSD','ecg_pNN50', 'ecg_pNN20', "ecg_triangular_index", "ecg_tinn", "ecg_sd1", "ecg_sd2",
                                            "ecg_ratio_sd2_sd1", 'ecg_abs_power_VLF', 'ecg_abs_power_LF', 'ecg_abs_power_HF', 'ecg_tot_power',
                                            'ecg_LF/HF', 'ecg_peak_vlf', 'ecg_peak_lf', 'ecg_peak_hf', 'ecg_norm_power_LF', 'ecg_norm_power_HF',
                                            'max_cwt_absmean', 'min_cwt_absmean', 'mean_cwt_absmean','std_cwt_absmean', 
                                            'max_cwt_std', 'min_cwt_std', 'mean_cwt_std','std_cwt_std',
                                            'max_cwt_energy', 'min_cwt_energy', 'mean_cwt_energy','std_cwt_energy',
                                            'max_cwt_var', 'min_cwt_var', 'mean_cwt_var','std_cwt_var',
                                            'entropy']
columns_resp = ['RSP_Rate_Mean','RRV_RMSSD', 'RRV_MeanBB', 'RRV_SDBB', 'RRV_SDSD', 'RRV_CVBB', 'RRV_CVSD', 'RRV_MedianBB',
                'RRV_MadBB', 'RRV_MCVBB', 'RRV_VLF', 'RRV_LF', 'RRV_HF', 'RRV_LFHF', 'RRV_LFn', 'RRV_HFn', 'RRV_SD1',
                'RRV_SD2', 'RRV_SD2SD1', 'RRV_ApEn', 'RRV_SampEn', 'RSP_Amplitude_Mean', 'RSP_RVT', 'RSP_Symmetry_PeakTrough',
                'RSP_Symmetry_RiseDecay', 'RSP_Phase_Duration_Inspiration', 'RSP_Phase_Duration_Expiration', 'RSP_Phase_Duration_Ratio']

columns = [*columns_ecg, *columns_resp]
ecgfeatures = pd.DataFrame(None, columns= columns)
edafeatures = None
bvpfeatures = None

print(len(ecgfeatures.keys()))
total_samples = 0
fs = 700 # sampling frequency
fs_bvp = 64
duration = 45
y = []
amountpsample = dict()
# data_set_path= "C:/Users/JackC/Documents/EPO4/WESAD/WESAD/" # Folder path
data_set_path= "C:/Users/riche/Downloads/WESAD/WESAD/" # Folder path
# data_set_path = "C:/Users/Adnane/Downloads/WESAD/WESAD/"

for i in range(16): # 15 subjects
    subject = 'S'+str(i+2) # Cycle through S2 to S17
    print(f"subject: {subject}")
    amountpsample[subject] = 0
    if subject != 'S12': # Skip S12, because it does not exist
        # Object instantiation
        obj_data = {}

        # Accessing class attributes and method through objects
        obj_data[subject] = read_data_of_one_subject(data_set_path, subject)

        chest_data_dict = obj_data[subject].get_chest_data()
        wrist_data_dict = obj_data[subject].get_wrist_data()
        chest_dict_length = {key: len(value) for key, value in chest_data_dict.items()}
        wrist_dict_length = {key: len(value) for key, value in wrist_data_dict.items()}

        # Get labels
        labels = obj_data[subject].get_labels()

        for label in [1,2,4]: # for every state of stress
            
            baseline = np.asarray([idx for idx,val in enumerate(labels) if val == label])

            # Obtaining the chest data
            eda_base=chest_data_dict['EDA'][baseline,0] # Select the EDA data
            ecg_base=chest_data_dict['ECG'][baseline,0] # Select the ECG data
            resp_base=chest_data_dict['Resp'][baseline,0] # Select the respiration data

            # Obtaining the wrist data
            baseline_BVP = baseline * fs_bvp // fs
            bvp_base=wrist_data_dict['BVP'][baseline_BVP,0] # Select the BVP data

            # Multiple smaller samples per signal
            # samples = 10 # specify amount of samples

            for j in range(0, len(eda_base)//(duration*fs), 2): # loop over all segments
                amountpsample[subject] += 1
                # adding the labels to y
                if label==1 or label==4: # if amusement or medidation, classify as non-stress
                    y.append(0)
                else:
                    y.append(1)

                # Cutting the signal into a segment
                start, end = j*fs*duration, (j+1)*fs*duration
                ecg = ecg_base[start: end]
                eda = eda_base[start: end]
                bvp = bvp_base[start: end]

                # splitting sample into 10 smaller even-sized parts
                #ecg = ecg_base[int(len(ecg_base)* (j/samples)) : int(len(ecg_base)* ((j+1)/samples))]
                #eda = eda_base[int(len(eda_base)* (j/samples)) : int(len(eda_base)* ((j+1)/samples))]

                # getting the eda and ecg features
                index = subject + str(label) + str(j)
                tonic, phasic, start, end = get_edaindex(eda, fs)
                edafeatures = get_edafeatures(index, edafeatures, phasic, tonic, fs)
                ecgfeatures = get_ecgfeatures(ecg, fs, ecgfeatures, index)
                
                bvpfeature = get_bvpfeatures(bvp, fs_bvp, index)
                bvpfeatures = pd.concat([bvpfeatures, bvpfeature], axis=0)
    print(f"total samples {subject}: {amountpsample[subject]}")

71
subject: S2
total samples S2: 29
subject: S3
total samples S3: 29
subject: S4
total samples S4: 29
subject: S5
total samples S5: 29
subject: S6
total samples S6: 29
subject: S7
total samples S7: 29
subject: S8
total samples S8: 29
subject: S9
total samples S9: 29
subject: S10
total samples S10: 30
subject: S11
total samples S11: 30
subject: S12
total samples S12: 0
subject: S13
total samples S13: 29
subject: S14
total samples S14: 30
subject: S15
total samples S15: 30
subject: S16
total samples S16: 29
subject: S17
total samples S17: 29


In [4]:
# saved_edafeatures = edafeatures
# saved_ecgfeatures = ecgfeatures
# saved_bvpfeatures = bvpfeatures

In [5]:
bvp_filt = bvp_prep(bvpfeatures) # remove nan and inf in BVP features
# ecgfeatures = bvp_prep(ecgfeatures)

# saving the feature data in csv files
edafeatures.to_csv("EDA_features.csv")
ecgfeatures.to_csv("ECG_features.csv")
bvp_filt.to_csv("BVP_features.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  "    features_event.reset_index(drop=True, inplace=True)\n",


In [3]:
# #reading from csv file if necessary
# edafeatures = pd.read_csv("EDA_features.csv", index_col=0)
# ecgfeatures = pd.read_csv("ECG_features.csv", index_col=0)
# bvp_filt = pd.read_csv("BVP_features.csv", index_col=0)
#features = pd.read_csv("features_"+str(samples)+"_samp.csv", index_col=0)

In [22]:
features1 = pd.merge(ecgfeatures, edafeatures, left_index=True, right_index=True)
total_features = pd.merge(bvp_filt, features1, on='index')
total_features['y'] = y

total_features = total_features[total_features.RSP_RVT != 0.0]
total_features.to_csv("out_features.csv")

In [5]:
# saved_total_features = total_features
# total_features = pd.read_csv("out_features.csv", index_col=0)

In [6]:
total_features = bvp_prep(total_features)
features = total_features

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  "    features_event.reset_index(drop=True, inplace=True)\n",


# Feature Selection

In [120]:
# Scaling the data
# scaler = StandardScaler().fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

num_features = 25
y = total_features['y']
X_total = total_features.drop(['y', 'index'], axis=1)
print(len(y), len(X_total))

402 402


In [17]:
# # Variance Threshold
# scaler = MinMaxScaler().fit(X_total)
# X_train = scaler.transform(X_total)

# total_features_index = []
# for feat in X_total:
#     total_features_index.append(feat)

# X_vt = X_train[0:,0:len(X_total.keys())]

# v_threshold = VarianceThreshold(threshold=0.005) # Set a threshold
# v_threshold.fit(X_vt)
# index = v_threshold.get_support()
# true_index = [i for i, x in enumerate(index) if x]
# vt_features = [total_features_index[i] for i in true_index]

In [121]:
#Pearson correlation
def cor_selector(X_cor, y,num_feats):
    cor_list = []
    feature_name = X_cor.columns.tolist()
    # calculate the correlation with y for each feature
    for i in X_cor.columns.tolist():
        cor = np.corrcoef(X_cor[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X_cor.iloc[:,np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature

def pears(X):
    cor_support, cor_feature = cor_selector(X, y, num_features)
    return cor_feature

pc_features = pears(X_total)

In [122]:
#Chi-squared
X_norm = MinMaxScaler().fit_transform(X_total)

chi_selector = SelectKBest(chi2, k=num_features)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X_total.loc[:,chi_support].columns.tolist()

In [123]:
#Wrapper Forwards
sfs = SFS(svm.SVC(),
          k_features=num_features,
          forward=True,
          floating=False,
          scoring = 'r2',
          cv = 0)

sfs.fit(X_total, y)
fwrapper_feature = list(sfs.k_feature_names_)

In [124]:
#Wrapper Backwards
sbs = SFS(svm.SVC(),
         k_features=num_features,
         forward=False,
         floating=False,
         cv=0)
sbs.fit(X_total, y)
bwrapper_feature = list(sbs.k_feature_names_)

In [125]:
#Step-wise wrapper
sffs = SFS(svm.SVC(),
         k_features=num_features,
         forward=True,
         floating=True,
         cv=0)
sffs.fit(X_total, y)
swrapper_feature = list(sffs.k_feature_names_)

In [126]:
# LASSO
num_feats = num_features
scaler = StandardScaler().fit(X_total)
X_train = scaler.transform(X_total)
embeded_lr_selector = SelectFromModel(LogisticRegression(C=1, penalty="l1", solver='liblinear'), max_features=num_feats)
embeded_lr_selector.fit(scaler.transform(X_total), y)

embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X_total.loc[:,embeded_lr_support].columns.tolist()

In [127]:
#Random Forest
scaler = StandardScaler().fit(X_total)
X_train = scaler.transform(X_total)

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_features)
embeded_rf_selector.fit(X_total, y)

embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X_total.loc[:,embeded_rf_support].columns.tolist()

In [128]:
lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_feats)
embeded_lgb_selector.fit(X_total, y)

embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X_total.loc[:,embeded_lgb_support].columns.tolist()

In [129]:
# Lists of features and printing the amount of features selected
selected_features = [ pc_features, chi_feature, fwrapper_feature, bwrapper_feature, swrapper_feature, 
                    embeded_lr_feature, embeded_rf_feature, embeded_lgb_feature]
feature_sel_name = ["pearson correlation", "chi-squared", "forward wrapper", "backwards wrapper", 
                  "step-wise wrapper", "Lasso", "Random Forest", 'lightGBM']

print(f"total features: {len(total_features.keys())}")
print("Filter:")
for i in range(len(selected_features)):
    print(f"\t {feature_sel_name[i]}: {len(selected_features[i])}")
    if feature_sel_name[i] == "chi-squared":
        print("Wrapper")
    if feature_sel_name[i] == "step-wise wrapper":
        print("Embedded")

total features: 177
Filter:
	 pearson correlation: 25
	 chi-squared: 25
Wrapper
	 forward wrapper: 25
	 backwards wrapper: 25
	 step-wise wrapper: 25
Embedded
	 Lasso: 25
	 Random Forest: 25
	 lightGBM: 25


# Classifiers

Logistic Regression

In [133]:
def logistic_func(X_train, X_test, y_train, y_test):
    # define the model
    logistic_model = LogisticRegression(solver='lbfgs', max_iter=100)

    # fit/train the model on all features
    logistic_model.fit(X_train, y_train)
    # Y_test_pred = logistic_model.predict(X_test)

    # # score
    # log_score = logistic_model.score(X_test,y_test)
    return logistic_model #, Y_test_pred, log_score

SVM

In [134]:
def svm_func(X_train, X_test, y_train, y_test):
    # define the model
    svm_model = svm.SVC()

    # fit/train the model on all features
    svm_model.fit(X_train, y_train)
    # Y_test_pred = svm_model.predict(X_test)

    # # score
    # svm_score = svm_model.score(X_test,y_test)
    return svm_model #, Y_test_pred, svm_score

Random Forest

In [29]:
def randomforest_func(X_train, X_test, y_train, y_test):
    # define the model
    randomf_model = RandomForestClassifier()

    # fit/train the model on all features
    randomf_model.fit(X_train, y_train)
    # Y_test_pred = randomf_model.predict(X_test)

    # # score
    # rf_score=randomf_model.score(X_test, y_test)
    return randomf_model #, Y_test_pred, rf_score

Neural Networks

In [109]:
def neuralnetworks_func(X_train, X_test, Y_train, Y_test):
    val_accuracies = []

    # Scaling the data
    scaler = StandardScaler().fit(X_train)
    x_train = scaler.transform(X_train)
    x_test = scaler.transform(X_test)
    ## Convert labels to categorical
    y_train_cat = to_categorical(Y_train)
    y_test_cat = to_categorical(Y_test)

    # Define the feedforward neural network
    neural_model = Sequential()
    neural_model.add(Dense(64, activation='relu'))
    neural_model.add(Dense(64, activation='relu'))
    neural_model.add(Dense(64, activation='relu'))
    neural_model.add(Dense(64, activation = 'relu'))
    neural_model.add(Dropout(0.5))
    neural_model.add(Dense(3, activation='softmax'))

    # Compile the model
    optimizer = Adam(learning_rate=0.001)
    neural_model.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])

    # Train the model
    # history = model.fit(X_train_pca, y_train, validation_data=(X_test_pca, y_test), epochs=45, batch_size=100, verbose=2)
    history = neural_model.fit(x_train, y_train_cat, validation_data=(x_test, y_test_cat), epochs=45, batch_size=100, verbose=0)
    val_accuracy = history.history['val_accuracy'][-1]
    val_accuracies.append(val_accuracy) # accuracy score

    # print(val_accuracies)
    avg_val_accuracy = sum(val_accuracies) / len(val_accuracies) # average of accurracy scores
    # print("Average Validation Accuracy:", avg_val_accuracy)

    # plt.plot(history.history['accuracy'])
    # plt.plot(history.history['val_accuracy'])
    # plt.legend(['accuracy', 'val_accuracy'])
    # print(neural_model.predict(X_test))
    return val_accuracies


Adaboost

In [45]:
# Import the AdaBoost classifier
def adaboost_func(X_train, X_test, Y_train, Y_test):

        # Create adaboost classifer object
    abc= AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=5)

    # Train Adaboost Classifer
    adaboost_model = abc.fit(X_train, Y_train)

    return adaboost_model
    #Predict the response for test dataset
    # y_pred = model1.predict(X_test)
        # score30= adaboost_model.score(X_test, y_test)
        # Ada_scores[i] = score30
    # print(Ada_scores)
    # print(np.mean(Ada_scores))
    #predictions = adaboost_model.predict(X_test)
    #print(predictions != y_test)
    #print(score30)



In [None]:
scores_ada = []
trainscores_ada = []
for i in range (50):
      dtree = DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=1)
      adbclassifier = AdaBoostClassifier(base_estimator=dtree,
                                    n_estimators=100,
                                    learning_rate=1,
                                    algorithm = 'SAMME',
                                    random_state=1)
      X_train, X_test, y_train, y_test = train_test_split(x_train, y, test_size=0.2, random_state= i+1)
      adbclassifier.fit(X_train, y_train)
      scores_ada.append(adbclassifier.score(X_test,y_test))
      trainscores_ada.append(adbclassifier.score(X_train,y_train))
print(np.mean(scores_ada))
print(len(X_test))
plt.ylim(0,1)
plt.plot(scores_ada)
plt.plot(trainscores_ada)

XGBoost

In [118]:
def xgboost_func(X_train, X_test, Y_train, Y_test):
    label = LabelEncoder()
    y2 = label.fit_transform(Y_train)
    y2 = list(y2)

    xgb_clf = XGBClassifier(n_estimators = 500,
                            learning_rate = 1,
                            eval_metric = "logloss",
                            early_stopping_rounds = 5,
                            n_jobs = -1,
                            )
    xgb_clf.fit(X_train, y2,                    
                eval_set = [(X_test,Y_test)],
                verbose = False)
    pred_test = xgb_clf.predict(X_test)
    test_score = accuracy_score(pred_test, Y_test)

    return test_score

# Train - test split

In [31]:
# function which gives predicted and correct values with missclassifications and scores for rest models
def model_predict(X_test, y_test1, model):
    predictions = model.predict(X_test)
    miss_class = np.where(predictions != y_test1)
    miss_class = miss_class[0]
    score=model.score(X_test, y_test1)
    return score

In [32]:
# def model_predict_neural(X_test, y_test1, model):
#     predictions = model.predict(X_test)
#     predictions = np.argmax(predictions, axis = 1)
#     y_test1 = y_test1.astype('int64')
#     testYarg = np.argmax(y_test1, axis = 1)
#     miss_class = np.where(predictions != testYarg)
#     miss_class = miss_class[0]
#     print("Neural network")
#     print("Predicted:",predictions)
#     print("Correct:  ",testYarg)
#     print("Index missclassified:", miss_class)
#     # print("Score:", val_accuracy, "\n")
#     return predictions, miss_class, testYarg

In [33]:
# y_test_cat = to_categorical(y_test)
# y_test1 = y_test_cat.astype('int64')
# testYarg = np.argmax(y_test1, axis = 1)

NameError: name 'y_test' is not defined

General Approach

In [119]:
subjects = ['S2','S3','S4','S5','S6','S7','S8','S9','S10','S11','S13','S14','S15','S16','S17']
model_names = ["LogisticRegression()", "SVC()", "RandomForestClassifier()", "AdaBoostClassifier(learning_rate=1, random_state=5)", "Sequential()",
               "LabelEncoder()"]

first_line = ""
for model in model_names:
    first_line += "\t" + model
print(first_line)

for i in range(len(selected_features)):
    feat = selected_features[i]
    scores_dict = dict()
    for model in model_names:
        scores_dict[model] = []

    for subject in subjects:
        X_test = total_features[total_features['index'].str.startswith(subject)]
        y_test = X_test['y']
        X_test = X_test.drop(['y', 'index'], axis=1)
        X_train = total_features[~total_features['index'].str.startswith(subject)]
        y_train = X_train['y']
        X_train = X_train.drop(['y', 'index'], axis=1)
        X_train = X_train[list(feat)]
        X_test = X_test[list(feat)]
        
        # fit the data
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # Get models
        lg_model = logistic_func(X_train, X_test, y_train, y_test)
        svm_model = svm_func(X_train, X_test, y_train, y_test)
        rf_model = randomforest_func(X_train, X_test, y_train, y_test)
        ada_model = adaboost_func(X_train, X_test, y_train, y_test)
        nn_accurary = neuralnetworks_func(X_train, X_test, y_train, y_test)
        xgb_accuracy = xgboost_func(X_train, X_test, y_train, y_test)
        
        # Get the score
        models= [lg_model, svm_model, rf_model, ada_model]

        for model in models:
            score = model_predict(X_test, y_test, model)
            scores_dict[str(model)].append(model_predict(X_test, y_test, model))
        scores_dict["Sequential()"].append(nn_accurary)
        scores_dict["LabelEncoder()"].append(xgb_accuracy)

    line = str(feature_sel_name[i])

    for a in scores_dict:
        score = scores_dict[a]
        line += "\t" + f"{np.mean(score):.5f}" + "+-" + f"{np.std(score):.3f}"

    # line += "\t" + f"{np.mean(nn_accurary):.5f}" + "+-" + f"{np.std(nn_accurary):.3f}"
    print(line)

	LogisticRegression()	SVC()	RandomForestClassifier()	AdaBoostClassifier(learning_rate=1, random_state=5)	Sequential()	LabelEncoder()
variance threshold	0.90311+-0.085	0.89683+-0.103	0.89258+-0.086	0.85659+-0.117	0.89654+-0.101	0.03683+-0.082
pearson correlation	0.91013+-0.076	0.91539+-0.083	0.89221+-0.100	0.86647+-0.118	0.87382+-0.088	0.08292+-0.126
chi-squared	0.90069+-0.082	0.91301+-0.087	0.90253+-0.091	0.84920+-0.125	0.91440+-0.087	0.05164+-0.082
forward wrapper	0.87243+-0.118	0.86299+-0.088	0.86649+-0.103	0.83321+-0.133	0.85941+-0.115	0.07707+-0.113
backwards wrapper	0.84681+-0.112	0.83705+-0.106	0.82942+-0.107	0.81053+-0.088	0.79503+-0.089	0.09691+-0.082
step-wise wrapper	0.87243+-0.118	0.86299+-0.088	0.86854+-0.090	0.83321+-0.133	0.86986+-0.093	0.07707+-0.113
Lasso	0.95385+-0.045	0.92857+-0.084	0.89771+-0.100	0.88200+-0.130	0.93385+-0.067	0.05799+-0.127
Random Forest	0.89792+-0.103	0.91768+-0.086	0.90217+-0.100	0.87202+-0.125	0.91860+-0.071	0.05168+-0.109
lightGBM	0.73425+-0.060	

Subject-based Approach

In [137]:
sb_dict = dict()
feat = selected_features[-1]

for model in model_names:
    sb_dict[model] = []
    
for subject in subjects:
    X_sb = total_features[~total_features['index'].str.startswith(subject)]
    y_sb = X_sb['y']
    X_sb = X_sb.drop(['y', 'index'], axis=1)
    X_sb = X_sb[list(feat)]

    X_train_sb, X_test_sb, y_train_sb, y_test_sb = train_test_split(X_sb, y_sb, test_size=0.2, random_state=5)

    scaler_sb = StandardScaler().fit(X_train_sb)
    X_train_sb = scaler_sb.transform(X_train_sb)
    X_test_sb = scaler_sb.transform(X_test_sb)

    lg_model_sb = logistic_func(X_train_sb, X_test_sb, y_train_sb, y_test_sb)
    svm_model_sb = svm_func(X_train_sb, X_test_sb, y_train_sb, y_test_sb)
    rf_model_sb = randomforest_func(X_train_sb, X_test_sb, y_train_sb, y_test_sb)
    # nn_model = neuralnetworks_func(X_train, X_test, Y_train, Y_test)
    ada_model_sb = adaboost_func(X_train_sb, X_test_sb, y_train_sb, y_test_sb)
    nn_accurary_sb = neuralnetworks_func(X_train_sb, X_test_sb, y_train_sb, y_test_sb)
    xgb_accuracy_sb = xgboost_func(X_train_sb, X_test_sb, y_train_sb, y_test_sb)
    # Get the score
    models= [lg_model_sb, svm_model_sb, rf_model_sb, ada_model_sb]

    for model in models:
        score = model_predict(X_test_sb, y_test_sb, model)
        sb_dict[str(model)].append(model_predict(X_test_sb, y_test_sb, model))
    sb_dict["Sequential()"].append(nn_accurary_sb)
    sb_dict["LabelEncoder()"].append(xgb_accuracy_sb)

    # for model in models:
    #     sb_dict[str(model)].append(model_predict(X_test_sb, y_test_sb, model))
    
for a in sb_dict:
    print(np.mean(sb_dict[a]), np.std(sb_dict[a]))

0.7595789473684211 0.029213260501086486
0.7560584795321638 0.03490718859492099
0.7578713450292397 0.045916981691085046
0.7241520467836255 0.039160507713583356
0.7356842080752055 0.03299481856033999
0.14500584795321636 0.038223066837712226


Hybrid Approach

In [None]:
subjects = ['S2','S3','S4','S5','S6','S7','S8','S9','S10','S11','S13','S14','S15','S16','S17']
model_names = ["LogisticRegression()", "SVC()", "RandomForestClassifier()", "AdaBoostClassifier(learning_rate=1, random_state=5)", "Sequential()",
               "LabelEncoder()"]

first_line = ""
for model in model_names:
    first_line += "\t" + model
print(first_line)

x_trains = []
for subject in subjects:
    X_h = total_features[total_features['index'].str.startswith(subject)]
    scaler_hy = StandardScaler().fit(features)
    x_trains.append(scaler_hy.transform(features))


for i in range(len(selected_features)):
    feat = selected_features[i]
    scores_dict = dict()

    for model in model_names:
        scores_dict[model] = []

    for subject in subjects:
        X_test = total_features[total_features['index'].str.startswith(subject)]
        y_test = X_test['y']
        X_test = X_test.drop(['y', 'index'], axis=1)
        X_train = total_features[~total_features['index'].str.startswith(subject)]
        y_train = X_train['y']
        X_train = X_train.drop(['y', 'index'], axis=1)
        X_train = X_train[list(feat)]
        X_test = X_test[list(feat)]
        
        # fit the data
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # Get models
        lg_model = logistic_func(X_train, X_test, y_train, y_test)
        svm_model = svm_func(X_train, X_test, y_train, y_test)
        rf_model = randomforest_func(X_train, X_test, y_train, y_test)
        ada_model = adaboost_func(X_train, X_test, y_train, y_test)
        nn_accurary = neuralnetworks_func(X_train, X_test, y_train, y_test)
        xgb_accuracy = xgboost_func(X_train, X_test, y_train, y_test)
        
        # Get the score
        models= [lg_model, svm_model, rf_model, ada_model]

        for model in models:
            score = model_predict(X_test, y_test, model)
            scores_dict[str(model)].append(model_predict(X_test, y_test, model))
        scores_dict["Sequential()"].append(nn_accurary)
        scores_dict["LabelEncoder()"].append(xgb_accuracy)

    line = str(feature_sel_name[i])

    for a in scores_dict:
        score = scores_dict[a]
        line += "\t" + f"{np.mean(score):.5f}" + "+-" + f"{np.std(score):.3f}"

    # line += "\t" + f"{np.mean(nn_accurary):.5f}" + "+-" + f"{np.std(nn_accurary):.3f}"
    print(line)