In [57]:
import pandas as pd
import numpy as np
import os
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Construct DataFrames

In [58]:
# Directory containing the normalized no-overlap CSV files.
no_overlap_dir = r'/Users/apostoloskalatzis/Downloads/ML-Project/GAN_method_data/acute_stress/no_overlap/normalized'

# Keep only CSV files (skips stray entries such as .DS_Store) and sort them,
# because os.listdir returns names in arbitrary order.
filenames = sorted(
    name for name in os.listdir(no_overlap_dir)
    if name.lower().endswith('.csv')
)

# Read every file, then stack them into a single frame with a fresh index.
li = [pd.read_csv(os.path.join(no_overlap_dir, filename)) for filename in filenames]

no_overlap_data = pd.concat(li, axis=0, ignore_index=True)

In [59]:
# 30%-overlap acute-stress dataset, read from a single CSV file.
thirty_overlap_csv = r'/Users/apostoloskalatzis/Downloads/ML-Project/GAN_method_data/acute_stress/30_overlap/30_overlap.csv'
thirty_overlap_data = pd.read_csv(thirty_overlap_csv)

In [60]:
# 50%-overlap acute-stress dataset, read from a single CSV file.
fifty_overlap_csv = r'/Users/apostoloskalatzis/Downloads/ML-Project/GAN_method_data/acute_stress/50_overlap/50_overlap.csv'
fifty_overlap_data = pd.read_csv(fifty_overlap_csv)

In [61]:
# Participant IDs used as held-out folds (integer form).
# IDs absent from the list: 9, 10, 11, 15, 16, 18, 19 and 27.
IDS = [
    *range(1, 9),     # 1-8
    *range(12, 15),   # 12-14
    17,
    *range(20, 27),   # 20-26
    *range(28, 34),   # 28-33
]

In [62]:
# Same participant IDs as floats (1.0, 2.0, ...).
ID = [
    float(pid)
    for pid in (
        *range(1, 9),     # 1-8
        *range(12, 15),   # 12-14
        17,
        *range(20, 27),   # 20-26
        *range(28, 34),   # 28-33
    )
]

# Acute Stress Predictions GAN Method

## Random Forest

In [63]:
def RFLOOCV(data, ids, outcomevar, dropcols, idcolumn, random_state=None):
    """
    Run one leave-one-participant-out fold with a random forest classifier.

    Rows whose ``idcolumn`` value, compared as a string, equals ``str(ids)``
    form the test set; every other row is used for training.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the features, ``outcomevar``, ``idcolumn`` and every
        column named in ``dropcols``. The frame is not modified.
    ids : int or str
        Identifier of the participant to hold out.
    outcomevar : str
        Name of the label column.
    dropcols : list of str
        Columns removed before fitting (non-feature columns).
    idcolumn : str
        Name of the participant-identifier column.
    random_state : int or None, optional
        Seed forwarded to ``RandomForestClassifier``. The default ``None``
        leaves the forest unseeded, as before.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``Accuracy``, ``Precision``, ``Recall``,
        ``F1-Score`` and ``Participant``. Precision, recall and F1 treat
        label ``0`` as the positive class.

    Raises
    ------
    ValueError
        If no row belongs to ``ids`` or if every row does, i.e. the test or
        training split would be empty.
    """
    held_out = str(ids)

    # Compare IDs as strings on a temporary Series so the caller's frame
    # keeps its original ID dtype.
    is_test = data[idcolumn].map(str) == held_out

    # Fail before the expensive fit rather than inside scikit-learn.
    if not is_test.any():
        raise ValueError(
            f"No rows in column {idcolumn!r} match participant {held_out!r}"
        )
    if is_test.all():
        raise ValueError(
            f"Every row belongs to participant {held_out!r}; nothing left to train on"
        )

    # Test data - the person left out of training
    test_frame = data.loc[is_test].drop(columns=dropcols)
    # Train data - all other people in the dataframe
    train_frame = data.loc[~is_test].drop(columns=dropcols)

    # Fit and predict on plain arrays so both calls see the same input type.
    X_train = np.asarray(train_frame.drop(columns=[outcomevar]))
    y_train = np.asarray(train_frame[outcomevar])
    X_test = np.asarray(test_frame.drop(columns=[outcomevar]))
    y_test = np.asarray(test_frame[outcomevar])

    # 'sqrt' is what the former 'auto' setting meant for classifiers.
    forest = RandomForestClassifier(n_estimators=400, min_samples_split=10,
                                    min_samples_leaf=1, max_features='sqrt',
                                    max_depth=80, bootstrap=False,
                                    random_state=random_state)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    # Score the held-out participant; label 0 is the positive class.
    scores = {
        'Accuracy': [accuracy_score(y_test, predictions)],
        'Precision': [precision_score(y_test, predictions, pos_label=0, average='binary')],
        'Recall': [recall_score(y_test, predictions, pos_label=0, average='binary')],
        'F1-Score': [f1_score(y_test, predictions, pos_label=0, average='binary')],
        'Participant': [ids],
    }
    return pd.DataFrame(scores, columns=['Accuracy', 'Precision', 'Recall', 'F1-Score', 'Participant'])

### No overlap

In [64]:
# Random forest, no-overlap data: one held-out fold per participant ID.
li = [
    RFLOOCV(
        data=no_overlap_data,
        ids=participant_id,
        outcomevar='label',
        dropcols=['Unnamed: 0', 'ID'],
        idcolumn='ID',
    )
    for participant_id in IDS
]
# Stack the one-row fold results and persist them.
RFResults_no_overlap = pd.concat(li, axis=0, ignore_index=True)
RFResults_no_overlap.to_csv('/Users/apostoloskalatzis/Downloads/ML-Project/Results/Acute_stress_results/GAN_method/RF/no_overlap.csv')

In [65]:
# Mean accuracy over all held-out participants.
RFResults_no_overlap.loc[:, 'Accuracy'].mean()

0.9647619047619047

### 30% Overlap 

In [66]:
# Random forest, 30%-overlap data: one held-out fold per participant ID.
li = [
    RFLOOCV(
        data=thirty_overlap_data,
        ids=participant_id,
        outcomevar='label',
        dropcols=['Unnamed: 0', 'ID'],
        idcolumn='ID',
    )
    for participant_id in IDS
]
# Stack the one-row fold results and persist them.
RFResults_thirty_overlap = pd.concat(li, axis=0, ignore_index=True)
RFResults_thirty_overlap.to_csv('/Users/apostoloskalatzis/Downloads/ML-Project/Results/Acute_stress_results/GAN_method/RF/30_overlap.csv')

In [67]:
# Mean accuracy over all held-out participants.
RFResults_thirty_overlap.loc[:, 'Accuracy'].mean()

0.8632323232323234

### 50% Overlap

In [68]:
# Random forest, 50%-overlap data: one held-out fold per participant ID.
li = [
    RFLOOCV(
        data=fifty_overlap_data,
        ids=participant_id,
        outcomevar='label',
        dropcols=['Unnamed: 0', 'ID'],
        idcolumn='ID',
    )
    for participant_id in IDS
]
# Stack the one-row fold results and persist them.
RFResults_fifty_overlap = pd.concat(li, axis=0, ignore_index=True)
RFResults_fifty_overlap.to_csv('/Users/apostoloskalatzis/Downloads/ML-Project/Results/Acute_stress_results/GAN_method/RF/50_overlap.csv')

In [69]:
# Mean accuracy over all held-out participants.
RFResults_fifty_overlap.loc[:, 'Accuracy'].mean()

0.8025201465201465

## Linear SVM

In [70]:
 def LSVMLOOCV(data, ids, outcomevar, dropcols, idcolumn):
     """
     Run one leave-one-participant-out fold with a linear-kernel SVM.

     Rows whose ``idcolumn`` value, compared as a string, equals ``str(ids)``
     form the test set; every other row is used for training.

     Parameters
     ----------
     data : pandas.DataFrame
         Frame holding the features, ``outcomevar``, ``idcolumn`` and every
         column named in ``dropcols``. The frame is not modified.
     ids : int or str
         Identifier of the participant to hold out.
     outcomevar : str
         Name of the label column.
     dropcols : list of str
         Columns removed before fitting (non-feature columns).
     idcolumn : str
         Name of the participant-identifier column.

     Returns
     -------
     pandas.DataFrame
         One row with columns ``Accuracy``, ``Precision``, ``Recall``,
         ``F1-Score`` and ``Participant``. Precision, recall and F1 treat
         label ``0`` as the positive class.

     Raises
     ------
     ValueError
         If no row belongs to ``ids`` or if every row does, i.e. the test or
         training split would be empty.
     """
     held_out = str(ids)

     # Compare IDs as strings on a temporary Series so the caller's frame
     # keeps its original ID dtype.
     is_test = data[idcolumn].map(str) == held_out

     # Fail before fitting rather than inside scikit-learn.
     if not is_test.any():
         raise ValueError(
             f"No rows in column {idcolumn!r} match participant {held_out!r}"
         )
     if is_test.all():
         raise ValueError(
             f"Every row belongs to participant {held_out!r}; nothing left to train on"
         )

     # Test data - the person left out of training
     test_frame = data.loc[is_test].drop(columns=dropcols)
     # Train data - all other people in the dataframe
     train_frame = data.loc[~is_test].drop(columns=dropcols)

     # Fit and predict on plain arrays so both calls see the same input type.
     X_train = np.asarray(train_frame.drop(columns=[outcomevar]))
     y_train = np.asarray(train_frame[outcomevar])
     X_test = np.asarray(test_frame.drop(columns=[outcomevar]))
     y_test = np.asarray(test_frame[outcomevar])

     # gamma has no effect with kernel='linear'; it is kept so the call still
     # records the full tuned configuration.
     svm = SVC(C=1.120000138515899, class_weight=None, gamma=0.0009260906281129682, kernel='linear')
     svm.fit(X_train, y_train)
     predictions = svm.predict(X_test)

     # Score the held-out participant; label 0 is the positive class.
     scores = {
         'Accuracy': [accuracy_score(y_test, predictions)],
         'Precision': [precision_score(y_test, predictions, pos_label=0, average='binary')],
         'Recall': [recall_score(y_test, predictions, pos_label=0, average='binary')],
         'F1-Score': [f1_score(y_test, predictions, pos_label=0, average='binary')],
         'Participant': [ids],
     }
     return pd.DataFrame(scores, columns=['Accuracy', 'Precision', 'Recall', 'F1-Score', 'Participant'])

### No overlap

In [71]:
# Linear SVM, no-overlap data: one held-out fold per participant ID.
li = [
    LSVMLOOCV(
        data=no_overlap_data,
        ids=participant_id,
        outcomevar='label',
        dropcols=['Unnamed: 0', 'ID'],
        idcolumn='ID',
    )
    for participant_id in IDS
]
# Stack the one-row fold results and persist them.
LSVMResults_no_overlap = pd.concat(li, axis=0, ignore_index=True)
LSVMResults_no_overlap.to_csv('/Users/apostoloskalatzis/Downloads/ML-Project/Results/Acute_stress_results/GAN_method/LSVM/no_overlap.csv')

In [72]:
# Mean accuracy over all held-out participants.
LSVMResults_no_overlap.loc[:, 'Accuracy'].mean()

0.9540476190476191

### 30% Overlap

In [73]:
# Linear SVM, 30%-overlap data: one held-out fold per participant ID.
li = [
    LSVMLOOCV(
        data=thirty_overlap_data,
        ids=participant_id,
        outcomevar='label',
        dropcols=['Unnamed: 0', 'ID'],
        idcolumn='ID',
    )
    for participant_id in IDS
]
# Stack the one-row fold results and persist them.
LSVMResults_thirty_overlap = pd.concat(li, axis=0, ignore_index=True)
LSVMResults_thirty_overlap.to_csv('/Users/apostoloskalatzis/Downloads/ML-Project/Results/Acute_stress_results/GAN_method/LSVM/30_overlap.csv')

In [74]:
# Mean accuracy over all held-out participants.
LSVMResults_thirty_overlap.loc[:, 'Accuracy'].mean()

0.8627878787878788

### 50% Overlap

In [75]:
# Linear SVM, 50%-overlap data: one held-out fold per participant ID.
li = [
    LSVMLOOCV(
        data=fifty_overlap_data,
        ids=participant_id,
        outcomevar='label',
        dropcols=['Unnamed: 0', 'ID'],
        idcolumn='ID',
    )
    for participant_id in IDS
]
# Stack the one-row fold results and persist them.
LSVMResults_fifty_overlap = pd.concat(li, axis=0, ignore_index=True)
LSVMResults_fifty_overlap.to_csv('/Users/apostoloskalatzis/Downloads/ML-Project/Results/Acute_stress_results/GAN_method/LSVM/50_overlap.csv')

In [76]:
# Mean accuracy over all held-out participants.
LSVMResults_fifty_overlap.loc[:, 'Accuracy'].mean()

0.829772893772894

## Non Linear SVM 

In [77]:
 def NLSVMLOOCV(data, ids, outcomevar, dropcols, idcolumn):
     """
     Run one leave-one-participant-out fold with an RBF-kernel SVM.

     Rows whose ``idcolumn`` value, compared as a string, equals ``str(ids)``
     form the test set; every other row is used for training.

     Parameters
     ----------
     data : pandas.DataFrame
         Frame holding the features, ``outcomevar``, ``idcolumn`` and every
         column named in ``dropcols``. The frame is not modified.
     ids : int or str
         Identifier of the participant to hold out.
     outcomevar : str
         Name of the label column.
     dropcols : list of str
         Columns removed before fitting (non-feature columns).
     idcolumn : str
         Name of the participant-identifier column.

     Returns
     -------
     pandas.DataFrame
         One row with columns ``Accuracy``, ``Precision``, ``Recall``,
         ``F1-Score`` and ``Participant``. Precision, recall and F1 treat
         label ``0`` as the positive class.

     Raises
     ------
     ValueError
         If no row belongs to ``ids`` or if every row does, i.e. the test or
         training split would be empty.
     """
     held_out = str(ids)

     # Compare IDs as strings on a temporary Series so the caller's frame
     # keeps its original ID dtype.
     is_test = data[idcolumn].map(str) == held_out

     # Fail before fitting rather than inside scikit-learn.
     if not is_test.any():
         raise ValueError(
             f"No rows in column {idcolumn!r} match participant {held_out!r}"
         )
     if is_test.all():
         raise ValueError(
             f"Every row belongs to participant {held_out!r}; nothing left to train on"
         )

     # Test data - the person left out of training
     test_frame = data.loc[is_test].drop(columns=dropcols)
     # Train data - all other people in the dataframe
     train_frame = data.loc[~is_test].drop(columns=dropcols)

     # Fit and predict on plain arrays so both calls see the same input type.
     X_train = np.asarray(train_frame.drop(columns=[outcomevar]))
     y_train = np.asarray(train_frame[outcomevar])
     X_test = np.asarray(test_frame.drop(columns=[outcomevar]))
     y_test = np.asarray(test_frame[outcomevar])

     svm = SVC(C=556.7119515302165, class_weight=None, gamma=0.0009621376423426009, kernel='rbf')
     svm.fit(X_train, y_train)
     predictions = svm.predict(X_test)

     # Score the held-out participant; label 0 is the positive class.
     scores = {
         'Accuracy': [accuracy_score(y_test, predictions)],
         'Precision': [precision_score(y_test, predictions, pos_label=0, average='binary')],
         'Recall': [recall_score(y_test, predictions, pos_label=0, average='binary')],
         'F1-Score': [f1_score(y_test, predictions, pos_label=0, average='binary')],
         'Participant': [ids],
     }
     return pd.DataFrame(scores, columns=['Accuracy', 'Precision', 'Recall', 'F1-Score', 'Participant'])

### No overlap

In [78]:
# RBF SVM, no-overlap data: one held-out fold per participant ID.
li = [
    NLSVMLOOCV(
        data=no_overlap_data,
        ids=participant_id,
        outcomevar='label',
        dropcols=['Unnamed: 0', 'ID'],
        idcolumn='ID',
    )
    for participant_id in IDS
]
# Stack the one-row fold results and persist them.
NLSVMResults_no_overlap = pd.concat(li, axis=0, ignore_index=True)
NLSVMResults_no_overlap.to_csv('/Users/apostoloskalatzis/Downloads/ML-Project/Results/Acute_stress_results/GAN_method/NLSVM/no_overlap.csv')

In [79]:
# Mean accuracy over all held-out participants.
# NOTE(review): the outputs recorded for the three non-linear SVM runs equal
# the linear-SVM means digit for digit - confirm these cells were re-run
# after NLSVMLOOCV was (re)defined.
NLSVMResults_no_overlap.loc[:, 'Accuracy'].mean()

0.9540476190476191

### 30% Overlap

In [80]:
# RBF SVM, 30%-overlap data: one held-out fold per participant ID.
li = [
    NLSVMLOOCV(
        data=thirty_overlap_data,
        ids=participant_id,
        outcomevar='label',
        dropcols=['Unnamed: 0', 'ID'],
        idcolumn='ID',
    )
    for participant_id in IDS
]
# Stack the one-row fold results and persist them.
NLSVMResults_thirty_overlap = pd.concat(li, axis=0, ignore_index=True)
NLSVMResults_thirty_overlap.to_csv('/Users/apostoloskalatzis/Downloads/ML-Project/Results/Acute_stress_results/GAN_method/NLSVM/30_overlap.csv')

In [81]:
# Mean accuracy over all held-out participants.
NLSVMResults_thirty_overlap.loc[:, 'Accuracy'].mean()

0.8627878787878788

### 50% Overlap

In [82]:
# RBF SVM, 50%-overlap data: one held-out fold per participant ID.
li = [
    NLSVMLOOCV(
        data=fifty_overlap_data,
        ids=participant_id,
        outcomevar='label',
        dropcols=['Unnamed: 0', 'ID'],
        idcolumn='ID',
    )
    for participant_id in IDS
]
# Stack the one-row fold results and persist them.
NLSVMResults_fifty_overlap = pd.concat(li, axis=0, ignore_index=True)
NLSVMResults_fifty_overlap.to_csv('/Users/apostoloskalatzis/Downloads/ML-Project/Results/Acute_stress_results/GAN_method/NLSVM/50_overlap.csv')

In [83]:
# Mean accuracy over all held-out participants.
NLSVMResults_fifty_overlap.loc[:, 'Accuracy'].mean()

0.829772893772894