# Major Depressive Disorder Diagnosis

In [102]:
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [204]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

- - -

# Data Handling

## Dataset check

In [6]:
# Load the HRV dataset and preview its first rows.
# NOTE(review): the path is an absolute, machine-specific location.
HRV_CSV_PATH = 'E:/RESEARCH/Datasets/HRV/HRV_samsung/HRV_REV_all.csv'
hrv_df = pd.read_csv(HRV_CSV_PATH, sep=',')
hrv_df.head()

Unnamed: 0,sub,VISIT,disorder,age,gender,HAMD,HAMA,PDSS,ASI,APPQ,...,cRMSSD,cVLF,cLF,cHF,cLF/HF,cPOWER,cHR,cRESP,cSC,cTEMP
0,E001,4,2,23,1,2,2,1,12,22,...,41.544667,190.107,298.508333,206.862333,1.284,695.477333,65.707,14.054333,3.911333,34.998
1,E001,5,2,23,1,12,7,0,12,24,...,39.825333,143.756667,115.695333,202.602667,0.585,462.054667,69.04,14.117333,6.255,35.544333
2,E002,1,2,38,1,14,17,14,31,122,...,20.052,22.006,50.182,32.529333,2.499333,104.717,92.789333,11.013333,0.945667,35.086
3,E002,2,2,38,1,13,36,16,32,139,...,20.201667,55.579,84.441,18.754,5.803,158.774333,85.980667,12.608333,0.785667,36.141
4,E002,3,2,38,1,7,10,11,23,70,...,74.788,182.229,530.565667,546.574,1.685667,1259.368667,84.368667,14.285667,0.648,35.879


* disorder(=label): 1(Depression), 2(Panic Disorder), 3(Control)

In [7]:
# Dimensions of the loaded dataset as (rows, columns).
hrv_df.shape

(479, 93)

HRV measuring steps
* b1 - s - b2 - r - b3 - c
* Each has following variables (Total 13 variables)
> SDNN, NN50, PNN50, RMSSD, VLF, LF, HF, LF/HF, POWER, HR, RESP, SC, TEMP

- - -

## Data preprocessing

In [65]:
## Split the HRV dataset into one frame per experimental step by selecting
## the columns whose names start with that step's prefix.
## NOTE: '^s' also matches the 'sub' identifier column, so hrv_s still
## carries 'sub' after this cell.
hrv = hrv_df  # alias of the full dataset (same object, not a copy)
phase_patterns = ('^b1', '^s', '^b2', '^r', '^b3', '^c')
hrv_b1, hrv_s, hrv_b2, hrv_r, hrv_b3, hrv_c = (
    hrv_df.filter(regex=pattern) for pattern in phase_patterns
)

In [72]:
## Check whether each phase contains the same number of variables.
## NOTE(review): when the cells run top to bottom, hrv_s still includes the
## 'sub' column at this point (it is dropped in a later cell), so its count
## may read one higher than the others -- confirm.
phase_frames = [
    ("HRV baseline #1 shape is:", hrv_b1),
    ("HRV stress shape is:", hrv_s),
    ("HRV baseline #2 shape is:", hrv_b2),
    ("HRV rest shape is:", hrv_r),
    ("HRV baseline #3 shape is:", hrv_b3),
    ("HRV c shape is:", hrv_c),  # was hrv_b1, so the 'c' phase was never actually checked
]
for message, frame in phase_frames:
    # shape[1] is the number of columns (variables) in the phase frame.
    print(message, frame.shape[1])

HRV baseline #1 shape is: 13
HRV stress shape is: 13
HRV baseline #2 shape is: 13
HRV rest shape is: 13
HRV baseline #3 shape is: 13
HRV c shape is: 13


In [70]:
## The '^s' prefix filter also picked up the 'sub' identifier column; remove
## it from the stress-phase frame. errors='ignore' keeps the cell safe to
## re-run once 'sub' has already been dropped (otherwise drop raises KeyError).
hrv_s = hrv_s.drop(columns=['sub'], errors='ignore')

In [212]:
## Renaming the columns for further calculation.
## Every phase frame gets the same column names so that frames of different
## phases can be subtracted from each other column by column.
hrv_sub = hrv_df.loc[:, ['sub']]
hrv_disorder = hrv_df.loc[:, ['disorder']] - 1  ## 0(Depression), 1(Panic Disorder), 2(Control)
hrv_gender = hrv_df.loc[:, ['gender']]
hrv_HAMD = hrv_df.loc[:, ['HAMD']]
hrv_PDSS = hrv_df.loc[:, ['PDSS']]

## Single source of truth for the per-phase variable names.
## set_axis requires each frame to have exactly len(hrv_variables) columns,
## so hrv_s must no longer contain the 'sub' column when this cell runs.
hrv_variables = ["SDNN", "NN50", "PNN50", "RMSSD", "VLF", "LF", "HF", "LF/HF", "POWER", "HR", "RESP", "SC", "TEMP"]
hrv_b1_rename = hrv_b1.set_axis(hrv_variables, axis=1)
hrv_b2_rename = hrv_b2.set_axis(hrv_variables, axis=1)
hrv_b3_rename = hrv_b3.set_axis(hrv_variables, axis=1)
hrv_s_rename = hrv_s.set_axis(hrv_variables, axis=1)
hrv_r_rename = hrv_r.set_axis(hrv_variables, axis=1)
hrv_c_rename = hrv_c.set_axis(hrv_variables, axis=1)

In [213]:
## Number of rows per re-coded disorder label.
hrv_disorder.value_counts()

disorder
2           194
1           149
0           136
dtype: int64

- - -

## Comparisons between Phases

* HRV measuring steps: b1 - s - b2 - r - b3 - c
* Each has following variables (Total 13 variables): SDNN, NN50, PNN50, RMSSD, VLF, LF, HF, LF/HF, POWER, HR, RESP, SC, TEMP

Since the experimental phases run in the order "b1-s-b2-r-b3-c", there are 5 comparisons between consecutive phases in total.

### 1) Baseline 1 - Stress phase

In [84]:
## Column-wise difference: baseline 1 minus stress.
hrv_b1_s_sub = hrv_b1_rename.sub(hrv_s_rename)
hrv_b1_s_sub.head()

Unnamed: 0,SDNN,NN50,PNN50,RMSSD,VLF,LF,HF,LF/HF,POWER,HR,RESP,SC,TEMP
0,0.287667,4.666667,0.086333,10.664333,-2.756,-279.967,184.906667,-2.611,-97.816333,-4.520333,2.624333,-4.828,1.002
1,-21.138333,3.666667,0.070667,-24.260667,-373.648667,-1140.831334,-1154.247667,-1.071333,-2668.727667,-8.211,1.535,-9.487,0.614
2,2.049333,0.0,0.0,-1.726,76.143333,-29.314667,-26.714333,1.287667,20.114667,-2.449667,2.892333,-0.319667,-0.020333
3,6.312333,0.0,0.0,1.099333,98.432667,136.848,24.599,2.798,259.880333,-0.202667,4.266333,-0.033667,-0.057667
4,28.445667,3.0,0.032667,43.376,57.050333,46.734333,59.237667,-0.705667,163.023,2.514667,3.533,-0.017,-0.001


### 2) Stress - Baseline 2 phase

In [85]:
## Column-wise difference: stress minus baseline 2.
hrv_s_b2_sub = hrv_s_rename.sub(hrv_b2_rename)
hrv_s_b2_sub.head()

Unnamed: 0,SDNN,NN50,PNN50,RMSSD,VLF,LF,HF,LF/HF,POWER,HR,RESP,SC,TEMP
0,-11.36,-5.333333,-0.096667,-18.286,-58.71,-66.028667,-406.971333,2.004,-531.71,4.353333,-1.105333,-0.297667,-0.856667
1,9.035667,-4.333333,-0.080333,20.483333,234.864333,1052.266334,993.647,1.265333,2280.777667,8.344667,0.931,2.27,0.033667
2,2.347,0.0,0.0,3.369,13.332333,71.931667,45.656333,-0.551333,130.92,3.359667,-1.622667,0.132,0.003
3,-5.727667,0.0,0.0,-2.342667,-48.445667,-33.052,-11.412,0.121,-92.909667,4.459,-3.775667,0.003333,-0.043667
4,25.174667,1.666667,0.018333,16.042,-6.794,182.404333,200.162,-0.467333,375.771667,2.845333,-3.480667,-0.013333,-0.057667


### 3) Baseline 2 - Rest phase

In [86]:
## Column-wise difference: baseline 2 minus rest.
hrv_b2_r_sub = hrv_b2_rename.sub(hrv_r_rename)
hrv_b2_r_sub.head()

Unnamed: 0,SDNN,NN50,PNN50,RMSSD,VLF,LF,HF,LF/HF,POWER,HR,RESP,SC,TEMP
0,10.395333,-1.0,-0.012,6.659,-44.986,-60.517,89.346333,-0.14,-16.157,-0.738333,-0.327333,2.811667,-0.516667
1,8.564,2.333333,0.038667,4.713,88.049333,-244.483333,115.041667,-0.824333,-41.393,-1.832333,-0.871,4.765,-0.102333
2,3.167333,0.0,0.0,0.588667,-18.453333,-2.815667,-6.873333,0.441,-28.142,-2.467333,0.249667,0.058333,0.066
3,-10.102333,-1.0,-0.012,-6.834,7.107667,-150.309,-52.806333,-0.724,-196.007667,2.282,0.243467,-0.048667,0.001667
4,-60.977,-1.333333,-0.016333,-79.491,-46.755667,-70.231333,-440.990333,1.082333,-557.976667,1.675667,-0.406267,-0.018333,-0.025


### 4) Rest - Baseline 3 phase

In [88]:
## Column-wise difference: rest minus baseline 3.
hrv_r_b3_sub = hrv_r_rename.sub(hrv_b3_rename)
hrv_r_b3_sub.head()

Unnamed: 0,SDNN,NN50,PNN50,RMSSD,VLF,LF,HF,LF/HF,POWER,HR,RESP,SC,TEMP
0,0.013,3.0,0.052333,4.031,103.554,184.618,56.917333,0.183,345.089,-1.751333,-0.039,0.217,0.008
1,-10.302,4.0,0.066667,2.679,-335.397667,-1035.652,-117.790667,-1.463,-1488.84,-3.499333,-0.183,0.000333,0.033667
2,-6.37,0.0,0.0,-0.883333,-9.966667,-36.395667,0.591667,-0.573,-45.771,0.824667,0.156333,0.031,-0.006333
3,2.291667,0.333333,0.004,0.361,-9.414,29.256667,20.084333,-0.369,39.926667,0.664667,0.459533,-0.097667,0.031
4,22.474,-2.0,-0.024,29.576,-33.946333,-416.315333,-38.017,-0.863333,-488.278333,1.778333,1.090933,-0.048,0.232333


### 5) Baseline 3 - Recovery phase

In [89]:
## Column-wise difference: baseline 3 minus the final 'c' phase.
hrv_b3_c_sub = hrv_b3_rename.sub(hrv_c_rename)
hrv_b3_c_sub.head()

Unnamed: 0,SDNN,NN50,PNN50,RMSSD,VLF,LF,HF,LF/HF,POWER,HR,RESP,SC,TEMP
0,12.304667,5.0,0.082,11.538,-87.053,22.708667,183.949333,-0.400667,119.605667,-2.086,0.554,-0.973,0.049667
1,11.43,0.0,-0.000333,3.136,344.203333,1457.863334,196.513333,2.423,1998.580333,0.804333,-0.045667,-3.214333,-0.499667
2,5.140667,-0.333333,-0.003667,-6.298333,47.976667,35.215667,-2.485667,-0.249,80.707333,0.584667,4.988667,-0.041,0.039
3,7.316333,0.666667,0.008,1.034,28.962333,110.159333,48.298,-2.942667,187.419333,-2.421667,4.454,0.009333,0.096667
4,-4.424,-4.333333,-0.051333,3.227667,-66.960667,53.581,-23.631,0.085667,-37.011334,-1.356,2.324333,-0.033333,-0.065667


### 6) Stress - Rest phase

* This is what SMC checks for the research

In [81]:
## Column-wise difference: stress minus rest.
## This frame is the one used as the feature matrix further down.
hrv_s_r_sub = hrv_s_rename.sub(hrv_r_rename)
hrv_s_r_sub.head()

Unnamed: 0,SDNN,NN50,PNN50,RMSSD,VLF,LF,HF,LF/HF,POWER,HR,RESP,SC,TEMP
0,-0.964667,-6.333333,-0.108667,-11.627,-103.696,-126.545667,-317.625,1.864,-547.867,3.615,-1.432667,2.514,-1.373333
1,17.599667,-2.0,-0.041667,25.196333,322.913667,807.783,1108.688667,0.441,2239.384667,6.512333,0.06,7.035,-0.068667
2,5.514333,0.0,0.0,3.957667,-5.121,69.116,38.783,-0.110333,102.778,0.892333,-1.373,0.190333,0.069
3,-15.83,-1.0,-0.012,-9.176667,-41.338,-183.361,-64.218333,-0.603,-288.917333,6.741,-3.5322,-0.045333,-0.042
4,-35.802333,0.333333,0.002,-63.449,-53.549667,112.173,-240.828333,0.615,-182.205,4.521,-3.886933,-0.031667,-0.082667


- - -

## Data Selection

In [99]:
## Stress-minus-rest features and the disorder label side by side in one frame.
feature_and_label_frames = [hrv_s_r_sub, hrv_disorder]
hrv_data = pd.concat(feature_and_label_frames, axis="columns")

In [214]:
## Features (stress-minus-rest differences) and target (re-coded disorder label).
X, Y = hrv_s_r_sub, hrv_disorder

- - -

# Data Analysis

## Train-Test Split

In [163]:
## Feature names currently held in X.
X.columns

Index(['SDNN', 'NN50', 'PNN50', 'RMSSD', 'VLF', 'LF', 'HF', 'LF/HF', 'POWER',
       'HR', 'RESP', 'SC', 'TEMP'],
      dtype='object')

In [129]:
## Reduced feature set: keep only five of the HRV variables.
## NOTE(review): this subset has no 'POWER' column, while the recorded split
## further down has 12 feature columns -- this cell does not appear to be part
## of the final run; confirm whether it should stay.
selected_features = ["PNN50", "VLF","LF","HF","LF/HF"]
X = X.loc[:, selected_features]

In [216]:
## Use every stress-minus-rest variable except 'POWER' as features.
## Built from hrv_s_r_sub instead of from X itself so the cell is idempotent:
## the original `X = X.drop(...)` raised KeyError when re-run, or when X had
## already been narrowed to a subset without 'POWER'.
X = hrv_s_r_sub.drop(columns=['POWER'])

In [217]:
## Hold out 20% of the rows as a test set; the seed makes the split repeatable.
## NOTE(review): the data preview shows several visits for the same subject
## ('sub'), and a row-wise random split can place visits of one subject in both
## the training and the test set -- consider a group-aware split; confirm.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

In [218]:
## Report the (rows, columns) shape of every part of the split.
split_report = [
    ("x_train dataset shape is", x_train),
    ("y_train dataset shape is", y_train),
    ("x_test dataset shape is", x_test),
    ("y_test dataset shape is", y_test),
]
for message, part in split_report:
    print(message, part.shape)

x_train dataset shape is (383, 12)
y_train dataset shape is (383, 1)
x_test dataset shape is (96, 12)
y_test dataset shape is (96, 1)


## Logistic Regression

In [185]:
## Fit a logistic-regression classifier on the training split.
## y_train is a one-column DataFrame, so it is flattened to a 1-D array first.
train_labels = y_train.values.ravel()
logistic_reg = LogisticRegression(solver='lbfgs', max_iter = 4000)
logistic_reg.fit(x_train, train_labels)

LogisticRegression(max_iter=4000)

In [188]:
## Predicted labels for the held-out test features (logistic regression).
predictions = logistic_reg.predict(x_test)

In [189]:
## Confusion matrix of the logistic-regression predictions (true labels first).
logreg_confusion = confusion_matrix(y_test, predictions)
print(logreg_confusion)

[[ 5  1 19]
 [ 5  8 26]
 [ 5  3 24]]


## Decision Tree

In [191]:
## Fit a decision-tree classifier on the training split.
## The constructor line was commented out, which left `dt_model` undefined on a
## fresh kernel (NameError on Restart & Run All). It is restored here, seeded
## with the same value as the train/test split so results are repeatable.
dt_model = DecisionTreeClassifier(random_state=42)
## Flatten the one-column target to 1-D, as done for the other classifiers.
dt_model.fit(x_train, y_train.values.ravel())

DecisionTreeClassifier()

In [192]:
## Predicted labels for the held-out test features (decision tree).
dt_pred = dt_model.predict(x_test)

In [193]:
## Confusion matrix of the decision-tree predictions (true labels first).
dt_confusion = confusion_matrix(y_test, dt_pred)
print(dt_confusion)

[[12  2 11]
 [11  9 19]
 [ 6  5 21]]


In [194]:
## Per-class precision / recall / F1 for the decision tree.
dt_report = classification_report(y_test, dt_pred)
print(dt_report)

              precision    recall  f1-score   support

           1       0.41      0.48      0.44        25
           2       0.56      0.23      0.33        39
           3       0.41      0.66      0.51        32

    accuracy                           0.44        96
   macro avg       0.46      0.46      0.43        96
weighted avg       0.47      0.44      0.42        96



## Random Forest Classification

In [197]:
## Fit a 500-tree random forest on the training split.
## random_state pins the otherwise random tree construction so the reported
## metrics can be reproduced; 42 matches the seed used for the train/test split.
rf = RandomForestClassifier(n_estimators=500, random_state=42)
## y_train is a one-column DataFrame, so it is flattened to a 1-D array.
rf.fit(x_train, y_train.values.ravel())

RandomForestClassifier(n_estimators=500)

In [198]:
## Predicted labels for the held-out test features (random forest).
rf_pre=rf.predict(x_test)

In [199]:
## Confusion matrix of the random-forest predictions (true labels first).
rf_confusion = confusion_matrix(y_test, rf_pre)
print(rf_confusion)

[[ 9  4 12]
 [ 8  7 24]
 [ 4  7 21]]


In [200]:
## Per-class precision / recall / F1 for the random forest.
rf_report = classification_report(y_test, rf_pre)
print(rf_report)

              precision    recall  f1-score   support

           1       0.43      0.36      0.39        25
           2       0.39      0.18      0.25        39
           3       0.37      0.66      0.47        32

    accuracy                           0.39        96
   macro avg       0.40      0.40      0.37        96
weighted avg       0.39      0.39      0.36        96



## XGBoost Classifier

In [219]:
## Fit an XGBoost classifier with 1000 boosting rounds on the training split.
## NOTE: the variable shares its name with the `xgboost` package; later cells
## refer to the fitted model by this name.
xgb_params = {"n_estimators": 1000}
xgboost = XGBClassifier(**xgb_params)
xgboost.fit(x_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=16,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [222]:
## Predicted labels for the held-out test features (XGBoost).
xg_pred = xgboost.predict(x_test)

In [224]:
## Confusion matrix of the XGBoost predictions (true labels first).
xg_confusion = confusion_matrix(y_test, xg_pred)
print(xg_confusion)

[[ 9  5 11]
 [ 8 11 20]
 [ 7  6 19]]


In [223]:
## Per-class precision / recall / F1 for XGBoost.
xg_report = classification_report(y_test, xg_pred)
print(xg_report)

              precision    recall  f1-score   support

           0       0.38      0.36      0.37        25
           1       0.50      0.28      0.36        39
           2       0.38      0.59      0.46        32

    accuracy                           0.41        96
   macro avg       0.42      0.41      0.40        96
weighted avg       0.43      0.41      0.40        96



## Multi-Layer Perceptron