# Major Depressive Disorder Diagnosis

In [1]:
import os
import time
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torchvision
import torch.nn as nn

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

from torch.nn import functional as F
from torch.autograd import Variable

In [3]:
# Report the installed PyTorch / Torchvision versions for reproducibility.
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)

PyTorch Version:  1.7.1
Torchvision Version:  0.8.2


In [4]:
class Args:
    """Notebook-wide settings stored as plain class attributes.

    `epochs` is read by the MLP training loop and `seed` by the seeding calls
    below; the remaining fields are not referenced in the visible cells.
    """
    # arguments
    epochs = 50
    bs = 16
    lr = 0.001
    momentum = 0.9
    num_classes = 3
    # NOTE(review): looks like a leftover argparse action name rather than a boolean — confirm.
    verbose = 'store_true'
    seed = 674


args = Args()

# Seed Python, NumPy and PyTorch from the same value.
# torch.manual_seed stays last so the cell still displays the returned Generator.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

<torch._C.Generator at 0x1edfc781b50>

In [5]:
# Setting torch environment: use the GPU when CUDA is available, otherwise the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Using PyTorch version:', torch.__version__, ' Device: ', DEVICE)

Using PyTorch version: 1.7.1  Device:  cuda


- - -

# Data Handling

## Dataset check

In [6]:
# Load the HRV dataset.
# The CSV location can be overridden through the HRV_DATA_PATH environment variable,
# so the notebook also runs on machines without the original drive layout;
# when the variable is unset the original path is used unchanged.
HRV_CSV_PATH = os.environ.get('HRV_DATA_PATH', 'E:/RESEARCH/Datasets/HRV/HRV_samsung/HRV_REV_all.csv')
hrv_df = pd.read_csv(HRV_CSV_PATH, sep=',')
hrv_df.head()

Unnamed: 0,sub,VISIT,disorder,age,gender,HAMD,HAMA,PDSS,ASI,APPQ,...,cRMSSD,cVLF,cLF,cHF,cLF/HF,cPOWER,cHR,cRESP,cSC,cTEMP
0,E001,4,2,23,1,2,2,1,12,22,...,41.544667,190.107,298.508333,206.862333,1.284,695.477333,65.707,14.054333,3.911333,34.998
1,E001,5,2,23,1,12,7,0,12,24,...,39.825333,143.756667,115.695333,202.602667,0.585,462.054667,69.04,14.117333,6.255,35.544333
2,E002,1,2,38,1,14,17,14,31,122,...,20.052,22.006,50.182,32.529333,2.499333,104.717,92.789333,11.013333,0.945667,35.086
3,E002,2,2,38,1,13,36,16,32,139,...,20.201667,55.579,84.441,18.754,5.803,158.774333,85.980667,12.608333,0.785667,36.141
4,E002,3,2,38,1,7,10,11,23,70,...,74.788,182.229,530.565667,546.574,1.685667,1259.368667,84.368667,14.285667,0.648,35.879


* disorder(=label): 1(Depression), 2(Panic Disorder), 3(Control)

In [7]:
# (rows, columns) of the full dataset.
hrv_df.shape

(479, 93)

HRV measuring steps
* b1 - s - b2 - r - b3 - c
* Each has following variables (Total 13 variables)
> SDNN, NN50, PNN50, RMSSD, VLF, LF, HF, LF/HF, POWER, HR, RESP, SC, TEMP

## Dataset Separation

* disorder= 1:(Depression), 2:(Panic Disorder), 3:(Control)

In [8]:
# List every column name of the loaded dataset.
hrv_df.columns

Index(['sub', 'VISIT', 'disorder', 'age', 'gender', 'HAMD', 'HAMA', 'PDSS',
       'ASI', 'APPQ', 'PSWQ', 'SPI', 'PSS', 'BIS', 'SSI', 'b1SDNN', 'b1NN50',
       'b1PNN50', 'b1RMSSD', 'b1VLF', 'b1LF', 'b1HF', 'b1LF/HF', 'b1POWER',
       'b1HR', 'b1RESP', 'b1SC', 'b1TEMP', 'sSDNN', 'sNN50', 'sPNN50',
       'sRMSSD', 'sVLF', 'sLF', 'sHF', 'sLF/HF', 'sPOWER', 'sHR', 'sRESP',
       'sSC', 'sTEMP', 'b2SDNN', 'b2NN50', 'b2PNN50', 'b2RMSSD', 'b2VLF',
       'b2LF', 'b2HF', 'b2LF/HF', 'b2POWER', 'b2HR', 'b2RESP', 'b2SC',
       'b2TEMP', 'rSDNN', 'rNN50', 'rPNN50', 'rRMSSD', 'rVLF', 'rLF', 'rHF',
       'rLF/HF', 'rPOWER', 'rHR', 'rRESP', 'rSC', 'rTEMP', 'b3SDNN', 'b3NN50',
       'b3PNN50', 'b3RMSSD', 'b3VLF', 'b3LF', 'b3HF', 'b3LF/HF', 'b3POWER',
       'b3HR', 'b3RESP', 'b3SC', 'b3TEMP', 'cSDNN', 'cNN50', 'cPNN50',
       'cRMSSD', 'cVLF', 'cLF', 'cHF', 'cLF/HF', 'cPOWER', 'cHR', 'cRESP',
       'cSC', 'cTEMP'],
      dtype='object')

In [9]:
# Number of rows per disorder code.
hrv_df["disorder"].value_counts() ## MDD 136, PD 149, Control 194

3    194
2    149
1    136
Name: disorder, dtype: int64

In [10]:
# One frame per classification task, selected by disorder code.
# hrv_ALL is the same object as hrv_df (no copy is made).
hrv_ALL = hrv_df
hrv_MDDPD = hrv_df[hrv_df["disorder"].isin([1,2])]  ## for MDD vs PD task
hrv_MDDC = hrv_df[hrv_df["disorder"].isin([1,3])]   ## for MDD vs Control task
hrv_PDC = hrv_df[hrv_df["disorder"].isin([2,3])]    ## for PD  vs Control task

In [14]:
# (rows, columns) of the MDD vs Control subset.
hrv_MDDC.shape

(330, 93)

- - -

## Data preprocessing

In [13]:
## Scaler used for feature scaling.
## MinMaxScaler performs min-max scaling per column, not z-score standardization.
scaler = MinMaxScaler()

In [18]:
## Selecting hrv dataset for the task.
## Exactly one assignment should be active; the rest of the notebook reads `hrv`.
# hrv = hrv_ALL
# hrv = hrv_MDDPD
hrv = hrv_MDDC
# hrv = hrv_PDC

In [19]:
# Display the selected task frame (the notebook truncates the rendering).
hrv

Unnamed: 0,sub,VISIT,disorder,age,gender,HAMD,HAMA,PDSS,ASI,APPQ,...,cRMSSD,cVLF,cLF,cHF,cLF/HF,cPOWER,cHR,cRESP,cSC,cTEMP
7,E003,1,1,57,2,17,7,0,7,24,...,11.042667,132.564333,85.853000,15.020333,5.159667,233.437667,76.412667,12.575667,0.655000,35.631333
8,E003,2,1,57,2,7,7,0,16,23,...,17.207000,38.888000,33.040000,12.504333,3.691333,84.432667,56.474000,12.367667,0.347000,33.237000
9,E003,3,1,57,2,5,3,0,6,11,...,58.545667,30.004333,26.829333,59.476000,0.653000,116.309667,53.746667,13.226667,0.575000,32.853000
10,E003,4,1,57,2,1,1,0,20,31,...,17.261000,37.713333,34.427333,16.011333,2.146000,88.152000,62.778667,12.404000,0.773000,35.046000
11,E003,5,1,57,2,12,13,0,3,12,...,34.473333,40.431333,18.012333,52.686667,0.343333,111.129667,63.868000,10.994667,2.080333,34.751667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455,E103,1,1,71,1,20,14,0,47,114,...,13.506000,10.706000,13.746700,12.347300,14.575300,36.799700,86.526300,10.268700,4.586330,34.528700
456,E103,2,1,71,1,14,3,0,41,84,...,11.523700,34.633700,16.170700,3.551670,5.129330,54.356000,74.020700,11.658700,3.782000,33.023700
457,E103,3,1,71,1,16,15,0,33,56,...,22.506700,18.495700,22.447300,38.283000,1.808670,79.226300,73.282300,12.570000,4.173670,32.164700
458,E103,4,1,71,1,0,0,0,28,65,...,69.218700,41.291700,39.600700,225.653000,0.177000,306.545000,65.475700,13.548300,1.566000,31.549000


In [20]:
## Separating HRV dataset by experimental steps.
# Keep only the HRV feature columns: identifiers, demographics and clinical scales are dropped.
hrv_only = hrv.drop(columns=['sub', 'VISIT', 'disorder', 'age','gender','HAMD', 'HAMA', 'PDSS', 'ASI', 'APPQ','PSWQ','SPI','PSS','BIS','SSI'])
# Min-max scale every HRV column of hrv_only, fitting the scaler on all rows of the selected task.
# NOTE(review): the scaler is fitted before the train/test split — confirm this is acceptable.
hrv_only[:] = scaler.fit_transform(hrv_only[:])
# NOTE(review): the per-phase frames below are filtered from the unscaled `hrv`, not from
# `hrv_only`, so the scaling above does not reach them — confirm which one is intended.
hrv_b1 = hrv.filter(regex='^b1')
# '^s' also matches the 'sub' column, so this frame carries one extra column at this point.
hrv_s = hrv.filter(regex='^s')
hrv_b2 = hrv.filter(regex='^b2')
hrv_r = hrv.filter(regex='^r')
hrv_b3 = hrv.filter(regex='^b3')
hrv_c = hrv.filter(regex='^c')

In [21]:
# Preview the scaled HRV-only frame.
hrv_only.head()

Unnamed: 0,b1SDNN,b1NN50,b1PNN50,b1RMSSD,b1VLF,b1LF,b1HF,b1LF/HF,b1POWER,b1HR,...,cRMSSD,cVLF,cLF,cHF,cLF/HF,cPOWER,cHR,cRESP,cSC,cTEMP
7,0.029739,0.0,0.0,0.013392,0.001768,0.000557,0.001484,0.021171,0.001327,0.424606,...,0.007717,0.001743,0.000415,0.000355,0.145641,0.000741,0.429173,0.370844,0.04256,0.934622
8,0.080975,0.0,0.0,0.041879,0.005922,0.001584,0.004429,0.020513,0.00432,0.0931,...,0.015267,0.00049,0.000143,0.000292,0.102908,0.000239,0.132928,0.345104,0.021541,0.724881
9,0.684928,0.105263,0.134041,0.473378,0.044866,0.02281,0.058072,0.021797,0.045589,0.155741,...,0.065897,0.000371,0.000111,0.001478,0.014484,0.000347,0.092406,0.451407,0.0371,0.691243
10,0.07652,0.052632,0.060752,0.055723,0.004645,0.001566,0.006871,0.010026,0.004524,0.188282,...,0.015333,0.000474,0.000151,0.00038,0.057934,0.000252,0.226602,0.3496,0.050612,0.883347
11,0.148438,0.039474,0.047252,0.146307,0.005542,0.005562,0.04198,0.006874,0.017082,0.161077,...,0.036414,0.00051,6.6e-05,0.001307,0.005471,0.000329,0.242787,0.175192,0.139826,0.857564


In [22]:
# (rows, columns) of the HRV-only frame.
hrv_only.shape

(330, 78)

In [23]:
## Check whether each phase contains the same number of variables.
print("HRV baseline #1 shape is:", hrv_b1.shape[1])
print("HRV stress shape is:", hrv_s.shape[1])
print("HRV baseline #2 shape is:", hrv_b2.shape[1])
print("HRV rest shape is:", hrv_r.shape[1])
print("HRV baseline #3 shape is:", hrv_b3.shape[1])
# The "c" line previously reported hrv_b1 again, so the c phase was never actually checked.
print("HRV c shape is:", hrv_c.shape[1])

HRV baseline #1 shape is: 13
HRV stress shape is: 14
HRV baseline #2 shape is: 13
HRV rest shape is: 13
HRV baseline #3 shape is: 13
HRV c shape is: 13


In [24]:
## The '^s' filter also selected the 'sub' column; remove it from the stress frame.
## errors='ignore' keeps the cell re-runnable once 'sub' has already been dropped.
hrv_s = hrv_s.drop(columns=['sub'], errors='ignore')

In [25]:
## Renaming the columns for further calculation.
## We need to generate new dataframes to compare the phases.
hrv_sub = hrv.loc[:, ['sub']]
## After the shift: 0(Depression), 1(Panic Disorder), 2(Control).
## For two-class subsets the codes are not contiguous (e.g. 0 and 2 for MDD vs Control).
hrv_disorder = hrv.loc[:,['disorder']] -1
hrv_gender = hrv.loc[:,['gender']]
hrv_HAMD = hrv.loc[:,['HAMD']]
hrv_PDSS = hrv.loc[:,['PDSS']]

## Phase-independent variable names, in the column order shared by every phase frame.
hrv_variables = ["SDNN", "NN50", "PNN50", "RMSSD", "VLF", "LF", "HF", "LF/HF", "POWER", "HR", "RESP", "SC", "TEMP"]
## Give every phase frame the same column labels so they can be subtracted column-wise.
hrv_b1_rename = hrv_b1.set_axis(hrv_variables, axis=1)
hrv_b2_rename = hrv_b2.set_axis(hrv_variables, axis=1)
hrv_b3_rename = hrv_b3.set_axis(hrv_variables, axis=1)
hrv_s_rename = hrv_s.set_axis(hrv_variables, axis=1)
hrv_r_rename = hrv_r.set_axis(hrv_variables, axis=1)
hrv_c_rename = hrv_c.set_axis(hrv_variables, axis=1)

In [26]:
# Class balance of the shifted labels for the selected task.
hrv_disorder.value_counts()

disorder
2           194
0           136
dtype: int64

- - -

## Comparisons between Phases

* HRV measuring steps: b1 - s - b2 - r - b3 - c
* Each has following variables (Total 13 variables): SDNN, NN50, PNN50, RMSSD, VLF, LF, HF, LF/HF, POWER, HR, RESP, SC, TEMP

Since the experimental phases run in the order "b1-s-b2-r-b3-c", there are five transitions between consecutive phases; a sixth comparison (stress vs. rest) is added at the end.

### 1) Baseline 1 - Stress phase

In [None]:
# Column-wise difference: baseline 1 minus stress ("_sub" = subtraction).
hrv_b1_s_sub = hrv_b1_rename - hrv_s_rename
hrv_b1_s_sub.head()

### 2) Stress - Baseline 2 phase

In [None]:
# Column-wise difference: stress minus baseline 2.
hrv_s_b2_sub = hrv_s_rename - hrv_b2_rename
hrv_s_b2_sub.head()

### 3) Baseline2 - Rest phase

In [None]:
# Column-wise difference: baseline 2 minus rest.
hrv_b2_r_sub = hrv_b2_rename - hrv_r_rename
hrv_b2_r_sub.head()

### 4) Rest - Baseline 3 phase

In [None]:
# Column-wise difference: rest minus baseline 3.
hrv_r_b3_sub = hrv_r_rename - hrv_b3_rename
hrv_r_b3_sub.head()

### 5) Baseline 3 - Recovery phase

In [None]:
# Column-wise difference: baseline 3 minus the c phase.
hrv_b3_c_sub = hrv_b3_rename - hrv_c_rename
hrv_b3_c_sub.head()

### 6) Stress - Rest phase

* This is what SMC checks for the research

In [None]:
# Column-wise difference: stress minus rest (non-adjacent phases).
hrv_s_r_sub = hrv_s_rename - hrv_r_rename
hrv_s_r_sub.head()

- - -

# Data Visualization

In [None]:
# Column names of the selected task frame.
hrv.columns

In [None]:
# Summary statistics of the numeric columns.
hrv.describe()

## Age and Disorder

In [None]:
# Age histogram (20 bins), one facet per disorder code present in `hrv`.
sns.set_style('whitegrid')
g = sns.FacetGrid(hrv, col='disorder')
g.map(plt.hist, 'age', bins=20)

## Gender and Disorder

In [None]:
# Gender histogram, one facet per disorder code present in `hrv`.
# NOTE(review): gender looks like a coded category (1 and 2 appear in the preview),
# so 20 bins is probably more than needed — confirm.
sns.set_style('whitegrid')
g = sns.FacetGrid(hrv, col='disorder')
g.map(plt.hist, 'gender', bins=20)

- - -

# Data Analysis

## Data Selection

In [None]:
## Generating dataset with y label on it (stress - rest differences plus the label column).
## NOTE(review): this frame is built from hrv_s_r_sub, while X in the next cell is taken from
## hrv_s_b2_sub, and hrv_data is not read by any later visible cell — confirm which is intended.
hrv_data = pd.concat([hrv_s_r_sub, hrv_disorder], axis=1)

In [None]:
# Features: stress minus baseline 2 differences. Labels: shifted disorder codes.
X = hrv_s_b2_sub
Y = hrv_disorder

## Train-Test Split

In [None]:
# Feature names currently in X.
X.columns

In [None]:
# X = X[["PNN50", "VLF","LF","HF","LF/HF"]]

In [None]:
# Remove the POWER feature.
# errors='ignore' keeps the cell re-runnable after POWER has already been dropped.
X = X.drop(columns=['POWER'], errors='ignore')

In [None]:
## Split X and Y into training dataset and test dataset.
## Each subject contributes several rows (one per visit), so the split is made by subject:
## all visits of a subject land on the same side, and the test score is not inflated by
## the model having already seen that person during training.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X, Y, groups=hrv_sub['sub']))
x_train, x_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = Y.iloc[train_idx], Y.iloc[test_idx]

In [None]:
# Report the shapes of the four split pieces.
print("x_train dataset shape is", x_train.shape)
print("y_train dataset shape is", y_train.shape)

print("x_test dataset shape is", x_test.shape)
print("y_test dataset shape is", y_test.shape)

## Logistic Regression

In [None]:
# Fit a logistic-regression classifier on the training split.
# ravel() turns the single-column label frame into the 1-D array passed to fit.
logistic_reg = LogisticRegression(solver='lbfgs', max_iter = 4000)
logistic_reg.fit(x_train, y_train.values.ravel())

In [None]:
# Predict labels for the held-out split.
predictions = logistic_reg.predict(x_test)

In [None]:
# Confusion matrix of true vs. predicted labels for logistic regression.
print(confusion_matrix(y_test,predictions))

## Decision Tree

In [None]:
# Build the classifier in this cell: its construction had been commented out,
# so fit() raised NameError on a fresh kernel.
dt_model = DecisionTreeClassifier()
# ravel() passes the labels as a 1-D array, matching the other sklearn cells.
dt_model.fit(x_train, y_train.values.ravel())

In [None]:
# Decision-tree predictions for the held-out split.
dt_pred = dt_model.predict(x_test)

In [None]:
# Confusion matrix for the decision tree.
print(confusion_matrix(y_test,dt_pred))

In [None]:
# Classification report for the decision tree.
print(classification_report(y_test,dt_pred))

## Random Forest Classification

In [None]:
# Random forest with 500 trees; no random_state is passed to the estimator.
rf= RandomForestClassifier(n_estimators=500)
rf.fit(x_train, y_train.values.ravel())

In [None]:
# Random-forest predictions for the held-out split.
rf_pre=rf.predict(x_test)

In [None]:
# Confusion matrix for the random forest.
print(confusion_matrix(y_test, rf_pre))

In [None]:
# Classification report for the random forest.
print(classification_report(y_test, rf_pre))

## XGBoost Classifier

In [None]:
# Gradient-boosted trees with 1000 boosting rounds.
# NOTE(review): for the MDD vs Control task the labels are 0 and 2; newer XGBoost releases
# reportedly expect class labels numbered 0..n_classes-1 — verify against the installed version.
xgboost = XGBClassifier(n_estimators=1000)
xgboost.fit(x_train, y_train)

In [None]:
# XGBoost predictions for the held-out split.
xg_pred = xgboost.predict(x_test)

In [None]:
# Confusion matrix for XGBoost.
print(confusion_matrix(y_test, xg_pred))

In [None]:
# Classification report for XGBoost.
print(classification_report(y_test, xg_pred))

- - -

## Multi-Layer Perceptron

* Simple MLP

In [None]:
# Number of feature columns in the training split; the MLP uses it as its input width.
input_size = x_train.shape[1]

In [None]:
class MLP_HRV(nn.Module):
    """Fully connected classifier: two ReLU hidden layers followed by a log-softmax output.

    Args:
        in_features: Number of input features per sample. When omitted, the
            notebook-level ``input_size`` is used, so ``MLP_HRV()`` keeps working.
        hidden_size: Width of both hidden layers (default 128).
        num_classes: Number of output classes (default 3).

    ``forward`` returns log-probabilities of shape ``(batch, num_classes)``.
    """

    def __init__(self, in_features=None, hidden_size=128, num_classes=3):
        super(MLP_HRV, self).__init__()
        # Capture the width once so forward() does not depend on a mutable notebook global.
        self.in_features = input_size if in_features is None else in_features
        self.layer1 = nn.Linear(self.in_features, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Flatten whatever comes in to (batch, in_features).
        x = x.view(-1, self.in_features)
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return F.log_softmax(self.layer3(x), dim=1)

In [None]:
# Instantiate the MLP on the selected device and show its layers.
model = MLP_HRV().to(DEVICE)
print(model)

In [None]:
# The model's forward() already ends in log_softmax, so the matching criterion is NLLLoss.
# CrossEntropyLoss would apply log-softmax a second time.
criterion = nn.NLLLoss()
# NOTE(review): lr is hard-coded here and args.lr / args.momentum are not used — confirm which is intended.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [None]:
# Build tensors under new names instead of overwriting the x_train DataFrame,
# so this cell and the sklearn cells stay re-runnable.
# float32 matches the default dtype of nn.Linear weights; the loss expects int64 class indices.
x_train_tensor = torch.as_tensor(x_train.values, dtype=torch.float32)
y_train_tensor = torch.as_tensor(y_train.values.ravel(), dtype=torch.long)

# Mini-batch iterator consumed by the training loop.
train_dataset = torch.utils.data.TensorDataset(x_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.bs, shuffle=True)

In [None]:
# Train the MLP; train_loader must yield (features, labels) mini-batches.
for epoch in range(args.epochs):
    model.train()
    loss = 0.0
    batch = len(train_loader)

    for features, labels in train_loader:
        features = features.view(-1, input_size).to(DEVICE)
        labels = labels.to(DEVICE)

        optimizer.zero_grad()
        outputs = model(features)
        cost = criterion(outputs, labels)
        cost.backward()
        optimizer.step()
        # .item() yields a plain float, so the running mean does not keep
        # every batch's autograd graph alive.
        loss += cost.item() / batch

    print('Epoch:', '%03d' % (epoch + 1), 'Training loss =', '{:.5f}'.format(loss))

- - -

## Convolutional Neural Network

In [None]:
# Shapes of the full feature and label frames (before the train/test split).
print("X shape is ", X.shape)
print("Y shape is ", Y.shape)

In [None]:
# Preview the feature frame.
X.head()