# Major Depressive Disorder Diagnosis

In [None]:
import os
import time
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torchvision
import torch.nn as nn

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

from torch.nn import functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, TensorDataset

In [None]:
# Report the library versions this notebook is running against.
for _label, _module in (("PyTorch Version: ", torch), ("Torchvision Version: ", torchvision)):
    print(_label, _module.__version__)

In [None]:
class Args:
    """Hyper-parameters and run settings shared by the cells of this notebook."""
    # arguments
    epochs = 50        # number of passes over the training data in the training loop
    bs = 16            # mini-batch size handed to the DataLoaders
    lr = 0.001         # learning rate (not read by the cells shown in this notebook section)
    momentum = 0.9     # momentum (not read by the cells shown in this notebook section)
    num_classes = 3    # the `disorder` label takes three codes
    # Was the string 'store_true', an argparse *action* name rather than a value.
    # A real boolean keeps `if args.verbose:` behaving the same (the string was truthy).
    verbose = True
    seed = 674         # seed applied to the numpy, random and torch generators

args = Args()

## Seed every random number generator used in this notebook from the one configured value.
_seed = args.seed
random.seed(_seed)
np.random.seed(_seed)
torch.manual_seed(_seed)

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Using PyTorch version:', torch.__version__, ' Device: ', DEVICE)

- - -

# Data Handling

## Dataset check

In [None]:
# Load the HRV dataset
_hrv_csv_path = 'E:/RESEARCH/Datasets/HRV/HRV_samsung/HRV_REV_all.csv'
hrv_df = pd.read_csv(_hrv_csv_path, sep=',')
hrv_df.head()

* disorder(=label): 1(Depression), 2(Panic Disorder), 3(Control)

In [None]:
hrv_df.shape

HRV measuring steps
* b1 (baseline 1) - s (stress) - b2 (baseline 2) - r (rest) - b3 (baseline 3) - c (recovery)
* Each has following variables (Total 13 variables)
> SDNN, NN50, PNN50, RMSSD, VLF, LF, HF, LF/HF, POWER, HR, RESP, SC, TEMP

## Dataset Separation

* disorder= 1:(Depression), 2:(Panic Disorder), 3:(Control)

In [None]:
hrv_df.columns

In [None]:
hrv_df["disorder"].value_counts() ## MDD 136, PD 149, Control 194

In [None]:
## Task-specific subsets, selected by the `disorder` code.
_disorder_code = hrv_df["disorder"]
hrv_ALL = hrv_df                                  ## all groups (same object as hrv_df, not a copy)
hrv_MDDPD = hrv_df[_disorder_code.isin([1, 2])]   ## for MDD vs PD task
hrv_MDDC = hrv_df[_disorder_code.isin([1, 3])]    ## for MDD vs Control task
hrv_PDC = hrv_df[_disorder_code.isin([2, 3])]     ## for PD  vs Control task

In [None]:
hrv_MDDC.shape

- - -

## Data preprocessing

In [None]:
## Scaler for the HRV features: MinMaxScaler rescales each column to the [0, 1] range by default
## (min-max normalization, not zero-mean / unit-variance standardization).
scaler = MinMaxScaler()

In [None]:
## Selecting hrv dataset for the task: leave exactly one of the assignments below uncommented.
## Every later cell reads the chosen frame through the name `hrv`.
# hrv = hrv_ALL
# hrv = hrv_MDDPD
hrv = hrv_MDDC
# hrv = hrv_PDC

In [None]:
hrv

In [None]:
## Separating HRV dataset by experimental steps.
## hrv_only keeps the HRV feature columns by dropping every non-HRV column, then min-max scales them.
_non_hrv_columns = ['sub', 'VISIT', 'disorder', 'age', 'gender', 'HAMD', 'HAMA', 'PDSS', 'ASI', 'APPQ', 'PSWQ', 'SPI', 'PSS', 'BIS', 'SSI']
hrv_only = hrv.drop(columns=_non_hrv_columns)
hrv_only[:] = scaler.fit_transform(hrv_only[:])  ## Scaling. If not necessary, delete.

## One frame per phase, selected by column-name prefix.
## NOTE(review): the phase frames are filtered from the unscaled `hrv`, not from `hrv_only`,
## so the scaling above does not reach them - confirm this is intended.
## NOTE(review): the pattern '^s' also matches the 'sub' column of `hrv`.
_phase_patterns = ('^b1', '^s', '^b2', '^r', '^b3', '^c')
hrv_b1, hrv_s, hrv_b2, hrv_r, hrv_b3, hrv_c = (
    hrv.filter(regex=_pattern) for _pattern in _phase_patterns
)

In [None]:
hrv_only.head()

In [None]:
hrv_only.shape

In [None]:
## Check whether each phase contains the same number of variables (column count per phase frame).
print("HRV baseline #1 shape is:", hrv_b1.shape[1])
print("HRV stress shape is:", hrv_s.shape[1])
print("HRV baseline #2 shape is:", hrv_b2.shape[1])
print("HRV rest shape is:", hrv_r.shape[1])
print("HRV baseline #3 shape is:", hrv_b3.shape[1])
## Fixed: this line used to report hrv_b1 again, so the `c` phase was never actually checked.
print("HRV c shape is:", hrv_c.shape[1])

In [None]:
## hrv_s was selected with the prefix pattern '^s', which also picks up the 'sub' column; remove it.
## errors='ignore' keeps this cell re-runnable once 'sub' has already been dropped.
hrv_s = hrv_s.drop(columns=['sub'], errors='ignore')

In [None]:
## Renaming the columns for further calculation.
## We need to generate new dataframes to compare the phases.
hrv_sub = hrv.loc[:, ['sub']]
hrv_disorder = hrv.loc[:, ['disorder']] - 1  ## shift the codes to start at 0: 0(Depression), 1(Panic Disorder), 2(Control)
hrv_gender = hrv.loc[:, ['gender']]
hrv_HAMD = hrv.loc[:, ['HAMD']]
hrv_PDSS = hrv.loc[:, ['PDSS']]

## Common, phase-independent column names. Giving every phase frame the same labels lets the
## frames be subtracted column by column later on. The list is defined once and reused so the
## six renames cannot drift apart.
hrv_variables = ["SDNN", "NN50", "PNN50", "RMSSD", "VLF", "LF", "HF", "LF/HF", "POWER", "HR", "RESP", "SC", "TEMP"]
## set_axis needs one label per column, so each phase frame must have exactly len(hrv_variables) columns.
hrv_b1_rename = hrv_b1.set_axis(hrv_variables, axis=1)
hrv_b2_rename = hrv_b2.set_axis(hrv_variables, axis=1)
hrv_b3_rename = hrv_b3.set_axis(hrv_variables, axis=1)
hrv_s_rename = hrv_s.set_axis(hrv_variables, axis=1)
hrv_r_rename = hrv_r.set_axis(hrv_variables, axis=1)
hrv_c_rename = hrv_c.set_axis(hrv_variables, axis=1)

In [None]:
hrv_disorder.value_counts()

- - -

## Comparisons between Phases

* HRV measuring steps: b1 - s - b2 - r - b3 - c
* Each has following variables (Total 13 variables): SDNN, NN50, PNN50, RMSSD, VLF, LF, HF, LF/HF, POWER, HR, RESP, SC, TEMP

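Since the experimental phases run in the order "b1-s-b2-r-b3-c", there are 5 transitions between consecutive phases (sections 1-5 below). A sixth comparison, stress vs. rest, is added in section 6.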

### 1) Baseline 1 - Stress phase

In [None]:
## Column-aligned difference: baseline 1 minus stress.
hrv_b1_s_sub = hrv_b1_rename.sub(hrv_s_rename)
hrv_b1_s_sub.head()

### 2) Stress - Baseline 2 phase

In [None]:
## Column-aligned difference: stress minus baseline 2.
hrv_s_b2_sub = hrv_s_rename.sub(hrv_b2_rename)
hrv_s_b2_sub.head()

### 3) Baseline2 - Rest phase

In [None]:
## Column-aligned difference: baseline 2 minus rest.
hrv_b2_r_sub = hrv_b2_rename.sub(hrv_r_rename)
hrv_b2_r_sub.head()

### 4) Rest - Baseline 3 phase

In [None]:
## Column-aligned difference: rest minus baseline 3.
hrv_r_b3_sub = hrv_r_rename.sub(hrv_b3_rename)
hrv_r_b3_sub.head()

### 5) Baseline 3 - Recovery phase

In [None]:
## Column-aligned difference: baseline 3 minus the `c` phase.
hrv_b3_c_sub = hrv_b3_rename.sub(hrv_c_rename)
hrv_b3_c_sub.head()

### 6) Stress - Rest phase

* This is what SMC checks for the research

In [None]:
## Column-aligned difference: stress minus rest.
hrv_s_r_sub = hrv_s_rename.sub(hrv_r_rename)
hrv_s_r_sub.head()

- - -

# Data Visualization

In [None]:
hrv.describe()

## Age and Disorder

In [None]:
## Histogram of `age` (20 bins), one panel per `disorder` code.
sns.set_style(style='whitegrid')
g = sns.FacetGrid(data=hrv, col='disorder')
g.map(plt.hist, 'age', bins=20)

## Gender and Disorder

In [None]:
## Histogram of `gender` (20 bins), one panel per `disorder` code.
sns.set_style(style='whitegrid')
g = sns.FacetGrid(data=hrv, col='disorder')
g.map(plt.hist, 'gender', bins=20)

- - -

# Data Analysis

## Data Selection

In [None]:
## Model inputs: the baseline 1 minus stress differences; targets: the zero-based disorder codes.
X, Y = hrv_b1_s_sub, hrv_disorder

In [None]:
## Choose the variables that must be adopted for input values
var_selection = ["SDNN", "NN50", "PNN50", "RMSSD", "LF", "HF", "LF/HF", "HR"]
X = X[var_selection]

In [None]:
## Generating dataset with y label on it: the stress minus rest differences next to the label column.
## NOTE(review): X is built from hrv_b1_s_sub, while this frame uses hrv_s_r_sub - confirm which
## comparison is meant to feed the models.
hrv_data = pd.concat([hrv_s_r_sub, hrv_disorder], axis='columns')

In [None]:
hrv_data.head()

## Train-Test Split

In [None]:
X.columns

In [None]:
## Split X and Y into training dataset and test dataset
## (20 % of the rows are held out for testing; random_state is fixed so the split is repeatable).
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
## Report the shape of each split.
for _message, _frame in (
    ("x_train dataset shape is", x_train),
    ("y_train dataset shape is", y_train),
    ("x_test dataset shape is", x_test),
    ("y_test dataset shape is", y_test),
):
    print(_message, _frame.shape)

In [None]:
## Converting dataframe format into numpy array
x_train_np, y_train_np, x_test_np, y_test_np = (
    _frame.to_numpy() for _frame in (x_train, y_train, x_test, y_test)
)

In [None]:
## Use TensorDataset to create dataset with ndarray.
## nn.Linear layers hold float32 weights, and nn.CrossEntropyLoss expects a 1-D int64 tensor of
## class indices. The features therefore go in as float32, and the single-column label array of
## shape (N, 1) is flattened to shape (N,).
train_dataset = TensorDataset(
    torch.tensor(x_train_np, dtype=torch.float32),
    torch.tensor(y_train_np, dtype=torch.long).reshape(-1),
)
test_dataset = TensorDataset(
    torch.tensor(x_test_np, dtype=torch.float32),
    torch.tensor(y_test_np, dtype=torch.long).reshape(-1),
)

In [None]:
## Setting trainloader and testloader for training.
## Only the training loader shuffles; both share the batch size and worker count.
_loader_options = dict(batch_size=args.bs, num_workers=4)
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **_loader_options)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, **_loader_options)

- - -

## Logistic Regression

In [None]:
## Logistic-regression classifier; the single-column label frame is flattened to 1-D for fit().
logistic_reg = LogisticRegression(max_iter=4000, solver='lbfgs')
logistic_reg.fit(x_train, y_train.to_numpy().ravel())

In [None]:
predictions = logistic_reg.predict(x_test)

In [None]:
print(confusion_matrix(y_test,predictions))

- - -

## Decision Tree

In [None]:
## Decision-tree classifier with default settings.
## The label frame is flattened to a 1-D array, matching how the logistic-regression and
## random-forest cells pass their labels to fit().
dt_model = DecisionTreeClassifier()
dt_model.fit(x_train, y_train.values.ravel())

In [None]:
dt_pred = dt_model.predict(x_test)

In [None]:
print(confusion_matrix(y_test,dt_pred))

In [None]:
print(classification_report(y_test,dt_pred))

- - -

## Random Forest Classification

In [None]:
## Random forest with 5000 trees; the single-column label frame is flattened to 1-D for fit().
rf = RandomForestClassifier(n_estimators=5000)
rf.fit(x_train, y_train.to_numpy().ravel())

In [None]:
rf_pre=rf.predict(x_test)

In [None]:
print(confusion_matrix(y_test, rf_pre))

In [None]:
print(classification_report(y_test, rf_pre))

- - -

## XGBoost Classifier

In [None]:
xgboost = XGBClassifier(n_estimators=1000, eval_metric='mlogloss')
xgboost.fit(x_train, y_train)

In [None]:
xg_pred = xgboost.predict(x_test)

In [None]:
print(confusion_matrix(y_test, xg_pred))

In [None]:
print(classification_report(y_test, xg_pred))

- - -

## Multi-Layer Perceptron

* Simple MLP

In [None]:
## Number of input features per sample (column count of x_train); read by MLP_HRV and the training loop.
input_size = x_train.shape[1]

In [None]:
class MLP_HRV(nn.Module):
    """Fully connected classifier with two ReLU hidden layers and a log-softmax output.

    Args:
        in_features: Number of input features per sample. When ``None`` (the default)
            the notebook-level ``input_size`` is read at construction time, so
            ``MLP_HRV()`` keeps working as before.
        num_classes: Width of the output layer (default 3, the previous hard-coded value).
        hidden_size: Width of both hidden layers (default 128, the previous hard-coded value).
    """

    def __init__(self, in_features=None, num_classes=3, hidden_size=128):
        super().__init__()
        if in_features is None:
            in_features = input_size  # fall back to the notebook-level default
        self.layer1 = nn.Linear(in_features, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return log-probabilities of shape (batch, num_classes) for the input ``x``."""
        # Flatten to (batch, in_features) using the width this instance was built with,
        # so a later change of the global `input_size` cannot silently reshape the batch.
        x = x.view(-1, self.layer1.in_features)
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        # log_softmax is idempotent, so feeding this output to nn.CrossEntropyLoss gives the
        # same loss as feeding raw logits would.
        return F.log_softmax(self.layer3(x), dim=1)

In [None]:
## Build the network, move its parameters to the selected device and show the layer summary.
model = MLP_HRV()
model = model.to(DEVICE)
print(model)

In [None]:
## Plain SGD over all model parameters, trained against a cross-entropy loss.
## NOTE(review): the learning rate is hard-coded to 0.01 and no momentum is passed, so
## args.lr (0.001) and args.momentum (0.9) are not used here - confirm which is intended.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [None]:
## Convert the training features to a tensor. This rebinds `x_train` (previously a DataFrame).
## The guard makes the cell safe to re-run: a tensor has no `.values`, so converting twice would fail.
if not torch.is_tensor(x_train):
    x_train = torch.tensor(x_train.values)

In [None]:
## Train the MLP for args.epochs passes over train_loader and print the mean batch loss per epoch.
for epoch in range(args.epochs):
    model.train()  # make sure the network is in training mode
    loss = 0.0
    batch = len(train_loader)  # number of mini-batches per epoch

    for features, labels in train_loader:
        features = features.view(-1, input_size).to(DEVICE)
        labels = labels.to(DEVICE)

        optimizer.zero_grad()
        hypothesis = model(features)
        cost = criterion(hypothesis, labels)
        cost.backward()
        optimizer.step()
        # Accumulate a plain Python float. Adding the loss tensor itself would keep every
        # batch's autograd graph alive until the end of the epoch.
        loss += cost.item() / batch

    print('Epoch:', '%03d' % (epoch + 1), 'Training loss =', '{:.5f}'.format(loss))

- - -

## Convolutional Neural Network

In [None]:
print("X shape is ", X.shape)
print("Y shape is ", Y.shape)

In [None]:
X.head()