# ==========================

# Heart Rate Variability Dataset

DATA Provided by SMC Professor J.A.

In [None]:
import csv
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import torch
# import torchvision


# from PIL import Image

In [None]:
from scipy import stats
from scipy.stats import ttest_ind
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense , Activation, Dropout
from keras.optimizers import Adam ,RMSprop
from keras import  backend as K
from keras.optimizers import SGD
# from tensorflow.keras import utils as np_utils
# from tensorflow.keras.metrics import binary_focal_crossentropy
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.utils import to_categorical
from torch.utils.data import TensorDataset, DataLoader

In [None]:
## Raise the NumPy / pandas print limits so output is less likely to be elided with "...".
np.set_printoptions(threshold=np.inf, linewidth=np.inf)  # inf = no limit
for display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, 100)

# Data Handling

## Dataset variables

> "Demographic info" : Gender, Age \
> "Questionnaires" : HAMD, HAMA, BDI-II, BAI, MDQ, HCL-32 \
> "HRV's 17 Features" : SDNN, PSI, VLF, LF, HF, TP, LFNORM, HFNORM, LF/HF Ratio, RMSSD, APEN, SRD, TSRD, TP_ln, VLF_ln, LF_ln, HF_ln \
> "Main Disease Groups" : 
>> MDDs = 602 \
>> MDDr = 529 \
>> PD = 353 \
>> Adj = 341 \
>> BP_II = 337 \
>> SSD = 324 \
>> PDAG = 291 \
>> PDD = 189 \
>> DEP_NOS = 188 \
>> GAD = 165 \
>> ANX_NOS = 146 \
>> ADHD = 70 \
>> BP_I = 67 \
>> OCD = 51 \
>> SAD = 45 \
>> PTSD = 41 \
>> INSOMNIA = 50 \
>> Alcohol_dependence = 36 \
>> SPR = 34 \
>> Tourette = 22 \
>> Normal = 19 \
>> Tic = 12 \
>> OMS = 10

## Importing Original Dataset

In [None]:
# Load the raw HRV workbook into a dataframe (absolute local path).
hrv_xlsx_path = 'E:/RESEARCH/Datasets/HRV/JA/HRV_.xlsx'
hrv_ori = pd.read_excel(hrv_xlsx_path)
# hrv_ori = pd.read_csv('E:/RESEARCH/Datasets/HRV/JA/HRV_dataset_processed_.csv')

In [None]:
print(hrv_ori.dtypes)

In [None]:
hrv_ori

* Some patients have "NA" values in their HRV features and biological data.
* Therefore, we remove those patients from this analysis.
* As the output below shows, a total of 280 patients are removed.

In [None]:
# Keep only the rows that have an SDNN value, then slice out the 17 HRV feature columns.
has_sdnn = hrv_ori['sdnn'].notna()
hrv = hrv_ori[has_sdnn]
hrv_feature_cols = [
    'sdnn', 'psi', 'tp', 'vlf', 'lf', 'hf', 'lfnorm', 'hfnorm', 'lf_hf',
    'rmssd', 'apen', 'srd', 'tsrd', 'tp_ln', 'vlf_ln', 'lf_ln', 'hf_ln',
]
hrv_only = hrv.loc[:, hrv_feature_cols]

In [None]:
print("Original import data shape is:", hrv_ori.shape)
print("Data with NA patients removed is:", hrv.shape)
print("The number of removed patients is:", hrv_ori.shape[0] - hrv.shape[0])

## Generating new Variables for analysis

* Here, we use the current depressive and anxious mood status (from the questionnaire scores) instead of the main diagnosis or subtype.

In [None]:
# Work on a copy so the label columns are not added to `hrv` itself.
hrv_data = hrv.copy()
# "NULL" is the placeholder for rows whose score does not fall into any category.
for label_col in ('HAMD_', 'HAMA_', 'BDI_', 'BAI_'):
    hrv_data[label_col] = "NULL"
# hrv_data['MDQ_'] = 0
# hrv_data['HCL_'] = 0

In [None]:
## Categorizing HAMD scores to generate 'HAMD_' label variable.
## Bins: <7 normal, 7-17 mild, 18-24 moderate, >=25 severe.
hamd_score = hrv_data['HAMD']
hamd_bins = {
    "normal": hamd_score < 7,
    "mild": (hamd_score >= 7) & (hamd_score < 18),
    "moderate": (hamd_score >= 18) & (hamd_score < 25),
    "severe": hamd_score >= 25,
}
for severity, in_bin in hamd_bins.items():
    hrv_data.loc[in_bin, 'HAMD_'] = severity

In [None]:
## Categorizing HAMA scores to generate 'HAMA_' label variable.
## Bins: <18 normal, 18-24 mild, 25-29 moderate, >=30 severe.
hama_score = hrv_data['HAMA']
hama_bins = {
    "normal": hama_score < 18,
    "mild": (hama_score >= 18) & (hama_score < 25),
    "moderate": (hama_score >= 25) & (hama_score < 30),
    "severe": hama_score >= 30,
}
for severity, in_bin in hama_bins.items():
    hrv_data.loc[in_bin, 'HAMA_'] = severity

In [None]:
## Categorizing BDI-II scores to generate 'BDI_' label variable.
## Bins: <14 normal, 14-18 mild, 19-27 moderate, >=28 severe.
bdi_score = hrv_data['BDI-II']
bdi_bins = {
    "normal": bdi_score < 14,
    "mild": (bdi_score >= 14) & (bdi_score < 19),
    "moderate": (bdi_score >= 19) & (bdi_score < 28),
    "severe": bdi_score >= 28,
}
for severity, in_bin in bdi_bins.items():
    hrv_data.loc[in_bin, 'BDI_'] = severity

In [None]:
## Categorizing BAI scores to generate 'BAI_' label variable.
## Bins: <8 normal, 8-15 mild, 16-25 moderate, >=26 severe.
bai_score = hrv_data['BAI']
bai_bins = {
    "normal": bai_score < 8,
    "mild": (bai_score >= 8) & (bai_score < 16),
    "moderate": (bai_score >= 16) & (bai_score < 26),
    "severe": bai_score >= 26,
}
for severity, in_bin in bai_bins.items():
    hrv_data.loc[in_bin, 'BAI_'] = severity

In [None]:
# hrv_data.to_csv("E:/RESEARCH/Datasets/HRV/JA/HRV_prep.csv", index = False)

* main_dx is first diagnostic result of the patient.
* Most of the patients in our dataset are MDD, PD, BP, ...

In [None]:
hrv_data["HAMD_"].value_counts()
# hrv_data["HAMA_"].value_counts()

# hrv_data["main_dx"].value_counts()
# hrv_data["subtype"].value_counts()
# hrv_data["gender"].value_counts()

In [None]:
hrv.columns  ## Check the variables 

## Separating dataset for its Usage

In [None]:
## Subsets of the NA-filtered data used by the analyses below.
hrv_total = hrv  ## overall dataset (same object as `hrv`, not a copy)
has_subtype = hrv['subtype'].notna()
is_mdd_family = hrv['main_dx'].isin(['MDDs', 'MDDr', 'PDD'])
hrv_subtype = hrv[has_subtype]  ## rows that have a subtype recorded
hrv_mdd = hrv[is_mdd_family]  ## rows whose main_dx is MDDs, MDDr or PDD
hrv_mdd_subtype = hrv_mdd[hrv_mdd['subtype'].notna()]  ## MDDs / MDDr / PDD rows that also have a subtype

In [None]:
print("Original data shape is:", hrv_total.shape)
print("Data Shape with subtype is:", hrv_subtype.shape)
print("Data Shape with main_dx MDDs or MDDr or PDD is:", hrv_mdd.shape)
print("Data Shape with main_dx MDDs or MDDr or PDD + having subtype is:", hrv_mdd_subtype.shape)

In [None]:
# hrv_total["main_dx"].value_counts()
# hrv_subtype["main_dx"].value_counts()
hrv_mdd["main_dx"].value_counts()
# hrv_mdd_subtype["subtype"].value_counts()

## Selecting data features for further Analysis

In [None]:
## Column selections taken from the MDDs / MDDr / PDD subset.
hrv_feature_cols = [
    'sdnn', 'psi', 'tp', 'vlf', 'lf', 'hf', 'lfnorm', 'hfnorm', 'lf_hf',
    'rmssd', 'apen', 'srd', 'tsrd', 'tp_ln', 'vlf_ln', 'lf_ln', 'hf_ln',
]
core_feature_cols = ['sdnn', 'tp', 'vlf', 'lfnorm', 'hfnorm', 'rmssd', 'apen', 'srd', 'tsrd']

mdd_lab = hrv_mdd.loc[:, ['main_dx']]  ## label only (kept as a one-column dataframe)
mdd_hrv = hrv_mdd.loc[:, hrv_feature_cols]  ## the 17 HRV feature variables
mdd_analysis = hrv_mdd.loc[:, ['main_dx'] + hrv_feature_cols]  ## label + 17 features
mdd_analysis_core = hrv_mdd.loc[:, ['main_dx'] + core_feature_cols]  ## label + reduced feature set

In [None]:
mdd_hrv

In [None]:
# mdd_analysis.to_csv('E:/RESEARCH/Datasets/HRV/JA/HRV_dataset_mdd_extracted.csv')

In [None]:
## One dataframe per diagnosis, used as the comparison groups below.
dx_column = mdd_analysis['main_dx']
mdds = mdd_analysis[dx_column == 'MDDs']
mddr = mdd_analysis[dx_column == 'MDDr']
pdd = mdd_analysis[dx_column == 'PDD']

# ==========================

# Statistical Data Analysis

* For the HRV variables used in this research, we check which features show a significant difference between the comparison groups.

## Stats on MDD groups (MDDs, MDDr, PDD)

In [None]:
mdd_analysis.columns

In [None]:
var_list = ['sdnn', 'tp', 'vlf', 'lf', 'hf', 'lfnorm', 'hfnorm', 'lf_hf', 'rmssd', 'apen', 'srd', 'tsrd', 'tp_ln', 'vlf_ln', 'lf_ln', 'hf_ln']

In [None]:
len(var_list)

* T-test or ANOVA test based on the selected hypothesis

In [None]:
## One-way ANOVA across the three groups (MDDs, MDDr, PDD), one test per HRV feature.
## Each result row is [F-value, p-value], in the same order as var_list.
mdd_stat_result = []
for feature in var_list:
    group_values = [group[feature].values for group in (mdds, mddr, pdd)]
    f_val, p_val = stats.f_oneway(*group_values)
    mdd_stat_result.append([f_val, p_val])

In [None]:
# ## Comparing 2 groups (MDDs, MDDr) with t-test
# mdd_stat_result =[]
# for va in var_list:
#     a = mdds[va].values
#     b = mddr[va].values
#     t_val , p_val = stats.ttest_ind((a), (b))
#     mdd_stat_result.append([t_val, p_val])

In [None]:
mdd_stat_result

In [None]:
mdd_stat_result_df = pd.DataFrame (mdd_stat_result, columns = ['F-value', 'p-value'])
# mdd_stat_result_df = pd.DataFrame (mdd_stat_result, columns = ['t-value', 'p-value'])

In [None]:
mdd_stat_result_df = mdd_stat_result_df.assign(HRV_feature=var_list)

In [None]:
mdd_stat_result_df

## Stats on Bipolar groups (BP I, BP II)

In [None]:
## Bipolar subset: rows whose main_dx is BP_I or BP_II, reduced to label + 17 HRV features.
is_bipolar = hrv['main_dx'].isin(['BP_I', 'BP_II'])
hrv_bp = hrv[is_bipolar]
bp_columns = [
    'main_dx', 'sdnn', 'psi', 'tp', 'vlf', 'lf', 'hf', 'lfnorm', 'hfnorm', 'lf_hf',
    'rmssd', 'apen', 'srd', 'tsrd', 'tp_ln', 'vlf_ln', 'lf_ln', 'hf_ln',
]
bp_analysis = hrv_bp.loc[:, bp_columns]

In [None]:
## One dataframe per bipolar type, used as the two t-test groups.
bp_dx = bp_analysis['main_dx']
bp_I = bp_analysis[bp_dx == 'BP_I']
bp_II = bp_analysis[bp_dx == 'BP_II']

In [None]:
var_list = ['sdnn', 'tp', 'vlf', 'lf', 'hf', 'lfnorm', 'hfnorm', 'lf_hf', 'rmssd', 'apen', 'srd', 'tsrd', 'tp_ln', 'vlf_ln', 'lf_ln', 'hf_ln']

In [None]:
## Independent two-sample t-test (BP_I vs BP_II), one test per HRV feature.
## Each result row is [t-value, p-value], in the same order as var_list.
bp_stat_result = []
for feature in var_list:
    t_val, p_val = stats.ttest_ind(bp_I[feature].values, bp_II[feature].values)
    bp_stat_result.append([t_val, p_val])

In [None]:
## Tabulate the t-test results and tag each row with its HRV feature name.
bp_stat_result_df = (
    pd.DataFrame(bp_stat_result, columns=['t-value', 'p-value'])
    .assign(HRV_feature=var_list)
)

In [None]:
bp_stat_result_df

* The statistical differences in HRV features between MDDs and MDDr are not significant. \
-> Can machine learning methods produce a clinically meaningful result?

# ==========================

# Machine Learning approaches

## Pytorch MLP

In [None]:
class Args:
    """Run settings for the PyTorch MLP section, held as plain class attributes."""

    epochs = 50
    bs = 16  # batch size handed to the DataLoaders
    lr = 0.001
    momentum = 0.9
    num_classes = 2
    verbose = 'store_true'
    seed = 674  # value used to seed the random number generators


args = Args()

# Seed NumPy, Python's `random` module and PyTorch with the same value.
for seed_rng in (np.random.seed, random.seed, torch.manual_seed):
    seed_rng(args.seed)

In [None]:
hrv.shape

In [None]:
is_bp = hrv['main_dx'].isin(['BP_I', 'BP_II'])
bp = hrv[is_bp]  ## rows whose main_dx is BP_I or BP_II
bp_lab = bp['main_dx']  ## label variable (BP_I or BP_II)
bp_feature_cols = [
    'sdnn', 'tp', 'vlf', 'lf', 'hf', 'lfnorm', 'hfnorm', 'lf_hf',
    'rmssd', 'apen', 'srd', 'tsrd', 'tp_ln', 'vlf_ln', 'lf_ln', 'hf_ln',
]
bp_hrv = bp.loc[:, bp_feature_cols]  ## the 16 HRV features used here ('psi' is left out)

In [None]:
bp_lab

In [None]:
print("Bipolar Shape is:", bp.shape) ## bipolar disorder selection from overall dataset
print("Bipolar HRV Shape is: ", bp_hrv.shape) ## overall 16 hrv variables

* The psi feature is removed because it contains some string data, which causes an error.

In [None]:
# bp_hrv.dtypes
bp_lab.dtypes

In [None]:
## Min-max scale every HRV column, writing the result back into bp_hrv.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(bp_hrv)
bp_hrv[:] = scaler.transform(bp_hrv)

In [None]:
## Split X and Y into training dataset and test dataset
x_train, x_test, y_train, y_test = train_test_split(bp_hrv, bp_lab, test_size = 0.2, random_state = 42)

In [None]:
# Convert the labels to one-hot vectors.
# to_categorical needs integer class ids, so the 'BP_I' / 'BP_II' strings are
# mapped to 0 / 1 first. num_classes is passed explicitly so both arrays get
# two columns even if one split happens to contain a single class.
bp_class_ids = {'BP_I': 0, 'BP_II': 1}
y_train = to_categorical(y_train.map(bp_class_ids), num_classes=len(bp_class_ids))
y_test = to_categorical(y_test.map(bp_class_ids), num_classes=len(bp_class_ids))

In [None]:
print("x_train dataset shape is", x_train.shape)
print("y_train dataset shape is", y_train.shape)

print("x_test dataset shape is", x_test.shape)
print("y_test dataset shape is", y_test.shape)

In [None]:
## Converting the splits into numpy arrays.
## x_* are dataframes; y_* are already numpy arrays once to_categorical has run
## (they have no .to_numpy method), so np.asarray is used, which accepts both.
x_train_np = x_train.to_numpy()
y_train_np = np.asarray(y_train)
x_test_np = x_test.to_numpy()
y_test_np = np.asarray(y_test)

In [None]:
## Counting the number of samples per class in the train and test labels.
## One-hot encoded labels are collapsed back to class ids first; counting the
## raw one-hot array would just count its 0s and 1s.
for caption, encoded in (("Train labels: ", y_train), ("\nTest labels: ", y_test)):
    class_ids = np.argmax(encoded, axis=1) if np.ndim(encoded) > 1 else np.asarray(encoded)
    unique, counts = np.unique(class_ids, return_counts=True)
    print(caption, dict(zip(unique, counts)))

In [None]:
## Wrap each (features, labels) ndarray pair in a TensorDataset.
train_dataset = TensorDataset(*(torch.tensor(array) for array in (x_train_np, y_train_np)))
test_dataset = TensorDataset(*(torch.tensor(array) for array in (x_test_np, y_test_np)))

In [None]:
## Batch loaders for training and evaluation; only the training loader shuffles.
loader_kwargs = dict(batch_size=args.bs, num_workers=4)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

## KERAS

### Setting arguments

In [None]:
class Args:
    """Run settings for the Keras section, held as plain class attributes."""

    epochs = 1000  # passed to model.fit as `epochs`
    bs = 128  # passed to model.fit as `batch_size`
    lr = 0.0001
    momentum = 0.9  # passed to the SGD optimizer
#     num_classes=3
    verbose = 'store_true'
    seed = 710674  # value used to seed the random number generators


args = Args()

# Seed every random number generator used in this section.
np.random.seed(args.seed)
random.seed(args.seed)
torch.manual_seed(args.seed)
# TensorFlow keeps its own RNG; without this the Keras weight initialisation
# and dropout differ from run to run.
tf.random.set_seed(args.seed)

### Three classes classification

In [None]:
mdd_analysis #contains 3 different disorder(PDD, MDDr, MDDs)

In [None]:
## Feature matrix and label vector for the 3-class problem.
## Every feature is coerced to numeric first: 'psi' is known to hold some
## string entries, which would make the float32 cast fail. Rows with a
## non-numeric or missing feature are dropped from both x_data and label so
## the two stay aligned and no NaN reaches the network.
features = mdd_analysis.drop(columns=['main_dx']).apply(pd.to_numeric, errors='coerce')
complete_rows = features.notna().all(axis=1)
x_data = features.loc[complete_rows].to_numpy(dtype=np.float32)

label = mdd_analysis.loc[complete_rows, 'main_dx']

In [None]:
## Encode the diagnosis strings as integer class ids.
## Values that are not a known diagnosis string (e.g. already-encoded ids when
## the cell is re-run) pass through unchanged, and the explicit int64 cast
## guarantees a numeric dtype instead of relying on replace() downcasting.
dx_codes = {'MDDs': 0, 'MDDr': 1, 'PDD': 2}
label = label.map(lambda dx: dx_codes.get(dx, dx)).astype('int64')

In [None]:
label.value_counts()

In [None]:
num_classes = 3
y_data = to_categorical((label), num_classes)
print(y_data[0])

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2, random_state = 7)

In [None]:
x_train.shape

In [None]:
y_train.shape

In [None]:
## Generate the model: fully connected network for the 3-class problem.
model = Sequential()
model.add(Dense(64, input_dim = x_train.shape[1], activation = 'relu'))
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.3)) #drop out
model.add(Dense(32, activation = 'relu'))
model.add(Dense(6, activation = 'relu'))
# Softmax, not sigmoid: the targets are one-hot (exactly one class per sample)
# and the loss is categorical cross-entropy, which expects a probability
# distribution over the classes.
model.add(Dense(3, activation = 'softmax'))

In [None]:
model.summary()

In [None]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
model.fit(x_train, y_train, epochs = args.epochs, batch_size = args.bs, verbose=0)

In [None]:
scores = model.evaluate(x_test, y_test)
print("%s: %.2f%%" %(model.metrics_names[1], scores[1]*100))

### What if two classes?

In [None]:
## Two-class subset: rows whose main_dx is MDDs or MDDr, reduced to label + 17 HRV features.
is_mdd_rs = hrv['main_dx'].isin(['MDDs', 'MDDr'])
hrv_mdd_rs = hrv[is_mdd_rs]
mdd_rs_columns = [
    'main_dx', 'sdnn', 'psi', 'tp', 'vlf', 'lf', 'hf', 'lfnorm', 'hfnorm', 'lf_hf',
    'rmssd', 'apen', 'srd', 'tsrd', 'tp_ln', 'vlf_ln', 'lf_ln', 'hf_ln',
]
mdd_rs_analysis = hrv_mdd_rs.loc[:, mdd_rs_columns]

In [None]:
## Feature matrix and label vector for the 2-class (MDDs vs MDDr) problem.
## Every feature is coerced to numeric first: 'psi' is known to hold some
## string entries, which would make the float32 cast fail. Rows with a
## non-numeric or missing feature are dropped from both x_data and label so
## the two stay aligned and no NaN reaches the models.
features = mdd_rs_analysis.drop(columns=['main_dx']).apply(pd.to_numeric, errors='coerce')
complete_rows = features.notna().all(axis=1)
x_data = features.loc[complete_rows].to_numpy(dtype=np.float32)

label = mdd_rs_analysis.loc[complete_rows, 'main_dx']

In [None]:
## Encode the diagnosis strings as integer class ids.
## The explicit int64 cast matters here: `label` is later passed directly to
## LogisticRegression, which rejects object-dtype targets.
dx_codes = {'MDDs': 0, 'MDDr': 1}
label = label.map(lambda dx: dx_codes.get(dx, dx)).astype('int64')

In [None]:
num_classes = 2
y_data = to_categorical((label), num_classes)
print(y_data[0])

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2, random_state = 7)

In [None]:
## Generate the model: fully connected network for the 2-class problem.
model = Sequential()
model.add(Dense(32, input_dim = x_train.shape[1], activation = 'relu'))
model.add(Dense(18, activation = 'relu'))
model.add(Dense(6, activation = 'relu'))
# Softmax, not sigmoid: the targets are one-hot, so the two outputs must form a
# single probability distribution rather than two independent probabilities.
model.add(Dense(2, activation = 'softmax'))

In [None]:
model.summary()

In [None]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
model.fit(x_train, y_train, epochs = args.epochs, batch_size = args.bs, verbose = 0, shuffle=True)

In [None]:
scores = model.evaluate(x_test, y_test)
print("%s: %.2f%%" %(model.metrics_names[1], scores[1]*100))

#### Logistic Regression

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_data, label, test_size = 0.2, random_state = 7)

In [None]:
model = LogisticRegression()
model.fit(x_train, y_train)

In [None]:
# Report accuracy on both splits; the training score alone says nothing about
# how the model does on the held-out 20 %.
print("Train accuracy:", model.score(x_train, y_train))
print("Test accuracy:", model.score(x_test, y_test))

# ==========================

# Data Augmentation Approaches

## Simple - 3 classes

In [None]:
data = mdd_analysis_core

In [None]:
## Re-split by diagnosis, this time from the reduced ("core") feature set in `data`.
core_dx = data['main_dx']
mdds = data[core_dx == 'MDDs']
mddr = data[core_dx == 'MDDr']
pdd = data[core_dx == 'PDD']

In [None]:
# Draw 20 % of the MDDr rows to duplicate. The draw is seeded so the augmented
# dataset is the same on every run.
mddr_aug = mddr.sample(frac=0.2, random_state=args.seed)

In [None]:
mdd_aug = pd.concat([data, pdd,mddr_aug])

In [None]:
mdd_aug['main_dx'].value_counts()

In [None]:
mdd_aug.columns

In [None]:
mdd_aug_labels = mdd_aug.loc[:, 'main_dx']
mdd_aug_values = mdd_aug.loc[:, ['sdnn', 'tp', 'vlf', 'lfnorm', 'hfnorm', 'rmssd', 'apen', 'srd', 'tsrd']]

In [None]:
## Normalize between 0 and 1
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
mdd_aug_values[:] = scaler.fit_transform(mdd_aug_values[:])

In [None]:
mdd_aug_final = pd.concat([mdd_aug_labels, mdd_aug_values], axis=1)

In [None]:
## Feature matrix and labels for the augmented 3-class problem.
## Built from mdd_aug_values (the min-max scaled features) and its matching
## label column; building x_data from mdd_aug would discard the scaling and
## feed the raw values to the network.
x_data = np.asarray(mdd_aug_values).astype(np.float32)

label = mdd_aug_labels

In [None]:
## Encode the diagnosis strings as integer class ids.
## Values that are not a known diagnosis string (e.g. already-encoded ids when
## the cell is re-run) pass through unchanged, and the explicit int64 cast
## guarantees a numeric dtype instead of relying on replace() downcasting.
dx_codes = {'MDDs': 0, 'MDDr': 1, 'PDD': 2}
label = label.map(lambda dx: dx_codes.get(dx, dx)).astype('int64')

In [None]:
label.value_counts()

In [None]:
num_classes = 3
y_data = to_categorical((label), num_classes)
print(y_data[0])

In [None]:
## Train / test split for the augmented data.
## mdd_aug contains duplicated rows (oversampling), and each duplicate keeps
## the index label of the row it was copied from. A plain random split would
## put copies of the same row on both sides and inflate the test score, so the
## split is grouped by index label: all copies of a row land on the same side.
## NOTE(review): assumes x_data rows are in the same order as mdd_aug and that
## one index label corresponds to one patient record - confirm.
from sklearn.model_selection import GroupShuffleSplit

splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(x_data, y_data, groups=mdd_aug.index.to_numpy()))
x_train, x_test = x_data[train_idx], x_data[test_idx]
y_train, y_test = y_data[train_idx], y_data[test_idx]

In [None]:
## Generate the model: fully connected network for the augmented 3-class problem.
model = Sequential()
model.add(Dense(64, input_dim = x_train.shape[1], activation = 'relu'))
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.3)) #drop out
model.add(Dense(32, activation = 'relu'))
model.add(Dense(6, activation = 'relu'))
# Softmax, not sigmoid: the targets are one-hot (exactly one class per sample)
# and the loss is categorical cross-entropy, which expects a probability
# distribution over the classes.
model.add(Dense(3, activation = 'softmax'))

In [None]:
# SGD with Nesterov momentum.
# `lr=` and `decay=` are legacy arguments that current Keras optimizers reject,
# so the rate is passed as `learning_rate`. The old `decay=1e-6` behaviour,
# lr / (1 + decay * step), is reproduced with an InverseTimeDecay schedule.
from keras.optimizers.schedules import InverseTimeDecay

lr_schedule = InverseTimeDecay(initial_learning_rate=0.001, decay_steps=1, decay_rate=1e-6)
sgd = SGD(learning_rate=lr_schedule, momentum=args.momentum, nesterov=True)

In [None]:
model.compile(loss = 'categorical_crossentropy',
#               optimizer = 'adam', 
              optimizer = sgd,
              metrics = ['accuracy'])

In [None]:
model.fit(x_train, y_train, epochs = args.epochs, batch_size = args.bs, verbose=0)

In [None]:
scores = model.evaluate(x_test, y_test)
print("%s: %.2f%%" %(model.metrics_names[1], scores[1]*100))

# #1. Statistical (distribution-based)

## Check dataset

In [None]:
mdd_analysis #contains 3 different disorder(PDD, MDDr, MDDs)
mdd_hrv

In [None]:
mdd_analysis['main_dx'].value_counts() ## lacking PDD

In [None]:
## Normalize between 0 and 1
## 'psi' is known to hold some string entries, which the scaler cannot convert,
## so every column is coerced to numeric first (bad entries become NaN, which
## MinMaxScaler ignores when fitting and leaves as NaN when transforming).
## The result is rebuilt as a new float dataframe rather than assigned in
## place, so no column is left with object dtype.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
mdd_hrv_numeric = mdd_hrv.apply(pd.to_numeric, errors='coerce')
mdd_hrv = pd.DataFrame(
    scaler.fit_transform(mdd_hrv_numeric),
    columns=mdd_hrv_numeric.columns,
    index=mdd_hrv_numeric.index,
)

In [None]:
mdd_hrv.head()

## Check data variables distribution

In [None]:
## One named Series per HRV feature column of mdd_hrv, for plotting.
SDNN = mdd_hrv['sdnn']
PSI = mdd_hrv['psi']
TP = mdd_hrv['tp']
VLF = mdd_hrv['vlf']
LF = mdd_hrv['lf']
HF = mdd_hrv['hf']
LFNORM = mdd_hrv['lfnorm']
LFNORN = LFNORM  # original (misspelled) name, kept so existing references still work
HFNORM = mdd_hrv['hfnorm']
LFHF = mdd_hrv['lf_hf']
RMSSD = mdd_hrv['rmssd']
APEN = mdd_hrv['apen']
SRD = mdd_hrv['srd']
TSRD = mdd_hrv['tsrd']
TPLN = mdd_hrv['tp_ln']
VLFLN = mdd_hrv['vlf_ln']
LFLN = mdd_hrv['lf_ln']
HFLN = mdd_hrv['hf_ln']

In [None]:
## Overlay the kernel density estimate of nine HRV features on one figure.
plt.figure(figsize=(10, 5))
sns.set_style("whitegrid")
plt.grid(True)
plt.xlabel('Standardized Variables', fontsize=10)
plt.ylabel('Density', fontsize=10)

# Plot order must match the legend entries below.
for feature_series in (SDNN, PSI, TP, VLF, LF, HF, RMSSD, APEN, SRD):
    sns.kdeplot(feature_series)

# plt.legend()
plt.legend(['SDNN', 'PSI', 'TP', 'VLF', 'LF', 'HF', 'RMSSD', 'APEN', 'SRD'], fontsize=10)

# plt.savefig('distribution.png')

In [None]:
# Box plot with one box per column of mdd_hrv (all MDDs / MDDr / PDD rows).
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_xlabel('HRV feature variables', fontsize=10)
ax.set_ylabel('Feature value', fontsize=10)
ax.boxplot(mdd_hrv)
plt.show()

In [None]:
## HRV features of the MDDs rows, for the per-group box plot.
## Taken from mdd_analysis rather than `mdds`: `mdds` was rebound in the
## augmentation section to a frame holding only the core columns, so selecting
## 'psi', 'lf', 'hf', ... from it raises a KeyError.
## Columns are coerced to numeric because 'psi' is known to hold some strings.
box_columns = ['sdnn', 'psi', 'tp', 'vlf', 'lf', 'hf', 'lfnorm', 'hfnorm', 'lf_hf', 'rmssd', 'apen',
               'srd', 'tsrd', 'tp_ln', 'vlf_ln', 'lf_ln', 'hf_ln']
mdds_box = (
    mdd_analysis.loc[mdd_analysis['main_dx'] == 'MDDs', box_columns]
    .apply(pd.to_numeric, errors='coerce')
)

In [None]:
## Min-max scale every column of mdds_box, writing the result back in place.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(mdds_box)
mdds_box[:] = scaler.transform(mdds_box)

In [None]:
# Box plot with one box per column of mdds_box (MDDs group).
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_xlabel('HRV feature variables', fontsize=10)
ax.set_ylabel('Feature value', fontsize=10)
ax.boxplot(mdds_box)
plt.show()

In [None]:
## Normalize between 0 and 1
## The MDDr rows are taken from mdd_analysis rather than `mddr`: `mddr` was
## rebound in the augmentation section to a frame holding only the core
## columns, so selecting 'psi', 'lf', 'hf', ... from it raises a KeyError.
## Columns are coerced to numeric because 'psi' is known to hold some strings.
# from sklearn.preprocessing import MinMaxScaler
box_columns = ['sdnn', 'psi', 'tp', 'vlf', 'lf', 'hf', 'lfnorm', 'hfnorm', 'lf_hf', 'rmssd', 'apen',
               'srd', 'tsrd', 'tp_ln', 'vlf_ln', 'lf_ln', 'hf_ln']
mddr_box = (
    mdd_analysis.loc[mdd_analysis['main_dx'] == 'MDDr', box_columns]
    .apply(pd.to_numeric, errors='coerce')
)
scaler = MinMaxScaler()
mddr_box[:] = scaler.fit_transform(mddr_box)

In [None]:
# Box plot with one box per column of mddr_box (MDDr group).
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_xlabel('HRV feature variables', fontsize=10)
ax.set_ylabel('Feature value', fontsize=10)
ax.boxplot(mddr_box)
plt.show()

In [None]:
## Normalize between 0 and 1
## The PDD rows are taken from mdd_analysis rather than `pdd`: `pdd` was
## rebound in the augmentation section to a frame holding only the core
## columns, so selecting 'psi', 'lf', 'hf', ... from it raises a KeyError.
## Columns are coerced to numeric because 'psi' is known to hold some strings.
# from sklearn.preprocessing import MinMaxScaler
box_columns = ['sdnn', 'psi', 'tp', 'vlf', 'lf', 'hf', 'lfnorm', 'hfnorm', 'lf_hf', 'rmssd', 'apen',
               'srd', 'tsrd', 'tp_ln', 'vlf_ln', 'lf_ln', 'hf_ln']
pdd_box = (
    mdd_analysis.loc[mdd_analysis['main_dx'] == 'PDD', box_columns]
    .apply(pd.to_numeric, errors='coerce')
)
scaler = MinMaxScaler()
pdd_box[:] = scaler.fit_transform(pdd_box)

In [None]:
# Box plot with one box per column of pdd_box (PDD group).
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_xlabel('HRV feature variables', fontsize=10)
ax.set_ylabel('Feature value', fontsize=10)
ax.boxplot(pdd_box)
plt.show()

## Check data variables correlation coefficient

In [None]:
## Nine-feature subset used for the correlation and regression analyses,
## without and with the diagnosis label.
core_variable_cols = ['sdnn', 'psi', 'tp', 'vlf', 'lf', 'hf', 'rmssd', 'apen', 'srd']
mdd_hrv_core_variables = hrv_mdd.loc[:, core_variable_cols]
mdd_hrv_core_variables_with_label = hrv_mdd.loc[:, ['main_dx'] + core_variable_cols]

In [None]:
## Pairwise correlation matrix of the core HRV variables.
## Columns are coerced to numeric first: 'psi' is known to hold some string
## entries, and .corr() cannot work on a non-numeric column.
corr = mdd_hrv_core_variables.apply(pd.to_numeric, errors='coerce').corr()
corr

In [None]:
correlation = pd.DataFrame(corr)

In [None]:
## Heat map of the correlation matrix of the core HRV variables.
## Columns are coerced to numeric first ('psi' is known to hold some string
## entries), and the tick positions and labels are taken from the matrix
## itself so they always match what is drawn.
corr = mdd_hrv_core_variables.apply(pd.to_numeric, errors='coerce').corr()
fig = plt.figure(figsize = (10, 10))
ax = fig.add_subplot(111)
cax = ax.matshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = np.arange(len(corr.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticklabels(corr.columns)
plt.show()

## Multiple regression

In [None]:
from sklearn import linear_model
import statsmodels.api as sm

In [None]:
x = mdd_hrv_core_variables_with_label[['sdnn', 'tp', 'vlf', 'lf', 'hf', 'rmssd', 'apen', 'srd']]
y = mdd_hrv_core_variables_with_label['main_dx']

In [None]:
## Encode the diagnosis strings as integer codes for the regression target.
## The explicit int64 cast guarantees a numeric dtype for the regression
## fitters instead of relying on replace() downcasting an object column.
dx_codes = {'MDDs': 0, 'MDDr': 1, 'PDD': 2}
y = y.map(lambda dx: dx_codes.get(dx, dx)).astype('int64')

In [None]:
x.dtypes

In [None]:
# with sklearn: fit a linear regression and print its intercept and coefficients
regr = linear_model.LinearRegression().fit(x, y)

for caption, fitted_value in (('Intercept: \n', regr.intercept_),
                              ('Coefficients: \n', regr.coef_)):
    print(caption, fitted_value)

# with statsmodels: same regression, fitted with OLS to get the full summary table
x = sm.add_constant(x)  # adding a constant

model = sm.OLS(y, x).fit()
predictions = model.predict(x)

print_model = model.summary()
print(print_model)