In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import os
from datetime import datetime
import matplotlib.dates as mdates
from scipy.stats import zscore


import statsmodels.api as sm
import statsmodels.formula.api as smf


In [2]:
# E4 sampling rate (samples per second).
FS = 4

# Frequencies handed to the Morlet transform: 23, 24, 25 followed by
# 6*24, 7*24 and 8*24.
FREQ_ARRAY = np.concatenate((np.arange(23, 26), np.arange(6 * 24, 9 * 24, 24)))

# Sample period as a pandas offset string ("0.2500S" for FS = 4); used to
# find missing samples on a datetime index.
PANDAS_RESAMPLE_RATE = format(1 / FS, '.4f') + 'S'

# Signal modality; spliced into the data-folder paths and output file names.
MODALITY = 'eda'


In [3]:
# HDRS thresholds. NOTE(review): RCI presumably stands for "reliable change
# index" -- confirm; none of the three is referenced in the cells shown here.
HDRS_17_CUTOFF, HDRS_24_CUTOFF = 20, 10
RCI_HDRS_17 = 6

# Column of the per-subject label CSV that is used as the outcome.
HDRS_variant = 'hamd_17_score'

# Folder holding the '<subject>_HDRS.csv' label files (relative path).
LABEL_FOLDER_PATH = 'labels/'

# NOTE: absolute, machine-specific locations; adjust when running elsewhere.
DATA_FOLDER_PATH = '/media/bayesian-posterior/sdc/sensecode_data/' + MODALITY + '/'
DATA_FOLDER = os.fsencode(DATA_FOLDER_PATH)

# The saved CWT tables live in a sub-folder of the modality's data folder.
SAVE_CWT_FOLDER = DATA_FOLDER_PATH + 'cwt_' + MODALITY + '_all_freq/'

In [10]:
def get_x_y(df,
            subject,
            HDRS_df,
            HDRS_variant,
            test_freq = str(1.0*24)):
    """Pair every in-range HDRS assessment with a median daily power value.

    The `test_freq` column of `df` is first reduced to one median per
    calendar day. For each assessment whose timestamp lies between the first
    and the last daily bin, the median of the daily values from the
    assessment's date up to AND INCLUDING the next assessment's date is
    taken; for the final assessment the window runs to the end of the
    recording.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime index (it must support ``resample('D')``) and
        a column named `test_freq`.
    subject : str
        Not used by the computation; kept so existing callers keep working.
    HDRS_df : pandas.DataFrame
        Must provide a 'Date' column of '%Y-%m-%d %H:%M:%S' strings and a
        column named `HDRS_variant`. Rows are presumably in chronological
        order -- TODO confirm against the label files.
    HDRS_variant : str
        Name of the score column to read from `HDRS_df`.
    test_freq : str
        Name of the column of `df` to analyse (default ``'24.0'``).

    Returns
    -------
    x : list of float
        One median power per retained assessment. Days without samples are
        ignored; the value is NaN only if the whole window has no data.
    y : list
        The matching `HDRS_variant` scores, in the same order as `x`.

    Raises
    ------
    ValueError
        If a 'Date' entry does not match '%Y-%m-%d %H:%M:%S'.
    """
    date_format = '%Y-%m-%d %H:%M:%S'

    # One value per calendar day; days without any sample become NaN.
    daily_power = df[test_freq].resample('D').median()
    daily_power.index = daily_power.index.tz_localize(None)

    x, y = [], []
    HDRS_date = HDRS_df['Date']
    n_assessments = len(HDRS_date)

    # Nothing to pair when either side is empty (previously an IndexError).
    if n_assessments == 0 or daily_power.empty:
        return x, y

    day_bins = daily_power.index.to_pydatetime()
    # Both bounds are bin labels, i.e. midnight of the first / last recorded
    # day, so an assessment later on the last day falls outside the range.
    E4_start_date, E4_end_date = day_bins[0], day_bins[-1]

    for idx in range(n_assessments):
        stamp = HDRS_date.iloc[idx]
        assessed_at = datetime.strptime(stamp, date_format)
        if not (E4_start_date <= assessed_at <= E4_end_date):
            continue

        left = stamp.split(' ')[0]
        if idx + 1 < n_assessments:
            # Label slicing is inclusive, so the day of the next assessment
            # belongs to this window as well.
            right = HDRS_date.iloc[idx + 1].split(' ')[0]
            segment = daily_power.loc[left : right]
        else:
            # Final assessment: open-ended window to the end of the recording.
            segment = daily_power.loc[left : ]

        # Series.median skips NaN, so a single day without data no longer
        # turns the whole window into NaN (np.median propagates NaN).
        x.append(segment.median())
        y.append(HDRS_df[HDRS_variant].iloc[idx])

    return x, y

In [11]:
# Accumulators shared by all subjects; the three lists stay index-aligned.
dummy_hdrs_list, dummy_power_list, dummy_subject_list = [], [], []

for file in os.listdir(SAVE_CWT_FOLDER):

    filename = os.fsdecode(file)
    if not filename.endswith("cwt.h5"):
        continue  # only the saved CWT tables are processed

    # The subject ID is the part of the file name before the first underscore.
    subject = filename.split('_')[0]
    eda_filepath = SAVE_CWT_FOLDER + filename
    df_cwt = pd.read_hdf(eda_filepath, 'df')

    # Rows with any missing value are dropped (e.g. a missed HDRS assessment);
    # reset_index() keeps the previous row labels as an extra 'index' column.
    HDRS_file_name = LABEL_FOLDER_PATH + subject + '_HDRS.csv'
    HDRS_df = pd.read_csv(HDRS_file_name).dropna().reset_index()

    x, y = get_x_y(df_cwt,
                   subject=subject,
                   HDRS_df=HDRS_df,
                   HDRS_variant=HDRS_variant)

    dummy_hdrs_list.extend(y)
    dummy_power_list.extend(x)
    dummy_subject_list.extend([subject] * len(y))


# Long-format table: one row per retained assessment.
df_lmm = pd.DataFrame({
    HDRS_variant: dummy_hdrs_list,
    'power': dummy_power_list,
    'subject': dummy_subject_list,
})

In [12]:
# Write the assembled table next to the notebook, named after the modality.
lmm_df_name = 'lmm_{}_data.csv'.format(MODALITY)

df_lmm.to_csv(lmm_df_name, index=False)

In [13]:
# df_lmm = pd.read_csv(lmm_df_name)  # optional: reload the table saved above instead of rebuilding it

In [14]:
# Display the assembled table: one row per retained assessment, with the
# score column, the median power and the subject ID.
df_lmm

Unnamed: 0,hamd_17_score,power,subject
0,14.0,0.199428,SP41
1,11.0,9.240672,SP41
2,12.0,0.023355,SP41
3,11.0,110.412889,SP41
4,13.0,381.214353,SP41
...,...,...,...
415,11.0,298.595250,SP31
416,4.0,31.556355,SP31
417,1.0,63.312018,SP31
418,1.0,130.695109,SP31


In [15]:
# Mixed linear model of the HDRS score on power, with the observations
# grouped by subject.
function = ' ~ '.join([HDRS_variant, 'power'])

md = smf.mixedlm(formula=function,
                 data=df_lmm,
                 groups=df_lmm['subject'])

# Fit with the L-BFGS optimiser and print the text summary.
mdf = md.fit(method=["lbfgs"])
print(mdf.summary())

           Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: hamd_17_score
No. Observations: 420     Method:             REML         
No. Groups:       35      Scale:              15.4244      
Min. group size:  7       Log-Likelihood:     -1229.2996   
Max. group size:  13      Converged:          Yes          
Mean group size:  12.0                                     
------------------------------------------------------------
             Coef.   Std.Err.    z     P>|z|  [0.025  0.975]
------------------------------------------------------------
Intercept    12.999     0.918  14.166  0.000  11.201  14.798
power        -0.000     0.001  -0.288  0.773  -0.002   0.001
Group Var    27.703     1.871                               

