In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from matplotlib.colors import LogNorm, Normalize
import os
from datetime import datetime
import matplotlib.dates as mdates
from scipy.stats import zscore


import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.sm_exceptions import ConvergenceWarning

In [2]:
FS = 4 # E4 sampling rate

DATA_FOLDER_PATH = '/media/bayesian-posterior/sdc/sensecode_data/cwt_data/'
DATA_FOLDER = os.fsencode(DATA_FOLDER_PATH)

# DAYS_ARRAY = np.asarray([0.5, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31])
DAYS_ARRAY = np.asarray([0.5, 1, 7])
FREQ_ARRAY = 1 / (DAYS_ARRAY * 24 * 60 * 60) # for morlet
FREQ_NAMES = ["{:.1f}".format(day)+' Day(s)' for day in DAYS_ARRAY]

PANDAS_RESAMPLE_RATE = "{:.2f}".format(1/4)+'S' # find missing samples (datetime)
PANDAS_RESAMPLE_RATE

'0.25S'

In [3]:
HDRS_17_CUTOFF = 20
HDRS_24_CUTOFF = 10
RCI_HDRS_17 = 6

HDRS_variant = 'hamd_17_score'

LABEL_FOLDER_PATH = 'labels/'

In [4]:
def get_x_y(df_eda, 
            window_size,
            subject,
            HDRS_array, 
            HDRS_date):
    
    df_eda['rolling_power'] = df_eda['1.0 Day(s)'].rolling(window=window_size, center = True).max()
    resampled_df = df_eda['rolling_power'].resample('D').max()
    resampled_df.index = resampled_df.index.tz_localize(None)
    # print(resampled_df.index)
    t = resampled_df.index.to_pydatetime()
    
    assert len(HDRS_array) == len(HDRS_date)
    x, y = [], []
    
    for idx, date in HDRS_date.items():
        E4_start_date = t[0]
        E4_end_date = t[-1]

        if date[0].isdigit():
            date_in_datetime = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
            if date_in_datetime >= E4_start_date and date_in_datetime <= E4_end_date:
                # print(date)
                power = resampled_df[date.split(' ')[0]] 
                # print(power, type(power))
                
                if np.isnan(power) == False:
                    x.append(power)
                    y.append(HDRS_array[idx])
                    
    return x, y

In [5]:
dummy_hdrs_list, dummy_power_list, dummy_subject_list = [], [], []

for file in os.listdir(DATA_FOLDER):
    
    filename = os.fsdecode(file)
    
    if filename.endswith("cwt.h5"):
        
        subject = filename.split('_')[0]
        print(subject)
        eda_filepath = DATA_FOLDER_PATH+filename
        retrieved = pd.read_hdf(eda_filepath, 'df')
        
        HDRS_file_name = LABEL_FOLDER_PATH + subject + '_HDRS.csv'
        HDRS_df = pd.read_csv(HDRS_file_name)
        HDRS_array = HDRS_df[HDRS_variant].to_numpy()        
        
        
        x, y = get_x_y(retrieved, 
            window_size = '7D',
            subject = subject,
            HDRS_array = HDRS_array, 
            HDRS_date = HDRS_df['Date'])
        
        
        dummy_hdrs_list += y
        dummy_power_list += x
        dummy_subject_list += [subject] * len(y)
        
        
df_lmm = pd.DataFrame({HDRS_variant : dummy_hdrs_list, 'power' : dummy_power_list, 'subject' : dummy_subject_list})

SP41
SP71
SP70
SP36
SP43
SP22
SP61
SP15
SP3
SP40
SP34
SP69
SP29
SP60
SP6
SP65
SP24
SP16
SP33
SP45
SP59
SP27
SP55
SP2
SP47
SP21
SP1
SP12
SP11
SP5
SP44
SP30
SP28
SP72
SP31


In [6]:
df_lmm.to_csv('lmm_data_7D_mean.csv', index=False) 

In [7]:
df_lmm = pd.read_csv('lmm_data_7D_mean.csv')  

In [8]:
# df_lmm['power'] = df_lmm.groupby('subject', group_keys=False)['power'].apply(lambda x: (x - x.mean()) / x.std())


In [9]:
df_lmm

Unnamed: 0,hamd_17_score,power,subject
0,16.0,54.767736,SP41
1,14.0,51.138198,SP41
2,11.0,46.642458,SP41
3,12.0,60.781873,SP41
4,11.0,74.090799,SP41
...,...,...,...
416,6.0,119.670368,SP31
417,11.0,142.217990,SP31
418,4.0,72.876819,SP31
419,1.0,70.278987,SP31


In [10]:
function = HDRS_variant + ' ~ power'

md = smf.mixedlm(function, df_lmm, groups=df_lmm['subject'])
mdf = md.fit(method=["lbfgs"])
print(mdf.summary())

           Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: hamd_17_score
No. Observations: 421     Method:             REML         
No. Groups:       35      Scale:              15.8883      
Min. group size:  7       Log-Likelihood:     -1237.0323   
Max. group size:  13      Converged:          Yes          
Mean group size:  12.0                                     
------------------------------------------------------------
             Coef.   Std.Err.    z     P>|z|  [0.025  0.975]
------------------------------------------------------------
Intercept    13.101     0.951  13.775  0.000  11.237  14.965
power         0.001     0.001   0.776  0.438  -0.001   0.003
Group Var    26.926     1.802                               

