In [None]:
"""
Recreation of l3s8 for objective data derived from digital biomarkers (hereafter referred to as objective data) provided by Empatica. This might help show whether patterns observed in the subjective data are also seen in the objective data.
"""
"""
1. Import objective data as before 
2. Organise them into 15 minute bins
3. Group these bins into categories of early morning, morning, noon (afternoon) and night. Use the same time divisions as give_binned_vals_category does to bin values
4. Check each group for normality (owing to the small number of observations, non-parametric tests are likely to be needed)
5. Conduct non-parametric (or parametric if applicable) ANOVA on the data and tabulate and visualise results
6. Do the above both per day and for all days together
"""

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy.stats import shapiro
from scipy.stats import kruskal #for independent groups
from scipy.stats import friedmanchisquare #for dependent or paired groups
import seaborn as sns

import warnings
import datetime
from datetime import datetime
import pytz

In [None]:
"""
define binning function for objective measures
"""
def give_binned_vals_obj_meas(df, obj_meas, hour, half_hour, quarter_hour, timezone, category_yn = False):
    """
    Convert one objective-measure frame to local clock time and average it per time bin.

    Parameters:
    df (pandas.DataFrame): must contain a 'timestamp_iso' column of ISO 8601 strings
        (a trailing "Z" is accepted) and the value column of `obj_meas`.
        A 'converted_timestamps' column is added to `df` in place.
    obj_meas (str): one of 'eda', 'pulse_rate', 'prv', 'resp_rate', 'temp', 'step_count',
        'acc_std', 'activity_counts', 'met', 'wearing_det'.
    hour, half_hour, quarter_hour (bool): bin-width selectors, checked in this order;
        the first truthy one wins.
    timezone (str): time zone name accepted by pytz.timezone (e.g. 'Europe/Berlin').
    category_yn (bool): when True, four 6-hour bins are used and the width selectors are ignored.

    Returns:
    tuple: (df['converted_timestamps'], x_val, y_val, bin_dict_mean)
        x_val (list of float): local clock time in hours; samples dated after the first
            sample's local date get 24 added so that they sort after the first day.
        y_val (list): the raw values of the selected measure.
        bin_dict_mean (dict): '<start>_<end>' -> NaN-ignoring mean of the samples with
            start <= x < end, or NaN when the bin is empty or holds only NaN.

    Raises:
    ValueError: if `obj_meas` is not supported or no bin width is selected.
    """
    # Value column for each supported measure.
    measure_columns = {
        'eda': 'eda_scl_usiemens',
        'pulse_rate': 'pulse_rate_bpm',
        'prv': 'prv_rmssd_ms',
        'resp_rate': 'respiratory_rate_brpm',
        'temp': 'temperature_celsius',
        'step_count': 'step_counts',
        'acc_std': 'accelerometers_std_g',
        'activity_counts': 'activity_counts',
        'met': 'met',
        'wearing_det': 'wearing_detection_percentage',
    }
    if obj_meas not in measure_columns:
        raise ValueError(f"unknown obj_meas {obj_meas!r}; expected one of {sorted(measure_columns)}")

    #only for objective measures, include the extra hour of data (if in cet)
    if category_yn:
        bin_arr = np.arange(0,25,6)
    elif hour: #hour separation
        bin_arr = np.arange(0,25)
    elif half_hour: #half hour separation
        bin_arr = np.arange(0,24.5, 0.5)
    elif quarter_hour: #quarter hour separation
        bin_arr = np.arange(0,24.25, 0.25)
    else:
        raise ValueError("no bin width selected: set one of category_yn, hour, half_hour or quarter_hour")

    req_zone = pytz.timezone(timezone)

    def from_isoutc_to_req(iso_timestamp):
        # fromisoformat does not accept a trailing "Z" on every Python version, so spell out the UTC offset
        utc_time = datetime.fromisoformat(iso_timestamp.replace("Z", "+00:00"))
        return utc_time.astimezone(req_zone)

    # Add the converted timestamps as an extra column (mutates the caller's frame, as before)
    df['converted_timestamps'] = df['timestamp_iso'].apply(from_isoutc_to_req)
    converted = df['converted_timestamps']

    # Compare full calendar dates, not day-of-month: a recording that runs from the 31st
    # into the 1st must still have its second day shifted by 24 h.
    first_date = converted.iloc[0].date() if len(converted) > 0 else None
    x_val = [
        (24 if ts.date() > first_date else 0) + ts.hour + ts.minute / 60
        for ts in converted
    ]

    y_val = df[measure_columns[obj_meas]].tolist()

    # Assign every sample to its bin in one pass. searchsorted(side='right') - 1 gives the
    # index i with bin_arr[i] <= x < bin_arr[i+1], i.e. the same half-open bins as before.
    n_bins = len(bin_arr) - 1
    binned = [[] for _ in range(n_bins)]
    positions = np.searchsorted(bin_arr, np.asarray(x_val, dtype=float), side='right') - 1
    for x, y, pos in zip(x_val, y_val, positions):
        if x >= bin_arr[-1]:
            # reported once per sample (previously once per bin)
            print("not appending value at time: ", x)
        elif 0 <= pos < n_bins:
            binned[pos].append(y)

    bin_dict_mean = {}
    for i in range(n_bins):
        key = str(bin_arr[i]) + '_' + str(bin_arr[i+1])
        values = np.array(binned[i])
        if values.size == 0 or np.all(np.isnan(values)):
            # nothing usable in this bin
            bin_dict_mean[key] = np.nan
        else:
            bin_dict_mean[key] = np.nanmean(values)

    return  df['converted_timestamps'], x_val, y_val, bin_dict_mean

In [None]:
# Folder names used when building paths below a subject's day folder.
# folder1/folder11 are joined as <mainfolder>/<subfolder>/empatica/aggr_p_min by the collection cell.
folder1, folder2 = 'empatica', 'saved_figures'

folder11, folder12, folder13 = 'aggr_p_min', 'avro_files', 'avro2csv'


In [None]:
#Collecting objective data for all
# For every day folder of the chosen subject, read each per-measure CSV found in
# <mainfolder>/<subfolder>/empatica/aggr_p_min and bin it into 15-minute means
# (quarter_hour=True) with give_binned_vals_obj_meas. Results are stored per measure in
# dictionaries keyed by the day-folder name.

mainfolder = input('enter subject folder: ')
timezone =  'Europe/Berlin' #default timezone; enter required timezone if different #'utc' this seemed to be used as per the cell: "Notes, if any". Maybe revert to this after consideration
# Per measure: *_dict_plot_x = time in hours, *_dict_plot_y = raw values, *_dict_bin = bin means
eda_dict_plot_x = {}
eda_dict_plot_y = {}
eda_dict_bin = {}
pulse_rate_dict_plot_x = {}
pulse_rate_dict_plot_y = {}
pulse_rate_dict_bin = {}
prv_dict_plot_x = {}
prv_dict_plot_y = {}
prv_dict_bin = {}
resp_rate_dict_plot_x = {}
resp_rate_dict_plot_y = {}
resp_rate_dict_bin = {}
temp_dict_plot_x = {}
temp_dict_plot_y = {}
temp_dict_bin = {}
step_dict_plot_x = {}
step_dict_plot_y = {}
step_dict_bin = {}
acc_std_dict_plot_x = {}
acc_std_dict_plot_y = {}
acc_std_dict_bin = {}
activity_dict_plot_x = {}
activity_dict_plot_y = {}
activity_dict_bin = {}
met_dict_plot_x = {}
met_dict_plot_y = {}
met_dict_bin = {}
wearing_det_dict_plot_x = {}
wearing_det_dict_plot_y = {}
wearing_det_dict_bin = {}



#in the lines below, take out "_converted_timestamp," variables after timestamp verification check
# Per measure: the converted (local time) timestamp Series returned for each day folder
eda_converted_timestamp = {}
pulse_rate_converted_timestamp = {}
prv_converted_timestamp = {}
resp_rate_converted_timestamp = {}
temp_converted_timestamp = {}
step_converted_timestamp = {}
acc_converted_timestamp = {}
activity_converted_timestamp = {}
met_converted_timestamp = {}
wearing_det_converted_timestamp = {}

#storing the dates for which the variables are recorded. Required for time-stitching
# NOTE(review): these lists are filled in os.listdir order, which Python documents as
# arbitrary, while process_time_values treats dates_list[i+1] as the day after
# dates_list[i] - confirm the folder order is chronological on the machine used.
eda_dates = []
pulse_rate_dates = []
prv_dates = []
resp_rate_dates = []
temp_dates = []
step_dates = []
acc_std_dates = []
activity_dates = []
met_dates = []
wearing_det_dates = []


for subfolder in os.listdir(mainfolder):
    # Only folders whose name ends in 'd' and that contain the aggr_p_min directory are processed
    if subfolder.endswith('d') and os.path.exists(os.path.join(mainfolder, subfolder, folder1, folder11)):
        print(subfolder)
        for file in os.listdir(os.path.join(mainfolder, subfolder, folder1, folder11)):
            # The measure is recognised by the file-name suffix. Each *_df name is rebound on
            # every matching file, so after the loop it holds only the last frame that was read.
            if file.endswith('eda.csv'):
                eda_df = pd.read_csv(os.path.join(mainfolder, subfolder, folder1, folder11, file))
                eda_converted_timestamp[subfolder], eda_dict_plot_x[subfolder], eda_dict_plot_y[subfolder], eda_dict_bin[subfolder] = give_binned_vals_obj_meas(eda_df, 'eda', False, False, True, timezone)
                eda_dates.append(subfolder)

            elif file.endswith('pulse-rate.csv'):
                pulse_rate_df = pd.read_csv(os.path.join(mainfolder, subfolder, folder1, folder11, file))
                pulse_rate_converted_timestamp[subfolder], pulse_rate_dict_plot_x[subfolder], pulse_rate_dict_plot_y[subfolder], pulse_rate_dict_bin[subfolder] = give_binned_vals_obj_meas(pulse_rate_df, 'pulse_rate', False, False, True, timezone)
                pulse_rate_dates.append(subfolder)

            elif file.endswith('prv.csv'):
                prv_df = pd.read_csv(os.path.join(mainfolder, subfolder, folder1, folder11, file))
                prv_converted_timestamp[subfolder], prv_dict_plot_x[subfolder], prv_dict_plot_y[subfolder], prv_dict_bin[subfolder] = give_binned_vals_obj_meas(prv_df, 'prv', False, False, True, timezone)
                prv_dates.append(subfolder)

            elif file.endswith('respiratory-rate.csv'):
                resp_rate_df = pd.read_csv(os.path.join(mainfolder, subfolder, folder1, folder11, file))
                resp_rate_converted_timestamp[subfolder], resp_rate_dict_plot_x[subfolder], resp_rate_dict_plot_y[subfolder], resp_rate_dict_bin[subfolder] = give_binned_vals_obj_meas(resp_rate_df, 'resp_rate', False, False, True, timezone)
                resp_rate_dates.append(subfolder)

            elif file.endswith('temperature.csv'):
                temp_df = pd.read_csv(os.path.join(mainfolder, subfolder, folder1, folder11, file))
                temp_converted_timestamp[subfolder], temp_dict_plot_x[subfolder], temp_dict_plot_y[subfolder], temp_dict_bin[subfolder] = give_binned_vals_obj_meas(temp_df, 'temp', False, False, True, timezone)
                temp_dates.append(subfolder)

            elif file.endswith('step-counts.csv'):
                step_df = pd.read_csv(os.path.join(mainfolder, subfolder, folder1, folder11, file))
                step_converted_timestamp[subfolder], step_dict_plot_x[subfolder], step_dict_plot_y[subfolder], step_dict_bin[subfolder] = give_binned_vals_obj_meas(step_df, 'step_count', False, False, True, timezone)
                step_dates.append(subfolder)

            elif file.endswith('accelerometers-std.csv'):
                acc_std_df = pd.read_csv(os.path.join(mainfolder, subfolder, folder1, folder11, file))
                acc_converted_timestamp[subfolder], acc_std_dict_plot_x[subfolder], acc_std_dict_plot_y[subfolder], acc_std_dict_bin[subfolder] = give_binned_vals_obj_meas(acc_std_df, 'acc_std', False, False, True, timezone)
                acc_std_dates.append(subfolder)

            elif file.endswith('activity-counts.csv'):
                activity_df = pd.read_csv(os.path.join(mainfolder, subfolder, folder1, folder11, file))
                activity_converted_timestamp[subfolder], activity_dict_plot_x[subfolder], activity_dict_plot_y[subfolder], activity_dict_bin[subfolder] = give_binned_vals_obj_meas(activity_df, 'activity_counts', False, False, True, timezone)
                activity_dates.append(subfolder)

            elif file.endswith('met.csv'):
                met_df = pd.read_csv(os.path.join(mainfolder, subfolder, folder1, folder11, file))
                met_converted_timestamp[subfolder], met_dict_plot_x[subfolder], met_dict_plot_y[subfolder], met_dict_bin[subfolder] = give_binned_vals_obj_meas(met_df, 'met', False, False, True, timezone)
                met_dates.append(subfolder)

            elif file.endswith('wearing-detection.csv'):
                wearing_det_df = pd.read_csv(os.path.join(mainfolder, subfolder, folder1, folder11, file))
                wearing_det_converted_timestamp[subfolder], wearing_det_dict_plot_x[subfolder], wearing_det_dict_plot_y[subfolder], wearing_det_dict_bin[subfolder] = give_binned_vals_obj_meas(wearing_det_df, 'wearing_det', False, False, True, timezone)
                wearing_det_dates.append(subfolder)
    # Positional argument order used in the calls above: (df, obj_meas, hour, half_hour, quarter_hour, timezone)
    
                

In [None]:
# Inspection cell: show the per-day 15-minute bin means of the accelerometer-std measure.
acc_std_dict_bin

In [None]:
# Same inspection repeated; per the original note it shows the output "after new code",
# presumably for a before/after comparison - TODO confirm it is still needed.
acc_std_dict_bin #after new code

In [None]:
#timestitch
#function version of the above
def process_time_values(non_cont_dates, dates_list, dict_plot_x, dict_plot_y):
    """
    Time-stitch per-date samples and compute 15-minute bin means for each date.

    Samples whose x value (time in hours) is >= 24 belong to the following calendar day.
    For each date in `dates_list`, in order:
      * if the date is listed in `non_cont_dates`, or it is the last date, those samples
        are discarded;
      * otherwise they are moved to the front of the next date in `dates_list`, with 24
        subtracted from x.

    Parameters:
    non_cont_dates (list): dates whose following entry in `dates_list` is not the very next day.
    dates_list (list): dates for the specific measure, in chronological order.
    dict_plot_x (dict): date -> list of x values (hours). Not modified.
    dict_plot_y (dict): date -> list of y values, aligned element by element with the x list. Not modified.

    Returns:
    tuple: (modified_dict_x, modified_dict_y, bin_dict_mean)
        modified_dict_x / modified_dict_y: stitched copies of the inputs.
        bin_dict_mean: date -> {'<start>_<end>': mean}, with one entry per quarter-hour bin
            from 0 to 24. The mean ignores NaN and is NaN for an empty or all-NaN bin.
    """
    # Copy each list so the caller's dictionaries stay untouched
    modified_dict_x = {k: v.copy() for k, v in dict_plot_x.items()}
    modified_dict_y = {k: v.copy() for k, v in dict_plot_y.items()}

    def _take_overflow(date):
        """Remove the samples of `date` with x >= 24 and return them as (x_list, y_list)."""
        kept_x, kept_y, over_x, over_y = [], [], [], []
        for x, y in zip(modified_dict_x[date], modified_dict_y[date]):
            if x >= 24:
                over_x.append(x)
                over_y.append(y)
            else:
                kept_x.append(x)
                kept_y.append(y)
        modified_dict_x[date] = kept_x
        modified_dict_y[date] = kept_y
        return over_x, over_y

    last_index = len(dates_list) - 1
    for i, current_date in enumerate(dates_list):
        if current_date in non_cont_dates:
            # The next entry is not the following day, so the overflow has nowhere to go
            _take_overflow(current_date)
            print(f"Warning: Discarding values >=24 for the this date {current_date} as it is listed in non_cont_dates.")
        elif i < last_index:
            next_date = dates_list[i+1]
            print(current_date, next_date)
            over_x, over_y = _take_overflow(current_date)
            if over_x:
                # Prepend so the early-morning samples come before the next day's own data
                modified_dict_x[next_date] = [x - 24 for x in over_x] + modified_dict_x[next_date]
                modified_dict_y[next_date] = over_y + modified_dict_y[next_date]
        else:
            # This is the last date, so we can't move values to the next day
            print(f"Warning: Discarding values >=24 for the last date {current_date} as there's no next day.")
            _take_overflow(current_date)

    # Quarter-hour bin edges and their '<start>_<end>' keys (identical for every date)
    bin_arr = np.arange(0, 24.25, 0.25)
    n_bins = len(bin_arr) - 1
    bin_keys = [str(bin_arr[i]) + '_' + str(bin_arr[i+1]) for i in range(n_bins)]

    bin_dict_mean = {}
    for date in dates_list:
        x_val = modified_dict_x[date]
        y_val = modified_dict_y[date]

        # searchsorted(side='right') - 1 gives the index i with bin_arr[i] <= x < bin_arr[i+1]
        binned = [[] for _ in range(n_bins)]
        positions = np.searchsorted(bin_arr, np.asarray(x_val, dtype=float), side='right') - 1
        for x, y, pos in zip(x_val, y_val, positions):
            if x >= bin_arr[-1]:
                print(f"Date {date}: not appending value at time: {x}")
            elif 0 <= pos < n_bins:
                binned[pos].append(y)

        # Calculate means for this date's bins
        date_bin_dict_mean = {}
        for key, samples in zip(bin_keys, binned):
            values = np.array(samples)
            if values.size == 0 or np.all(np.isnan(values)):
                date_bin_dict_mean[key] = np.nan
            else:
                date_bin_dict_mean[key] = np.nanmean(values)

        bin_dict_mean[date] = date_bin_dict_mean

    return modified_dict_x, modified_dict_y, bin_dict_mean

In [None]:
# Process each measure
non_cont_dates = ["09_3_24_n2_10_3_24_d"] #for asd_001

def _stitch(dates, plot_x, plot_y):
    """Time-stitch one measure with the shared non_cont_dates list."""
    return process_time_values(non_cont_dates, dates, plot_x, plot_y)

# Each call rebinds the measure's x/y dictionaries to the stitched copies and replaces
# its *_dict_bin with the bin means recomputed from the stitched data.
eda_dict_plot_x, eda_dict_plot_y, eda_dict_bin = _stitch(
    eda_dates, eda_dict_plot_x, eda_dict_plot_y)
pulse_rate_dict_plot_x, pulse_rate_dict_plot_y, pulse_rate_dict_bin = _stitch(
    pulse_rate_dates, pulse_rate_dict_plot_x, pulse_rate_dict_plot_y)
prv_dict_plot_x, prv_dict_plot_y, prv_dict_bin = _stitch(
    prv_dates, prv_dict_plot_x, prv_dict_plot_y)
resp_rate_dict_plot_x, resp_rate_dict_plot_y, resp_rate_dict_bin = _stitch(
    resp_rate_dates, resp_rate_dict_plot_x, resp_rate_dict_plot_y)
temp_dict_plot_x, temp_dict_plot_y, temp_dict_bin = _stitch(
    temp_dates, temp_dict_plot_x, temp_dict_plot_y)
step_dict_plot_x, step_dict_plot_y, step_dict_bin = _stitch(
    step_dates, step_dict_plot_x, step_dict_plot_y)
acc_std_dict_plot_x, acc_std_dict_plot_y, acc_std_dict_bin = _stitch(
    acc_std_dates, acc_std_dict_plot_x, acc_std_dict_plot_y)
activity_dict_plot_x, activity_dict_plot_y, activity_dict_bin = _stitch(
    activity_dates, activity_dict_plot_x, activity_dict_plot_y)
met_dict_plot_x, met_dict_plot_y, met_dict_bin = _stitch(
    met_dates, met_dict_plot_x, met_dict_plot_y)
wearing_det_dict_plot_x, wearing_det_dict_plot_y, wearing_det_dict_bin = _stitch(
    wearing_det_dates, wearing_det_dict_plot_x, wearing_det_dict_plot_y)

In [None]:
#need to collect each binned objective measure into separate dictionaries each containing one measure for all days 

# One entry per objective measure: variable name -> {day: {bin key: mean}}
dict_meas = {
    'eda_dict_bin': eda_dict_bin,
    'pulse_rate_dict_bin': pulse_rate_dict_bin,
    'prv_dict_bin': prv_dict_bin,
    'resp_rate_dict_bin': resp_rate_dict_bin,
    'temp_dict_bin': temp_dict_bin,
    'step_dict_bin': step_dict_bin,
    'acc_std_dict_bin': acc_std_dict_bin,
    'activity_dict_bin': activity_dict_bin,
    'met_dict_bin': met_dict_bin,
    'wearing_det_dict_bin': wearing_det_dict_bin,
}

meas = {}

# Parallel name/value lists, in the same order as dict_meas
list_name_meas = list(dict_meas.keys())
list_meas = list(dict_meas.values())

In [None]:
# Inspection cell: show the combined {measure name: {day: {bin key: mean}}} dictionary.
dict_meas

In [None]:
#From this cell onwards, borrowed from l3s8
#Group these bins into categories of early morning, morning, noon (afternoon) and night. Use the same time divisions as give_binned_vals_category does to bin values
#grouped dictionaries for every dimension
def group_bin_day_period(dim_q):
    """
    Split every day's bins into four 6-hour periods of the day.

    Parameters:
    dim_q (dict): dimension -> day -> {'<start>_<end>': value}; the bin start time is
        read from the part of the key before the first underscore.

    Returns:
    tuple: (earlyMorning, morning, afterNoon, night), each shaped dimension -> day -> {bin key: value}.
        Bins starting in [0, 6) go to earlyMorning, [6, 12) to morning, [12, 18) to
        afterNoon, and every other bin to night.
    """
    edges = np.arange(0, 25, 6)  # same 6-hour divisions as the category bins: 0, 6, 12, 18, 24
    period_names = ('earlyMorning', 'morning', 'afterNoon', 'night')
    grouped = {name: {} for name in period_names}

    for dim, days in dim_q.items():
        for name in period_names:
            grouped[name][dim] = {}
        for day, bins in days.items():
            per_period = {name: {} for name in period_names}
            for bin_key, bin_value in bins.items():
                start = float(bin_key.split('_')[0])
                if edges[0] <= start < edges[1]:
                    target = 'earlyMorning'
                elif edges[1] <= start < edges[2]:
                    target = 'morning'
                elif edges[2] <= start < edges[3]:
                    target = 'afterNoon'
                else:
                    target = 'night'
                per_period[target][bin_key] = bin_value
            for name in period_names:
                grouped[name][dim][day] = per_period[name]

    return grouped['earlyMorning'], grouped['morning'], grouped['afterNoon'], grouped['night']
                
# Split every measure's per-day bin means into the four 6-hour day periods.
earlyMorning, morning, afterNoon, night = group_bin_day_period(dict_meas)
# Last expression of the cell, so the notebook displays the `night` grouping.
night    

In [None]:
# Inspection cell: show the bins whose start time falls in [0, 6) for every measure and day.
earlyMorning

In [None]:
#Step 4 onwards requires a separate set one for per day analysis and the other for all days together 

#Check each group for normality (owing to less number of observations, likely that non-parametric tests needed)
# shapiroResults[dim][day][period] holds 'stat', 'p_val', 'normal_yes_or_no' (1 when p > 0.05, else 0)
# and 'data_length' (number of non-NaN bin means). The first three stay None whenever the
# test is not run: fewer than 3 values, zero range, or an exception.
dayPeriod = {'earlyMorning': earlyMorning, 'morning': morning, 'afterNoon': afterNoon, 'night': night}
shapiroResults = {}
for dim in dict_meas:
    shapiroResults[dim] = {}
    for day in dict_meas[dim]:
        shapiroResults[dim][day] = {}
        for period, periodDict in dayPeriod.items():
            # keep only the bins that actually have a mean
            filtered_data = {key: value for key, value in periodDict[dim][day].items() if not np.isnan(value)}
            dataValues = list(filtered_data.values())

            # start from the "not tested" record and overwrite it only after a successful test
            periodResult = {'stat': None, 'p_val': None, 'normal_yes_or_no': None, 'data_length': len(dataValues)}
            shapiroResults[dim][day][period] = periodResult

            if len(dataValues) <= 2:
                continue
            try:
                data_range = max(dataValues) - min(dataValues)
                if data_range == 0:
                    print(f"Warning: Zero range data for {dim}, {day}, {period}")
                    continue
                #when range != 0, run shapiro
                stat_eM, p_val_eM = shapiro(dataValues)
                if np.isnan(stat_eM) and not np.isnan(p_val_eM):
                    print(f"Warning: stat is nan but p val not nan for {dim}, {day}, {period}, but p val is {p_val_eM} and length of data after filteration is {len(dataValues)}")
                normal_yn = 1 if p_val_eM>0.05 else 0 #1: normal distribution, 0: not normal distribution
                periodResult['stat'] = stat_eM
                periodResult['p_val'] = p_val_eM
                periodResult['normal_yes_or_no'] = normal_yn
            except Exception as e:
                print(f"Error in Shapiro test for {dim}, {day}, {period}: {str(e)}")
        

In [None]:
# Debug cell: `dataValues` is the loop variable left over from the last (dim, day, period)
# iteration of the Shapiro cell, so this shows the sample count of that last group only.
len(dataValues)

In [None]:
# Debug cell: non-NaN bin means of the last (dim, day, period) group processed by the Shapiro cell.
filtered_data

In [None]:
# Inspection cell: per-day, per-period Shapiro results for the wearing-detection measure.
shapiroResults['wearing_det_dict_bin']

In [None]:
# Inspection cell: per-day, per-period Shapiro results for the EDA measure.
shapiroResults['eda_dict_bin']

In [None]:
# Inspection cell: the {period name: grouped data} lookup used by the Shapiro cell.
dayPeriod

In [None]:
#Conduct non-parametric (or parametric if applicable) ANOVA on the data and tabulate and visualise results
#FRIEDMAN
# Per measure and per day: Friedman test across the day periods, treating the bins of a
# period as repeated observations (row k pairs the k-th bin of every period).
# npAnovaResults[dim][day] holds 'stat', 'p_val', 'groups' (periods that entered the test)
# and 'final data length' (rows left after cleaning); stat/p_val are None when the test is skipped.
npAnovaResults = {}
for dim in dict_meas:
    npAnovaResults[dim] = {}
    for day in dict_meas[dim]:
        npAnovaResults[dim][day] = {}
        periodDaily = {'earlyMorning': list(earlyMorning[dim][day].values()), 'morning': list(morning[dim][day].values()), 'afterNoon': list(afterNoon[dim][day].values()), 'night': list(night[dim][day].values())}
        dfAnovaDay = pd.DataFrame(periodDaily)

        # Drop every period in which more than half of the bins are missing.
        # NaN never compares equal to anything (including itself), so `== np.nan` counted
        # nothing and no column was ever dropped; isna() is the correct test.
        mostlyMissing = [col for col in dfAnovaDay.columns if dfAnovaDay[col].isna().sum() > 0.5 * len(dfAnovaDay)]
        dfAnovaDay = dfAnovaDay.drop(columns=mostlyMissing)

        # Coerce to numeric, then drop every row that has a missing value in any remaining period
        dfAnovaDay = dfAnovaDay.apply(pd.to_numeric, errors='coerce')
        dfAnovaDay = dfAnovaDay.dropna()

        # Require at least 3 groups and more than 2 complete rows before running the test.
        # After dropna() all columns have the same number of values, so no further length
        # checks are needed.
        # NOTE(review): an error was previously seen at the call below for asd_001 (possibly
        # the EDA measure); it presumably came from too few groups reaching the test - TODO confirm.
        if len(dfAnovaDay.columns) >= 3 and len(dfAnovaDay) > 2:
            stat, pVal = friedmanchisquare(*[dfAnovaDay[col] for col in dfAnovaDay.columns])
            npAnovaResults[dim][day]['stat'] = stat
            npAnovaResults[dim][day]['p_val'] = pVal
        else:
            npAnovaResults[dim][day]['stat'] = None
            npAnovaResults[dim][day]['p_val'] = None
        npAnovaResults[dim][day]['groups'] = list(dfAnovaDay.columns)
        npAnovaResults[dim][day]['final data length'] = len(dfAnovaDay)

            

In [None]:
# Debug cell: leftovers from the last iteration of the per-day Friedman loop.
# Only the final expression (`day`) is displayed by the notebook; the first two lines
# are evaluated but produce no output.
dfAnovaDay
dim
day

In [None]:
from scipy.stats import wilcoxon
from itertools import combinations
try:
    from scikit_posthocs import posthoc_nemenyi_friedman
except ImportError:
    print("scikit-posthocs not installed. Only Wilcoxon test will be available.")

In [None]:
#all day npANOVA
#friedman

def friedman_posthoc_with_viz(df, dim_name):
    """
    Perform and visualize post-hoc analysis after a significant Friedman test.

    Runs a Wilcoxon signed-rank test for every pair of columns, flags pairs whose p-value
    is below the Bonferroni-corrected level (0.05 / number of pairs), draws a box plot of
    the groups next to a heatmap of the pairwise p-values, and prints a short summary.

    Parameters:
    df (pandas.DataFrame): columns are the groups, rows are the paired observations.
    dim_name (str): name of the dimension being analyzed (used in titles and the summary).

    Returns:
    tuple: (results, fig)
        results: {'wilcoxon': {'<g1> vs <g2>': {'statistic', 'p_value', 'significant'}}}
        fig: the matplotlib Figure holding both panels (it is also the current figure).

    Raises:
    ValueError: if `df` has fewer than two columns.
    """
    results = {}

    # 1. Wilcoxon with Bonferroni correction
    groups = list(df.columns)
    if len(groups) < 2:
        # with 0 or 1 group there is no pair to compare and the correction would divide by zero
        raise ValueError("friedman_posthoc_with_viz needs at least two groups (columns) to compare")
    n_comparisons = len(groups) * (len(groups) - 1) / 2
    alpha = 0.05
    bonferroni_alpha = alpha / n_comparisons

    wilcoxon_results = {}
    # Matrix to store p-values for heatmap
    p_value_matrix = np.zeros((len(groups), len(groups)))

    for i, j in combinations(range(len(groups)), 2):
        group1, group2 = groups[i], groups[j]
        try:
            stat, p_val = wilcoxon(df[group1], df[group2])
        except ValueError as err:
            # e.g. every paired difference is zero; record the pair as untestable
            # instead of aborting the whole post-hoc analysis
            print(f"Wilcoxon test skipped for {group1} vs {group2}: {err}")
            stat, p_val = np.nan, np.nan
        wilcoxon_results[f"{group1} vs {group2}"] = {
            'statistic': stat,
            'p_value': p_val,
            'significant': p_val < bonferroni_alpha  # False for NaN
        }
        # Fill both sides of the matrix for the heatmap
        p_value_matrix[i, j] = p_val
        p_value_matrix[j, i] = p_val

    results['wilcoxon'] = wilcoxon_results

    # Visualization: explicit figure/axes instead of the implicit pyplot state
    fig, (ax_box, ax_heat) = plt.subplots(1, 2, figsize=(12, 5))

    # 1. Box plot
    sns.boxplot(data=df, ax=ax_box)
    ax_box.set_title(f'Distribution of Values Across Groups\n{dim_name}')
    ax_box.tick_params(axis='x', rotation=45)

    # 2. Heatmap of p-values
    mask = np.triu(np.ones_like(p_value_matrix, dtype=bool))  # mask upper triangle (and diagonal)
    sns.heatmap(p_value_matrix,
                mask=mask,
                xticklabels=groups,
                yticklabels=groups,
                annot=True,  # Show numbers
                fmt='.3f',   # Format to 3 decimal places
                cmap='RdYlBu_r',
                vmin=0,
                vmax=0.05,
                ax=ax_heat)
    ax_heat.set_title('Pairwise Comparison p-values\n(significant if < {:.3f})'.format(bonferroni_alpha))
    ax_heat.tick_params(axis='x', rotation=45)
    fig.tight_layout()

    # Print interpretation
    print(f"\nResults Interpretation for {dim_name}:")
    print("=" * 50)
    print(f"Bonferroni-corrected significance level: {bonferroni_alpha:.4f}")
    print("\nSignificant differences found between:")
    significant_pairs = [
        f"- {pair} (p={result['p_value']:.4f})"
        for pair, result in wilcoxon_results.items()
        if result['significant']
    ]
    if significant_pairs:
        print("\n".join(significant_pairs))
    else:
        print("No significant differences found after Bonferroni correction")

    return results, fig  # Return both results and figure

#end of func


# All days together: per measure, pool the bins of every day into one column per day
# period and run a Friedman test across the periods, followed by pairwise post-hoc tests
# when it is significant.
npAnovaResultsAllDays = {}

# Explicit period name -> grouped data lookup (replaces eval() on the period name)
periodLookup = {'earlyMorning': earlyMorning, 'morning': morning, 'afterNoon': afterNoon, 'night': night}

for dim in dict_meas:  # Iterate over dimensions
    # Initialize storage for this dimension
    npAnovaResultsAllDays[dim] = {}

    # Aggregate data across all days for each period
    periodAllDays = {period: [] for period in periodLookup}
    for day in dict_meas[dim]:
        for period, periodData in periodLookup.items():
            periodAllDays[period].extend(list(periodData[dim][day].values()))

    #Create a DataFrame for all days combined
    dfAnovaAllDays = pd.DataFrame(periodAllDays)

    # Drop periods where >50% of the data is missing. NaN never compares equal to itself,
    # so `== np.nan` counted nothing; isna() is the correct test.
    mostlyMissing = [col for col in dfAnovaAllDays.columns if dfAnovaAllDays[col].isna().sum() > 0.5 * len(dfAnovaAllDays)]
    dfAnovaAllDays = dfAnovaAllDays.drop(columns=mostlyMissing)

    # Drop rows where any remaining period is missing
    dfAnovaAllDays = dfAnovaAllDays.dropna()

    # Friedman needs at least 3 groups; also require more than 2 complete rows,
    # matching the guard of the per-day analysis
    if len(dfAnovaAllDays.columns) >= 3 and len(dfAnovaAllDays) > 2:
        stat, pVal = friedmanchisquare(*[dfAnovaAllDays[col] for col in dfAnovaAllDays.columns])
        if pVal < 0.05:  # If Friedman test is significant
            posthoc_results, fig  = friedman_posthoc_with_viz(dfAnovaAllDays, dim)
            # NOTE(review): the file name says "subjective" although these are objective
            # measures - kept unchanged so existing outputs keep their names; confirm intent.
            fig.savefig(os.path.join(mainfolder, f"{dim}_all_day_subjective_dim_corr_post_hoc.png"), bbox_inches='tight', dpi=300)
            plt.show()
            plt.close(fig)

            npAnovaResultsAllDays[dim]['posthoc'] = posthoc_results
        else:
            npAnovaResultsAllDays[dim]['posthoc'] = []
        # Store results
        npAnovaResultsAllDays[dim]['stat'] = stat
        npAnovaResultsAllDays[dim]['p_val'] = pVal
    else:
        # Insufficient data for test
        npAnovaResultsAllDays[dim]['stat'] = None
        npAnovaResultsAllDays[dim]['p_val'] = None
    npAnovaResultsAllDays[dim]['groups'] = list(dfAnovaAllDays.columns)
    npAnovaResultsAllDays[dim]['final data length'] = len(dfAnovaAllDays)


In [None]:
# After timestitch:
# all-day non-parametric ANOVA (npANOVA) across the time-of-day periods,
# using the Friedman test (paired groups)

def friedman_posthoc_with_viz(df, dim_name):
    """
    Run and visualise pairwise post-hoc tests after a significant Friedman test.

    Every pair of columns in ``df`` is compared with a Wilcoxon signed-rank
    test. Significance is judged against a Bonferroni-corrected level of
    0.05 divided by the number of pairs. A two-panel figure is drawn (box plot
    of the columns, lower-triangle heatmap of the uncorrected pairwise
    p-values) and a short text summary is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per group. Columns are passed pairwise to
        ``scipy.stats.wilcoxon``, so rows are treated as paired observations.
    dim_name : str
        Name of the dimension being analysed; used in the box-plot title and
        in the printed summary.

    Returns
    -------
    results : dict
        ``{'wilcoxon': {"<g1> vs <g2>": {'statistic', 'p_value', 'significant'}}}``
    fig : matplotlib.figure.Figure
        The figure holding both panels. It is left open (and is the current
        pyplot figure) so the caller can save, show and close it.

    Raises
    ------
    ValueError
        If ``df`` has fewer than two columns (there is no pair to compare).
    """
    # Function-scope imports: the notebook's top import cell does not bring
    # these two names in, so importing here keeps the function runnable on a
    # fresh kernel. They are harmless if the names are already imported.
    from itertools import combinations
    from scipy.stats import wilcoxon

    results = {}

    # 1. Wilcoxon with Bonferroni correction
    groups = list(df.columns)
    if len(groups) < 2:
        # Without this guard the Bonferroni division below fails with a bare
        # ZeroDivisionError.
        raise ValueError("friedman_posthoc_with_viz needs at least two columns to compare")

    n_comparisons = len(groups) * (len(groups) - 1) / 2
    alpha = 0.05
    bonferroni_alpha = alpha / n_comparisons

    wilcoxon_results = {}
    # Symmetric matrix of uncorrected p-values for the heatmap
    p_value_matrix = np.zeros((len(groups), len(groups)))

    for i, j in combinations(range(len(groups)), 2):
        group1, group2 = groups[i], groups[j]
        stat, p_val = wilcoxon(df[group1], df[group2])
        wilcoxon_results[f"{group1} vs {group2}"] = {
            'statistic': stat,
            'p_value': p_val,
            'significant': p_val < bonferroni_alpha
        }
        # Fill both sides of the matrix for the heatmap
        p_value_matrix[i, j] = p_val
        p_value_matrix[j, i] = p_val

    results['wilcoxon'] = wilcoxon_results

    # Visualization: explicit figure/axes instead of the pyplot state machine.
    # plt.subplots makes the new figure current, so callers that use
    # plt.savefig()/plt.show() afterwards still act on this figure.
    fig, (ax_box, ax_heat) = plt.subplots(1, 2, figsize=(12, 5))

    # Panel 1: box plot of each group
    sns.boxplot(data=df, ax=ax_box)
    ax_box.set_title(f'Distribution of Values Across Groups\n{dim_name}')
    ax_box.tick_params(axis='x', labelrotation=45)

    # Panel 2: heatmap of p-values. The upper triangle and the diagonal are
    # masked, so each pair is shown exactly once.
    mask = np.triu(np.ones_like(p_value_matrix, dtype=bool))
    sns.heatmap(p_value_matrix,
                mask=mask,
                xticklabels=groups,
                yticklabels=groups,
                annot=True,  # Show numbers
                fmt='.3f',   # Format to 3 decimal places
                # 'RdYlBu' runs red (low) -> blue (high), so small p-values are
                # red. The previous 'RdYlBu_r' reversed this and coloured the
                # significant cells blue, contradicting the intended
                # "red = significant" reading.
                cmap='RdYlBu',
                vmin=0,
                vmax=alpha,
                ax=ax_heat)
    ax_heat.set_title('Pairwise Comparison p-values\n(significant if < {:.3f})'.format(bonferroni_alpha))
    ax_heat.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()

    # Print interpretation
    print(f"\nResults Interpretation for {dim_name}:")
    print("=" * 50)
    print(f"Bonferroni-corrected significance level: {bonferroni_alpha:.4f}")
    print("\nSignificant differences found between:")
    significant_pairs = [
        f"- {pair} (p={result['p_value']:.4f})"
        for pair, result in wilcoxon_results.items()
        if result['significant']
    ]
    if significant_pairs:
        print("\n".join(significant_pairs))
    else:
        print("No significant differences found after Bonferroni correction")

    return results, fig  # Return both results and figure

#end of func


# All-day Friedman test per dimension: pool every day's values into the four
# time-of-day periods, remove missing data, test, and run the post-hoc
# analysis when the test is significant. Results are collected in
# npAnovaResultsAllDays[dim].
npAnovaResultsAllDays = {}

# Per-period containers, looked up by name. This explicit mapping replaces the
# previous eval(period) lookup; each container is indexed as [dim][day] below.
periodSourcesAllDays = {
    'earlyMorning': earlyMorning,
    'morning': morning,
    'afterNoon': afterNoon,
    'night': night,
}

for dim in dict_meas:  # Iterate over dimensions
    # Initialize storage for this dimension
    npAnovaResultsAllDays[dim] = {}

    # Aggregate data across all days for each period
    periodAllDays = {period: [] for period in periodSourcesAllDays}
    for day in dict_meas[dim]:
        for period, periodSource in periodSourcesAllDays.items():
            # Collect data from all days into the corresponding period
            periodAllDays[period].extend(periodSource[dim][day].values())

    # Create a DataFrame for all days combined (one column per period).
    # pandas requires the four period lists to have the same length here.
    dfAnovaAllDays = pd.DataFrame(periodAllDays)

    # Drop columns where >50% of the data is NaN. isna() is required:
    # `== np.nan` is always False (NaN never compares equal to anything),
    # so the previous comparison never dropped a column.
    mostlyNanCols = dfAnovaAllDays.isna().sum() > 0.5 * len(dfAnovaAllDays)
    dfAnovaAllDays = dfAnovaAllDays.loc[:, ~mostlyNanCols]

    # Drop rows where any remaining value is NaN, so every group keeps the
    # same paired observations.
    dfAnovaAllDays = dfAnovaAllDays.dropna(axis=0, how='any')

    # The Friedman test needs at least 3 groups (columns); an empty table
    # (every row removed above) is also treated as insufficient data.
    if len(dfAnovaAllDays.columns) >= 3 and len(dfAnovaAllDays) > 0:
        stat, pVal = friedmanchisquare(*[dfAnovaAllDays[col] for col in dfAnovaAllDays.columns])
        if pVal < 0.05:  # If Friedman test is significant
            posthoc_results, fig = friedman_posthoc_with_viz(dfAnovaAllDays, dim)
            # NOTE(review): the file name says "subjective" although this
            # notebook analyses objective data - confirm before renaming, as
            # downstream steps may expect this exact path.
            fig.savefig(os.path.join(mainfolder, f"timestitch_{dim}_all_day_subjective_dim_corr_post_hoc.png"), bbox_inches='tight', dpi=300)
            plt.show()
            plt.close(fig)

            npAnovaResultsAllDays[dim]['posthoc'] = posthoc_results
        else:
            npAnovaResultsAllDays[dim]['posthoc'] = []
        # Store results
        npAnovaResultsAllDays[dim]['stat'] = stat
        npAnovaResultsAllDays[dim]['p_val'] = pVal
    else:
        # Insufficient data for test
        npAnovaResultsAllDays[dim]['stat'] = None
        npAnovaResultsAllDays[dim]['p_val'] = None

    # Recorded in both cases: which groups survived and how many complete rows
    npAnovaResultsAllDays[dim]['groups'] = list(dfAnovaAllDays.columns)
    npAnovaResultsAllDays[dim]['final data length'] = len(dfAnovaAllDays)


In [None]:
# Tabulate the all-day results. npAnovaResultsAllDays is a dict of dicts, so
# pandas uses the outer keys (dimensions) as columns and the inner keys
# ('stat', 'p_val', 'groups', ...) as rows.
df_AnovaResultsAllDays = pd.DataFrame(npAnovaResultsAllDays)

In [None]:
# Show the all-day results table as this cell's output.
df_AnovaResultsAllDays

In [None]:
# Save the all-day results table as an Excel file inside mainfolder.
df_AnovaResultsAllDays.to_excel(os.path.join(mainfolder, 'all_day_time_distributions_objective.xlsx'))