# Modules

In [1]:
import datetime
import json
import numpy as np
import os
import pandas as pd

from IPython.display import clear_output
from scipy.stats import spearmanr, wilcoxon
from sklearn.metrics import roc_curve, auc

from pdathome.constants import classifiers, columns, descriptives, parameters, \
    participant_ids, paths, updrs_3_map, arm_labels_rename
from pdathome.evaluation import calculate_sens, calculate_spec

from paradigma.windowing import create_segments, discard_segments, categorize_segments

# Constants

In [2]:
# Free-living activity labels that are collapsed into one aggregate label
# when reporting specificity per activity.
_TRANSITION_LABELS = (
    'Lie-to-sit',
    'Lie-to-stand',
    'Sit-to-lie',
    'Sit-to-stand (low chair/couch)',
    'Sit-to-stand (normal chair)',
    'Stand-to-lie',
    'Stand-to-sit (low chair/couch)',
    'Stand-to-sit (normal chair)',
)
_STAIR_LABELS = (
    'Walking downstairs',
    'Walking upstairs',
)

# Maps a raw activity label to its aggregate label; labels that are not
# listed here are kept as-is by the code that applies this mapping.
d_map_activities = {
    **dict.fromkeys(_TRANSITION_LABELS, 'Transitioning'),
    **dict.fromkeys(_STAIR_LABELS, 'Walking the stairs'),
}

# Maps the numeric segment category (1-4) to a readable duration name.
d_map_segment_duration = dict(
    enumerate(('short', 'moderately_long', 'long', 'very_long'), start=1)
)

# Load

In [27]:
# Load the per-subject gait detection results. The cells below read keys such as
# 'segment_duration', 'size' -> 'gait_s' and the gait detection classifier names,
# which is the structure written to output/gait_detection/performance.json by the
# "Create dict of results" section of this notebook (the previous path pointed at
# the arm_activity output).
with open(os.path.join(paths.PATH_RAW_DATA, 'output', 'gait_detection', 'performance.json'), 'r') as f:
    d_performance = json.load(f)

# Descriptives

In [7]:
# Table: mean (SD) number of minutes of data per population / medication stage,
# split by activity (gait vs. non-gait) and, for gait, by segment duration category.
# Only the most affected side and the selected gait detection classifier are used.

# (population, medication stage) combinations to report; controls have no
# medication stage, so the controls label is used for both keys.
l_pop_med_stage = [
    (descriptives.PARKINSONS_DISEASE, descriptives.PRE_MED),
    (descriptives.PARKINSONS_DISEASE, descriptives.POST_MED),
    (descriptives.CONTROLS, descriptives.CONTROLS),
]

# Collect one record per (subject, activity, segment duration) and build the
# frame once; this avoids quadratic row-wise concatenation and keeps 'minutes'
# numeric instead of object dtype.
l_size_records = []

for population, med_stage in l_pop_med_stage:
    if population == descriptives.PARKINSONS_DISEASE:
        l_ids = participant_ids.L_PD_IDS
    else:
        l_ids = participant_ids.L_HC_IDS

    for subject in l_ids:
        d_subject_stage = d_performance[population][subject][classifiers.GAIT_DETECTION_CLASSIFIER_SELECTED][descriptives.MOST_AFFECTED_SIDE][med_stage]

        # Gait minutes per segment duration category
        for segment_duration, d_segment in d_subject_stage['segment_duration'].items():
            l_size_records.append({
                'pop': population,
                'med_stage': med_stage,
                'id': subject,
                'activity': 'gait',
                'segment_duration': segment_duration,
                'minutes': d_segment['minutes'],
            })

        # Total gait / non-gait time; stored in seconds, reported in minutes
        for activity in ['gait', 'non_gait']:
            l_size_records.append({
                'pop': population,
                'med_stage': med_stage,
                'id': subject,
                'activity': activity,
                'segment_duration': 'total',
                'minutes': d_subject_stage['size'][f'{activity}_s'] / 60,
            })

df_size = pd.DataFrame(
    l_size_records,
    columns=['pop', 'med_stage', 'id', 'activity', 'segment_duration', 'minutes'],
)

# Mean and (sample) standard deviation across subjects
df_size_grouped = df_size.groupby(['pop', 'med_stage', 'activity', 'segment_duration'])['minutes'].agg(['mean', 'std']).reset_index()

df_size_grouped['minutes_agg'] = df_size_grouped.apply(lambda x: f"{np.round(x['mean'],1)} ({np.round(x['std'],1)})", axis=1)

# For PD, fold the medication stage into the population label
df_size_grouped['pop'] = df_size_grouped.apply(lambda x: f"{x['pop']} {x['med_stage']}" if x['pop'] == descriptives.PARKINSONS_DISEASE else x['pop'], axis=1)

df_size_grouped = df_size_grouped.drop(columns=['mean', 'std', 'med_stage']).reset_index(drop=True)

# Ordered categoricals fix the row order of the table. The population labels
# are derived from the same constants used to build them above, so the
# categories cannot drift from the labels.
l_pop_order = [
    f"{descriptives.PARKINSONS_DISEASE} {descriptives.PRE_MED}",
    f"{descriptives.PARKINSONS_DISEASE} {descriptives.POST_MED}",
    descriptives.CONTROLS,
]
df_size_grouped['pop'] = pd.Categorical(df_size_grouped['pop'], l_pop_order)
df_size_grouped['segment_duration'] = pd.Categorical(df_size_grouped['segment_duration'], ['short', 'moderately_long', 'long', 'very_long', 'total'])

df_size_grouped = df_size_grouped.sort_values(['pop', 'segment_duration']).reset_index(drop=True)

df_size_grouped.columns = ['Population', 'Activity', 'Segment duration', 'Mean (SD) number of minutes']

df_size_grouped.set_index(['Population', 'Activity', 'Segment duration'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Mean (SD) number of minutes
Population,Activity,Segment duration,Unnamed: 3_level_1
PD pre,gait,short,0.9 (0.5)
PD pre,gait,moderately_long,1.8 (0.8)
PD pre,gait,long,1.9 (0.9)
PD pre,gait,very_long,9.2 (5.7)
PD pre,gait,total,13.7 (5.8)
PD pre,non_gait,total,78.2 (15.2)
PD post,gait,short,0.5 (0.3)
PD post,gait,moderately_long,1.2 (0.5)
PD post,gait,long,1.1 (0.7)
PD post,gait,very_long,8.6 (5.4)


# General performance

In [8]:
# Table: mean (SD) sensitivity, specificity and AUC across subjects, per gait
# detection classifier and population / medication stage (most affected side).
# NOTE(review): np.std uses its default population SD here, whereas the
# descriptives table uses pandas' .std (sample SD) - confirm this is intended.

l_pop_med_stage = [
    (descriptives.PARKINSONS_DISEASE, descriptives.PRE_MED),
    (descriptives.PARKINSONS_DISEASE, descriptives.POST_MED),
    (descriptives.CONTROLS, descriptives.CONTROLS),
]

l_performance_rows = []

for gd_model in [classifiers.LOGISTIC_REGRESSION, classifiers.RANDOM_FOREST]:
    for pop, med_stage in l_pop_med_stage:
        is_pd = pop == descriptives.PARKINSONS_DISEASE
        l_subjects = participant_ids.L_PD_IDS if is_pd else participant_ids.L_HC_IDS
        pop_medstage = f"{pop}_{med_stage}" if is_pd else pop

        # Per-subject results for this classifier / medication stage
        l_stage_results = [
            d_performance[pop][subject][gd_model][descriptives.MOST_AFFECTED_SIDE][med_stage]
            for subject in l_subjects
        ]

        d_metric_values = {
            'Balanced accuracy': [(res['sens'] + res['spec'])/2 for res in l_stage_results],
            'Sensitivity': [res['sens'] for res in l_stage_results],
            'Specificity': [res['spec'] for res in l_stage_results],
            'AUC': [res['auc'] for res in l_stage_results],
        }

        d_row = {'Classifier': gd_model, 'Population': pop_medstage}
        for metric_name, values in d_metric_values.items():
            d_row[metric_name] = f"{np.round(np.mean(values),2)} ({np.round(np.std(values),2)})"

        l_performance_rows.append(d_row)

df_performance = pd.DataFrame(
    l_performance_rows,
    columns=['Classifier', 'Population', 'Balanced accuracy', 'Sensitivity', 'Specificity', 'AUC'],
)

# Paired comparison (Wilcoxon signed-rank) of the balanced accuracy of both
# classifiers within the PD group, separately per medication stage.
for med_stage in [descriptives.PRE_MED, descriptives.POST_MED]:
    d_bacc = {}
    for clf in [classifiers.RANDOM_FOREST, classifiers.LOGISTIC_REGRESSION]:
        l_clf_results = [
            d_performance[descriptives.PARKINSONS_DISEASE][subject][clf][descriptives.MOST_AFFECTED_SIDE][med_stage]
            for subject in participant_ids.L_PD_IDS
        ]
        d_bacc[clf] = [(res['sens'] + res['spec'])/2 for res in l_clf_results]

    p_value = wilcoxon(d_bacc[classifiers.RANDOM_FOREST], d_bacc[classifiers.LOGISTIC_REGRESSION])[1]
    print(f"*{med_stage}-med* random forest vs. logistic regression balanced accuracy: p = {p_value}")

df_performance.pivot(index='Population', columns='Classifier', values=['Sensitivity', 'Specificity', 'AUC']).sort_index(ascending=False)

*pre-med* random forest vs. logistic regression balanced accuracy: p = 3.337860107421875e-05
*post-med* random forest vs. logistic regression balanced accuracy: p = 0.0012617111206054688


Unnamed: 0_level_0,Sensitivity,Sensitivity,Specificity,Specificity,AUC,AUC
Classifier,logreg,rf,logreg,rf,logreg,rf
Population,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
PD_pre,0.85 (0.09),0.92 (0.06),0.93 (0.04),0.92 (0.04),0.97 (0.02),0.98 (0.01)
PD_post,0.91 (0.06),0.97 (0.03),0.9 (0.06),0.88 (0.07),0.98 (0.01),0.98 (0.01)
HC,0.93 (0.03),0.98 (0.01),0.82 (0.09),0.77 (0.1),0.97 (0.02),0.97 (0.01)


### Create dict of results

In [3]:
# Build d_output: for every subject, gait detection performance (sensitivity,
# specificity, AUC) per classifier, per affected side and per medication stage,
# further broken down by gait segment duration, activity, arm activity and
# (for subjects with tremor labels) tremor presence. PD subjects additionally
# get their UPDRS part III scores per side.

# Gap threshold (seconds) passed to create_segments when splitting gait into segments
gap_threshold_s = 1.5

l_models = [classifiers.LOGISTIC_REGRESSION, classifiers.RANDOM_FOREST]

d_output = {
    descriptives.PARKINSONS_DISEASE: {},
    descriptives.CONTROLS: {},
}

df_patient_info = pd.read_pickle(os.path.join(paths.PATH_CLINICAL_DATA, 'df_patient_info_updrs_3.pkl'))
df_patient_info = df_patient_info.loc[df_patient_info['record_id'].isin(participant_ids.L_PD_IDS)].reset_index(drop=True)
# NOTE(review): age and disease duration are computed relative to the current
# year, so they change with the date on which the notebook is run.
df_patient_info['age'] = datetime.datetime.now().year - df_patient_info['year_of_birth']
df_patient_info['years_since_diagnosis'] = datetime.datetime.now().year - df_patient_info['year_diagnosis']
df_patient_info['gender'] = df_patient_info['gender'].apply(lambda x: 'male' if x==1 else 'female')

for col in ['age', 'years_since_diagnosis']:
    df_patient_info[col] = df_patient_info[col].astype(int)

# Classification thresholds do not depend on the subject: read them once per model
d_thresholds = {}
for model in l_models:
    with open(os.path.join(paths.PATH_THRESHOLDS, 'gait', f'{model}_threshold.txt'), 'r') as f:
        d_thresholds[model] = float(f.read())

for subject in participant_ids.L_PD_IDS + participant_ids.L_HC_IDS:
    is_pd = subject in participant_ids.L_PD_IDS
    has_tremor_labels = subject in participant_ids.L_TREMOR_IDS

    # Per-subject results. Deliberately NOT named d_performance, so the results
    # dict loaded from JSON and used by the descriptive cells is not overwritten.
    d_subject_performance = {}

    if is_pd:
        most_affected_watch_side = 'right' if subject in participant_ids.L_PD_MOST_AFFECTED_RIGHT else 'left'
        df_subject_info = df_patient_info.loc[df_patient_info['record_id']==subject]

        d_subject_performance['updrs'] = {}
        for med_stage, med_prefix in zip([descriptives.PRE_MED, descriptives.POST_MED], ['OFF', 'ON']):
            d_subject_performance['updrs'][med_stage] = {}
            for side in ['right', 'left']:
                if side == most_affected_watch_side:
                    affected_side = descriptives.MOST_AFFECTED_SIDE
                else:
                    affected_side = descriptives.LEAST_AFFECTED_SIDE

                updrs_3_hypokinesia_stage_cols = [f'{med_prefix}_{x}' for x in updrs_3_map[side]['hypokinesia'].keys()]
                updrs_3_stage_cols = updrs_3_hypokinesia_stage_cols + [f'{med_prefix}_{x}' for x in updrs_3_map[side]['tremor'].keys()]

                # float(): numpy integer scalars are not JSON serialisable
                d_subject_performance['updrs'][med_stage][affected_side] = {
                    'subscore': float(np.sum(df_subject_info[updrs_3_hypokinesia_stage_cols], axis=1).values[0]),
                    'total': float(np.sum(df_subject_info[updrs_3_stage_cols], axis=1).values[0])
                }

    for model in l_models:

        d_subject_performance[model] = {}

        clf_threshold = d_thresholds[model]

        # predictions
        df_predictions = pd.read_pickle(os.path.join(paths.PATH_GAIT_PREDICTIONS, model, f'{subject}.pkl'))

        # TEMPORARY
        df_predictions = df_predictions.rename(columns={'watch_side': 'side'})

        # PREPROCESS DATA

        # binarise the predicted gait probability using the classifier threshold
        df_predictions.loc[df_predictions[columns.PRED_GAIT_PROBA]>=clf_threshold, columns.PRED_GAIT] = 1
        df_predictions.loc[df_predictions[columns.PRED_GAIT_PROBA]<clf_threshold, columns.PRED_GAIT] = 0

        # boolean for gait
        df_predictions.loc[df_predictions[columns.FREE_LIVING_LABEL]=='Walking', 'gait_boolean'] = 1
        df_predictions.loc[df_predictions[columns.FREE_LIVING_LABEL]!='Walking', 'gait_boolean'] = 0

        if not is_pd:
            # controls have no medication stage; use the controls label instead
            df_predictions[columns.PRE_OR_POST] = descriptives.CONTROLS
        else:
            # boolean for arm swing
            df_predictions.loc[df_predictions[columns.ARM_LABEL]=='Gait without other behaviours or other positions', 'arm_swing_boolean'] = 1
            df_predictions.loc[df_predictions[columns.ARM_LABEL]!='Gait without other behaviours or other positions', 'arm_swing_boolean'] = 0
            # strip the trailing space of this label before renaming
            df_predictions.loc[df_predictions[columns.ARM_LABEL]=='Holding an object behind ', columns.ARM_LABEL] = 'Holding an object behind'
            # rename non-missing arm labels; missing labels stay missing
            df_predictions[columns.ARM_LABEL] = df_predictions.loc[~df_predictions[columns.ARM_LABEL].isna(), columns.ARM_LABEL].apply(lambda x: arm_labels_rename[x])

        # PROCESS DATA

        for affected_side in [descriptives.MOST_AFFECTED_SIDE, descriptives.LEAST_AFFECTED_SIDE]:
            df_side = df_predictions.loc[df_predictions[columns.SIDE]==affected_side]

            if has_tremor_labels:
                # time series with tremor labels, used below for the tremor analysis
                df_ts = pd.read_pickle(os.path.join(paths.PATH_GAIT_FEATURES, f'{subject}_{affected_side}_ts.pkl'))

                df_ts = df_ts.explode(column=[columns.TIME, columns.FREE_LIVING_LABEL, columns.ARM_LABEL, columns.TREMOR_LABEL])
                df_ts = df_ts.drop_duplicates(subset=[columns.TIME, columns.FREE_LIVING_LABEL, columns.PRE_OR_POST, columns.ARM_LABEL, columns.TREMOR_LABEL])
                df_ts = df_ts.loc[df_ts[columns.PRE_OR_POST].isin([descriptives.PRE_MED, descriptives.POST_MED])]

                df_ts.loc[df_ts[columns.ARM_LABEL]=='Holding an object behind ', columns.ARM_LABEL] = 'Holding an object behind'
                df_ts[columns.ARM_LABEL] = df_ts.loc[~df_ts[columns.ARM_LABEL].isna(), columns.ARM_LABEL].apply(lambda x: arm_labels_rename[x])

            # overall performance for this side (all medication stages combined)
            fpr, tpr, _ = roc_curve(y_true=np.array(df_side['gait_boolean']), y_score=np.array(df_side[columns.PRED_GAIT_PROBA]), pos_label=1)
            roc = auc(fpr, tpr)

            d_side = {
                'sens': calculate_sens(df=df_side, pred_colname=columns.PRED_GAIT, true_colname='gait_boolean'),
                'spec': calculate_spec(df=df_side, pred_colname=columns.PRED_GAIT, true_colname='gait_boolean'),
                'auc': roc
            }
            d_subject_performance[model][affected_side] = d_side

            # add the medication stage from the raw dataframe if the predictions lack it
            if is_pd and columns.PRE_OR_POST not in df_side.columns:
                df_raw = pd.read_pickle(os.path.join(paths.PATH_DATAFRAMES, f'{subject}_{affected_side}.pkl'))
                df_side = pd.merge(left=df_side, right=df_raw[[columns.TIME, columns.PRE_OR_POST]], how='left', on=[columns.TIME])

            for med_stage in df_side[columns.PRE_OR_POST].unique():
                df_med_stage = df_side.loc[df_side[columns.PRE_OR_POST]==med_stage].copy()

                fpr, tpr, _ = roc_curve(y_true=np.array(df_med_stage['gait_boolean']), y_score=np.array(df_med_stage[columns.PRED_GAIT_PROBA]), pos_label=1)
                roc = auc(fpr, tpr)

                # d_stage is the same object stored in d_subject_performance, so
                # everything added to it below ends up in the output
                d_stage = {
                    'sens': calculate_sens(df=df_med_stage, pred_colname=columns.PRED_GAIT, true_colname='gait_boolean'),
                    'spec': calculate_spec(df=df_med_stage, pred_colname=columns.PRED_GAIT, true_colname='gait_boolean'),
                    'auc': roc,
                    'size': {
                        # number of rows divided by the sampling frequency gives seconds
                        'gait_s': df_med_stage.loc[df_med_stage['gait_boolean']==1].shape[0] / parameters.DOWNSAMPLED_FREQUENCY,
                        'non_gait_s': df_med_stage.loc[df_med_stage['gait_boolean']==0].shape[0] / parameters.DOWNSAMPLED_FREQUENCY,
                    }
                }
                d_side[med_stage] = d_stage

                # make segments and segment duration categories of annotated gait
                df_gait = df_med_stage.loc[df_med_stage[columns.FREE_LIVING_LABEL]=='Walking'].copy()

                df_gait[columns.SEGMENT_NR] = create_segments(
                    df=df_gait,
                    time_column_name=columns.TIME,
                    segment_column_name=columns.SEGMENT_NR,
                    gap_threshold_s=gap_threshold_s
                )

                df_gait[columns.SEGMENT_CAT] = categorize_segments(
                    df=df_gait,
                    segment_nr_colname=columns.SEGMENT_NR,
                    sampling_frequency=parameters.DOWNSAMPLED_FREQUENCY,
                )

                df_gait[columns.SEGMENT_CAT] = df_gait[columns.SEGMENT_CAT].apply(lambda x: d_map_segment_duration[x])

                # sensitivity and minutes of data per segment duration category
                d_stage['segment_duration'] = {}
                for segment_duration in df_gait[columns.SEGMENT_CAT].unique():
                    df_segments_cat = df_gait.loc[df_gait[columns.SEGMENT_CAT]==segment_duration]

                    d_segment = {
                        'sens': calculate_sens(df=df_segments_cat, pred_colname=columns.PRED_GAIT, true_colname='gait_boolean'),
                        'minutes': df_segments_cat.shape[0]/parameters.DOWNSAMPLED_FREQUENCY/60,
                    }
                    d_stage['segment_duration'][segment_duration] = d_segment

                    if is_pd:
                        d_segment['arm_activities'] = {}

                        for arm_label in df_segments_cat[columns.ARM_LABEL].unique():
                            df_arm_activity = df_segments_cat.loc[df_segments_cat[columns.ARM_LABEL]==arm_label]

                            d_segment['arm_activities'][arm_label] = {
                                # rows -> minutes, consistent with 'minutes' above
                                # (previously the raw number of rows was stored)
                                'mins': df_arm_activity.shape[0]/parameters.DOWNSAMPLED_FREQUENCY/60,
                                'sens': calculate_sens(df=df_arm_activity, pred_colname=columns.PRED_GAIT, true_colname='gait_boolean')
                            }

                # specificity per (aggregated) activity
                df_med_stage['label_agg'] = df_med_stage[columns.FREE_LIVING_LABEL].apply(lambda x: d_map_activities.get(x, x))
                d_stage['activities'] = {}

                for activity_label in df_med_stage['label_agg'].unique():
                    df_activity = df_med_stage.loc[df_med_stage['label_agg']==activity_label]
                    d_stage['activities'][activity_label] = {
                        'spec': calculate_spec(df=df_activity, pred_colname=columns.PRED_GAIT, true_colname='gait_boolean'),
                    }

                # minutes of data and sensitivity per arm activity
                if is_pd:
                    d_stage['arm_activities'] = {}

                    for arm_label in df_med_stage[columns.ARM_LABEL].unique():
                        df_arm_activity = df_med_stage.loc[df_med_stage[columns.ARM_LABEL]==arm_label]

                        d_stage['arm_activities'][arm_label] = {
                            # rows -> minutes (previously the raw number of rows was stored)
                            'mins': df_arm_activity.shape[0]/parameters.DOWNSAMPLED_FREQUENCY/60,
                            'sens': calculate_sens(df=df_arm_activity, pred_colname=columns.PRED_GAIT, true_colname='gait_boolean')
                        }

                # effect of tremor on specificity
                if has_tremor_labels:

                    # fresh copy without the helper column added above
                    df_med_stage = df_side.loc[df_side[columns.PRE_OR_POST]==med_stage].copy()

                    df_tremor = pd.merge(left=df_med_stage, right=df_ts.loc[df_ts[columns.PRE_OR_POST]==med_stage], on=[columns.TIME, columns.FREE_LIVING_LABEL, columns.PRE_OR_POST, columns.ARM_LABEL], how='left')

                    # bin the tremor annotations into tremor / no_tremor; anything else becomes NaN
                    df_tremor['tremor_label_binned'] = df_tremor[columns.TREMOR_LABEL].apply(
                        lambda x: 'tremor' if x in ['Slight or mild tremor', 'Moderate tremor', 'Severe tremor', 'Tremor with significant upper limb activity'] else
                        ('no_tremor' if x in ['No tremor', 'Periodic activity of hand/arm similar frequency to tremor', 'No tremor with significant upper limb activity'] else
                        np.nan
                        )
                    )

                    for tremor_type in [x for x in df_tremor['tremor_label_binned'].unique() if not pd.isna(x)]:
                        d_stage[f'{tremor_type}_spec'] = calculate_spec(df=df_tremor.loc[df_tremor['tremor_label_binned']==tremor_type], pred_colname=columns.PRED_GAIT, true_colname='gait_boolean')

    if is_pd:
        d_output[descriptives.PARKINSONS_DISEASE][subject] = d_subject_performance
    else:
        d_output[descriptives.CONTROLS][subject] = d_subject_performance

    print(f"Time {datetime.datetime.now()} - {subject} - Finished.")

clear_output(wait=False)

In [None]:
# Persist the per-subject gait detection results built above as JSON
path_performance_json = os.path.join(paths.PATH_RAW_DATA, 'output', 'gait_detection', 'performance.json')

with open(path_performance_json, 'w') as f_out:
    json.dump(d_output, f_out)