# Modules

In [1]:
import datetime
import json
import numpy as np
import os
import pandas as pd

from IPython.display import clear_output
from scipy.stats import spearmanr, wilcoxon
from sklearn.metrics import roc_curve, auc

from pdathome.constants import classifiers, columns, descriptives, parameters, \
    participant_ids, paths, updrs_3_map, arm_labels_rename
from pdathome.evaluation import calculate_sens, calculate_spec, generate_clinical_scores, generate_results_step

from paradigma.windowing import create_segments, discard_segments, categorize_segments

# Constants

In [2]:
# Analysis step evaluated in this notebook. It is used to build the path of the
# results file (<PATH_RAW_DATA>/output/<step>/performance.json), as the activity
# label of the per-segment rows in the descriptives table, and as the first
# argument to generate_results_step.
step = 'gait'

# Load

In [27]:
# Location of the stored performance results for this step. The file is written
# by the cells under "Create dict of results" at the end of this notebook, so on
# a fresh checkout those cells have to be run once before this one.
path_performance = os.path.join(paths.PATH_RAW_DATA, 'output', step, 'performance.json')

# Fail with an actionable message instead of a bare FileNotFoundError from open().
if not os.path.isfile(path_performance):
    raise FileNotFoundError(
        f"{path_performance} does not exist; run the cells under "
        "'Create dict of results' first to generate it."
    )

with open(path_performance, 'r') as f:
    d_performance = json.load(f)

# Descriptives

In [7]:
# Descriptives table: mean (SD) number of minutes of data across subjects, per
# population / medication stage, activity and segment duration category.

# (population, medication stage) combinations to report. For controls the
# population constant is also used as the stage key in d_performance.
l_population_med_stage = [
    (descriptives.PARKINSONS_DISEASE, descriptives.PRE_MED),
    (descriptives.PARKINSONS_DISEASE, descriptives.POST_MED),
    (descriptives.CONTROLS, descriptives.CONTROLS),
]

# Collect one record per (subject, activity, segment duration) and build the
# DataFrame once, instead of growing it with pd.concat inside the loops.
l_size_records = []

for population, med_stage in l_population_med_stage:
    if population == descriptives.PARKINSONS_DISEASE:
        l_ids = participant_ids.L_PD_IDS
    else:
        l_ids = participant_ids.L_HC_IDS

    for subject in l_ids:
        # Results of the selected gait detection classifier on the most affected side
        d_subject = d_performance[population][subject][classifiers.GAIT_DETECTION_CLASSIFIER_SELECTED][descriptives.MOST_AFFECTED_SIDE][med_stage]

        # Minutes per segment duration category, labelled with the current step
        for segment_duration, d_segment in d_subject['segment_duration'].items():
            l_size_records.append({
                'pop': population,
                'med_stage': med_stage,
                'id': subject,
                'activity': step,
                'segment_duration': segment_duration,
                'minutes': d_segment['minutes'],
            })

        # Total minutes per activity; the stored sizes are in seconds
        for activity in ['gait', 'non_gait']:
            l_size_records.append({
                'pop': population,
                'med_stage': med_stage,
                'id': subject,
                'activity': activity,
                'segment_duration': 'total',
                'minutes': d_subject['size'][f'{activity}_s']/60,
            })

df_size = pd.DataFrame(
    l_size_records,
    columns=['pop', 'med_stage', 'id', 'activity', 'segment_duration', 'minutes']
)

# Keep the aggregated column numeric rather than object dtype
df_size['minutes'] = df_size['minutes'].astype(float)

df_size_grouped = df_size.groupby(['pop', 'med_stage', 'activity', 'segment_duration'])['minutes'].agg(['mean', 'std']).reset_index()

# Format as "mean (SD)"; pandas' std uses ddof=1 (sample standard deviation)
df_size_grouped['minutes_agg'] = df_size_grouped.apply(lambda x: f"{np.round(x['mean'],1)} ({np.round(x['std'],1)})", axis=1)

# For PD, the display label combines population and medication stage
df_size_grouped['pop'] = df_size_grouped.apply(lambda x: f"{x['pop']} {x['med_stage']}" if x['pop'] == descriptives.PARKINSONS_DISEASE else x['pop'], axis=1)

df_size_grouped = df_size_grouped.drop(columns=['mean', 'std', 'med_stage']).reset_index(drop=True)

# Display order of the rows. The population labels are derived from the same
# constants used to build them above, so they cannot drift apart.
l_population_order = [
    f"{descriptives.PARKINSONS_DISEASE} {descriptives.PRE_MED}",
    f"{descriptives.PARKINSONS_DISEASE} {descriptives.POST_MED}",
    descriptives.CONTROLS,
]
l_segment_duration_order = ['short', 'moderately_long', 'long', 'very_long', 'total']

df_size_grouped['pop'] = pd.Categorical(df_size_grouped['pop'], l_population_order)
df_size_grouped['segment_duration'] = pd.Categorical(df_size_grouped['segment_duration'], l_segment_duration_order)

df_size_grouped = df_size_grouped.sort_values(['pop', 'segment_duration']).reset_index(drop=True)

df_size_grouped.columns = ['Population', 'Activity', 'Segment duration', 'Mean (SD) number of minutes']

df_size_grouped.set_index(['Population', 'Activity', 'Segment duration'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Mean (SD) number of minutes
Population,Activity,Segment duration,Unnamed: 3_level_1
PD pre,gait,short,0.9 (0.5)
PD pre,gait,moderately_long,1.8 (0.8)
PD pre,gait,long,1.9 (0.9)
PD pre,gait,very_long,9.2 (5.7)
PD pre,gait,total,13.7 (5.8)
PD pre,non_gait,total,78.2 (15.2)
PD post,gait,short,0.5 (0.3)
PD post,gait,moderately_long,1.2 (0.5)
PD post,gait,long,1.1 (0.7)
PD post,gait,very_long,8.6 (5.4)


# General performance

In [8]:
# General performance: per-classifier mean (SD) of sensitivity, specificity,
# balanced accuracy and AUC across subjects, for each population / medication
# stage, plus a paired test of random forest vs. logistic regression in PD.

def _subject_metric(pop, subject, gd_model, med_stage, metric):
    """Return one stored metric of a subject for the most affected side."""
    return d_performance[pop][subject][gd_model][descriptives.MOST_AFFECTED_SIDE][med_stage][metric]


def _subject_bacc(pop, subject, gd_model, med_stage):
    """Return the balanced accuracy: the mean of sensitivity and specificity."""
    sens = _subject_metric(pop, subject, gd_model, med_stage, 'sens')
    spec = _subject_metric(pop, subject, gd_model, med_stage, 'spec')
    return (sens + spec)/2


def _mean_sd(values):
    """Format values as 'mean (SD)', rounded to two decimals.

    np.std defaults to ddof=0 (population SD), whereas the descriptives table
    relies on pandas' std (ddof=1).
    NOTE(review): confirm which convention is intended for reporting.
    """
    return f"{np.round(np.mean(values),2)} ({np.round(np.std(values),2)})"


# Collect one record per (classifier, population) and build the DataFrame once,
# instead of growing it with pd.concat inside the loops.
l_performance_records = []

for gd_model in [classifiers.LOGISTIC_REGRESSION, classifiers.RANDOM_FOREST]:
    for (pop, med_stage) in [(descriptives.PARKINSONS_DISEASE, descriptives.PRE_MED), (descriptives.PARKINSONS_DISEASE, descriptives.POST_MED), (descriptives.CONTROLS, descriptives.CONTROLS)]:
        if pop == descriptives.PARKINSONS_DISEASE:
            l_subjects = participant_ids.L_PD_IDS
            pop_medstage = f"{pop}_{med_stage}"
        else:
            l_subjects = participant_ids.L_HC_IDS
            pop_medstage = pop

        sens = [_subject_metric(pop, x, gd_model, med_stage, 'sens') for x in l_subjects]
        spec = [_subject_metric(pop, x, gd_model, med_stage, 'spec') for x in l_subjects]
        bacc = [(x+y)/2 for x,y in zip(sens, spec)]
        auc_score = [_subject_metric(pop, x, gd_model, med_stage, 'auc') for x in l_subjects]

        l_performance_records.append({
            'Classifier': gd_model,
            'Population': pop_medstage,
            'Balanced accuracy': _mean_sd(bacc),
            'Sensitivity': _mean_sd(sens),
            'Specificity': _mean_sd(spec),
            'AUC': _mean_sd(auc_score),
        })

df_performance = pd.DataFrame(
    l_performance_records,
    columns=['Classifier', 'Population', 'Balanced accuracy', 'Sensitivity', 'Specificity', 'AUC']
)

# Paired Wilcoxon signed-rank test on per-subject balanced accuracy (PD only);
# both lists follow the order of participant_ids.L_PD_IDS, so they are paired.
for med_stage in [descriptives.PRE_MED, descriptives.POST_MED]:
    rf_bacc = [_subject_bacc(descriptives.PARKINSONS_DISEASE, x, classifiers.RANDOM_FOREST, med_stage) for x in participant_ids.L_PD_IDS]
    logreg_bacc = [_subject_bacc(descriptives.PARKINSONS_DISEASE, x, classifiers.LOGISTIC_REGRESSION, med_stage) for x in participant_ids.L_PD_IDS]

    print(f"*{med_stage}-med* random forest vs. logistic regression balanced accuracy: p = {wilcoxon(rf_bacc, logreg_bacc).pvalue}")

# One row per population, classifiers side by side for each metric
df_performance.pivot(index='Population', columns='Classifier', values=['Sensitivity', 'Specificity', 'AUC']).sort_index(ascending=False)

*pre-med* random forest vs. logistic regression balanced accuracy: p = 3.337860107421875e-05
*post-med* random forest vs. logistic regression balanced accuracy: p = 0.0012617111206054688


Unnamed: 0_level_0,Sensitivity,Sensitivity,Specificity,Specificity,AUC,AUC
Classifier,logreg,rf,logreg,rf,logreg,rf
Population,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
PD_pre,0.85 (0.09),0.92 (0.06),0.93 (0.04),0.92 (0.04),0.97 (0.02),0.98 (0.01)
PD_post,0.91 (0.06),0.97 (0.03),0.9 (0.06),0.88 (0.07),0.98 (0.01),0.98 (0.01)
HC,0.93 (0.03),0.98 (0.01),0.82 (0.09),0.77 (0.1),0.97 (0.02),0.97 (0.01)


### Create dict of results

In [1]:
# Build d_performance by calling generate_results_step for the current `step`
# with the PD and control participant id lists. The following cell writes the
# result to performance.json, which is the file the "Load" section reads.
# NOTE(review): the meaning of the positional argument 1.5 is not visible in
# this notebook - confirm against generate_results_step and consider naming it.
d_performance = generate_results_step(step, participant_ids.L_PD_IDS, participant_ids.L_HC_IDS, 1.5)

In [None]:
# Store the performance dictionary as JSON in the output folder of this step,
# at the same location the "Load" section reads it from.
path_performance_out = os.path.join(paths.PATH_RAW_DATA, 'output', step, 'performance.json')

with open(path_performance_out, 'w') as f_out:
    json.dump(d_performance, f_out)