In [1]:
import pandas as pd
import numpy as np
import copy
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, cohen_kappa_score
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce
import scipy.stats as stats
import warnings
import sys
sys.path.append("/Users/cr591/OneDrive - University of Exeter/Desktop/pyCGM/pyCGM")
import metrics as cgm

warnings.filterwarnings('ignore')

In [2]:
# Lift pandas' display truncation so frames are rendered in full (rows and columns).
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [3]:
# Load the dataset and keep only the rows that have a 'glc' reading.
df = (
    pd.read_csv('~/OneDrive - University of Exeter/Desktop/PhD/Projects/interpolation-for-hypo-detection/dexcom-maths-exploration/data/tidy_data/interp_dataset.csv')
    .dropna(subset=['glc'])
)

In [6]:
# Glucose columns that are compared against the reference 'glc' column.
cols = [
    'cut_glc',
    'pchip',
    'linear',
    'cubicspline',
    'akima',
    *['polynomial_' + str(order) for order in (3, 5, 7)],
    'matern',
    'rq',
    'rbf',
]

In [119]:
# Round every column listed in `cols` to 2 decimal places. The reference
# 'glc' column is not in `cols`, so it keeps the precision it was loaded with.
df[cols] = df[cols].round(2)
# Reference episodes: hypoglycemic episodes detected on the 'glc' column.
# NOTE(review): no interval_size is passed here, so the pyCGM default applies;
# confirm it matches the sampling interval of the reference data.
df_results = cgm.hypoglycemic_episodes(df, 'time', 'glc', 'ID', 
                                       interpolate=False, breakdown=True)

In [None]:
def overlapping_frame(row, dataframe):
    """Return the rows of ``dataframe`` whose episode overlaps the episode in ``row``.

    Two episodes overlap when they belong to the same ``ID`` and at least one
    endpoint of either episode lies inside the other episode (bounds are
    inclusive, so episodes that merely touch count as overlapping).

    Parameters
    ----------
    row : mapping / pandas.Series
        Episode to look up; ``row['ID']``, ``row['start_time']`` and
        ``row['end_time']`` are read.
    dataframe : pandas.DataFrame
        Candidate episodes with ``ID``, ``start_time`` and ``end_time`` columns.

    Returns
    -------
    pandas.DataFrame
        The subset of ``dataframe`` (original index labels preserved) that
        overlaps ``row``; empty when nothing overlaps.
    """
    # The previous per-call debug print of the ID was removed: this function is
    # called once per reference episode and the prints flooded the cell output.
    same_id = dataframe['ID'] == row['ID']
    starts = dataframe['start_time']
    ends = dataframe['end_time']
    row_start = row['start_time']
    row_end = row['end_time']

    # One mask per "endpoint X lies inside interval Y" test.
    row_start_inside = (starts <= row_start) & (ends >= row_start)
    row_end_inside = (starts <= row_end) & (ends >= row_end)
    frame_start_inside = (row_start <= starts) & (row_end >= starts)
    frame_end_inside = (row_start <= ends) & (row_end >= ends)

    overlaps = (row_start_inside | row_end_inside
                | frame_start_inside | frame_end_inside)
    return dataframe.loc[same_id & overlaps]

In [216]:
def _episode_fields(prefix, episode):
    """Build the start/end/duration/lv2 entries of one episode under ``prefix``."""
    return {
        prefix + ' start': episode['start_time'],
        prefix + '_end': episode['end_time'],
        prefix + '_diff': episode['end_time'] - episode['start_time'],
        prefix + '_lv2': episode['lv2'],
    }


def calculate_sens(original_df, comparison_df, col, aligned=False):
    """Count true-positive, false-negative and false-positive episodes.

    Every row of ``original_df`` (reference episodes) is looked up in
    ``comparison_df`` (candidate episodes) with ``overlapping_frame``:

    * no overlapping candidate -> false negative;
    * one or more overlapping candidates -> true positive; the first
      overlapping candidate is consumed so that it cannot be matched again
      by a later reference episode.

    Candidates that are still unmatched once all reference rows have been
    processed are counted as false positives.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Reference episodes; the columns ``ID``, ``start_time``, ``end_time``
        and (when ``aligned``) ``lv2`` are read.
    comparison_df : pandas.DataFrame
        Candidate episodes with the same columns. The caller's frame is not
        modified.
    col : str
        Label stored under the ``'col'`` key of every aligned record.
    aligned : bool, default False
        When True, one dict per reference episode and per false positive is
        appended to the module-level list ``aligned_results``, which must
        exist (and still be a list) when this function is called.

    Returns
    -------
    list
        ``[true_positive, false_negative, false_positive]``.
    """
    # Work on a uniquely indexed copy: candidates are consumed by index label,
    # and a duplicated label would otherwise remove several rows at once.
    remaining = comparison_df.reset_index(drop=True)
    true_positive = 0
    false_negative = 0

    for _, reference in original_df.iterrows():
        matches = overlapping_frame(reference, remaining)
        matched = matches.shape[0] > 0

        if matched:
            true_positive += 1
            candidate = matches.iloc[0]
            remaining = remaining.drop(index=matches.index[0])
        else:
            false_negative += 1

        if aligned:
            record = {'col': col, 'ID': reference['ID'],
                      'global_start': reference['start_time']}
            record.update(_episode_fields('5_min', reference))
            if matched:
                record.update(_episode_fields('15_min', candidate))
            record['false_value'] = 0 if matched else -1
            aligned_results.append(record)

    # Whatever was never matched is a false positive.
    false_positive = remaining.shape[0]
    if aligned:
        for _, candidate in remaining.iterrows():
            record = {'col': col, 'ID': candidate['ID'],
                      'global_start': candidate['start_time']}
            # Duration comes from the candidate's own start time; it was
            # previously taken from a stale match of the loop above.
            record.update(_episode_fields('15_min', candidate))
            record['false_value'] = +1
            aligned_results.append(record)

    return [true_positive, false_negative, false_positive]

In [224]:
# Collect, per column in `cols`, the TP/FN/FP counts against the reference
# episodes: once over all episodes, once for lv2 == False and once for lv2 == True.
# `aligned_results` is filled by calculate_sens(..., aligned=True).
aligned_results = []
results_list = []
for col in cols:
    # Present the current column to pyCGM under the name 'glc'.
    sub_df = df[['time', 'ID', col]].rename(columns={col: 'glc'})

    interval_size = 15 if col == 'cut_glc' else 5

    cut_results = cgm.hypoglycemic_episodes(
        sub_df, 'time', 'glc', 'ID',
        interpolate=False,
        breakdown=True,
        interval_size=interval_size,
    )

    # all episodes (the only call that records aligned rows)
    all_counts = calculate_sens(df_results, cut_results, col, aligned=True)
    results_list.append([col, 'all'] + all_counts)

    # lv1: episodes whose lv2 flag is False
    lv1_results = cut_results[cut_results['lv2']==False].reset_index(drop=True)
    df_lv1 = df_results[df_results['lv2']==False].reset_index(drop=True)
    results_list.append([col, 'lv1'] + calculate_sens(df_lv1, lv1_results, col))

    # lv2: episodes whose lv2 flag is True
    lv2_results = cut_results[cut_results['lv2']==True].reset_index(drop=True)
    df_lv2 = df_results[df_results['lv2']==True].reset_index(drop=True)
    results_list.append([col, 'lv2'] + calculate_sens(df_lv2, lv2_results, col))

results_frame = pd.DataFrame(
    results_list,
    columns=['method', 'lv', 'TP', 'FN', 'FP'],
)
aligned_results = pd.DataFrame(aligned_results)

In [264]:
# Derive recall, precision and critical success index (TP / (TP + FP + FN))
# from the confusion counts, then order the table by level, recall and csi.
results_frame = (
    results_frame
    .assign(
        recall=lambda counts: counts['TP'] / (counts['TP'] + counts['FN']),
        prec=lambda counts: counts['TP'] / (counts['TP'] + counts['FP']),
        csi=lambda counts: counts['TP'] / (counts['TP'] + counts['FP'] + counts['FN']),
    )
    .sort_values(['lv', 'recall', 'csi'], ascending=True)
)

In [267]:
# Persist the metrics table, rounded to 3 decimals, without the index column.
results_frame.round(3).to_csv('confusion_matrix_hypos.csv', index=False)

In [35]:
# Write one CSV per method with its aligned episodes sorted by ID and start.
# Iterate over the distinct methods: looping over `.col.values` visited every
# row and rewrote each (identical) file once per aligned episode.
# The './Aligned_results/' directory must already exist.
for methods in aligned_results['col'].unique():
    method_df = aligned_results[aligned_results['col']==methods].sort_values(['ID', 'global_start'])
    name = './Aligned_results/aligned_results_' + methods + '.csv'
    method_df.to_csv(name)

In [226]:
# Preview the first 10 aligned rows, ordered by method, ID and episode start.
aligned_results.sort_values(['col', 'ID', 'global_start']).head(10)

Unnamed: 0,col,ID,global_start,5_min start,5_min_end,5_min_diff,5_min_lv2,15_min start,15_min_end,15_min_diff,15_min_lv2,false_value
1738,akima,1001_baseline,2018-01-08 19:21:00,2018-01-08 19:21:00,2018-01-08 19:46:00,0 days 00:25:00,True,2018-01-08 19:21:00,2018-01-08 19:41:00,0 days 00:20:00,False,0
1739,akima,1001_baseline,2018-01-10 00:41:00,2018-01-10 00:41:00,2018-01-10 01:31:00,0 days 00:50:00,True,2018-01-10 00:41:00,2018-01-10 01:31:00,0 days 00:50:00,True,0
1740,akima,1001_baseline,2018-01-10 13:01:00,2018-01-10 13:01:00,2018-01-10 13:16:00,0 days 00:15:00,False,2018-01-10 13:01:00,2018-01-10 13:21:00,0 days 00:20:00,False,0
1741,akima,1001_baseline,2018-01-11 03:06:00,2018-01-11 03:06:00,2018-01-11 03:26:00,0 days 00:20:00,False,2018-01-11 03:11:00,2018-01-11 03:26:00,0 days 00:15:00,False,0
1742,akima,1001_baseline,2018-01-11 04:16:00,2018-01-11 04:16:00,2018-01-11 04:56:00,0 days 00:40:00,True,2018-01-11 03:51:00,2018-01-11 04:56:00,0 days 01:05:00,True,0
1743,akima,1001_baseline,2018-01-11 06:06:00,2018-01-11 06:06:00,2018-01-11 06:21:00,0 days 00:15:00,False,NaT,NaT,NaT,,-1
1744,akima,1001_baseline,2018-01-11 11:41:00,2018-01-11 11:41:00,2018-01-11 12:11:00,0 days 00:30:00,False,2018-01-11 11:41:00,2018-01-11 12:11:00,0 days 00:30:00,False,0
1745,akima,1003_6months,2018-09-21 16:07:00,2018-09-21 16:07:00,2018-09-21 18:52:00,0 days 02:45:00,True,2018-09-21 16:02:00,2018-09-21 18:52:00,0 days 02:50:00,True,0
1746,akima,1003_6months,2018-09-22 00:07:00,2018-09-22 00:07:00,2018-09-22 00:27:00,0 days 00:20:00,False,2018-09-22 00:07:00,2018-09-22 00:27:00,0 days 00:20:00,False,0
1747,akima,1003_6months,2018-09-22 03:07:00,2018-09-22 03:07:00,2018-09-22 04:17:00,0 days 01:10:00,False,2018-09-22 03:07:00,2018-09-22 04:17:00,0 days 01:10:00,False,0


In [266]:
# Save the full aligned table (all methods), sorted, without the index column.
aligned_results.sort_values(['col', 'ID', 'global_start']).to_csv('aligned_results_rounded.csv', index=False)

## Distribution of incorrect results

In [47]:
def incorrect_substypes(df_method):
    """Count the six kinds of disagreement between the two lv2 columns.

    ``5_min_lv2`` is treated as the actual level and ``15_min_lv2`` as the
    predicted level; each is classed as lv1 (False), lv2 (True) or none (null).

    Returns a one-row DataFrame with unnamed (0..5) columns holding, in order,
    the counts for actual->predicted:
    lv1->lv2, lv2->lv1, lv1->none, none->lv1, lv2->none, none->lv2.
    """
    actual = df_method['5_min_lv2']
    predicted = df_method['15_min_lv2']

    actual_is = {
        'lv1': actual == False,
        'lv2': actual == True,
        'none': pd.isnull(actual),
    }
    predicted_is = {
        'lv1': predicted == False,
        'lv2': predicted == True,
        'none': pd.isnull(predicted),
    }

    transitions = [
        ('lv1', 'lv2'),   # FP lv2 - was actually a lv1
        ('lv2', 'lv1'),   # FN lv2 - was identified as lv1
        ('lv1', 'none'),  # FN lv1 - was not identified as hypo
        ('none', 'lv1'),  # FP lv1 - nothing there but identified as lv1
        ('lv2', 'none'),  # FN lv2 - was not identified as hypo
        ('none', 'lv2'),  # FP lv2 - nothing there but identified as lv2
    ]
    counts = [
        df_method.loc[actual_is[was] & predicted_is[seen]].shape[0]
        for was, seen in transitions
    ]
    return pd.DataFrame([counts])

In [49]:
# One row per method with the six disagreement counts from incorrect_substypes.
results = (
    aligned_results
    .groupby('col')
    .apply(incorrect_substypes)
    .unstack()
)
results.columns = [
    'lv1_lv2', 'lv2_lv1',
    'lv1_none', 'none_lv1',
    'lv2_none', 'none_lv2',
]

In [50]:
# Column names read <actual>_<predicted>: the level in the 5_min (reference)
# episode followed by the level in the 15_min (compared) episode.
results # actual_predicted

Unnamed: 0_level_0,lv1_lv2,lv2_lv1,lv1_none,none_lv1,lv2_none,none_lv2
col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
akima,8,8,12,11,0,0
cubicspline,12,7,9,13,0,0
cut_glc,0,19,28,1,3,0
linear,3,14,17,4,0,0
matern,11,6,8,14,1,0
pchip,9,10,11,8,0,0
polynomial_3,12,7,9,13,0,0
polynomial_5,12,6,8,13,0,0
polynomial_7,12,6,10,14,0,0
rbf,10,6,10,15,0,0


## Calculate person-wise stats

In [230]:
# Participant code = first four characters of the ID.
df['ID_code'] = df.ID.map(lambda participant_id: participant_id[:4])

In [231]:
# Participant code = first four characters of the ID.
aligned_results['ID_code'] = aligned_results.ID.map(lambda participant_id: participant_id[:4])

In [237]:
def number_false_values(sub_id):
    """Tally TP/FP/FN rows of an aligned-results subset, overall and per level.

    Overall counts come from ``false_value`` (0 = TP, 1 = FP, -1 = FN).
    Per-level counts compare ``5_min_lv2`` (reference) with ``15_min_lv2``
    (compared) for lv1 (flag False) and lv2 (flag True):

    * TP: both columns equal the flag;
    * FP: compared equals the flag, reference does not (null included);
    * FN: reference equals the flag, compared does not (null included).

    Returns a one-row DataFrame with columns
    TP_all, FP_all, FN_all, TPlv1, FPlv1, FNlv1, TPlv2, FPlv2, FNlv2.
    """
    outcome = sub_id.false_value
    reference = sub_id['5_min_lv2']
    compared = sub_id['15_min_lv2']

    def rows_where(mask):
        return sub_id[mask].shape[0]

    names = ['TP_all', 'FP_all', 'FN_all']
    counts = [rows_where(outcome == 0),
              rows_where(outcome == 1),
              rows_where(outcome == -1)]

    for level, flag in (('lv1', False), ('lv2', True)):
        names += ['TP' + level, 'FP' + level, 'FN' + level]
        counts += [
            rows_where((reference == flag) & (compared == flag)),
            rows_where((reference != flag) & (compared == flag)),
            rows_where((reference == flag) & (compared != flag)),
        ]

    return pd.DataFrame([counts], columns=names)

In [238]:
# Confusion counts for every (participant code, method) pair.
results_personal = (
    aligned_results
    .groupby(['ID_code', 'col'], dropna=False)
    .apply(number_false_values)
    .reset_index()
    .drop(columns='level_2')
)

In [257]:
# Long format: one row per (ID_code, col, count name) with the count in 'value'.
melted = results_personal.melt(id_vars=['ID_code', 'col'])

In [258]:
# Display the long-format table.
# NOTE(review): display.max_rows is None, so this renders every row.
melted

Unnamed: 0,ID_code,col,variable,value
0,1001,akima,TP_all,6
1,1001,cubicspline,TP_all,7
2,1001,cut_glc,TP_all,5
3,1001,linear,TP_all,6
4,1001,matern,TP_all,7
5,1001,pchip,TP_all,6
6,1001,polynomial_3,TP_all,7
7,1001,polynomial_5,TP_all,7
8,1001,polynomial_7,TP_all,7
9,1001,rbf,TP_all,7


In [263]:
# For each method and count name, how many rows of `melted` (one per
# participant code) have a non-zero value; pivoted to one row per method.
pd.DataFrame(
    melted
    .groupby(['col', 'variable'])
    .apply(lambda group: group[group.value != 0].shape[0])
).reset_index().pivot(index='col', columns='variable')

Unnamed: 0_level_0,0,0,0,0,0,0,0,0,0
variable,FN_all,FNlv1,FNlv2,FP_all,FPlv1,FPlv2,TP_all,TPlv1,TPlv2
col,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
akima,11,16,8,9,16,7,57,54,44
cubicspline,9,15,7,11,16,10,57,54,44
cut_glc,25,22,21,1,19,0,57,54,42
linear,14,16,14,4,17,2,57,54,43
matern,9,14,7,11,15,10,57,54,44
pchip,10,17,10,7,15,8,57,54,43
polynomial_3,9,15,7,11,16,10,57,54,44
polynomial_5,8,15,6,11,15,11,57,54,44
polynomial_7,10,16,6,11,15,11,57,54,44
rbf,10,15,6,11,15,9,57,54,44
