# 3. Target creation
The objective of this notebook is to create the target variables for ML and statistical analysis.

### Target variables
The targets of interest are hypo- and hyper-glycemia. A single glucose reading in either range is counted as a positive result.
The time periods of interest for prediction are:
- during exercise
- the 1 hr after exercise
- the 4 hrs after exercise

### Features
- CGM extracted features
    - Start and end glucose
    - Metrics of glycemic control (e.g. time in range, average glucose) calculated using diametrics
    - features extracted using tsfresh, a python package that automatically calculates a large number of time series characteristics (https://tsfresh.readthedocs.io/en/latest/)
    - Time series itself
- Exercise diary data (e.g. duration of bout, type of exercise)
- Demographics data
- Lab data

### Objectives:
1. Calculate start and end glucose for each bout
2. Divide CGM data into periods of interest
3. Extract metrics of glycemic control for each period
4. Extract tsfresh metrics for each period
5. Extract time series
6. Combine extracted features into dataframes

## 3.0. Import packages

In [19]:
import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
import datetime
import os
from functools import reduce
import preprocess_helper
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute, make_forecasting_frame
from tsfresh.feature_extraction import ComprehensiveFCParameters, settings
import warnings
warnings.filterwarnings('ignore')

import sys
# Change path to wherever Diametrics is
path = "../../diametrics/diametrics" #### CHANGE
sys.path.append(path)
import metrics as cgm

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [20]:
# Define global parameters
fift_mins = timedelta(minutes=15)
thirt_mins = timedelta(minutes=30)

In [21]:
directory = '../../Data/tidy_data/'

## 3.1. Upload data

### 3.1.1. Upload EXTOD 101 data

In [22]:
# Exercise diaries: read the CSV, then parse the bout start/finish timestamps
extod_101_diaries = (
    pd.read_csv(directory + 'extod_101_diaries.csv')
    .assign(
        start_datetime=lambda d: pd.to_datetime(d['start_datetime']),
        finish_datetime=lambda d: pd.to_datetime(d['finish_datetime']),
    )
)

In [23]:
extod_101_diaries.shape[0]

776

In [24]:
# Keep only bouts whose duration is between 10 and 120 mins (both bounds inclusive)
extod_101_diaries = extod_101_diaries.loc[extod_101_diaries.duration.between(10, 120)]

In [25]:
extod_101_diaries.shape[0]

688

In [26]:
extod_101_diaries = extod_101_diaries.reset_index(drop=True)

In [39]:
# CGM data
extod_101_cgm = pd.read_csv(directory +'extod_101_cgm.csv').reset_index(drop=True)
extod_101_cgm['time'] = pd.to_datetime(extod_101_cgm['time'])

In [28]:
# Demo lab data
extod_101_demo_lab = pd.read_csv(directory + 'extod_101_demo_lab.csv')

### 3.1.2. Upload EXTOD education data

In [40]:
# Exercise diaries: read the CSV, then parse the bout start/finish timestamps
extod_edu_diaries = (
    pd.read_csv(directory + 'extod_edu_diaries.csv')
    .assign(
        start_datetime=lambda d: pd.to_datetime(d['start_datetime']),
        finish_datetime=lambda d: pd.to_datetime(d['finish_datetime']),
    )
)

In [41]:
extod_edu_diaries.shape[0]

875

In [42]:
# Set the durations of interest to 10-120 mins (both bounds inclusive)
extod_edu_diaries = extod_edu_diaries.loc[(extod_edu_diaries.duration>=10) &
                                          (extod_edu_diaries.duration<=120)]

In [43]:
extod_edu_diaries.shape[0]

804

In [44]:
extod_edu_diaries = extod_edu_diaries.reset_index(drop=True)

In [45]:
# CGM data
extod_edu_cgm = pd.read_csv(directory + 'extod_edu_cgm.csv')
extod_edu_cgm['time'] = pd.to_datetime(extod_edu_cgm['time'])
#extod_edu_cgm['id'] = str(extod_edu_cgm.ID) + extod_edu_cgm.period
# Drop the 'period' column, then drop rows with any null value and renumber the index
extod_edu_cgm = extod_edu_cgm.drop(columns='period').dropna().reset_index(drop=True)

In [47]:
# Demo lab data
extod_edu_demo_lab = pd.read_csv(directory + 'extod_edu_demo_lab.csv')

In [48]:
# Combine demo-lab data from both studies (rows stacked; original row indices are kept)
demo_lab = pd.concat([extod_101_demo_lab, extod_edu_demo_lab], axis=0)

## 3.2. Calculate start and end glucose

In [75]:
extod_101_cgm.tail()


Unnamed: 0,ID,time,glc,scan_glc
509287,3035,2018-09-05 07:57:00,,5.9
509288,3035,2018-09-05 08:48:00,8.0,
509289,3035,2018-09-05 09:03:00,7.4,
509290,3035,2018-09-05 09:19:00,7.0,
509291,3035,2018-09-05 09:31:00,,7.8


In [76]:
# NOTE(review): extod_101_cgm was already re-indexed with reset_index(drop=True)
# when it was loaded. Without drop=True this call keeps the old index as a new
# 'index' column (visible in the tail() output that follows) - confirm that the
# extra column is wanted downstream.
extod_101_cgm = extod_101_cgm.reset_index()

In [77]:
extod_101_cgm.tail()

Unnamed: 0,index,ID,time,glc,scan_glc
509287,509287,3035,2018-09-05 07:57:00,,5.9
509288,509288,3035,2018-09-05 08:48:00,8.0,
509289,509289,3035,2018-09-05 09:03:00,7.4,
509290,509290,3035,2018-09-05 09:19:00,7.0,
509291,509291,3035,2018-09-05 09:31:00,,7.8


In [86]:
def calc_glc(df, time, window, libre=False):
    '''
    Find the glucose reading closest to, but strictly before, ``time``.

    Parameters
    ----------
    df : pandas.DataFrame
        CGM readings with a datetime 'time' column and a 'glc' column (plus a
        'scan_glc' column when ``libre`` is True). Rows are assumed to be in
        chronological order; the index labels may be arbitrary (e.g. a slice
        of a larger frame filtered by participant).
    time : datetime-like
        Reference time, e.g. the start of an exercise bout.
    window : int or float
        Look-back window in minutes. Only readings with
        ``time - window <= reading time < time`` are considered.
    libre : bool, default False
        If True, readings in the window with neither 'glc' nor 'scan_glc' are
        ignored and a missing 'glc' is filled from 'scan_glc'.

    Returns
    -------
    tuple
        ``(glc, prec_reading_glc)``. ``glc`` is the value of the reading in
        the window closest to ``time``. ``prec_reading_glc`` is the 'glc' of
        the row immediately before that reading in ``df`` when the two rows
        are less than 60 minutes apart, otherwise NaN. Both are NaN when the
        window holds no usable reading.
    '''
    time = pd.to_datetime(time)
    # Use a positionally indexed copy: the caller usually passes a filtered
    # slice whose labels are not 0..n-1, so "the row before" must be resolved
    # by position. This also keeps the caller's frame untouched.
    readings = df.reset_index(drop=True)

    # treating the exercise period as truth, so only look before
    in_window = ((readings['time'] >= time - timedelta(minutes=window)) &
                 (readings['time'] < time))
    sub_df = readings.loc[in_window].copy()

    if libre and not sub_df.empty:
        sub_df = sub_df.dropna(subset=['glc', 'scan_glc'], how='all')
        sub_df['glc'] = sub_df['glc'].fillna(sub_df['scan_glc'])

    # Nothing usable in the window (including the case where the libre
    # clean-up removed every row).
    if sub_df.empty:
        return np.nan, np.nan

    # Reading closest in time to the reference time
    seconds_before = (time - sub_df['time']).dt.total_seconds().abs()
    ind = seconds_before.idxmin()
    glc = sub_df.loc[ind, 'glc']

    # Preceding row in the frame, used only if it is under an hour older.
    # NOTE(review): this is the raw 'glc' of the previous row, so for libre
    # data it is NaN when that row only carries a scan value - confirm.
    prec_reading_glc = np.nan
    if ind > 0:
        gap = readings.loc[ind, 'time'] - readings.loc[ind - 1, 'time']
        if gap < timedelta(minutes=60):
            prec_reading_glc = readings.loc[ind - 1, 'glc']

    return glc, prec_reading_glc

In [87]:
# Calculate start glucose for 101 diaries with the calc_glc defined in this notebook
# (only the start time is used here; no end glucose is computed).
# NOTE(review): that calc_glc returns a (glc, prec_reading_glc) tuple, so each
# 'start_glc' entry written here is a tuple rather than a single value - confirm
# this is intended.
extod_101_diaries['start_glc'] = extod_101_diaries.apply(lambda row: calc_glc(
    extod_101_cgm.loc[extod_101_cgm['ID']==row.ID], row.start_datetime, 30,
    True), axis=1)

In [79]:
extod_edu_cgm.iloc[106260]

time    2018-01-11 07:04:03
glc                    8.49
ID                     1006
Name: 106260, dtype: object

In [78]:
# Calculate start glucose for edu diaries (only the start time is used here;
# no end glucose is computed).
# NOTE(review): the saved output of this cell is a KeyError raised from
# preprocess_helper.calc_glc - the cell does not currently run to completion.
extod_edu_diaries['start_glc'] = extod_edu_diaries.apply(lambda row: preprocess_helper.calc_glc(
    extod_edu_cgm.loc[extod_edu_cgm['ID']==row.ID], row.start_datetime, 30,
    False), axis=1)


KeyError: 106260

In [50]:
# Calculate start glucose for 101 diaries (only the start time is used here)
# NOTE(review): this cell repeats the per-study start-glucose cells earlier in the
# notebook, and its saved output is a KeyError - confirm which version is
# meant to be kept.
extod_101_diaries['start_glc'] = extod_101_diaries.apply(lambda row: preprocess_helper.calc_glc(
    extod_101_cgm.loc[extod_101_cgm['ID']==row.ID], row.start_datetime, 30,
    True), axis=1)

# Calculate start glucose for edu diaries (only the start time is used here)
extod_edu_diaries['start_glc'] = extod_edu_diaries.apply(lambda row: preprocess_helper.calc_glc(
    extod_edu_cgm.loc[extod_edu_cgm['ID']==row.ID], row.start_datetime, 30,
    False), axis=1)


KeyError: 223141

In [None]:
# Drop scan_glc column and null values
extod_101_cgm = extod_101_cgm.drop(columns='scan_glc').dropna()

## 3.3. Divide CGM data into bouts

In [None]:
# Create unique id for each bout
extod_101_diaries = preprocess_helper.create_bout_id(extod_101_diaries)
extod_edu_diaries = preprocess_helper.create_bout_id(extod_edu_diaries)

In [None]:
extod_101_diaries.shape[0]

688

In [None]:
extod_edu_diaries.shape[0]

804

In [None]:
# Combine data from both studies
diaries = pd.concat([extod_101_diaries, extod_edu_diaries], axis=0)

#### During

In [None]:
# Set up during CGM dataframe for each study
# Each call returns three values; the third is discarded and the first is
# re-indexed from 0.
# NOTE(review): the last argument (15 for EXTOD 101, 5 for EXTOD education) equals
# the interval_size passed to cgm.all_metrics for the same study - presumably the
# CGM sampling interval in minutes; confirm against preprocess_helper.set_up_dataframes.
extod_101_during, extod_101_during_series, _ = preprocess_helper.set_up_dataframes(extod_101_cgm, extod_101_diaries, 'during',
                                     15)
extod_101_during.reset_index(drop=True, inplace=True)
extod_edu_during, extod_edu_during_series, _ = preprocess_helper.set_up_dataframes(extod_edu_cgm, extod_edu_diaries, 'during',
                                     5)
extod_edu_during.reset_index(drop=True, inplace=True)

### 3.5.1. Create the dataframes for predicting during exercise

#### 3.4.1.1. Extract glycemic metrics

before_cgm = pd.concat([extod_101_before, extod_edu_before]).reset_index(drop=True)

#### 3.4.2.1. Extract glycemic metrics (during)

In [None]:
# Glycemic metrics per bout for the EXTOD 101 data (interval_size=15)
glyc_metrics_101_during = cgm.all_metrics(
    extod_101_during, ID='bout_id', interval_size=15, exercise_thresholds=True)
# Glycemic metrics per bout for the EXTOD education data (interval_size=5)
glyc_metrics_edu_during = cgm.all_metrics(
    extod_edu_during, ID='bout_id', interval_size=5, exercise_thresholds=True)
# Stack both studies, rename ID to bout_id, and prefix every other column with 'during_'
X_during_glyc_metrics = (
    pd.concat([glyc_metrics_101_during, glyc_metrics_edu_during])
    .rename(columns={'ID': 'bout_id'})
    .rename(columns=lambda col: col if col == 'bout_id' else 'during_' + col)
)

In [None]:
X_during_glyc_metrics.head()

Unnamed: 0,bout_id,during_TIR_lv2_hypo,during_TIR_lv1_hypo,during_TIR_hypo,during_TIR_norm,during_TIR_hyper,during_TIR_lv1_hyper,during_TIR_lv2_hyper,during_TIR_hypo_exercise,during_TIR_normal_exercise,during_TIR_hyper_exercise,during_number_hypos,during_avg_length_of_hypo,during_total_time_in_hypos,during_number_lv1_hypos,during_number_lv2_hypos,during_number_hypos_below_5,during_avg_length_hypo_below_5,during_total_time_in_hypos_below_5,during_sd,during_cv,during_minimum_glucose,during_maximum_glucose,during_average_glucose,during_mage_mean,during_ea1c,during_percent_missing
0,3032_562,0.0,0.0,0.0,50.0,50.0,50.0,0.0,0.0,100.0,0.0,0,0.0,0.0,0,0,0,0.0,0.0,2.333452,26.668027,7.1,10.4,8.75,3.3,7.132075,0.0
1,3030_505,0.0,0.0,0.0,0.0,100.0,25.0,75.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,0.0,0.0,0.509902,3.565748,13.6,14.8,14.3,1.2,10.622642,0.0
2,3004_82,0.0,0.0,0.0,0.0,100.0,0.0,100.0,0.0,0.0,100.0,0,0.0,0.0,0,0,0,0.0,0.0,,,16.4,16.4,16.4,0.0,11.943396,0.0
3,3002_26,0.0,0.0,0.0,0.0,100.0,0.0,100.0,0.0,0.0,100.0,0,0.0,0.0,0,0,0,0.0,0.0,0.763763,3.301568,22.3,23.8,23.133333,1.5,16.178197,0.0
4,3026_420,0.0,0.0,0.0,0.0,100.0,0.0,100.0,0.0,0.0,100.0,0,0.0,0.0,0,0,0,0.0,0.0,1.343503,6.292754,20.4,22.3,21.35,1.9,15.056604,0.0


In [None]:
X_during_glyc_metrics[X_during_glyc_metrics.during_TIR_hyper>0].count()

bout_id                               452
during_TIR_lv2_hypo                   452
during_TIR_lv1_hypo                   452
during_TIR_hypo                       452
during_TIR_norm                       452
during_TIR_hyper                      452
during_TIR_lv1_hyper                  452
during_TIR_lv2_hyper                  452
during_TIR_hypo_exercise              452
during_TIR_normal_exercise            452
during_TIR_hyper_exercise             452
during_number_hypos                   452
during_avg_length_of_hypo             452
during_total_time_in_hypos            452
during_number_lv1_hypos               452
during_number_lv2_hypos               452
during_number_hypos_below_5           452
during_avg_length_hypo_below_5        452
during_total_time_in_hypos_below_5    452
during_sd                             425
during_cv                             425
during_minimum_glucose                452
during_maximum_glucose                452
during_average_glucose            

#### 3.4.2.2. Extract target features (y) for during exercise

In [None]:
def _binary_targets(glyc_metrics):
    """Return bout_id plus boolean y_hypo / y_hyper flags from a glycemic-metrics frame.

    A flag is True when the corresponding exercise time-in-range column
    (TIR_hypo_exercise / TIR_hyper_exercise) is greater than zero.
    """
    return (
        glyc_metrics.loc[:, ['ID', 'TIR_hypo_exercise', 'TIR_hyper_exercise']]
        .rename(columns={'ID': 'bout_id',
                         'TIR_hypo_exercise': 'y_hypo',
                         'TIR_hyper_exercise': 'y_hyper'})
        .assign(y_hypo=lambda d: d['y_hypo'] > 0,
                y_hyper=lambda d: d['y_hyper'] > 0)
    )


# Select the target features (y) for each study
y_101_during = _binary_targets(glyc_metrics_101_during)
y_edu_during = _binary_targets(glyc_metrics_edu_during)

# Stack the two studies; the columns are already bout_id, y_hypo, y_hyper
y_during = pd.concat([y_101_during, y_edu_during])

In [None]:
during_merged = X_during_glyc_metrics.merge(y_during, on='bout_id')

## 3.5. Combine dataframes for the machine learning tasks

In [None]:
# Set directory for saving files
df_directory = '../../Data/tidy_data/'

### During

In [None]:
# Frames to be merged: exercise diaries and the during-exercise targets
frames = [diaries, y_during]
# Merge 'em on bout_id (default inner join, so only bouts present in every frame are kept)
df_glyc_during = reduce(lambda left, right: pd.merge(left, right,
                                                      on=['bout_id']), frames)
# Recreate ID from bout_id
# NOTE(review): this takes the first 4 characters of bout_id, so it assumes every
# participant ID is exactly 4 characters long - confirm for all studies.
df_glyc_during['ID'] = df_glyc_during['bout_id'].apply(lambda x: int(x[:4]))
# Merge with demographics and lab data on ID
df_glyc_during = df_glyc_during.merge(demo_lab, on='ID')

In [None]:
# Columns removed before saving: raw timestamps, free-text diary fields,
# several lab values, and the bout identifier.
all_drop = ['start_datetime', 'finish_datetime', 'type_of_exercise',
            'starting_glucose', 'finishing_glucose', 'insulin_administration',
            'urine_cpep', 'urine_creat', 'chol', 'h_index', 'hdl', 'ldl',
            'nhdl', 'trig', 'bout_id']
# Dropped in place, so re-running this cell raises because the columns are already gone.
df_glyc_during.drop(columns=all_drop,inplace=True)

# Changing the time in ranges to binary
# NOTE(review): during_frames and after_frames are not defined anywhere in the
# visible notebook, so this loop raises NameError on a fresh kernel. The
# df_glyc_during preview that follows has no 'stratify' column, which suggests
# the loop never ran on it - confirm whether it should be removed or pointed
# at df_glyc_during.
for i in during_frames + after_frames:
    i['y_hypo'] = i['y_hypo'] > 0
    i['y_hyper'] = i['y_hyper'] > 0
    # Add a column for stratification
    i['stratify'] = i['ID'].astype(str) + '_' + i['y_hypo'].astype(str)

In [None]:
# Glycemic metrics only
df_glyc_during.head()

Unnamed: 0,ID,intensity,month,day,day_of_week,time_of_day,duration,form_of_exercise,start_glc,y_hypo,y_hyper,age,sex,years_since_diagnosis,bmi,hba1c,cpep
0,3001,15.0,5,19,5,morning,71.0,aer,15.3,False,True,41.615332,female,10.064339,23.112472,73.0,3.0
1,3001,15.0,5,29,1,morning,26.0,aer,12.7,False,False,41.615332,female,10.064339,23.112472,73.0,3.0
2,3001,,6,3,6,evening,76.0,aer,10.4,False,False,41.615332,female,10.064339,23.112472,73.0,3.0
3,3001,,6,11,0,afternoon,13.0,aer,22.0,False,True,41.615332,female,10.064339,23.112472,73.0,3.0
4,3001,15.0,6,13,2,afternoon,55.0,aer,21.8,False,True,41.615332,female,10.064339,23.112472,73.0,3.0


## 3.6. Save to csv

In [None]:
# During
df_glyc_during.to_csv(df_directory+'during_simple_only.csv', index=False)