# Create synthetic 15 min dataset and interpolate
The purpose of this notebook is to preprocess CGM data from 95 individual files into one complete file containing: 
- The original 5-minute readings for each individual
- Artificially cut 15-minute data from the 5-minute data
- Interpolated 5-minute data created from interpolating the 15-minute data with various methods

In [54]:
# Import packages and upload dataset
import pandas as pd
import numpy as np
#from datetime import datetime
import datetime
import os
from datetime import timedelta as time
from sklearn.gaussian_process import GaussianProcessRegressor
#from sklearn.gaussian_process.kernels import RBF, Matern, ConstantKernel as C
import sklearn.gaussian_process.kernels as k
import warnings
warnings.filterwarnings('ignore')

In [47]:
# Upload directory for individual data
directory = 'C:\\Users\\cr591\\OneDrive - University of Exeter\\Desktop\\PhD\\Projects\\interpolation-for-hypo-detection\\dexcom-maths-exploration\\data\\raw_data\\data-preprocessed'

In [None]:
cwd = os.getcwd()

## Clean and combine CGM files

In [49]:
def round_time(dt=None, roundTo=60):
    """Round a datetime object to any time lapse in seconds
    dt : datetime.datetime object, default now.
    roundTo : Closest number of seconds to round to, default 1 minute.
    Author: Thierry Husson 2012 - Use it as you want but don't blame me.
    """
    dt = dt.to_pydatetime()
    seconds = (dt.replace(tzinfo=None) - dt.min).seconds
    rounding = (seconds+roundTo/2) // roundTo * roundTo
    return dt + datetime.timedelta(0,rounding-seconds,-dt.microsecond)

In [50]:
def combine_frame(filename):
    '''
    Combines data from all participants into 1 file
    '''
    # set filepath for each file in directory
    filepath = directory + '/' + filename
    # Upload cgm dataset for the file
    df = pd.read_csv(filepath)
    df = df[['timestamp', 'sensorglucose']].dropna(how='all')
    df.columns = ['time', 'glc']
    # Set ID from filename
    ID = filename.replace('.csv','')
    df['ID'] = ID
    df['time'] = pd.to_datetime(df['time'])
    # Round seconds to zero so minute align
    df.time = df.time.apply(lambda x: round_time(x))
    # Replace low and high with values
    df.glc = pd.to_numeric(df.glc.replace({'Low':2.22, 'High':22.22}))\
    .apply(lambda x: 22.22 if x>22.22 else (2.22 if x<2.22 else x))
    return df

In [55]:
# Map the function to all files in the directory
results = list(map(combine_frame, os.listdir(directory)))
# Concatenate all files to make one dataframe
df_total = pd.concat(results).reset_index(drop=True)

## Create synthetic 15-minute data from 5-minute data

In [56]:
def resample_data(dataframe):
    '''
    Resamples the 5-minute glucose data to 15-minute intervals
    '''
    # List for the indices that will be kept for 15 min dataset
    index_list = []
    # Create column with the difference between each row
    dataframe['diff'] = dataframe.time.diff()
    # Set a counter to zero
    time_counter = time(minutes=0)
    # Loop through all the indices to find ones 15 mins or more apart
    for i, row in dataframe.iterrows():
        # If it's the first row, add the index to the list
        if i == dataframe.index[0]:
            index_list.append(i)
            continue
        # Add the difference between the next row to the time counter
        time_counter += dataframe.loc[i]['diff']
        # When the counter is up to >14 mins, add the index to the list
        if time_counter > time(minutes=14):
            index_list.append(i)
            # Reset the counter 
            time_counter = time(minutes=0)
    # Create a new column with nan values
    dataframe['cut_glc'] = np.nan
    # Set the values in cut_glc with the glc values in the index_list
    dataframe['cut_glc'].loc[index_list] = dataframe['glc'].loc[index_list]
    # Remove the diff column
    dataframe.drop(columns='diff', inplace=True)
    return dataframe

In [57]:
# Apply the function to each ID
df_total = df_total.groupby('ID').apply(resample_data)

In [58]:
df_total.head()

Unnamed: 0,time,glc,ID,cut_glc
0,2018-01-08 19:21:00,3.0,1001_baseline,3.0
1,2018-01-08 19:26:00,2.44,1001_baseline,
2,2018-01-08 19:31:00,2.22,1001_baseline,
3,2018-01-08 19:36:00,2.72,1001_baseline,2.72
4,2018-01-08 19:41:00,2.94,1001_baseline,


## Interpolate the 15-minute data using various methods

In [59]:
def resampleDf(df_id):
    '''
    Resample dataset to 1 min intervals
    '''
    # Drop duplicates
    df_id = df_id[~df_id['time'].duplicated(keep='first')]
    # Set time as datetime index
    df_id.set_index('time', inplace=True)
    # Resample to 1 minute intervals 
    df_id = df_id.resample(rule='min', origin='start').asfreq()
    return df_id

In [60]:
# Apply resample functin to each ID
results = df_total.groupby('ID').apply(resampleDf)
# Reset index
results = results.drop('ID', axis=1).reset_index()

In [64]:
results.head(16)

Unnamed: 0,ID,time,glc,cut_glc
0,1001_baseline,2018-01-08 19:21:00,3.0,3.0
1,1001_baseline,2018-01-08 19:22:00,,
2,1001_baseline,2018-01-08 19:23:00,,
3,1001_baseline,2018-01-08 19:24:00,,
4,1001_baseline,2018-01-08 19:25:00,,
5,1001_baseline,2018-01-08 19:26:00,2.44,
6,1001_baseline,2018-01-08 19:27:00,,
7,1001_baseline,2018-01-08 19:28:00,,
8,1001_baseline,2018-01-08 19:29:00,,
9,1001_baseline,2018-01-08 19:30:00,,


### Interpolate using Pandas SciPy wrappers

In [65]:
def interpolate(resampled_dataframe, method, limit, order=5):
    '''
    Interpolate a resampled series with a method of choice from the Pandas
    interpolation wrapper
    '''
    # If the method is polynomial or spline, an order needs to be given
    if (method == 'polynomial') | (method == 'spline'):
        interp_series = resampled_dataframe.interpolate(method=method, 
                                                     limit_area='inside',
                                                     limit_direction='forward',
                                                     limit=limit, order=order)
    # Else no order is needed
    else:
        interp_series = resampled_dataframe.interpolate(method=method,
                                                     limit_area='inside',
                                                     limit_direction='forward',
                                                     limit=limit)
    return interp_series

### Interpolation with Gaussian process regression

In [66]:
def gp_interp(resampled_dataframe):
    '''
    Interpolate using SciKit Learn's implementation of Gaussian Process (GP) 
    using RBF, rational quadratic and Matern kernels
    '''
    # Create a timestamp for the GP
    resampled_dataframe['timestamp'] = resampled_dataframe.time.apply(lambda x: x.timestamp())
    # Y_train is the 15-minute glucose data
    Y_train = resampled_dataframe.dropna().cut_glc
    # X_train is the timestamp in 2d form for the 15-min data
    X_train = np.atleast_2d(resampled_dataframe.dropna(subset=['cut_glc']).timestamp).T
    # X_test is the 1-min data we are predicting
    X_test = resampled_dataframe.timestamp
    # Y_test is the 1-min glucose data
    Y_test = resampled_dataframe.glc
    # Declare the kernels and set the length scale bounds
    rbf = k.RBF(length_scale=1, length_scale_bounds= (400, 900))
    rq = k.RationalQuadratic(length_scale=1, length_scale_bounds= (400, 900),
                                alpha=0)
    matern = k.Matern(length_scale=1, length_scale_bounds= (400, 900), nu=3.5)
    kernels = {'matern': matern, 'rq': rq, 'rbf': rbf}
    # For each of the kernels, fit a GP to predict the missing values
    for kernel in kernels:
        # Declare GP
        gp = GaussianProcessRegressor(kernel=kernels[kernel], alpha=0,
                                      n_restarts_optimizer=4)
        # Fit to 15-min data
        gp.fit(X_train, Y_train)
        # Predict 1-min data
        y_mean= gp.predict(X_test[:, None]) #, return_std=True)
        # Create new column in the dataframe to fit predicted values
        resampled_dataframe[kernel] = y_mean
    return resampled_dataframe

### Combine all methods to create interpolated dataset

In [67]:
def combine_interpolation_frames(dataframe):
    '''
    Combine all interpolation methods into one dataframe
    '''
    # All the interpolation methods to be used
    interp_methods = ['pchip', 'linear', 'cubicspline', 'akima', 'polynomial']
    # 15-minute data will be used for interpolation
    col = dataframe.cut_glc
    # Limit is 15 minutes for interpolation
    limit = 15
    # Run each method through the interpolate function 
    for method in interp_methods:
        if method != 'polynomial':
            name = method
            dataframe[name] = interpolate(col, method, limit)
        else:
            # Use 3 different orders for the polynomial interpolation
            for i in [3, 5, 7]:
                name = method + '_' + str(i)
                dataframe[name] = interpolate(col, method, limit, order=i)
    # Interpolate using gaussian process interpolation
    dataframe = gp_interp(dataframe)
    return dataframe

In [None]:
total_results = results.groupby('ID').apply(combine_interpolation_frames)

In [None]:
total_results

# Declare results dataframe
total_results = pd.DataFrame()
# Call interpolation methods and add them to total_results dataframe
for ID in set(results['ID'].values):
    df_id = results[results['ID']==ID]
    df_total = combine_interpolation_frames(df_ida)
    total_results = total_results.append(df_total)

In [187]:
# Make sure there's no nan values in the interpolation
total_results.dropna(subset=['pchip'], inplace=True)
# Reset the index and round to 2 decimal places to match the CGM readings
total_results.reset_index(drop=True, inplace=True)
total_results = total_results.round(2)

In [190]:
# Drop nan values in the 5-min data to get 5 min rather than 1 min data
total_results.dropna(subset=['glc']).head()

Unnamed: 0,ID,time,glc,cut_glc,pchip,linear,cubicspline,akima,polynomial_3,polynomial_5,polynomial_7,polynomial_9,timestamp,matern,rbf900,rq,rbf
0,2017_6months,2018-11-07 12:52:00,16.72,16.72,16.72,16.72,16.72,16.72,16.72,16.72,16.72,16.72,1541595000.0,16.72,16.72,16.72,16.72
5,2017_6months,2018-11-07 12:57:00,17.0,,17.29,17.15,17.28,17.29,17.28,16.93,16.24,16.63,1541595000.0,17.87,17.73,17.15,17.73
10,2017_6months,2018-11-07 13:02:00,17.39,,17.75,17.57,17.7,17.73,17.7,17.48,17.11,17.3,1541596000.0,18.09,18.01,17.61,18.01
15,2017_6months,2018-11-07 13:07:00,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,1541596000.0,18.0,18.0,18.0,18.0
20,2017_6months,2018-11-07 13:12:00,17.78,,18.11,18.07,18.18,18.12,18.18,18.3,18.45,18.39,1541596000.0,17.96,18.01,18.12,18.01


In [None]:
total_results.to_csv('interp_dataset.csv')