# 1. Preprocessing
The purpose of this notebook is to preprocess CGM data from 95 individual files into one complete file containing: 
- The original 5-minute readings for each individual
- Artificially cut 15-minute data from the 5-minute data
- Interpolated 5-minute data created from interpolating the 15-minute data with various methods

## 1.1. Import packages and upload data

In [1]:
# Import packages and upload dataset
import pandas as pd
import numpy as np
import datetime
import os
from datetime import timedelta as time
import warnings
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib as mpl
mpl.style.use('default')
warnings.filterwarnings('ignore')
%matplotlib inline

In [5]:
dexi = pd.read_csv('../data/cgm/dexi_cgm.csv')
#dexip = pd.read_csv('data/dexip_cgm.csv')

# combine the two datasets
cgm =dexi # pd.concat([dexi, dexip], axis=0)

cgm['time'] = pd.to_datetime(cgm['time'])



## 1.2.1 Drop traces with more than 30% missing data

## Resample - create 3 versions of 15 min interval data

In [13]:
cgm_cut1 = cgm.copy()

cgm_cut1['cut_glc'] = cgm_cut1['glc']

# Set 'glc' column to np.nan for every 2nd row starting from the 1st index
cgm_cut1.iloc[1::3, cgm_cut1.columns.get_loc('cut_glc')] = np.nan

# Set 'glc' column to np.nan for every 3rd row starting from the 2nd index
cgm_cut1.iloc[2::3, cgm_cut1.columns.get_loc('cut_glc')] = np.nan

### 1.4.1. Interpolate using Pandas SciPy wrappers

In [14]:
def interpolate(resampled_dataframe, method, limit, order=5):
    '''
    Interpolate a resampled series with a method of choice from the Pandas
    interpolation wrapper
    '''
    # If the method is polynomial or spline, an order needs to be given
    if (method == 'polynomial') | (method == 'spline'):
        interp_series = resampled_dataframe.interpolate(method=method, 
                                                     limit_area='inside',
                                                     limit_direction='forward',
                                                     limit=limit, order=order)
    # Else no order is needed
    else:
        interp_series = resampled_dataframe.interpolate(method=method,
                                                     limit_area='inside',
                                                     limit_direction='forward',
                                                     limit=limit)
    return interp_series

### 1.4.3. Combine all methods to create interpolated dataset

In [15]:
def combine_interpolation_frames(dataframe):
    '''
    Combine all interpolation methods into one dataframe
    '''

    dataframe = dataframe.drop_duplicates(subset='time').sort_values('time').set_index('time')

    # All the interpolation methods to be used
    #interp_methods = ['pchip', 'linear','cubicspline', 'akima', 'polynomial', 'quadratic', 'krogh', 'piecewise_polynomial', 'barycentric']
    interp_methods = ['pchip', 'linear', 'cubicspline', 'polynomial']

    # 15-minute data will be used for interpolation
    col = dataframe.glc
    # Limit is 15 minutes for interpolation
    limit = 15
    # Run each method through the interpolate function 
    for method in interp_methods:
        print(method)
        if method != 'spline':
            name = method
            dataframe[name] = interpolate(col, method, limit)
        else:
            # Use 3 different orders for the polynomial interpolation
            for i in [3, 5, 7]:
                name = method + '_' + str(i)
                dataframe[name] = interpolate(col, method, limit, order=i)
    # Interpolate using gaussian process interpolation
    #dataframe = gp_interp(dataframe)
    return dataframe.round(2)

In [27]:
interpolated_results = cgm_cut1.groupby('ID').apply(lambda group: combine_interpolation_frames(group))

pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
polynomial
pchip
linear
cubicspline
pol

In [29]:
interpolated_results

Unnamed: 0_level_0,Unnamed: 1_level_0,ID,glc,cut_glc,pchip,linear,cubicspline,polynomial
ID,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
helm_1,2020-05-11 00:01:17,helm_1,6.39,6.39,6.39,6.39,6.39,6.39
helm_1,2020-05-11 00:06:17,helm_1,6.06,,6.06,6.06,6.06,6.06
helm_1,2020-05-11 00:11:17,helm_1,5.83,,5.83,5.83,5.83,5.83
helm_1,2020-05-11 00:16:18,helm_1,5.89,5.89,5.89,5.89,5.89,5.89
helm_1,2020-05-11 00:21:18,helm_1,6.11,,6.11,6.11,6.11,6.11
...,...,...,...,...,...,...,...,...
helm_988,2021-04-16 23:36:40,helm_988,4.94,,4.94,4.94,4.94,4.94
helm_988,2021-04-16 23:41:40,helm_988,5.22,,5.22,5.22,5.22,5.22
helm_988,2021-04-16 23:46:40,helm_988,5.28,5.28,5.28,5.28,5.28,5.28
helm_988,2021-04-16 23:51:40,helm_988,5.17,,5.17,5.17,5.17,5.17


In [30]:
interpolated_results.reset_index().to_csv('../data/interpolated_cgm.csv', index=False)

ValueError: cannot insert ID, already exists