# 2. Preprocessing

In [1]:
import pandas as pd
import numpy as np
from src.diametrics import transform, preprocessing
import copy

In [None]:
# Load every Dexcom export found in the directory into one combined frame.
# The preview below shows an `ID` column (e.g. dexcom_eur_01) that appears to
# carry the name of the file each row came from.
dexcom_data = transform.transform_directory(directory='tests/test_data/dexcom/', device='dexcom')

# Single files are opened first and then converted with the device-specific
# converter; these two frames are reused throughout the rest of the notebook.
libre1 = transform.open_file('tests/test_data/libre/libre_amer_01.csv')
libre1_transformed = transform.convert_libre(libre1)
dxcm2 = transform.open_file('tests/test_data/dexcom/dexcom_eur_02.xlsx')
dxcm2_transformed = transform.convert_dexcom(dxcm2)

# Preview the combined frame so the cell's displayed output is reproduced by
# the code on a fresh "Restart & Run All".
dexcom_data.head()

Unnamed: 0,time,glc,ID
0,2023-03-08 00:04:00,6.4,dexcom_eur_01
1,2023-03-08 00:09:00,6.5,dexcom_eur_01
2,2023-03-08 00:13:59,6.1,dexcom_eur_01
3,2023-03-08 00:18:59,6.5,dexcom_eur_01
4,2023-03-08 00:23:59,6.3,dexcom_eur_01


## 2.1. Check if df is usable

In [5]:
# Check that the combined Dexcom frame is usable before any preprocessing;
# the recorded output for this cell is True.
preprocessing.check_df(dexcom_data)


True

In [6]:
# Same usability check for the single converted Dexcom file;
# the recorded output for this cell is True.
preprocessing.check_df(dxcm2_transformed)


True

## 2.2. Replacing the lo/hi cutoff values

In [7]:
# Replace the lo/hi cutoff values using the function's default arguments.
# The result is bound back to the same name, so every later cell that uses
# `dexcom_data` depends on this cell having been run.
dexcom_data = preprocessing.replace_cutoffs(dexcom_data)

In [8]:
# Replace the lo/hi cutoff values in the Libre frame with explicit cutoffs.
# NOTE(review): 2.1 / 27.8 look like mmol/L values, while this frame's glc
# readings are displayed in the hundreds elsewhere in this notebook
# (mg/dL-scale) — confirm the cutoffs are expressed in the data's units.
libre1_transformed = preprocessing.replace_cutoffs(libre1_transformed, lo_cutoff=2.1, hi_cutoff=27.8)

In [9]:
# Third variant: pass remove=True instead of cutoff values.
# NOTE(review): presumably this drops the lo/hi readings rather than
# substituting numbers for them — confirm against replace_cutoffs.
dxcm2_transformed = preprocessing.replace_cutoffs(dxcm2_transformed, remove=True)

## 2.3. Change start and end times

### 2.3.1. A single time frame applied to all data

In [10]:
# Restrict the Libre frame to one [start, end] window given as a two-item list.
# The return value is only displayed, not assigned. In the recorded output the
# rows run from 2021-03-21 00:08 to 2021-03-31 23:38, so no reading dated
# 2021-04-01 is included.
preprocessing.set_time_frame(libre1_transformed, ['2021-03-21', '2021-04-01'])

Unnamed: 0,time,glc,scan_glc
26,2021-03-21 00:08:00,188,
27,2021-03-21 00:23:00,178,
28,2021-03-21 00:38:00,163,
29,2021-03-21 00:53:00,160,
30,2021-03-21 01:08:00,158,
...,...,...,...
1077,2021-03-31 22:53:00,162,
1078,2021-03-31 23:08:00,156,
1079,2021-03-31 23:23:00,155,
1080,2021-03-31 23:38:00,145,


### 2.3.2. Using a dictionary

In [11]:
# Per-ID time windows: each key is a value of the `ID` column and maps to
# [start, end]. Both date-only and full date-time strings are used.
d = {
    'dexcom_eur_01': ['2023-03-13 03:00:00', '2023-03-20 14:30:00'],
    'dexcom_eur_02': ['2023-03-09', '2023-03-18'],
    'dexcom_eur_03': ['2023-03-14', '2023-03-21'],
}

In [12]:
# Apply a separate [start, end] window to each ID by passing the dictionary `d`
# instead of a single list. The return value is only displayed, not assigned.
# In the recorded output the first dexcom_eur_01 row is 2023-03-13 03:04:11,
# just after that ID's 03:00:00 start.
preprocessing.set_time_frame(dexcom_data, d)

Unnamed: 0,time,glc,ID
0,2023-03-13 03:04:11,14.7,dexcom_eur_01
1,2023-03-13 03:09:11,14.7,dexcom_eur_01
2,2023-03-13 03:14:11,14.2,dexcom_eur_01
3,2023-03-13 03:19:11,13.8,dexcom_eur_01
4,2023-03-13 03:24:11,13.8,dexcom_eur_01
...,...,...,...
6562,2023-03-20 23:35:18,7.2,dexcom_eur_03
6563,2023-03-20 23:40:18,7.3,dexcom_eur_03
6564,2023-03-20 23:45:18,7.3,dexcom_eur_03
6565,2023-03-20 23:50:18,7.3,dexcom_eur_03


## 2.4. Interpolate missing data

### Create synthetic missing data

In [13]:
# Take an independent copy of the first 30 rows so that blanking values here
# leaves dxcm2_transformed untouched
dxcm2_chunk = dxcm2_transformed.head(30).copy()
# Simulate missing data by blanking 'glc' at three runs of index labels:
# 4-5 (two rows), 14-21 (eight rows) and 26-27 (two rows)
dxcm2_chunk.loc[[4, 5, *range(14, 22), 26, 27], 'glc'] = np.nan
# Preview the first 10 rows, which contain the first gap (labels 4 and 5)
dxcm2_chunk.head(10)

Unnamed: 0,time,glc
0,2023-03-08 00:00:44,10.4
1,2023-03-08 00:05:44,10.3
2,2023-03-08 00:10:44,10.2
3,2023-03-08 00:15:44,10.1
4,2023-03-08 00:20:44,
5,2023-03-08 00:25:44,
6,2023-03-08 00:30:44,9.9
7,2023-03-08 00:35:44,9.9
8,2023-03-08 00:40:44,9.5
9,2023-03-08 00:45:44,9.5


In [14]:
# Fill missing data using the preprocessing function.
# interval=5 matches the 5-minute spacing of the readings in dxcm2_chunk, and
# method='pchip' selects the interpolation method by name.
# NOTE(review): limit=30 is presumably the longest gap (in minutes) that will
# be filled — confirm against fill_missing_data.
# In the recorded output the two blanked rows (4 and 5) come back as 10.0 and 9.9.
preprocessing.fill_missing_data(dxcm2_chunk, interval=5, method='pchip', limit=30).head(10)

Unnamed: 0,glc
0,10.4
1,10.3
2,10.2
3,10.1
4,10.0
5,9.9
6,9.9
7,9.9
8,9.5
9,9.5


In [15]:
# Take an independent copy of the first 30 Libre rows so that blanking values
# here leaves libre1_transformed untouched
libre1_chunk = libre1_transformed.head(30).copy()
# Blank 'glc' at the same index labels used for the Dexcom chunk:
# 4-5 (two rows), 14-21 (eight rows) and 26-27 (two rows)
libre1_chunk.loc[[4, 5, *range(14, 22), 26, 27], 'glc'] = np.nan
# Preview the first 10 rows, which contain the first gap (labels 4 and 5)
libre1_chunk.head(10)

Unnamed: 0,time,glc,scan_glc
0,2021-03-20 17:38:00,127.0,
1,2021-03-20 17:53:00,124.0,
2,2021-03-20 18:08:00,121.0,
3,2021-03-20 18:23:00,131.0,
4,2021-03-20 18:38:00,,
5,2021-03-20 18:53:00,,
6,2021-03-20 19:08:00,166.0,
7,2021-03-20 19:23:00,165.0,
8,2021-03-20 19:38:00,162.0,
9,2021-03-20 19:53:00,154.0,


In [16]:
# Fill the synthetic gaps in the Libre chunk with linear interpolation.
# interval=15 matches the 15-minute spacing of the readings in libre1_chunk.
# NOTE(review): limit=45 is presumably the longest gap (in minutes) that will
# be filled — confirm against fill_missing_data.
# In the recorded output rows 4 and 5 become 142.7 and 154.3, evenly spaced
# between the neighbouring readings 131.0 and 166.0.
preprocessing.fill_missing_data(libre1_chunk, interval=15, method='linear', limit=45).head(10)

Unnamed: 0,glc,scan_glc
0,127.0,
1,124.0,
2,121.0,
3,131.0,
4,142.7,
5,154.3,
6,166.0,
7,165.0,
8,162.0,
9,154.0,


## 2.5. Change units

In [17]:
# Convert the Libre frame's glucose units. The return value is only displayed,
# not assigned. In the recorded output the first reading, 127.0 in the earlier
# previews, is shown as 7.1.
preprocessing.change_units(libre1_transformed)

Unnamed: 0,time,glc,scan_glc
0,2021-03-20 17:38:00,7.1,
1,2021-03-20 17:53:00,6.9,
2,2021-03-20 18:08:00,6.7,
3,2021-03-20 18:23:00,7.3,
4,2021-03-20 18:38:00,8.5,
...,...,...,...
1334,2021-04-03 15:08:00,7.0,
1335,2021-04-03 15:23:00,6.6,
1336,2021-04-03 15:38:00,6.1,
1337,2021-04-03 15:53:00,6.0,


In [18]:
# Convert the Dexcom frame's glucose units, in the opposite direction to the
# Libre example: the first reading, 10.4 in the earlier previews, is shown as
# 187.0. The return value is only displayed, not assigned.
preprocessing.change_units(dxcm2_transformed)

Unnamed: 0,time,glc
0,2023-03-08 00:00:44,187.0
1,2023-03-08 00:05:44,185.0
2,2023-03-08 00:10:44,183.0
3,2023-03-08 00:15:44,181.0
4,2023-03-08 00:20:44,178.0
...,...,...
3890,2023-03-21 15:10:57,117.0
3891,2023-03-21 15:15:57,122.0
3892,2023-03-21 15:20:57,124.0
3893,2023-03-21 15:25:57,129.0
