# 4. Preprocessing JAEB T1-DEXIP dataset

The goal of this notebook is to prepare the JAEB T1-DEXIP data for machine learning - predicting euglycemia during and around exercise.

EXTOD education was a pilot study in which 106 participants were randomly allocated to either standard care or an education programme teaching them how better to manage blood glucose around exercise.

The data used will be demographic, lab, clinical and physiological if available.

### Objectives:
1. Clean and combine the exercise diaries
2. Preprocess demographic data
3. Preprocess lab data
4. Prepare CGM data

## 4.0. Import packages

In [1]:
# Import packages and upload dataset
import pandas as pd
import numpy as np
from datetime import timedelta as time
import warnings
import preprocess_helper
import functools as ft
import sys
path = "../../diametrics"
sys.path.append(path)
import visualizations

warnings.filterwarnings('ignore')

In [2]:
# Set the directory
directory = '../../data/raw_data/helmsley_dexip/'

## 4.1. Clean and combine exercise diaries

### 4.1.1. Load files and rename columns

In [3]:
# Read file
exercise = pd.read_csv(directory+'pr.csv')

In [4]:
exercise


Unnamed: 0,STUDYID,DOMAIN,USUBJID,SPDEVID,PRSEQ,PRGRPID,PRLNKID,PRTRT,PRCAT,PRSCAT,PRSTDTC,PREVLINT,PRTRTC,PLNEXDUR,EXCINTSY,EXCOMP,ACTWREX,EXMANUAL,PECLFLG,FITCLFLG
0,T1DEXI-P,PR,100.0,GARMIN STUDY WATCH,1.000000e+00,2,1345,RUNNING,,,20210810T153328,20210810T153328/P0Y0M0DT0H7M9S,,0,,,,0,0,0
1,T1DEXI-P,PR,100.0,GARMIN STUDY WATCH,2.000000e+00,2,1355,RUNNING,,,20210811T125920,20210811T125920/P0Y0M0DT0H6M35S,,0,,,,0,0,0
2,T1DEXI-P,PR,100.0,GARMIN STUDY WATCH,3.000000e+00,2,1354,RUNNING,,,20210811T163558,20210811T163558/P0Y0M0DT0H7M7S,,0,,,,0,0,0
3,T1DEXI-P,PR,100.0,GARMIN STUDY WATCH,4.000000e+00,2,1362,RUNNING,,,20210812T124920,20210812T124920/P0Y0M0DT0H7M2S,,0,,,,0,0,0
4,T1DEXI-P,PR,100.0,GARMIN STUDY WATCH,5.000000e+00,2,1361,RUNNING,,,20210812T165919,20210812T165919/P0Y0M0DT0H7M7S,,0,,,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6795,,,,,3.687825e-40,1,0,Walking,,,20211014T103500,,,90,Mild: Working a bit,0.0,1.0,0,0,0
6796,,,,,3.687825e-40,1,0,Walking,,,20211014T103500,,,90,Mild: Working a bit,0.0,1.0,0,0,0
6797,,,,,3.687825e-40,1,0,Walking,,,20211014T103500,,,90,Mild: Working a bit,0.0,1.0,0,0,0
6798,,,,,3.687825e-40,1,0,Walking,,,20211014T103500,,,90,Mild: Working a bit,0.0,1.0,0,0,0


In [5]:
# Convert the start time to datetime
exercise['PRSTDTC'] = pd.to_datetime(exercise['PRSTDTC'])

In [6]:
# Function to extract duration in minutes from PREVLINT
def extract_duration(row):
    """Return the duration, in minutes, encoded in a row's ``PREVLINT`` value.

    ``PREVLINT`` holds an interval written as ``<start>/<duration>``, where the
    duration is an ISO 8601 duration such as ``P0Y0M0DT0H7M9S``. Every
    designator is optional, so ``PT45M`` or ``PT1H`` are parsed as well, and
    fractional seconds are accepted.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a ``'PREVLINT'`` entry.

    Returns
    -------
    int or float
        ``0`` when ``PREVLINT`` is missing, otherwise the duration in minutes
        (days, hours, minutes and seconds combined).

    Raises
    ------
    ValueError
        If the duration part is not a recognisable ISO 8601 duration.
    """
    import re  # local import so the cell does not depend on an edit to the import cell

    interval = row['PREVLINT']
    if pd.isna(interval):
        return 0

    # The duration is whatever follows the '/' separator of the interval.
    duration = interval.split('/')[-1].strip()

    match = re.fullmatch(
        r'P(?:(?P<years>\d+)Y)?(?:(?P<months>\d+)M)?(?:(?P<days>\d+)D)?'
        r'(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?'
        r'(?:(?P<seconds>\d+(?:\.\d+)?)S)?)?',
        duration,
    )
    if match is None:
        raise ValueError(f"Unrecognised ISO 8601 duration in PREVLINT: {interval!r}")

    # Year and month designators are accepted but not converted: they have no
    # fixed length in minutes (the sample data shown in this notebook carries zeros there).
    days = int(match['days'] or 0)
    hours = int(match['hours'] or 0)
    minutes = int(match['minutes'] or 0)
    seconds = float(match['seconds'] or 0)

    return days * 24 * 60 + hours * 60 + minutes + seconds / 60

# Apply the function to the DataFrame
exercise['duration_minutes'] = exercise.apply(extract_duration, axis=1)

In [7]:
exercise['PLNEXDUR'].isnull().any()

False

In [8]:
# Fold the duration parsed from PREVLINT into PLNEXDUR.
# A missing duration_minutes counts as 0, so those rows keep their PLNEXDUR unchanged.
# NOTE: re-running this cell adds the parsed duration a second time.
exercise['PLNEXDUR'] = exercise['PLNEXDUR'] + exercise['duration_minutes'].fillna(0)


In [9]:
# Select and rename columns.
# Take an explicit copy: later cells modify exdi, and without the copy those
# writes target a slice of `exercise` (SettingWithCopy, hidden by the warnings filter).
exdi = exercise[['USUBJID', 'PRSTDTC', 'PLNEXDUR', 'EXCINTSY', 'PRTRT']].copy()
exdi = exdi.rename(columns={
    'USUBJID': 'ID',
    'PRSTDTC': 'start_datetime',
    'PLNEXDUR': 'duration',
    'EXCINTSY': 'intensity',
    'PRTRT': 'type_of_exercise',
})

In [10]:
# Rows without a participant ID cannot be linked to anything, so drop them
exdi = exdi.dropna(subset=['ID'])
# ID as int for concatenation
exdi = exdi.assign(ID=exdi['ID'].astype(int))

In [11]:
exdi.shape[0]

6743

In [12]:
len(exdi.ID.unique())

262

### 4.1.2. Adjust date columns

In [12]:
# Calculate end time from start and duration (duration is in minutes).
# `time` is the timedelta alias imported at the top of the notebook.
exdi['finish_datetime'] = [
    started + time(minutes=minutes)
    for started, minutes in zip(exdi['start_datetime'], exdi['duration'])
]

In [13]:
# Collect date information from start_datetime.
# Judging by the frame displayed a few cells below, this adds the month, day,
# day_of_week and time_of_day columns.
# NOTE(review): the meaning of the three positional flags (False, True, True) is
# defined in preprocess_helper and is not visible here - confirm against the helper.
exdi = preprocess_helper.date_preprocessing(exdi, 'start_datetime', False, True, True)

In [14]:
# Keep only bouts that have an id, a start time and a duration
exdi = exdi[exdi[['ID', 'start_datetime', 'duration']].notna().all(axis=1)]

In [15]:
exdi.shape[0]

6743

In [16]:
# Order bouts chronologically within each participant (ties broken by finish time)
exdi = exdi.sort_values(by=['ID', 'start_datetime', 'finish_datetime'])

In [17]:
# Keep only bouts with a strictly positive duration
exdi = exdi[exdi['duration'].gt(0)]

In [18]:
exdi.shape[0]

6729

In [19]:
exdi

Unnamed: 0,ID,start_datetime,duration,intensity,type_of_exercise,finish_datetime,month,day,day_of_week,time_of_day
6134,6,2021-02-13 15:07:00,60.000000,Moderate: Working to keep up,Undefined Exercise,2021-02-13 16:07:00,2,13,5,afternoon
6135,6,2021-02-14 19:21:00,50.000000,Moderate: Working to keep up,Undefined Exercise,2021-02-14 20:11:00,2,14,6,evening
6132,6,2021-02-15 17:15:00,115.000000,Low: Pretty easy,Physical Labor,2021-02-15 19:10:00,2,15,0,evening
6136,6,2021-02-16 14:00:00,45.000000,Moderate: Working to keep up,Undefined Exercise,2021-02-16 14:45:00,2,16,1,afternoon
6137,6,2021-02-18 16:22:56,40.000000,Heavy: Hard to keep going but did it,Undefined Exercise,2021-02-18 17:02:56,2,18,3,afternoon
...,...,...,...,...,...,...,...,...,...,...
6062,540,2021-05-21 20:20:00,20.000000,Mild: Working a bit,Running/Jogging,2021-05-21 20:40:00,5,21,4,evening
6065,540,2021-05-22 07:15:48,7.266667,,WALKING,2021-05-22 07:23:04,5,22,5,morning
6066,540,2021-05-22 11:10:04,13.916667,,WALKING,2021-05-22 11:23:59,5,22,5,morning
6067,540,2021-05-22 14:46:52,9.333333,,WALKING,2021-05-22 14:56:12,5,22,5,afternoon


def handle_overlaps(group):
    """Resolve overlapping exercise bouts within one participant's records.

    Bouts are visited in order of ``start_datetime``. A bout that starts before
    the previously kept bout has finished is treated as overlapping it, and only
    one of the two is kept:

    * if exactly one of them lasts between 10 and 120 minutes (inclusive),
      that one is kept;
    * otherwise the longer one is kept (the earlier bout wins ties).

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single participant with ``start_datetime`` and
        ``finish_datetime`` columns.

    Returns
    -------
    pandas.DataFrame
        The retained, non-overlapping rows.
    """
    # Sort by start_datetime
    group = group.sort_values(by='start_datetime')

    non_overlapping = []
    for _, row in group.iterrows():
        if non_overlapping:
            last_interval = non_overlapping[-1]
            # Check for overlap
            if row['start_datetime'] < last_interval['finish_datetime']:
                # total_seconds() is required here: Timedelta.seconds drops whole
                # days, so a bout of 24 h or more would be measured as its
                # sub-day remainder and could pass as a plausible duration.
                curr_duration = (row['finish_datetime'] - row['start_datetime']).total_seconds() / 60
                last_duration = (last_interval['finish_datetime'] - last_interval['start_datetime']).total_seconds() / 60

                # Decide which interval to keep
                if (10 <= curr_duration <= 120) and (last_duration < 10 or last_duration > 120):
                    non_overlapping[-1] = row
                elif (10 <= last_duration <= 120) and (curr_duration < 10 or curr_duration > 120):
                    continue
                elif curr_duration > last_duration:
                    non_overlapping[-1] = row
                # Otherwise the previously kept bout stays and this one is discarded.
            else:
                non_overlapping.append(row)
        else:
            non_overlapping.append(row)

    return pd.DataFrame(non_overlapping)

# Apply the function to each group
exdi = exdi.groupby('ID').apply(handle_overlaps).reset_index(drop=True)


In [20]:
def handle_overlaps(group):
    """Resolve overlapping exercise bouts for one participant, logging a few of them.

    Bouts are visited in order of ``start_datetime``. A bout that starts before
    the previously kept bout has finished is treated as overlapping it, and only
    one of the two is kept:

    * if exactly one of them lasts between 10 and 120 minutes (inclusive),
      that one is kept;
    * otherwise the longer one is kept (the earlier bout wins ties).

    Up to five overlaps per call (i.e. per participant group) are printed.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single participant with ``ID``, ``start_datetime`` and
        ``finish_datetime`` columns.

    Returns
    -------
    pandas.DataFrame
        The retained, non-overlapping rows.
    """
    # Sort by start_datetime
    group = group.sort_values(by='start_datetime')

    non_overlapping = []
    overlap_count = 0  # Counter to limit the number of printed overlaps
    for _, row in group.iterrows():
        if non_overlapping:
            last_interval = non_overlapping[-1]
            # Check for overlap
            if row['start_datetime'] < last_interval['finish_datetime']:
                # Print the overlapping intervals (limited to a few for brevity)
                if overlap_count < 5:
                    print(f"Overlap detected for ID {group['ID'].iloc[0]}:")
                    print(f"Interval 1: {last_interval['start_datetime']} to {last_interval['finish_datetime']}")
                    print(f"Interval 2: {row['start_datetime']} to {row['finish_datetime']}\n")
                    overlap_count += 1

                # total_seconds() is required here: Timedelta.seconds drops whole
                # days, so a bout of 24 h or more would be measured as its
                # sub-day remainder and could pass as a plausible duration.
                curr_duration = (row['finish_datetime'] - row['start_datetime']).total_seconds() / 60
                last_duration = (last_interval['finish_datetime'] - last_interval['start_datetime']).total_seconds() / 60

                # Decide which interval to keep
                if (10 <= curr_duration <= 120) and (last_duration < 10 or last_duration > 120):
                    non_overlapping[-1] = row
                elif (10 <= last_duration <= 120) and (curr_duration < 10 or curr_duration > 120):
                    continue
                elif curr_duration > last_duration:
                    non_overlapping[-1] = row
                # Otherwise the previously kept bout stays and this one is discarded.
            else:
                non_overlapping.append(row)
        else:
            non_overlapping.append(row)

    return pd.DataFrame(non_overlapping)

# Apply the function to each group
exdi = exdi.groupby('ID').apply(handle_overlaps).reset_index(drop=True)


Overlap detected for ID 6:
Interval 1: 2021-02-18 16:22:56 to 2021-02-18 17:02:56
Interval 2: 2021-02-18 17:00:00 to 2021-02-18 22:00:00

Overlap detected for ID 6:
Interval 1: 2021-02-19 11:00:00 to 2021-02-19 17:00:00
Interval 2: 2021-02-19 11:39:23 to 2021-02-19 11:45:58

Overlap detected for ID 9:
Interval 1: 2021-11-10 16:11:47 to 2021-11-10 16:56:47
Interval 2: 2021-11-10 16:18:05 to 2021-11-10 16:25:02

Overlap detected for ID 11:
Interval 1: 2021-04-02 08:22:00 to 2021-04-02 08:37:00
Interval 2: 2021-04-02 08:22:58 to 2021-04-02 08:36:11

Overlap detected for ID 11:
Interval 1: 2021-04-04 15:45:00 to 2021-04-04 16:06:17
Interval 2: 2021-04-04 15:45:00 to 2021-04-04 16:45:00

Overlap detected for ID 11:
Interval 1: 2021-04-04 15:45:00 to 2021-04-04 16:45:00
Interval 2: 2021-04-04 16:13:23 to 2021-04-04 16:21:07

Overlap detected for ID 11:
Interval 1: 2021-04-06 16:30:00 to 2021-04-06 17:00:00
Interval 2: 2021-04-06 16:38:36 to 2021-04-06 16:45:08

Overlap detected for ID 11:
In

In [21]:
exdi.shape[0]

5988

### 4.1.3. Convert intensity

In [22]:
# Map the free-text intensity labels onto ordinal codes.
# The position of each tuple is the code: 0 = low/mild, 1 = moderate, 2 = heavy/exhaustive.
intensity_dict = {
    label: code
    for code, labels in enumerate([
        ('Low: Pretty easy', 'Feeling the burn - Mild', 'Mild: Working a bit'),
        ('Moderate: Working to keep up',),
        ('Heavy: Hard to keep going but did it', 'Exhaustive: Too tough/Had to stop'),
    ])
    for label in labels
}
# Replace the labels; values not present in the mapping are left untouched
exdi['intensity'] = exdi['intensity'].replace(intensity_dict)

In [23]:
# Reset index
exdi.reset_index(drop=True, inplace=True)

### 4.1.4. Type of exercise

In [24]:
# Derive form_of_exercise by passing each type_of_exercise value through the project helper
exdi['form_of_exercise'] = exdi['type_of_exercise'].apply(preprocess_helper.divide_exercise_into_type)

physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
other
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
physical labor
phys

In [25]:
# Removing bouts that aren't really exercise
#exdi.dropna(subset=['form_of_exercise'], inplace=True)

In [26]:
exdi.shape

(5988, 11)

## 4.2. Clean and combine CGM files

In [27]:
# Read the lab results file (also reused later for the HbA1c values)
lb = pd.read_csv(directory+'lb.csv')

# Select CGM readings and keep only the columns needed.
# Take an explicit copy so the assignments below never write into a slice of `lb`.
cgm = lb.loc[lb['LBCAT']=='CGM', ['USUBJID', 'LBDTC', 'LBORRES']].copy()

# Rename columns
cgm.columns = ['ID', 'time', 'glc']

# Convert timestamp to datetime
cgm['time'] = pd.to_datetime(cgm['time'])

# Drop any row with a missing ID, timestamp or glucose value
cgm = cgm.dropna()

# Convert glucose from mg/dL to mmol/L (divide by 18).
# NOTE(review): assumes LBORRES is recorded in mg/dL for CGM rows - confirm against the data dictionary.
cgm['glc'] = (cgm['glc']/18).round(2)

# ID to int for concatenation
cgm['ID'] = cgm['ID'].astype(int)


In [28]:
visualizations.glucose_trace(cgm, 107)

## 4.3. Lab & demographic data

In [31]:
# Read the demographics file
demographics = pd.read_csv(directory+'dm.csv')

# Select and rename the columns, drop rows without an ID, spell out the sex
# codes and cast the ID to int for concatenation
demo = (
    demographics[['USUBJID', 'AGE', 'SEX', 'RACE']]
    .rename(columns={'USUBJID': 'ID', 'AGE': 'age', 'SEX': 'sex', 'RACE': 'race'})
    .dropna(subset=['ID'])
    .assign(
        sex=lambda d: d['sex'].replace({'M':'male', 'F':'female'}),
        ID=lambda d: d['ID'].astype(int),
    )
)

### 4.3.1. Calculate hba1c

In [32]:
# Select the HbA1c results from the lab file, rename the columns, cast the ID
# to int for concatenation and convert the value with (x - 2.15) * 10.929.
# NOTE(review): that formula maps HbA1c in % to mmol/mol, so LBORRES is assumed to be in % - confirm.
hba1c = (
    lb.loc[lb['LBTESTCD']=='HBA1C', ['USUBJID', 'LBORRES']]
    .rename(columns={'USUBJID': 'ID', 'LBORRES': 'hba1c'})
    .assign(
        ID=lambda d: d['ID'].astype(int),
        hba1c=lambda d: (d['hba1c'] - 2.15)*10.929,
    )
)


### 4.3.2. Calculate diabetes onset

In [33]:
# Set up list for results
fa_data = []

# Read file
fa = pd.read_sas(directory+'FA.xpt', encoding='utf-8',chunksize=10000, iterator=True)

In [34]:
# Stream the file chunk by chunk (it is too large to load at once) and keep
# only the "DIABETES ONSET" rows.
# NOTE: `fa` is a one-shot iterator; re-running this cell alone collects nothing.
for chunk in fa:
    onset_rows = chunk.loc[chunk['FAOBJ']== "DIABETES ONSET", ['USUBJID', 'FASTRESN']]
    if not onset_rows.empty:
        fa_data.append(onset_rows)

In [46]:
# Concat all the chunks and rename the columns
fa_data = pd.concat(fa_data).rename(
    columns={'USUBJID': 'ID', 'FASTRESN': 'years_since_diagnosis'}
)

# Make id int for concatenation
fa_data['ID'] = fa_data['ID'].astype(int)

### 4.3.3. Calculate BMI

In [35]:
# Set up list for results
vs_data=[]

# Read file
vs = pd.read_sas(directory+'VS.xpt', encoding='utf-8',chunksize=10000, iterator=True)

In [36]:
# Stream the file chunk by chunk (it is too large to load at once) and keep
# only the height and weight measurements.
# NOTE: `vs` is a one-shot iterator; re-running this cell alone collects nothing.
for chunk in vs:
    body_rows = chunk.loc[chunk['VSTESTCD'].isin(['HEIGHT', 'WEIGHT']), ['USUBJID', 'VSTESTCD', 'VSORRES']]
    if not body_rows.empty:
        vs_data.append(body_rows)

In [37]:
# Concat all the chunks
vs_data = pd.concat(vs_data)

# Reshape to one row per participant with one column per measurement.
# Columns are renamed by name rather than by position, so a missing or
# unexpected test code fails loudly instead of being silently mislabelled.
vs_data_adj = (
    vs_data
    .pivot(index='USUBJID', columns='VSTESTCD', values='VSORRES')
    .rename(columns={'HEIGHT': 'h', 'WEIGHT': 'w'})
    .rename_axis(columns=None)
)

# Make sure both measurements are numeric (no-op when they already are)
vs_data_adj[['h', 'w']] = vs_data_adj[['h', 'w']].apply(pd.to_numeric)

# Calculate bmi with the imperial formula 703 * weight / height^2.
# NOTE(review): assumes weight is in pounds and height in inches - confirm the VS units.
vs_data_adj['bmi'] = (703*vs_data_adj['w'])/(vs_data_adj['h']*vs_data_adj['h'])

# Reset index
vs_data_adj = vs_data_adj.reset_index()

# Select only id and bmi and rename (rename returns a new frame, so the
# assignment below does not write into a slice of vs_data_adj)
bmi = vs_data_adj[['USUBJID', 'bmi']].rename(columns={'USUBJID': 'ID'})

# Make id int for concatenation
bmi['ID'] = bmi['ID'].astype(int)

## 4.4. Insulin modality

In [38]:
# Read file
dx = pd.read_sas(directory+'DX.xpt', encoding='utf-8')

In [39]:
# Keep the insulin-modality rows (ID and treatment only) and shorten the treatment labels
insulin_modality = (
    dx.loc[dx['DXCAT']=='INSULIN MODALITY', ['USUBJID', 'DXTRT']]
    .replace({
        'MULTIPLE DAILY INJECTIONS': 'mdi',
        'INSULIN PUMP':'pump',
        'CLOSED LOOP INSULIN PUMP':'closed_loop',
    })
)

In [40]:
# Rename columns
insulin_modality = insulin_modality.set_axis(['ID', 'insulin_modality'], axis=1)

# Convert ID to int for concatenation
insulin_modality = insulin_modality.assign(ID=insulin_modality['ID'].astype(int))

## 4.5. Combine all dfs

In [48]:
# Per-participant tables to combine
dfs = [demo, hba1c, bmi, fa_data, insulin_modality]

# Outer-merge them one after another on ID so no participant is lost
demo_lab = ft.reduce(
    lambda merged, table: merged.merge(table, on='ID', how='outer'),
    dfs,
)

In [49]:
# Prefix every ID with 'dexip_' so IDs stay unique when combined with other studies.
# NOTE: re-running this cell prepends the prefix a second time.
for frame in (exdi, cgm, demo_lab):
    frame['ID'] = frame['ID'].map(lambda value: 'dexip_' + str(value))

## 4.6. Save files

In [50]:
# Write the three tidy tables to the output directory (row index is not saved)
df_directory = '../../data/tidy_data/dexip/'
exdi.to_csv(f'{df_directory}exercise.csv', index=False)
cgm.to_csv(f'{df_directory}cgm.csv', index=False)
demo_lab.to_csv(f'{df_directory}demo_lab.csv', index=False)