# Preprocess blood glucose datasets

## Import libraries

In [1]:
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import glob
import os
import pickle

In [2]:
def print_statistics(dataframes):
    min_values = []
    mean_values = []
    max_values = []
    lengths = []
    
    for df in dataframes:
        min_values.append(df['CGM'].min())
        mean_values.append(df['CGM'].mean())
        max_values.append(df['CGM'].max())
        lengths.append(len(df))
    
    # Calculating the average statistics
    avg_min_value = sum(min_values) / len(min_values)
    avg_mean_value = sum(mean_values) / len(mean_values)
    avg_max_value = sum(max_values) / len(max_values)
    
    # Calculating overall statistics
    all_values = pd.concat([df['CGM'] for df in dataframes])
    overall_avg_value = all_values.mean()
    overall_min_value = all_values.min()
    overall_max_value = all_values.max()
    
    # Calculating the average length of the timeseries
    avg_length = sum(lengths) / len(lengths)
    
    # Printing the statistics
    print(f"Average minimum value of the timeseries: {avg_min_value}")
    print(f"Average mean value of the timeseries: {avg_mean_value}")
    print(f"Average maximum value of the timeseries: {avg_max_value}")
    print(f"Average value from all the timeseries: {overall_avg_value}")
    print(f"Minimum value from all the timeseries: {overall_min_value}")
    print(f"Maximum value from all the timeseries: {overall_max_value}")
    print(f"Average length of the timeseries: {avg_length}")
    print(f"Number of timeseries: {len(dataframes)}")

In [3]:
def check_15_min_intervals(dataframes):
    """
    Checks whether the 'Date' column in each DataFrame in the list has timestamps exactly 15 minutes apart.

    Parameters:
    dataframes (list of pd.DataFrame): List of DataFrames to check.

    Returns:
    results (list of bool): List of booleans indicating if each DataFrame has 15-minute intervals.
    """
    results = []
    for df in dataframes:
        df = df.sort_values(by='Date').reset_index(drop=True)
        # Calculate the difference in minutes between consecutive timestamps
        time_diffs = df['Date'].diff().dropna().dt.total_seconds() / 60
        # Check if all differences are exactly 15 minutes 
        if not (time_diffs == 15).all():
            print(f'Found intervals {time_diffs[(time_diffs != 15)].values} minutes apart!')
        results.append((time_diffs == 15).all())
    return results

In [4]:
def check_5_min_intervals(dataframes):
    """
    Checks whether the 'Date' column in each DataFrame in the list has timestamps exactly 15 minutes apart.

    Parameters:
    dataframes (list of pd.DataFrame): List of DataFrames to check.

    Returns:
    results (list of bool): List of booleans indicating if each DataFrame has 15-minute intervals.
    """
    results = []
    for df in dataframes:
        df = df.sort_values(by='Date').reset_index(drop=True)
        # Calculate the difference in minutes between consecutive timestamps
        time_diffs = df['Date'].diff().dropna().dt.total_seconds() / 60
        # Check if all differences are exactly 15 minutes 
        if not (time_diffs == 5).all():
            print(f'Found intervals {time_diffs[(time_diffs != 5)].values} minutes apart!')
        results.append((time_diffs == 5).all())
    return results

In [5]:
def check_and_fix_15_min_intervals(dataframes,sample_size):
    """
    Checks whether the 'Date' column in each DataFrame in the list has timestamps exactly 15 minutes apart.
    If not, it corrects the intervals by adding missing timestamps and interpolating the 'CGM' values.

    Parameters:
    dataframes (list of pd.DataFrame): List of DataFrames to check and fix.

    Returns:
    results (list of pd.DataFrame): List of corrected DataFrames.
    """
    corrected_dataframes = []
    for df in dataframes:
        df = df.sort_values(by='Date').reset_index(drop=True)
        # Calculate the difference in minutes between consecutive timestamps
        time_diffs = df['Date'].diff().dropna().dt.total_seconds() / 60

        while not (time_diffs == 15).all():
            # Find the indices where the difference is not 15 minutes
            incorrect_intervals = time_diffs[time_diffs != 15].index
            idx = incorrect_intervals[0]

            if time_diffs[idx] != 30:
                df1 = df.iloc[:idx - 1]
                corrected_dataframes.append(df1)
                df = df.iloc[idx:].reset_index(drop=True)

            else:
                # Insert missing rows to correct the intervals
                prev_row = df.iloc[idx - 1]
                curr_row = df.iloc[idx]

                missing_time = prev_row['Date'] + pd.Timedelta(minutes=15)
                missing_value = (prev_row['CGM'] + curr_row['CGM']) / 2  # Linear interpolation

                # Create a new row with the interpolated value
                new_row = pd.DataFrame({'Date': [missing_time], 'CGM': [missing_value]})

                # Insert the new row into the DataFrame
                df = pd.concat([df.iloc[:idx], new_row, df.iloc[idx:]]).reset_index(drop=True)

                # Recalculate the time differences
            time_diffs = df['Date'].diff().dropna().dt.total_seconds() / 60

        corrected_dataframes.append(df)

    corrected_dataframes = [df for df in corrected_dataframes if len(df) >= sample_size]

    return corrected_dataframes

In [6]:
def check_and_fix_5_min_intervals(dataframes,sample_size):
    """
    Checks whether the 'Date' column in each DataFrame in the list has timestamps exactly 15 minutes apart.
    If not, it corrects the intervals by adding missing timestamps and interpolating the 'CGM' values.

    Parameters:
    dataframes (list of pd.DataFrame): List of DataFrames to check and fix.

    Returns:
    results (list of pd.DataFrame): List of corrected DataFrames.
    """
    corrected_dataframes = []
    count = 0
    for df in dataframes:
        df = df.sort_values(by='Date').reset_index(drop=True)
        # Calculate the difference in minutes between consecutive timestamps
        time_diffs = df['Date'].diff().dropna().dt.total_seconds() / 60
        while not (time_diffs == 5).all():
            # Find the indices where the difference is not 15 minutes
            incorrect_intervals = time_diffs[time_diffs != 5].index
            idx = incorrect_intervals[0]

            if time_diffs[idx] >= 10:
                df1 = df.iloc[:idx - 1]
                corrected_dataframes.append(df1)
                df = df.iloc[idx:].reset_index(drop=True)
            elif (time_diffs[idx] < 10) & (time_diffs[idx] > 5):
                # Insert missing rows to correct the intervals
                prev_row = df.iloc[idx - 1]
                curr_row = df.iloc[idx]

                missing_time = prev_row['Date'] + pd.Timedelta(minutes=5)
                missing_value = (prev_row['CGM'] + curr_row['CGM']) / 2  # Linear interpolation

                # Create a new row with the interpolated value
                new_row = pd.DataFrame({'Date': [missing_time], 'CGM': [missing_value]})

                # Insert the new row into the DataFrame
                df = pd.concat([df.iloc[:idx], new_row, df.iloc[idx:]]).reset_index(drop=True)
            else:
                df = df.drop(idx).reset_index(drop=True)
                # Recalculate the time differences
            time_diffs = df['Date'].diff().dropna().dt.total_seconds() / 60

        corrected_dataframes.append(df)

    corrected_dataframes = [df for df in corrected_dataframes if len(df) >= sample_size]

    return corrected_dataframes

## Define context window and prediction window

In [7]:
CONTEXT_HOURS = 24
PREDICTION_HOURS = 3

samples_per_hour = 4
prediction_length = PREDICTION_HOURS * samples_per_hour
context_length = CONTEXT_HOURS * samples_per_hour

sample_size = prediction_length + context_length
print(f"Each timeseries has to have at least {sample_size} points")

Each timeseries has to have at least 108 points


In [51]:
TRAIN_RATIO = 0.9
VAL_RATIO = 0.05
TEST_RATIO = 0.05

split_ratios = [TRAIN_RATIO, VAL_RATIO, TEST_RATIO]

In [9]:
datasets_dictionary = {}

## Shanghai T1DM

In [10]:
file_path = f"../../datasets/Shanghai_T1DM"

dataframes = []
file_names = []

excel_files = glob.glob(os.path.join(file_path, "*.xls")) + glob.glob(os.path.join(file_path, "*.xlsx"))

for file in excel_files:
    df = pd.read_excel(file)
    if 'CGM (mg / dl)' in df.columns:
        df = df[['Date', 'CGM (mg / dl)']]
    else:
        df = df[['Date', 'CGM ']]

    if df.isna().any().any():
        print(f"DataFrame {file} has columns with NaN values:")
        print(df.isna().sum())
        continue

    dataframes.append(df.sort_values(by='Date').rename(columns={
        'CGM (mg / dl)': 'CGM',
        'CGM ': 'CGM'
    }))

    file_names.append(file.split('.')[-2].split('/')[-1])

In [11]:
check = check_15_min_intervals(dataframes)

Found intervals [30.] minutes apart!


In [12]:
corrected_dataframes_shanghaiT1DM = check_and_fix_15_min_intervals(dataframes,sample_size)

check = check_15_min_intervals(corrected_dataframes_shanghaiT1DM)
if check == [True for i in range(len(check))]:
    print('Shanghai T1DM is in the appropriate time format')

Shanghai T1DM is in the appropriate time format


In [13]:
print_statistics(corrected_dataframes_shanghaiT1DM)

Average minimum value of the timeseries: 47.1375
Average mean value of the timeseries: 163.6159111235018
Average maximum value of the timeseries: 365.40000000000003
Average value from all the timeseries: 164.74587155963303
Minimum value from all the timeseries: 39.6
Maximum value from all the timeseries: 475.2
Average length of the timeseries: 981.0
Number of timeseries: 16


In [14]:
datasets_dictionary['ShanghaiT1DM'] = corrected_dataframes_shanghaiT1DM

## Shanghai T2DM

In [15]:
file_path = f"../../datasets/Shanghai_T2DM"

dataframes = []
file_names = []

excel_files = glob.glob(os.path.join(file_path, "*.xls")) + glob.glob(os.path.join(file_path, "*.xlsx"))

for file in excel_files:
    df = pd.read_excel(file)
    if 'CGM (mg / dl)' in df.columns:
        df = df[['Date', 'CGM (mg / dl)']]
    else:
        df = df[['Date', 'CGM ']]

    if df.isna().any().any():
        print(f"DataFrame {file} has columns with NaN values:")
        print(df.isna().sum())
        continue

    dataframes.append(df.sort_values(by='Date').rename(columns={
        'CGM (mg / dl)': 'CGM',
        'CGM ': 'CGM'
    }))

    file_names.append(file.split('.')[-2].split('/')[-1])

DataFrame ../../datasets/Shanghai_T2DM\2029_0_20210526.xls has columns with NaN values:
Date              0
CGM (mg / dl)    13
dtype: int64


In [16]:
check = check_15_min_intervals(dataframes)

Found intervals [60.] minutes apart!
Found intervals [30.] minutes apart!
Found intervals [30. 45.] minutes apart!
Found intervals [45.] minutes apart!
Found intervals [30. 30. 30. 30. 30.] minutes apart!


In [17]:
corrected_dataframes_shanghaiT2DM = check_and_fix_15_min_intervals(dataframes,sample_size)
check = check_15_min_intervals(corrected_dataframes_shanghaiT2DM)
if check == [True for i in range(len(check))]:
    print('Shanghai T2DM is in the appropriate time format')

Shanghai T2DM is in the appropriate time format


In [18]:
print_statistics(corrected_dataframes_shanghaiT2DM)

Average minimum value of the timeseries: 64.94862385321106
Average mean value of the timeseries: 141.49387217421102
Average maximum value of the timeseries: 289.0238532110093
Average value from all the timeseries: 140.36590273545335
Minimum value from all the timeseries: 39.6
Maximum value from all the timeseries: 468.0
Average length of the timeseries: 1023.5963302752293
Number of timeseries: 109


In [19]:
datasets_dictionary['ShanghaiT2DM'] = corrected_dataframes_shanghaiT2DM

## Patient 0

In [20]:
# Load the dataset
df = pd.read_csv('../../datasets/patient0_raw_data.csv')

# Print the number of records before removing NaN values
print(f"Number of records before removing NaN values: {len(df)}")

# Remove rows with NaN values in 'Historic Glucose mg/dL'
df_clean = df.dropna(subset=['Historic Glucose mg/dL'])

# Print the number of records after removing NaN values
print(f"Number of records after removing NaN values: {len(df_clean)}")

Number of records before removing NaN values: 85923
Number of records after removing NaN values: 61544


In [21]:
df_clean.head()

Unnamed: 0,Device Timestamp,Historic Glucose mg/dL
3,24/07/2022 00:12,162.0
4,24/07/2022 00:27,155.0
5,24/07/2022 00:42,156.0
6,24/07/2022 00:57,163.0
11,24/07/2022 01:12,164.0


In [22]:
# Group by 'Device Timestamp' and calculate the mean of 'Historic Glucose mg/dL'
df_grouped = df_clean.groupby('Device Timestamp', as_index=False)['Historic Glucose mg/dL'].mean()

# Print the number of records before and after removing duplicates
print(f"Number of records before removing duplicates: {len(df_clean)}")
print(f"Number of records after removing duplicates: {len(df_grouped)}")

# Display the cleaned DataFrame
print(df_grouped.head())

Number of records before removing duplicates: 61544
Number of records after removing duplicates: 61536
   Device Timestamp  Historic Glucose mg/dL
0  01/01/2023 00:08                   148.0
1  01/01/2023 00:23                   166.0
2  01/01/2023 00:38                   205.0
3  01/01/2023 00:53                   231.0
4  01/01/2023 01:08                   248.0


In [23]:
# Convert 'Device Timestamp' to datetime
df_grouped['Device Timestamp'] = pd.to_datetime(df_grouped['Device Timestamp'], format='%d/%m/%Y %H:%M')

# Sort by timestamp to ensure the time intervals are calculated correctly
df_grouped = df_grouped.sort_values(by='Device Timestamp').reset_index(drop=True)

# Calculate the intervals between consecutive timestamps
intervals = df_grouped['Device Timestamp'].diff().dropna().dt.total_seconds() / 60  # convert to minutes

In [24]:
def histogram_bin_values(data, num_bins):

    hist, bin_edges = np.histogram(data, bins=num_bins)

    # Print the values in each bin
    bins = [[] for _ in range(num_bins)]
    for value in data:
        for i in range(num_bins):
            if bin_edges[i] <= value < bin_edges[i + 1]:
                bins[i].append(value)
                break
    # Include the rightmost edge value in the last bin
    bins[-1].extend([value for value in data if value == bin_edges[-1]])

    bins_percentage = {}
    # Display the bin ranges and the values in each bin
    for i in range(num_bins):
        bins_percentage[f"Bin {i + 1} ({bin_edges[i]:.2f} to {bin_edges[i + 1]:.2f})"] = ((len(bins[i])/len(data)))*100
    return bins_percentage


In [25]:
num_bins = 1000
bins_percentage = histogram_bin_values(intervals, num_bins)

In [26]:
{k: v for k, v in sorted(bins_percentage.items(), reverse=True, key=lambda item: item[1]) if v>0.01}

{'Bin 2 (12.50 to 24.00)': 99.52384821646217,
 'Bin 3 (24.00 to 35.50)': 0.10888112456325669,
 'Bin 1 (1.00 to 12.50)': 0.08287966198098642,
 'Bin 7 (69.99 to 81.49)': 0.03900219387340538,
 'Bin 6 (58.50 to 69.99)': 0.03412691963922971,
 'Bin 8 (81.49 to 92.99)': 0.02437637117087836,
 'Bin 9 (92.99 to 104.49)': 0.0178760055253108,
 'Bin 4 (35.50 to 47.00)': 0.016250914113918907,
 'Bin 10 (104.49 to 115.99)': 0.016250914113918907,
 'Bin 11 (115.99 to 127.49)': 0.014625822702527016,
 'Bin 14 (150.49 to 161.99)': 0.014625822702527016,
 'Bin 5 (47.00 to 58.50)': 0.013000731291135124,
 'Bin 15 (161.99 to 173.49)': 0.011375639879743236}

In [27]:
df_clean = df_grouped

# Split the time series for large intervals
max_interpolation_interval = 40  # in minutes

initial_segments = []
current_segment = []

previous_timestamp = df_clean.loc[0, 'Device Timestamp']
current_segment.append(df_clean.loc[0])

for i in range(1, len(df_clean)):
    current_timestamp = df_clean.loc[i, 'Device Timestamp']
    time_diff = (current_timestamp - previous_timestamp).total_seconds() / 60  # difference in minutes
    
    if time_diff > max_interpolation_interval:
        initial_segments.append(pd.DataFrame(current_segment))
        current_segment = []
    current_segment.append(df_clean.loc[i])
    previous_timestamp = current_timestamp

# Add the last segment if it's not empty
if current_segment:
    initial_segments.append(pd.DataFrame(current_segment))

In [28]:
final_segments = []
for segment in initial_segments:
    if len(segment) > sample_size:
        final_segments.append(pd.DataFrame(segment))

# Print the number of segments and the number of records in each segment
print(f"Number of segments: {len(final_segments)}")

Number of segments: 111


In [29]:
# Function to resample and interpolate a segment
def resample_and_interpolate(segment):
    interpolated_segment = segment.set_index('Device Timestamp')
    interpolated_segment = interpolated_segment.resample('15min').mean()
    interpolated_segment['Historic Glucose mg/dL'] = interpolated_segment['Historic Glucose mg/dL'].interpolate(method='linear', limit_direction='both')
    interpolated_segment.reset_index(inplace=True)
    return interpolated_segment

# Apply resampling and interpolation to each segment
resampled_segments = [resample_and_interpolate(segment) for segment in final_segments]

In [30]:
resampled_segments = [df.rename(columns={'Device Timestamp':'Date','Historic Glucose mg/dL':'CGM'}) for df in resampled_segments]

In [31]:
# Define the output directory path
output_dir = os.path.join('..', '..', 'datasets', 'Patient0')

# Check if the directory exists, if not, create it
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the segments to CSV files in the specified directory
for i, segment in enumerate(resampled_segments):
    segment.to_csv(os.path.join(output_dir, f'resampled_segment_{i+1}.csv'), index=False)


In [32]:
file_path = f"../../datasets/Patient0"

dataframes = []

csv_files = glob.glob(os.path.join(file_path, "*.csv"))

for file in csv_files:
    df = pd.read_csv(file)
    if df.isna().any().any():
        print(f"DataFrame {file} has columns with NaN values:")
        print(df.isna().sum())
        continue
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d %H:%M:%S')
    dataframes.append(df.sort_values(by='Date'))

In [33]:
check = check_15_min_intervals(dataframes)

In [34]:
corrected_dataframes_Patient0 = check_and_fix_15_min_intervals(dataframes,sample_size)

check = check_15_min_intervals(corrected_dataframes_Patient0)
if check == [True for i in range(len(check))]:
    print('Patient0 Dataset is in the appropriate time format')
    
print_statistics(corrected_dataframes_Patient0)
datasets_dictionary['Patient0'] = corrected_dataframes_Patient0

Patient0 Dataset is in the appropriate time format
Average minimum value of the timeseries: 66.69369369369369
Average mean value of the timeseries: 158.3611250229459
Average maximum value of the timeseries: 310.3063063063063
Average value from all the timeseries: 160.35261472317276
Minimum value from all the timeseries: 40.0
Maximum value from all the timeseries: 483.0
Average length of the timeseries: 513.3783783783783
Number of timeseries: 111


## OhioT1DM

In [35]:
def split_dataframes_on_time_difference(dataframes, sample_size, max_diff_minutes=10):
    """
    Splits DataFrames in the list when the time difference between consecutive rows is more than max_diff_minutes.

    Parameters:
    dataframes (list of pd.DataFrame): List of DataFrames to split.
    max_diff_minutes (int): Maximum allowed time difference in minutes between consecutive rows.

    Returns:
    split_dataframes (list of pd.DataFrame): List of split DataFrames.
    """
    split_dataframes = []
    
    for df in dataframes:
        # Ensure the 'Date' column is sorted
        df = df.sort_values(by='Date').reset_index(drop=True)
        
        # Calculate the time differences in minutes
        time_diffs = df['Date'].diff().dt.total_seconds() / 60
        
        # Initialize the current segment
        current_segment = [df.iloc[0]]
        
        for i in range(1, len(df)):
            if time_diffs.iloc[i] > max_diff_minutes:
                # If the time difference is greater than max_diff_minutes, finalize the current segment
                split_dataframes.append(pd.DataFrame(current_segment))
                # Start a new segment
                current_segment = [df.iloc[i]]
            else:
                # Otherwise, add the row to the current segment
                current_segment.append(df.iloc[i])
        
        # Add the last segment
        if current_segment:
            split_dataframes.append(pd.DataFrame(current_segment))
    split_dataframes = [df for df in split_dataframes if len(df) >= sample_size]
    return split_dataframes

In [36]:
import xml.etree.ElementTree as ET

# Define the directory containing the XML files
dataset_dir = os.path.join('..', '..', 'datasets', 'OhioT1DM')

# Lists to store the DataFrames
ohiot1dm_training = []
ohiot1dm_testing = []

# Function to parse an XML file and convert to a DataFrame
def parse_xml_to_df(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    data = []

    for event in root.find('glucose_level'):
        timestamp = event.get('ts')
        value = event.get('value')
        df = {'Date': pd.to_datetime(timestamp, format='%d-%m-%Y %H:%M:%S'), 'CGM': int(value)}
        df['Date'] = df['Date'].round(freq='min')
        data.append(df)
    
    return pd.DataFrame(data)

# Loop through the files in the directory
for filename in os.listdir(dataset_dir):
    if filename.endswith('-training.xml'):
        df = parse_xml_to_df(os.path.join(dataset_dir, filename))
        ohiot1dm_training.append(df)
    elif filename.endswith('-testing.xml'):
        df = parse_xml_to_df(os.path.join(dataset_dir, filename))
        ohiot1dm_testing.append(df)

print(f"Number of training DataFrames: {len(ohiot1dm_training)}")
print(f"Number of testing DataFrames: {len(ohiot1dm_testing)}")

# Optionally, print the first few rows of the first DataFrame in each list to verify
if ohiot1dm_training:
    print(f"First training DataFrame:\n{ohiot1dm_training[0].head()}")
if ohiot1dm_testing:
    print(f"First testing DataFrame:\n{ohiot1dm_testing[0].head()}")

Number of training DataFrames: 12
Number of testing DataFrames: 12
First training DataFrame:
                 Date  CGM
0 2027-05-19 11:36:00   76
1 2027-05-19 11:41:00   72
2 2027-05-19 11:46:00   68
3 2027-05-19 11:51:00   65
4 2027-05-19 11:56:00   63
First testing DataFrame:
                 Date  CGM
0 2027-07-04 00:02:00  254
1 2027-07-04 00:07:00  250
2 2027-07-04 00:12:00  249
3 2027-07-04 00:17:00  247
4 2027-07-04 00:22:00  242


In [37]:
ohiot1dm_training_split = split_dataframes_on_time_difference(ohiot1dm_training, sample_size)
ohiot1dm_testing_split = split_dataframes_on_time_difference(ohiot1dm_testing, sample_size)

# Example usage:
print(f"Number of training DataFrames after split: {len(ohiot1dm_training_split)}")
print(f"Number of testing DataFrames after split: {len(ohiot1dm_testing_split)}")

Number of training DataFrames after split: 339
Number of testing DataFrames after split: 87


In [38]:
corrected_dataframes_ohiot1dm_training = check_and_fix_5_min_intervals(ohiot1dm_training_split,sample_size)
check = check_5_min_intervals(corrected_dataframes_ohiot1dm_training)
if check == [True for i in range(len(check))]:
    print('OhioT1DM training is in the appropriate time format')

corrected_dataframes_ohiot1dm_testing = check_and_fix_5_min_intervals(ohiot1dm_testing_split,sample_size)
check = check_5_min_intervals(corrected_dataframes_ohiot1dm_testing)
if check == [True for i in range(len(check))]:
    print('OhioT1DM testing is in the appropriate time format')

OhioT1DM training is in the appropriate time format
OhioT1DM testing is in the appropriate time format


In [39]:
def keep_15_min_intervals(dataframes,sample_size):
    """
    Retain only the rows where the 'Date' column timestamps are exactly 15 minutes apart.

    Parameters:
    dataframes (list of pd.DataFrame): List of DataFrames to filter.

    Returns:
    filtered_dataframes (list of pd.DataFrame): List of filtered DataFrames.
    """
    filtered_dataframes = []

    for df in dataframes:
        # Ensure the 'Date' column is sorted
        df = df.sort_values(by='Date').reset_index(drop=True)
        
        # Filter rows to keep only those 15 minutes apart
        df_filtered = df.iloc[::3].reset_index(drop=True)
        
        filtered_dataframes.append(df_filtered)
    filtered_dataframes = [df for df in filtered_dataframes if len(df) > sample_size]
    return filtered_dataframes

In [40]:
#Resample OhioT1DM from 5 minutes to 15 minutes
corrected_dataframes_ohiot1dm_training = keep_15_min_intervals(corrected_dataframes_ohiot1dm_training,sample_size)
corrected_dataframes_ohiot1dm_testing = keep_15_min_intervals(corrected_dataframes_ohiot1dm_testing,sample_size)

In [41]:
for df in corrected_dataframes_ohiot1dm_testing:
    corrected_dataframes_ohiot1dm_training.append(df)
print_statistics(corrected_dataframes_ohiot1dm_training)

Average minimum value of the timeseries: 62.1796928940168
Average mean value of the timeseries: 159.4717722122069
Average maximum value of the timeseries: 295.3803706496851
Average value from all the timeseries: 159.78201816334987
Minimum value from all the timeseries: 40.0
Maximum value from all the timeseries: 400.0
Average length of the timeseries: 243.59859154929578
Number of timeseries: 142


In [42]:
datasets_dictionary['OhioT1DM'] = corrected_dataframes_ohiot1dm_training

## DINAMO

In [43]:
# Initialize an empty list to store the dataframes
dataframes = []

# Define the directory containing the csv files
dataset_dir = os.path.join('..', '..', 'datasets', 'Dinamo_T1DM')

# Loop through the files in the folder
for i in range(1, 10):
    file_path = f'{dataset_dir}/glucose_{i}.csv'
    
    # Read the CSV file
    df = pd.read_csv(file_path)
    
    # Drop the 'comments' column
    df = df.drop(columns=['comments'])
    
    # Keep only rows where 'type' is 'cgm'
    df = df[df['type'] == 'cgm']
    
    # Drop the 'type' column
    df = df.drop(columns=['type'])
    
    # Create a new 'Date' column by combining 'data' and 'time'
    df['Date'] = pd.to_datetime(df['date'] + ' ' + df['time']).dt.round('min')
    
    # Drop rows with any NaN or empty values
    df = df.dropna()
    
    # Drop the 'data' and 'time' columns
    df = df.drop(columns=['date', 'time'])
    
    # Rename the 'glucose' column to 'CGM' and multiply the values by 18.018
    df['CGM'] = df['glucose'] * 18.018
    
    # Drop the old 'glucose' column
    df = df.drop(columns=['glucose'])
    
    # Append the cleaned dataframe to the list
    dataframes.append(df)

print(f"Number of DataFrames: {len(dataframes)}")


Number of DataFrames: 9


In [44]:
check = check_5_min_intervals(dataframes)

Found intervals [43205. 38885. 43205. 41765. 43205.] minutes apart!
Found intervals [340.] minutes apart!
Found intervals [1255.] minutes apart!


In [45]:
corrected_dataframes_Dinamo = check_and_fix_5_min_intervals(dataframes,sample_size)
corrected_dataframes_Dinamo = keep_15_min_intervals(corrected_dataframes_Dinamo,sample_size)

check = check_15_min_intervals(corrected_dataframes_Dinamo)
if check == [True for i in range(len(check))]:
    print('DinamoT1DM is in the appropriate time format')

DinamoT1DM is in the appropriate time format


In [46]:
print_statistics(corrected_dataframes_Dinamo)

Average minimum value of the timeseries: 57.400200000000005
Average mean value of the timeseries: 162.85046053458015
Average maximum value of the timeseries: 317.11680000000007
Average value from all the timeseries: 164.63681276595747
Minimum value from all the timeseries: 39.6396
Maximum value from all the timeseries: 399.9996
Average length of the timeseries: 302.14285714285717
Number of timeseries: 7


In [47]:
datasets_dictionary['DinamoT1DM'] = corrected_dataframes_ohiot1dm_training

## Estimate training, validation and test samples

In [48]:
def examples_estimation(dataset_name, data, split_ratios,context_window,prediction_window):

    # Load the datasets
    print(f"Available samples for {dataset_name} datasets...")
    train_ratio,val_ratio,test_ratio = split_ratios
    train_examples, val_examples, test_examples = 0, 0, 0

    assert(train_ratio+val_ratio+test_ratio==1)

    for i in range(len(data)):
        num_rows = len(data[i])
        sample_size = context_window + prediction_window
        if num_rows>sample_size:
            if num_rows < 3*sample_size + 1:
                training_samples = (num_rows - sample_size + 1)
                validation_samples = 0
                testing_samples = 0
            else:
                available_samples = (num_rows - 3*sample_size - 1)
                training_samples = int(available_samples * train_ratio)
                validation_samples = int(available_samples * val_ratio)
                testing_samples = int(available_samples * test_ratio)
            
                if validation_samples == 0:
                    training_samples -= 1
                    validation_samples += 1
                if testing_samples == 0:
                    training_samples -= 1
                    testing_samples += 1
            train_examples += training_samples
            val_examples += validation_samples
            test_examples += testing_samples

    print(f"Number of training examples: {train_examples}")
    print(f"Number of validation examples: {val_examples}")
    print(f"Number of test examples: {test_examples}")

In [49]:
datasets_dictionary['Dinamo_Shanghai_T1DM'] = datasets_dictionary['DinamoT1DM'] + datasets_dictionary['ShanghaiT1DM']
datasets_dictionary['Dinamo_Shanghai_Ohio_T1DM'] = datasets_dictionary['DinamoT1DM'] + datasets_dictionary['ShanghaiT1DM']+ datasets_dictionary['OhioT1DM']

In [52]:
for key, value in datasets_dictionary.items():
    examples_estimation(key, value, split_ratios,context_length,prediction_length)

Available samples for ShanghaiT1DM datasets...
Number of training examples: 9436
Number of validation examples: 516
Number of test examples: 516
Available samples for ShanghaiT2DM datasets...
Number of training examples: 69327
Number of validation examples: 3772
Number of test examples: 3772
Available samples for Patient0 datasets...
Number of training examples: 29748
Number of validation examples: 1315
Number of test examples: 1315
Available samples for OhioT1DM datasets...
Number of training examples: 12392
Number of validation examples: 208
Number of test examples: 208
Available samples for DinamoT1DM datasets...
Number of training examples: 12392
Number of validation examples: 208
Number of test examples: 208
Available samples for Dinamo_Shanghai_T1DM datasets...
Number of training examples: 21828
Number of validation examples: 724
Number of test examples: 724
Available samples for Dinamo_Shanghai_Ohio_T1DM datasets...
Number of training examples: 34220
Number of validation example

In [None]:
# Save the dictionary to a file
with open('dataset_dictionary.pkl', 'wb') as file:
    pickle.dump(datasets_dictionary, file)