In [1]:
import zipfile
import os

In [2]:
# imports
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from sklearn.linear_model import LinearRegression
import os
import scipy

In [3]:
# Specify the path to the ZIP file
zip_file_path = r"C:\Praktikum AI Med\OhioT1DM.zip"

# Specify the directory to extract the contents of the ZIP file
extract_dir = r"C:\Praktikum AI Med"

# Password for encrypted files
password = "...@@@!!==/\/\/\/\~~~~~BGLP-1804-BGLP~~~~~/\/\/\/\==!!@@@..."

# Create the extraction directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Extract the contents of the ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    for member in zip_ref.infolist():
        # Check if the file is encrypted
        if member.flag_bits & 0x01:
            zip_ref.extract(member, extract_dir, pwd=password.encode())
        else:
            zip_ref.extract(member, extract_dir)

print("Extraction completed successfully.")


Extraction completed successfully.


In [4]:
def get_glc(root):
    glucose = []
    glucose_ts = []
    for type_tag in root.findall('glucose_level/event'):
        value = type_tag.get('value')
        ts = type_tag.get('ts')
        ts = datetime.datetime.strptime(ts, "%d-%m-%Y %H:%M:%S")
        glucose.append(int(value))
        glucose_ts.append(ts)
        
    glc_frame = [glucose_ts, glucose]
    glc_frame = np.array(glc_frame)
    df_glc = pd.DataFrame(glc_frame.T, columns=['ts', 'glucose'])
    return df_glc


def get_basal(root):
    basal = []
    basal_ts = []
    for type_tag in root.findall('basal/event'):
        value = type_tag.get('value')
        ts = type_tag.get('ts')
        ts = datetime.datetime.strptime(ts, "%d-%m-%Y %H:%M:%S")
        basal.append(float(value))
        basal_ts.append(ts)
        
    basal_frame = [basal_ts, basal]
    basal_frame = np.array(basal_frame)
    df_basal = pd.DataFrame(basal_frame.T, columns=['ts', 'basal'])
    return df_basal

def get_bolus(root):
    bolus = []
    bolus_ts = []
    for type_tag in root.findall('bolus_level/event'):
        value = type_tag.get('value')
        ts = type_tag.get('ts')
        if ts is not None:
            ts = datetime.datetime.strptime(ts, "%d-%m-%Y %H:%M:%S")
            bolus.append(float(value))
            bolus_ts.append(ts)
    bolus_frame = [bolus_ts, bolus]
    bolus_frame = np.array(bolus_frame)
    df_bolus = pd.DataFrame(bolus_frame.T, columns=['ts', 'bolus'])
    df_bolus['bolus'] = df_bolus['bolus'].astype(float)
    return df_bolus


def get_temp_basal(root):
    temp_basal = []
    temp_basal_ts = []
    for type_tag in root.findall('temp_basal/event'):
        value = type_tag.get('value')
        ts = type_tag.get('ts')
        ts = datetime.datetime.strptime(ts, "%d-%m-%Y %H:%M:%S")
        temp_basal.append(float(value))
        temp_basal_ts.append(ts)
        
    temp_basal_frame = [temp_basal_ts, temp_basal]
    temp_basal_frame = np.array(temp_basal_frame)
    df_temp_basal = pd.DataFrame(temp_basal_frame.T, columns=['ts', 'temp_basal'])
    return df_temp_basal

def get_macc(root):
    macc = []
    macc_ts = []
    for type_tag in root.findall('macc/event'):
        value = type_tag.get('value')
        ts = type_tag.get('ts')
        ts = datetime.datetime.strptime(ts, "%d-%m-%Y %H:%M:%S")
        macc.append(float(value))
        macc_ts.append(ts)
        
    macc_frame = [macc_ts, macc]
    macc_frame = np.array(macc_frame)
    df_macc = pd.DataFrame(macc_frame.T, columns=['ts', 'macc'])
    return df_macc

def get_step(root):
    step = []
    step_ts = []
    for type_tag in root.findall('step/event'):
        value = type_tag.get('value')
        ts = type_tag.get('ts')
        ts = datetime.datetime.strptime(ts, "%d-%m-%Y %H:%M:%S")
        step.append(float(value))
        step_ts.append(ts)
        
    step_frame = [step_ts, step]
    step_frame = np.array(step_frame)
    df_step = pd.DataFrame(step_frame.T, columns=['ts', 'step'])
    return df_step

In [5]:
# this function load the data, combines the single columns, fills in missing data and assings the classes
# as input the file ordner which is either train or test, the subjects id, and finally the version which is 2018 or 2020 are given 
# linear interpolation and extrapolation are applied for missing values which are allowed to have a consecutive length of 2 hours


def load_data(TRAINFILE, TESTFILE, s_ID, version):

    count = 0
    for i in range(0, len(TRAINFILE)):
        root = ET.parse(TRAINFILE[i]).getroot()
        root2 = ET.parse(TESTFILE[i]).getroot()

        subject_ID = s_ID[count]
        count = count +1
        

        # glucose, basal insulin, bolus insulin, and temp basal are stored as sepearte dataframes
        df_glc = get_glc(root)
        df_basal = get_basal(root)
        df_bolus = get_bolus(root)
        df_temp_basal = get_temp_basal(root)

        # then the activity data is stored which calls either get_step() or get_macc() according to the chosen cohort
        if version == 2018:
            df_macc = get_step(root)
        else:
            df_macc = get_macc(root)

        # the single dataframes are merged on the time and the subject id is added 
        df_list = [df_glc, df_basal, df_bolus, df_macc] 
        combined_df_train = df_list[0]
        for i in range(1,len(df_list)):
            combined_df_train = pd.merge(combined_df_train, df_list[i], on='ts', how='left')


        # the same procedure is done for the test data
        df_glc2 = get_glc(root2)
        df_basal2 = get_basal(root2)
        df_bolus2 = get_bolus(root2)
        df_temp_basal2 = get_temp_basal(root2)

        if version == 2018:
            df_macc2 = get_step(root2)
        else:
            df_macc2 = get_macc(root2)

        df_list2 = [df_glc2, df_basal2, df_bolus2, df_macc2] 
        combined_df_test = df_list2[0]
        for i in range(1,len(df_list2)):
            combined_df_test = pd.merge(combined_df_test, df_list2[i], on='ts', how='left')

        # the train and test data are concatenated
        combined_df = pd.concat([combined_df_train, combined_df_test])
        combined_df["Subject_ID"] = subject_ID
        combined_df = combined_df.reset_index().drop(columns='index')

        # the temporal basal replaces the original basal for the identified time intervalls of the train and then test files
        combined_df = combine_basal_temp_basal(combined_df, df_temp_basal)
        combined_df = combine_basal_temp_basal(combined_df, df_temp_basal2)

        # the bolus insulin is integrated over the time interval on which it is applied and the the row bolus_end is deleted
        for i in range (0, len(combined_df)):
            if((combined_df["bolus"][i]  != np.NaN)):
                start_time = combined_df["ts"][i]
                end_time = combined_df["bolus_end"][i]
                combined_df.loc[(combined_df["ts"] >= start_time) & (combined_df["ts"] <= end_time), "bolus"] = combined_df["bolus"][i]  
        combined_df = combined_df.drop("bolus_end", axis=1)

        # the values are all converted to floats 
        combined_df['glucose'] = combined_df['glucose'].astype(str).astype(float)
        combined_df['basal'] = combined_df['basal'].astype(str).astype(float)
        combined_df['bolus'] = combined_df['bolus'].astype(str).astype(float)
        combined_df['macc'] = combined_df['macc'].astype(str).astype(float)

        # missing basal insulin is filled with the ffill and bfill method since the values are constantly infused
        combined_df['basal'] = combined_df['basal'].fillna(method = 'ffill')
        combined_df['basal'] = combined_df['basal'].fillna(method = 'bfill')
        # missing bolus insulin is filled with 0 for nan values since most often missing values means that no bolus was infused
        combined_df['bolus'] = combined_df['bolus'].fillna(0)


        # the number of missing values for each parameter before data imputation is printed
        print('Before Data Imputation')
        print(subject_ID, 'intra:', combined_df.isna().sum())

        # linear interpolation is applied for training data and linear extrapolation is applied for test data to fill some of the nan values in glucose and exercise data
        combined_df2 = combined_df.copy()
        # interpolation
        combined_df = combined_df.interpolate(method = "linear", limit = 24, limit_direction="both") 
        # extrapolation
        combined_df2['glucose'] = combined_df2['glucose'].interpolate(method="slinear", limit = 24, fill_value="extrapolate", limit_direction="both")
        combined_df2['macc'] = combined_df2['macc'].interpolate(method="slinear", limit = 24, fill_value="extrapolate", limit_direction="both")
            

        # remaining missing values in exercise data is filled with -1 indicating that no data was recorded
        # those gaps were not removed, as glucose should be recorded continously to assign the classes, and they have the highest impact for the models
        # and missing values could influence the performance significantly
        # but it cannot be asserted that the patients will wear the wearable continously, as why the model should learn to ignore -1 values
        combined_df['macc'] = combined_df['macc'].fillna(-1)
        combined_df2['macc'] = combined_df2['macc'].fillna(-1)

        # a column called Class is created and the value -1 is firstly assigned to each row to keep track of still available instances without a class
        # then all glucose values below 70 mg/dL are given the Class 0
        combined_df["Class"] = -1
        combined_df.loc[combined_df["glucose"] <= 70, "Class"] = 0

        # a list is created containing the timestamps of hypoglycemic events 
        list_hypo = (combined_df.loc[combined_df["Class"] == 0, "ts"]).to_numpy()

        # the function Class_generation() is called with wanted intervalls before a hypoglycemic event in minutes
        combined_df = Class_generation(combined_df, 0, 15, 1, list_hypo) # 0-15
        combined_df = Class_generation(combined_df, 15, 30, 2, list_hypo)  # 15-30 
        combined_df = Class_generation(combined_df, 30, 60, 3, list_hypo)  # 30-60
        combined_df = Class_generation(combined_df, 60, 120, 4, list_hypo)  # 1-2 
        combined_df = Class_generation(combined_df, 120, 240, 5, list_hypo) # 2-4
        combined_df = Class_generation(combined_df, 240, 480, 6, list_hypo)  # 4-8
        combined_df = Class_generation(combined_df, 480, 720, 7, list_hypo)  # 8-12
        combined_df = Class_generation(combined_df, 720, 1440, 8, list_hypo)  # 12-24
        combined_df = Class_generation(combined_df, 1440, 2880, 9, list_hypo)  # 24-48
        # 10 could be no hypoglycemia 
        combined_df.loc[combined_df["Class"] == -1, "Class"] = 10

        # same procedure is done for the extrapolated data 
        combined_df2["Class"] = -1
        combined_df2.loc[combined_df2["glucose"] <= 70, "Class"] = 0

        list_hypo_2 = (combined_df2.loc[combined_df2["Class"] == 0, "ts"]).to_numpy()

        combined_df2 = Class_generation(combined_df2, 0, 15, 1, list_hypo_2)  # 0-15
        combined_df2 = Class_generation(combined_df2, 15, 30, 2, list_hypo_2)  # 15-30 
        combined_df2 = Class_generation(combined_df2, 30, 60, 3, list_hypo_2)  # 30-60
        combined_df2 = Class_generation(combined_df2, 60, 120, 4, list_hypo_2)  # 1-2 
        combined_df2 = Class_generation(combined_df2, 120, 240, 5, list_hypo_2) # 2-4
        combined_df2 = Class_generation(combined_df2, 240, 480, 6, list_hypo_2)  # 4-8
        combined_df2 = Class_generation(combined_df2, 480, 720, 7, list_hypo_2)  # 8-12
        combined_df2 = Class_generation(combined_df2, 720, 1440, 8, list_hypo_2)  # 12-24
        combined_df2 = Class_generation(combined_df2, 1440, 2880, 9, list_hypo_2)  # 24-48
        combined_df2.loc[combined_df2["Class"] == -1, "Class"] = 10


        # the number of missing values for each parameter before data imputation is printed
        print('After Linear')
        print(subject_ID, 'intra:', combined_df.isna().sum())
        print(subject_ID, 'extra:', combined_df2.isna().sum())

        
        # the distribution of the classes is printed for interpolated and extrapolated data, respectively
        print(np.bincount(combined_df['Class']))
        print(len(combined_df['Class']))

        print(np.bincount(combined_df2['Class']))
        print(len(combined_df2['Class']))

        # the function Remove_big_gaps() is called to identify consecutive nan values 
        # and to create subdataframes for each patient without any gaps, which are then saved as single csv files
        Remove_big_gaps(combined_df, combined_df2, subject_ID, version)

In [6]:
# this function assigns the classes, it takes the start and end of the defined interval before the hypoglycemic event
# furthermore, a list of all locations of hypoglycemic datapoints is given as input
# only instances which were not assigned to another class are considered

def Class_generation(df, start, end, class_number, list_hypo):
    
    # it is iterated over each hypoglycemic event and compute backwards with the given condition
    for i in list_hypo:
        current_time = pd.to_datetime(i)
        start_time = current_time - datetime.timedelta(minutes = start)
        end_time = current_time - datetime.timedelta(minutes = end)
        # condition is checked, and the new class is assigned 
        df.loc[(df["ts"] < start_time) & (df["ts"] >= end_time) & (df["Class"] == -1), "Class"] = class_number

    return df

In [7]:
# this function identifies gaps and split the dataframe into multiple dataframes which do not contain any missing values
# as input data the interpolated and extrapoalted dataframes, the subject_ID, and the version of the cohort are given
# (with the help of chatpgt)
def Remove_big_gaps(df, df2, subject_ID, version):

    df_inter = df.copy().reset_index()
    dataframes_inter = []

    # the indexes of nan values are identified to split the original data based on those gaps
    nan_mask_inter = df_inter['glucose'].isnull()
    # consecutive nan values are identified 
    cumultative_sum_inter = nan_mask_inter.cumsum()
    # groups of consecutive nan values and non nan values are build
    groups_inter = df_inter.groupby(cumultative_sum_inter)

    # it is iterated through the groups and only the dataframes are added to the list which do not contain nan values
    for _, group in groups_inter: 
        if group['glucose'].isnull().all(): 
            continue
        group = group.dropna()
        dataframes_inter.append(group)

    # each dataframe which does not contain any nan value is saved for the specific person
    for i in range (0, len(dataframes_inter)):
        file_name = "GAPS_DATA/TRAIN/%s/%s_%i_%i_INTER.csv" % (subject_ID,subject_ID, i, version)
        dataframes_inter[i].to_csv(file_name)


    # the same is also done for the extrapolated data
        
    df_extra = df2.copy().reset_index()
    dataframes_extra = []

    nan_mask_extra = df_extra['glucose'].isnull()
    cumultative_sum_extra = nan_mask_extra.cumsum()
    groups_extra = df_extra.groupby(cumultative_sum_extra)

    for _, group in groups_extra: 
        if group['glucose'].isnull().all(): 
            continue
        group = group.dropna()
        dataframes_extra.append(group)

    for i in range (0, len(dataframes_extra)):
        file_name2 = "GAPS_DATA/TEST/%s/%s_%i_%i_EXTRA.csv" % (subject_ID,subject_ID, i, version)
        dataframes_extra[i].to_csv(file_name2)

In [8]:
def Count_Initial_Hypo(TRAINFILE, TESTFILE, s_ID):

    count = 0
    for i in range(0, len(TRAINFILE)):
        root = ET.parse(TRAINFILE[i]).getroot()
        root2 = ET.parse(TESTFILE[i]).getroot()

        subject_ID = s_ID[count]
        count = count +1

        glucose = []
        glucose_ts = []
        for type_tag in root.findall('glucose_level/event'):
            value = type_tag.get('value')
            ts = type_tag.get('ts')
            ts = datetime.datetime.strptime(ts, "%d-%m-%Y %H:%M:%S")
            glucose.append(int(value))
            glucose_ts.append(ts)
            
        glc_frame = [glucose_ts, glucose]
        glc_frame = np.array(glc_frame)
        df_glc = pd.DataFrame(glc_frame.T, columns=['ts', 'glucose'])


        glucose2 = []
        glucose_ts2 = []
        for type_tag in root2.findall('glucose_level/event'):
            value2 = type_tag.get('value')
            ts2 = type_tag.get('ts')
            ts2 = datetime.datetime.strptime(ts2, "%d-%m-%Y %H:%M:%S")
            glucose2.append(int(value2))
            glucose_ts2.append(ts2)
            
        glc_frame2 = [glucose_ts2, glucose2]
        glc_frame2 = np.array(glc_frame2)
        df_glc2 = pd.DataFrame(glc_frame2.T, columns=['ts', 'glucose'])

        df_glc3 = pd.concat([df_glc, df_glc2])

        df_glc3["Class"] = 1
        df_glc3.loc[df_glc3["glucose"] <= 70, "Class"] = 0
        print(subject_ID)
        print(np.bincount(df_glc3['Class']))
        print(len(df_glc3['Class']))

In [9]:
# this function load the data, combines the single columns, fills in missing data and assings the classes
# as input the file ordner which is either train or test, the subjects id, and finally the version which is 2018 or 2020 are given 
# linear interpolation and extrapolation are applied for missing values which are allowed to have a consecutive length of 2 hours

def load_data(TRAINFILE, TESTFILE, s_ID, version):

    count = 0
    for i in range(0, len(TRAINFILE)):
        root = ET.parse(TRAINFILE[i]).getroot()
        root2 = ET.parse(TESTFILE[i]).getroot()

        subject_ID = s_ID[count]
        count = count +1
        

        # glucose, basal insulin, bolus insulin, and temp basal are stored as sepearte dataframes
        df_glc = get_glc(root)
        df_basal = get_basal(root)
        df_bolus = get_bolus(root)
        df_temp_basal = get_temp_basal(root)

        # then the activity data is stored which calls either get_step() or get_macc() according to the chosen cohort
        if version == 2018:
            df_macc = get_step(root)
        else:
            df_macc = get_macc(root)

        # the single dataframes are merged on the time and the subject id is added 
        df_list = [df_glc, df_basal, df_bolus, df_macc] 
        combined_df_train = df_list[0]
        for i in range(1,len(df_list)):
            combined_df_train = pd.merge(combined_df_train, df_list[i], on='ts', how='left')


        # the same procedure is done for the test data
        df_glc2 = get_glc(root2)
        df_basal2 = get_basal(root2)
        df_bolus2 = get_bolus(root2)
        df_temp_basal2 = get_temp_basal(root2)

        if version == 2018:
            df_macc2 = get_step(root2)
        else:
            df_macc2 = get_macc(root2)

        df_list2 = [df_glc2, df_basal2, df_bolus2, df_macc2] 
        combined_df_test = df_list2[0]
        for i in range(1,len(df_list2)):
            combined_df_test = pd.merge(combined_df_test, df_list2[i], on='ts', how='left')

        # the train and test data are concatenated
        combined_df = pd.concat([combined_df_train, combined_df_test])
        combined_df["Subject_ID"] = subject_ID
        combined_df = combined_df.reset_index().drop(columns='index')

        # the temporal basal replaces the original basal for the identified time intervalls of the train and then test files
        combined_df = combine_basal_temp_basal(combined_df, df_temp_basal)
        combined_df = combine_basal_temp_basal(combined_df, df_temp_basal2)

        # the bolus insulin is integrated over the time interval on which it is applied and the the row bolus_end is deleted
        for i in range (0, len(combined_df)):
            if((combined_df["bolus"][i]  != np.NaN)):
                start_time = combined_df["ts"][i]
                end_time = combined_df["bolus_end"][i]
                combined_df.loc[(combined_df["ts"] >= start_time) & (combined_df["ts"] <= end_time), "bolus"] = combined_df["bolus"][i]  
        combined_df = combined_df.drop("bolus_end", axis=1)

        # the values are all converted to floats 
        combined_df['glucose'] = combined_df['glucose'].astype(str).astype(float)
        combined_df['basal'] = combined_df['basal'].astype(str).astype(float)
        combined_df['bolus'] = combined_df['bolus'].astype(str).astype(float)
        combined_df['macc'] = combined_df['macc'].astype(str).astype(float)

        # missing basal insulin is filled with the ffill and bfill method since the values are constantly infused
        combined_df['basal'] = combined_df['basal'].fillna(method = 'ffill')
        combined_df['basal'] = combined_df['basal'].fillna(method = 'bfill')
        # missing bolus insulin is filled with 0 for nan values since most often missing values means that no bolus was infused
        combined_df['bolus'] = combined_df['bolus'].fillna(0)


        # the number of missing values for each parameter before data imputation is printed
        print('Before Data Imputation')
        print(subject_ID, 'intra:', combined_df.isna().sum())

        # linear interpolation is applied for training data and linear extrapolation is applied for test data to fill some of the nan values in glucose and exercise data
        combined_df2 = combined_df.copy()
        # interpolation
        combined_df = combined_df.interpolate(method = "linear", limit = 24, limit_direction="both") 
        # extrapolation
        combined_df2['glucose'] = combined_df2['glucose'].interpolate(method="slinear", limit = 24, fill_value="extrapolate", limit_direction="both")
        combined_df2['macc'] = combined_df2['macc'].interpolate(method="slinear", limit = 24, fill_value="extrapolate", limit_direction="both")
            

        # remaining missing values in exercise data is filled with -1 indicating that no data was recorded
        # those gaps were not removed, as glucose should be recorded continously to assign the classes, and they have the highest impact for the models
        # and missing values could influence the performance significantly
        # but it cannot be asserted that the patients will wear the wearable continously, as why the model should learn to ignore -1 values
        combined_df['macc'] = combined_df['macc'].fillna(-1)
        combined_df2['macc'] = combined_df2['macc'].fillna(-1)

        # a column called Class is created and the value -1 is firstly assigned to each row to keep track of still available instances without a class
        # then all glucose values below 70 mg/dL are given the Class 0
        combined_df["Class"] = -1
        combined_df.loc[combined_df["glucose"] <= 70, "Class"] = 0

        # a list is created containing the timestamps of hypoglycemic events 
        list_hypo = (combined_df.loc[combined_df["Class"] == 0, "ts"]).to_numpy()

        # the function Class_generation() is called with wanted intervalls before a hypoglycemic event in minutes
        combined_df = Class_generation(combined_df, 0, 15, 1, list_hypo) # 0-15
        combined_df = Class_generation(combined_df, 15, 30, 2, list_hypo)  # 15-30 
        combined_df = Class_generation(combined_df, 30, 60, 3, list_hypo)  # 30-60
        combined_df = Class_generation(combined_df, 60, 120, 4, list_hypo)  # 1-2 
        combined_df = Class_generation(combined_df, 120, 240, 5, list_hypo) # 2-4
        combined_df = Class_generation(combined_df, 240, 480, 6, list_hypo)  # 4-8
        combined_df = Class_generation(combined_df, 480, 720, 7, list_hypo)  # 8-12
        combined_df = Class_generation(combined_df, 720, 1440, 8, list_hypo)  # 12-24
        combined_df = Class_generation(combined_df, 1440, 2880, 9, list_hypo)  # 24-48
        # 10 could be no hypoglycemia 
        combined_df.loc[combined_df["Class"] == -1, "Class"] = 10

        # same procedure is done for the extrapolated data 
        combined_df2["Class"] = -1
        combined_df2.loc[combined_df2["glucose"] <= 70, "Class"] = 0

        list_hypo_2 = (combined_df2.loc[combined_df2["Class"] == 0, "ts"]).to_numpy()

        combined_df2 = Class_generation(combined_df2, 0, 15, 1, list_hypo_2)  # 0-15
        combined_df2 = Class_generation(combined_df2, 15, 30, 2, list_hypo_2)  # 15-30 
        combined_df2 = Class_generation(combined_df2, 30, 60, 3, list_hypo_2)  # 30-60
        combined_df2 = Class_generation(combined_df2, 60, 120, 4, list_hypo_2)  # 1-2 
        combined_df2 = Class_generation(combined_df2, 120, 240, 5, list_hypo_2) # 2-4
        combined_df2 = Class_generation(combined_df2, 240, 480, 6, list_hypo_2)  # 4-8
        combined_df2 = Class_generation(combined_df2, 480, 720, 7, list_hypo_2)  # 8-12
        combined_df2 = Class_generation(combined_df2, 720, 1440, 8, list_hypo_2)  # 12-24
        combined_df2 = Class_generation(combined_df2, 1440, 2880, 9, list_hypo_2)  # 24-48
        combined_df2.loc[combined_df2["Class"] == -1, "Class"] = 10


        # the number of missing values for each parameter before data imputation is printed
        print('After Linear')
        print(subject_ID, 'intra:', combined_df.isna().sum())
        print(subject_ID, 'extra:', combined_df2.isna().sum())

        
        # the distribution of the classes is printed for interpolated and extrapolated data, respectively
        print(np.bincount(combined_df['Class']))
        print(len(combined_df['Class']))

        print(np.bincount(combined_df2['Class']))
        print(len(combined_df2['Class']))

        # the function Remove_big_gaps() is called to identify consecutive nan values 
        # and to create subdataframes for each patient without any gaps, which are then saved as single csv files
        Remove_big_gaps(combined_df, combined_df2, subject_ID, version)

In [10]:
# main function which contains the files with their corresponsing subject id, modus and version  
# this function is highly influenced by the code of https://github.com/r-cui/GluPred/blob/master/preprocess/linker.py
def main():
    versions_arr = [2018, 2020]

    for v in versions_arr:
        # first the data of the 2018 is preprocess 
        if (v == 2018):
            patient_index = [559, 563, 570, 575, 588, 591]
            train_files = ['/Praktikum AI Med/OhioT1DM/2018/train/559-ws-training.xml', 
                        '/Praktikum AI Med/OhioT1DM/2018/train/563-ws-training.xml',
                        '/Praktikum AI Med/OhioT1DM/2018/train/570-ws-training.xml',
                        '/Praktikum AI Med/OhioT1DM/2018/train/575-ws-training.xml',
                        '/Praktikum AI Med/OhioT1DM/2018/train/588-ws-training.xml',
                        '/Praktikum AI Med/OhioT1DM/2018/train/591-ws-training.xml'
                        ]


            test_files = ['/Praktikum AI Med/OhioT1DM/2018/test/559-ws-testing.xml', 
                        '/Praktikum AI Med/OhioT1DM/2018/test/563-ws-testing.xml',
                        '/Praktikum AI Med/OhioT1DM/2018/test/570-ws-testing.xml',
                        '/Praktikum AI Med/OhioT1DM/2018/test/575-ws-testing.xml',
                        '/Praktikum AI Med/OhioT1DM/2018/test/588-ws-testing.xml',
                        '/Praktikum AI Med/OhioT1DM/2018/test/591-ws-testing.xml'
                        ]
        # second, the data of the 2020 is preprocess     
        elif (v == 2020):
            patient_index = [540, 544, 552, 567, 584, 596]
            train_files = ['/Praktikum AI Med/OhioT1DM/2020/train/540-ws-training.xml', 
                        '/Praktikum AI Med/OhioT1DM/2020/train/544-ws-training.xml',
                        '/Praktikum AI Med/OhioT1DM/2020/train/552-ws-training.xml',
                        '/Praktikum AI Med/OhioT1DM/2020/train/567-ws-training.xml',
                        '/Praktikum AI Med/OhioT1DM/2020/train/584-ws-training.xml',
                        '/Praktikum AI Med/OhioT1DM/2020/train/596-ws-training.xml'
                        ]


            test_files = ['/Praktikum AI Med/OhioT1DM/2020/test/540-ws-testing.xml', 
                        '/Praktikum AI Med/OhioT1DM/2020/test/544-ws-testing.xml',
                        '/Praktikum AI Med/OhioT1DM/2020/test/552-ws-testing.xml',
                        '/Praktikum AI Med/OhioT1DM/2020/test/567-ws-testing.xml',
                        '/Praktikum AI Med/OhioT1DM/2020/test/584-ws-testing.xml',
                        '/Praktikum AI Med/OhioT1DM/2020/test/596-ws-testing.xml'
                        ]

                
        load_data(train_files, test_files, patient_index, version=v) 
        


if __name__ == '__main__':
    main()

TypeError: strptime() argument 1 must be str, not None

# Importing the required libraries 
import xml.etree.ElementTree as Xet 

cols = ["name", "phone", "email", "date", "country"] 
rows = [] 

# Parsing the XML file 
xmlparse = Xet.parse('sample.xml') 
root = xmlparse.getroot() 
for i in root: 
	name = i.find("name").text 
	phone = i.find("phone").text 
	email = i.find("email").text 
	date = i.find("date").text 
	country = i.find("country").text 

	rows.append({"name": name, 
				"phone": phone, 
				"email": email, 
				"date": date, 
				"country": country}) 

df = pd.DataFrame(rows, columns=cols) 

# Writing dataframe to csv 
df.to_csv('output.csv') 

glucose_level {}
finger_stick {}
basal {}
temp_basal {}
bolus {}
meal {}
sleep {}
work {}
stressors {}
hypo_event {}
illness {}
exercise {}
basis_heart_rate {}
basis_gsr {}
basis_skin_temperature {}
basis_air_temperature {}
basis_steps {}
basis_sleep {}