In [55]:
import pandas as pd
import numpy as np
import os
from collections import Counter
from datetime import datetime

# Lift pandas' display truncation so DataFrames render with every column and
# every row (None means "no limit" for both options).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [175]:
def readin_data(data_type: str):
    """Read every CSV table of the test or train split into a dictionary.

    The files are looked up in './testing_data/' (for 'test') or
    './training_data/' (for 'train'), relative to the current working
    directory. Only regular files with a '.csv' extension are read; any other
    directory entry (hidden files, checkpoint folders, notes, ...) is skipped
    instead of being handed to ``pd.read_csv``.

    Parameters
    ----------
    data_type : str
        This should be 'test' or 'train'.

    Returns
    -------
    dict
        Maps each file name without its '.csv' extension to a DataFrame of
        that file's contents with fully duplicated rows removed.

    Raises
    ------
    AssertionError
        If ``data_type`` is neither 'test' nor 'train'.
    """
    assert (data_type=='test') or (data_type=='train'), f'You gave data_type as {data_type}. Please define data_type as "test" or "train."'
    inner_directory = './testing_data/' if data_type == 'test' else './training_data/'

    data_dict = {}
    for file_name in os.listdir(inner_directory):
        file_path = os.path.join(inner_directory, file_name)
        # splitext only strips the final extension, so dotted names such as
        # 'labs.v2.csv' keep a distinct key ('labs.v2') instead of colliding.
        table_name, extension = os.path.splitext(file_name)
        if extension.lower() != '.csv' or not os.path.isfile(file_path):
            continue
        data_dict[table_name] = pd.read_csv(file_path).drop_duplicates()
    return data_dict

In [226]:
# Load every table of the training split into a dict of DataFrames.
training_data = readin_data(data_type='train')

In [114]:
# Output folder for processed tables; exist_ok keeps this cell re-runnable
# (a plain makedirs raises FileExistsError once the folder is there).
os.makedirs('./processed_data', exist_ok=True)

In [235]:
def _find_column_containing(columns, fragment: str, table_name: str):
    """Return the column whose name contains `fragment`, or raise ValueError if none does."""
    positions = [str(name).find(fragment) for name in columns]
    if not positions or max(positions) < 0:
        raise ValueError(
            f'Table "{table_name}" has no column containing "{fragment}"; cannot build a uid.'
        )
    # Same selection rule as before when several columns match: the name in
    # which the fragment starts latest wins, first one on ties.
    return columns[int(np.argmax(positions))]


def add_uids(data_dictionary: dict):
    """ This function adds a UID to each row to establish unique instances between person_id & measurement datetimes for the various tables.

    The 'uid' is the datetime column rendered as text directly followed by the
    person id rendered as text. The two source columns are then dropped. The
    DataFrames stored in the dictionary are modified in place and nothing is
    returned.

    This is not done for the demographics file since the information in it is not sensitive to the hour;
    tables whose key starts with "person_demographics" are left untouched.
    Tables that already have a 'uid' column are skipped, so calling the
    function again on the same dictionary is harmless.

    Parameters
    ----------
    data_dictionary : dict
        A dictionary of pandas DataFrames.

    Raises
    ------
    ValueError
        If a table to process has no column containing 'datetime' or no column
        containing 'person_id'. All tables are checked before any is modified.
    """
    # First pass: resolve the columns for every table, so a malformed table
    # aborts the call before anything has been mutated.
    pending = []
    for table_ind, table in data_dictionary.items():
        if table_ind.startswith("person_demographics"):
            continue
        if 'uid' in table.columns:
            continue
        date_column = _find_column_containing(table.columns, 'datetime', table_ind)
        personid_column = _find_column_containing(table.columns, 'person_id', table_ind)
        pending.append((table, date_column, personid_column))

    # Second pass: apply the change in place on each DataFrame.
    for table, date_column, personid_column in pending:
        table['uid'] = table[date_column].astype(str) + table[personid_column].astype(str)
        table.drop(columns=[date_column, personid_column], inplace=True)

In [231]:
# Replace the datetime / person_id columns of the non-demographics tables with a 'uid'.
add_uids(data_dictionary=training_data)

In [225]:
def birthday_management(data_dictionary: dict):
    """
    This function processes the 'person_demographics' table of given data, which is inputed as a dictionary.

    The table is the first entry of ``data_dictionary`` whose key starts with
    "person_demographics". Its 'birth_datetime' and 'visit_start_date' text
    columns (format '%Y-%m-%d') are parsed into the new datetime columns
    'birthday_formatted' and 'new_visit_startdate', and the two text columns
    are dropped. Every row of a person receives the birthday found on that
    person's first row. The result gets a fresh 0..n-1 index, is written to
    './processed_data/processed_<key>.csv' (the folder is created if needed)
    and replaces the entry in the dictionary, hence function returns nothing.

    Parameters
    ----------
    data_dictionary : dict
        A dictionary of pandas DataFrames.

    Raises
    ------
    KeyError
        If no key starts with "person_demographics", or the table lacks one of
        'person_id', 'visit_start_date', 'birth_datetime'.
    ValueError
        If a date string does not match '%Y-%m-%d'.
    """
    # Look the table up in the dictionary that was passed in, not in a
    # notebook-level global, so the function also works for the test split.
    demographics_index = next(
        (name for name in data_dictionary if name.startswith("person_demographics")),
        None,
    )
    if demographics_index is None:
        raise KeyError('No table whose name starts with "person_demographics" was found in data_dictionary.')

    demographics = data_dictionary[demographics_index].reset_index(drop=True)
    missing_columns = [
        column for column in ('person_id', 'visit_start_date', 'birth_datetime')
        if column not in demographics.columns
    ]
    if missing_columns:
        raise KeyError(f'Table "{demographics_index}" is missing required column(s): {missing_columns}')

    # One birthday per person, taken from that person's first row.
    first_birthday = (
        demographics.drop_duplicates(subset='person_id')
        .set_index('person_id')['birth_datetime']
    )
    birthday_formatted = pd.to_datetime(
        demographics['person_id'].map(first_birthday), format='%Y-%m-%d'
    )
    new_visit_startdate = pd.to_datetime(demographics['visit_start_date'], format='%Y-%m-%d')

    demographics = demographics.assign(
        birthday_formatted=birthday_formatted,
        new_visit_startdate=new_visit_startdate,
    ).drop(columns=['visit_start_date', 'birth_datetime'])

    os.makedirs('./processed_data', exist_ok=True)
    demographics.to_csv(f'./processed_data/processed_{demographics_index}.csv')
    data_dictionary[demographics_index] = demographics
    return None

In [227]:
# Parse the birth / visit dates of the training demographics table and save the result.
birthday_management(data_dictionary=training_data)

In [112]:
# List the names of the tables currently held in the training dictionary.
training_data.keys()

dict_keys(['devices_train', 'drugsexposure_train', 'measurement_lab_train', 'measurement_meds_train', 'measurement_observation_train', 'observation_train', 'person_demographics_episode_train', 'proceduresoccurrences_train', 'SepsisLabel_train'])

In [194]:
def measurement_meds_processing(data_dictionary: dict, max_body_temperature: float = 46):
    """ This function processes the 'measurement_meds' table of given data, which is inputed in a dictionary.

    The table is the first entry of ``data_dictionary`` whose key starts with
    "measurement_meds". Rows in which every float column is missing are
    removed, and 'Body temperature' readings above ``max_body_temperature``
    are replaced by NaN. The cleaned copy replaces the entry in the
    dictionary, hence function returns nothing; the DataFrame that was
    originally stored is not modified.

    Parameters
    ----------
    data_dictionary : dict
        A dictionary of pandas DataFrames.
    max_body_temperature : float, optional
        Readings strictly greater than this value are set to NaN.
        Defaults to 46.

    Raises
    ------
    KeyError
        If no key starts with "measurement_meds", or the table has no
        'Body temperature' column.
    """
    # Look the table up in the dictionary that was passed in, not in a
    # notebook-level global, so the function also works for the test split.
    body_measurements_index = next(
        (name for name in data_dictionary if name.startswith("measurement_meds")),
        None,
    )
    if body_measurements_index is None:
        raise KeyError('No table whose name starts with "measurement_meds" was found in data_dictionary.')

    measurements = data_dictionary[body_measurements_index]
    float_columns = measurements.select_dtypes(float).columns
    # Explicit copy so the column assignment below acts on an independent frame.
    measurements = measurements.dropna(subset=float_columns, how='all').copy()

    temperature = measurements['Body temperature']
    # NaN compares False against the threshold, so missing readings stay missing.
    measurements['Body temperature'] = temperature.mask(temperature > max_body_temperature)

    data_dictionary[body_measurements_index] = measurements
    return None

In [190]:
# Clean the training vitals table (drop empty rows, blank out implausible temperatures).
measurement_meds_processing(data_dictionary=training_data)

Index(['visit_occurrence_id', 'person_id', 'measurement_datetime',
       'Systolic blood pressure', 'Diastolic blood pressure',
       'Body temperature', 'Respiratory rate', 'Heart rate',
       'Measurement of oxygen saturation at periphery',
       'Oxygen/Gas total [Pure volume fraction] Inhaled gas'],
      dtype='object')


In [224]:
# Show every demographics row recorded for one example person_id.
training_data['person_demographics_episode_train'].loc[lambda frame: frame['person_id'] == 285203296]

Unnamed: 0,visit_occurrence_id,person_id,visit_start_date,birth_datetime,age_in_months,gender,birthday_formatted,new_visit_startdate
536,1269290336,285203296,2019-08-13,2013-11-21,69,MALE,2013-11-21,2019-08-13
537,510079749,285203296,2021-03-27,2013-11-21,88,MALE,2013-11-21,2021-03-27
538,1864970011,285203296,2021-05-13,2013-11-21,90,MALE,2013-11-21,2021-05-13
539,260222424,285203296,2021-08-10,2013-11-21,93,MALE,2013-11-21,2021-08-10
540,1633339455,285203296,2021-12-23,2013-11-21,97,MALE,2013-11-21,2021-12-23
541,1972916064,285203296,2024-03-28,2013-11-21,124,MALE,2013-11-21,2024-03-28
542,483842634,285203296,2024-04-13,2013-11-21,125,MALE,2013-11-21,2024-04-13
543,810102040,285203296,2024-04-27,2013-11-21,125,MALE,2013-11-21,2024-04-27
544,73355065,285203296,2024-05-08,2013-11-21,126,MALE,2013-11-21,2024-05-08
