In [55]:
import pandas as pd
import numpy as np
import os
from collections import Counter
from datetime import datetime

# Remove pandas' display truncation: None means "no limit" for both settings, so
# every column and every row of a displayed DataFrame is rendered.
# NOTE(review): with max_rows=None, displaying a large table prints all of its rows;
# prefer .head()/.sample() when inspecting the raw tables.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [175]:
def readin_data(data_type: str):
    """ This function reads in test or train data, which should be in folders 'testing_data' and 'training_data' in the current working directory.

    Only regular, non-hidden files ending in '.csv' are loaded; sub-folders (for example
    Jupyter's '.ipynb_checkpoints') and other files are skipped. Files are processed in
    case-insensitive alphabetical order so the resulting dictionary has the same key order
    on every platform. Exact duplicate rows are dropped from each table.

    Parameters
    ----------
    'data_type' : str
        This should be 'test' or 'train'.

    Returns
    -------
    dict
        Maps the part of each file name before its first '.' (e.g. 'devices_train' for
        'devices_train.csv') to the de-duplicated pandas DataFrame read from that file.

    Raises
    ------
    AssertionError
        If 'data_type' is neither 'test' nor 'train'.
    """
    assert (data_type=='test') or (data_type=='train'), f'You gave data_type as {data_type}. Please define data_type as "test" or "train."'
    inner_directory = './testing_data/' if data_type == 'test' else './training_data/'

    data_dict = {}
    for file_name in sorted(os.listdir(inner_directory), key=str.lower):
        file_path = os.path.join(inner_directory, file_name)
        # os.listdir also returns folders and stray non-table files; passing those to
        # read_csv either raises or silently adds a junk entry to the dictionary.
        is_table = (
            os.path.isfile(file_path)
            and not file_name.startswith('.')
            and file_name.lower().endswith('.csv')
        )
        if not is_table:
            continue
        data_dict[file_name.split('.')[0]] = pd.read_csv(file_path).drop_duplicates()
    return data_dict

In [226]:
# Load every training table into a dict of DataFrames keyed by file name (without extension).
# NOTE(review): the execution counts in this notebook are not sequential; confirm the
# cells still work top-to-bottom after "Restart Kernel -> Run All".
training_data = readin_data('train')

In [114]:
# Folder that receives the processed CSV files. exist_ok=True keeps this cell re-runnable:
# without it os.makedirs raises FileExistsError once the folder has been created.
os.makedirs('./processed_data', exist_ok=True)

In [235]:
def _column_containing(table, fragment: str, table_name: str):
    """Return the column of `table` whose name contains `fragment`, or raise ValueError if none does."""
    positions = [str(column).find(fragment) for column in table.columns]
    if not positions or max(positions) < 0:
        raise ValueError(f"Table '{table_name}' has no column whose name contains '{fragment}'.")
    # When several columns match, keep the historical choice: the column in which the
    # fragment starts furthest into the name (ties go to the left-most column).
    return table.columns[int(np.argmax(positions))]


def add_uids(data_dictionary: dict):
    """ This function adds a UID to each row to establish unique instances between person_id & measurement datetimes for the various tables.

    For every table except the demographics one, the datetime column and the person_id
    column are replaced by a single 'uid' column holding str(datetime) + str(person_id)
    (no separator). The dictionary entry is replaced by the new DataFrame; the DataFrame
    that was passed in is not modified.

    This is not done for the demographics file since the information in it is not sensitive to the hour.
    Tables whose key starts with 'person_demographics' are left untouched.

    Parameters
    ----------
    'data_dictionary' : dict
        A dictionary of pandas DataFrames.

    Raises
    ------
    ValueError
        If a non-demographics table has no column name containing 'datetime' or
        'person_id' (this is also the case when the function is run twice on the same
        dictionary, because the first run removes those columns).
    """
    for table_ind in list(data_dictionary.keys()):
        if table_ind.startswith("person_demographics"):
            # Nothing is written back here, so the demographics entry keeps its own table.
            continue
        table = data_dictionary[table_ind]
        date_column = _column_containing(table, 'datetime', table_ind)
        personid_column = _column_containing(table, 'person_id', table_ind)
        uid_values = table[date_column].astype(str) + table[personid_column].astype(str)
        data_dictionary[table_ind] = (
            table
            .assign(uid=uid_values)
            .drop(columns=[date_column, personid_column])
        )

In [231]:
# Replace the datetime and person_id columns of every non-demographics table with a
# combined 'uid' column; training_data is updated in place and nothing is returned.
add_uids(training_data)

In [225]:
def birthday_management(data_dictionary: dict, output_dir: str = './processed_data'):
    """
    This function processes the 'person_demographics' table of given data, which is inputed as a dictionary. The data in dictionary is replaced by index, hence function returns nothing.

    The first table whose key starts with 'person_demographics' is taken from
    'data_dictionary' itself. Two parsed date columns are appended:

    * 'birthday_formatted'  - the 'birth_datetime' of the first row of each person,
      parsed with the format '%Y-%m-%d' and applied to all rows of that person;
    * 'new_visit_startdate' - 'visit_start_date' parsed with the format '%Y-%m-%d'.

    The source columns 'visit_start_date' and 'birth_datetime' are dropped, the row index
    is reset to 0..n-1, the result is written to
    '<output_dir>/processed_<table name>.csv' and stored back in the dictionary.

    Parameters
    ----------
    'data_dictionary' : dict
        A dictionary of pandas DataFrames.
    'output_dir' : str
        Folder that receives the CSV file; it is created when missing.
        Defaults to './processed_data'.

    Raises
    ------
    KeyError
        If no key of 'data_dictionary' starts with 'person_demographics'.
    """
    demographics_index = next(
        (name for name in data_dictionary if name.startswith("person_demographics")),
        None,
    )
    if demographics_index is None:
        raise KeyError("No table whose name starts with 'person_demographics' was found in data_dictionary.")
    demographics = data_dictionary[demographics_index]

    # One birthday per person, taken from that person's first row, then parsed in one
    # vectorized call instead of one strptime call and one row append per person.
    first_birthdays = (
        demographics
        .drop_duplicates(subset='person_id')
        .set_index('person_id')['birth_datetime']
    )
    birthday_by_person = pd.to_datetime(first_birthdays, format='%Y-%m-%d')

    demographics = (
        demographics
        .assign(
            birthday_formatted=demographics['person_id'].map(birthday_by_person),
            new_visit_startdate=pd.to_datetime(demographics['visit_start_date'], format='%Y-%m-%d'),
        )
        .drop(columns=['visit_start_date', 'birth_datetime'])
        # The index is written to the CSV, so keep it as a clean 0..n-1 range.
        .reset_index(drop=True)
    )

    os.makedirs(output_dir, exist_ok=True)
    demographics.to_csv(os.path.join(output_dir, f'processed_{demographics_index}.csv'))
    data_dictionary[demographics_index] = demographics
    return None

In [227]:
# Parse the birth and visit-start dates of the demographics table, write the processed
# table to ./processed_data/ and store it back into training_data (returns None).
birthday_management(training_data)

In [112]:
# List the names of the loaded tables (one key per file read by readin_data).
training_data.keys()

dict_keys(['devices_train', 'drugsexposure_train', 'measurement_lab_train', 'measurement_meds_train', 'measurement_observation_train', 'observation_train', 'person_demographics_episode_train', 'proceduresoccurrences_train', 'SepsisLabel_train'])

In [194]:
def measurement_meds_processing(data_dictionary: dict, max_body_temperature: float = 46):
    """ This function processes the 'measurement_meds' table of given data, which is inputed in a dictionary. The data in dictionary is replaced by index, hence function returns nothing.

    The first table whose key starts with 'measurement_meds' is taken from
    'data_dictionary' itself. Rows in which every float column is missing are dropped, and
    'Body temperature' readings above 'max_body_temperature' are replaced by NaN.
    The DataFrame that was passed in is not modified; the dictionary entry is replaced.

    Parameters
    ----------
    'data_dictionary' : dict
        A dictionary of pandas DataFrames.
    'max_body_temperature' : float
        Readings strictly greater than this value are replaced by NaN. Defaults to 46
        (presumably degrees Celsius - confirm against the data description).

    Raises
    ------
    KeyError
        If no key of 'data_dictionary' starts with 'measurement_meds'.
    """
    body_measurements_index = next(
        (name for name in data_dictionary if name.startswith("measurement_meds")),
        None,
    )
    if body_measurements_index is None:
        raise KeyError("No table whose name starts with 'measurement_meds' was found in data_dictionary.")
    measurements = data_dictionary[body_measurements_index]

    float_columns = measurements.select_dtypes(float).columns
    measurements = measurements.dropna(subset=float_columns, how='all')

    body_temperature = measurements['Body temperature']
    # mask() blanks only the readings above the limit; missing readings compare as False
    # and therefore stay missing.
    measurements = measurements.assign(
        **{'Body temperature': body_temperature.mask(body_temperature > max_body_temperature)}
    )
    data_dictionary[body_measurements_index] = measurements
    return None

In [None]:
# COPIED NOT REDONE
# NOTE(review): `drugsexposure_train` is not defined in the visible cells - presumably it
# is training_data['drugsexposure_train'] after add_uids(); confirm before running.

# Collapse the drug exposures to one row per uid:
#   drugs               - sorted list of every drug_concept_id of that uid (repeats kept)
#   routes              - sorted list of the distinct route_concept_id values, as strings
#   visit_occurrence_id - the value found on the first row of that uid
# A single groupby pass replaces three full-table scans per uid, and the frame is built
# once from a list of records instead of being grown one row at a time.
drug_records = []
for uid, exposures in drugsexposure_train.groupby('uid', sort=True):
    drug_records.append({
        'uid': uid,
        'drugs': sorted(exposures['drug_concept_id'].to_list()),
        'routes': sorted(str(route) for route in set(exposures['route_concept_id'].to_list())),
        'visit_occurrence_id': exposures['visit_occurrence_id'].to_list()[0],
    })

drugsexposure_train_2 = pd.DataFrame(
    drug_records, columns=['uid', 'drugs', 'routes', 'visit_occurrence_id']
)

In [None]:
# COPIED NOT REDONE
# NOTE(review): `measurement_lab_train` is not defined in the visible cells - presumably it
# is training_data['measurement_lab_train'] after add_uids(); confirm before running.

print(len(measurement_lab_train))

# Keep only rows that carry at least one non-missing float measurement.
# dropna returns a new frame, so measurement_lab_train itself is left untouched.
measurement_lab_train_2 = measurement_lab_train.dropna(
    subset=measurement_lab_train.select_dtypes(float).columns, how='all'
)

# Number of remaining rows per uid, listed in order of first appearance.
measurement_lab_train_2_count = (
    measurement_lab_train_2.groupby('uid', sort=False).size().reset_index(name='count')
)
measurement_lab_extras = measurement_lab_train_2_count[measurement_lab_train_2_count['count'] > 1]

# Rows whose uid occurs more than once are collapsed into a single row holding the
# column-wise maximum. isin() handles any number of repeated uids; the previous
# `.item()` lookup raised unless exactly one uid was repeated.
is_repeated = measurement_lab_train_2['uid'].isin(measurement_lab_extras['uid'])
measurement_lab_rows = (
    measurement_lab_train_2[is_repeated]
    .groupby('uid', sort=False, as_index=False)
    .max()
)

# Rows with a unique uid first (original order), followed by the collapsed rows.
measurement_lab_train_2 = pd.concat(
    [measurement_lab_train_2[~is_repeated], measurement_lab_rows]
).reset_index(drop=True)