In [1]:
import numpy as np
import pandas as pd
# Notebook display settings: render floats without decimals and show up to
# 35 columns so the widened patient table is not truncated.
pd.set_option('display.max_columns', 35)
pd.set_option('display.float_format', '{:.0f}'.format)

In [2]:
# Directory holding the patient-level CSV files (relative to the notebook).
patient_path = './dataset/Patient/'
# Raw, unmodified patient table; later cells work on a copy of it.
p_info_original = pd.read_csv(f'{patient_path}PatientInfo.csv')

In [3]:
# Preview the first five rows of the raw patient table.
p_info_original.head(n=5)

Unnamed: 0,patient_id,global_num,sex,birth_year,age,country,province,city,disease,infection_case,infection_order,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state
0,1000000001,2,male,1964,50s,Korea,Seoul,Gangseo-gu,,overseas inflow,1,,75,2020-01-22,2020-01-23,2020-02-05,,released
1,1000000002,5,male,1987,30s,Korea,Seoul,Jungnang-gu,,overseas inflow,1,,31,,2020-01-30,2020-03-02,,released
2,1000000003,6,male,1964,50s,Korea,Seoul,Jongno-gu,,contact with patient,2,2002000001.0,17,,2020-01-30,2020-02-19,,released
3,1000000004,7,male,1991,20s,Korea,Seoul,Mapo-gu,,overseas inflow,1,,9,2020-01-26,2020-01-30,2020-02-15,,released
4,1000000005,9,female,1992,20s,Korea,Seoul,Seongbuk-gu,,contact with patient,2,1000000002.0,2,,2020-01-31,2020-02-24,,released


In [4]:
# Derive model-ready feature columns from the raw patient table.
#
# NOTE: every fill below is written as `df[col] = df[col].fillna(...)` rather
# than `df[col].fillna(..., inplace=True)`. The in-place form operates on a
# temporary column selection (chained assignment); it warns on pandas 2.x and
# has no effect on the frame once Copy-on-Write is enabled.

# Work on a copy so the raw frame stays untouched.
p_info = p_info_original.copy()

# 1 - male, 0 - female (NaN when sex is missing or has any other value)
p_info['is_male'] = p_info['sex'].map({'male': 1, 'female': 0})

# is from Korea - 1, not (or unknown) - 0
p_info['is_local'] = p_info['country'].map({'Korea': 1}).fillna(0)

# is released - 1, not - 0
p_info['released'] = p_info['state'].map({'released': 1}).fillna(0)

# is deceased - 1, not - 0
p_info['deceased'] = p_info['state'].map({'deceased': 1}).fillna(0)

# infected by known contact - 1, not - 0
p_info['by_contact'] = p_info['infection_case'].map({'contact with patient': 1}).fillna(0)

# existing disease - 1, not (or unknown) - 0
p_info['has_disease'] = p_info['disease'].map({True: 1}).fillna(0)

# symptom onset date recorded - 1, not - 0
p_info['has_symptoms'] = p_info['symptom_onset_date'].notna().astype('int64')

# number of other people given patient infected (0 if there are no known cases):
# count how often each id appears in `infected_by`, then attach that count to
# the row whose `patient_id` matches.
infected_count = p_info['infected_by'].value_counts().rename_axis('patient_id').reset_index(name='infected_count')
p_info = p_info.merge(infected_count, on='patient_id', how='left')
p_info['infected_count'] = p_info['infected_count'].fillna(0)

# parse the date columns so they can be subtracted from each other
for date_col in ('released_date', 'confirmed_date', 'deceased_date', 'symptom_onset_date'):
    p_info[date_col] = pd.to_datetime(p_info[date_col])

# number of days between symptoms and confirmation (if not known, then 0)
p_info['symptoms_to_confirmed'] = (
    p_info['confirmed_date'] - p_info['symptom_onset_date']
).fillna(pd.Timedelta(seconds=0))

# Latest confirmation date in the data, used as a stand-in "last update" date
# for patients without a recorded outcome.
last_confirmed = p_info['confirmed_date'].max()

# Outcome date: deceased date when present, otherwise released date
# (NaT when neither is recorded).
outcome_date = p_info['deceased_date'].fillna(p_info['released_date'])

# number of days between symptoms and outcome
# (no outcome -> measure up to last_confirmed; no symptom date -> 0)
p_info['symptoms_to_outcome'] = (
    (outcome_date - p_info['symptom_onset_date'])
    .fillna(last_confirmed - p_info['symptom_onset_date'])
    .fillna(pd.Timedelta(seconds=0))
)

# number of days between confirmation and outcome (if none outcome defined, take last update date);
# stays NaT when the confirmation date itself is missing
p_info['confirmed_to_outcome'] = (
    (outcome_date - p_info['confirmed_date'])
    .fillna(last_confirmed - p_info['confirmed_date'])
)

In [5]:
# Fill remaining gaps with explicit placeholder values. Plain assignment is
# used instead of `df[col].fillna(..., inplace=True)`, which acts on a temporary
# column selection and does not update the frame under pandas Copy-on-Write.

# if not known who infected, set to 1000000000
p_info['infected_by'] = p_info['infected_by'].fillna(1000000000)
# if contact number not known, set to 0
p_info['contact_number'] = p_info['contact_number'].fillna(0)
# if state not known, set to unknown
p_info['state'] = p_info['state'].fillna('unknown')
# add age as number, relative to the year 2020 (NaN when birth_year is missing)
p_info['age_years'] = 2020 - p_info['birth_year']

In [6]:
# drop cases with no confirmation-to-outcome delta (i.e. no confirmation date),
# no age, or no known sex; take an explicit copy so the column assignments
# below write to an independent frame rather than a possible view.
p_info = p_info.dropna(subset=['confirmed_to_outcome', 'age_years', 'is_male']).copy()

# convert date delta to whole days stored as int16
# (at this point none of the three columns contains NaT)
for delta_col in ('symptoms_to_confirmed', 'confirmed_to_outcome', 'symptoms_to_outcome'):
    p_info[delta_col] = p_info[delta_col].dt.days.astype('int16')

In [7]:
# Preview the first five rows of the processed table, including derived columns.
p_info.head(n=5)

Unnamed: 0,patient_id,global_num,sex,birth_year,age,country,province,city,disease,infection_case,infection_order,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state,is_male,is_local,released,deceased,by_contact,has_disease,has_symptoms,infected_count,symptoms_to_confirmed,symptoms_to_outcome,confirmed_to_outcome,age_years
0,1000000001,2,male,1964,50s,Korea,Seoul,Gangseo-gu,,overseas inflow,1,1000000000,75,2020-01-22,2020-01-23,2020-02-05,NaT,released,1,1,1,0,0,0,1,0,1,14,13,56
1,1000000002,5,male,1987,30s,Korea,Seoul,Jungnang-gu,,overseas inflow,1,1000000000,31,NaT,2020-01-30,2020-03-02,NaT,released,1,1,1,0,0,0,0,1,0,0,32,33
2,1000000003,6,male,1964,50s,Korea,Seoul,Jongno-gu,,contact with patient,2,2002000001,17,NaT,2020-01-30,2020-02-19,NaT,released,1,1,1,0,1,0,0,4,0,0,20,56
3,1000000004,7,male,1991,20s,Korea,Seoul,Mapo-gu,,overseas inflow,1,1000000000,9,2020-01-26,2020-01-30,2020-02-15,NaT,released,1,1,1,0,0,0,1,0,4,20,16,29
4,1000000005,9,female,1992,20s,Korea,Seoul,Seongbuk-gu,,contact with patient,2,1000000002,2,NaT,2020-01-31,2020-02-24,NaT,released,0,1,1,0,1,0,0,0,0,0,24,28


In [8]:
# Persist the processed table next to the notebook. The row index is written
# too (to_csv default), so it appears as an unnamed first column in the file.
p_info.to_csv(path_or_buf='p_info_preprocessed.csv')