# 0 - Data preparation

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Output switch: the normalization cells in this notebook only write their
# "*_normalized.csv" files when this flag is True.
save_normalized_data = False

# IMC data

In [6]:
# Load the IMC table. The literal string \N in the CSV is read as a missing value (NaN).
imc_data_file = '../data/data.csv'
imc_data = pd.read_csv(imc_data_file, na_values="\\N")

In [7]:
# Z-score the IMC cycle-time (CT) values, separately for each country.
#
# ETUKey 2, 4 and 5 form the Sierra Leone group and ETUKey 1 and 3 the Liberia
# group. Rows with any other (or missing) ETUKey fall in neither group and are
# therefore absent from norm_imc_data.
# The per-country statistics (sl_ct_mean, sl_ct_std, lb_ct_mean, lb_ct_std) stay
# defined after this cell and are reused by the day 1 / day 2 CT normalization.

imc_data = imc_data.rename(columns={'DaysSinceSymptomFeverOnset': 'reftime'})

sl_indices = imc_data[imc_data['ETUKey'].isin([2, 4, 5])].index
lb_indices = imc_data[imc_data['ETUKey'].isin([1, 3])].index

# Take explicit copies of the two subsets. Assigning into them below can then
# neither raise SettingWithCopyWarning nor write back into imc_data, without
# depending on whether a particular indexing form returns a view or a copy.
sl_data = imc_data.loc[sl_indices, :].copy()
lb_data = imc_data.loc[lb_indices, :].copy()

# Sierra Leone: standardize with the group's own mean and standard deviation
sl_ct_mean = sl_data['cycletime'].mean()
sl_ct_std = sl_data['cycletime'].std()
print("IMC Sierra Leone", sl_ct_mean, sl_ct_std)
sl_data['cycletime'] = (sl_data['cycletime'] - sl_ct_mean) / sl_ct_std

# Liberia: same procedure with the Liberia statistics
lb_ct_mean = lb_data['cycletime'].mean()
lb_ct_std = lb_data['cycletime'].std()
print("IMC Liberia", lb_ct_mean, lb_ct_std)
lb_data['cycletime'] = (lb_data['cycletime'] - lb_ct_mean) / lb_ct_std

# Combined table: Sierra Leone rows first, then Liberia rows
norm_imc_data = pd.concat([sl_data, lb_data])

if save_normalized_data:
    # Missing values are written back as \N, the same marker used when reading
    sl_data.to_csv('../data/data_sl_normalized.csv', index=False, na_rep="\\N")
    lb_data.to_csv('../data/data_lb_normalized.csv', index=False, na_rep="\\N")
    norm_imc_data.to_csv('../data/data_normalized.csv', index=False, na_rep="\\N")

IMC Sierra Leone 21.83870655701492 5.154446983931702
IMC Liberia 27.671287912272724 5.44911795341149


In [8]:
# Normalize day 1 and day 2 CT values (columns cycletime1 / cycletime2).
#
# The values are z-scored with the per-country mean/std obtained from the IMC
# 'cycletime' column (sl_ct_mean, sl_ct_std, lb_ct_mean, lb_ct_std); no new
# statistics are computed here, so the IMC normalization cell must run first.
# NOTE: this cell rebinds sl_indices, lb_indices, sl_data and lb_data to
# subsets of ct_data; from here on they no longer refer to the IMC subsets.

ct_data_file = '../data/data_ct.csv'
ct_data = pd.read_csv(ct_data_file, na_values="\\N")

ct_data = ct_data.rename(columns={'DaysSinceSymptomFeverOnset': 'reftime'})

# Same country grouping as for the IMC table: ETUKey 2/4/5 -> Sierra Leone,
# ETUKey 1/3 -> Liberia. Rows with any other ETUKey are left out of norm_ct_data.
sl_indices = ct_data[ct_data['ETUKey'].isin([2, 4, 5])].index
lb_indices = ct_data[ct_data['ETUKey'].isin([1, 3])].index

# Explicit copies so the column assignments below cannot trigger
# SettingWithCopyWarning or modify ct_data.
sl_data = ct_data.loc[sl_indices, :].copy()
lb_data = ct_data.loc[lb_indices, :].copy()

for ct_column in ('cycletime1', 'cycletime2'):
    sl_data[ct_column] = (sl_data[ct_column] - sl_ct_mean) / sl_ct_std
    lb_data[ct_column] = (lb_data[ct_column] - lb_ct_mean) / lb_ct_std

norm_ct_data = pd.concat([sl_data, lb_data])

if save_normalized_data:
    norm_ct_data.to_csv('../data/data_ct_normalized.csv', index=False, na_rep="\\N")

# Kenema data

In [9]:
# Load the Kenema table. The literal string \N in the CSV is read as a missing value (NaN).
kenema_data_file = '../../kenema/mirador/data.csv'
kenema_data = pd.read_csv(kenema_data_file, na_values="\\N")

In [14]:
# Compute the linear transformation between viral load (Kenema 'PCR' column)
# and CT (IMC 'cycletime' column).
#
# The two scales are tied together at their extremes: the lowest CT is paired
# with the highest PCR value and the highest CT with the lowest PCR value.
# The straight line through those two points is
#     log_pcr = a - b * ct
# and its inverse, used later to derive a CT from a PCR value, is
#     ct = vl2ct_c1 * log_pcr + vl2ct_c0,  with vl2ct_c1 = -1/b and vl2ct_c0 = a/b
# NOTE(review): the *_log_pcr names suggest PCR is stored on a log scale - confirm.

# Using all data
# (the z-scoring above was applied to the sl_data / lb_data subsets, so
# imc_data['cycletime'] is expected to still hold the raw CT values here)
min_ct = imc_data['cycletime'].min()
max_ct = imc_data['cycletime'].max()

# Alternative ranges, kept for reference. Caveat: by this point sl_data / lb_data
# have been rebound to the day 1 / day 2 CT subsets, and the IMC subsets they held
# before had 'cycletime' already z-scored, so these lines would need the raw
# per-country IMC values to be meaningful.

# Using Sierra Leone only
# min_ct = sl_data['cycletime'].min()
# max_ct = sl_data['cycletime'].max()

# Using Liberia only
# min_ct = lb_data['cycletime'].min()
# max_ct = lb_data['cycletime'].max()

min_log_pcr = kenema_data['PCR'].min()
max_log_pcr = kenema_data['PCR'].max()

# The two anchor points of the line: (min_ct, max_log_pcr) and (max_ct, min_log_pcr)
print(min_ct, max_log_pcr) 
print(max_ct, min_log_pcr) 
# b: PCR units lost per unit increase of CT (magnitude of the slope)
b = (max_log_pcr - min_log_pcr) / (max_ct - min_ct)
# a: intercept, i.e. the PCR value the line gives at ct = 0
a = min_log_pcr + b * max_ct
# Coefficients of the inverse map PCR -> CT
vl2ct_c1 = -1/b
vl2ct_c0 = +a/b
# Sanity check: change in PCR value for a 3-point change in CT (see note below)
print(3*b)
print(vl2ct_c1, vl2ct_c0)

# Compare with:
# Each 3-point decrease in Ct was associated with an ≈10-fold increase in Ebola viral load; 
# a Ct of 39 corresponded to ≈40 TCID50/mL and a Ct of 19 corresponded to ≈40 million TCID50/mL
# http://www.fda.gov/downloads/medicaldevices/safety/emergencysituations/ucm436313.pdf
# Based on this, 3*b should be close to 1

12.10000038 10.56304728
39.79999924 1.546663713
0.9765036756034003
-3.072185056698576 44.55163638681653


In [14]:
# Build the normalized Kenema table: positive cases only, a CT column derived
# from the PCR viral load, and a few variables recoded for the later imputation.
# Requires vl2ct_c1 / vl2ct_c0 from the viral-load-to-CT transformation cell.

# Keeping only positive cases (DIAG == 1). The explicit copy guarantees that the
# column assignments below neither raise SettingWithCopyWarning nor touch kenema_data.
indices = kenema_data[kenema_data['DIAG'] == 1].index
norm_kenema_data = kenema_data.loc[indices, :].copy()

# Calculating CT from Viral Load (PCR) with the linear map CT = c1 * PCR + c0
norm_kenema_data['CT'] = vl2ct_c1 * norm_kenema_data['PCR'] + vl2ct_c0

# Z-score the derived CT using the statistics of the positive cases
kgh_ct_mean = norm_kenema_data['CT'].mean()
kgh_ct_std = norm_kenema_data['CT'].std()
print("Kenema", kgh_ct_mean, kgh_ct_std)
norm_kenema_data['CT'] = (norm_kenema_data['CT'] - kgh_ct_mean) / kgh_ct_std

# Estimating "breathing problems" from oxygen saturation (95-100 values are considered normal):
# https://www.mayoclinic.org/symptoms/hypoxemia/basics/definition/sym-20050930
# BREATH is 1 when OXSAT < 95, 0 otherwise, and missing where OXSAT is missing.
# The nullable Int64 dtype keeps the 0/1 values as integers alongside the missing entries.
oxygen_saturation = norm_kenema_data['OXSAT']
norm_kenema_data['BREATH'] = (
    (oxygen_saturation < 95).astype('Int64').mask(oxygen_saturation.isna())
)

# Switching some values (inverting the 0/1 coding of the source variables)
norm_kenema_data['SEX'] = 1 - norm_kenema_data['GEND']
norm_kenema_data['BLEED'] = 1 - norm_kenema_data['BNONE']

# The non-missing values of this variable are all 0, which causes imputation algorithms to fail.
# Setting every value to 0 makes the variable effectively a constant, so it is not
# considered in the imputation.
norm_kenema_data['JAUN'] = 0

# Same treatment for BREATH when no positive case has a value of 1.
# .any() skips missing entries, so no boolean mask containing NA is used for indexing.
if not (norm_kenema_data['BREATH'] == 1).any():
    print("Not occurrence of breathing problems")
    norm_kenema_data['BREATH'] = 0

if save_normalized_data:
    norm_kenema_data.to_csv('../../kenema/mirador/data_normalized.csv', index=False, na_rep="\\N")

Kenema 26.04665893711396 6.000663383100513


# GOAL data

In [8]:
# Load the GOAL table. The literal string \N in the CSV is read as a missing value (NaN).
goal_data_file = '../../goal/data.csv'
goal_data = pd.read_csv(goal_data_file, na_values='\\N')

In [12]:
# Z-score the GOAL CT column (evd_ct) with the mean and standard deviation of
# the whole GOAL table.
goal_ct_mean = goal_data['evd_ct'].mean()
goal_ct_std = goal_data['evd_ct'].std()
print("GOAL", goal_ct_mean, goal_ct_std)

# assign() builds a new frame with evd_ct replaced; goal_data itself is left untouched.
norm_goal_data = goal_data.assign(
    evd_ct=(goal_data['evd_ct'] - goal_ct_mean) / goal_ct_std
)

if save_normalized_data:
    norm_goal_data.to_csv('../../goal/data_normalized.csv', index=False, na_rep="\\N")

GOAL 22.204958677685955 4.313078004543652
