In [1]:
import pandas as pds
import numpy as np

# Loading data

In [2]:
# Cohort-level tables (suffix 0 = control, suffix 1 = treated).
df0 = pds.read_csv('real_data/df_control_cohort.csv.gz')
df1 = pds.read_csv('real_data/df_treated_cohort.csv.gz')

# Pre-treatment tables.
df0_pre = pds.read_csv('real_data/df_control_pre_treatment.csv.gz')
df1_pre = pds.read_csv('real_data/df_treated_pre_treatment.csv.gz')

# Post-treatment tables.
df0_post = pds.read_csv('real_data/df_control_post_treatment.csv.gz')
df1_post = pds.read_csv('real_data/df_treated_post_treatment.csv.gz')

In [4]:
# (rows, columns) of the control cohort table.
df0.shape

(336969, 50)

In [5]:
# (rows, columns) of the treated cohort table.
df1.shape

(96311, 50)

# Outcome tensor

In [7]:
# Copy `statins_prscd` into `enroll_date` on the treated post-treatment table
# (modifies df1_post in place). get_post_treatment() measures elapsed time
# from `enroll_date`.
# NOTE(review): presumably the statin prescription date acts as the enrollment
# date for treated patients -- confirm.
df1_post['enroll_date'] = df1_post['statins_prscd']

In [8]:
def get_post_treatment(df, n_year=3, rolling=True):
    """Average the post-treatment outcome per patient over a follow-up window.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``patid``, ``exp_date``, ``enroll_date`` and
        ``original``. The caller's frame is not modified (a copy is used).
    n_year : int, default 3
        Length of the follow-up in years.
    rolling : bool, default True
        If True, keep every record dated fewer than ``365 * n_year`` days
        after ``enroll_date`` (records dated before enrollment also pass this
        filter). If False, keep only records whose number of whole elapsed
        years equals ``n_year - 1``; a year is taken as 365.2425 days, the
        length numpy uses when converting timedeltas to the ``Y`` unit.

    Returns
    -------
    DataFrame
        Columns ``patid`` and ``original`` (mean of ``original`` over the
        selected records); one row per patient that has at least one
        selected record.
    """
    df = df.copy()
    df['exp_date'] = pds.to_datetime(df['exp_date'])
    df['enroll_date'] = pds.to_datetime(df['enroll_date'])

    elapsed = df['exp_date'] - df['enroll_date']
    if rolling:
        # `.dt.days` replaces astype('timedelta64[D]'), which pandas >= 2.0
        # rejects. Missing dates yield NaN, compare False and are dropped
        # instead of breaking an integer cast.
        keep = elapsed.dt.days < 365 * n_year
    else:
        # Whole years elapsed; replaces astype('timedelta64[Y]').
        years_elapsed = np.floor(elapsed / pds.Timedelta(days=365.2425))
        keep = years_elapsed == n_year - 1

    df_window = df[keep]
    return df_window.groupby('patid').agg({'original': 'mean'}).reset_index()

In [10]:
# Per-patient mean outcome over the first follow-up year (rolling window),
# computed for the treated and the control cohort.
df1_outcome = get_post_treatment(df1_post, n_year=1, rolling=True)
df0_outcome = get_post_treatment(df0_post, n_year=1, rolling=True)

In [12]:
# Drop the leading patid column and append a trailing singleton axis, so each
# outcome array has three dimensions.
Y0 = df0_outcome.values[:, 1:][:, :, np.newaxis]
Y1 = df1_outcome.values[:, 1:][:, :, np.newaxis]

In [13]:
# Shape of the control outcome array.
Y0.shape

(73672, 1, 1)

In [14]:
# Shape of the treated outcome array.
Y1.shape

(52113, 1, 1)

# Pre-treatment tensor

In [15]:
# Keep only pre-treatment rows of patients that also appear in the outcome
# tables computed above.
df1_pre_with_outcome = df1_pre[df1_pre['patid'].isin(df1_outcome['patid'].values)]
df0_pre_with_outcome = df0_pre[df0_pre['patid'].isin(df0_outcome['patid'].values)]

In [16]:
# (rows, columns) of the control pre-treatment table after filtering.
df0_pre_with_outcome.shape

(714135, 14)

In [18]:
# Mean pre-treatment `ldl` over treated patients that have an outcome.
df1_pre_with_outcome.ldl.mean()

1.1440162365929625

In [19]:
# Mean pre-treatment `ldl` over control patients that have an outcome.
df0_pre_with_outcome.ldl.mean()

0.472948168755814

In [19]:
def padd(arr, n, d):
    """Zero-pad ``arr`` along its first axis to length ``n`` and build a mask.

    Parameters
    ----------
    arr : ndarray
        1-D array of length ``k`` or 2-D array of shape ``(k, d)``.
    n : int
        Target length of the first axis; must be at least ``k``.
    d : int
        Number of columns of a 2-D ``arr``. Ignored for 1-D input.

    Returns
    -------
    (arr_new, mask_new) : tuple of ndarray
        ``arr_new`` is ``arr`` followed by zero rows up to length ``n``.
        ``mask_new`` has the same shape, with 1 at the original positions
        and 0 at the padded positions.

    Raises
    ------
    ValueError
        If ``arr`` is longer than ``n``, or a 2-D ``arr`` does not have
        ``d`` columns.
    """
    lack = n - arr.shape[0]
    if lack < 0:
        # Fail with a readable message rather than numpy's
        # "negative dimensions are not allowed".
        raise ValueError(
            'cannot pad array of length %d to shorter length %d'
            % (arr.shape[0], n)
        )

    pad_shape = (lack,) if arr.ndim == 1 else (lack, d)
    pad = np.zeros(pad_shape)
    arr_new = np.concatenate([arr, pad], axis=0)
    mask_new = np.concatenate([np.ones_like(arr), pad], axis=0)
    return arr_new, mask_new

In [20]:
def process_pre_treatment(df, max_len=20, feature_cols=None):
    """Turn per-visit pre-treatment rows into fixed-size per-patient arrays.

    Rows are grouped by ``patid``; each patient's time stamps and feature
    rows are zero-padded to ``max_len`` entries with ``padd``.

    Parameters
    ----------
    df : DataFrame
        Must contain ``patid``, ``ts`` and every column in ``feature_cols``.
    max_len : int, default 20
        Padded sequence length. ``padd`` fails if a patient has more than
        ``max_len`` rows.
    feature_cols : sequence of str, optional
        Feature columns, in output order. Defaults to the ten measurements
        bmi, creatinine, dbp, hdl, ldl, pulse, sbp, smokbin, tchol and
        triglycerides.

    Returns
    -------
    patid_list : list
        Patient ids, in groupby order.
    val_arr : ndarray, shape (n_patients, max_len, n_features)
        Feature values; padded and missing (NaN) entries are set to 0.
    val_mask_arr : ndarray, same shape as ``val_arr``
        1.0 where a value was observed, 0.0 where padded or NaN.
    ts_arr : ndarray, shape (n_patients, max_len)
        Zero-padded time stamps.
    ts_mask_arr : ndarray, same shape as ``ts_arr``
        1 at real time stamps, 0 at padding.

    Raises
    ------
    ValueError
        If ``df`` contains no rows.
    """
    if feature_cols is None:
        feature_cols = ['bmi', 'creatinine', 'dbp', 'hdl', 'ldl', 'pulse',
                        'sbp', 'smokbin', 'tchol', 'triglycerides']
    feature_cols = list(feature_cols)
    # Derived from the column list so the two cannot drift apart.
    n_features = len(feature_cols)

    patid_list = []
    ts_list = []
    mat_list = []
    ts_mask_list = []
    mat_mask_list = []

    for group_name, s in df.groupby('patid'):
        ts_padded, ts_mask = padd(s['ts'].values, max_len, n_features)

        mat = s[feature_cols].values
        mat_padded, mat_mask = padd(mat, max_len, n_features)
        # A value counts as observed only if it is a real row and not NaN.
        observed = ~np.isnan(mat_padded)
        mat_mask = ((mat_mask == 1) & observed).astype('float')
        mat_padded[~observed] = 0

        patid_list.append(group_name)
        ts_list.append(ts_padded)
        mat_list.append(mat_padded)
        ts_mask_list.append(ts_mask)
        mat_mask_list.append(mat_mask)

    if not patid_list:
        # np.stack would otherwise fail with a less helpful message.
        raise ValueError('no patient records to process')

    ts_arr = np.stack(ts_list, axis=0)
    val_arr = np.stack(mat_list, axis=0)
    ts_mask_arr = np.stack(ts_mask_list, axis=0)
    val_mask_arr = np.stack(mat_mask_list, axis=0)
    return patid_list, val_arr, val_mask_arr, ts_arr, ts_mask_arr

In [21]:
# Build the padded pre-treatment arrays for the treated (1) and control (0)
# cohorts, then keep the patient ids as arrays as well.
(patid_list1, val_arr1, val_mask_arr1,
 ts_arr1, ts_mask_arr1) = process_pre_treatment(df1_pre_with_outcome)

(patid_list0, val_arr0, val_mask_arr0,
 ts_arr0, ts_mask_arr0) = process_pre_treatment(df0_pre_with_outcome)

patid1 = np.array(patid_list1)
patid0 = np.array(patid_list0)


In [22]:
# Write every array to real_data3/ (np.save appends the ".npy" extension).
for _path, _array in [
    ('real_data3/val_arr1', val_arr1),
    ('real_data3/val_mask_arr1', val_mask_arr1),
    ('real_data3/ts_arr1', ts_arr1),
    ('real_data3/ts_mask_arr1', ts_mask_arr1),
    ('real_data3/patid1', patid1),
    ('real_data3/val_arr0', val_arr0),
    ('real_data3/val_mask_arr0', val_mask_arr0),
    ('real_data3/ts_arr0', ts_arr0),
    ('real_data3/ts_mask_arr0', ts_mask_arr0),
    ('real_data3/patid0', patid0),
    ('real_data3/Y0', Y0),
    ('real_data3/Y1', Y1),
]:
    np.save(_path, _array)


### Checking that pre- and post-treatment data match on patid

In [23]:
# Number of positions where the treated outcome table and the pre-treatment
# arrays disagree on patid; 0 means the rows of Y1 and val_arr1 line up.
np.sum(df1_outcome.patid.values != np.array(patid_list1))

0

In [24]:
# Same check for the control cohort: 0 means the rows of Y0 and val_arr0
# line up on patid.
np.sum(df0_outcome.patid.values != np.array(patid_list0))

0

In [25]:
# Shape of the control feature array returned by process_pre_treatment().
val_arr0.shape

(73672, 20, 10)

In [26]:
# Shape of the treated feature array returned by process_pre_treatment().
val_arr1.shape

(52113, 20, 10)

## Export for clairvoyance benchmarks

In [57]:
def get_df(df1_pre_with_outcome, df1_post, treat, ldl_offset=2.951493):
    """Stack pre- and post-treatment records into one long-format frame.

    Parameters
    ----------
    df1_pre_with_outcome : DataFrame
        Pre-treatment rows; must contain ``patid``, ``ts``, ``ldl`` and the
        nine covariates bmi, creatinine, dbp, hdl, pulse, sbp, smokbin,
        tchol and triglycerides.
    df1_post : DataFrame
        Post-treatment rows; must contain ``patid``, ``ts`` and ``original``.
    treat : int
        Value written to ``treatment`` on every post-treatment row.
        Pre-treatment rows always get ``treatment = 0``.
    ldl_offset : float, default 2.951493
        Constant added to the pre-treatment ``ldl`` values.
        NOTE(review): presumably this undoes a centering of ``ldl`` so that
        it is on the same scale as ``original`` -- confirm.

    Returns
    -------
    DataFrame
        Columns ``id``, ``time``, the nine covariates, ``outcome`` and
        ``treatment``. Pre-treatment rows with a missing ``ldl`` are
        dropped; post-treatment rows have no covariate values (NaN).
        Neither input frame is modified.
    """
    pre_cols = ['patid', 'ts', 'bmi', 'creatinine', 'dbp', 'hdl', 'pulse',
                'sbp', 'smokbin', 'tchol', 'triglycerides', 'ldl']
    pre = df1_pre_with_outcome[pre_cols].rename(
        columns={"patid": "id", "ts": "time", "ldl": "outcome"})
    pre['treatment'] = 0
    # Copy after filtering so the assignment below acts on an independent
    # frame (avoids SettingWithCopyWarning).
    pre = pre[~pre['outcome'].isnull()].copy()
    pre['outcome'] = pre['outcome'] + ldl_offset

    post = df1_post[['patid', 'ts', 'original']].rename(
        columns={"patid": "id", "ts": "time", "original": "outcome"})
    post['treatment'] = treat

    return pds.concat([pre, post], sort=False)

In [58]:
# Long-format frames per cohort: post-treatment rows are flagged with
# treatment 1 for the treated cohort and 0 for the control cohort.
df1_out = get_df(df1_pre_with_outcome, df1_post, treat=1)
df0_out = get_df(df0_pre_with_outcome, df0_post, treat=0)

In [59]:
# Stack both cohorts and order the rows by patient id, then by time.
df_out = pds.concat([df1_out, df0_out]).sort_values(['id', 'time'])
df_out.head()

In [64]:
# One row per distinct patient id, with a constant placeholder static feature.
ind_df = df_out['id'].unique()
df_static = pds.DataFrame({'id': ind_df}).assign(x_static=0)
df_static.head()

Unnamed: 0,id,x_static
0,1025,0
1,1127,0
2,1128,0
3,1209,0
4,1273,0


In [65]:
# Export the long-format frame. to_csv also writes the row index as the
# first column by default.
df_out.to_csv('real_data3/clair_df.csv')

In [66]:
# Export the static-feature table (row index included as the first column).
df_static.to_csv('real_data3/clair_df_static.csv')