In [142]:
import pandas as pds
import numpy as np

# Loading data

In [143]:
# Measurement tables for the control arm (suffix 0), before and after treatment.
df0_pre, df0_post = (
    pds.read_csv('real_data/df_control_pre_treatment.csv.gz'),
    pds.read_csv('real_data/df_control_post_treatment.csv.gz'),
)

# Measurement tables for the treated arm (suffix 1), before and after treatment.
df1_pre, df1_post = (
    pds.read_csv('real_data/df_treated_pre_treatment.csv.gz'),
    pds.read_csv('real_data/df_treated_post_treatment.csv.gz'),
)

# Cohort tables, one per arm.
df0, df1 = (
    pds.read_csv('real_data/df_control_cohort.csv.gz'),
    pds.read_csv('real_data/df_treated_cohort.csv.gz'),
)

# Outcome tensor

In [144]:
# Preview the treated-arm post-treatment measurements.
df1_post.head()

Unnamed: 0,patid,statins_prscd,exp_date,exposure,original,ts
0,1025,2009-04-20,2009-05-27,ldl,2.2,0.10137
1,1025,2009-04-20,2011-03-01,ldl,2.6,1.863014
2,1025,2009-04-20,2012-03-29,ldl,2.3,2.942466
3,1025,2009-04-20,2013-01-16,ldl,2.6,3.745205
4,1025,2009-04-20,2014-09-08,ldl,1.9,5.389041


In [145]:
# get_post_treatment() reads an `enroll_date` column; for the treated table it
# is taken from `statins_prscd` (presumably the statin prescription date).
# This adds the column to df1_post in place.
df1_post['enroll_date'] = df1_post['statins_prscd']

In [146]:
def get_post_treatment(df, n_year=3):
    """Average each patient's outcome over the first ``n_year`` years of follow-up.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format measurements with columns ``patid``, ``exp_date``,
        ``enroll_date`` and ``original``. The two date columns may hold
        strings or datetimes; they are parsed with ``pds.to_datetime``.
        The input frame is not modified.
    n_year : int or float, default 3
        Length of the follow-up window, counted in 365-day years.

    Returns
    -------
    pandas.DataFrame
        One row per ``patid`` that has at least one measurement fewer than
        ``365 * n_year`` whole days after ``enroll_date``, with the mean of
        ``original`` over those measurements. Rows are in sorted ``patid``
        order (``groupby`` default).

    Notes
    -----
    Only an upper bound is applied, so measurements dated before
    ``enroll_date`` (negative elapsed time) are kept. Rows whose elapsed time
    is missing (NaT) are dropped.
    """
    df = df.copy()
    df['exp_date'] = pds.to_datetime(df['exp_date'])
    df['enroll_date'] = pds.to_datetime(df['enroll_date'])

    df['time_since_enroll'] = df['exp_date'] - df['enroll_date']

    # `.dt.days` gives whole days directly. The previous
    # `.astype('timedelta64[D]').astype('int')` is rejected by pandas 2.x
    # (day resolution is unsupported) and fails on NaT.
    days_since_enroll = df['time_since_enroll'].dt.days
    df_window = df[days_since_enroll < 365 * n_year]

    df_window_out = df_window.groupby('patid').agg({'original': 'mean'}).reset_index()
    return df_window_out

In [188]:
# Per-patient mean outcome over a three-year follow-up window, one table per arm.
# NOTE(review): get_post_treatment needs an `enroll_date` column; this notebook
# only adds one to df1_post, so df0_post must already carry it. Confirm.
df1_outcome = get_post_treatment(df=df1_post, n_year=3)
df0_outcome = get_post_treatment(df=df0_post, n_year=3)

In [189]:
# Preview the per-patient mean outcome for the treated arm.
df1_outcome.head()

Unnamed: 0,patid,original
0,1025,2.366667
1,1128,2.3
2,1433,2.625
3,1595,1.87
4,1633,2.85


In [192]:
# Outcome arrays with two trailing singleton axes: shape (n_patients, 1, 1).
Y0 = df0_outcome['original'].to_numpy().reshape(-1, 1, 1)
Y1 = df1_outcome['original'].to_numpy().reshape(-1, 1, 1)

In [193]:
# Expect (n_control_patients, 1, 1) after adding the two singleton axes.
Y0.shape

(151260, 1, 1)

# Pre-treatment tensor

In [157]:
# Keep only treated pre-treatment rows whose patient also has an outcome.
df1_pre_with_outcome = df1_pre.loc[df1_pre['patid'].isin(df1_outcome['patid'])]

In [161]:
# Keep only control pre-treatment rows whose patient also has an outcome.
df0_pre_with_outcome = df0_pre.loc[df0_pre['patid'].isin(df0_outcome['patid'])]

In [162]:
# Size of the control pre-treatment table after the outcome filter.
df0_pre_with_outcome.shape

(1435557, 14)

In [163]:
# Preview the filtered treated pre-treatment rows.
df1_pre_with_outcome.head()

Unnamed: 0,patid,statins_prscd,exp_date,ts,bmi,creatinine,dbp,hdl,ldl,pulse,sbp,smokbin,tchol,triglycerides
0,1025,2009-04-20,2006-03-17,-3.09589,0.816792,-0.760384,2.340395,-1.092967,1.461241,0.206718,1.67812,,1.469227,2.031118
1,1025,2009-04-20,2006-03-24,-3.076712,,,1.462977,,,,0.503195,,,
2,1025,2009-04-20,2006-04-27,-2.983562,,,1.199751,,,,0.758613,,,
3,1025,2009-04-20,2006-05-25,-2.906849,,,1.375235,,,,0.860781,,,
4,1025,2009-04-20,2006-07-05,-2.794521,,,1.901686,,,-0.776326,0.758613,,,


In [164]:
def padd(arr, n, d):
    """Zero-pad ``arr`` along axis 0 up to ``n`` entries and build a matching mask.

    Parameters
    ----------
    arr : numpy.ndarray
        1-D array of length <= ``n``, or 2-D array with <= ``n`` rows.
    n : int
        Target length along axis 0.
    d : int
        Width of the padding rows for non-1-D input; it must equal
        ``arr.shape[1]`` or the concatenation fails. Unused for 1-D input.

    Returns
    -------
    (arr_new, mask_new) : tuple of numpy.ndarray
        ``arr_new`` is ``arr`` followed by zeros. ``mask_new`` has the same
        shape, with 1 at every original position (NaNs included) and 0 at
        every padded position.

    Raises
    ------
    ValueError
        If ``arr`` has more than ``n`` entries along axis 0 (NumPy rejects the
        negative padding size).
    """
    lack = n - arr.shape[0]
    # 1-D input gets a flat run of zeros; anything else gets `lack` rows of width `d`.
    pad_shape = (lack,) if len(arr.shape) == 1 else (lack, d)
    filler = np.zeros(pad_shape)

    arr_new = np.concatenate((arr, filler), axis=0)
    mask_new = np.concatenate((np.ones_like(arr), filler), axis=0)
    return arr_new, mask_new

In [165]:
def process_pre_treatment(df, max_len=20, feature_cols=None):
    """Turn long-format pre-treatment rows into padded per-patient tensors.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (patient, visit) with columns ``patid``, ``ts`` and every
        name in ``feature_cols``. Missing measurements are NaN.
    max_len : int, default 20
        Number of visit slots per patient. Patients with fewer rows are
        zero-padded; ``padd`` raises ``ValueError`` for a patient with more.
    feature_cols : sequence of str, optional
        Covariate columns, in output order. Defaults to the ten columns
        ``bmi, creatinine, dbp, hdl, ldl, pulse, sbp, smokbin, tchol,
        triglycerides``.

    Returns
    -------
    patid_list : list
        Patient ids in sorted order (``groupby`` default); index ``i`` matches
        row ``i`` of every returned array.
    val_arr : numpy.ndarray, shape (n_patients, max_len, n_features)
        Covariate values, with NaNs and padding set to 0.
    val_mask_arr : numpy.ndarray, same shape as ``val_arr``
        1.0 where a value was recorded, 0.0 where it was NaN or padding.
    ts_arr : numpy.ndarray, shape (n_patients, max_len)
        Visit timestamps, zero-padded.
    ts_mask_arr : numpy.ndarray, same shape as ``ts_arr``
        1.0 for real visits, 0.0 for padding.

    An empty ``df`` yields an empty ``patid_list`` and arrays with a leading
    dimension of 0.
    """
    if feature_cols is None:
        feature_cols = ['bmi', 'creatinine', 'dbp', 'hdl', 'ldl', 'pulse',
                        'sbp', 'smokbin', 'tchol', 'triglycerides']
    feature_cols = list(feature_cols)
    n_feat = len(feature_cols)

    patid_list = []
    ts_list = []
    mat_list = []
    ts_mask_list = []
    mat_mask_list = []

    for group_name, s in df.groupby('patid'):
        ts_padded, ts_mask = padd(s['ts'].values, max_len, n_feat)

        mat = s[feature_cols].values
        mat_padded, row_mask = padd(mat, max_len, n_feat)
        # An entry counts as observed only if it sits in a real (non-padded)
        # row AND is not NaN.
        observed = ~np.isnan(mat_padded)
        mat_mask = ((row_mask == 1) & observed).astype('float')
        # Zero out NaNs only after the mask has captured them.
        mat_padded[~observed] = 0

        patid_list.append(group_name)
        ts_list.append(ts_padded)
        mat_list.append(mat_padded)
        ts_mask_list.append(ts_mask)
        mat_mask_list.append(mat_mask)

    if not patid_list:
        # np.stack rejects an empty list; return correctly shaped empty tensors.
        return (
            patid_list,
            np.zeros((0, max_len, n_feat)),
            np.zeros((0, max_len, n_feat)),
            np.zeros((0, max_len)),
            np.zeros((0, max_len)),
        )

    ts_arr = np.stack(ts_list, axis=0)
    val_arr = np.stack(mat_list, axis=0)
    ts_mask_arr = np.stack(ts_mask_list, axis=0)
    val_mask_arr = np.stack(mat_mask_list, axis=0)
    return patid_list, val_arr, val_mask_arr, ts_arr, ts_mask_arr

In [166]:
# Padded covariate / timestamp tensors and their masks for the treated arm.
(patid_list1, val_arr1, val_mask_arr1,
 ts_arr1, ts_mask_arr1) = process_pre_treatment(df1_pre_with_outcome)

# Same for the control arm.
(patid_list0, val_arr0, val_mask_arr0,
 ts_arr0, ts_mask_arr0) = process_pre_treatment(df0_pre_with_outcome)

# Patient ids as arrays, in the same row order as the tensors.
patid0 = np.asarray(patid_list0)
patid1 = np.asarray(patid_list1)


In [71]:
# Persist the treated-arm tensors (suffix 1).
np.save('real_data/val_arr1', val_arr1)
np.save('real_data/val_mask_arr1', val_mask_arr1)
np.save('real_data/ts_arr1', ts_arr1)
np.save('real_data/ts_mask_arr1', ts_mask_arr1)
np.save('real_data/patid1', patid1)

# Persist the control-arm tensors (suffix 0). These previously wrote the
# treated arrays (val_arr1, ...) under the control file names, so the control
# files did not match patid0 / Y0.
np.save('real_data/val_arr0', val_arr0)
np.save('real_data/val_mask_arr0', val_mask_arr0)
np.save('real_data/ts_arr0', ts_arr0)
np.save('real_data/ts_mask_arr0', ts_mask_arr0)
np.save('real_data/patid0', patid0)


In [194]:
# Persist the per-patient outcome arrays (shape (n_patients, 1, 1)) for both arms.
np.save('real_data/Y0', Y0)
np.save('real_data/Y1', Y1)


### Checking that pre- and post-treatment data match on patid

In [186]:
# Count positions where the outcome table and the pre-treatment tensors
# disagree on patid; 0 means row i of Y1 and val_arr1 is the same patient.
# The element-wise comparison assumes both sides have the same length.
np.sum(df1_outcome.patid.values != np.array(patid_list1))

0

In [187]:
# Count positions where the outcome table and the pre-treatment tensors
# disagree on patid; 0 means row i of Y0 and val_arr0 is the same patient.
# The element-wise comparison assumes both sides have the same length.
np.sum(df0_outcome.patid.values != np.array(patid_list0))

0

In [195]:
# Control covariate tensor: (n_patients, 20 visit slots, 10 covariates).
val_arr0.shape

(151260, 20, 10)

In [177]:
# Outcomes restricted to patients that appear in the pre-treatment tensors.
df1_outcome_with_cov = df1_outcome.loc[df1_outcome['patid'].isin(patid_list1)]
df0_outcome_with_cov = df0_outcome.loc[df0_outcome['patid'].isin(patid_list0)]

In [178]:
# If this matches df1_outcome.shape, every treated patient with an outcome
# also has pre-treatment rows.
df1_outcome_with_cov.shape

(67736, 2)

In [181]:
# If this matches df0_outcome.shape, every control patient with an outcome
# also has pre-treatment rows.
df0_outcome_with_cov.shape

(151260, 2)

In [179]:
# Reference count: treated patients with an outcome in the follow-up window.
df1_outcome.shape

(67736, 2)

In [180]:
# Reference count: control patients with an outcome in the follow-up window.
df0_outcome.shape

(151260, 2)

In [175]:
# Number of treated patients in the pre-treatment tensors.
len(patid_list1)

67736

In [176]:
# Number of control patients in the pre-treatment tensors.
len(patid_list0)

151260

In [167]:
# Distinct control patients in the filtered pre-treatment table; should equal
# len(patid_list0), since the tensors are built by grouping this table on patid.
len(df0_pre_with_outcome['patid'].unique())

151260

In [168]:
# Distinct treated patients in the filtered pre-treatment table; should equal
# len(patid_list1), since the tensors are built by grouping this table on patid.
len(df1_pre_with_outcome['patid'].unique())

67736

In [169]:
# Distinct control patients with any post-treatment row, before the
# follow-up window is applied.
len(df0_post['patid'].unique())

191537

In [170]:
# Distinct treated patients with any post-treatment row, before the
# follow-up window is applied.
len(df1_post['patid'].unique())

74119

In [133]:
# Raw size of the control post-treatment table.
df0_post.shape

(462556, 6)

In [134]:
# Raw size of the treated post-treatment table.
df1_post.shape

(552782, 7)

In [76]:
# Scratch copy of the control post-treatment table; the cells that follow
# replay the body of get_post_treatment() one statement at a time.
df = df0_post.copy()

In [77]:
# Parse both date columns so they can be subtracted.
# NOTE(review): assumes df0_post already has an `enroll_date` column; no cell
# in this notebook adds one for the control arm. Confirm.
df.exp_date = pds.to_datetime(df.exp_date)
df.enroll_date = pds.to_datetime(df.enroll_date)

In [90]:
# Elapsed time between enrolment and each measurement; the three-year
# follow-up filter is applied to this column.

df['time_since_enroll'] = df.exp_date - df.enroll_date

In [107]:
# Keep measurements taken fewer than 3 * 365 whole days after enrolment.
# `.dt.days` replaces `.astype('timedelta64[D]').astype('int')`, which
# pandas 2.x rejects (day resolution is unsupported) and which fails on NaT.
df_3yr = df[df.time_since_enroll.dt.days < 365 * 3]

In [111]:
# Mean outcome per patient inside the window, with patid back as a column.
df_3yr_out = df_3yr.groupby('patid')['original'].mean().reset_index()

In [113]:
# Number of control patients with at least one measurement inside the window.
df_3yr_out.shape

(151260, 2)

In [117]:
# Disabled check: the same per-patient mean, but over measurements taken at or
# after three years (>= 365 * 3 days) instead of before.
# df_3yr_plus = df[df.time_since_enroll.astype('timedelta64[D]').astype('int') >= 365 * 3]
# df_3yr_out_plus = df_3yr_plus.groupby('patid').agg({'original': 'mean'}).reset_index()
# df_3yr_out_plus.shape

In [62]:
# The timestamp mask is built from ones and zeros only, so this should be 0.
np.sum(np.isnan(ts_mask_arr1))

0

In [53]:
# Treated covariate tensor: (n_patients, 20, 10).
# NOTE(review): the recorded first dimension (96311) differs from the 67736
# treated patients reported elsewhere in this notebook; this output looks
# like it comes from an earlier run. Re-run to confirm.
val_arr1.shape

(96311, 20, 10)

In [54]:
# Number of treated patients in the tensors.
# NOTE(review): the recorded value (96311) disagrees with the 67736 shown by
# the same expression elsewhere in this notebook; likely a stale output.
len(patid_list1)

96311

In [55]:
# The covariate mask must have the same shape as val_arr1.
# NOTE(review): the recorded 96311 rows look stale (67736 elsewhere); re-run.
val_mask_arr1.shape

(96311, 20, 10)

In [56]:
# Treated timestamp tensor: (n_patients, 20).
# NOTE(review): the recorded 96311 rows look stale (67736 elsewhere); re-run.
ts_arr1.shape

(96311, 20)

In [57]:
# The timestamp mask must have the same shape as ts_arr1.
# NOTE(review): the recorded 96311 rows look stale (67736 elsewhere); re-run.
ts_mask_arr1.shape

(96311, 20)

In [5]:
# Pull out a single treated patient (patid 1025) as a worked example for the
# padding and masking steps.
s = df1_pre.loc[df1_pre['patid'] == 1025]
s.head()

Unnamed: 0,patid,statins_prscd,exp_date,ts,bmi,creatinine,dbp,hdl,ldl,pulse,sbp,smokbin,tchol,triglycerides
0,1025,2009-04-20,2006-03-17,-3.09589,0.816792,-0.760384,2.340395,-1.092967,1.461241,0.206718,1.67812,,1.469227,2.031118
1,1025,2009-04-20,2006-03-24,-3.076712,,,1.462977,,,,0.503195,,,
2,1025,2009-04-20,2006-04-27,-2.983562,,,1.199751,,,,0.758613,,,
3,1025,2009-04-20,2006-05-25,-2.906849,,,1.375235,,,,0.860781,,,
4,1025,2009-04-20,2006-07-05,-2.794521,,,1.901686,,,-0.776326,0.758613,,,


In [6]:
# Pad this patient's timestamps to 20 slots (`d` is unused for 1-D input).
ts = s['ts'].to_numpy()
ts_padded, ts_mask = padd(arr=ts, n=20, d=10)

In [22]:
# Padded timestamps for the example patient: real values first, zeros after.
ts_padded

array([-3.09589041, -3.07671233, -2.98356164, -2.90684932, -2.79452055,
       -2.3369863 , -2.04657534, -1.97260274, -0.28767123, -0.22465753,
       -0.18630137, -0.07123288,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

In [23]:
# 1 marks a real timestamp, 0 marks padding. The mask matters because a real
# timestamp can itself be 0: the recorded output has 13 ones in the mask but
# only 12 non-zero entries in ts_padded.
ts_mask

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 0.])

In [24]:
# Covariate matrix for the example patient, one row per visit.
mat = s.loc[:, ['bmi','creatinine','dbp','hdl','ldl','pulse','sbp','smokbin','tchol','triglycerides']].to_numpy()

# Pad to 20 rows; mat_mask starts as "is this a real (non-padded) row?".
mat_padded, mat_mask = padd(arr=mat, n=20, d=10)

# 1.0 where a value is present, 0.0 where it is NaN.
mat_nan = (~np.isnan(mat_padded)).astype('float')

# Final mask: real row AND value present.
mat_mask = np.logical_and(mat_mask == 1, mat_nan == 1).astype('float')

In [35]:
# Rows are visit slots (padded to 20), columns are the 10 covariates;
# 1 only where a value was actually recorded.
mat_mask

array([[1., 1., 1., 1., 1., 1., 1., 0., 1., 1.],
       [0., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 1., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 1., 1., 0., 0., 0.],
       [1., 1., 1., 1., 1., 0., 1., 0., 1., 1.],
       [0., 0., 1., 0., 0., 1., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 1., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [37]:
# Replace missing measurements with 0 in place; mat_mask, computed before
# this step, still records which entries were missing.
mat_padded[np.isnan(mat_padded)] = 0

In [40]:
# First 5000 rows of the treated pre-treatment table, as a small sample.
# NOTE(review): the name `df` is also used for the control post-treatment copy
# in another cell; a distinct name would avoid hidden-state confusion.
df = df1_pre.head(5000)



In [45]:
# NOTE(review): `ts_arr` is not assigned in any cell of this notebook.
# Presumably it came from running process_pre_treatment on the 5000-row
# sample in a since-deleted cell; confirm or remove.
ts_arr.shape

(482, 20)

In [47]:
# NOTE(review): `val_arr` is not assigned in any cell of this notebook.
# Presumably it came from running process_pre_treatment on the 5000-row
# sample in a since-deleted cell; confirm or remove.
val_arr.shape

(482, 20, 10)

In [48]:
# NOTE(review): `ts_mask_arr` is not assigned in any cell of this notebook.
# Presumably it came from running process_pre_treatment on the 5000-row
# sample in a since-deleted cell; confirm or remove.
ts_mask_arr.shape

(482, 20)

In [49]:
# NOTE(review): `val_mask_arr` is not assigned in any cell of this notebook.
# Presumably it came from running process_pre_treatment on the 5000-row
# sample in a since-deleted cell; confirm or remove.
val_mask_arr.shape

(482, 20, 10)