In [2]:
import pandas as pd 
import numpy as np
import os
top_dirname = '/labs/shahlab/projects/agataf/data/pooled_cohorts/'
study_path = 'MESA/raw/MESA/exam1/datasets/MESAe1FinalLabel01022018.dta'
study_path = os.path.join(top_dirname, study_path)
outcome_path = 'MESA/raw/MESA/events/datasets/MESAEvThru2017_20191004.dta'
outcome_path = os.path.join(top_dirname, outcome_path)
dirname_output = os.path.join(top_dirname, 'cohort_extraction')

os.makedirs(dirname_output, exist_ok=True)

def cond_yrsto(time, cond):
    return [left/365.25 if right == 1 else np.nan for (left,right) in zip(time, cond)]

df = pd.read_stata(study_path, convert_categoricals=False)

df = (df
      .assign(idno = lambda x: x.idno.astype(int))
      .set_index('idno', drop=True)
     )

df = df.filter(items = ['idno', 'age1c', 'race1c', 'gender1', 'bmi1c',
                        'cig1c', 'sbp1c', 'dbp1c', 'glucos1c', 'dm031c',
                        'htnmed1c', 'htn1c', 'ldl1', 'hdl1', 'chol1', 'trig1',
                        'creatin1c', 'lipid1c', 'ascvd1c', 'bpmed1', 'sttn1c',
                        'agatum1c', 'agatpm1c'])

df = df.rename(columns = {
    'glucos1c': 'glucose',
    'hdl1': 'hdlc',
    'ldl1': 'ldlc',
    'chol1': 'totchol',
    'age1c': 'age',
    'sbp1c': 'sysbp',
    'dbp1c': 'diabp',
    'bmi1c': 'bmi',
    'lipid1c': 'cholmed',
    'bpmed1': 'hyptmdsr',
    'agatum1c': 'cac',
    'agatpm1c': 'cac_phantomadjusted',
    'trig1': 'trigly'
})


df = df.assign(gender = df.gender1.map({1: 'M', 0: 'F'}),
               racegrp = df.race1c.map({3: 'B', 1: 'W'}),
               cursmoke = df.cig1c.map({2: 1, 1: 0, 0:0}),
               dm = df.dm031c.map({0:0, 1:0, 2: 1, 3: 1}),
              )

df = df.filter(items = ['glucose', 'hdlc', 'ldlc', 'totchol', 'age',
                        'sysbp', 'diabp', 'bmi', 'cholmed', 'hyptmdsr',
                        'cac', 'cac_phantomadjusted', 'gender', 'racegrp',
                        'cursmoke', 'dm', 'trigly'])

events = pd.read_stata(outcome_path, convert_categoricals=False)

events = (events
      .assign(idno = lambda x: x.idno.astype(int))
      .set_index('idno', drop=True)
     )

# TODO: check logic
events = events.assign(chddeath = lambda x: (x.dth == 1) & (x.dthtype == 1),
                       strkdeath = lambda x: (x.dth == 1) & (x.dthtype == 2),
                       timetoang = lambda x: cond_yrsto(x.angtt, x.ang),
                       timetomi = lambda x: cond_yrsto(x.mitt, x.mi),
                       timetostrk = lambda x: cond_yrsto(x.strktt, x.strk),
                       timetochddeath = lambda x: cond_yrsto(x.dthtt, x.chddeath),
                       timetostrkdeath = lambda x: cond_yrsto(x.dthtt, x.strkdeath),
                       timetodth = lambda x: x.dthtt/365.25,
                       lastfu = lambda x: x.fuptt/365.25)

events = events.rename(columns = {
    #'strk': 'str',
    'dth': 'dead'
})

events = events.filter(items = ['chddeath', 'strkdeath', 'timetoang', 'timetomi', 
                       'timetostrk', 'timetochddeath', 'timetostrkdeath',
                       'timetodth', 'lastfu', 'strk', 'dead'])

df_final = (df
            .merge(events, how='outer', on='idno')
            .assign(study = 'MESA')
            .reset_index()
            .rename(columns={'idno': 'cohort_pid'})
            .filter(['timetomi', 'timetostrk', 'timetochddeath', 'timetostrkdeath',
                   'mi', 'strk', 'chddeath', 'strkdeath', 'lastexam', 'lastfu', 
                   'timetodth', 'prevmi', 'prevstrk', 'prevproc', 'prevchf', 'prevcvd',
                   'prevchd', 'prevafib', 'hyptmdsr', 'racegrp', 'gender', 'cholmed', 
                   'age', 'totchol', 'ldlc', 'trigly', 'hdlc', 'sysbp',  
                   'dm', 'cursmoke', 'study', 'cohort_pid'])
           )

df_final.to_csv(os.path.join(dirname_output, 'mesa_all_vars.csv'), index = False)

In [None]:
'''
missing: 
 'lastexam',
 'mi',
 'prevafib',
 'prevchd',
 'prevchf',
 'prevcvd',
 'prevmi',
 'prevproc',
 'prevstrk'
'''

In [5]:
set(['timetomi', 'timetostrk', 'timetochddeath', 'timetostrkdeath',
                   'mi', 'strk', 'chddeath', 'strkdeath', 'lastexam', 'lastfu', 
                   'timetodth', 'prevmi', 'prevstrk', 'prevproc', 'prevchf', 'prevcvd',
                   'prevchd', 'prevafib', 'hyptmdsr', 'racegrp', 'gender', 'cholmed', 
                   'age', 'totchol', 'ldlc', 'trigly', 'hdlc', 'sysbp', 
                   'dm', 'cursmoke', 'study', 'cohort_pid']) -set(df_final.columns)

{'lastexam',
 'mi',
 'prevafib',
 'prevchd',
 'prevchf',
 'prevcvd',
 'prevmi',
 'prevproc',
 'prevstrk'}