# ARRANGE CARDIOVASCULAR DISEASE DATA 

### 1. Get and arrange data
### 2. Define functions to generate variables
### 3. Generate variables including ICD10 or ICD9 codes

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import StringIO
import seaborn as sns
import scipy.stats as stats
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 40)

import matplotlib as mpl
mpl.rcParams['figure.dpi']= 300

# 1. Get and arrange data

In [2]:
### OPEN CVD DATA generated following the steps from the README.md
### Data-fields are indicated in the paper

# Read data
filename = "/workspace/datasets/ukbiobank_ch/ukb_data_670124/clinical_data_ukb670124/cvd_data_670124.txt"
# Use a context manager so the file handle is closed as soon as the text has
# been read (the previous bare open() left it to the garbage collector).
with open(filename, 'rt') as handle:
    lines = handle.read()

# Transform to df
cvd_ukb200k = pd.read_csv(StringIO(lines), sep= '\t')
cvd_ukb200k

In [8]:
# All variable codes

# Keep only the field code, i.e. the part of each column label before the
# first '-', then show the distinct codes.
colnoms = [label.split('-')[0] for label in cvd_ukb200k.columns.tolist()]
set(colnoms)

{'130708',
 '130709',
 '130712',
 '130713',
 '130714',
 '130715',
 '131270',
 '131271',
 '131272',
 '131273',
 '131274',
 '131275',
 '131276',
 '131277',
 '131278',
 '131279',
 '131280',
 '131281',
 '131282',
 '131283',
 '131284',
 '131285',
 '131286',
 '131287',
 '131288',
 '131289',
 '131290',
 '131291',
 '131292',
 '131293',
 '131294',
 '131295',
 '131296',
 '131297',
 '131298',
 '131299',
 '131300',
 '131301',
 '131302',
 '131303',
 '131304',
 '131305',
 '131306',
 '131307',
 '131308',
 '131309',
 '131310',
 '131311',
 '131312',
 '131313',
 '131314',
 '131315',
 '131316',
 '131317',
 '131318',
 '131319',
 '131320',
 '131321',
 '131322',
 '131323',
 '131324',
 '131325',
 '131326',
 '131327',
 '131328',
 '131329',
 '131330',
 '131331',
 '131332',
 '131333',
 '131334',
 '131335',
 '131336',
 '131337',
 '131338',
 '131339',
 '131340',
 '131341',
 '131342',
 '131343',
 '131344',
 '131345',
 '131346',
 '131347',
 '131348',
 '131349',
 '131350',
 '131351',
 '131352',
 '131353',
 '131354',

In [9]:
# Function to obtain the index from columns

def columna(data, name):
    """Locate the columns of ``data`` whose label starts with ``name``.

    Both the labels and ``name`` are compared as strings.

    Returns a list of ``[position, label]`` pairs in column order; the list
    is empty when nothing matches.
    """
    prefix = str(name)
    return [
        [position, label]
        for position, label in enumerate(data.columns.tolist())
        if str(label).startswith(prefix)
    ]
    

In [13]:
# Latest date of death present in the data (rows without a date are ignored)
death_dates = cvd_ukb200k['40000-0.0']
death_dates[death_dates.notnull()].max()

'2021-11-12'

## 2. Define functions to generate variables

In [17]:
# Short names of the outcomes defined in section 3: myocardial infarction (MI),
# stroke (STR), ischemic stroke (ISTR), coronary artery disease (CAD),
# heart failure (HF), atrial fibrillation (AF) and atherosclerosis (ATH).
cvd_var = ['MI', 'STR', 'ISTR', 'CAD', 'HF', 'AF', 'ATH']

In [63]:
##### 
##### SUMMARY DIAGNOSIS   
#####

# 41202 [944:1023]  Diagnoses - main ICD10 (79)
# 41262 [1106:1185]  Date of first in-patient diagnosis - main ICD10 (79)
# 41203 [1023:1051]   Diagnoses - main ICD9 (28)
# 41263 [1185:1213]   Date of first in-patient diagnosis - main ICD9 (28)
# 41270 [1213:1456]  Diagnoses - ICD10 (243)
# 41280 [1627:1870] Date of first in-patient diagnosis - ICD10 (243)
# 41271 [1456:1503] Diagnoses - ICD9 (47)
# 41281 [1870:1917] Date of first in-patient diagnosis - ICD9 (47)

# Placeholder dates to discard: they do not describe a usable event date, so
# the date-based functions below replace them with a missing value.
#1900-01-01 represents "Code has no event date"
#1901-01-01 represents "Code has event date before participant's date of birth"
#1902-02-02 represents "Code has event date matching participant's date of birth"
#1903-03-03 represents "Code has event date after participant's date of birth same calendar year as date of birth"
#2037-07-07 represents "Code has event date in the future and is presumed to be a place-holder or other system default"
remove_dates = ['1900-01-01', '1901-01-01', '1902-02-02', '1903-03-03', '2037-07-07']

def Diagnose_variable(patient, codes=None, invalid_dates=None):
    """Return the matching diagnosis code(s) recorded at the earliest valid date.

    Parameters
    ----------
    patient : pandas.Series
        One row of the CVD table. Values are read by position: diagnosis
        codes at 944-1050 and 1213-1502, their dates at 1106-1212 and
        1627-1916 (the i-th code column pairs with the i-th date column).
    codes : str or tuple of str, optional
        Code prefixes to search for. When omitted, the module-level
        ``icd_10_9`` is read at call time.
    invalid_dates : collection of str, optional
        Placeholder dates treated as missing. When omitted, the module-level
        ``remove_dates`` is read at call time.

    Returns
    -------
    str or float
        The distinct matching codes sharing the earliest valid date, sorted
        and joined with ``', '``; ``np.nan`` when no code matches or none of
        the matches has a valid date.
    """
    if codes is None:
        codes = icd_10_9
    if invalid_dates is None:
        invalid_dates = remove_dates

    # define column
    column_diseases = list(range(944,1023)) + list(range(1023,1051)) + list(range(1213,1456)) + list(range(1456,1503))
    column_dates = list(range(1106,1185)) + list(range(1185,1213)) + list(range(1627,1870)) + list(range(1870,1917))

    # select columns; .iloc makes the positional lookup explicit, because
    # integer keys passed to [] on a label-indexed Series are deprecated.
    diseases = patient.iloc[column_diseases].tolist()
    dates = patient.iloc[column_dates].tolist()

    # Keep matching diagnoses whose date is present and not a placeholder.
    dated_matches = [
        (code, date)
        for code, date in zip(diseases, dates)
        if str(code).startswith(codes)
        and not pd.isna(date)
        and date not in invalid_dates
    ]
    if not dated_matches:
        return np.nan

    # Dates are 'YYYY-MM-DD' strings, so the lexicographic minimum is the earliest.
    mindate = min(date for _, date in dated_matches)
    earliest_codes = {str(code) for code, date in dated_matches if date == mindate}
    # Sorted so the joined string does not depend on set iteration order.
    return ', '.join(sorted(earliest_codes))
    
def Diagnose_date(patient, codes=None, invalid_dates=None):
    """Return the earliest valid date of a matching diagnosis code.

    Parameters
    ----------
    patient : pandas.Series
        One row of the CVD table. Values are read by position: diagnosis
        codes at 944-1050 and 1213-1502, their dates at 1106-1212 and
        1627-1916 (the i-th code column pairs with the i-th date column).
    codes : str or tuple of str, optional
        Code prefixes to search for. When omitted, the module-level
        ``icd_10_9`` is read at call time.
    invalid_dates : collection of str, optional
        Placeholder dates treated as missing. When omitted, the module-level
        ``remove_dates`` is read at call time.

    Returns
    -------
    str or float
        The earliest valid date among the matching diagnoses, or ``np.nan``
        when no code matches or none of the matches has a valid date.
    """
    if codes is None:
        codes = icd_10_9
    if invalid_dates is None:
        invalid_dates = remove_dates

    # define column
    column_diseases = list(range(944,1023)) + list(range(1023,1051)) + list(range(1213,1456)) + list(range(1456,1503))
    column_dates = list(range(1106,1185)) + list(range(1185,1213)) + list(range(1627,1870)) + list(range(1870,1917))

    # select columns; .iloc makes the positional lookup explicit, because
    # integer keys passed to [] on a label-indexed Series are deprecated.
    diseases = patient.iloc[column_diseases].tolist()
    dates = patient.iloc[column_dates].tolist()

    # Dates of matching diagnoses, skipping missing and placeholder values.
    valid_dates = [
        date
        for code, date in zip(diseases, dates)
        if str(code).startswith(codes)
        and not pd.isna(date)
        and date not in invalid_dates
    ]
    if not valid_dates:
        return np.nan
    # Dates are 'YYYY-MM-DD' strings, so the lexicographic minimum is the earliest.
    return min(valid_dates)

In [19]:
##### 
##### SELF RECORDS DIAGNOSIS
#####

# 20002 [6:142]  Non-cancer illness, self-reported (136)
# 20009 [406:542]  Age

def SRD_variable(patient, codes=None):
    """Return the matching self-reported illness code(s) at the youngest known age.

    Parameters
    ----------
    patient : pandas.Series
        One row of the CVD table. Values are read by position: illness codes
        at 6-141, the corresponding ages at 406-541.
    codes : str or tuple of str, optional
        Code prefixes to search for (compared against ``str(code)``). When
        omitted, the module-level ``srdisease`` is read at call time.

    Returns
    -------
    str or float
        The distinct matching codes (as strings) reported at the minimum
        known age, sorted and joined with ``', '``. When no matching report
        has a known age, every matching code is returned. ``np.nan`` when no
        code matches.
    """
    if codes is None:
        codes = srdisease

    # define columns
    column_diseases = list(range(6,142))
    column_dates = list(range(406,542))

    # select columns; .iloc makes the positional lookup explicit, because
    # integer keys passed to [] on a label-indexed Series are deprecated.
    diseases = patient.iloc[column_diseases].tolist()
    ages = patient.iloc[column_dates].tolist()

    reports = [(code, age) for code, age in zip(diseases, ages)
               if str(code).startswith(codes)]
    if not reports:
        return np.nan

    # Ages coded -1 or -3 carry no usable age and are handled like missing values.
    with_age = [(code, age) for code, age in reports
                if not pd.isna(age) and age not in (-1, -3)]
    if with_age:
        first_age = min(age for _, age in with_age)
        # Compare ages numerically: a string-prefix comparison would also
        # select e.g. 50.55 when the minimum is 50.5.
        selected = [code for code, age in with_age if age == first_age]
    else:
        # No usable age at all: keep every matching report.
        selected = [code for code, _ in reports]

    # Sorted so the joined string does not depend on set iteration order.
    return ', '.join(sorted({str(code) for code in selected}))
    
def SRD_age(patient, codes=None):
    """Return the youngest known age at which a matching illness was self-reported.

    Parameters
    ----------
    patient : pandas.Series
        One row of the CVD table. Values are read by position: illness codes
        at 6-141, the corresponding ages at 406-541.
    codes : str or tuple of str, optional
        Code prefixes to search for (compared against ``str(code)``). When
        omitted, the module-level ``srdisease`` is read at call time.

    Returns
    -------
    float
        The minimum known age among the matching reports, or ``np.nan`` when
        no code matches or no matching report has a known age.
    """
    if codes is None:
        codes = srdisease

    # define columns
    column_diseases = list(range(6,142))
    column_dates = list(range(406,542))

    # select columns; .iloc makes the positional lookup explicit, because
    # integer keys passed to [] on a label-indexed Series are deprecated.
    diseases = patient.iloc[column_diseases].tolist()
    ages = patient.iloc[column_dates].tolist()

    # Ages coded -1 or -3 carry no usable age and are handled like missing values.
    known_ages = [
        age
        for code, age in zip(diseases, ages)
        if str(code).startswith(codes)
        and not pd.isna(age)
        and age not in (-1, -3)
    ]
    # Taking min() only when something is left avoids the all-NaN
    # RuntimeWarning that np.nanmin emits.
    return min(known_ages) if known_ages else np.nan

In [20]:
##### 
##### SELF-REPORTED OPERATIONS
#####

# 20004 [142:270] Operation code, self-reported (128)
# 20011 [670:798] Age (128)

def SRO_variable(patient, codes=None):
    """Return the matching self-reported operation code(s) at the youngest known age.

    Parameters
    ----------
    patient : pandas.Series
        One row of the CVD table. Values are read by position: operation
        codes at 142-269, the corresponding ages at 670-797.
    codes : str or tuple of str, optional
        Code prefixes to search for (compared against ``str(code)``). When
        omitted, the module-level ``sroperation`` is read at call time.

    Returns
    -------
    str or float
        The distinct matching codes (as strings) reported at the minimum
        known age, sorted and joined with ``', '``. When no matching report
        has a known age, every matching code is returned. ``np.nan`` when no
        code matches.
    """
    if codes is None:
        codes = sroperation

    # define columns
    column_diseases = list(range(142,270))
    column_dates = list(range(670,798))

    # select columns; .iloc makes the positional lookup explicit, because
    # integer keys passed to [] on a label-indexed Series are deprecated.
    operations = patient.iloc[column_diseases].tolist()
    ages = patient.iloc[column_dates].tolist()

    reports = [(code, age) for code, age in zip(operations, ages)
               if str(code).startswith(codes)]
    if not reports:
        return np.nan

    # Ages coded -1 or -3 carry no usable age and are handled like missing values.
    with_age = [(code, age) for code, age in reports
                if not pd.isna(age) and age not in (-1, -3)]
    if with_age:
        first_age = min(age for _, age in with_age)
        # Compare ages numerically: a string-prefix comparison would also
        # select e.g. 50.55 when the minimum is 50.5.
        selected = [code for code, age in with_age if age == first_age]
    else:
        # No usable age at all: keep every matching report.
        selected = [code for code, _ in reports]

    # Sorted so the joined string does not depend on set iteration order.
    return ', '.join(sorted({str(code) for code in selected}))
    
def SRO_age(patient, codes=None):
    """Return the youngest known age at which a matching operation was self-reported.

    Parameters
    ----------
    patient : pandas.Series
        One row of the CVD table. Values are read by position: operation
        codes at 142-269, the corresponding ages at 670-797.
    codes : str or tuple of str, optional
        Code prefixes to search for (compared against ``str(code)``). When
        omitted, the module-level ``sroperation`` is read at call time.

    Returns
    -------
    float
        The minimum known age among the matching reports, or ``np.nan`` when
        no code matches or no matching report has a known age.
    """
    if codes is None:
        codes = sroperation

    # define columns
    column_diseases = list(range(142,270))
    column_dates = list(range(670,798))

    # select columns; .iloc makes the positional lookup explicit, because
    # integer keys passed to [] on a label-indexed Series are deprecated.
    operations = patient.iloc[column_diseases].tolist()
    ages = patient.iloc[column_dates].tolist()

    # Ages coded -1 or -3 carry no usable age and are handled like missing values.
    known_ages = [
        age
        for code, age in zip(operations, ages)
        if str(code).startswith(codes)
        and not pd.isna(age)
        and age not in (-1, -3)
    ]
    # Taking min() only when something is left avoids the all-NaN
    # RuntimeWarning that np.nanmin emits.
    return min(known_ages) if known_ages else np.nan

In [22]:
#Operative procedures
#41272 [1503:1627] Operative procedures - OPCS4 (124)
#41282 [1917:2041] Date OPCS4 (124)
#41200 [889:944] Operative procedures -main OPCS4 (55)
#41260 [1051:1106] Date of first operative procedure (55)


def OP_variable(patient, codes=None, invalid_dates=None):
    """Return the matching operative-procedure code(s) at the earliest valid date.

    Parameters
    ----------
    patient : pandas.Series
        One row of the CVD table. Values are read by position: procedure
        codes at 1503-1626 and 889-943, their dates at 1917-2040 and
        1051-1105 (the i-th code column pairs with the i-th date column).
    codes : str or tuple of str, optional
        Code prefixes to search for. When omitted, the module-level
        ``opcode`` is read at call time.
    invalid_dates : collection of str, optional
        Placeholder dates treated as missing. When omitted, the module-level
        ``remove_dates`` is read at call time.

    Returns
    -------
    str or float
        The distinct matching codes sharing the earliest valid date, sorted
        and joined with ``', '``; ``np.nan`` when no code matches or none of
        the matches has a valid date.
    """
    if codes is None:
        codes = opcode
    if invalid_dates is None:
        invalid_dates = remove_dates

    # define columns
    column_diseases = list(range(1503,1627)) + list(range(889,944))
    column_dates = list(range(1917,2041)) + list(range(1051,1106))

    # select columns; .iloc makes the positional lookup explicit, because
    # integer keys passed to [] on a label-indexed Series are deprecated.
    procedures = patient.iloc[column_diseases].tolist()
    dates = patient.iloc[column_dates].tolist()

    # Missing dates are dropped together with the placeholder dates; leaving
    # a float NaN among the date strings makes min() raise TypeError.
    dated_matches = [
        (code, date)
        for code, date in zip(procedures, dates)
        if str(code).startswith(codes)
        and not pd.isna(date)
        and date not in invalid_dates
    ]
    if not dated_matches:
        return np.nan

    # Dates are 'YYYY-MM-DD' strings, so the lexicographic minimum is the earliest.
    mindate = min(date for _, date in dated_matches)
    earliest_codes = {str(code) for code, date in dated_matches if date == mindate}
    # Sorted so the joined string does not depend on set iteration order.
    return ', '.join(sorted(earliest_codes))
    
def OP_date(patient, codes=None, invalid_dates=None):
    """Return the earliest valid date of a matching operative procedure.

    Parameters
    ----------
    patient : pandas.Series
        One row of the CVD table. Values are read by position: procedure
        codes at 1503-1626 and 889-943, their dates at 1917-2040 and
        1051-1105 (the i-th code column pairs with the i-th date column).
    codes : str or tuple of str, optional
        Code prefixes to search for. When omitted, the module-level
        ``opcode`` is read at call time.
    invalid_dates : collection of str, optional
        Placeholder dates treated as missing. When omitted, the module-level
        ``remove_dates`` is read at call time.

    Returns
    -------
    str or float
        The earliest valid date among the matching procedures, or ``np.nan``
        when no code matches or none of the matches has a valid date.
    """
    if codes is None:
        codes = opcode
    if invalid_dates is None:
        invalid_dates = remove_dates

    # define columns
    column_diseases = list(range(1503,1627)) + list(range(889,944))
    column_dates = list(range(1917,2041)) + list(range(1051,1106))

    # select columns; .iloc makes the positional lookup explicit, because
    # integer keys passed to [] on a label-indexed Series are deprecated.
    procedures = patient.iloc[column_diseases].tolist()
    dates = patient.iloc[column_dates].tolist()

    # Missing dates are dropped together with the placeholder dates; leaving
    # a float NaN among the date strings makes min() raise TypeError.
    valid_dates = [
        date
        for code, date in zip(procedures, dates)
        if str(code).startswith(codes)
        and not pd.isna(date)
        and date not in invalid_dates
    ]
    if not valid_dates:
        return np.nan
    # Dates are 'YYYY-MM-DD' strings, so the lexicographic minimum is the earliest.
    return min(valid_dates)

In [24]:
#Death record

#40001 [857:859] Underlying cause of death
#40002 [859:887] Contributory cause of death
#40000 [391:393] Date of death
#40007 [416:418] Age of death

# define columns

def Death_variable(patient, codes=None):
    """Return the cause-of-death code(s) that match the requested prefixes.

    Parameters
    ----------
    patient : pandas.Series
        One row of the CVD table. Cause-of-death codes are read by position
        from 857-886.
    codes : str or tuple of str, optional
        Code prefixes to search for. When omitted, the module-level
        ``icd_10_9`` is read at call time.

    Returns
    -------
    str or float
        The distinct matching codes, sorted and joined with ``', '``, or
        ``np.nan`` when no cause of death matches.
    """
    if codes is None:
        codes = icd_10_9

    column_diseases = list(range(857,887))

    # select columns; .iloc makes the positional lookup explicit, because
    # integer keys passed to [] on a label-indexed Series are deprecated.
    causes = {value for value in patient.iloc[column_diseases].tolist()
              if str(value).startswith(codes)}
    if not causes:
        return np.nan
    # Sorted so the joined string does not depend on set iteration order.
    return ', '.join(sorted(causes))
    
    
def Death_date(patient, codes=None):
    """Return the date of death when a cause of death matches the requested prefixes.

    Parameters
    ----------
    patient : pandas.Series
        One row of the CVD table. Cause-of-death codes are read by position
        from 857-886; the date is read from the ``'40000-0.0'`` label.
    codes : str or tuple of str, optional
        Code prefixes to search for. When omitted, the module-level
        ``icd_10_9`` is read at call time.

    Returns
    -------
    object
        ``patient['40000-0.0']`` when at least one cause of death matches,
        otherwise ``np.nan``.
    """
    if codes is None:
        codes = icd_10_9

    column_diseases = list(range(857,887))

    # select columns; .iloc makes the positional lookup explicit, because
    # integer keys passed to [] on a label-indexed Series are deprecated.
    causes = patient.iloc[column_diseases].tolist()
    if any(str(value).startswith(codes) for value in causes):
        return patient['40000-0.0']
    return np.nan

## 3. Generate variables including ICD10 or ICD9 codes

In [None]:
# Short names of the outcomes defined in the cells below: myocardial
# infarction (MI), stroke (STR), ischemic stroke (ISTR), coronary artery
# disease (CAD), heart failure (HF), atrial fibrillation (AF) and
# atherosclerosis (ATH).
cvd_var = ['MI', 'STR', 'ISTR', 'CAD', 'HF', 'AF', 'ATH']

In [26]:
# Define Myocardial infarction

# The extraction functions read these module-level code lists when they run,
# so they must be assigned before the apply() calls below.
# NOTE(review): 'I25.2', '411.0' and '411.9' contain a dot, whereas the atrial
# fibrillation definition in this notebook uses the undotted form ('4273').
# If the raw data stores codes without the dot, these three prefixes never
# match - TODO confirm against the data.
icd_10_9 = ('I21', 'I22', 'I23', 'I24', 'I25.2', '410', '411.0', '411.9')
# Trailing comma: ('1075') is just a parenthesised string, not a tuple.
srdisease = ('1075',)
# No self-reported operation codes (sroperation) or OPCS4 codes (opcode) are
# defined for this outcome, so no MI_SRO* / MI_OP* columns are created.

cvd_ukb200k['MI'] = cvd_ukb200k.apply(Diagnose_variable, axis=1)
cvd_ukb200k['MI_date'] = cvd_ukb200k.apply(Diagnose_date, axis=1)

cvd_ukb200k['MI_SRD'] = cvd_ukb200k.apply(SRD_variable, axis=1)
cvd_ukb200k['MI_SRD_age'] = cvd_ukb200k.apply(SRD_age, axis=1)

cvd_ukb200k['MI_death'] = cvd_ukb200k.apply(Death_variable, axis=1)
cvd_ukb200k['MI_death_date'] = cvd_ukb200k.apply(Death_date, axis=1)

In [29]:
# Define Stroke

# The extraction functions read these module-level code lists when they run,
# so they must be assigned before the apply() calls below.
icd_10_9 = ('I60', 'I61', 'I63', 'I64', '430', '431', '434', '436')
srdisease = ('1081', '1086', '1491', '1583')
# No self-reported operation codes (sroperation) or OPCS4 codes (opcode) are
# defined for this outcome, so no STR_SRO* / STR_OP* columns are created.

# (new column, row-wise extraction function), applied in this order
column_builders = (
    ('STR', Diagnose_variable),
    ('STR_date', Diagnose_date),
    ('STR_SRD', SRD_variable),
    ('STR_SRD_age', SRD_age),
    ('STR_death', Death_variable),
    ('STR_death_date', Death_date),
)
for column, builder in column_builders:
    cvd_ukb200k[column] = cvd_ukb200k.apply(builder, axis=1)

In [30]:
# Define Ischemic stroke

# The extraction functions read these module-level code lists when they run,
# so they must be assigned before the apply() calls below.
icd_10_9 = ('I63', '434', '436')
# Trailing comma: ('1583') is just a parenthesised string, not a tuple.
srdisease = ('1583',)
# No self-reported operation codes (sroperation) or OPCS4 codes (opcode) are
# defined for this outcome, so no ISTR_SRO* / ISTR_OP* columns are created.

cvd_ukb200k['ISTR'] = cvd_ukb200k.apply(Diagnose_variable, axis=1)
cvd_ukb200k['ISTR_date'] = cvd_ukb200k.apply(Diagnose_date, axis=1)

cvd_ukb200k['ISTR_SRD'] = cvd_ukb200k.apply(SRD_variable, axis=1)
cvd_ukb200k['ISTR_SRD_age'] = cvd_ukb200k.apply(SRD_age, axis=1)

cvd_ukb200k['ISTR_death'] = cvd_ukb200k.apply(Death_variable, axis=1)
cvd_ukb200k['ISTR_death_date'] = cvd_ukb200k.apply(Death_date, axis=1)

In [31]:
# Define Coronary artery disease

# The extraction functions read these module-level code lists when they run,
# so they must be assigned before the apply() calls below.
# NOTE(review): several prefixes contain a dot ('I25.1', '414.0', 'K50.1', ...),
# whereas the atrial fibrillation definition in this notebook uses the
# undotted form ('4273'). If the raw data stores codes without the dot, the
# dotted prefixes never match - TODO confirm against the data.
icd_10_9 = ('I21', 'I22', 'I23', 'I24', 'I25.1', 'I25.2', 'I25.5', 'I25.6', 'I25.8', 'I25.9',
            '410', '411', '412', '414.0', '414.8', '414.9')
# Trailing comma: ('1075') is just a parenthesised string, not a tuple.
srdisease = ('1075',)
sroperation = ('1070', '1095', '1523')
opcode = ('K40', 'K41', 'K42', 'K43', 'K44', 'K45', 'K46', 'K49', 'K50.1','K50.2', 'K50.4', 'K75')

cvd_ukb200k['CAD'] = cvd_ukb200k.apply(Diagnose_variable, axis=1)
cvd_ukb200k['CAD_date'] = cvd_ukb200k.apply(Diagnose_date, axis=1)

cvd_ukb200k['CAD_SRD'] = cvd_ukb200k.apply(SRD_variable, axis=1)
cvd_ukb200k['CAD_SRD_age'] = cvd_ukb200k.apply(SRD_age, axis=1)

cvd_ukb200k['CAD_SRO'] = cvd_ukb200k.apply(SRO_variable, axis=1)
cvd_ukb200k['CAD_SRO_age'] = cvd_ukb200k.apply(SRO_age, axis=1)

cvd_ukb200k['CAD_OP'] = cvd_ukb200k.apply(OP_variable, axis=1)
cvd_ukb200k['CAD_OP_date'] = cvd_ukb200k.apply(OP_date, axis=1)

cvd_ukb200k['CAD_death'] = cvd_ukb200k.apply(Death_variable, axis=1)
cvd_ukb200k['CAD_death_date'] = cvd_ukb200k.apply(Death_date, axis=1)

In [64]:
# Define Heart failure

# The extraction functions read these module-level code lists when they run,
# so they must be assigned before the apply() calls below.
# NOTE(review): 'I11.0', 'I13.0' and 'I13.2' contain a dot, whereas the atrial
# fibrillation definition in this notebook uses the undotted form ('4273').
# If the raw data stores codes without the dot, these three prefixes never
# match - TODO confirm against the data.
icd_10_9 = ('I11.0', 'I13.0', 'I13.2', 'I50', '428')
# Trailing comma: ('1076') is just a parenthesised string, not a tuple.
srdisease = ('1076',)
# No self-reported operation codes (sroperation) or OPCS4 codes (opcode) are
# defined for this outcome, so no HF_SRO* / HF_OP* columns are created.

cvd_ukb200k['HF'] = cvd_ukb200k.apply(Diagnose_variable, axis=1)
cvd_ukb200k['HF_date'] = cvd_ukb200k.apply(Diagnose_date, axis=1)

cvd_ukb200k['HF_SRD'] = cvd_ukb200k.apply(SRD_variable, axis=1)
cvd_ukb200k['HF_SRD_age'] = cvd_ukb200k.apply(SRD_age, axis=1)

cvd_ukb200k['HF_death'] = cvd_ukb200k.apply(Death_variable, axis=1)
cvd_ukb200k['HF_death_date'] = cvd_ukb200k.apply(Death_date, axis=1)

In [65]:
# Define Atrial fibrillation

# The extraction functions read these module-level code lists when they run,
# so they must be assigned before the apply() calls below.
icd_10_9 = ('I48', '4273')
srdisease = ('1471', '1483')
# No self-reported operation codes (sroperation) or OPCS4 codes (opcode) are
# defined for this outcome, so no AF_SRO* / AF_OP* columns are created.

# (new column, row-wise extraction function), applied in this order
column_builders = (
    ('AF', Diagnose_variable),
    ('AF_date', Diagnose_date),
    ('AF_SRD', SRD_variable),
    ('AF_SRD_age', SRD_age),
    ('AF_death', Death_variable),
    ('AF_death_date', Death_date),
)
for column, builder in column_builders:
    cvd_ukb200k[column] = cvd_ukb200k.apply(builder, axis=1)

In [None]:
# Define Atherosclerosis

# The extraction functions read this module-level code list when they run,
# so it must be assigned before the apply() calls below.
icd_10_9 = ('I70', '440')
# No self-reported illness codes (srdisease), self-reported operation codes
# (sroperation) or OPCS4 codes (opcode) are defined for this outcome, so no
# ATH_SRD* / ATH_SRO* / ATH_OP* columns are created.

# (new column, row-wise extraction function), applied in this order
column_builders = (
    ('ATH', Diagnose_variable),
    ('ATH_date', Diagnose_date),
    ('ATH_death', Death_variable),
    ('ATH_death_date', Death_date),
)
for column, builder in column_builders:
    cvd_ukb200k[column] = cvd_ukb200k.apply(builder, axis=1)

In [None]:
# Collect every column whose name ends in 'age' or 'date'; these hold the
# ages and dates that are converted to years in the next cell.
time_suffixes = ('age', 'date')
variables = [name for name in cvd_ukb200k.columns.to_list()
             if name.endswith(time_suffixes)]
variables

In [7]:
from tqdm import tqdm

# Convert every '*_date' / '*_age' column into years relative to the values
# stored in '53-0.0' (a date) and '21022-0.0' (an age) respectively, and store
# the result as '<name without its last _part>_years'.

# Length of one year as a fixed duration. 365.2425 days is the value numpy
# uses for timedelta64(1, 'Y'); a concrete Timedelta is used because recent
# pandas releases appear to reject the ambiguous 'Y' unit (verify against the
# installed version).
year_length = pd.Timedelta(days=365.2425)

# Parsed once instead of once per row and per variable.
reference_date = pd.to_datetime(cvd_ukb200k['53-0.0'], format='%Y-%m-%d')
reference_age = cvd_ukb200k['21022-0.0']

for variable in tqdm(variables):
    if variable.endswith('date'):
        # Missing dates become NaT and therefore NaN after the division, so
        # no explicit (identity-based) NaN test is needed.
        event_date = pd.to_datetime(cvd_ukb200k[variable], format='%Y-%m-%d')
        years = (event_date - reference_date) / year_length
    elif variable.endswith('age'):
        # NaN ages propagate through the subtraction.
        years = cvd_ukb200k[variable] - reference_age
    else:
        # Neither a date nor an age column: nothing to convert.
        continue

    var = '_'.join(variable.split('_')[:-1])
    print(var)
    cvd_ukb200k[var+'_years'] = years


In [81]:
# For every outcome, take the smallest value across all of its '*_years'
# columns and derive three 0/1 indicators from it: any value present (_var),
# value above zero (_post) and value below zero (_pre).
cvd_var = ['MI', 'STR', 'ISTR', 'CAD', 'HF', 'AF', 'ATH']

for var in cvd_var:
    yeears = [x for x in cvd_ukb200k.columns
              if x.startswith(var) and x.endswith('years')]
    prefix = 'var_' + var
    cvd_ukb200k[prefix + '_years'] = cvd_ukb200k.loc[:, yeears].min(axis=1)
    earliest = cvd_ukb200k[prefix + '_years']
    cvd_ukb200k[prefix + '_var'] = np.where(earliest.notnull(), 1, 0)
    cvd_ukb200k[prefix + '_post'] = np.where(earliest > 0, 1, 0)
    cvd_ukb200k[prefix + '_pre'] = np.where(earliest < 0, 1, 0)

In [None]:
# Combine the per-outcome columns into a single CVD variable: the smallest
# 'var_*_years' value per row plus the same three 0/1 indicators.
yeears = ['var_' + outcome + '_years'
          for outcome in ('MI', 'STR', 'ISTR', 'CAD', 'HF', 'AF', 'ATH')]
cvd_ukb200k['var_CVD_years'] = cvd_ukb200k.loc[:, yeears].min(axis=1)
cvd_years = cvd_ukb200k['var_CVD_years']
cvd_ukb200k['var_CVD_var'] = np.where(cvd_years.notnull(), 1, 0)
cvd_ukb200k['var_CVD_post'] = np.where(cvd_years > 0, 1, 0)
cvd_ukb200k['var_CVD_pre'] = np.where(cvd_years < 0, 1, 0)

In [1]:
# Reduced table: the participant id plus every column whose name starts with 'var'
summary_columns = ['eid']
summary_columns += [x for x in cvd_ukb200k.columns if x.startswith('var')]
df1 = cvd_ukb200k[summary_columns]

In [83]:
### SAVE
# Write the reduced table df1 ('eid' plus the 'var*' columns) as a
# gzip-compressed, tab-separated file without the row index.

df1.to_csv('cvd_ukb450k_670124_simple.txt.gz', sep="\t", index=False, compression='gzip')

In [84]:
### SAVE
# Write the complete table, including every column added in this notebook,
# as a gzip-compressed, tab-separated file without the row index.

cvd_ukb200k.to_csv('cvd_ukb450k_670124_all.txt.gz', sep="\t", index=False, compression='gzip')