In [1]:
import pandas as pd
import numpy as np

In [2]:
# ICD-10 code prefixes for the cardiovascular disease (CVD) groups of interest.
# Every list holds three-character prefixes; the ICD search cell counts a
# recorded code towards a group when the code starts with one of its prefixes.

# hypertension
icd_codes_hypertension = [
    'I10',
    'I11',
    'I12',
    'I13',
    'I15',
]

# atrial fibrillation
icd_codes_atrial_fibrillation = ['I48']

# angina pectoris
icd_codes_angina = ['I20']

# chronic artery disease
icd_codes_chronic_artery_disease = ['I25']

# heart failure
icd_codes_heart_failure = ['I50']

# stroke: the contiguous range I60 ... I64
icd_codes_stroke = ['I{}'.format(number) for number in range(60, 65)]

# peripheral artery disease
icd_codes_peripheral_artery_disease = ['I73']

In [7]:
# Self-reported disease codes, one list per CVD group.
# The self-report search cell compares each reported code with these values
# using ==, so they are kept as plain integers.

sr_hypertension = [
    1065,
    1072,
    1073,
]

# NOTE(review): confirm against the UK Biobank self-reported illness coding
# that 1077 is the intended code for atrial fibrillation (it may be the more
# general arrhythmia code).
sr_atrial_fibrillation = [1077]

sr_angina = [1074]

# No self-reported code is mapped to chronic artery disease, so this group is
# only ever flagged from ICD-10 records.
sr_chronic_artery_disease = []

sr_heart_failure = [1076]

sr_stroke = [
    1081,
    1086,
    1491,
    1583,
]

sr_peripheral_artery_disease = [1067]

In [5]:
# read ICD10 codes and self-reported codes for all subjects
# All four tables are read from the same directory. Keeping that directory in
# one place means a different machine only needs a single edit, and removes
# the stray '//' separators the individual paths used to contain.
UKB_DISEASE_DIR = '/Users/natsumikyouno/UKBiobank/diseases/ukb_diseases'

# ICD-10 codes and the matching table of code dates
df_icd = pd.read_csv(UKB_DISEASE_DIR + '/main_data_icd10.csv', low_memory=False)
df_icd_times = pd.read_csv(UKB_DISEASE_DIR + '/time_icd10.csv', low_memory=False)

# self-reported non-cancer codes and the matching table of reported times
df_noncancer = pd.read_csv(UKB_DISEASE_DIR + '/self_reported_noncancer.csv', low_memory=False)
df_noncancer_times = pd.read_csv(UKB_DISEASE_DIR + '/self_reported_noncancer_times.csv', low_memory=False)

In [8]:
# Convert the four tables to numpy arrays, dropping the first column of each
# (presumably the subject id column - TODO confirm) so that only the
# code / time columns remain.
icd_codes, icd_code_times, noncancers_codes, noncancers_code_times = (
    table.iloc[:, 1:].to_numpy()
    for table in (df_icd, df_icd_times, df_noncancer, df_noncancer_times)
)

# Result arrays: one row per row of icd_codes, one column per disease group
# (7 groups). dis_marker holds 0/1 flags; dis_date holds float timestamps and
# stays 0 where no date has been written.
dis_marker = np.zeros((len(icd_codes), 7), dtype=np.int32)
dis_date = np.zeros(dis_marker.shape, dtype=float)

In [9]:
from datetime import datetime

# ICD-10 prefix groups in the column order of dis_marker / dis_date:
#   0 hypertension, 1 atrial fibrillation, 2 angina pectoris,
#   3 chronic artery disease, 4 heart failure, 5 stroke,
#   6 peripheral artery disease.
# str.startswith accepts a tuple of prefixes, so every group is converted to a
# tuple once here instead of looping over its prefixes for every code.
icd_prefix_groups = [
    tuple(icd_codes_hypertension),
    tuple(icd_codes_atrial_fibrillation),
    tuple(icd_codes_angina),
    tuple(icd_codes_chronic_artery_disease),
    tuple(icd_codes_heart_failure),
    tuple(icd_codes_stroke),
    tuple(icd_codes_peripheral_artery_disease),
]

# search disease codes for each subject
for i in range(len(icd_codes)):
    for t, icd_code in enumerate(icd_codes[i]):
        # a non-string cell (NaN) means there are no more codes for this subject
        if not isinstance(icd_code, str):
            break

        for k, prefixes in enumerate(icd_prefix_groups):
            if not icd_code.startswith(prefixes):
                continue

            dis_marker[i][k] = 1

            # The date table is indexed exactly like the code table.
            code_time = icd_code_times[i][t]
            if str(code_time) != 'nan':
                # NOTE(review): every dated match overwrites the stored value,
                # so the date kept is that of the LAST dated matching code in
                # column order, not necessarily the earliest diagnosis.
                # This is unchanged from the previous implementation; confirm
                # it is the intended definition of the disease date.
                # timestamp() on a naive datetime uses the local time zone.
                dis_date[i][k] = datetime.strptime(code_time, '%d/%m/%Y').timestamp()


In [10]:
# search self-reported disease codes for each subject
from datetime import datetime
from math import ceil

# Self-reported code groups in the column order of dis_marker / dis_date:
#   0 hypertension, 1 atrial fibrillation, 2 angina pectoris,
#   3 chronic artery disease, 4 heart failure, 5 stroke,
#   6 peripheral artery disease.
sr_code_groups = [
    sr_hypertension,
    sr_atrial_fibrillation,
    sr_angina,
    sr_chronic_artery_disease,
    sr_heart_failure,
    sr_stroke,
    sr_peripheral_artery_disease,
]


def _self_reported_timestamp(year_value):
    """Convert a fractional-year value into a timestamp.

    Parameters
    ----------
    year_value : float
        Reported time as a (possibly fractional) year. NaN and -1 are treated
        as "no usable date".

    Returns
    -------
    float
        Timestamp of the first day of the derived month, or 0. when the value
        is NaN, equals -1, or its year is not later than 1970.
    """
    if np.isnan(year_value) or year_value == -1:
        return 0.

    yr = int(year_value)
    if yr <= 1970:
        return 0.

    # Round the fractional year UP to a month in 1..12. The previous code
    # wrapped the product in int() before ceil(), which truncated first and
    # made ceil() a no-op, so December could never be produced. The round()
    # only absorbs floating-point noise such as 2.0000000001.
    mon = max(1, ceil(round((year_value - yr) * 12, 6)))
    return datetime(year=yr, month=mon, day=1).timestamp()


# check non-cancer codes
# NOTE(review): rows of the self-reported tables are addressed with the same
# index as the ICD tables, i.e. both are assumed to list the same subjects in
# the same order - confirm.
for i in range(len(icd_codes)):
    for t, code in enumerate(noncancers_codes[i]):
        # `code == np.nan` is always False, so missing entries must be
        # detected with pd.isna instead.
        if pd.isna(code):
            continue

        for k, sr_codes in enumerate(sr_code_groups):
            # A group already flagged (from ICD-10 records or from an earlier
            # self-reported code) keeps its existing flag and date.
            if dis_marker[i][k] != 0:
                continue

            if code in sr_codes:
                dis_marker[i][k] = 1
                dis_date[i][k] = _self_reported_timestamp(noncancers_code_times[i][t])

In [11]:
# Assemble the result tables: one column per disease group, in the same order
# as the columns of dis_marker / dis_date.
columns = [
    'hypertension',
    'atrial_fibrillation',
    'angina',
    'chronic_artery_disease',
    'heart_failure',
    'stroke',
    'peripheral_artery_disease',
]

# the date table uses the same names with a 'time_' prefix
time_columns = ['time_{}'.format(name) for name in columns]

df_cvd = pd.DataFrame(data=dis_marker, columns=columns)
df_cvd_times = pd.DataFrame(data=dis_date, columns=time_columns)

# put the subject id (Eid, taken from the ICD table) in front of both tables
df_cvd.insert(loc=0, column='Eid', value=df_icd['Eid'])
df_cvd_times.insert(loc=0, column='Eid', value=df_icd['Eid'])

In [12]:
# save the result
import os

# to_csv does not create missing directories, so make sure the relative
# 'data' folder exists before writing into it.
os.makedirs('data', exist_ok=True)
df_cvd.to_csv('data/ukb_CVD.csv', index=False)
df_cvd_times.to_csv('data/ukb_CVD_times.csv', index=False)

In [13]:
# Number of flagged subjects per disease group: the flags are 0/1, so the
# column sum is the count.
for disease in columns:
    n_subjects_flagged = df_cvd[disease].sum()
    print(disease, ':', n_subjects_flagged)

hypertension : 194621
atrial_fibrillation : 37959
angina : 36524
chronic_artery_disease : 49470
heart_failure : 17100
stroke : 18349
peripheral_artery_disease : 9869


In [15]:
import os

# Also store both tables in the shared UK Biobank disease folder. The
# directory is named once so the folder that gets created is guaranteed to be
# the folder that is written to.
CVD_OUTPUT_DIR = '/Users/natsumikyouno/UKBiobank/diseases/CVD'
os.makedirs(CVD_OUTPUT_DIR, exist_ok=True)
df_cvd.to_csv(CVD_OUTPUT_DIR + '/CVD.csv', index=False)
df_cvd_times.to_csv(CVD_OUTPUT_DIR + '/CVD_times.csv', index=False)