# ARRANEGE INFECTIOUS DISEASE DATA 

### 1. Get and arrange data
### 2. Define functions to generate variables
### 3. Generate variables including ICD10 or ICD9 codes
### 4. Define specific Bacterial infetions

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import StringIO
import seaborn as sns
import scipy.stats as stats
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 40)

import matplotlib as mpl
mpl.rcParams['figure.dpi']= 300

# 1. Get and arrange data

In [2]:
### OPEN CVD DATA generated following the steps from the README.md
### Data-fields are indicated in the paper

# Read data
filename = "/workspace/datasets/ukbiobank_ch/ukb_data_670124/clinical_data_ukb670124/cvd_data_670124.txt"
lines = ''.join([line for line in open(filename, 'rt')])
lines

# Transform to df
cvd_ukb200k = pd.read_csv(StringIO(lines), sep= '\t')
cvd_ukb200k

In [8]:
# All variable codes

colnoms = []
for x in cvd_ukb200k.columns.tolist():
    colnoms.append(x.split('-')[0])
set(colnoms)

{'130708',
 '130709',
 '130712',
 '130713',
 '130714',
 '130715',
 '131270',
 '131271',
 '131272',
 '131273',
 '131274',
 '131275',
 '131276',
 '131277',
 '131278',
 '131279',
 '131280',
 '131281',
 '131282',
 '131283',
 '131284',
 '131285',
 '131286',
 '131287',
 '131288',
 '131289',
 '131290',
 '131291',
 '131292',
 '131293',
 '131294',
 '131295',
 '131296',
 '131297',
 '131298',
 '131299',
 '131300',
 '131301',
 '131302',
 '131303',
 '131304',
 '131305',
 '131306',
 '131307',
 '131308',
 '131309',
 '131310',
 '131311',
 '131312',
 '131313',
 '131314',
 '131315',
 '131316',
 '131317',
 '131318',
 '131319',
 '131320',
 '131321',
 '131322',
 '131323',
 '131324',
 '131325',
 '131326',
 '131327',
 '131328',
 '131329',
 '131330',
 '131331',
 '131332',
 '131333',
 '131334',
 '131335',
 '131336',
 '131337',
 '131338',
 '131339',
 '131340',
 '131341',
 '131342',
 '131343',
 '131344',
 '131345',
 '131346',
 '131347',
 '131348',
 '131349',
 '131350',
 '131351',
 '131352',
 '131353',
 '131354',

In [9]:
# Function to obtain the index from columns

def columna(data, name):
    a = []
    for n, x in enumerate(data.columns.tolist()):
        if str(x).startswith(str(name)):
            a.append([n , x])
    return(a)
    

In [13]:
# Max death
cvd_ukb200k[cvd_ukb200k['40000-0.0'].notnull()]['40000-0.0'].max()

'2021-11-12'

## 2. Define functions to generate variables

In [17]:
cvd_var = ['Sepsis', 'Pneumonia', 'Gastroenteritis', 'Skin_infection', 'Urinary_tract_infection',
           'Composite_any_infection', 'Bacterial_infections', 'Viral_infections', 'Fungal_Infections']

In [63]:
##### 
##### SUMMARY DIAGNOSIS   
#####

# 41202 [944:1023]  Diagnoses - main ICD10 (79)
# 41262 [1106:1185]  Date of first in-patient diagnosis - main ICD10 (79)
# 41203 [1023:1051]   Diagnoses - main ICD9 (28)
# 41263 [1185:1213]   Date of first in-patient diagnosis - main ICD9 (28)
# 41270 [1213:1456]  Diagnoses - ICD10 (243)
# 41280 [1627:1870] Date of first in-patient diagnosis - ICD10 (243)
# 41271 [1456:1503] Diagnoses - ICD9 (47)
# 41281 [1870:1917] Date of first in-patient diagnosis - ICD9 (47)

# to remove
#1900-01-01 represents "Code has no event date"
#1901-01-01 represents "Code has event date before participant's date of birth"
#1902-02-02 represents "Code has event date matching participant's date of birth"
#1903-03-03 represents "Code has event date after participant's date of birth same calendar year as date of birth"
#2037-07-07 represents "Code has event date in the future and is presumed to be a place-holder or other system default"
remove_dates = ['1900-01-01', '1901-01-01', '1902-02-02', '1903-03-03', '2037-07-07']

def Diagnose_variable(patient):
    # define column
    column_diseases = list(range(944,1023)) + list(range(1023,1051)) + list(range(1213,1456)) + list(range(1456,1503))
    column_dates = list(range(1106,1185)) + list(range(1185,1213)) + list(range(1627,1870)) + list(range(1870,1917))
    # select columns
    diseases = patient[column_diseases]
    dates = patient[column_dates]
    
    Hypertension = [i for i in diseases if str(i).startswith(icd_10_9)]
    i_Hypertension_disease = [i for i,j in enumerate(diseases) if str(j).startswith(icd_10_9)]  
    dates_hypertension_disease = [dates[index] for index in i_Hypertension_disease]
    dates_hypertension_disease = ['nan' if pd.isna(value) else value for value in dates_hypertension_disease]
    if len(dates_hypertension_disease) > 0:
        
        for i in range(len(dates_hypertension_disease)):
            if dates_hypertension_disease[i] in remove_dates:
                dates_hypertension_disease[i] = 'nan'
            mindate = min(dates_hypertension_disease) 
        i_minage = [i for i,j in enumerate(dates_hypertension_disease) if str(j).startswith(str(mindate))]  
        Hypertens = [str(Hypertension[index]) for index in i_minage]
        
        if mindate == 'nan':
            return np.nan
        else:
            return ', '.join(set(Hypertens))
        
    else:
        return np.nan
    
def Diagnose_date(patient):
    
    # define column
    column_diseases = list(range(944,1023)) + list(range(1023,1051)) + list(range(1213,1456)) + list(range(1456,1503))
    column_dates = list(range(1106,1185)) + list(range(1185,1213)) + list(range(1627,1870)) + list(range(1870,1917))
    # select columns
    diseases = patient[column_diseases]
    dates = patient[column_dates]
    
    Hypertension = [i for i in diseases if str(i).startswith(icd_10_9)]
    i_Hypertension_disease = [i for i,j in enumerate(diseases) if str(j).startswith(icd_10_9)]  
    dates_hypertension_disease = [dates[index] for index in i_Hypertension_disease]
    dates_hypertension_disease = ['nan' if pd.isna(value) else value for value in dates_hypertension_disease]
    if len(dates_hypertension_disease) > 0:
        
        for i in range(len(dates_hypertension_disease)):
            if dates_hypertension_disease[i] in remove_dates:
                dates_hypertension_disease[i] = 'nan'
        
        mindate = min(dates_hypertension_disease)
        if mindate == 'nan':
            return np.nan
        else:
            return mindate
    else:
        return np.nan

In [24]:
#Death record

#40001 [857:859] Underlaying cause of death
#40002 [859:887] Contributory cause of death
#40000 [391:393] Date of death
#40007 [416:418] Age of death

# define columns

def Death_variable(patient):
    column_diseases = list(range(857,887))
    
    # select columns
    cause = [i for i in patient[column_diseases] if str(i).startswith(icd_10_9)]
    if len(cause) > 0:
        return ', '.join(set(cause))  
    else:
        return np.nan
    
    
def Death_date(patient):
    column_diseases = list(range(857,887))
        
    # select columns
    cause = [i for i in patient[column_diseases] if str(i).startswith(icd_10_9)]
    if len(cause) > 0:
        return patient['40000-0.0']
    else:
        return np.nan

## 3. Generate variables including ICD10 or ICD9 codes

In [26]:
# Define Sepsis
   
icd_10_9 = ('A021', 'A227', 'A40', 'A41', 'B377', 'O85', 'R651', 'R527')

cvd_ukb200k['Sepsis'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['Sepsis_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['Sepsis_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['Sepsis_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [29]:
# Define Pneumonia
icd_10_9 = ('B012', 'J14', 'J12', 'J16', 'J15', 'J17', 'J18', 'J851')

cvd_ukb200k['Pneumonia'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['Pneumonia_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['Pneumonia_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['Pneumonia_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [30]:
# Define Gastroenteritis
icd_10_9 = ('A00', 'A01', 'A02', 'A03', 'A04', 'A05', 'A06', 'A07', 'A08', 'A09')

cvd_ukb200k['Gastroenteritis'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['Gastroenteritis_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['Gastroenteritis_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['Gastroenteritis_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [31]:
# Define Skin infection
icd_10_9 = ('L00', 'L01', 'L02', 'L03', 'L04', 'L05',
            'L06', 'L07', 'L08')

cvd_ukb200k['Skin_infection'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['Skin_infection_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['Skin_infection_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['Skin_infection_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [64]:
# Define Urinary tract infection
icd_10_9 = ('N390', 'O23', 'O86')

cvd_ukb200k['Urinary_tract_infection'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['Urinary_tract_infection_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['Urinary_tract_infection_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['Urinary_tract_infection_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [65]:
# Define Composite any infection

icd_10_9 = ('A00', 'A01', 'A02', 'A03', 'A04', 'A05', 'A06', 'A07', 'A08', 'A09',
            'A15', 'A16', 'A17', 'A18', 'A19',
            'A20', 'A21', 'A22', 'A23', 'A24', 'A25', 'A26', 'A27', 'A28',
            'A30', 'A31', 'A32', 'A33', 'A34', 'A35', 'A36', 'A37', 'A38', 'A39',
            'A40', 'A41', 'A42', 'A43', 'A44', 'A45', 'A46', 'A47', 'A48', 'A49',
            'A50', 'A51', 'A52', 'A53', 'A54', 'A55', 'A56', 'A57', 'A58', 'A59',
            'A60', 'A61', 'A62', 'A63', 'A64',
            'A65', 'A66', 'A67', 'A68', 'A69',
            'A70', 'A71', 'A72', 'A73', 'A74',
            'A75', 'A76', 'A77', 'A78', 'A79',
            'A80', 'A81', 'A82', 'A83', 'A84', 'A85', 'A86', 'A87', 'A88', 'A89',
            'A90', 'A91', 'A92', 'A93', 'A94', 'A95', 'A96', 'A97', 'A98', 'A99',
            'B00', 'B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B09',
            'B10',
            'B15', 'B16', 'B17', 'B18', 'B19',
            'B20',
            'B25', 'B26', 'B27', 'B28', 'B29', 'B30', 'B31', 'B32', 'B33', 'B34',
            'B35', 'B36', 'B37', 'B38', 'B39',
            'B40', 'B41', 'B42', 'B43', 'B44', 'B45', 'B46', 'B47', 'B48', 'B49',
            'B50', 'B51', 'B52', 'B53', 'B54', 'B55', 'B56', 'B57', 'B58', 'B59',
            'B60', 'B61', 'B62', 'B63', 'B64',
            'B65', 'B66', 'B67', 'B68', 'B69', 'B70', 'B71', 'B72', 'B73', 'B74',
            'B75', 'B76', 'B77', 'B78', 'B79', 'B80', 'B81', 'B82', 'B83',
            'B85', 'B86', 'B87', 'B88', 'B89',
            'B90', 'B91', 'B92', 'B93', 'B94',
            'B95', 'B96', 'B97',
            'B99',)

cvd_ukb200k['Composite_any_infection'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['Composite_any_infection_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['Composite_any_infection_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['Composite_any_infection_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [None]:
# Define Bacterial infections

icd_10_9 = ('A00', 'A01', 'A02', 'A03', 'A04', 'A05',
            'A15', 'A16', 'A17', 'A18', 'A19',
            'A20', 'A21', 'A22', 'A23', 'A24', 'A25', 'A26', 'A27', 'A28',
            'A30', 'A31', 'A32', 'A33', 'A34', 'A35', 'A36', 'A37', 'A38', 'A39',
            'A40', 'A41', 'A42', 'A43', 'A44', 'A45', 'A46', 'A47', 'A48', 'A49',
            'A51', 'A52', 'A53', 'A54', 'A56',
            'A65', 'A66', 'A67', 'A68', 'A69',
            'A70', 'A71', 'A72', 'A73', 'A74',
            'A75', 'A76', 'A77', 'A78', 'A79',
            'B90', 'B95', 'B96', 'B98')

cvd_ukb200k['Bacterial_infections'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['Bacterial_infections_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['Bacterial_infections_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['Bacterial_infections_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [None]:
# Define Viral infections 

icd_10_9 = ('A08', 'A60',
            'A80', 'A81', 'A82', 'A83', 'A84', 'A85', 'A86', 'A87', 'A88', 'A89',
            'A92', 'A93', 'A94', 'A95', 'A96', 'A97', 'A98', 'A99',
            'B00', 'B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B09',
            'B15', 'B16', 'B17', 'B18', 'B19',
            'B20', 'B21', 'B22', 'B23', 'B24',
            'B25', 'B26', 'B27', 'B28', 'B29', 'B30', 'B31', 'B32', 'B33', 'B34',
            'B941', 'B942', 'B97')

cvd_ukb200k['Viral_infections '] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['Viral_infections_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['Viral_infections_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['Viral_infections_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [None]:
# Define Fungal Infections

icd_10_9 = ('B35', 'B36', 'B37', 'B38', 'B39',
            'B40', 'B41', 'B42', 'B43', 'B44', 
            'B45', 'B46', 'B47', 'B48', 'B49')

cvd_ukb200k['Fungal_Infections '] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['Fungal_Infections_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['Fungal_Infections_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['Fungal_Infections_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [73]:
cvd_var = ['Sepsis', 'Pneumonia', 'Gastroenteritis', 'Skin_infection', 'Urinary_tract_infection',
           'Composite_any_infection', 'Bacterial_infections', 'Viral_infections', 'Fungal_Infections']

In [74]:
from tqdm import tqdm
variables = ['Sepsis_date',
 'Sepsis_death_date',
 'Pneumonia_date',
 'Pneumonia_death_date',
 'Gastroenteritis_date',
 'Gastroenteritis_death_date',
 'Skin_infection_date',
 'Skin_infection_death_date',
 'Urinary_tract_infection_date',
 'Urinary_tract_infection_death_date',
 'Composite_any_infection_date',
 'Composite_any_infection_death_date',
 'Bacterial_infections_date',
 'Bacterial_infections_death_date',
 'Viral_infections_date',
 'Viral_infections_death_date',
 'Fungal_Infections_date',
 'Fungal_Infections_death_date']
for variable in tqdm(variables):
    years = []
    if variable.endswith('date'):
        for n, x in enumerate(cvd_ukb200k[variable]):
            if x is np.nan:
                years.append(x)
            else:
                year = pd.to_datetime(x, format='%Y-%m-%d')-pd.to_datetime(cvd_ukb200k.loc[n,'53-0.0'], format='%Y-%m-%d')
                years.append(year/np.timedelta64(1,'Y'))
    elif variable.endswith('age'):
        for n, x in enumerate(cvd_ukb200k[variable]):
            if x is np.nan:
                years.append(x)
            else:
                year = x-cvd_ukb200k.loc[n,'21022-0.0']
                years.append(year)

    var = '_'.join(variable.split('_')[:-1])
    print(var)
    cvd_ukb200k[var+'_years'] = years


  6%|██▍                                         | 1/18 [00:05<01:34,  5.54s/it]

Sepsis


 11%|████▉                                       | 2/18 [00:06<00:42,  2.68s/it]

Sepsis_death


 17%|███████▎                                    | 3/18 [00:16<01:30,  6.02s/it]

Pneumonia


 22%|█████████▊                                  | 4/18 [00:18<01:01,  4.37s/it]

Pneumonia_death


 28%|████████████▏                               | 5/18 [00:26<01:17,  5.97s/it]

Gastroenteritis


 33%|██████████████▋                             | 6/18 [00:27<00:48,  4.04s/it]

Gastroenteritis_death


 39%|█████████████████                           | 7/18 [00:34<00:57,  5.19s/it]

Skin_infection


 44%|███████████████████▌                        | 8/18 [00:34<00:36,  3.62s/it]

Skin_infection_death


 50%|██████████████████████                      | 9/18 [00:44<00:50,  5.62s/it]

Urinary_tract_infection


 56%|███████████████████████▉                   | 10/18 [00:45<00:32,  4.00s/it]

Urinary_tract_infection_death


 61%|██████████████████████████▎                | 11/18 [01:08<01:09,  9.91s/it]

Composite_any_infection


 67%|████████████████████████████▋              | 12/18 [01:09<00:42,  7.15s/it]

Composite_any_infection_death


 72%|███████████████████████████████            | 13/18 [01:24<00:47,  9.45s/it]

Bacterial_infections


 78%|█████████████████████████████████▍         | 14/18 [01:24<00:27,  6.82s/it]

Bacterial_infections_death


 83%|███████████████████████████████████▊       | 15/18 [01:30<00:18,  6.29s/it]

Viral_infections


 89%|██████████████████████████████████████▏    | 16/18 [01:30<00:08,  4.49s/it]

Viral_infections_death


 94%|████████████████████████████████████████▌  | 17/18 [01:33<00:04,  4.01s/it]

Fungal_Infections


100%|███████████████████████████████████████████| 18/18 [01:33<00:00,  5.19s/it]

Fungal_Infections_death





In [80]:
a = cvd_ukb200k[cvd_ukb200k['Fungal_Infections_years'].notnull()]
a[['eid', '53-0.0', '21022-0.0']+[x for x in a.columns if (x.startswith('Fungal_Infections'))]]

Unnamed: 0,eid,53-0.0,21022-0.0,Fungal_Infections,Fungal_Infections_date,Fungal_Infections_death,Fungal_Infections_death_date,Fungal_Infections_years,Fungal_Infections_death_years
7,1000081,2010-02-01,61.0,B378,2020-09-03,,,10.587486,
34,1000357,2009-02-18,54.0,B378,2020-03-12,,,11.061144,
88,1000893,2008-01-21,56.0,B370,2014-06-30,,,6.439557,
89,1000906,2010-03-01,56.0,B371,2018-02-17,,,7.967309,
297,1002989,2009-03-07,51.0,B49,2013-06-07,,,4.251970,
...,...,...,...,...,...,...,...,...,...
502202,6022972,2007-10-16,56.0,B49,2016-08-11,B49,2016-10-01,8.821536,8.96117
502262,6023573,2008-04-15,60.0,B379,2016-09-05,,,8.391685,
502290,6023858,2009-08-03,66.0,B378,2009-01-22,,,-0.528416,
502312,6024073,2007-09-25,55.0,B378,2021-05-20,,,13.651204,


In [81]:
cvd_var = ['Sepsis', 'Pneumonia', 'Gastroenteritis', 'Skin_infection', 'Urinary_tract_infection',
           'Composite_any_infection', 'Bacterial_infections', 'Viral_infections', 'Fungal_Infections']

for var in cvd_var:
    yeears = [x for x in cvd_ukb200k.columns if (x.startswith(var)) and (x.endswith('years'))]
    cvd_ukb200k['var_'+var+'_years'] = cvd_ukb200k.loc[:,yeears].min(axis=1)
    cvd_ukb200k['var_'+var+'_var'] = np.where(cvd_ukb200k['var_'+var+'_years'].notnull(), 1, 0)
    cvd_ukb200k['var_'+var+'_post'] = np.where(cvd_ukb200k['var_'+var+'_years'] > 0, 1, 0)
    cvd_ukb200k['var_'+var+'_pre'] = np.where(cvd_ukb200k['var_'+var+'_years'] < 0, 1, 0)

In [1]:
df1 = cvd_ukb200k[['eid']+[x for x in cvd_ukb200k.columns if (x.startswith('var'))]]

In [83]:
### SAVE

df1.to_csv('Infection_ukb450k_670124_simple.txt.gz', sep="\t", index=False, compression='gzip')

In [84]:
### SAVE

cvd_ukb200k.to_csv('Infection_ukb450k_670124_all.txt.gz', sep="\t", index=False, compression='gzip')

## 4. Define specific Bacterial infetions

In [87]:
cvd_var = ['A00', 'A01', 'A02', 'A03', 'A04',
           'A05', 'A15_A19', 'A20_A28', 'A30_A49',
           'A51', 'A52', 'A53', 'A54', 'A56',
           'A65_A69', 'A70_A74', 'A75_A79',
           'B90', 'B95', 'B96', 'B98']

In [88]:
# Define Bacterial infections

icd_10_9 = ('A00')

cvd_ukb200k['A00'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A00_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A00_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A00_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [89]:
# Define Bacterial infections

icd_10_9 = ('A01')

cvd_ukb200k['A01'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A01_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A01_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A01_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [90]:
# Define Bacterial infections

icd_10_9 = ('A02')

cvd_ukb200k['A02'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A02_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A02_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A02_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [91]:
# Define Bacterial infections

icd_10_9 = ('A03')

cvd_ukb200k['A03'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A03_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A03_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A03_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [92]:
# Define Bacterial infections

icd_10_9 = ('A04')

cvd_ukb200k['A04'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A04_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A04_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A04_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [93]:
# Define Bacterial infections

icd_10_9 = ('A05')

cvd_ukb200k['A05'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A05_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A05_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A05_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [94]:
# Define Bacterial infections

icd_10_9 = ('A15', 'A16', 'A17', 'A18', 'A19')

cvd_ukb200k['A15_A19'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A15_A19_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A15_A19_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A15_A19_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [95]:
# Define Bacterial infections

icd_10_9 = ('A20', 'A21', 'A22', 'A23', 'A24', 'A25', 'A26', 'A27', 'A28')

cvd_ukb200k['A20_A28'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A20_A28_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A20_A28_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A20_A28_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [96]:
# Define Bacterial infections

icd_10_9 = ('A30', 'A31', 'A32', 'A33', 'A34', 'A35', 'A36', 'A37', 'A38', 'A39',
            'A40', 'A41', 'A42', 'A43', 'A44', 'A45', 'A46', 'A47', 'A48', 'A49')

cvd_ukb200k['A30_A49'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A30_A49_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A30_A49_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A30_A49_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [97]:
# Define Bacterial infections

icd_10_9 = ('A51')

cvd_ukb200k['A51'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A51_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A51_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A51_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [98]:
# Define Bacterial infections

icd_10_9 = ('A52')

cvd_ukb200k['A52'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A52_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A52_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A52_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [99]:
# Define Bacterial infections

icd_10_9 = ('A53')

cvd_ukb200k['A53'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A53_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A53_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A53_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [100]:
# Define Bacterial infections

icd_10_9 = ('A54')

cvd_ukb200k['A54'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A54_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A54_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A54_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [101]:
# Define Bacterial infections

icd_10_9 = ('A56')

cvd_ukb200k['A56'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A56_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A56_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A56_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [102]:
# Define Bacterial infections

icd_10_9 = ('A65', 'A66', 'A67', 'A68', 'A69')

cvd_ukb200k['A65_A69'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A65_A69_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A65_A69_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A65_A69_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [103]:
# Define Bacterial infections

icd_10_9 = ('A70', 'A71', 'A72', 'A73', 'A74')

cvd_ukb200k['A70_A74'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A70_A74_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A70_A74_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A70_A74_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [104]:
# Define Bacterial infections

icd_10_9 = ('A75', 'A76', 'A77', 'A78', 'A79')

cvd_ukb200k['A75_A79'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['A75_A79_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['A75_A79_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['A75_A79_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [105]:
# Define Bacterial infections

icd_10_9 = ('B90')

cvd_ukb200k['B90'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['B90_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['B90_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['B90_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [106]:
# Define Bacterial infections

icd_10_9 = ('B95')

cvd_ukb200k['B95'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['B95_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['B95_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['B95_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [107]:
# Define Bacterial infections

icd_10_9 = ('B96')

cvd_ukb200k['B96'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['B96_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['B96_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['B96_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [108]:
# Define Bacterial infections

icd_10_9 = ('B98')

cvd_ukb200k['B98'] = cvd_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cvd_ukb200k['B98_date'] = cvd_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cvd_ukb200k['B98_death'] = cvd_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cvd_ukb200k['B98_death_date'] = cvd_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [110]:
cvd_var = ['A00', 'A01', 'A02', 'A03', 'A04',
           'A05', 'A15_A19', 'A20_A28', 'A30_A49',
           'A51', 'A52', 'A53', 'A54', 'A56',
           'A65_A69', 'A70_A74', 'A75_A79',
           'B90', 'B95', 'B96', 'B98']

In [111]:
from tqdm import tqdm
variables = ['A00_date', 'A00_death_date',
             'A01_date', 'A01_death_date',
             'A02_date', 'A02_death_date',
             'A03_date', 'A03_death_date',
             'A04_date', 'A04_death_date',
             'A05_date', 'A05_death_date',
             'A15_A19_date', 'A15_A19_death_date',
             'A20_A28_date', 'A20_A28_death_date',
             'A30_A49_date', 'A30_A49_death_date',
             'A51_date', 'A51_death_date',
             'A52_date', 'A52_death_date',
             'A53_date', 'A53_death_date',
             'A54_date', 'A54_death_date',
             'A56_date', 'A56_death_date',
             'A65_A69_date', 'A65_A69_death_date',
             'A70_A74_date', 'A70_A74_death_date',
             'A75_A79_date', 'A75_A79_death_date',
             'B90_date', 'B90_death_date',
             'B95_date', 'B95_death_date',
             'B96_date', 'B96_death_date',
             'B98_date', 'B98_death_date',]

for variable in tqdm(variables):
    years = []
    if variable.endswith('date'):
        for n, x in enumerate(cvd_ukb200k[variable]):
            if x is np.nan:
                years.append(x)
            else:
                year = pd.to_datetime(x, format='%Y-%m-%d')-pd.to_datetime(cvd_ukb200k.loc[n,'53-0.0'], format='%Y-%m-%d')
                years.append(year/np.timedelta64(1,'Y'))
    elif variable.endswith('age'):
        for n, x in enumerate(cvd_ukb200k[variable]):
            if x is np.nan:
                years.append(x)
            else:
                year = x-cvd_ukb200k.loc[n,'21022-0.0']
                years.append(year)

    var = '_'.join(variable.split('_')[:-1])
    print(var)
    cvd_ukb200k[var+'_years'] = years


  2%|█                                           | 1/42 [00:00<00:08,  4.85it/s]

A00


  5%|██                                          | 2/42 [02:22<55:47, 83.68s/it]

A00_death


  7%|███▏                                        | 3/42 [02:22<29:37, 45.57s/it]

A01


 10%|████▏                                       | 4/42 [04:43<52:49, 83.40s/it]

A01_death


 12%|█████▏                                      | 5/42 [04:44<32:56, 53.42s/it]

A02


 14%|██████▎                                     | 6/42 [04:44<21:11, 35.33s/it]

A02_death
A03


 19%|████████▍                                   | 8/42 [07:06<34:45, 61.34s/it]

A03_death


 21%|█████████▍                                  | 9/42 [07:08<23:31, 42.76s/it]

A04


 24%|██████████▏                                | 10/42 [07:08<15:48, 29.63s/it]

A04_death


 26%|███████████▎                               | 11/42 [07:08<10:39, 20.63s/it]

A05


 29%|████████████▎                              | 12/42 [09:29<28:37, 57.25s/it]

A05_death


 31%|█████████████▎                             | 13/42 [09:29<19:20, 40.02s/it]

A15_A19


 33%|██████████████▎                            | 14/42 [09:30<13:03, 27.99s/it]

A15_A19_death


 36%|███████████████▎                           | 15/42 [09:30<08:49, 19.63s/it]

A20_A28


 38%|████████████████▍                          | 16/42 [09:30<05:58, 13.78s/it]

A20_A28_death


 40%|█████████████████▍                         | 17/42 [09:36<04:45, 11.41s/it]

A30_A49


 43%|██████████████████▍                        | 18/42 [09:37<03:16,  8.19s/it]

A30_A49_death


 45%|███████████████████▍                       | 19/42 [09:37<02:13,  5.80s/it]

A51


 48%|████████████████████▍                      | 20/42 [11:59<17:05, 46.60s/it]

A51_death


 50%|█████████████████████▌                     | 21/42 [11:59<11:26, 32.68s/it]

A52


 52%|██████████████████████▌                    | 22/42 [11:59<07:38, 22.94s/it]

A52_death
A53


 57%|████████████████████████▌                  | 24/42 [14:20<16:05, 53.62s/it]

A53_death


 60%|█████████████████████████▌                 | 25/42 [14:21<10:39, 37.60s/it]

A54


 62%|██████████████████████████▌                | 26/42 [16:42<18:19, 68.71s/it]

A54_death


 64%|███████████████████████████▋               | 27/42 [16:42<12:02, 48.17s/it]

A56


 67%|████████████████████████████▋              | 28/42 [19:04<17:47, 76.23s/it]

A56_death


 69%|█████████████████████████████▋             | 29/42 [19:04<11:34, 53.44s/it]

A65_A69


 71%|██████████████████████████████▋            | 30/42 [21:25<15:57, 79.80s/it]

A65_A69_death


 74%|███████████████████████████████▋           | 31/42 [21:26<10:15, 55.93s/it]

A70_A74


 76%|████████████████████████████████▊          | 32/42 [23:47<13:34, 81.45s/it]

A70_A74_death


 79%|█████████████████████████████████▊         | 33/42 [23:47<08:33, 57.08s/it]

A75_A79


 81%|██████████████████████████████████▊        | 34/42 [26:08<10:58, 82.35s/it]

A75_A79_death


 83%|███████████████████████████████████▊       | 35/42 [26:08<06:43, 57.71s/it]

B90


 86%|████████████████████████████████████▊      | 36/42 [26:09<04:02, 40.46s/it]

B90_death


 88%|█████████████████████████████████████▉     | 37/42 [26:12<02:26, 29.26s/it]

B95


 90%|██████████████████████████████████████▉    | 38/42 [28:34<04:12, 63.15s/it]

B95_death


 93%|███████████████████████████████████████▉   | 39/42 [28:40<02:18, 46.15s/it]

B96


 95%|████████████████████████████████████████▉  | 40/42 [31:02<02:29, 74.83s/it]

B96_death


 98%|█████████████████████████████████████████▉ | 41/42 [31:03<00:52, 52.75s/it]

B98


100%|███████████████████████████████████████████| 42/42 [33:26<00:00, 47.77s/it]

B98_death





In [112]:
cvd_var = ['A00', 'A01', 'A02', 'A03', 'A04',
           'A05', 'A15_A19', 'A20_A28', 'A30_A49',
           'A51', 'A52', 'A53', 'A54', 'A56',
           'A65_A69', 'A70_A74', 'A75_A79',
           'B90', 'B95', 'B96', 'B98']

for var in cvd_var:
    yeears = [x for x in cvd_ukb200k.columns if (x.startswith(var)) and (x.endswith('years'))]
    cvd_ukb200k['var_'+var+'_years'] = cvd_ukb200k.loc[:,yeears].min(axis=1)
    cvd_ukb200k['var_'+var+'_var'] = np.where(cvd_ukb200k['var_'+var+'_years'].notnull(), 1, 0)
    cvd_ukb200k['var_'+var+'_post'] = np.where(cvd_ukb200k['var_'+var+'_years'] > 0, 1, 0)
    cvd_ukb200k['var_'+var+'_pre'] = np.where(cvd_ukb200k['var_'+var+'_years'] < 0, 1, 0)

In [114]:
df1 = cvd_ukb200k[['eid']+[x for x in cvd_ukb200k.columns if (x.startswith('var'))]]

In [115]:
df1.to_csv('Infection_ukb450k_670124_bacterial_infection.txt.gz', sep="\t", index=False, compression='gzip')