# ARRANEGE CANCER DATA 

### 1. Get and arrange data
### 2. Define functions to generate variables
### 3. Generate variables including ICD10 or ICD9 codes
### 4. Save output

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import StringIO
import seaborn as sns
import scipy.stats as stats
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 40)

import matplotlib as mpl
mpl.rcParams['figure.dpi']= 300

## 1. Get and arrange data

In [3]:
### OPEN CANCER DATA generated following the steps from the README.md
### Data-fields are indicated in the paper

# Read data
filename = "/workspace/datasets/ukbiobank_ch/ukb_data_670124/clinical_data_ukb670124/cancer_data_siddhartha_670124.txt"
lines = ''.join([line for line in open(filename, 'rt')])
lines

# Transform to df
cancer_ukb200k = pd.read_csv(StringIO(lines), sep= '\t')
len(cancer_ukb200k)

  cancer_df = pd.read_csv(StringIO(lines), sep= '\t')


502394

In [14]:
# All variable codes

colnoms = []
for x in cancer_ukb200k.columns.tolist():
    colnoms.append(x.split('-')[0])
set(colnoms)

{'20001',
 '20002',
 '20007',
 '20009',
 '21003',
 '21022',
 '31',
 '40000',
 '40001',
 '40002',
 '40005',
 '40006',
 '40007',
 '40008',
 '40009',
 '40011',
 '40012',
 '40013',
 '53',
 'eid'}

In [15]:
# Function to obtain the index from columns

def columna(data, name):
    a = []
    for n, x in enumerate(data.columns.tolist()):
        if str(x).startswith(str(name)):
            a.append([n , x])
    return(a)

In [16]:
# Max death
cancer_ukb200k[cancer_ukb200k['40000-0.0'].notnull()]['40000-0.0'].max()

'2021-11-12'

In [17]:
# Function to obtain the index from columns EXAMPLE

columna(cancer_ukb200k, '40005')

[[363, '40005-0.0'],
 [364, '40005-1.0'],
 [365, '40005-2.0'],
 [366, '40005-3.0'],
 [367, '40005-4.0'],
 [368, '40005-5.0'],
 [369, '40005-6.0'],
 [370, '40005-7.0'],
 [371, '40005-8.0'],
 [372, '40005-9.0'],
 [373, '40005-10.0'],
 [374, '40005-11.0'],
 [375, '40005-12.0'],
 [376, '40005-13.0'],
 [377, '40005-14.0'],
 [378, '40005-15.0'],
 [379, '40005-16.0'],
 [380, '40005-17.0'],
 [381, '40005-18.0'],
 [382, '40005-19.0'],
 [383, '40005-20.0'],
 [384, '40005-21.0']]

In [18]:
# Max self reported is 2022.5
# Max SUMMARY DIAGNOSIS (40005): 2021-06-25

from math import nan, isnan
#The rest of variables:
disease_dates = list(range(363,385))
max_dates=[]
for x in cancer_ukb200k.columns[disease_dates].tolist():
    max_dates.append(cancer_ukb200k[cancer_ukb200k[x].notnull()][x].max())
max(max_dates)

'2021-06-25'

## 2. Define functions to generate variables

In [20]:
# Indicate the different cancer types to generate
cancer_var = ['HN', 'MM', 'MPN', 'MDS', 'CMML', 'AML', 'LL', 'N-HN', 
           'Breast', 'Prostate', 'Lung', 'Colorectal', 'Small-intestine', 
           'Melanoma', 'Lymphoma', 'Myeloma', 'Kidney', 'Head-neck',
           'Brain', 'Bladder', 'Pancreas', 'Uterine', 'Oesophageal', 'Ovarian',
           'Stomach', 'Liver', 'Thyroid', 'Biliari', 'Cervical', 'Sarcoma', 'Testicular']

In [14]:
variables = []
for x in ['40005','40006','40013' ]:
    a = columna(cancer_ukb200k, x)
    variables = variables+[i[1] for i in a]
variables

['40005-0.0',
 '40005-1.0',
 '40005-2.0',
 '40005-3.0',
 '40005-4.0',
 '40005-5.0',
 '40005-6.0',
 '40005-7.0',
 '40005-8.0',
 '40005-9.0',
 '40005-10.0',
 '40005-11.0',
 '40005-12.0',
 '40005-13.0',
 '40005-14.0',
 '40005-15.0',
 '40005-16.0',
 '40005-17.0',
 '40005-18.0',
 '40005-19.0',
 '40005-20.0',
 '40005-21.0',
 '40006-0.0',
 '40006-1.0',
 '40006-2.0',
 '40006-3.0',
 '40006-4.0',
 '40006-5.0',
 '40006-6.0',
 '40006-7.0',
 '40006-8.0',
 '40006-9.0',
 '40006-10.0',
 '40006-11.0',
 '40006-12.0',
 '40006-13.0',
 '40006-14.0',
 '40006-15.0',
 '40006-16.0',
 '40006-17.0',
 '40006-18.0',
 '40006-19.0',
 '40006-20.0',
 '40006-21.0',
 '40013-0.0',
 '40013-1.0',
 '40013-2.0',
 '40013-3.0',
 '40013-4.0',
 '40013-5.0',
 '40013-6.0',
 '40013-7.0',
 '40013-8.0',
 '40013-9.0',
 '40013-10.0',
 '40013-11.0',
 '40013-12.0',
 '40013-13.0',
 '40013-14.0']

In [27]:
##### 
##### SUMMARY DIAGNOSIS   
#####

# 40005 [363:385]  Date of first cancer (22)
# 40006 [385:407]  Cancer - ICD10 (22)
# 40013 [476:491]  Cancer - ICD9 (15)

# Codes that indicates no information
#1900-01-01 represents "Code has no event date"
#1901-01-01 represents "Code has event date before participant's date of birth"
#1902-02-02 represents "Code has event date matching participant's date of birth"
#1903-03-03 represents "Code has event date after participant's date of birth same calendar year as date of birth"
#2037-07-07 represents "Code has event date in the future and is presumed to be a place-holder or other system default"
remove_dates = ['1900-01-01', '1901-01-01', '1902-02-02', '1903-03-03', '2037-07-07']

def Diagnose_variable(patient):
    # define column
    column_diseases = list(range(385,407)) + list(range(476,491))
    column_dates = list(range(363,385)) + list(range(363,378))
    # select columns
    diseases = patient[column_diseases]
    dates = patient[column_dates]
    
    Hypertension = [i for i in diseases if str(i).startswith(icd_10_9)]
    i_Hypertension_disease = [i for i,j in enumerate(diseases) if str(j).startswith(icd_10_9)]  
    dates_hypertension_disease = [dates[index] for index in i_Hypertension_disease]
    if len(dates_hypertension_disease) > 0:
        
        for i in range(len(dates_hypertension_disease)):
            if dates_hypertension_disease[i] in remove_dates:
                dates_hypertension_disease[i] = 'nan'
        
        mindate = min(dates_hypertension_disease)
        i_minage = [i for i,j in enumerate(dates_hypertension_disease) if str(j).startswith(mindate)]  
        Hypertens = [str(Hypertension[index]) for index in i_minage]
        
        if mindate == 'nan':
            return np.nan
        else:
            return ', '.join(set(Hypertens))
        
    else:
        return np.nan
    
def Diagnose_date(patient):
    
    # define column
    column_diseases = list(range(385,407)) + list(range(476,491))
    column_dates = list(range(363,385)) + list(range(363,378))
    # select columns
    diseases = patient[column_diseases]
    dates = patient[column_dates]
    
    Hypertension = [i for i in diseases if str(i).startswith(icd_10_9)]
    i_Hypertension_disease = [i for i,j in enumerate(diseases) if str(j).startswith(icd_10_9)]  
    dates_hypertension_disease = [dates[index] for index in i_Hypertension_disease]
    if len(dates_hypertension_disease) > 0:
        
        for i in range(len(dates_hypertension_disease)):
            if dates_hypertension_disease[i] in remove_dates:
                dates_hypertension_disease[i] = 'nan'
        
        mindate = min(dates_hypertension_disease)
        if mindate == 'nan':
            return np.nan
        else:
            return mindate
    else:
        return np.nan

In [28]:
##### 
##### SELF RECORDS CANCER
#####

# 20001 [6:30]  Cancer illnes self-reported (24)
# 20007 [166:190]  Age

def SRC_variable(patient):
    
    # define columns
    column_diseases = list(range(6,30))
    column_dates = list(range(166,190))
    # select columns
    diseases = patient[column_diseases]
    dates = patient[column_dates]
    
    Hypertension = [i for i in diseases if str(i).startswith(srcancer)]
    i_Hypertension_disease = [i for i,j in enumerate(diseases) if str(j).startswith(srcancer)]  
    dates_hypertension_disease = [dates[index] for index in i_Hypertension_disease]
    if len(dates_hypertension_disease) > 0:
        
        #To remove dates with no age return nan
        for i in range(len(dates_hypertension_disease)):
            if (dates_hypertension_disease[i] == -1) or (dates_hypertension_disease[i] == -3):
                dates_hypertension_disease[i] = np.nan
                
        mindate = np.nanmin(dates_hypertension_disease)
        i_minage = [i for i,j in enumerate(dates_hypertension_disease) if str(j).startswith(str(mindate))]  
        Hypertens = [str(Hypertension[index]) for index in i_minage]
        return ', '.join(set(Hypertens))
        
    else:
        return np.nan
    
def SRC_age(patient):
       
    # define columns
    column_diseases = list(range(6,30))
    column_dates = list(range(166,190))
    # select columns
    diseases = patient[column_diseases]
    dates = patient[column_dates]
    
    Hypertension = [i for i in diseases if str(i).startswith(srcancer)]
    i_Hypertension_disease = [i for i,j in enumerate(diseases) if str(j).startswith(srcancer)]  
    dates_hypertension_disease = [dates[index] for index in i_Hypertension_disease]
    if len(dates_hypertension_disease) > 0:
        
        #To remove dates with no age return nan
        for i in range(len(dates_hypertension_disease)):
            if (dates_hypertension_disease[i] == -1) or (dates_hypertension_disease[i] == -3):
                dates_hypertension_disease[i] = np.nan
                
        mindate = np.nanmin(dates_hypertension_disease)
        return mindate
    else:
        return np.nan

In [29]:
##### 
##### SELF RECORDS NON-CANCER 
#####

# 20002 [30:166] Diagnosis, self-reported (136)
# 20009 [190:326] Age non-cancer (136)

def SRD_variable(patient):
    
    # define columns
    column_diseases = list(range(30,166))
    column_dates = list(range(190,326))
    # select columns
    diseases = patient[column_diseases]
    dates = patient[column_dates]
    
    Hypertension = [i for i in diseases if str(i).startswith(srdisease)]
    i_Hypertension_disease = [i for i,j in enumerate(diseases) if str(j).startswith(srdisease)]  
    dates_hypertension_disease = [dates[index] for index in i_Hypertension_disease]
    if len(dates_hypertension_disease) > 0:
        
        #To remove dates with no age return nan
        for i in range(len(dates_hypertension_disease)):
            if (dates_hypertension_disease[i] == -1) or (dates_hypertension_disease[i] == -3):
                dates_hypertension_disease[i] = np.nan
                
        mindate = np.nanmin(dates_hypertension_disease)
        i_minage = [i for i,j in enumerate(dates_hypertension_disease) if str(j).startswith(str(mindate))]  
        Hypertens = [str(Hypertension[index]) for index in i_minage]
        return ', '.join(set(Hypertens))
        
    else:
        return np.nan
    
def SRD_age(patient):
       
    # define columns
    column_diseases = list(range(30,166))
    column_dates = list(range(190,326))
    # select columns
    diseases = patient[column_diseases]
    dates = patient[column_dates]
    
    Hypertension = [i for i in diseases if str(i).startswith(srdisease)]
    i_Hypertension_disease = [i for i,j in enumerate(diseases) if str(j).startswith(srdisease)]  
    dates_hypertension_disease = [dates[index] for index in i_Hypertension_disease]
    if len(dates_hypertension_disease) > 0:
        
        #To remove dates with no age return nan
        for i in range(len(dates_hypertension_disease)):
            if (dates_hypertension_disease[i] == -1) or (dates_hypertension_disease[i] == -3):
                dates_hypertension_disease[i] = np.nan
                
        mindate = np.nanmin(dates_hypertension_disease)
        return mindate
    else:
        return np.nan

In [30]:
#Death record

#40001 [333:335] Underlaying cause of death
#40002 [335:363] Contributory cause of death
#40000 [331:333] Date of death
#40007 [397:399] Age of death

# define columns

def Death_variable(patient):
    column_diseases = list(range(333,363))
    
    # select columns
    cause = [i for i in patient[column_diseases] if str(i).startswith(icd_10_9)]
    if len(cause) > 0:
        return ', '.join(set(cause))  
    else:
        return np.nan
    
    
def Death_date(patient):
    column_diseases = list(range(333,363))
        
    # select columns
    cause = [i for i in patient[column_diseases] if str(i).startswith(icd_10_9)]
    if len(cause) > 0:
        return patient['40000-0.0']
    else:
        return np.nan

## 3. Generate variables including ICD10 or ICD9 codes

In [31]:
# Define Hematological neoplasms

icd_10_9 = ('C90', 'C91', 'C92', 'C93', 'C94', 'C95', 'D45', 'D46', 'D47',
           '203', '204', '205', '206', '207', '208', '2384', '2385', '2386', '2387')
srcancer = ('1048', '1051', '1055', '1056', '1058', '1074')
srdisease = ()


cancer_ukb200k['HN'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['HN_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['HN_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['HN_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['HN_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['HN_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['HN_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['HN_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [32]:
#Some checkings
a = cancer_ukb200k[['HN', 'HN_date']]
a[a['HN'].notnull()]

Unnamed: 0,HN,HN_date
141,C900,2019-12-09
154,C911,2010-04-28
186,C911,2013-06-04
227,D471,2015-07-30
334,C919,2007-02-23
...,...,...
502105,C911,2000-11-22
502184,C930,2013-07-15
502208,C900,2014-05-21
502236,D473,2018-07-14


In [33]:
# Define Myeloid malignances
icd_10_9 = ('C920', 'C922', 'C923', 'C924', 'C925', 'C926', 'C927', 'C928',
            'C929', 'C930', 'C931', 'C932', 'C937', 'C939', 'C940', 'C942',
            'C943', 'C944', 'C945', 'C946', 'C962', 'D45', 'D46', 'D470',
            'D471', 'D473', 'D474', 'D475',
            '2050', '2052', '2053', '2058',  '2059',  '2060', '2062', '2070',
            '2072', '2384', '2385', '2387')
srcancer = ('1051', '1074')
srdisease = ('1449', '1438')


cancer_ukb200k['MM'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['MM_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['MM_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['MM_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

cancer_ukb200k['MM_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
cancer_ukb200k['MM_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['MM_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['MM_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)

  mindate = np.nanmin(dates_hypertension_disease)
  mindate = np.nanmin(dates_hypertension_disease)


In [34]:
# Define Myeloproliferative neoplasms

icd_10_9 = ('D45', 'D470', 'D471', 'D473', 'D474', '2384', '2385')
srcancer = ()
srdisease = ('1449', '1438')


cancer_ukb200k['MPN'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['MPN_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

# cancer_ukb200k['MPN_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
# cancer_ukb200k['MPN_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

cancer_ukb200k['MPN_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
cancer_ukb200k['MPN_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['MPN_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['MPN_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)

  mindate = np.nanmin(dates_hypertension_disease)
  mindate = np.nanmin(dates_hypertension_disease)


In [35]:
# Define Myelodysplastic syndromes

icd_10_9 = ('C946', 'D46', '2387')
srcancer = ('1051')
srdisease = ()


cancer_ukb200k['MDS'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['MDS_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['MDS_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['MDS_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['MDS_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['MDS_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['MDS_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['MDS_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [36]:
# Define Chronic myelomonocytic leukemia

icd_10_9 = ('C931', '2061')
srcancer = ()
srdisease = ()


cancer_ukb200k['CMML'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['CMML_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

# cancer_ukb200k['CMML_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
# cancer_ukb200k['CMML_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['CMML_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['CMML_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['CMML_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['CMML_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [37]:
# Define Acute myeloid leukemias

icd_10_9 = ('C920', 'C922', 'C923', 'C924', 'C925', 'C926', 'C927', 'C928', 'C929',
            'C930', 'C932', 'C940', 'C942',
           '2050', '2052', '2053', '2058', '2059',  '2060', '2062', '2070', '2072')
srcancer = ('1074')
srdisease = ()


cancer_ukb200k['AML'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['AML_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['AML_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['AML_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['AML_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['AML_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['AML_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['AML_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [38]:
# Define Lymphoid leukemias

icd_10_9 = ('C91', '204')
srcancer = ('1055')
srdisease = ()


cancer_ukb200k['LL'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['LL_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['LL_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['LL_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['LL_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['LL_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['LL_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['LL_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [39]:
# Define Other malign neoplams

icd_10_9 = ('C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C81', 'C82', 'C83', 'C84',
            'C85', 'C86', 'C87', 'C88', 'C89', 'C97',
           '14', '15', '16', '17', '18', '19', '200', '201', '202', '209')
srcancer = ('100', '101', '102', '103', '1040', '1041', '1042', '1043', '1044', '1045',
            '1046', '1047', '1050', '1052', '1053', '1059', '106', '1070', '1071',
            '1072', '1073', '1075', '1076', '1077', '1078', '1079', '1080', '1081',
            '1082', '1083', '1084', '1085', '1086', '1087', '1088')
srdisease = ()


cancer_ukb200k['N-HN'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['N-HN_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['N-HN_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['N-HN_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['N-HN_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['N-HN_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['N-HN_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['N-HN_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)

  mindate = np.nanmin(dates_hypertension_disease)
  mindate = np.nanmin(dates_hypertension_disease)


In [40]:
# Define Breast

icd_10_9 = ('C50', 'Z853', '174', '175', 'V103')
srcancer = ('1022')
srdisease = ()


cancer_ukb200k['Breast'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Breast_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Breast_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Breast_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Breast_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Breast_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Breast_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Breast_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)

In [41]:
# Define Prostate

icd_10_9 = ('C61', '185')
srcancer = ('1044')
srdisease = ()


cancer_ukb200k['Prostate'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Prostate_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Prostate_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Prostate_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Prostate_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Prostate_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Prostate_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Prostate_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)

  mindate = np.nanmin(dates_hypertension_disease)
  mindate = np.nanmin(dates_hypertension_disease)


In [42]:
# Define Lung

icd_10_9 = ('C33', 'C34', 'C399', 'Z851', '162', 'V101')
srcancer = ('1001', '1027', '1028', '1080')
srdisease = ()


cancer_ukb200k['Lung'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Lung_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Lung_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Lung_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Lung_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Lung_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Lung_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Lung_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


In [43]:
# Define Colorectal

icd_10_9 = ('C18', 'C19', 'C20', 'C21', 'C21', '153', '154')
srcancer = ('1020', '1021', '1022', '1023')
srdisease = ()


cancer_ukb200k['Colorectal'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Colorectal_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Colorectal_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Colorectal_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Colorectal_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Colorectal_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Colorectal_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Colorectal_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


  mindate = np.nanmin(dates_hypertension_disease)
  mindate = np.nanmin(dates_hypertension_disease)


In [44]:
# Define Small-intestine

icd_10_9 = ('C17', '152')
srcancer = ('1019')
srdisease = ()


cancer_ukb200k['Small-intestine'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Small-intestine_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Small-intestine_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Small-intestine_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Small-intestine_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Small-intestine_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Small-intestine_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Small-intestine_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


In [45]:
# Define Melanoma

icd_10_9 = ('C43', '172')
srcancer = ('1059')
srdisease = ()


cancer_ukb200k['Melanoma'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Melanoma_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Melanoma_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Melanoma_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Melanoma_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Melanoma_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Melanoma_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Melanoma_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


  mindate = np.nanmin(dates_hypertension_disease)
  mindate = np.nanmin(dates_hypertension_disease)


In [46]:
# Define Lymphoma

icd_10_9 = ('C81', 'C82', 'C83', 'C84', 'C85', 'C86', 'C87', 'C88', '200', '201', '202')
srcancer = ('1047', '1052', '1053')
srdisease = ()


cancer_ukb200k['Lymphoma'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Lymphoma_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Lymphoma_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Lymphoma_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Lymphoma_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Lymphoma_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Lymphoma_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Lymphoma_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


  mindate = np.nanmin(dates_hypertension_disease)
  mindate = np.nanmin(dates_hypertension_disease)


In [47]:
# Define Myeloma

icd_10_9 = ('C900', 'C901', '2030', '2031')
srcancer = ('1050')
srdisease = ()


cancer_ukb200k['Myeloma'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Myeloma_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Myeloma_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Myeloma_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Myeloma_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Myeloma_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Myeloma_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Myeloma_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


In [48]:
# Define Kidney

icd_10_9 = ('C64', 'Z855', '1890', 'V105')
srcancer = ('1034')
srdisease = ()


cancer_ukb200k['Kidney'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Kidney_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Kidney_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Kidney_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Kidney_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Kidney_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Kidney_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Kidney_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


In [49]:
# Define Head-neck

icd_10_9 = ('C0', 'C10', 'C11', 'C12', 'C13', 'C14', 'C30', 'C31', 'C32',
           '14', '160', '161')
srcancer = ('1006', '1007', '1009', '1004', '1010', '1011', '1012', '1077',
            '1078', '1079', '1005', '1015', '1016')
srdisease = ()


cancer_ukb200k['Head-neck'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Head-neck_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Head-neck_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Head-neck_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Head-neck_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Head-neck_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Head-neck_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Head-neck_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


  mindate = np.nanmin(dates_hypertension_disease)
  mindate = np.nanmin(dates_hypertension_disease)


In [50]:
# Define Brain

icd_10_9 = ('C70', 'C71', 'C720', 'C723', '191', '1920', '1921', '1922', '1923')
srcancer = ('1031', '1032', '1033')
srdisease = ()


cancer_ukb200k['Brain'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Brain_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Brain_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Brain_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Brain_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Brain_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Brain_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Brain_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


  mindate = np.nanmin(dates_hypertension_disease)
  mindate = np.nanmin(dates_hypertension_disease)


In [51]:
# Define Bladder

icd_10_9 = ('C65', 'C66', 'C67', 'C723', '188', '1891', '1892')
srcancer = ('1035')
srdisease = ()


cancer_ukb200k['Bladder'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Bladder_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Bladder_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Bladder_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Bladder_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Bladder_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Bladder_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Bladder_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


In [52]:
# Define Pancreas

icd_10_9 = ('C25', '157')
srcancer = ('1026')
srdisease = ()


cancer_ukb200k['Pancreas'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Pancreas_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Pancreas_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Pancreas_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Pancreas_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Pancreas_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Pancreas_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Pancreas_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


In [53]:
# Define Uterine

icd_10_9 = ('C57', 'C55', '179', '182')
srcancer = ('1040')
srdisease = ()


cancer_ukb200k['Uterine'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Uterine_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Uterine_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Uterine_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Uterine_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Uterine_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Uterine_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Uterine_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


  mindate = np.nanmin(dates_hypertension_disease)
  mindate = np.nanmin(dates_hypertension_disease)


In [54]:
# Define Oesophageal

icd_10_9 = ('C15', '150')
srcancer = ('1017')
srdisease = ()


cancer_ukb200k['Oesophageal'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Oesophageal_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Oesophageal_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Oesophageal_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Oesophageal_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Oesophageal_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Oesophageal_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Oesophageal_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


  mindate = np.nanmin(dates_hypertension_disease)
  mindate = np.nanmin(dates_hypertension_disease)


In [55]:
# Define Ovarian

icd_10_9 = ('C56', 'C570', 'C574', '1830', '1832', '1838', '1839')
srcancer = ('1039', '1087')
srdisease = ()


cancer_ukb200k['Ovarian'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Ovarian_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Ovarian_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Ovarian_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Ovarian_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Ovarian_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Ovarian_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Ovarian_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


  mindate = np.nanmin(dates_hypertension_disease)
  mindate = np.nanmin(dates_hypertension_disease)


In [56]:
# Define Stomach

icd_10_9 = ('C16', 'Z8502', '151')
srcancer = ('1018')
srdisease = ()


cancer_ukb200k['Stomach'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Stomach_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Stomach_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Stomach_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Stomach_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Stomach_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Stomach_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Stomach_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


In [57]:
# Define Liver

icd_10_9 = ('C220', '1550')
srcancer = ('1024')
srdisease = ()


cancer_ukb200k['Liver'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Liver_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Liver_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Liver_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Liver_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Liver_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Liver_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Liver_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


In [58]:
# Define Thyroid

icd_10_9 = ('C73', '193')
srcancer = ('1065')
srdisease = ()


cancer_ukb200k['Thyroid'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Thyroid_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Thyroid_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Thyroid_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Thyroid_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Thyroid_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Thyroid_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Thyroid_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


  mindate = np.nanmin(dates_hypertension_disease)
  mindate = np.nanmin(dates_hypertension_disease)


In [59]:
# Define Biliari

icd_10_9 = ('C23', 'C24', 'C221', '1551', '1560')
srcancer = ('1025')
srdisease = ()


cancer_ukb200k['Biliari'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Biliari_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Biliari_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Biliari_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Biliari_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Biliari_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Biliari_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Biliari_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


In [60]:
# Define Cervical

icd_10_9 = ('C53', '180')
srcancer = ('1041')
srdisease = ()


cancer_ukb200k['Cervical'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Cervical_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Cervical_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Cervical_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Cervical_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Cervical_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Cervical_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Cervical_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


  mindate = np.nanmin(dates_hypertension_disease)
  mindate = np.nanmin(dates_hypertension_disease)


In [61]:
# Define Sarcoma

icd_10_9 = ('C49', '171')
srcancer = ('1068')
srdisease = ()


cancer_ukb200k['Sarcoma'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Sarcoma_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Sarcoma_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Sarcoma_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Sarcoma_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Sarcoma_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Sarcoma_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Sarcoma_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


  mindate = np.nanmin(dates_hypertension_disease)
  mindate = np.nanmin(dates_hypertension_disease)


In [62]:
# Define Testicular

icd_10_9 = ('C62', '186')
srcancer = ('1045')
srdisease = ()


cancer_ukb200k['Testicular'] = cancer_ukb200k.apply(lambda x: Diagnose_variable(x), axis=1)
cancer_ukb200k['Testicular_date'] = cancer_ukb200k.apply(lambda x: Diagnose_date(x), axis=1)

cancer_ukb200k['Testicular_SRC'] = cancer_ukb200k.apply(lambda x: SRC_variable(x), axis=1)
cancer_ukb200k['Testicular_SRC_age'] = cancer_ukb200k.apply(lambda x: SRC_age(x), axis=1)

# cancer_ukb200k['Testicular_SRD'] = cancer_ukb200k.apply(lambda x: SRD_variable(x), axis=1)
# cancer_ukb200k['Testicular_SRD_age'] = cancer_ukb200k.apply(lambda x: SRD_age(x), axis=1)

cancer_ukb200k['Testicular_death'] = cancer_ukb200k.apply(lambda x: Death_variable(x), axis=1)
cancer_ukb200k['Testicular_death_date'] = cancer_ukb200k.apply(lambda x: Death_date(x), axis=1)


In [65]:
cancer_ukb200k.reset_index(drop=True, inplace=True)

In [66]:
cancer_var = ('HN', 'MM', 'MPN', 'MDS', 'CMML', 'AML', 'LL', 'N-HN', 
           'Breast', 'Prostate', 'Lung', 'Colorectal', 'Small-intestine', 
           'Melanoma', 'Lymphoma', 'Myeloma', 'Kidney', 'Head-neck',
           'Brain', 'Bladder', 'Pancreas', 'Uterine', 'Oesophageal', 'Ovarian',
           'Stomach', 'Liver', 'Thyroid', 'Biliari', 'Cervical', 'Sarcoma', 'Testicular')

variables = [var for var in cancer_ukb200k.columns.to_list() if var.endswith(('age', 'date'))]
variables

['HN_date',
 'HN_SRC_age',
 'HN_death_date',
 'MM_date',
 'MM_SRC_age',
 'MM_SRD_age',
 'MM_death_date',
 'MPN_date',
 'MPN_SRD_age',
 'MPN_death_date',
 'MDS_date',
 'MDS_SRC_age',
 'MDS_death_date',
 'CMML_date',
 'CMML_death_date',
 'AML_date',
 'AML_SRC_age',
 'AML_death_date',
 'LL_date',
 'LL_SRC_age',
 'LL_death_date',
 'N-HN_date',
 'N-HN_SRC_age',
 'N-HN_death_date',
 'Breast_date',
 'Breast_SRC_age',
 'Breast_death_date',
 'Prostate_date',
 'Prostate_SRC_age',
 'Prostate_death_date',
 'Lung_date',
 'Lung_SRC_age',
 'Lung_death_date',
 'Colorectal_date',
 'Colorectal_SRC_age',
 'Colorectal_death_date',
 'Small-intestine_date',
 'Small-intestine_SRC_age',
 'Small-intestine_death_date',
 'Melanoma_date',
 'Melanoma_SRC_age',
 'Melanoma_death_date',
 'Lymphoma_date',
 'Lymphoma_SRC_age',
 'Lymphoma_death_date',
 'Myeloma_date',
 'Myeloma_SRC_age',
 'Myeloma_death_date',
 'Kidney_date',
 'Kidney_SRC_age',
 'Kidney_death_date',
 'Head-neck_date',
 'Head-neck_SRC_age',
 'Head-neck_d

In [73]:
from tqdm import tqdm

for variable in tqdm(variables):
    years = []
    if variable.endswith('date'):
        for n, x in enumerate(cancer_ukb200k[variable]):
            if x is np.nan:
                years.append(x)
            else:
                year = pd.to_datetime(x, format='%Y-%m-%d')-pd.to_datetime(cancer_ukb200k.loc[n,'53-0.0'], format='%Y-%m-%d')
                years.append(year/np.timedelta64(1,'Y'))
    elif variable.endswith('age'):
        for n, x in enumerate(cancer_ukb200k[variable]):
            if x is np.nan:
                years.append(x)
            else:
                year = x-cancer_ukb200k.loc[n,'21022-0.0']
                years.append(year)

    var = '_'.join(variable.split('_')[:-1])
    print(var)
    cancer_ukb200k[var+'_years'] = years


  1%|▍                                           | 1/93 [00:01<02:56,  1.92s/it]

HN


  2%|▉                                           | 2/93 [00:06<05:08,  3.39s/it]

HN_SRC


  3%|█▍                                          | 3/93 [00:07<03:15,  2.17s/it]

HN_death


  4%|█▉                                          | 4/93 [00:07<02:24,  1.62s/it]

MM


  5%|██▎                                         | 5/93 [00:12<03:52,  2.64s/it]

MM_SRC


  6%|██▊                                         | 6/93 [00:16<04:43,  3.26s/it]

MM_SRD


  8%|███▎                                        | 7/93 [00:17<03:21,  2.35s/it]

MM_death


  9%|███▊                                        | 8/93 [00:17<02:28,  1.75s/it]

MPN


 10%|████▎                                       | 9/93 [00:22<03:36,  2.58s/it]

MPN_SRD


 11%|████▌                                      | 10/93 [00:22<02:34,  1.87s/it]

MPN_death


 12%|█████                                      | 11/93 [00:22<01:54,  1.40s/it]

MDS


 13%|█████▌                                     | 12/93 [00:27<03:08,  2.33s/it]

MDS_SRC


 14%|██████                                     | 13/93 [00:27<02:16,  1.70s/it]

MDS_death


 15%|██████▍                                    | 14/93 [00:27<01:39,  1.26s/it]

CMML


 16%|██████▉                                    | 15/93 [00:27<01:13,  1.06it/s]

CMML_death


 17%|███████▍                                   | 16/93 [00:28<01:00,  1.27it/s]

AML


 18%|███████▊                                   | 17/93 [00:32<02:23,  1.89s/it]

AML_SRC


 19%|████████▎                                  | 18/93 [00:33<01:47,  1.43s/it]

AML_death


 20%|████████▊                                  | 19/93 [00:33<01:27,  1.18s/it]

LL


 22%|█████████▏                                 | 20/93 [00:38<02:38,  2.17s/it]

LL_SRC


 23%|█████████▋                                 | 21/93 [00:38<01:55,  1.61s/it]

LL_death


 24%|██████████▏                                | 22/93 [01:11<13:04, 11.05s/it]

N-HN


 25%|██████████▋                                | 23/93 [01:15<10:34,  9.07s/it]

N-HN_SRC


 26%|███████████                                | 24/93 [01:22<09:25,  8.20s/it]

N-HN_death


 27%|███████████▌                               | 25/93 [01:28<08:41,  7.66s/it]

Breast


 28%|████████████                               | 26/93 [01:33<07:28,  6.70s/it]

Breast_SRC


 29%|████████████▍                              | 27/93 [01:33<05:24,  4.92s/it]

Breast_death


 30%|████████████▉                              | 28/93 [01:38<05:22,  4.97s/it]

Prostate


 31%|█████████████▍                             | 29/93 [01:43<05:07,  4.81s/it]

Prostate_SRC


 32%|█████████████▊                             | 30/93 [01:44<03:45,  3.57s/it]

Prostate_death


 33%|██████████████▎                            | 31/93 [01:45<03:08,  3.04s/it]

Lung


 34%|██████████████▊                            | 32/93 [01:50<03:30,  3.45s/it]

Lung_SRC


 35%|███████████████▎                           | 33/93 [01:51<02:49,  2.82s/it]

Lung_death


 37%|███████████████▋                           | 34/93 [01:54<02:52,  2.92s/it]

Colorectal


 38%|████████████████▏                          | 35/93 [01:59<03:15,  3.37s/it]

Colorectal_SRC


 39%|████████████████▋                          | 36/93 [02:00<02:29,  2.63s/it]

Colorectal_death


 40%|█████████████████                          | 37/93 [02:00<01:48,  1.94s/it]

Small-intestine


 41%|█████████████████▌                         | 38/93 [02:04<02:27,  2.68s/it]

Small-intestine_SRC


 42%|██████████████████                         | 39/93 [02:05<01:45,  1.95s/it]

Small-intestine_death


 43%|██████████████████▍                        | 40/93 [02:06<01:43,  1.96s/it]

Melanoma


 44%|██████████████████▉                        | 41/93 [02:11<02:19,  2.68s/it]

Melanoma_SRC


 45%|███████████████████▍                       | 42/93 [02:11<01:41,  1.98s/it]

Melanoma_death


 46%|███████████████████▉                       | 43/93 [02:13<01:32,  1.85s/it]

Lymphoma


 47%|████████████████████▎                      | 44/93 [02:17<02:08,  2.62s/it]

Lymphoma_SRC


 48%|████████████████████▊                      | 45/93 [02:18<01:35,  1.99s/it]

Lymphoma_death


 49%|█████████████████████▎                     | 46/93 [02:18<01:13,  1.57s/it]

Myeloma


 51%|█████████████████████▋                     | 47/93 [02:23<01:51,  2.42s/it]

Myeloma_SRC


 52%|██████████████████████▏                    | 48/93 [02:23<01:21,  1.81s/it]

Myeloma_death


 53%|██████████████████████▋                    | 49/93 [02:24<01:08,  1.55s/it]

Kidney


 54%|███████████████████████                    | 50/93 [02:28<01:44,  2.42s/it]

Kidney_SRC


 55%|███████████████████████▌                   | 51/93 [02:29<01:16,  1.82s/it]

Kidney_death


 56%|████████████████████████                   | 52/93 [02:30<01:03,  1.55s/it]

Head-neck


 57%|████████████████████████▌                  | 53/93 [02:34<01:36,  2.41s/it]

Head-neck_SRC


 58%|████████████████████████▉                  | 54/93 [02:35<01:09,  1.79s/it]

Head-neck_death


 59%|█████████████████████████▍                 | 55/93 [02:35<00:54,  1.42s/it]

Brain


 60%|█████████████████████████▉                 | 56/93 [02:40<01:25,  2.31s/it]

Brain_SRC


 61%|██████████████████████████▎                | 57/93 [02:40<01:03,  1.77s/it]

Brain_death


 62%|██████████████████████████▊                | 58/93 [02:41<00:53,  1.52s/it]

Bladder


 63%|███████████████████████████▎               | 59/93 [02:45<01:20,  2.38s/it]

Bladder_SRC


 65%|███████████████████████████▋               | 60/93 [02:46<00:59,  1.79s/it]

Bladder_death


 66%|████████████████████████████▏              | 61/93 [02:46<00:46,  1.46s/it]

Pancreas


 67%|████████████████████████████▋              | 62/93 [02:51<01:12,  2.35s/it]

Pancreas_SRC


 68%|█████████████████████████████▏             | 63/93 [02:52<00:55,  1.86s/it]

Pancreas_death


 69%|█████████████████████████████▌             | 64/93 [02:52<00:40,  1.40s/it]

Uterine


 70%|██████████████████████████████             | 65/93 [02:56<01:04,  2.31s/it]

Uterine_SRC


 71%|██████████████████████████████▌            | 66/93 [02:57<00:45,  1.70s/it]

Uterine_death


 72%|██████████████████████████████▉            | 67/93 [02:57<00:35,  1.38s/it]

Oesophageal


 73%|███████████████████████████████▍           | 68/93 [03:02<00:57,  2.30s/it]

Oesophageal_SRC


 74%|███████████████████████████████▉           | 69/93 [03:02<00:42,  1.78s/it]

Oesophageal_death


 75%|████████████████████████████████▎          | 70/93 [03:03<00:34,  1.50s/it]

Ovarian


 76%|████████████████████████████████▊          | 71/93 [03:08<00:52,  2.37s/it]

Ovarian_SRC


 77%|█████████████████████████████████▎         | 72/93 [03:08<00:37,  1.80s/it]

Ovarian_death


 78%|█████████████████████████████████▊         | 73/93 [03:09<00:28,  1.42s/it]

Stomach


 80%|██████████████████████████████████▏        | 74/93 [03:13<00:43,  2.31s/it]

Stomach_SRC


 81%|██████████████████████████████████▋        | 75/93 [03:13<00:31,  1.74s/it]

Stomach_death


 82%|███████████████████████████████████▏       | 76/93 [03:14<00:22,  1.32s/it]

Liver


 83%|███████████████████████████████████▌       | 77/93 [03:18<00:36,  2.25s/it]

Liver_SRC


 84%|████████████████████████████████████       | 78/93 [03:18<00:25,  1.67s/it]

Liver_death


 85%|████████████████████████████████████▌      | 79/93 [03:19<00:18,  1.33s/it]

Thyroid


 86%|████████████████████████████████████▉      | 80/93 [03:23<00:29,  2.24s/it]

Thyroid_SRC


 87%|█████████████████████████████████████▍     | 81/93 [03:24<00:19,  1.64s/it]

Thyroid_death


 88%|█████████████████████████████████████▉     | 82/93 [03:24<00:14,  1.28s/it]

Biliari


 89%|██████████████████████████████████████▍    | 83/93 [03:28<00:22,  2.21s/it]

Biliari_SRC


 90%|██████████████████████████████████████▊    | 84/93 [03:29<00:14,  1.66s/it]

Biliari_death


 91%|███████████████████████████████████████▎   | 85/93 [03:29<00:10,  1.34s/it]

Cervical


 92%|███████████████████████████████████████▊   | 86/93 [03:34<00:15,  2.26s/it]

Cervical_SRC


 94%|████████████████████████████████████████▏  | 87/93 [03:34<00:09,  1.65s/it]

Cervical_death


 95%|████████████████████████████████████████▋  | 88/93 [03:34<00:06,  1.27s/it]

Sarcoma


 96%|█████████████████████████████████████████▏ | 89/93 [03:39<00:08,  2.21s/it]

Sarcoma_SRC


 97%|█████████████████████████████████████████▌ | 90/93 [03:39<00:04,  1.63s/it]

Sarcoma_death


 98%|██████████████████████████████████████████ | 91/93 [03:40<00:02,  1.29s/it]

Testicular


 99%|██████████████████████████████████████████▌| 92/93 [03:44<00:02,  2.24s/it]

Testicular_SRC


100%|███████████████████████████████████████████| 93/93 [03:44<00:00,  2.42s/it]

Testicular_death





In [78]:
cancer_var = ['HN', 'MM', 'MPN', 'MDS', 'CMML', 'AML', 'LL', 'N-HN', 
           'Breast', 'Prostate', 'Lung', 'Colorectal', 'Small-intestine', 
           'Melanoma', 'Lymphoma', 'Myeloma', 'Kidney', 'Head-neck',
           'Brain', 'Bladder', 'Pancreas', 'Uterine', 'Oesophageal', 'Ovarian',
           'Stomach', 'Liver', 'Thyroid', 'Biliari', 'Cervical', 'Sarcoma', 'Testicular']
for var in cancer_var:
    yeears = [x for x in cancer_ukb200k.columns if (x.startswith(var)) and (x.endswith('years'))]
    cancer_ukb200k['var_'+var+'_years'] = cancer_ukb200k.loc[:,yeears].min(axis=1)
    cancer_ukb200k['var_'+var+'_var'] = np.where(cancer_ukb200k['var_'+var+'_years'].notnull(), 1, 0)
    cancer_ukb200k['var_'+var+'_post'] = np.where(cancer_ukb200k['var_'+var+'_years'] > 0, 1, 0)
    cancer_ukb200k['var_'+var+'_pre'] = np.where(cancer_ukb200k['var_'+var+'_years'] < 0, 1, 0)

## 4. Save output

In [89]:
tuple(cancer_var)

('HN',
 'MM',
 'MPN',
 'MDS',
 'CMML',
 'AML',
 'LL',
 'N-HN',
 'Breast',
 'Prostate',
 'Lung',
 'Colorectal',
 'Small-intestine',
 'Melanoma',
 'Lymphoma',
 'Myeloma',
 'Kidney',
 'Head-neck',
 'Brain',
 'Bladder',
 'Pancreas',
 'Uterine',
 'Oesophageal',
 'Ovarian',
 'Stomach',
 'Liver',
 'Thyroid',
 'Biliari',
 'Cervical',
 'Sarcoma',
 'Testicular')

In [19]:
df1 = cancer_ukb200k[['eid', '53-0.0', '21022-0.0']+[x for x in cancer_ukb200k.columns if (x.startswith(('var')) or x.startswith(tuple(cancer_var)))]]

In [92]:
### SAVE
df1.to_csv('cancer_ukb200k_Siddharta_670124_simple.txt.gz', sep="\t", index=False, compression='gzip')