# Generate matrix with CH information per case

In [16]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from io import StringIO
import seaborn as sns
import scipy.stats as stats
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 40)
%precision 2


import matplotlib as mpl
mpl.rcParams['figure.dpi']= 300

In [17]:
### Read a file containing 'patient', 'age_recruitment', 'age_group' columns
patient_age_ch = pd.read_csv('patient_450k_age_670124.txt.gz',sep="\t")
print(f'Rows:{len(patient_age_ch)}')
print(f'Columns:{list(patient_age_ch.columns)}')
print(f'Columns:{list(set(patient_age_ch.age_group))}')

Rows:502391
Columns:['patient', 'age_recruitment', 'age_group']
Columns:['56-60', '51-55', '38-45', '61-65', '46-50', '66-72']


In [15]:
### Get a list of the cases analyzed
withdraw = pd.read_csv('w69794_2023-04-25.csv',sep="\t", header=None)

### Get a list of withdrawals from the UKB webpage
case = pd.read_csv('ukb450k_cohort_participants.txt',sep="\t", header=None)

### Remove cases not analyzed or withdrawals
patient_age_ch = patient_age_ch[patient_age_ch['patient'].isin(case[0])]
patient_age_ch = patient_age_ch[~patient_age_ch['patient'].isin(list(withdraw[0]))].reset_index(drop=True)
len(patient_age_ch)

469880

In [18]:
### Read cancer information matrix
patient_cancer = pd.read_csv('cancer_ukb450k_670124.txt.gz', sep="\t")
ukb200K_ch_cancer = pd.merge(patient_age_ch, patient_cancer,  how='inner', on='patient')
ukb200K_ch_cancer.columns

  patient_cancer = pd.read_csv('../../../Paper_data/UKB_clinic_info/cancer_ukb450k_670124.txt.gz', sep="\t")


Index(['patient', 'age_recruitment_x', 'age_group_x', 'age_recruitment_y',
       'age_group_y', 'eid', 'cancer', 'cancer2+', 'age_1cancer',
       'years_tocancer1', 'cancer1_pre', 'type_1cancer', 'hemato_cancer',
       'hematocancer_icd10_type', 'hematocancer_icd10_age',
       'hematocancer_icd9_type', 'hematocancer_icd9_age', 'hematocancer_type',
       'hematocancer_age', 'hematocancer_icd10_date', 'hematocancer_icd9_date',
       'hematocancer_date', 'yearsto_hematocan', 'hemato_cancer_post',
       'hemato_cancer_pre', 'yearsto_hematocan_dates',
       'hemato_cancer_post_dates', 'hemato_cancer_pre_dates', 'hemato_1cancer',
       'hemato_cancer_class', 'lymphoid_myeloid', 'lymphoid_cancer_post',
       'myeloid_cancer_post', '40005-0.0', '40005-1.0', '40005-2.0',
       '40005-3.0', '40005-4.0', '40005-5.0', '40005-6.0', '40005-7.0',
       '40006-0.0', '40006-1.0', '40006-2.0', '40006-3.0', '40006-4.0',
       '40006-5.0', '40006-6.0', '40006-7.0', '40008-0.0', '40008-1.0',
 

In [19]:
### Upload BoostDM mutations
boostDM_predictions = pd.read_csv('All450k_filtered_boostDM_ALL_age_20230802_v3.vcf', sep="\t", compression='gzip')
boostDM_predictions = boostDM_predictions.rename(columns={'case': 'patient', 'BoostDM':'Drivers_12genes'})
boostDM_predictions = boostDM_predictions[~boostDM_predictions['Prot_pos'].isna()]
boostDM_predictions = boostDM_predictions[~boostDM_predictions['Consequence'].isin(['start_lost', 'stop_lost', 'stop_retained_variant'])]
boostDM_predictions = boostDM_predictions[boostDM_predictions['patient'].isin(patient_age_ch['patient'])]
boostDM_predictions = boostDM_predictions[~boostDM_predictions['ALT'].str.contains(',')]
boostDM_predictions = boostDM_predictions[boostDM_predictions['DP']>=10]
len(boostDM_predictions)

203351

In [37]:
# list of patients with potential CH mutation
patients_potentialCH = boostDM_predictions['patient'].to_list()
print(len(patients_potentialCH))
patients_potentialCH_big = boostDM_predictions[boostDM_predictions['VAF_alt']>=0.10]['patient'].to_list()
print(len(patients_potentialCH_big))
patients_potentialCH_small = boostDM_predictions[boostDM_predictions['VAF_alt']<0.10]['patient'].to_list()
print(len(patients_potentialCH_small))

203351
65849
137502


In [38]:
# list of patients with BoostDM driver
patients_CHdrivers = boostDM_predictions[boostDM_predictions['Drivers_12genes'] == 1]['patient'].to_list()
print(len(patients_CHdrivers))
patients_CHdrivers_big = boostDM_predictions[(boostDM_predictions['Drivers_12genes']==1) & (boostDM_predictions['VAF_alt']>=0.10)]['patient'].to_list()
print(len(patients_CHdrivers_big))
patients_CHdrivers_small = boostDM_predictions[(boostDM_predictions['Drivers_12genes']==1) & (boostDM_predictions['VAF_alt']<0.10)]['patient'].to_list()
print(len(patients_CHdrivers_small))

41805
11491
30314


In [39]:
# list of patients with BoostDM passengers
patients_CH_passengers = boostDM_predictions[boostDM_predictions['Drivers_12genes'] == 0]['patient'].to_list()
print(len(patients_CH_passengers))
patients_CH_passengers_big = boostDM_predictions[(boostDM_predictions['Drivers_12genes']==0) & (boostDM_predictions['VAF_alt']>=0.10)]['patient'].to_list()
print(len(patients_CH_passengers_big))
patients_CH_passengers_small = boostDM_predictions[(boostDM_predictions['Drivers_12genes']==0) & (boostDM_predictions['VAF_alt']<0.10)]['patient'].to_list()
print(len(patients_CH_passengers_small))

161546
54358
107188


In [40]:
patient_age_ch['CH_driver'] = np.where(patient_age_ch['patient'].isin(patients_CHdrivers), 1, 0)
patient_age_ch['CH_driver'].value_counts()

0    431328
1     38552
Name: CH_driver, dtype: int64

In [41]:
# Add to table num of drivers
patient_age_ch['CH_driver_big'] = np.where(patient_age_ch['patient'].isin(patients_CHdrivers_big), 1, 0)
patient_age_ch['CH_driver_big'].value_counts()

0    458884
1     10996
Name: CH_driver_big, dtype: int64

In [42]:
# Add to table num of drivers
patient_age_ch['CH_driver_small'] = np.where(patient_age_ch['patient'].isin(patients_CHdrivers_small), 1, 0)
patient_age_ch['CH_driver_small'].value_counts()

0    441361
1     28519
Name: CH_driver_small, dtype: int64

In [43]:
#Add cases with potential CH mutations
patient_age_ch['CH_potential'] = np.where(patient_age_ch['patient'].isin(patients_potentialCH), 1, 0)
patient_age_ch['CH_potential'].value_counts()

0    320762
1    149118
Name: CH_potential, dtype: int64

In [44]:
#Add cases with potential
patient_age_ch['CH_passengers'] = np.where(patient_age_ch['patient'].isin(patients_CH_passengers), 1, 0)
patient_age_ch['CH_passengers'].value_counts()

0    347227
1    122653
Name: CH_passengers, dtype: int64

In [45]:
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [46]:
# Add num of drivers
patient_age_ch['num_CH_drivers'] = patient_age_ch['patient'].progress_apply(lambda x: patients_CHdrivers.count(x) if x in patients_CHdrivers else 0)
patient_age_ch['num_CH_drivers'].value_counts()

  0%|          | 0/469880 [00:00<?, ?it/s]

0    431328
1     35645
2      2632
3       227
4        35
6         6
5         5
7         2
Name: num_CH_drivers, dtype: int64

In [47]:
# Add column if case have more than 1 driver mutations
patient_age_ch['CH_driver_multiple'] = patient_age_ch['num_CH_drivers'].progress_apply(lambda x: 1 if x > 1 else 0)
patient_age_ch['CH_driver_multiple'].value_counts()

  0%|          | 0/469880 [00:00<?, ?it/s]

0    466973
1      2907
Name: CH_driver_multiple, dtype: int64

In [48]:
# Add num of drivers
patient_age_ch['num_CH_potential'] = patient_age_ch['patient'].progress_apply(lambda x: patients_potentialCH.count(x) if x in patients_potentialCH else 0)
patient_age_ch['num_CH_potential'].value_counts()

  0%|          | 0/469880 [00:00<?, ?it/s]

0     320762
1     114167
2      25978
3       5718
4       1560
5        577
6        303
7        192
8        122
9         93
10        83
11        45
12        42
14        41
13        37
16        23
15        21
19        19
18        17
20        15
17        15
21        14
22         9
26         8
24         6
25         5
23         3
27         2
28         1
34         1
31         1
Name: num_CH_potential, dtype: int64

In [49]:
genes12 = ['ASXL1', 'CHEK2', 'DNMT3A', 'GNAS', 'IDH2', 'MDM4', 'PPM1D', 'SF3B1', 'SRSF2', 'TET2', 'TP53', 'U2AF1']

In [50]:
# Get patients with CH mutation in the gene

for i in genes12:
    
    gene_CH_patient = boostDM_predictions[boostDM_predictions['SYMBOL'] == i]
    
    gene_CH_patient_drivers = set(gene_CH_patient[gene_CH_patient['Drivers_12genes'] == 1]['patient'].to_list())
    patient_age_ch[i+'_driver'] = np.where(patient_age_ch['patient'].isin(gene_CH_patient_drivers), 1, 0)
    
    gene_CH_patient_passengers = set(gene_CH_patient[gene_CH_patient['Drivers_12genes'] == 0]['patient'].to_list())
    patient_age_ch[i+'_passenger'] = np.where(patient_age_ch['patient'].isin(gene_CH_patient_passengers), 1, 0)

    gene_CH_patient_potential = set(gene_CH_patient['patient'].to_list())
    patient_age_ch[i+'_potential'] = np.where(patient_age_ch['patient'].isin(gene_CH_patient_potential), 1, 0)
    
patient_age_ch

# check
for i in genes12:
    print(i, sum(patient_age_ch[i+'_driver']), sum(patient_age_ch[i+'_passenger']), sum(patient_age_ch[i+'_potential']))

ASXL1 2463 29747 31883
CHEK2 2232 8544 10405
DNMT3A 20957 14748 34748
GNAS 254 18879 19121
IDH2 135 7521 7652
MDM4 115 6428 6539
PPM1D 903 12454 13305
SF3B1 723 8357 9058
SRSF2 575 5260 5828
TET2 9598 28271 36809
TP53 2192 6364 8484
U2AF1 465 4910 5363


In [52]:
### Create variables with gene function groups

for x in ['_driver', '_passenger', '_potential']:
    ## 1. Chromatin
    chromatin_genes = patient_age_ch[(patient_age_ch['DNMT3A'+x]==1)|\
                                     (patient_age_ch['TET2'+x]==1)|\
                                     (patient_age_ch['ASXL1'+x]==1)]['patient'].tolist()
    patient_age_ch['chromatin_gene'+x] = np.where(patient_age_ch['patient'].isin(chromatin_genes), 1, 0)

  
    ## 2. DDR
    DDR_genes = patient_age_ch[(patient_age_ch['CHEK2'+x]==1)|\
                               (patient_age_ch['TP53'+x]==1)|\
                               (patient_age_ch['PPM1D'+x]==1)|\
                               (patient_age_ch['MDM4'+x]==1)]['patient'].tolist()
    patient_age_ch['DDR_gene'+x] = np.where(patient_age_ch['patient'].isin(DDR_genes), 1, 0)

    ## 3. Splicing
    splicing_genes = patient_age_ch[(patient_age_ch['SF3B1'+x]==1)|\
                                     (patient_age_ch['SRSF2'+x]==1)|\
                                     (patient_age_ch['U2AF1'+x]==1)]['patient'].tolist()
    patient_age_ch['splicing_gene'+x] = np.where(patient_age_ch['patient'].isin(splicing_genes), 1, 0)

In [82]:
patient_age_ch.to_csv("patient_450k_age_670124_ALL_MUTATIONS_20230802_v3.txt.gz", sep="\t", index=False, compression='gzip')