# Merge mutations from the general variant calling and the U2AF1

In [1]:
import pandas as pd
import os
import gzip
import numpy as np
import matplotlib.pyplot as plt
from io import StringIO
import seaborn as sns
import scipy.stats as stats
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 40)
%precision 2

import matplotlib as mpl
mpl.rcParams['figure.dpi']= 300

In [4]:
### Read a file containing 'patient', 'age_recruitment', 'age_group' columns
patient_age_ch = pd.read_csv('patient_450k_age_670124.txt.gz',sep="\t")
print(f'Rows:{len(patient_age_ch)}')
print(f'Columns:{list(patient_age_ch.columns)}')
print(f'Columns:{list(set(patient_age_ch.age_group))}')

Rows:502391
Columns:['patient', 'age_recruitment', 'age_group']
Columns:['56-60', '61-65', '51-55', '38-45', '46-50', '66-72']


In [12]:
### Get a list of the cases analyzed
withdraw = pd.read_csv('w69794_2023-04-25.csv',sep="\t", header=None)

### Get a list of withdrawals from the UKB webpage
case = pd.read_csv('ukb450k_cohort_participants.txt',sep="\t", header=None)

### Remove cases not analyzed or withdrawals
patient_age_ch = patient_age_ch[patient_age_ch['patient'].isin(case[0])]
patient_age_ch = patient_age_ch[~patient_age_ch['patient'].isin(list(withdraw[0]))].reset_index(drop=True)
patient_age_ch = patient_age_ch.astype({"patient": str})
len(patient_age_ch)

469880

# 1. Upload U2AF1 mutaitons

In [13]:
U2AF1_mutations = pd.read_csv('All450k_filtered_boostDM_U2AF1.vcf', sep="\t", compression='gzip')
U2AF1_mutations = U2AF1_mutations.astype({"case": str})
len(U2AF1_mutations)

5620

In [14]:
# Check that there are not mutations from withdrawal cases
print(len(U2AF1_mutations))
U2AF1_mutations = U2AF1_mutations[U2AF1_mutations['case'].isin(patient_age_ch['patient'].to_list())]
print(len(U2AF1_mutations))

5620
5620


In [15]:
# Include agegroup
from tqdm import tqdm
age_grup = []
for x in tqdm(U2AF1_mutations['case'].tolist()):
    age = patient_age_ch[patient_age_ch['patient']==x]['age_group'].values[0]
    age_grup.append(age)

100%|████████████████████████████████████████████████████████████| 5620/5620 [02:43<00:00, 34.28it/s]


In [16]:
U2AF1_mutations['age_group'] = age_grup

# 2. Upload mutations from the general calling (11 genes)

In [17]:
boostDM_old = pd.read_csv('All450k_filtered_boostDM.vcf', sep="\t", compression='gzip')
boostDM_old = boostDM_old.astype({"case": str})
boostDM_old = boostDM_old[~boostDM_old['Consequence'].isin(['start_lost', 'stop_lost', 'stop_retained_variant'])]
boostDM_old = boostDM_old[~boostDM_old['Prot_pos'].isna()]
len(boostDM_old)

197841

In [18]:
boostDM_old = boostDM_old[boostDM_old['case'].isin(patient_age_ch['patient'].to_list())]
len(boostDM_old)

197835

In [16]:
from tqdm import tqdm
age_grup = []
for x in tqdm(boostDM_old['case'].tolist()):
    age = patient_age_ch[patient_age_ch['patient']==x]['age_group'].values[0]
    age_grup.append(age)

100%|██████████| 197835/197835 [1:12:19<00:00, 45.59it/s]


In [18]:
boostDM_old['age_group'] = age_grup

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
boostDM.columns

Index(['name_file', '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
       'VEP', 'AD_alt', 'DP', 'VAF_alt', 'ADF_alt', 'ADR_alt', 'VEP_cano',
       'Consequence', 'IMPACT', 'SYMBOL', 'ENST', 'CDS_pos', 'Prot_pos',
       'AA_change', 'rs_ID', 'ENSP', 'EUR_AF', 'gnomAD_AF', 'gnomAD_AF_NFE',
       'case', 'rep', 'rep2', 'poN', 'Occur', 'ID2', 'BoostDM', 'age_group'],
      dtype='object')

In [17]:
boostDM_new = boostDM[['name_file', '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'AD_alt', 
                       'DP', 'VAF_alt', 'Consequence', 'IMPACT', 'SYMBOL', 'ENST', 'CDS_pos',
                       'Prot_pos', 'AA_change', 'rs_ID', 'ENSP', 'EUR_AF', 'gnomAD_AF',
                       'gnomAD_AF_NFE', 'FILTER', 'case', 'rep', 'rep2', 
                       'Occur', 'poN', 'ID2', 'BoostDM', 'age_group']]

In [18]:
boostDM_old = boostDM_old[['name_file', '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'AD_alt', 
                       'DP', 'VAF_alt', 'Consequence', 'IMPACT', 'SYMBOL', 'ENST', 'CDS_pos',
                       'Prot_pos', 'AA_change', 'rs_ID', 'ENSP', 'EUR_AF', 'gnomAD_AF',
                       'gnomAD_AF_NFE', 'FILTER', 'case', 'rep', 'rep2', 
                       'Occur', 'poN', 'ID2', 'BoostDM', 'age_group']]

In [19]:
boostDM_all = pd.concat([boostDM_old, boostDM_new], axis=0)

In [20]:
boostDM_all = boostDM_all[boostDM_all['DP']>=10]

In [24]:
# Save the merged file
boostDM_all.to_csv("All450k_filtered_boostDM_ALL_age_20230802_v3.vcf", sep="\t", index=False, compression='gzip')