# Generate pool of normals for U2AF1

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from io import StringIO
import seaborn as sns
import scipy.stats as stats
# Notebook display settings: show at most 150 columns and 40 rows when a
# DataFrame is rendered.
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 40)
# IPython magic: display floating-point values with 2 decimal places.
%precision 2


import matplotlib as mpl
# Render matplotlib figures at 300 dpi.
mpl.rcParams['figure.dpi']= 300

In [3]:
### Load the table with 'patient', 'age_recruitment', 'age_group' columns
patient_age_ch = pd.read_csv('../patient_450k_age_670124.txt.gz',sep="\t")
print(f'Rows:{len(patient_age_ch)}')
print(f'Columns:{list(patient_age_ch.columns)}')
# This line prints the distinct age_group values (its label also reads "Columns:").
print(f'Columns:{list(set(patient_age_ch.age_group))}')

### Withdrawal list (headerless file; identifiers are in column 0)
withdraw = pd.read_csv('../w69794_2023-04-25.csv',sep="\t", header=None)

### Cohort participant list (headerless file; identifiers are in column 0)
case = pd.read_csv('../ukb450k_cohort_participants.txt',sep="\t", header=None)

### Keep patients present in `case` and absent from `withdraw`, renumber the
### rows, then store the patient identifier as a string
patient_age_ch = (
    patient_age_ch
    .loc[lambda df: df['patient'].isin(case[0]) & ~df['patient'].isin(withdraw[0])]
    .reset_index(drop=True)
    .astype({"patient": str})
)
len(patient_age_ch)

Rows:502391
Columns:['patient', 'age_recruitment', 'age_group']
Columns:['61-65', '38-45', '66-72', '56-60', '51-55', '46-50']


469880

In [4]:
# List the result files (names ending in 'output') in the U2AF1 output directory.
# os.listdir returns entries in arbitrary order, so sort them to get the same
# listing on every run.
files = sorted(
    file
    for file in os.listdir('/mnt/project/analysis/output/U2AF1/')
    if file.endswith('output')
)
files

['U2AF1_wnode1.output', 'U2AF1_wnode2.output']

In [4]:
# Consequence values that are filtered out of the variant tables below.
off_target = ['intron_variant', 'synonymous_variant', '5_prime_UTR_variant', '3_prime_UTR_variant', 'upstream_gene_variant', 'downstream_gene_variant']

U2AF1_OUTPUT_DIR = '/mnt/project/analysis/output/U2AF1/'
U2AF1_OUTPUT_FILES = ['U2AF1_wnode1.output', 'U2AF1_wnode2.output']


def load_on_target_variants(path, excluded_consequences):
    """Read one tab-separated output file and drop excluded consequences.

    Prints the number of rows before and after the filter, in that order.

    Parameters
    ----------
    path : str
        Path of the tab-separated file; it must contain a 'Consequence' column.
    excluded_consequences : list of str
        Rows whose 'Consequence' is one of these values are removed.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, with their original index labels.
    """
    variants = pd.read_csv(path, sep="\t")
    print(len(variants))
    kept = variants[~variants['Consequence'].isin(excluded_consequences)]
    print(len(kept))
    return kept


# Apply the same load-and-filter step to every node file and stack the results
# under a fresh 0..n-1 index.
Mutect2_tar_var = pd.concat(
    [
        load_on_target_variants(os.path.join(U2AF1_OUTPUT_DIR, file_name), off_target)
        for file_name in U2AF1_OUTPUT_FILES
    ],
    ignore_index=True,
)


In [11]:
# Drop rows whose 'Consequence' cell holds the literal text 'Consequence'
# (presumably header lines repeated inside the data files; verify against the inputs).
Mutect2_tar_var = Mutect2_tar_var.loc[Mutect2_tar_var['Consequence'].ne('Consequence')]

In [5]:
# 'case' is the part of the file's base name (text after the last '/') that
# precedes the first underscore.
Mutect2_tar_var['case'] = [
    file_path.rsplit('/', 1)[-1].split('_', 1)[0]
    for file_path in Mutect2_tar_var['name_file']
]

In [13]:
# Strip the '_x' suffix from the listed columns, cast the key columns to
# strings, then build two string keys per row:
#   rep -> '<#CHROM>:<POS>,<REF>'
#   ID2 -> '<#CHROM>-<POS>-<ALT>'
Mutect2_tar_var = (
    Mutect2_tar_var
    .rename(columns={
        '#CHROM_x': '#CHROM',
        'POS_x': 'POS',
        'ID_x': 'ID',
        'REF_x': 'REF',
        'QUAL_x': 'QUAL',
        'FILTER_x': 'FILTER',
    })
    .astype({'POS': str, '#CHROM': str, 'REF': str, 'ALT': str, 'case': str})
    .assign(
        rep=lambda df: df['#CHROM'] + ':' + df['POS'] + ',' + df['REF'],
        ID2=lambda df: df['#CHROM'] + "-" + df['POS'].astype(str) + "-" + df['ALT'],
    )
)

In [14]:
# Load the gzip-compressed, tab-separated rules matrix and keep the rows with
# boostDM_class == 1.
Rules_matrix = pd.read_csv('/mnt/project/analysis_jupyter_nb/NGS_pipeline/genes12_BoostdmCH_Harvard_simp.tsv.gz', compression='gzip',sep="\t")
# .copy() makes the subset an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
BoostDM_dirvers = Rules_matrix[Rules_matrix['boostDM_class']==1].copy()
BoostDM_dirvers

Unnamed: 0,gene,chr,pos,alt,Prot_pos,aachange,csqn_type_missense,csqn_type_nonsense,csqn_type_splicing,csqn_type_synonymous,role_Act,role_LoF,boostDM_score,boostDM_class,Niroula,Bick,CNIC
2,ASXL1,20,32358779,T,2,K2*,0,1,0,0,0,1,0.996397,1,0.0,0.0,0.0
20,ASXL1,20,32358785,T,4,K4*,0,1,0,0,0,1,0.997874,1,0.0,0.0,0.0
29,ASXL1,20,32358788,T,5,Q5*,0,1,0,0,0,1,0.993813,1,0.0,0.0,0.0
38,ASXL1,20,32358791,T,6,K6*,0,1,0,0,0,1,0.997998,1,0.0,0.0,0.0
47,ASXL1,20,32358794,T,7,K7*,0,1,0,0,0,1,0.997686,1,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87808,U2AF1,21,43104400,G,16,V16A,1,0,0,0,1,0,0.542636,1,0.0,0.0,0.0
87809,U2AF1,21,43104400,T,16,V16D,1,0,0,0,1,0,0.542636,1,0.0,0.0,0.0
87810,U2AF1,21,43104401,A,16,V16F,1,0,0,0,1,0,0.701533,1,0.0,0.0,0.0
87811,U2AF1,21,43104401,G,16,V16L,1,0,0,0,1,0,0.701533,1,0.0,0.0,0.0


In [15]:
# Build the '<chr>-<pos>-<alt>' key. assign() returns a new frame instead of
# writing a column into a filtered slice, which is what triggered
# SettingWithCopyWarning.
BoostDM_dirvers = BoostDM_dirvers.assign(
    ID2=BoostDM_dirvers['chr'].astype(str) + "-"
    + BoostDM_dirvers['pos'].astype(str) + "-"
    + BoostDM_dirvers['alt'].astype(str)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
# Tab-separated case list; its 'patient' column is used below to select rows
# of Mutect2_tar_var.
test = pd.read_csv(
    '/mnt/project/analysis_jupyter_nb/panel_normals/case_polN.txt',
    sep="\t",
)

In [7]:
# Show the calls whose 'case' appears in the 'patient' column of `test`
# (both sides compared as strings).
Mutect2_tar_var.loc[
    Mutect2_tar_var['case'].astype(str).isin(test['patient'].astype(str))
]

In [8]:
# Calls whose 'case' appears in the 'patient' column of `test` (compared as
# strings). Copy so the new column is written to an independent frame rather
# than to a slice of Mutect2_tar_var.
a = Mutect2_tar_var[Mutect2_tar_var['case'].astype(str).isin(test['patient'].astype(str))].copy()
# 'Occur' = number of rows in `a` sharing the same 'rep' key. The counts are
# computed once per key and mapped back, instead of rescanning the whole list
# for every row.
a['Occur'] = a['rep'].map(a['rep'].value_counts())
# Keep the rows whose 'rep' key occurs more than once; copy so later column
# assignments do not target a slice of `a`.
blacklist = a[a['Occur']>1].copy()

In [27]:
# Flag (1/0) the rows whose ID2 key is present in the BoostDM driver table.
# assign() returns a new frame, avoiding a column write into a filtered slice.
blacklist = blacklist.assign(
    BoostDM=blacklist['ID2'].isin(BoostDM_dirvers['ID2']).astype(int)
)
blacklist['BoostDM'].value_counts()

0    53
Name: BoostDM, dtype: int64

In [28]:
# Number of distinct ID2 keys in the blacklist.
blacklist['ID2'].nunique(dropna=False)

2

In [29]:
# SYMBOL counts among the blacklist rows whose Consequence is 'missense_variant'.
blacklist.loc[blacklist['Consequence'].eq('missense_variant'), 'SYMBOL'].value_counts()

U2AF1    53
Name: SYMBOL, dtype: int64

In [30]:
# Row counts per Consequence value in the blacklist.
blacklist['Consequence'].value_counts()

missense_variant    53
Name: Consequence, dtype: int64

In [31]:
# Write the blacklist as a gzip-compressed, tab-separated table without the index.
# NOTE(review): the output is gzip-compressed but the file name ends in '.vcf'
# (no '.gz'); readers that infer compression from the extension will not
# decompress it. Confirm what downstream consumers expect before renaming.
blacklist.to_csv(
    "polN_U2AF1_v2.vcf",
    sep="\t",
    index=False,
    compression='gzip',
)