# Generate pull of normals

In [4]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from io import StringIO
import seaborn as sns
import scipy.stats as stats
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 40)
%precision 2

import matplotlib as mpl
mpl.rcParams['figure.dpi']= 300

In [9]:
### Read a file containing 'patient', 'age_recruitment', 'age_group' columns
patient_age_ch = pd.read_csv('../patient_450k_age_670124.txt.gz',sep="\t")
print(f'Rows:{len(patient_age_ch)}')
print(f'Columns:{list(patient_age_ch.columns)}')
print(f'Columns:{list(set(patient_age_ch.age_group))}')

### Get a list of the cases analyzed
withdraw = pd.read_csv('../w69794_2023-04-25.csv',sep="\t", header=None)

### Get a list of withdrawals from the UKB webpage
case = pd.read_csv('../ukb450k_cohort_participants.txt',sep="\t", header=None)

### Remove cases not analyzed or withdrawals
patient_age_ch = patient_age_ch[patient_age_ch['patient'].isin(case[0])]
patient_age_ch = patient_age_ch[~patient_age_ch['patient'].isin(list(withdraw[0]))].reset_index(drop=True)
patient_age_ch = patient_age_ch.astype({"patient": str})
len(patient_age_ch)

Rows:502391
Columns:['patient', 'age_recruitment', 'age_group']
Columns:['56-60', '46-50', '38-45', '66-72', '51-55', '61-65']


469880

In [3]:
# List files in directory
files = os.listdir('/mnt/project/analysis/output/')
files = [file for file in files if file.endswith('merged.filt.gz')]

len(files)
files

['results1.merged.filt.gz',
 'results10.merged.filt.gz',
 'results2.merged.filt.gz',
 'results3.merged.filt.gz',
 'results4.merged.filt.gz',
 'results5.merged.filt.gz',
 'results6.merged.filt.gz',
 'results7.merged.filt.gz',
 'results8.merged.filt.gz',
 'results9.merged.filt.gz']

In [1]:
# Merge files

Mutect2_tar_var = pd.read_csv('/mnt/project/analysis/output/results1.merged.filt.gz',sep="\t", compression='gzip')
print(len(Mutect2_tar_var))

files = ['results10.merged.filt.gz',
 'results2.merged.filt.gz',
 'results3.merged.filt.gz',
 'results4.merged.filt.gz',
 'results5.merged.filt.gz',
 'results6.merged.filt.gz',
 'results7.merged.filt.gz',
 'results8.merged.filt.gz',
 'results9.merged.filt.gz']

for file in files:
    print(file)
    Mutect2_tar_var_df = pd.read_csv('/mnt/project/analysis/output/'+file ,sep="\t", compression='gzip')
    print(len(Mutect2_tar_var_df))
    Mutect2_tar_var = pd.concat([Mutect2_tar_var, Mutect2_tar_var_df], ignore_index=True)

In [5]:
Mutect2_tar_var = Mutect2_tar_var[Mutect2_tar_var['Consequence']!= 'Consequence']

In [11]:
Mutect2_tar_var['case'] = [x.split('/')[-1].split('_')[0] for x in list(Mutect2_tar_var['name_file'])]

In [9]:
Mutect2_tar_var = Mutect2_tar_var.astype({'POS': str, '#CHROM': str, 'REF': str, 'ALT': str, 'case': str}) 
Mutect2_tar_var['rep'] = Mutect2_tar_var['#CHROM']+':'+Mutect2_tar_var['POS']+','+Mutect2_tar_var['REF']  
Mutect2_tar_var['ID2'] = Mutect2_tar_var['#CHROM']+"-"+Mutect2_tar_var['POS'].astype(str) +"-"+Mutect2_tar_var['ALT']

In [12]:
Rules_matrix = pd.read_csv('../../../../Paper_data/Expert_curated_rules/genes12_BoostdmCH_Harvard_simp_run20230803.tsv.gz', compression='gzip',sep="\t")
BoostDM_dirvers = Rules_matrix[Rules_matrix['boostDM_class']==1]
BoostDM_dirvers

Unnamed: 0,gene,chr,pos,alt,Prot_pos,aachange,csqn_type_missense,csqn_type_nonsense,csqn_type_splicing,csqn_type_synonymous,boostDM_score,boostDM_class,Niroula,Bick,CNIC,WHO
2,ASXL1,20,32358779,T,2,K2*,0,1,0,0,0.989503,1,0.0,0.0,0.0,0.0
20,ASXL1,20,32358785,T,4,K4*,0,1,0,0,0.989768,1,0.0,0.0,0.0,0.0
29,ASXL1,20,32358788,T,5,Q5*,0,1,0,0,0.975674,1,0.0,0.0,0.0,0.0
38,ASXL1,20,32358791,T,6,K6*,0,1,0,0,0.991413,1,0.0,0.0,0.0,0.0
47,ASXL1,20,32358794,T,7,K7*,0,1,0,0,0.988842,1,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87811,U2AF1,21,43104401,G,16,V16L,1,0,0,0,0.897579,1,0.0,0.0,0.0,0.0
87812,U2AF1,21,43104401,T,16,V16I,1,0,0,0,0.897579,1,0.0,0.0,0.0,0.0
87816,U2AF1,21,43107451,A,15,K15I,1,0,0,0,0.508676,1,0.0,0.0,0.0,0.0
87817,U2AF1,21,43107451,C,15,K15R,1,0,0,0,0.508676,1,0.0,0.0,0.0,0.0


In [13]:
BoostDM_dirvers['ID2'] = BoostDM_dirvers['chr'].astype(str) +"-"+\
                            BoostDM_dirvers['pos'].astype(str) +"-"+\
                            BoostDM_dirvers['alt'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  BoostDM_dirvers['ID2'] = BoostDM_dirvers['chr'].astype(str) +"-"+\


In [3]:
patient_age_40 = patient_age_ch[patient_age_ch['age_recruitment']<41]
len(patient_age_40)
test = patient_age_40.sample(n=100)
test = test.astype({'patient': str}) 

In [13]:
# Save a list of cases used as normals
test.to_csv("case_polN.txt", sep="\t", index=False)

In [2]:
a = Mutect2_tar_var[Mutect2_tar_var['case'].isin(test['patient'].tolist())]
a['Occur'] = [a['rep'].tolist().count(x) for x in a['rep']]
blacklist = a[a['Occur']>1]

In [16]:
blacklist['BoostDM'] = np.where(blacklist['ID2'].isin(BoostDM_dirvers['ID2'].to_list()), 1, 0)
blacklist['BoostDM'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


0    584
Name: BoostDM, dtype: int64

In [17]:
len(set(blacklist['ID2']))

115

In [19]:
blacklist['SYMBOL'].value_counts()

DNMT3A    117
SF3B1      86
TP53       79
MDM4       63
CHEK2      53
GNAS       51
TET2       49
ASXL1      45
SRSF2      14
IDH2       14
HSCB        9
PPM1D       4
Name: SYMBOL, dtype: int64

In [20]:
blacklist.Consequence.value_counts()

intron_variant                          473
missense_variant                         33
5_prime_UTR_variant                      32
synonymous_variant                       20
3_prime_UTR_variant                      11
upstream_gene_variant                     9
splice_region_variant&intron_variant      6
Name: Consequence, dtype: int64

In [21]:
blacklist.to_csv("polN_v2.vcf", sep="\t", index=False, compression='gzip')