In [1]:
import os
import sys
# Project root, relative to this notebook; also used as the prefix of every data path below.
home_dir = "../../"
module_path = os.path.abspath(home_dir)
# Make the project's packages importable, without adding the same entry twice on re-runs.
if not (module_path in sys.path):
    sys.path.append(module_path)

import os
import numpy as np
import pandas as pd
from models.aa_common.data_loader import get_patho_and_likelypatho_SNVs

In [43]:
# Task / dataset name; it is reused at the end of the notebook to name the output .tsv file.
task = "patho_and_likelypatho"
# Load the variants table through the project data loader.
# NOTE(review): the log lines and class counts in this cell's output are presumably printed by the loader itself — confirm.
variants_df = get_patho_and_likelypatho_SNVs(home_dir)

raw data: (6476, 15)
Index(['clinvar_id', 'gene_symbol', 'gene_id', 'snp_id', 'chrom_acc_version',
       'chrom_pos', 'ref_allele', 'alt_allele', 'prot_acc_version', 'prot_pos',
       'wt', 'mut', 'class', 'seq_len', 'seq'],
      dtype='object')

Log: excluding variants corresponding to proteins having seq-len>1022 ...

Log: Loading combined fasta iterator ...
#-protein sequences (seq-len<=1022): 1434
#-of rs-ids mapped to pathogenicity dataset:  331
Likely-pathogenic    4168
Pathogenic           2308
Name: class, dtype: int64
total: 6476


In [44]:
# Load the dbNSFP predictions, drop exact duplicate rows, and rename the columns
# to the names used by the variants table.
preds_path = home_dir + "models/dbnsfp/outputs/dbnsfp_outputs/patho_and_likelypatho_preds.txt"
column_renames = {
    "#chr": "chrom",
    "pos(1-based)": "chrom_pos",
    "ref": "ref_allele",
    "alt": "alt_allele",
    "aaref": "wt",
    "aaalt": "mut",
    "CADD_raw": "CADD_raw_score",
    "phyloP17way_primate": "phyloP17way_primate_score",
    "phastCons17way_primate": "phastCons17way_primate_score",
    "bStatistic": "bStatistic_score",
}
pred_df = (
    pd.read_csv(preds_path, sep="\t")
    .drop_duplicates(keep="first", ignore_index=True)
    .rename(columns=column_renames)
)

# Recode the sex chromosomes as numbers so the whole column can be cast to an integer.
for sex_chrom, chrom_number in (("X", 23), ("Y", 24)):
    pred_df.loc[pred_df["chrom"] == sex_chrom, "chrom"] = chrom_number
# Mandatory cast: "chrom" is later used as a merge key against an integer "chrom" column.
pred_df = pred_df.astype({"chrom": "int64"})

print(pred_df.columns)
print(pred_df.shape)

# computing average scores for each method for each row
def compute_avg(x):
    """Average the ';'-separated scores stored in a single cell.

    The value is converted to a string and split on ';'. Entries equal to "."
    (the missing-score placeholder) are skipped; the remaining entries are
    parsed as floats and averaged.

    Parameters
    ----------
    x : str or number
        A cell value such as '0.45;0.45;.' or an already-numeric score.

    Returns
    -------
    float
        Mean of the valid scores, or NaN when the cell holds no valid score.

    Raises
    ------
    ValueError
        If an entry other than "." cannot be parsed as a float.
    """
    scores = [float(token) for token in str(x).split(";") if token != "."]
    if not scores:
        # np.mean([]) also yields NaN but emits "RuntimeWarning: Mean of empty slice";
        # return the missing value explicitly to keep the output clean.
        return np.nan
    return np.mean(scores)

# Score columns that get reduced to a single number per row.
model_names = [
    'MetaRNN_score', 'MVP_score', 'SIFT_score', 'Polyphen2_HVAR_score', 'CADD_raw_score',
    'REVEL_score', 'integrated_fitCons_score', 'phyloP17way_primate_score',
    'phastCons17way_primate_score', 'bStatistic_score',
]
# A cell can hold multiple scores, e.g. '0.4573521;0.4573521;0.4573521;0.4573521';
# each such cell is replaced by the average of its scores.
for model_name in model_names:
    pred_df[model_name] = pred_df[model_name].apply(compute_avg)

print(f"#-of SNVs found from dbNSFP: {len(pred_df)}")

# For a single chromosomal position, dbNSFP can return multiple outputs for a model,
# differing in the amino-acid mutation. The following line shows an example.
# pred_df[pred_df["chrom_pos"]==79211614]

Index(['chrom', 'chrom_pos', 'ref_allele', 'alt_allele', 'wt', 'mut',
       'SIFT_score', 'Polyphen2_HVAR_score', 'MetaRNN_score', 'REVEL_score',
       'MVP_score', 'CADD_raw_score', 'integrated_fitCons_score',
       'phyloP17way_primate_score', 'phastCons17way_primate_score',
       'bStatistic_score'],
      dtype='object')
(6574, 16)


  return _methods._mean(a, axis=axis, dtype=dtype,


#-of SNVs found from dbNSFP: 6574


In [45]:
# Merge the scores extracted from dbNSFP into the patho_and_likelypatho dataset.
# dbNSFP is keyed by chromosome number, so keep only the integer found between the
# first "_" and the first "." of the chromosome accession version.
variants_df["chrom"] = variants_df["chrom_acc_version"].apply(
    lambda accession: int(accession[accession.index("_") + 1 : accession.index(".")])
)

# Inner join that also matches on wt/mut, so each row corresponds to the specific protein mutation.
merge_keys = ["chrom", "chrom_pos", "ref_allele", "alt_allele", "wt", "mut"]
result_df = (
    variants_df
    .merge(pred_df, how="inner", on=merge_keys)
    .drop_duplicates(keep="first")
)
print(result_df.columns)
print(result_df.shape)
result_df["class"].value_counts()

Index(['clinvar_id', 'gene_symbol', 'gene_id', 'snp_id', 'chrom_acc_version',
       'chrom_pos', 'ref_allele', 'alt_allele', 'prot_acc_version', 'prot_pos',
       'wt', 'mut', 'class', 'seq_len', 'seq', 'chrom', 'SIFT_score',
       'Polyphen2_HVAR_score', 'MetaRNN_score', 'REVEL_score', 'MVP_score',
       'CADD_raw_score', 'integrated_fitCons_score',
       'phyloP17way_primate_score', 'phastCons17way_primate_score',
       'bStatistic_score'],
      dtype='object')
(6467, 26)


Likely-pathogenic    4162
Pathogenic           2305
Name: class, dtype: int64

In [46]:
# Report, for each score column, how many merged rows have no score (count and percentage).
model_names = [
    'MetaRNN_score', 'MVP_score', 'SIFT_score', 'Polyphen2_HVAR_score', 'CADD_raw_score',
    'REVEL_score', 'integrated_fitCons_score', 'phyloP17way_primate_score',
    'phastCons17way_primate_score', 'bStatistic_score',
]
total = result_df.shape[0]
for model_name in model_names:
    missing = int(result_df[model_name].isna().sum())
    print(f"\t{model_name}: ({missing}/{total})*100 = {(missing / total) * 100:.4f}")

	MetaRNN_score: (0/6467)*100 = 0.0000
	MVP_score: (18/6467)*100 = 0.2783
	SIFT_score: (184/6467)*100 = 2.8452
	Polyphen2_HVAR_score: (273/6467)*100 = 4.2214
	CADD_raw_score: (0/6467)*100 = 0.0000
	REVEL_score: (87/6467)*100 = 1.3453
	integrated_fitCons_score: (738/6467)*100 = 11.4118
	phyloP17way_primate_score: (0/6467)*100 = 0.0000
	phastCons17way_primate_score: (0/6467)*100 = 0.0000
	bStatistic_score: (111/6467)*100 = 1.7164


In [50]:
# Write the merged table as a tab-separated file named after the task, without the index column.
output_path = home_dir + f"models/dbnsfp/postprocessed_outputs/{task}.tsv"
result_df.to_csv(output_path, sep="\t", index=False)