In [5]:
import os
import sys
# Relative location (two directories up from the working directory) that the
# project-local imports below are resolved against.
home_dir = "../../"
module_path = os.path.abspath(home_dir)
# Register the directory once, so re-running this cell does not add duplicates.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import numpy as np
import pandas as pd
from models.aa_common.data_loader import get_pmd_dataset

In [6]:
# Dataset identifier; later cells interpolate it into input and output file names.
task = "pmd"
# Load the PMD variants through the project loader. Per the recorded output below,
# the loader itself logs the column index, the functional_effect counts and the shape.
variants_df = get_pmd_dataset(home_dir)


Log: Loading Protein Mutation Dataset (PMD) ...
Index(['mut_id', 'md5', 'pmd_id', 'nr', 'mut_real', 'chrom', 'chrom_pos',
       'ref_allele', 'alt_allele', 'function', 'source', 'crossref', 'snp_id',
       'CADD_raw', 'MetaRNN_score', 'MVP_score', 'Polyphen2_HVAR_score',
       'REVEL_score', 'SIFT_score', 'integrated_fitCons_score',
       'phyloP17way_primate', 'phastCons17way_primate', 'bStatistic', 'seq',
       'wt', 'mut', 'prot_pos', 'function_summarized', 'functional_effect',
       'pmd_nr_id'],
      dtype='object')
Effect       4633
Knock-out    1981
No-effect    1968
Name: functional_effect, dtype: int64
(8582, 30)


In [None]:
# loading and cleaning patho predictions from dbnsfp
# `#chr` is read as text on purpose: with type inference the column can end up as a
# mix of ints and strings (autosomes vs. "X"/"Y"), which hides duplicate rows from
# drop_duplicates and makes the X/Y replacement below depend on the inferred dtype.
pred_df = (
    pd.read_csv(home_dir+f"models/dbnsfp/dbnsfp_outputs/{task}.txt", sep="\t", dtype={"#chr": str})
    .drop_duplicates(keep="first", ignore_index=True)
    # Align the dbNSFP column names with the variant table (merge keys + "*_score" naming).
    .rename(columns={"#chr":"chrom", "pos(1-based)":"chrom_pos", "ref":"ref_allele", "alt":"alt_allele", 'aaref':"wt", 'aaalt':"mut",
                     'CADD_raw': 'CADD_raw_score', 'phyloP17way_primate':'phyloP17way_primate_score',
                     'phastCons17way_primate':'phastCons17way_primate_score', 'bStatistic':'bStatistic_score'})
)

# Encode the sex chromosomes numerically (X -> 23, Y -> 24). The replacement is done
# string-to-string so it also works when the column uses pandas' dedicated string
# dtype, which rejects integer assignments; a single cast then yields int64.
# The int64 cast is mandatory: `chrom` is a merge key and must match the integer
# chromosome numbers on the variant side.
pred_df["chrom"] = pred_df["chrom"].replace({"X": "23", "Y": "24"}).astype("int64")

print(pred_df.columns)
print(pred_df.shape)


# computing average scores for each method for each row
def compute_avg(x):
    """Average the ';'-separated numeric values packed into a single cell.

    Parameters
    ----------
    x : Any
        Cell content; it is converted with ``str`` and split on ';'.
        A '.' entry (and a blank entry, e.g. from a trailing ';') is treated
        as "no value" and skipped.

    Returns
    -------
    float
        Mean of the numeric entries, or NaN when the cell holds no numeric
        entry at all.

    Raises
    ------
    ValueError
        If an entry is neither a placeholder nor parseable by ``float``.
    """
    tokens = (token.strip() for token in str(x).split(";"))
    values = [float(token) for token in tokens if token not in ("", ".")]
    if not values:
        # np.mean([]) would also give NaN, but only after emitting RuntimeWarnings
        # for every all-missing cell; return the missing marker explicitly instead.
        return np.nan
    return np.mean(values)

# Score columns (after the renaming done when pred_df was loaded) to be reduced to one number per row.
model_names = [
    'MetaRNN_score', 'MVP_score', 'SIFT_score', 'Polyphen2_HVAR_score', 'CADD_raw_score',
    'REVEL_score', 'integrated_fitCons_score', 'phyloP17way_primate_score',
    'phastCons17way_primate_score', 'bStatistic_score',
]

# A cell can hold multiple scores, ie '0.4573521;0.4573521;0.4573521;0.4573521';
# every cell is replaced by the average of its entries.
for score_column in model_names:
    pred_df[score_column] = pred_df[score_column].map(compute_avg)


print(f"#-of SNVs found from dbNSFP: {pred_df.shape[0]}")

In [None]:
# merging dbNSFP extracted scores with pmd dataset.
merge_keys = ["chrom", "chrom_pos", "ref_allele", "alt_allele", "wt", "mut"]

if "chrom_acc_version" in variants_df.columns:
    # taking only the chromosome number for dbNSFP inputs: the integer between the
    # first "_" and the first "." of the accession (a value shaped like "NC_000001.11" gives 1).
    variants_df["chrom"] = variants_df["chrom_acc_version"].apply(lambda x: int(x[x.index("_")+1:x.index(".")]))
elif "chrom" not in variants_df.columns:
    raise KeyError("variants_df needs a 'chrom_acc_version' or a 'chrom' column to be merged with dbNSFP")
elif not pd.api.types.is_numeric_dtype(variants_df["chrom"]):
    # The column index printed by the loader lists `chrom` but no `chrom_acc_version`.
    # NOTE(review): the dtype of that column is not visible here; textual values are
    # normalised exactly like pred_df (X -> 23, Y -> 24) so the key dtypes match.
    variants_df["chrom"] = variants_df["chrom"].astype(str).replace({"X": "23", "Y": "24"}).astype("int64")

# Score columns already present on the variant side would collide with the freshly
# averaged ones in pred_df and come out of the merge as "<name>_x"/"<name>_y",
# breaking later per-score lookups; keep only the pred_df version.
stale_score_cols = [col for col in model_names if col in variants_df.columns and col in pred_df.columns]

# using inner and wt, mut for merging for specifically protein mutation.
result_df = pd.merge(variants_df.drop(columns=stale_score_cols), pred_df, how="inner", on=merge_keys)
result_df = result_df.drop_duplicates(keep="first")
print(result_df.columns)
print(result_df.shape)

# The PMD frame labels variants in `functional_effect`; `class` is used only when it exists.
label_column = "class" if "class" in result_df.columns else "functional_effect"
result_df[label_column].value_counts()

In [None]:
# Persist the merged variants + dbNSFP scores as a tab-separated file, without the index column.
postprocessed_path = home_dir+f"models/dbnsfp/postprocessed_outputs/{task}.tsv"
result_df.to_csv(postprocessed_path, sep="\t", index=False)

In [None]:
# A second copy of the merged table is saved here so the other methods can be run on it.
# `out_filepath` is an extension-less stem shared by the .tsv and the .fasta written below.
out_filepath = home_dir+f"models/aa_common/datasets_pathogenicity/{task}_dbnsfp_preds"
result_df.to_csv(out_filepath+".tsv", sep="\t", index=False)
# result_df = pd.read_csv(out_filepath+".tsv", sep="\t")

# Creating merged fasta document from the distinct protein accessions in the merged table.
# NOTE(review): `prot_acc_version` is not among the columns printed when the PMD dataset
# was loaded; confirm that the dbNSFP side of the merge provides it.
protein_acc_list = list(result_df["prot_acc_version"].unique())
print(len(protein_acc_list))
# Project-local helper; resolvable because the first cell appended `home_dir` to sys.path.
from utils.ncbi_proteins import create_combined_fasta
create_combined_fasta(protein_acc_list, out_filepath+".fasta", home_dir)

In [None]:
# For every predictor, report how many merged variants have no score (count and percentage).
model_names = [
    'MetaRNN_score', 'MVP_score', 'SIFT_score', 'Polyphen2_HVAR_score', 'CADD_raw_score',
    'REVEL_score', 'integrated_fitCons_score', 'phyloP17way_primate_score',
    'phastCons17way_primate_score', 'bStatistic_score',
]
for model_name in model_names:
    total = result_df.shape[0]
    # Plain int keeps the division (and its error on an empty table) pure-Python.
    missing = int(result_df[model_name].isna().sum())
    print(f"\t{model_name}: ({missing}/{total})*100 = {(missing / total) * 100:.4f}")