In [1]:
import os
import sys
home_dir = "../../"
module_path = os.path.abspath(os.path.join(home_dir))
if module_path not in sys.path:
    sys.path.append(module_path)

import numpy as np
import pandas as pd
from models.aa_common.data_loader import get_pmd_dataset

In [2]:
task = "pmd"
variants_df = get_pmd_dataset(home_dir)
cols = ['mut_id', 'md5', 'pmd_id', 'nr', 'pmd_nr_id', 'snp_id', 'mut_real', 'wt', 'mut', 'prot_pos', 
        'chrom', 'chrom_pos','ref_allele', 'alt_allele', 
        'function', 'source', 'crossref', 'function_summarized', 'functional_effect', 'seq']
variants_df = variants_df[cols]
variants_df.rename(columns={'functional_effect': 'class', 'pmd_nr_id': 'prot_acc_version'}, inplace=True)

print(variants_df.columns)
print(variants_df["class"].value_counts())
print(variants_df.shape)


Log: Loading Protein Mutation Dataset (PMD) ...
Index(['mut_id', 'md5', 'pmd_id', 'nr', 'mut_real', 'chrom', 'chrom_pos',
       'ref_allele', 'alt_allele', 'function', 'source', 'crossref', 'snp_id',
       'CADD_raw', 'MetaRNN_score', 'MVP_score', 'Polyphen2_HVAR_score',
       'REVEL_score', 'SIFT_score', 'integrated_fitCons_score',
       'phyloP17way_primate', 'phastCons17way_primate', 'bStatistic', 'seq',
       'wt', 'mut', 'prot_pos', 'function_summarized', 'functional_effect',
       'pmd_nr_id'],
      dtype='object')
Effect       4633
Knock-out    1981
No-effect    1968
Name: functional_effect, dtype: int64
(8582, 30)
Index(['mut_id', 'md5', 'pmd_id', 'nr', 'prot_acc_version', 'snp_id',
       'mut_real', 'wt', 'mut', 'prot_pos', 'chrom', 'chrom_pos', 'ref_allele',
       'alt_allele', 'function', 'source', 'crossref', 'function_summarized',
       'class', 'seq'],
      dtype='object')
Effect       4633
Knock-out    1981
No-effect    1968
Name: class, dtype: int64
(8582, 2

In [3]:
# loading and cleaning patho predictions from dbnsfp
pred_df = pd.read_csv(home_dir+f"models/dbnsfp/dbnsfp_outputs/{task}.txt", sep="\t")
pred_df.drop_duplicates(keep="first", inplace=True, ignore_index=True)
pred_df.rename(columns={"#chr":"chrom", "pos(1-based)":"chrom_pos", "ref":"ref_allele", "alt":"alt_allele", 'aaref':"wt", 'aaalt':"mut", 
                        'CADD_raw': 'CADD_raw_score', 'phyloP17way_primate':'phyloP17way_primate_score', 
                        'phastCons17way_primate':'phastCons17way_primate_score', 'bStatistic':'bStatistic_score'}, inplace=True)

pred_df.loc[pred_df["chrom"]=="X", "chrom"] = 23
pred_df.loc[pred_df["chrom"]=="Y", "chrom"] = 24
# pred_df = pred_df.astype({'chrom': 'int64'}) # this line is mandatory

print(pred_df.columns)
print(pred_df.shape)


# computing average scores for each method for each row
def compute_avg(x):
    x = str(x).split(";")
    return np.mean([float(i) for i in x if i!="."]) 

model_names = ['MetaRNN_score', 'MVP_score', 'SIFT_score', 'Polyphen2_HVAR_score', 'CADD_raw_score', 'REVEL_score', 
               'integrated_fitCons_score', 'phyloP17way_primate_score', 'phastCons17way_primate_score', 'bStatistic_score']
for model_name in model_names:
    model_scores = pred_df[model_name].apply(compute_avg) # can have multiple scores, ie '0.4573521;0.4573521;0.4573521;0.4573521'. taking the avg
    pred_df[model_name] = model_scores


print(f"#-of SNVs found from dbNSFP: {pred_df.shape[0]}")

Index(['chrom', 'chrom_pos', 'ref_allele', 'alt_allele', 'wt', 'mut',
       'SIFT_score', 'Polyphen2_HVAR_score', 'MetaRNN_score', 'REVEL_score',
       'MVP_score', 'CADD_raw_score', 'integrated_fitCons_score',
       'phyloP17way_primate_score', 'phastCons17way_primate_score',
       'bStatistic_score'],
      dtype='object')
(6524, 16)


  return _methods._mean(a, axis=axis, dtype=dtype,


#-of SNVs found from dbNSFP: 6524


In [4]:
# merging dbNSFP extracted scores with pmd dataset.
# using inner and wt, mut for merging for specifically protein mutation.
result_df = pd.merge(variants_df, pred_df, how="inner", on=["chrom", "chrom_pos", "ref_allele", "alt_allele", "wt", "mut"])
result_df.drop_duplicates(keep="first", inplace=True, ignore_index=True)
print(result_df.columns)
print(result_df.shape)
result_df["class"].value_counts()

Index(['mut_id', 'md5', 'pmd_id', 'nr', 'prot_acc_version', 'snp_id',
       'mut_real', 'wt', 'mut', 'prot_pos', 'chrom', 'chrom_pos', 'ref_allele',
       'alt_allele', 'function', 'source', 'crossref', 'function_summarized',
       'class', 'seq', 'SIFT_score', 'Polyphen2_HVAR_score', 'MetaRNN_score',
       'REVEL_score', 'MVP_score', 'CADD_raw_score',
       'integrated_fitCons_score', 'phyloP17way_primate_score',
       'phastCons17way_primate_score', 'bStatistic_score'],
      dtype='object')
(7179, 30)


Effect       3818
No-effect    1777
Knock-out    1584
Name: class, dtype: int64

In [5]:
result_df.to_csv(home_dir+f"models/dbnsfp/postprocessed_outputs/{task}.tsv", sep="\t", index=False)

In [6]:
# this copy is saving for running the other methods.
out_filepath = home_dir+f"models/aa_common/datasets_pmd_analysis/{task}_dbnsfp"

# Creating merged fasta document ...
sequences_df = result_df[['prot_acc_version', 'crossref', 'seq']].drop_duplicates(keep="first")
print("#-of sequences", sequences_df.shape[0])

out_fasta_filepath = out_filepath+".fasta"
out_fasta_file_handle = open(out_fasta_filepath, "w")

for tuple in sequences_df.itertuples():
    out_fasta_file_handle.write(f">{tuple.prot_acc_version} | {tuple.crossref}\n")
    out_fasta_file_handle.write(f"{tuple.seq}\n")
    # break
out_fasta_file_handle.close() 

result_df.drop(columns=["seq"], inplace=True)
result_df.to_csv(out_filepath+".tsv", sep="\t", index=False)

#-of sequences 2056


In [7]:
model_names = ['MetaRNN_score', 'MVP_score', 'SIFT_score', 'Polyphen2_HVAR_score', 'CADD_raw_score', 'REVEL_score', 
               'integrated_fitCons_score', 'phyloP17way_primate_score', 'phastCons17way_primate_score', 'bStatistic_score']
for model_name in model_names:
    missing, total = result_df[pd.isna(result_df[model_name])].shape[0], result_df.shape[0]
    print(f"\t{model_name}: ({missing}/{total})*100 = {(missing / total) * 100:.4f}")

print(result_df.columns)
print(result_df["class"].value_counts())
print(result_df.shape)
result_df["prot_acc_version"].unique().shape

	MetaRNN_score: (3/7179)*100 = 0.0418
	MVP_score: (129/7179)*100 = 1.7969
	SIFT_score: (169/7179)*100 = 2.3541
	Polyphen2_HVAR_score: (145/7179)*100 = 2.0198
	CADD_raw_score: (3/7179)*100 = 0.0418
	REVEL_score: (40/7179)*100 = 0.5572
	integrated_fitCons_score: (6/7179)*100 = 0.0836
	phyloP17way_primate_score: (0/7179)*100 = 0.0000
	phastCons17way_primate_score: (0/7179)*100 = 0.0000
	bStatistic_score: (42/7179)*100 = 0.5850
Index(['mut_id', 'md5', 'pmd_id', 'nr', 'prot_acc_version', 'snp_id',
       'mut_real', 'wt', 'mut', 'prot_pos', 'chrom', 'chrom_pos', 'ref_allele',
       'alt_allele', 'function', 'source', 'crossref', 'function_summarized',
       'class', 'SIFT_score', 'Polyphen2_HVAR_score', 'MetaRNN_score',
       'REVEL_score', 'MVP_score', 'CADD_raw_score',
       'integrated_fitCons_score', 'phyloP17way_primate_score',
       'phastCons17way_primate_score', 'bStatistic_score'],
      dtype='object')
Effect       3818
No-effect    1777
Knock-out    1584
Name: class, dtype: 

(2056,)