In [1]:
import os
import sys
# Put the parent directory of the current working directory on the import
# path (once), so that modules living one level up can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Relative prefix that the later cells prepend to every data/model path.
home_dir = "../"

import pandas as pd

In [2]:
# Dataset identifier; it is also the file stem of the table loaded below.
task = "pmd"

# Read the tab-separated post-processed table for this task.
postprocessed_tsv_path = home_dir + f"models/dbnsfp/outputs_postprocessed/{task}.tsv"
result_df = pd.read_csv(postprocessed_tsv_path, sep="\t")

# Quick look at the available columns, the table size and the label balance.
print(result_df.columns)
print(result_df.shape)
result_df["class"].value_counts()

Index(['mut_id', 'md5', 'pmd_id', 'nr', 'prot_acc_version', 'snp_id',
       'mut_real', 'wt', 'mut', 'prot_pos', 'chrom', 'chrom_pos', 'ref_allele',
       'alt_allele', 'function', 'source', 'crossref', 'function_summarized',
       'class', 'seq', 'SIFT_score', 'Polyphen2_HVAR_score', 'MetaRNN_score',
       'REVEL_score', 'MVP_score', 'CADD_raw_score',
       'integrated_fitCons_score', 'phyloP17way_primate_score',
       'phastCons17way_primate_score', 'bStatistic_score'],
      dtype='object')
(7179, 30)


Effect       3818
No-effect    1777
Knock-out    1584
Name: class, dtype: int64

In [3]:
# Old column name -> new column name. Only these four columns are renamed;
# every other column keeps its name.
col_renaming_dict = {
    "prot_pos": "1indexed_prot_mt_pos",
    "wt": 'wt_aa_1letter',
    "mut": 'mt_aa_1letter',
    'chrom': "chrom_num",
}

# `rename` returns a new frame, so `result_df` itself is left unchanged.
result_renamed_df = result_df.rename(col_renaming_dict, axis="columns")
print(result_renamed_df.columns)

Index(['mut_id', 'md5', 'pmd_id', 'nr', 'prot_acc_version', 'snp_id',
       'mut_real', 'wt_aa_1letter', 'mt_aa_1letter', '1indexed_prot_mt_pos',
       'chrom_num', 'chrom_pos', 'ref_allele', 'alt_allele', 'function',
       'source', 'crossref', 'function_summarized', 'class', 'seq',
       'SIFT_score', 'Polyphen2_HVAR_score', 'MetaRNN_score', 'REVEL_score',
       'MVP_score', 'CADD_raw_score', 'integrated_fitCons_score',
       'phyloP17way_primate_score', 'phastCons17way_primate_score',
       'bStatistic_score'],
      dtype='object')


In [9]:
# This copy is saved so that the other methods can be run on the same data.
out_filepath = home_dir+f"data/datasets_pmd/{task}_dbnsfp"

# Build the merged FASTA document: one record per unique
# (prot_acc_version, crossref, seq) combination.
# The three columns are read from `result_df` rather than `result_renamed_df`:
# none of them is touched by `col_renaming_dict`, so the values are the same,
# and `result_df` still carries `seq` when this cell is executed a second time
# (the renamed frame loses that column further down).
sequences_df = result_df[['prot_acc_version', 'crossref', 'seq']].drop_duplicates(keep="first")
print("#-of sequences", sequences_df.shape[0])

out_fasta_filepath = out_filepath+".fasta"
# The context manager closes the file even if a write fails part-way through.
with open(out_fasta_filepath, "w") as out_fasta_file_handle:
    # `record` instead of `tuple`: the loop variable must not shadow the builtin,
    # because it stays bound in the notebook namespace after the loop ends.
    for record in sequences_df.itertuples(index=False):
        out_fasta_file_handle.write(f">{record.prot_acc_version} | {record.crossref}\n")
        out_fasta_file_handle.write(f"{record.seq}\n")

# The sequences now live in the FASTA file, so the tabular copy is written
# without them. `errors="ignore"` keeps the cell re-runnable once `seq` has
# already been removed from `result_renamed_df`.
result_renamed_df = result_renamed_df.drop(columns=["seq"], errors="ignore")
result_renamed_df.to_csv(out_filepath+".tsv", sep="\t", index=False)

#-of sequences 2056


In [7]:

def _summary_counts(df):
    """Return (#-proteins, #-protein-variants, #-genomic-variants) for `df`."""
    n_prots = df["prot_acc_version"].unique().shape[0]
    # Every row counts as one protein variant (no de-duplication is applied);
    # selecting the identifier columns makes a missing column fail loudly.
    n_prot_variants = df[['mut_id', 'md5', 'pmd_id', 'nr', 'mut_real']].shape[0]
    # Genomic variants are de-duplicated on chromosome, position and alleles.
    # NOTE(review): the original comment says these column names come from
    # "ALFAs" - confirm the intended naming source.
    n_genomic_variants = (
        df[["chrom_num", "chrom_pos", "ref_allele", "alt_allele"]]
        .drop_duplicates(keep="first")
        .shape[0]
    )
    return n_prots, n_prot_variants, n_genomic_variants


def print_summary(df, classes):
    """Print a tab-separated table of counts per class plus a total row.

    Args:
        df: DataFrame with the columns "class", "prot_acc_version",
            "mut_id", "md5", "pmd_id", "nr", "mut_real", "chrom_num",
            "chrom_pos", "ref_allele" and "alt_allele".
        classes: iterable of values of the "class" column; one row is
            printed per entry, in the given order.

    The header lists exactly the three counts that each row prints. Gene
    counts are not reported (the `gene_name` based count is disabled), so
    the header no longer carries a "#-genes" column that would shift every
    value one column to the left of its heading.

    Returns:
        None. The table is written to standard output.
    """
    print("", "#-proteins", "#-protein-variants", "#-genomic-variants", sep="\t")
    for cls in classes:
        print(cls, *_summary_counts(df[df["class"]==cls]), sep="\t")

    # The total row is computed on the whole frame, not by summing class rows:
    # a protein or genomic variant shared by several classes is counted once.
    print("total", *_summary_counts(df), sep="\t")

# Per-class and overall counts for the renamed table.
print_summary(df=result_renamed_df, classes=["Knock-out", "No-effect", "Effect"])


	#-genes	#-proteins	#-protein-variants	#-genomic-variants
Knock-out	743	1584	1335
No-effect	622	1777	1659
Effect	1416	3818	3070
total	2056	7179	5652


In [12]:
# Score columns inspected by default by `print_missing_things`.
model_names = ['MetaRNN_score', 'MVP_score', 'SIFT_score',
                'Polyphen2_HVAR_score', 'CADD_raw_score', 'REVEL_score',
                'integrated_fitCons_score', 'phyloP17way_primate_score',
                'phastCons17way_primate_score', 'bStatistic_score']
def print_missing_things(x:pd.DataFrame, classes, score_columns=None):
    """Print, per score column and class, how many rows lack a score.

    The output is tab-separated. The header row shows every class followed
    by its number of unique `prot_acc_version` values in parentheses. Each
    following row starts with the score column name and then holds one
    "<missing>/<not missing>" cell per class, where "missing" means the
    value is NaN/None according to `pd.isna`.

    Args:
        x: DataFrame with the columns "class", "prot_acc_version" and every
            column named in `score_columns`.
        classes: sequence of values of the "class" column, in display order.
        score_columns: column names to inspect; defaults to the module-level
            `model_names` list when omitted.

    Returns:
        None. The table is written to standard output.
    """
    if score_columns is None:
        score_columns = model_names

    # One boolean row mask per class, built once and reused for every column.
    class_masks = [x["class"]==cls for cls in classes]

    print("\t", end="")
    for cls, in_class in zip(classes, class_masks):
        prots = x[in_class]["prot_acc_version"].unique().shape[0]
        print(f"{cls}({prots})", end="\t")
    print()

    for model_name in score_columns:
        print(model_name, end="\t")
        is_missing = pd.isna(x[model_name])
        for in_class in class_masks:
            missing = int((in_class & is_missing).sum())
            not_missing = int((in_class & ~is_missing).sum())
            print(f"{missing}/{not_missing}", end="\t")
        # Always terminate the row, including when `classes` is empty.
        print()

# Missing/available score counts per predictor column and class.
print_missing_things(x=result_renamed_df, classes=["Knock-out", "No-effect", "Effect"])

	Knock-out(743)	No-effect(622)	Effect(1416)	
MetaRNN_score	0/1584	0/1777	3/3815	
MVP_score	20/1564	48/1729	61/3757	
SIFT_score	19/1565	29/1748	121/3697	
Polyphen2_HVAR_score	14/1570	49/1728	82/3736	
CADD_raw_score	0/1584	0/1777	3/3815	
REVEL_score	10/1574	16/1761	14/3804	
integrated_fitCons_score	0/1584	3/1774	3/3815	
phyloP17way_primate_score	0/1584	0/1777	0/3818	
phastCons17way_primate_score	0/1584	0/1777	0/3818	
bStatistic_score	11/1573	4/1773	27/3791	
