In [1]:
import os
import sys
# Repository root, expressed relative to the current working directory
# (two levels up). Later cells prefix every data/model path with it.
home_dir = "../../"

# Put the absolute form of that root on sys.path (once) so that packages
# living under it, e.g. `models.aa_common`, can be imported further down.
module_path = os.path.abspath(home_dir)
if module_path not in sys.path:
    sys.path.append(module_path)

import numpy as np
import pandas as pd

In [17]:
# Column renames applied to every merged prediction table
# (used as `DataFrame.rename(columns=renamed_cols)` in the task cells below).
# NOTE(review): 'vespa_score' and 'vespa_pred' both map to 'vespa'; if a frame
# ever carries both, the rename would yield two columns with the same name.
renamed_cols = {
    'SIFT_score': 'sift',
    'Polyphen2_HVAR_score': 'polyphen2_HVAR',
    'MetaRNN_score': 'metarnn',
    'REVEL_score': 'revel',
    'MVP_score': 'mvp',
    'sequnet_score': 'sequnet',
    'CADD_raw_score': 'cadd_raw',
    'integrated_fitCons_score': 'integrated_fitCons',
    'phyloP17way_primate_score': 'phyloP17way_primate',
    'phastCons17way_primate_score': 'phastCons17way_primate',
    'bStatistic_score': 'bStatistic',
    'conservation_score': 'conservation',
    'esm1b_t33_650M_UR50S_score': 'esm1b_t33_650M_UR50S',
    'esm1v_t33_650M_UR90S_score': 'esm1v_t33_650M_UR90S',
    'esm2_t33_650M_UR50D_score': 'esm2_t33_650M_UR50D',
    'prottrans_bert_bfd_score': 'prottrans_bert_bfd',
    'prottrans_t5_xl_u50_score': 'prottrans_t5_xl_u50',
    'vespa_score': 'vespa',
    'proteinbert_score': 'proteinbert',
    'protbert_score': 'protbert',
    'unirep_score': 'unirep',
    'prottrans_albert_bfd_score': 'prottrans_albert_bfd',
    'vespa_pred': 'vespa',
    'vespal_pred': 'vespal',
}

# (model_root, model_name) pairs handed to do_merge with the "_masked" suffix.
masked_llm_cols = [
    ("esm_rives", "esm1b_t33_650M_UR50S"),
    ("esm_rives", "esm1v_t33_650M_UR90S"),
    ("esm_rives", "esm2_t33_650M_UR50D"),
    ("bioembeddings_dallago", "prottrans_bert_bfd"),
    ("bioembeddings_dallago", "prottrans_albert_bfd"),
    ("rostlab_huggingface", "prottrans_t5_xl_u50"),
    ("vespa_marquet", "vespa"),
    ("proteinbert_brandes", "proteinbert"),
    ("sequnet_dunham", "sequnet"),
]

# (model_root, model_name) pairs handed to do_merge with the "_embed" suffix.
embeddings_llm_cols = [
    ("tape_rao_1", "protbert"),
    ("jax_unirep", "unirep"),
]

# "plus_rnn"

In [2]:
# loading conservation scores for PMD sequences
# pmd_conservation_df = pd.read_csv(home_dir+"data/cdd_conservation/cdd_conservationTable_pmdSequences.csv.gz", compression='gzip', comment='#')
# pmd_conservation_df = pmd_conservation_df.drop_duplicates(["NPid", "qPos"], keep="first") #qPos is 1-indexed
# print(pmd_conservation_df.shape)
# print(pmd_conservation_df.columns)

In [3]:
# loading conservation scores for human-prot sequences
# humprots_conservation_df = pd.read_csv(home_dir+"data/cdd_conservation/cdd_conservationTable_18kHumanProts.csv.gzip", compression='gzip', comment='#')
# humprots_conservation_df = humprots_conservation_df.drop_duplicates(["NPid", "qPos"], keep="first")
# print(humprots_conservation_df.shape)
# print(humprots_conservation_df.columns)

In [4]:
# helper function to extract and merge conservation scores.
# def merge_conservation_df(input_df, conservation_df):
#     # merge conservation_df with the input df
#     temp_conservation_df = conservation_df[["NPid", "qPos", "CScore"]].drop_duplicates(keep="first")
#     merged_df = pd.merge(input_df, temp_conservation_df, how="left", left_on=["prot_acc_version", "prot_pos"], right_on=["NPid", "qPos"])
#     columns = list(input_df.columns)
#     columns.append("CScore")
#     merged_df = merged_df[columns]
#     merged_df = merged_df.rename(columns={"CScore": "conservation_score"})
#     return merged_df

In [14]:
# main function that merges all methods prediction scores.
def do_merge(merged_df, model_root_and_name_tuple_list, merge_on_col_list, task, suffix):
    """Left-join each model's prediction scores onto ``merged_df``.

    For every ``(model_root, model_name)`` pair, the tab-separated file
    ``<home_dir>/models/<model_root>/outputs/<model_name>/<task>/preds_<model_name><suffix>.tsv``
    is read, exact duplicate rows are dropped, and the result is left-merged
    onto the running frame on ``merge_on_col_list``. The ``pred`` column
    brought in by the merge is renamed to ``<model_name>_score``.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Frame to extend. It is not modified in place; a new frame is returned.
    model_root_and_name_tuple_list : iterable of (str, str)
        ``(model_root, model_name)`` pairs identifying the prediction files.
    merge_on_col_list : list of str
        Join keys; must exist in ``merged_df`` and in every prediction file.
    task : str
        Sub-directory name under each model's ``outputs/<model_name>/`` folder.
    suffix : str
        Text appended to ``preds_<model_name>`` in the file name
        (the notebook uses ``"_masked"`` and ``"_embed"``).

    Returns
    -------
    pandas.DataFrame
        ``merged_df`` with one additional ``<model_name>_score`` column per model.

    Notes
    -----
    Reads the notebook-level ``home_dir`` variable. One progress line is
    printed per model: root, name, prediction-file shape, merged shape.
    """
    for model_root, model_name in model_root_and_name_tuple_list:
        # os.path.join keeps the path valid whether or not home_dir ends with a separator.
        preds_path = os.path.join(
            home_dir,
            f"models/{model_root}/outputs/{model_name}/{task}/preds_{model_name}{suffix}.tsv",
        )
        models_pred_df = pd.read_csv(preds_path, sep="\t")
        models_pred_df = models_pred_df.drop_duplicates(keep="first")

        n_rows_before = len(merged_df)
        merged_df = pd.merge(left=merged_df, right=models_pred_df, how="left", on=merge_on_col_list)
        merged_df = merged_df.rename(columns={"pred": f"{model_name}_score"})
        print(model_root, model_name, models_pred_df.shape, merged_df.shape)

        # A left merge only changes the row count when the prediction file holds
        # several distinct rows for the same key; surface that instead of letting
        # the duplicated rows pass unnoticed.
        if len(merged_df) != n_rows_before:
            print(f"WARNING: merging {model_name} changed the row count "
                  f"from {n_rows_before} to {len(merged_df)}; "
                  f"the prediction file has repeated merge keys.")
    return merged_df

In [18]:
# Merge the model prediction scores for the "popu_freq" task.
from models.aa_common.data_loader import get_popu_freq_dbnsfp_dataset

task = "popu_freq"
merged_df, _ = get_popu_freq_dbnsfp_dataset(home_dir)

# Every column of the freshly loaded frame is used as a join key.
merge_on_col_list = merged_df.columns.tolist()
merged_df = do_merge(
    merged_df=merged_df,
    model_root_and_name_tuple_list=masked_llm_cols,
    merge_on_col_list=merge_on_col_list,
    task=task,
    suffix="_masked",
)
# Disabled steps, kept for reference:
# merged_df = do_merge(merged_df, embeddings_llm_cols, merge_on_col_list, task, "_embed")
# merged_df = merge_conservation_df(merged_df, humprots_conservation_df)

# NOTE(review): the recorded output of this cell lists a 'CADD_raw' column (no
# '_score' suffix). renamed_cols only has a 'CADD_raw_score' key, so that column
# would keep its name here — confirm this is intended.
merged_df = merged_df.rename(columns=renamed_cols)

print(merged_df.shape, merged_df.columns, sep="\n")
# merged_df.to_csv(home_dir+f"models/aa_common/merged_predictions/{task}.tsv", sep="\t", index=False, header=True)

Index(['snp_id', 'gene_name', 'mane_refseq_prot', 'mane_refseq_nuc',
       'mane_status', 'chrom_acc_version', 'chrom_num', 'source_ref_allele',
       'source_alt_alleles', 'alfa_chrom_pos', 'alfa_ref_allele',
       'alfa_alt_allele', 'alfa_alt_alleles', 'prot_variant',
       'prot_acc_version', '1indexed_prot_mt_pos', 'wt_aa', 'mt_aa',
       'wt_aa_1letter', 'mt_aa_1letter', 'wt_population', 'mt_population',
       'wt_freq', 'mt_freq', 'class', 'MetaRNN_score', 'MVP_score',
       'SIFT_score', 'Polyphen2_HVAR_score', 'CADD_raw', 'REVEL_score',
       'integrated_fitCons_score', 'phyloP17way_primate',
       'phastCons17way_primate', 'bStatistic', 'n_methods_having_preds',
       'is_selected_prev'],
      dtype='object')
(35082, 37)
#-rsids: 35059
#-genes 9134
#-refseq-prots: 9142
#-seqs: 9142
Common        9142
Ultra-rare    9124
Singleton     9119
Rare          7697
Name: class, dtype: int64
esm_rives esm1b_t33_650M_UR50S (35082, 38) (35082, 38)
esm_rives esm1v_t33_650M_UR90S

In [7]:
# Merge the model prediction scores for the "pmd" task and write them to disk.
from models.aa_common.data_loader import get_pmd_dbnsfp_dataset

task = "pmd"
merged_df, _ = get_pmd_dbnsfp_dataset(home_dir)

# Every column of the freshly loaded frame is used as a join key.
merge_on_col_list = merged_df.columns.tolist()
merged_df = do_merge(
    merged_df=merged_df,
    model_root_and_name_tuple_list=masked_llm_cols,
    merge_on_col_list=merge_on_col_list,
    task=task,
    suffix="_masked",
)
merged_df = do_merge(
    merged_df=merged_df,
    model_root_and_name_tuple_list=embeddings_llm_cols,
    merge_on_col_list=merge_on_col_list,
    task=task,
    suffix="_embed",
)
# Disabled step, kept for reference:
# merged_df = merge_conservation_df(merged_df, pmd_conservation_df)
merged_df = merged_df.rename(columns=renamed_cols)

print(merged_df.shape, merged_df.columns, sep="\n")
merged_df.to_csv(
    home_dir+f"models/aa_common/merged_predictions/{task}.tsv",
    sep="\t",
    header=True,
    index=False,
)

Index(['mut_id', 'md5', 'pmd_id', 'nr', 'prot_acc_version', 'snp_id',
       'mut_real', 'wt', 'mut', 'prot_pos', 'chrom', 'chrom_pos', 'ref_allele',
       'alt_allele', 'function', 'source', 'crossref', 'function_summarized',
       'class', 'SIFT_score', 'Polyphen2_HVAR_score', 'MetaRNN_score',
       'REVEL_score', 'MVP_score', 'CADD_raw_score',
       'integrated_fitCons_score', 'phyloP17way_primate_score',
       'phastCons17way_primate_score', 'bStatistic_score'],
      dtype='object')
(7179, 29)
Effect       3818
No-effect    1777
Knock-out    1584
Name: class, dtype: int64
#-unique prots:  2056
esm_rives esm1b_t33_650M_UR50S (7179, 30) (7179, 30)
esm_rives esm1v_t33_650M_UR90S (7179, 30) (7179, 31)
esm_rives esm2_t33_650M_UR50D (7179, 30) (7179, 32)
bioembeddings_dallago prottrans_bert_bfd (7179, 30) (7179, 33)
rostlab_huggingface prottrans_t5_xl_u50 (7179, 30) (7179, 34)
vespa_marquet vespa (7179, 30) (7179, 35)
proteinbert_brandes proteinbert (7179, 30) (7179, 36)
sequnet_du

In [8]:
# Merge the model prediction scores for the "patho" task and write them to disk.
from models.aa_common.data_loader import get_patho_likelypatho_neutral_dbnsfp_dataset

task = "patho"
merged_df, _ = get_patho_likelypatho_neutral_dbnsfp_dataset(home_dir)

# Every column of the freshly loaded frame is used as a join key.
merge_on_col_list = merged_df.columns.tolist()
merged_df = do_merge(
    merged_df=merged_df,
    model_root_and_name_tuple_list=masked_llm_cols,
    merge_on_col_list=merge_on_col_list,
    task=task,
    suffix="_masked",
)
merged_df = do_merge(
    merged_df=merged_df,
    model_root_and_name_tuple_list=embeddings_llm_cols,
    merge_on_col_list=merge_on_col_list,
    task=task,
    suffix="_embed",
)
# Disabled step, kept for reference:
# merged_df = merge_conservation_df(merged_df, humprots_conservation_df)
merged_df = merged_df.rename(columns=renamed_cols)

print(merged_df.shape, merged_df.columns, sep="\n")
merged_df.to_csv(
    home_dir+f"models/aa_common/merged_predictions/{task}.tsv",
    sep="\t",
    header=True,
    index=False,
)
# merged_df["esm1b_t33_650M_UR50S"]

Index(['clinvar_id', 'gene_symbol', 'gene_id', 'snp_id', 'chrom_acc_version',
       'chrom_pos', 'ref_allele', 'alt_allele', 'prot_acc_version', 'prot_pos',
       'wt', 'mut', 'class', 'chrom', 'SIFT_score', 'Polyphen2_HVAR_score',
       'MetaRNN_score', 'REVEL_score', 'MVP_score', 'CADD_raw_score',
       'integrated_fitCons_score', 'phyloP17way_primate_score',
       'phastCons17way_primate_score', 'bStatistic_score', 'wt_population',
       'mut_poulation', 'wt_freq', 'mt_freq'],
      dtype='object')
(9472, 28)
Likely-pathogenic    4162
Pathogenic           2305
Rare                 1983
Common               1022
Name: class, dtype: int64
#-unique prots:  1430
esm_rives esm1b_t33_650M_UR50S (9472, 29) (9472, 29)
esm_rives esm1v_t33_650M_UR90S (9472, 29) (9472, 30)
esm_rives esm2_t33_650M_UR50D (9472, 29) (9472, 31)
bioembeddings_dallago prottrans_bert_bfd (9472, 29) (9472, 32)
rostlab_huggingface prottrans_t5_xl_u50 (9472, 29) (9472, 33)
vespa_marquet vespa (9472, 29) (9472, 34)