In [1]:
import os
import sys
# Relative location of the directory that holds the `models` package; it is
# also the prefix of every data path built later in this notebook.
home_dir = "../../"

# Absolute form of that directory, registered on the import search path once
# so that `models.*` modules can be imported from the cells below.
module_path = os.path.abspath(home_dir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import numpy as np
import pandas as pd

In [2]:
# Names of the pre-computed score columns carried by the loaded datasets.
dbnsfp_cols = [
    "SIFT_score",
    "Polyphen2_HVAR_score",
    "MetaRNN_score",
    "REVEL_score",
    "MVP_score",
    "CADD_raw_score",
    "integrated_fitCons_score",
    "phyloP17way_primate_score",
    "phastCons17way_primate_score",
    "bStatistic_score",
]

# (model_root, model_name) pairs whose prediction files are merged with the
# "_masked" suffix. Each pair selects models/<model_root>/outputs/<model_name>/.
masked_llm_cols = [
    ("esm_rives", esm_model_name)
    for esm_model_name in (
        "esm1b_t33_650M_UR50S",
        "esm1v_t33_650M_UR90S",
        "esm2_t33_650M_UR50D",
    )
] + [
    ("bioembeddings_dallago", "prottrans_bert_bfd"),
    ("rostlab_huggingface", "prottrans_t5_xl_u50"),
    ("vespa_marquet", "vespa"),
    ("proteinbert_brandes", "proteinbert"),
    ("sequnet_dunham", "sequnet"),
]

# (model_root, model_name) pairs whose prediction files are merged with the
# "_embed" suffix.
embeddings_llm_cols = [
    ("tape_rao_1", "protbert"),
    ("jax_unirep", "unirep"),
]

# "plus_rnn"

In [3]:
def do_merge(merged_df, model_root_and_name_tuple_list, merge_on_col_list, suffix, task_name=None):
    """Left-join each model's predictions onto ``merged_df`` as ``<model_name>_score``.

    For every ``(model_root, model_name)`` pair, the tab-separated file
    ``<home_dir>models/<model_root>/outputs/<model_name>/<task>/preds_<model_name><suffix>.tsv``
    is read, exact duplicate rows are dropped, its ``pred`` column is renamed to
    ``<model_name>_score`` and the result is left-joined onto the running frame.
    Rows without a matching prediction row keep NaN in the new column.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Frame to extend; it is not modified in place.
    model_root_and_name_tuple_list : iterable of (str, str)
        ``(model_root, model_name)`` pairs identifying the prediction files.
    merge_on_col_list : list of str
        Columns used as join keys; they must exist in both frames.
    suffix : str
        File-name suffix placed before ``.tsv`` (the notebook uses "_masked" and "_embed").
    task_name : str, optional
        Task sub-directory to read from. When omitted, the notebook-level
        global ``task`` is used, which is how the existing cells call this.

    Returns
    -------
    pandas.DataFrame
        ``merged_df`` with one added score column per model.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when a prediction file does not exist.
    """
    if task_name is None:
        # Backward-compatible fallback: callers that pass no task rely on the
        # `task` variable assigned in the calling cell.
        task_name = task

    for model_root, model_name in model_root_and_name_tuple_list:
        preds_path = os.path.join(
            home_dir,
            f"models/{model_root}/outputs/{model_name}/{task_name}/preds_{model_name}{suffix}.tsv",
        )
        models_pred_df = pd.read_csv(preds_path, sep="\t").drop_duplicates(keep="first")

        # Rename before merging: if the left frame already had a `pred` column,
        # a post-merge rename would find only `pred_x`/`pred_y` and do nothing.
        models_pred_df = models_pred_df.rename(columns={"pred": f"{model_name}_score"})

        merged_df = pd.merge(left=merged_df, right=models_pred_df, how="left", on=merge_on_col_list)

        # Shapes are printed so an unexpected change in row count is visible.
        print(model_root, model_name, models_pred_df.shape, merged_df.shape)

    return merged_df

In [8]:
from models.aa_common.data_loader import get_pmd_dbnsfp_dataset

# Only the first element returned by the loader is used in this cell.
merged_df, _ = get_pmd_dbnsfp_dataset(home_dir)

# `task` is read by do_merge when it builds each prediction-file path, and it
# also names the output file written at the end of this cell.
task = "pmd"

# Join keys are fixed to the columns of the freshly loaded frame, so score
# columns added by the first merge are not used as keys by the second.
merge_on_col_list = merged_df.columns.tolist()

merged_df = do_merge(
    merged_df=merged_df,
    model_root_and_name_tuple_list=masked_llm_cols,
    merge_on_col_list=merge_on_col_list,
    suffix="_masked",
)
merged_df = do_merge(
    merged_df=merged_df,
    model_root_and_name_tuple_list=embeddings_llm_cols,
    merge_on_col_list=merge_on_col_list,
    suffix="_embed",
)

print(merged_df.shape, merged_df.columns, sep="\n")
merged_df.to_csv(
    home_dir+f"models/aa_common/merged_predictions/{task}.tsv",
    sep="\t",
    header=True,
    index=False,
)

Index(['mut_id', 'md5', 'pmd_id', 'nr', 'prot_acc_version', 'snp_id',
       'mut_real', 'wt', 'mut', 'prot_pos', 'chrom', 'chrom_pos', 'ref_allele',
       'alt_allele', 'function', 'source', 'crossref', 'function_summarized',
       'class', 'SIFT_score', 'Polyphen2_HVAR_score', 'MetaRNN_score',
       'REVEL_score', 'MVP_score', 'CADD_raw_score',
       'integrated_fitCons_score', 'phyloP17way_primate_score',
       'phastCons17way_primate_score', 'bStatistic_score'],
      dtype='object')
(7179, 29)
Effect       3818
No-effect    1777
Knock-out    1584
Name: class, dtype: int64
#-unique prots:  2056
esm_rives esm1b_t33_650M_UR50S (7179, 30) (7179, 30)
esm_rives esm1v_t33_650M_UR90S (7179, 30) (7179, 31)
esm_rives esm2_t33_650M_UR50D (7179, 30) (7179, 32)
bioembeddings_dallago prottrans_bert_bfd (7179, 30) (7179, 33)
rostlab_huggingface prottrans_t5_xl_u50 (7179, 30) (7179, 34)
vespa_marquet vespa (7179, 30) (7179, 35)
proteinbert_brandes proteinbert (7179, 30) (7179, 36)
sequnet_du

In [9]:
from models.aa_common.data_loader import get_patho_likelypatho_neutral_dbnsfp_dataset

# Only the first element returned by the loader is used in this cell.
merged_df, _ = get_patho_likelypatho_neutral_dbnsfp_dataset(home_dir)

# `task` is read by do_merge when it builds each prediction-file path, and it
# also names the output file written at the end of this cell.
task = "patho"

# Join keys are fixed to the columns of the freshly loaded frame, so score
# columns added by the first merge are not used as keys by the second.
merge_on_col_list = merged_df.columns.tolist()

merged_df = do_merge(
    merged_df=merged_df,
    model_root_and_name_tuple_list=masked_llm_cols,
    merge_on_col_list=merge_on_col_list,
    suffix="_masked",
)
merged_df = do_merge(
    merged_df=merged_df,
    model_root_and_name_tuple_list=embeddings_llm_cols,
    merge_on_col_list=merge_on_col_list,
    suffix="_embed",
)

print(merged_df.shape, merged_df.columns, sep="\n")
merged_df.to_csv(
    home_dir+f"models/aa_common/merged_predictions/{task}.tsv",
    sep="\t",
    header=True,
    index=False,
)

Index(['clinvar_id', 'gene_symbol', 'gene_id', 'snp_id', 'chrom_acc_version',
       'chrom_pos', 'ref_allele', 'alt_allele', 'prot_acc_version', 'prot_pos',
       'wt', 'mut', 'class', 'chrom', 'SIFT_score', 'Polyphen2_HVAR_score',
       'MetaRNN_score', 'REVEL_score', 'MVP_score', 'CADD_raw_score',
       'integrated_fitCons_score', 'phyloP17way_primate_score',
       'phastCons17way_primate_score', 'bStatistic_score', 'wt_population',
       'mut_poulation', 'wt_freq', 'mt_freq'],
      dtype='object')
(9472, 28)
Likely-pathogenic    4162
Pathogenic           2305
Rare                 1983
Common               1022
Name: class, dtype: int64
#-unique prots:  1430
esm_rives esm1b_t33_650M_UR50S (9472, 29) (9472, 29)
esm_rives esm1v_t33_650M_UR90S (9472, 29) (9472, 30)
esm_rives esm2_t33_650M_UR50D (9472, 29) (9472, 31)
bioembeddings_dallago prottrans_bert_bfd (9472, 29) (9472, 32)
rostlab_huggingface prottrans_t5_xl_u50 (9472, 29) (9472, 33)
vespa_marquet vespa (9472, 29) (9472, 34)

In [7]:
from models.aa_common.data_loader import get_popu_freq_dbnsfp_dataset

# Only the first element returned by the loader is used in this cell.
merged_df, _ = get_popu_freq_dbnsfp_dataset(home_dir)

# `task` is read by do_merge when it builds each prediction-file path, and it
# also names the output file written at the end of this cell.
task = "popu_freq"

# Join keys are fixed to the columns of the freshly loaded frame, so score
# columns added by the first merge are not used as keys by the second.
merge_on_col_list = merged_df.columns.tolist()

merged_df = do_merge(
    merged_df=merged_df,
    model_root_and_name_tuple_list=masked_llm_cols,
    merge_on_col_list=merge_on_col_list,
    suffix="_masked",
)
# NOTE(review): the output saved with this notebook ends in a FileNotFoundError
# for '../../models/tape_rao_1/outputs/protbert/popu_freq/preds_protbert_embed.tsv',
# so this call fails (and nothing below runs) until that file exists.
merged_df = do_merge(
    merged_df=merged_df,
    model_root_and_name_tuple_list=embeddings_llm_cols,
    merge_on_col_list=merge_on_col_list,
    suffix="_embed",
)

print(merged_df.shape, merged_df.columns, sep="\n")
merged_df.to_csv(
    home_dir+f"models/aa_common/merged_predictions/{task}.tsv",
    sep="\t",
    header=True,
    index=False,
)

Index(['snp_id', 'chrom_acc_version', 'chrom_pos', 'ref_allele', 'alt_allele',
       'prot_acc_version', 'prot_pos', 'wt', 'mut', 'wt_population',
       'mut_poulation', 'wt_freq', 'mt_freq', 'class', 'chrom', 'SIFT_score',
       'Polyphen2_HVAR_score', 'MetaRNN_score', 'REVEL_score', 'MVP_score',
       'CADD_raw_score', 'integrated_fitCons_score',
       'phyloP17way_primate_score', 'phastCons17way_primate_score',
       'bStatistic_score', 'n_methods_having_preds'],
      dtype='object')
(26409, 26)
Common        6976
Ultra-rare    6957
Singleton     6955
Rare          5521
Name: class, dtype: int64
#-unique prots:  6976
esm_rives esm1b_t33_650M_UR50S (26409, 27) (26409, 27)
esm_rives esm1v_t33_650M_UR90S (26409, 27) (26409, 28)
esm_rives esm2_t33_650M_UR50D (26409, 27) (26409, 29)
bioembeddings_dallago prottrans_bert_bfd (26409, 27) (26409, 30)
rostlab_huggingface prottrans_t5_xl_u50 (26409, 27) (26409, 31)
vespa_marquet vespa (26409, 27) (26409, 32)
proteinbert_brandes proteinb

FileNotFoundError: [Errno 2] No such file or directory: '../../models/tape_rao_1/outputs/protbert/popu_freq/preds_protbert_embed.tsv'