In [1]:
import os
import sys
# Relative path that the rest of the notebook uses as the project root; it is
# also handed to the loader / metric helpers in the cells below.
home_dir = "../../"

# Absolute form of that root (resolved against the current working directory).
module_path = os.path.abspath(home_dir)

# Register the root on the import path once, so re-running this cell does not
# keep appending duplicates.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from models.aa_common.performance_metrics import *
from data_loader import get_merged_scores_df

In [2]:
# Identifiers of the methods to benchmark, in the order they are evaluated by
# the task cells below. Every entry except "random_classifier" also appears as
# a score column in the merged data frames printed by those cells.
method_names = [
    "random_classifier",
    "sift",
    "polyphen2_HVAR",
    "metarnn",
    "revel",
    "mvp",
    "sequnet",
    "cadd_raw",
    "integrated_fitCons",
    "phyloP17way_primate",
    "phastCons17way_primate",
    "bStatistic",
    "conservation",
    "esm1b_t33_650M_UR50S",
    "esm1v_t33_650M_UR90S",
    "esm2_t33_650M_UR50D",
    "prottrans_bert_bfd",
    "prottrans_t5_xl_u50",
    "vespa",
    "proteinbert",
    "protbert",
    "unirep",
]

In [5]:
# Benchmark every method in `method_names` on the "pmd" task and hand the
# collected metrics to `write_metrics_outputs`.
task = "pmd"
result_df = get_merged_scores_df(task, home_dir)

# Other candidate positive classes for this task: Effect, Knock-out.
positive_cls = "Knock-out"
negative_cls = "No-effect"
# Passed through unchanged to compute_performance_metrics; presumably the
# number of resampling runs and the per-run sample size -- TODO confirm there.
n_runs = 10
n_samples = 1000

# A plain loop (rather than a comprehension) keeps the partially filled dict
# inspectable in the notebook if one method fails midway.
performance_scores_dict = {}
for method_name in method_names:
    performance_scores_dict[method_name] = compute_performance_metrics(
        result_df, method_name, positive_cls, negative_cls, n_runs, n_samples, home_dir
    )

# os.path.join yields the same path as the previous string concatenation for
# home_dir = "../../", but no longer breaks if home_dir lacks a trailing "/".
output_file = os.path.join(
    home_dir,
    f"models/aa_common/performance_analysis/{task}_{positive_cls}_vs_{negative_cls}.tsv",
)
write_metrics_outputs(performance_scores_dict, output_file=output_file)

Index(['mut_id', 'md5', 'pmd_id', 'nr', 'prot_acc_version', 'snp_id',
       'mut_real', 'wt', 'mut', 'prot_pos', 'chrom', 'chrom_pos', 'ref_allele',
       'alt_allele', 'function', 'source', 'crossref', 'function_summarized',
       'class', 'sift', 'polyphen2_HVAR', 'metarnn', 'revel', 'mvp',
       'cadd_raw', 'integrated_fitCons', 'phyloP17way_primate',
       'phastCons17way_primate', 'bStatistic', 'esm1b_t33_650M_UR50S',
       'esm1v_t33_650M_UR90S', 'esm2_t33_650M_UR50D', 'prottrans_bert_bfd',
       'prottrans_t5_xl_u50', 'vespa', 'proteinbert', 'sequnet', 'protbert',
       'unirep', 'conservation'],
      dtype='object')
(7179, 40)
Effect       3818
No-effect    1777
Knock-out    1584
Name: class, dtype: int64
random_classifier
	AUC-ROC: 0.490
	AUC-PR: 0.497
	Best F1-Score: 0.668 at threshold: -0.994
	Precision score: 0.502 at threshold: -0.994
	Recall score: 0.997 at threshold: -0.994
	Accuracy score: 0.504 at threshold: -0.994
	Balanced accuracy score: 0.504 at threshold:

In [7]:
# Benchmark every method in `method_names` on the "patho" task, restricted to
# proteins that have at least one positive-class variant, and hand the
# collected metrics to `write_metrics_outputs`.
task = "patho"
result_df = get_merged_scores_df(task, home_dir)

# Other candidate positive classes for this task: Pathogenic, Likely-pathogenic.
positive_cls = "Likely-pathogenic"
negative_cls = "Neutral"
# Passed through unchanged to compute_performance_metrics; presumably the
# number of resampling runs and the per-run sample size -- TODO confirm there.
n_runs = 10
n_samples = 1000

# Proteins with at least one positive-class row (computed before relabelling).
prots = result_df.loc[result_df["class"] == positive_cls, "prot_acc_version"].unique()

# Merge the "Common" and "Rare" labels into the single negative class.
result_df.loc[result_df["class"].isin(["Common", "Rare"]), "class"] = negative_cls

# Keep only positive/negative rows that belong to the selected proteins.
keep_rows = result_df["prot_acc_version"].isin(prots) & result_df["class"].isin(
    [positive_cls, negative_cls]
)
result_df = result_df[keep_rows]
print(result_df.shape)
print(result_df["class"].value_counts())

# A plain loop (rather than a comprehension) keeps the partially filled dict
# inspectable in the notebook if one method fails midway.
performance_scores_dict = {}
for method_name in method_names:
    performance_scores_dict[method_name] = compute_performance_metrics(
        result_df, method_name, positive_cls, negative_cls, n_runs, n_samples, home_dir
    )

# os.path.join yields the same path as the previous string concatenation for
# home_dir = "../../", but no longer breaks if home_dir lacks a trailing "/".
output_file = os.path.join(
    home_dir,
    f"models/aa_common/performance_analysis/{task}_{positive_cls}_vs_{negative_cls}.tsv",
)
write_metrics_outputs(performance_scores_dict, output_file=output_file)

Index(['clinvar_id', 'gene_symbol', 'gene_id', 'snp_id', 'chrom_acc_version',
       'chrom_pos', 'ref_allele', 'alt_allele', 'prot_acc_version', 'prot_pos',
       'wt', 'mut', 'class', 'chrom', 'sift', 'polyphen2_HVAR', 'metarnn',
       'revel', 'mvp', 'cadd_raw', 'integrated_fitCons', 'phyloP17way_primate',
       'phastCons17way_primate', 'bStatistic', 'wt_population',
       'mut_poulation', 'wt_freq', 'mt_freq', 'esm1b_t33_650M_UR50S',
       'esm1v_t33_650M_UR90S', 'esm2_t33_650M_UR50D', 'prottrans_bert_bfd',
       'prottrans_t5_xl_u50', 'vespa', 'proteinbert', 'sequnet', 'protbert',
       'unirep', 'conservation'],
      dtype='object')
(9472, 39)
Likely-pathogenic    4162
Pathogenic           2305
Rare                 1983
Common               1022
Name: class, dtype: int64
(6205, 39)
Likely-pathogenic    4162
Neutral              2043
Name: class, dtype: int64
random_classifier
	AUC-ROC: 0.518
	AUC-PR: 0.506
	Best F1-Score: 0.667 at threshold: 0.000
	Computed th from patho

In [10]:
# Benchmark every method in `method_names` on the "popu_freq" task, restricted
# to proteins that have at least one positive-class variant, and hand the
# collected metrics to `write_metrics_outputs`.
task = "popu_freq"
result_df = get_merged_scores_df(task, home_dir)

# Other candidate positive classes for this task: Rare, Ultra-rare, Singleton.
positive_cls = "Singleton"
negative_cls = "Common"
# Passed through unchanged to compute_performance_metrics; n_samples=None
# presumably means "no sub-sampling" -- TODO confirm in that function.
n_runs = 1
n_samples = None

# Proteins with at least one positive-class row.
prots = result_df.loc[result_df["class"] == positive_cls, "prot_acc_version"].unique()

# Keep only positive/negative rows that belong to the selected proteins.
keep_rows = result_df["prot_acc_version"].isin(prots) & result_df["class"].isin(
    [positive_cls, negative_cls]
)
result_df = result_df[keep_rows]
print(result_df.shape)
print(result_df["class"].value_counts())

# A plain loop (rather than a comprehension) keeps the partially filled dict
# inspectable in the notebook if one method fails midway.
performance_scores_dict = {}
for method_name in method_names:
    performance_scores_dict[method_name] = compute_performance_metrics(
        result_df, method_name, positive_cls, negative_cls, n_runs, n_samples, home_dir
    )

# os.path.join yields the same path as the previous string concatenation for
# home_dir = "../../", but no longer breaks if home_dir lacks a trailing "/".
output_file = os.path.join(
    home_dir,
    f"models/aa_common/performance_analysis/{task}_{positive_cls}_vs_{negative_cls}.tsv",
)
write_metrics_outputs(performance_scores_dict, output_file=output_file)

Index(['snp_id', 'chrom_acc_version', 'chrom_pos', 'ref_allele', 'alt_allele',
       'prot_acc_version', 'prot_pos', 'wt', 'mut', 'wt_population',
       'mut_poulation', 'wt_freq', 'mt_freq', 'class', 'chrom', 'sift',
       'polyphen2_HVAR', 'metarnn', 'revel', 'mvp', 'cadd_raw',
       'integrated_fitCons', 'phyloP17way_primate', 'phastCons17way_primate',
       'bStatistic', 'n_methods_having_preds', 'esm1b_t33_650M_UR50S',
       'esm1v_t33_650M_UR90S', 'esm2_t33_650M_UR50D', 'prottrans_bert_bfd',
       'prottrans_t5_xl_u50', 'vespa', 'proteinbert', 'sequnet', 'protbert',
       'unirep', 'conservation'],
      dtype='object')
(26409, 37)
Common        6976
Ultra-rare    6957
Singleton     6955
Rare          5521
Name: class, dtype: int64
(13910, 37)
Common       6955
Singleton    6955
Name: class, dtype: int64
random_classifier
	AUC-ROC: 0.506
	AUC-PR: 0.500
	Best F1-Score: 0.667 at threshold: 0.000
	Precision score: 0.500 at threshold: 0.000
	Recall score: 1.000 at threshold: 