In [1]:
import os
import sys
# Repository root, expressed relative to this notebook's directory.
home_dir = "../../"

# Register the absolute form of the repository root on sys.path (only once)
# so that modules located under it can be imported from this notebook.
module_path = os.path.abspath(home_dir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import numpy as np
import pandas as pd

from data_loader import get_merged_scores_unidirectional_df
from utils.data_dicts import all_method_names
from utils.performance_metrics import *

In [2]:
# Extend the imported method list with the random baseline.
# The membership guard keeps this cell idempotent: because the assignment is
# self-referential, re-running the cell without the guard would append a
# duplicate "random_classifier" entry on every execution.
if "random_classifier" not in all_method_names:
    all_method_names = all_method_names + ["random_classifier"]
all_method_names

['sift',
 'polyphen2_HVAR',
 'metarnn',
 'revel',
 'mvp',
 'cadd_raw',
 'integrated_fitCons',
 'phyloP17way_primate',
 'phastCons17way_primate',
 'bStatistic',
 'esm1b_t33_650M_UR50S',
 'esm1v_t33_650M_UR90S',
 'esm2_t33_650M_UR50D',
 'prottrans_bert_bfd',
 'prottrans_albert_bfd',
 'plus_rnn',
 'prottrans_t5_xl_u50',
 'vespa',
 'vespal',
 'proteinbert',
 'sequnet',
 'protbert',
 'unirep',
 'conservation',
 'random_classifier']

In [7]:
# PMD task: compute performance metrics for every method on the
# "Effect" vs "No-effect" comparison and write them to a TSV file.
task = "pmd"
result_df = get_merged_scores_unidirectional_df(task, home_dir)

# Positive-class options for this task: Effect, Knock-out
positive_cls = "Effect"
negative_cls = "No-effect"
n_runs = 10
# if n_samples is None, we choose the #-samples as the minority class
n_samples = None
fill_missing_with_median = False

# One metrics entry per method, keyed by method name.
performance_scores_dict = {}
for method_name in all_method_names:
    performance_scores_dict[method_name] = compute_performance_metrics(
        result_df,
        method_name,
        positive_cls,
        negative_cls,
        n_runs,
        n_samples,
        home_dir,
        fill_missing_with_median,
    )

# Output file name encodes the task and the two classes being compared.
output_file = home_dir + f"data/performance_analysis_minority_cls/{task}_{positive_cls}_vs_{negative_cls}.tsv"
write_metrics_outputs(performance_scores_dict, output_file=output_file)

Index(['mut_id', 'md5', 'pmd_id', 'nr', 'prot_acc_version', 'snp_id',
       'mut_real', 'wt_aa_1letter', 'mt_aa_1letter', '1indexed_prot_mt_pos',
       'chrom_num', 'chrom_pos', 'ref_allele', 'alt_allele', 'function',
       'source', 'crossref', 'function_summarized', 'class', 'sift',
       'polyphen2_HVAR', 'metarnn', 'revel', 'mvp', 'cadd_raw',
       'integrated_fitCons', 'phyloP17way_primate', 'phastCons17way_primate',
       'bStatistic', 'esm1b_t33_650M_UR50S', 'esm1v_t33_650M_UR90S',
       'esm2_t33_650M_UR50D', 'prottrans_bert_bfd', 'prottrans_albert_bfd',
       'plus_rnn', 'prottrans_t5_xl_u50', 'vespa', 'vespal', 'proteinbert',
       'sequnet', 'protbert', 'unirep', 'conservation'],
      dtype='object')
(7179, 43)
Effect       3818
No-effect    1777
Knock-out    1584
Name: class, dtype: int64
sift
polyphen2_HVAR
metarnn
revel
mvp
cadd_raw
integrated_fitCons
phyloP17way_primate
phastCons17way_primate
bStatistic
esm1b_t33_650M_UR50S
esm1v_t33_650M_UR90S
esm2_t33_650M_UR

In [10]:
# Pathogenicity task: compute performance metrics for every method on the
# "Likely-pathogenic" vs "Neutral" comparison and write them to a TSV file.
task = "patho"
result_df = get_merged_scores_unidirectional_df(task, home_dir)

# Positive-class options for this task: Pathogenic, Likely-pathogenic
positive_cls = "Likely-pathogenic"
negative_cls = "Neutral"
n_runs = 10
n_samples = None
fill_missing_with_median = False

# Proteins that have at least one positive-class variant
# (selected before any class labels are rewritten below).
is_positive = result_df["class"] == positive_cls
prots = result_df.loc[is_positive, "prot_acc_version"].unique()

# Relabel the "Common" and "Rare" rows as the single negative class.
is_common_or_rare = result_df["class"].isin(["Common", "Rare"])
result_df.loc[is_common_or_rare, "class"] = negative_cls

# Keep only positive/negative variants that belong to the positive-class proteins.
in_positive_prots = result_df["prot_acc_version"].isin(prots)
in_task_classes = result_df["class"].isin([positive_cls, negative_cls])
result_df = result_df[in_positive_prots & in_task_classes]
print(result_df.shape)
print(result_df["class"].value_counts())

# One metrics entry per method, keyed by method name.
performance_scores_dict = {}
for method_name in all_method_names:
    performance_scores_dict[method_name] = compute_performance_metrics(
        result_df,
        method_name,
        positive_cls,
        negative_cls,
        n_runs,
        n_samples,
        home_dir,
        fill_missing_with_median,
    )

# Output file name encodes the task and the two classes being compared.
output_file = home_dir + f"data/performance_analysis_minority_cls/{task}_{positive_cls}_vs_{negative_cls}.tsv"
write_metrics_outputs(performance_scores_dict, output_file=output_file)

Index(['clinvar_id', 'gene_name', 'gene_id', 'snp_id', 'mrna_acc_version',
       'mrna_gi', 'prot_variant', 'prot_acc_version', '1indexed_prot_mt_pos',
       'wt_aa', 'mt_aa', 'wt_aa_1letter', 'mt_aa_1letter', 'chrom_variant',
       'chrom_acc_version', 'chrom_num', 'chrom_pos', 'ref_allele',
       'alt_allele', 'class', 'metarnn', 'mvp', 'sift', 'polyphen2_HVAR',
       'cadd_raw', 'revel', 'integrated_fitCons', 'phyloP17way_primate',
       'phastCons17way_primate', 'bStatistic', 'esm1b_t33_650M_UR50S',
       'esm1v_t33_650M_UR90S', 'esm2_t33_650M_UR50D', 'prottrans_bert_bfd',
       'prottrans_albert_bfd', 'plus_rnn', 'prottrans_t5_xl_u50', 'mut_real',
       'vespa', 'vespal', 'proteinbert', 'sequnet', 'protbert', 'unirep',
       'conservation'],
      dtype='object')
(12263, 45)
Likely-pathogenic    4804
Rare                 3073
Pathogenic           2499
Common               1887
Name: class, dtype: int64
(8233, 45)
Likely-pathogenic    4804
Neutral              3429
Name: 

In [13]:
# Population-frequency task: compute performance metrics for every method on
# the "Singleton" vs "Common" comparison and write them to a TSV file.
task = "popu_freq"
result_df = get_merged_scores_unidirectional_df(task, home_dir)

# Positive-class options for this task: Rare, Ultra-rare, Singleton
positive_cls = "Singleton"
negative_cls = "Common"
n_runs = 1
n_samples = None
fill_missing_with_median = False

# Proteins that have at least one positive-class variant.
is_positive = result_df["class"] == positive_cls
prots = result_df.loc[is_positive, "prot_acc_version"].unique()

# Keep only positive/negative variants that belong to the positive-class proteins.
in_positive_prots = result_df["prot_acc_version"].isin(prots)
in_task_classes = result_df["class"].isin([positive_cls, negative_cls])
result_df = result_df[in_positive_prots & in_task_classes]
print(result_df.shape)
print(result_df["class"].value_counts())

# One metrics entry per method, keyed by method name.
performance_scores_dict = {}
for method_name in all_method_names:
    performance_scores_dict[method_name] = compute_performance_metrics(
        result_df,
        method_name,
        positive_cls,
        negative_cls,
        n_runs,
        n_samples,
        home_dir,
        fill_missing_with_median,
    )

# Output file name encodes the task and the two classes being compared.
output_file = home_dir + f"data/performance_analysis_minority_cls/{task}_{positive_cls}_vs_{negative_cls}.tsv"
write_metrics_outputs(performance_scores_dict, output_file=output_file)

Index(['snp_id', 'gene_name', 'mane_refseq_prot', 'mane_refseq_nuc',
       'mane_status', 'chrom_acc_version', 'chrom_num', 'source_ref_allele',
       'source_alt_alleles', 'alfa_chrom_pos', 'alfa_ref_allele',
       'alfa_alt_allele', 'alfa_alt_alleles', 'prot_variant',
       'prot_acc_version', '1indexed_prot_mt_pos', 'wt_aa', 'mt_aa',
       'wt_aa_1letter', 'mt_aa_1letter', 'wt_population', 'mt_population',
       'wt_freq', 'mt_freq', 'class', 'metarnn', 'mvp', 'sift',
       'polyphen2_HVAR', 'cadd_raw', 'revel', 'integrated_fitCons',
       'phyloP17way_primate', 'phastCons17way_primate', 'bStatistic',
       'n_methods_having_preds', 'is_selected_prev', 'esm1b_t33_650M_UR50S',
       'esm1v_t33_650M_UR90S', 'esm2_t33_650M_UR50D', 'prottrans_bert_bfd',
       'prottrans_albert_bfd', 'plus_rnn', 'prottrans_t5_xl_u50', 'mut_real',
       'vespa', 'vespal', 'proteinbert', 'sequnet', 'protbert', 'unirep',
       'conservation'],
      dtype='object')
(35082, 52)
Common        914