In [12]:
import os
import sys
# Repository root, expressed relative to this notebook's working directory.
home_dir = "../../"

# Put the absolute repository root on sys.path (once) so the project-local
# packages imported below can be resolved from this notebook.
module_path = os.path.abspath(home_dir)
if module_path not in sys.path:
    sys.path.append(module_path)

import numpy as np
import pandas as pd
from models.aa_common.data_loader import get_pmd_dbnsfp_dataset, get_popu_freq_dbnsfp_dataset, get_patho_likelypatho_neutral_dbnsfp_dataset
from plots_helper import pmd_class_order, patho_class_order, popu_freq_class_order

In [22]:
def print_summary(df, classes):
    """Print per-class and overall protein / variant counts, tab-separated.

    One line is printed for every label in ``classes`` with four fields:
    the label, the number of distinct ``prot_acc_version`` values, and the
    row count of the class subset reported twice (once as "protein
    variants", once as "genomic variants"). A final ``total`` line gives the
    same figures for the whole frame.

    The special label ``"Neutral"`` selects every row whose ``class`` is
    neither ``"Pathogenic"`` nor ``"Likely-pathogenic"``.

    Unlike the previous implementation, ``df`` is never modified: the
    "Neutral" grouping is done with a boolean mask instead of overwriting
    the ``class`` column. As a consequence the counts no longer depend on
    where ``"Neutral"`` appears in ``classes``.

    Args:
        df: DataFrame with columns ``class``, ``prot_acc_version``, ``wt``,
            ``mut``, ``prot_pos``, ``chrom``, ``chrom_pos``, ``ref_allele``
            and ``alt_allele``.
        classes: Iterable of class labels, in the order they are printed.

    Returns:
        None. The summary is written to stdout.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    pathogenic_labels = ["Pathogenic", "Likely-pathogenic"]
    protein_variant_cols = ["prot_acc_version", "wt", "mut", "prot_pos"]
    genomic_variant_cols = ["chrom", "chrom_pos", "ref_allele", "alt_allele"]

    for cls in classes:
        if cls == "Neutral":
            # Anything that is not (likely-)pathogenic counts as neutral.
            # Selecting with a mask keeps the caller's frame untouched.
            in_class = ~df["class"].isin(pathogenic_labels)
        else:
            in_class = df["class"] == cls
        cls_df = df[in_class]

        n_prots = len(cls_df["prot_acc_version"].unique())
        # NOTE(review): both figures are plain row counts of the subset, not
        # de-duplicated variant counts -- confirm that rows are already unique
        # per variant upstream.
        n_prot_variants = cls_df[protein_variant_cols].shape[0]
        n_genomic_variants = cls_df[genomic_variant_cols].shape[0]
        print(cls, n_prots, n_prot_variants, n_genomic_variants, sep="\t")

    total_n_prots = len(df["prot_acc_version"].unique())
    total_n_prot_variants = df.shape[0]
    total_n_genomic_variants = df[genomic_variant_cols].shape[0]
    print("total", total_n_prots, total_n_prot_variants, total_n_genomic_variants, sep="\t")

In [23]:
# Load the frame returned by get_pmd_dbnsfp_dataset (its second return value is
# discarded) and print one count line per label in pmd_class_order plus a total.
# NOTE(review): the column index, shape and class value counts at the top of this
# cell's output are not printed by print_summary, so they presumably come from
# the loader itself -- confirm.
df, _ = get_pmd_dbnsfp_dataset(home_dir)
print_summary(df, pmd_class_order)

Index(['mut_id', 'md5', 'pmd_id', 'nr', 'prot_acc_version', 'snp_id',
       'mut_real', 'wt', 'mut', 'prot_pos', 'chrom', 'chrom_pos', 'ref_allele',
       'alt_allele', 'function', 'source', 'crossref', 'function_summarized',
       'class', 'SIFT_score', 'Polyphen2_HVAR_score', 'MetaRNN_score',
       'REVEL_score', 'MVP_score', 'CADD_raw_score',
       'integrated_fitCons_score', 'phyloP17way_primate_score',
       'phastCons17way_primate_score', 'bStatistic_score'],
      dtype='object')
(7179, 29)
Effect       3818
No-effect    1777
Knock-out    1584
Name: class, dtype: int64
#-unique prots:  2056
Knock-out	743	1584	1584
Effect	1416	3818	3818
No-effect	622	1777	1777
total	2056	7179	7179


In [24]:
# Load the frame returned by get_patho_likelypatho_neutral_dbnsfp_dataset (its
# second return value is discarded) and print one count line per label in
# patho_class_order plus a total.
# The loaded frame carries "Rare" and "Common" labels; print_summary reports
# every row that is not (likely-)pathogenic under the single "Neutral" line.
df, _ = get_patho_likelypatho_neutral_dbnsfp_dataset(home_dir)
print_summary(df, patho_class_order)

Index(['clinvar_id', 'gene_symbol', 'gene_id', 'snp_id', 'chrom_acc_version',
       'chrom_pos', 'ref_allele', 'alt_allele', 'prot_acc_version', 'prot_pos',
       'wt', 'mut', 'class', 'chrom', 'SIFT_score', 'Polyphen2_HVAR_score',
       'MetaRNN_score', 'REVEL_score', 'MVP_score', 'CADD_raw_score',
       'integrated_fitCons_score', 'phyloP17way_primate_score',
       'phastCons17way_primate_score', 'bStatistic_score', 'wt_population',
       'mut_poulation', 'wt_freq', 'mt_freq'],
      dtype='object')
(9472, 28)
Likely-pathogenic    4162
Pathogenic           2305
Rare                 1983
Common               1022
Name: class, dtype: int64
#-unique prots:  1430
#-unique genes:  1428
Pathogenic	910	2305	2305
Likely-pathogenic	1073	4162	4162
Neutral	985	3005	3005
total	1430	9472	9472


In [25]:
# Load the frame returned by get_popu_freq_dbnsfp_dataset (its second return
# value is discarded) and print one count line per label in
# popu_freq_class_order plus a total.
# NOTE(review): `df` is rebound here, replacing the frame loaded by the previous
# cell -- re-run that cell if its data is needed again.
df, _ = get_popu_freq_dbnsfp_dataset(home_dir)
print_summary(df, popu_freq_class_order)

Index(['snp_id', 'chrom_acc_version', 'chrom_pos', 'ref_allele', 'alt_allele',
       'prot_acc_version', 'prot_pos', 'wt', 'mut', 'wt_population',
       'mut_poulation', 'wt_freq', 'mt_freq', 'class', 'chrom', 'SIFT_score',
       'Polyphen2_HVAR_score', 'MetaRNN_score', 'REVEL_score', 'MVP_score',
       'CADD_raw_score', 'integrated_fitCons_score',
       'phyloP17way_primate_score', 'phastCons17way_primate_score',
       'bStatistic_score', 'n_methods_having_preds', 'gene_symbol'],
      dtype='object')
(26409, 27)
Common        6976
Ultra-rare    6957
Singleton     6955
Rare          5521
Name: class, dtype: int64
#-unique prots:  6976
#-unique genes:  6971
Singleton	6955	6955	6955
Ultra-rare	6957	6957	6957
Rare	5521	5521	5521
Common	6976	6976	6976
total	6976	26409	26409
