In [2]:
import os
import sys
home_dir = "../../"
module_path = os.path.abspath(os.path.join(home_dir))
if module_path not in sys.path:
    sys.path.append(module_path)

import numpy as np
import pandas as pd 
from models.aa_common.data_loader import get_population_freq_SNVs

In [3]:
task = "popu_freq"
variants_df = get_population_freq_SNVs(home_dir)
variants_df = variants_df[variants_df["class"]!="Zero-population"]


Log: Loading data ...
Index(['snp_id', 'chrom_acc_version', 'chrom_pos', 'ref_allele', 'alt_allele',
       'prot_acc_version', 'prot_pos', 'wt', 'mut', 'wt_population',
       'mut_poulation', 'wt_freq', 'mt_freq', 'class'],
      dtype='object')
Singleton     462444
Ultra-rare    314013
Rare           28622
Common         18167
Name: class, dtype: int64
total:  (823246, 14)


In [6]:
# merging dbNSFP extracted scores with popu-freq dataset.
pred_df = pd.read_csv(home_dir+f"models/dbnsfp/dbnsfp_outputs/{task}.txt", sep="\t")
pred_df.drop_duplicates(keep="first", inplace=True, ignore_index=True)
pred_df.rename(columns={"#chr":"chrom", "pos(1-based)":"chrom_pos", "ref":"ref_allele", "alt":"alt_allele", 'aaref':"wt", 'aaalt':"mut", 
                        'CADD_raw': 'CADD_raw_score', 'phyloP17way_primate':'phyloP17way_primate_score', 
                        'phastCons17way_primate':'phastCons17way_primate_score', 'bStatistic':'bStatistic_score'}, inplace=True)

pred_df.loc[pred_df["chrom"]=="X", "chrom"] = 23
pred_df.loc[pred_df["chrom"]=="Y", "chrom"] = 24
pred_df = pred_df.astype({'chrom': 'int64'}) # this line is mandatory

print(pred_df.columns)
print(pred_df.shape)

# computing average scores for each method for each row
def compute_avg(x):
    x = str(x).split(";")
    return np.mean([float(i) for i in x if i!="."]) 

model_names = ['MetaRNN_score', 'MVP_score', 'SIFT_score', 'Polyphen2_HVAR_score', 'CADD_raw_score', 'REVEL_score', 
               'integrated_fitCons_score', 'phyloP17way_primate_score', 'phastCons17way_primate_score', 'bStatistic_score']
for model_name in model_names:
    model_scores = pred_df[model_name].apply(compute_avg) # can have multiple scores, ie '0.4573521;0.4573521;0.4573521;0.4573521'. taking the avg
    pred_df[model_name] = model_scores


print(f"#-of SNVs found from dbNSFP: {pred_df.shape[0]}")

  pred_df = pd.read_csv(home_dir+f"models/dbnsfp/dbnsfp_outputs/{task}.txt", sep="\t")


Index(['chrom', 'chrom_pos', 'ref_allele', 'alt_allele', 'wt', 'mut',
       'SIFT_score', 'Polyphen2_HVAR_score', 'MetaRNN_score', 'REVEL_score',
       'MVP_score', 'CADD_raw_score', 'integrated_fitCons_score',
       'phyloP17way_primate_score', 'phastCons17way_primate_score',
       'bStatistic_score'],
      dtype='object')
(808496, 16)


  return _methods._mean(a, axis=axis, dtype=dtype,


#-of SNVs found from dbNSFP: 808496


In [7]:
# merging dbNSFP extracted scores with popu-freq dataset.
variants_df["chrom"] = variants_df["chrom_acc_version"].apply(lambda x: int(x[x.index("_")+1:x.index(".")])) # taking only chromosom number for dbNSFP inputs

# using inner and wt, mut for merging for specifically protein mutation.
result_df = pd.merge(variants_df, pred_df, how="inner", on=["chrom", "chrom_pos", "ref_allele", "alt_allele", "wt", "mut"])
result_df = result_df.drop_duplicates(keep="first")
print(result_df.columns)
print(result_df.shape)
result_df["class"].value_counts()

Index(['snp_id', 'chrom_acc_version', 'chrom_pos', 'ref_allele', 'alt_allele',
       'prot_acc_version', 'prot_pos', 'wt', 'mut', 'wt_population',
       'mut_poulation', 'wt_freq', 'mt_freq', 'class', 'chrom', 'SIFT_score',
       'Polyphen2_HVAR_score', 'MetaRNN_score', 'REVEL_score', 'MVP_score',
       'CADD_raw_score', 'integrated_fitCons_score',
       'phyloP17way_primate_score', 'phastCons17way_primate_score',
       'bStatistic_score'],
      dtype='object')
(745861, 25)


Singleton     420895
Ultra-rare    285181
Rare           25414
Common         14371
Name: class, dtype: int64

In [9]:
result_df.to_csv(home_dir+f"models/dbnsfp/postprocessed_outputs/{task}.tsv", sep="\t", index=False)

In [23]:
def print_missing_things(x:pd.DataFrame):
    model_names = ['MetaRNN_score', 'MVP_score', 'SIFT_score', 'Polyphen2_HVAR_score', 'CADD_raw_score', 'REVEL_score', 
                'integrated_fitCons_score', 'phyloP17way_primate_score', 'phastCons17way_primate_score', 'bStatistic_score']

    # print("", "Common", "Rare", "Ultra-rare", "Singleton", sep="\t")
    print("\t", end="")
    for i, cls in enumerate(["Common", "Rare", "Ultra-rare", "Singleton"]):
        prots = x[(x["class"]==cls)]["prot_acc_version"].unique().shape[0]
        print(f"{cls}({prots})", end="\t")
    print()
    for model_name in model_names:
        print(model_name, end="\t")
        for i, cls in enumerate(["Common", "Rare", "Ultra-rare", "Singleton"]):
            missing = x[(x["class"]==cls) & pd.isna(x[model_name])].shape[0]
            not_missing = x[(x["class"]==cls) & ~pd.isna(x[model_name])].shape[0]
            total = x[(x["class"]==cls)].shape[0]
            
            print(f"{missing}/{not_missing}", end="\t")
            if i==3: print()

print_missing_things(result_df)

	Common(5476)	Rare(5476)	Ultra-rare(5476)	Singleton(5476)	
MetaRNN_score	0/12260	0/16170	0/138633	0/191571	
MVP_score	9672/2588	1467/14703	1747/136886	2029/189542	
SIFT_score	1225/11035	1397/14773	6972/131661	9644/181927	
Polyphen2_HVAR_score	1838/10422	2216/13954	10675/127958	14655/176916	
CADD_raw_score	0/12260	0/16170	0/138633	0/191571	
REVEL_score	1273/10987	1512/14658	6775/131858	9194/182377	
integrated_fitCons_score	530/11730	631/15539	3123/135510	2429/189142	
phyloP17way_primate_score	7/12253	4/16166	14/138619	18/191553	
phastCons17way_primate_score	7/12253	4/16166	14/138619	18/191553	
bStatistic_score	353/11907	495/15675	2663/135970	3397/188174	
