In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
home_dir = "../"

import pandas as pd

In [3]:
# Load the post-processed result table for the selected task.
task = "popu_freq"
dbnsfp_results_path = home_dir + f"models/dbnsfp/outputs_postprocessed/{task}.tsv"
snvs_with_dbnsfp_result_df = pd.read_csv(dbnsfp_results_path, sep="\t")

# Quick look at what was loaded: column names, frame shape, rows per class.
print(snvs_with_dbnsfp_result_df.columns)
print(snvs_with_dbnsfp_result_df.shape)
print(snvs_with_dbnsfp_result_df["class"].value_counts())

Index(['snp_id', 'gene_name', 'mane_refseq_prot', 'mane_refseq_nuc',
       'mane_status', 'chrom_acc_version', 'chrom_num', 'source_ref_allele',
       'source_alt_alleles', 'alfa_chrom_pos', 'alfa_ref_allele',
       'alfa_alt_allele', 'alfa_alt_alleles', 'prot_variant',
       'prot_acc_version', '1indexed_prot_mt_pos', 'wt_aa', 'mt_aa',
       'wt_aa_1letter', 'mt_aa_1letter', 'wt_population', 'mt_population',
       'wt_freq', 'mt_freq', 'class', 'MetaRNN_score', 'MVP_score',
       'SIFT_score', 'Polyphen2_HVAR_score', 'CADD_raw', 'REVEL_score',
       'integrated_fitCons_score', 'phyloP17way_primate',
       'phastCons17way_primate', 'bStatistic'],
      dtype='object')
(1027660, 35)
Singleton     578418
Ultra-rare    388752
Rare           36200
Common         24290
Name: class, dtype: int64


In [4]:
# Frame shape, followed by the number of distinct values in each identifier column.
print(snvs_with_dbnsfp_result_df.shape)
for id_col in ("snp_id", "gene_name", "mane_refseq_prot"):
    print(snvs_with_dbnsfp_result_df[id_col].unique().shape[0])

(1027660, 35)
1010254
16577
16608


In [3]:
# Score columns of the supervised methods. A method "has a prediction" for a row
# when the corresponding cell is not missing (NaN/None).
supervised_method_cols = ['MetaRNN_score', 'MVP_score', 'SIFT_score', 'Polyphen2_HVAR_score', 'CADD_raw', 'REVEL_score']


def get_num_of_methods_have_prediction_on_this_row(row):
    """Count the non-missing entries of ``row`` among ``supervised_method_cols``.

    Args:
        row: Any mapping-like object indexable by column name (e.g. a row
            ``pd.Series`` as passed by ``DataFrame.apply(axis=1)``).

    Returns:
        int: Number of columns in ``supervised_method_cols`` for which
        ``pd.isna(row[col])`` is false.
    """
    return sum(1 for col in supervised_method_cols if not pd.isna(row[col]))

# Per-row count of non-missing supervised scores, computed column-wise.
# This yields the same integers as applying
# get_num_of_methods_have_prediction_on_this_row with axis=1, but avoids one
# Python-level function call per row (the frame has ~1M rows).
snvs_with_dbnsfp_result_df["n_methods_having_preds"] = (
    snvs_with_dbnsfp_result_df[supervised_method_cols].notna().sum(axis=1)
)

print(snvs_with_dbnsfp_result_df.shape)
print(snvs_with_dbnsfp_result_df.columns)

(1027660, 36)
Index(['snp_id', 'gene_name', 'mane_refseq_prot', 'mane_refseq_nuc',
       'mane_status', 'chrom_acc_version', 'chrom_num', 'source_ref_allele',
       'source_alt_alleles', 'alfa_chrom_pos', 'alfa_ref_allele',
       'alfa_alt_allele', 'alfa_alt_alleles', 'prot_variant',
       'prot_acc_version', '1indexed_prot_mt_pos', 'wt_aa', 'mt_aa',
       'wt_aa_1letter', 'mt_aa_1letter', 'wt_population', 'mt_population',
       'wt_freq', 'mt_freq', 'class', 'MetaRNN_score', 'MVP_score',
       'SIFT_score', 'Polyphen2_HVAR_score', 'CADD_raw', 'REVEL_score',
       'integrated_fitCons_score', 'phyloP17way_primate',
       'phastCons17way_primate', 'bStatistic', 'n_methods_having_preds'],
      dtype='object')


In [4]:
def print_missing_things(x: pd.DataFrame, model_names=None,
                         classes=("Common", "Rare", "Ultra-rare", "Singleton")):
    """Print a tab-separated table of missing/present score counts per class.

    The header row shows every class followed by the number of distinct
    ``prot_acc_version`` values among its rows, e.g. ``Common(9153)``. Each
    following row is one score column; every cell reads ``missing/not_missing``,
    i.e. how many rows of that class have a null / non-null value in the column.
    Every printed line ends with a trailing tab.

    Args:
        x: Frame with a ``class`` column, a ``prot_acc_version`` column and the
            score columns listed in ``model_names``.
        model_names: Score columns to report. Defaults to the ten score columns
            used throughout this notebook.
        classes: Class labels to report, in column order.

    Returns:
        None. The table is written to stdout.
    """
    if model_names is None:
        model_names = ['MetaRNN_score', 'MVP_score',
                       'SIFT_score', 'Polyphen2_HVAR_score', 'CADD_raw', 'REVEL_score',
                       'integrated_fitCons_score', 'phyloP17way_primate',
                       'phastCons17way_primate', 'bStatistic']

    # One boolean mask per class, built once and reused for every score column.
    class_masks = {cls: x["class"] == cls for cls in classes}

    # Header: empty first cell, then "<class>(<#distinct proteins>)" per class.
    print("\t", end="")
    for cls in classes:
        prots = x.loc[class_masks[cls], "prot_acc_version"].unique().shape[0]
        print(f"{cls}({prots})", end="\t")
    print()

    for model_name in model_names:
        is_missing = x[model_name].isna()
        print(model_name, end="\t")
        for cls in classes:
            in_class = class_masks[cls]
            missing = int((in_class & is_missing).sum())
            not_missing = int((in_class & ~is_missing).sum())
            print(f"{missing}/{not_missing}", end="\t")
        # End the row after the last class, however many classes were requested.
        print()

print_missing_things(snvs_with_dbnsfp_result_df)

	Common(9153)	Rare(12120)	Ultra-rare(16486)	Singleton(16539)	
MetaRNN_score	0/24290	0/36200	0/388752	0/578418	
MVP_score	20057/4233	3640/32560	4917/383835	6247/572171	
SIFT_score	2496/21794	2935/33265	17934/370818	26297/552121	
Polyphen2_HVAR_score	3654/20636	4460/31740	25657/363095	36801/541617	
CADD_raw	0/24290	0/36200	0/388752	0/578418	
REVEL_score	2516/21774	3005/33195	16122/372630	23363/555055	
integrated_fitCons_score	1077/23213	1449/34751	10914/377838	8949/569469	
phyloP17way_primate	26/24264	27/36173	71/388681	100/578318	
phastCons17way_primate	26/24264	27/36173	71/388681	100/578318	
bStatistic	686/23604	928/35272	6653/382099	9297/569121	


In [5]:
# Load the previously sampled dataset and show its columns, shape and class balance.
prev_sample_path = home_dir + "data/datasets_popu_freq/popu_freq_with_dbnsfp_sampled_prev.tsv"
prev_snvs_with_dbnsfp_result_df = pd.read_csv(prev_sample_path, sep="\t")

print(prev_snvs_with_dbnsfp_result_df.columns)
print(prev_snvs_with_dbnsfp_result_df.shape)
print(prev_snvs_with_dbnsfp_result_df["class"].value_counts())

Index(['snp_id', 'chrom_acc_version', 'chrom_pos', 'ref_allele', 'alt_allele',
       'prot_acc_version', 'prot_pos', 'wt', 'mut', 'wt_population',
       'mut_poulation', 'wt_freq', 'mt_freq', 'class', 'chrom', 'SIFT_score',
       'Polyphen2_HVAR_score', 'MetaRNN_score', 'REVEL_score', 'MVP_score',
       'CADD_raw_score', 'integrated_fitCons_score',
       'phyloP17way_primate_score', 'phastCons17way_primate_score',
       'bStatistic_score', 'n_methods_having_preds'],
      dtype='object')
(26409, 26)
Common        6976
Ultra-rare    6957
Singleton     6955
Rare          5521
Name: class, dtype: int64


In [6]:
# Dry run: inner-join the current table with the previous sample on
# (protein accession, protein position, wild-type aa, mutant aa) purely to see
# how many previously sampled variants can be matched.
selected_df = prev_snvs_with_dbnsfp_result_df[['prot_acc_version', 'prot_pos', 'wt', 'mut']]
y = pd.merge(
    snvs_with_dbnsfp_result_df,
    selected_df,
    how="inner",
    left_on=['prot_acc_version', '1indexed_prot_mt_pos', 'wt_aa_1letter', 'mt_aa_1letter'],
    right_on=['prot_acc_version', 'prot_pos', 'wt', 'mut'],
)
y.shape

(26367, 39)

In [7]:
# Left-join the previous sample's key columns onto the current table. Rows that
# match keep non-null 'prot_pos'/'wt'/'mut'; all other rows get NaN there.
#
# The key tuples are de-duplicated first: a key that appears more than once in the
# previous sample would otherwise make the left merge emit the matching variant
# row several times and silently inflate the table.
selected_df = prev_snvs_with_dbnsfp_result_df[['prot_acc_version', 'prot_pos', 'wt', 'mut']].drop_duplicates()
result_with_prevsamples_df = snvs_with_dbnsfp_result_df.merge(right=selected_df, how="left",
                                 left_on=['prot_acc_version', '1indexed_prot_mt_pos', 'wt_aa_1letter', 'mt_aa_1letter'],
                                 right_on=['prot_acc_version', 'prot_pos', 'wt', 'mut'])
# A left merge against unique right-hand keys must preserve the row count.
assert len(result_with_prevsamples_df) == len(snvs_with_dbnsfp_result_df), "left merge changed the row count"
result_with_prevsamples_df.shape

(1027660, 39)

In [8]:
# Collapse the three merged key columns into a single "was sampled before" flag.
def xxx(row):
    """Return True when ``row.prot_pos``, ``row.wt`` and ``row.mut`` are all non-missing.

    Args:
        row: Object exposing ``prot_pos``, ``wt`` and ``mut`` attributes (e.g. a
            row ``pd.Series`` as passed by ``DataFrame.apply(axis=1)``).

    Returns:
        bool: False as soon as one of the three values is NaN/None, else True.
    """
    any_missing = pd.isna(row.prot_pos) or pd.isna(row.wt) or pd.isna(row.mut)
    return not any_missing
    
# A row was previously sampled iff all three right-hand merge keys are present.
# This is the column-wise equivalent of applying xxx() to every row, without one
# Python-level call per row (the frame has ~1M rows).
merged_key_cols = ['prot_pos', 'wt', 'mut']
result_with_prevsamples_df["is_selected_prev"] = (
    result_with_prevsamples_df[merged_key_cols].notna().all(axis=1)
)

# The helper key columns are redundant once the flag exists.
result_with_prevsamples_df = result_with_prevsamples_df.drop(columns=merged_key_cols)
result_with_prevsamples_df["is_selected_prev"].value_counts()

False    1001293
True       26367
Name: is_selected_prev, dtype: int64

In [9]:
# Distinct proteins per class: a=Common, b=Rare, c=Ultra-rare, d=Singleton.
a, b, c, d = (
    set(result_with_prevsamples_df.loc[result_with_prevsamples_df["class"] == cls, "prot_acc_version"].unique())
    for cls in ("Common", "Rare", "Ultra-rare", "Singleton")
)

# Proteins that Common shares with each of the other classes.
cmn_and_rare_prots = a & b
cmn_and_ultrarare_prots = a & c
cmn_and_singleton_prots = a & d
# Common proteins shared with at least one other class / with all other classes.
cmn_prots = cmn_and_rare_prots | cmn_and_ultrarare_prots | cmn_and_singleton_prots
cmn_rare_ultrarare_singleton_prots = a & b & c & d
print(len(cmn_and_rare_prots), len(cmn_and_ultrarare_prots), len(cmn_and_singleton_prots), len(cmn_prots), len(cmn_rare_ultrarare_singleton_prots))

7697 9124 9119 9142 7679


In [10]:
def sample_variants(class_name, prots_to_cover, variants_df=None,
                    max_methods=6, min_methods=2, random_state=None):
    """Pick one variant row per protein for ``class_name``, best-covered rows first.

    Rows of the requested class are visited in tiers of decreasing
    ``n_methods_having_preds`` (``max_methods`` down to ``min_methods``
    inclusive). In each tier, every protein that is still uncovered and has a
    row in that tier contributes exactly one randomly chosen row; rows flagged
    ``is_selected_prev`` are preferred, so earlier samples are reused where
    possible. The loop stops early once every requested protein is covered.
    The number of requested proteins and a progress line per tier are printed.

    Args:
        class_name: Value of the ``class`` column to sample from.
        prots_to_cover: Iterable of ``prot_acc_version`` values to cover.
        variants_df: Frame with ``class``, ``prot_acc_version``,
            ``n_methods_having_preds`` and ``is_selected_prev`` columns.
            Defaults to the notebook-level ``result_with_prevsamples_df``.
        max_methods: Highest ``n_methods_having_preds`` tier to visit.
        min_methods: Lowest ``n_methods_having_preds`` tier to visit.
        random_state: Forwarded to ``groupby(...).sample``; pass an int to make
            the selection reproducible. ``None`` keeps the unseeded behaviour.

    Returns:
        pd.DataFrame: The selected rows (same columns as ``variants_df``), at
        most one per protein; empty if nothing could be selected.
    """
    if variants_df is None:
        # Backward-compatible default: the frame built earlier in the notebook.
        variants_df = result_with_prevsamples_df

    prots_to_cover = set(prots_to_cover)
    print(len(prots_to_cover))

    class_rows = variants_df[variants_df["class"] == class_name]

    def _one_row_per_protein(rows):
        # Sampling an empty groupby is skipped explicitly; some pandas versions
        # are reported to raise on it (TODO confirm against the pinned version).
        if rows.empty:
            return rows
        return rows.groupby("prot_acc_version").sample(1, random_state=random_state)

    picked = []          # non-empty per-tier selections, in selection order
    covered = set()
    remaining = set(prots_to_cover)

    for n_methods in range(max_methods, min_methods - 1, -1):
        tier = class_rows[(class_rows["prot_acc_version"].isin(remaining))
                          & (class_rows["n_methods_having_preds"] == n_methods)]

        # Prefer previously sampled rows, keeping a single one per protein.
        reused = _one_row_per_protein(tier[tier["is_selected_prev"] == True])

        # Proteins of this tier without a reusable row get a freshly sampled one.
        reused_prots = set(reused["prot_acc_version"].unique())
        fresh_prots = set(tier["prot_acc_version"].unique()) - reused_prots
        fresh = _one_row_per_protein(tier[tier["prot_acc_version"].isin(fresh_prots)])

        for part in (reused, fresh):
            if not part.empty:
                picked.append(part)
                covered.update(part["prot_acc_version"].unique())

        remaining = prots_to_cover - covered
        print("total covered: ", len(covered), "remained: ", len(remaining))
        if len(remaining) == 0:
            break

    if not picked:
        # Nothing selected: return an empty frame that keeps the column layout.
        return class_rows.iloc[0:0]
    # Single concat at the end instead of growing the frame tier by tier.
    return pd.concat(picked)

# One sampling pass per class, each restricted to the proteins it shares with
# Common. NOTE: `c` is rebound here from the Ultra-rare protein set of the
# previous cell to the sampled Common rows.
c, r, ur, s = (
    sample_variants(class_name, prots)
    for class_name, prots in (
        ("Common", cmn_prots),
        ("Rare", cmn_and_rare_prots),
        ("Ultra-rare", cmn_and_ultrarare_prots),
        ("Singleton", cmn_and_singleton_prots),
    )
)

# test
# c_set = set(c["prot_acc_version"].unique())
# r_set = set(r["prot_acc_version"].unique())
# r_set - c_set # this should be empty

9142
total covered:  1879 remained:  7263
total covered:  8118 remained:  1024
total covered:  8620 remained:  522
total covered:  8905 remained:  237
total covered:  9142 remained:  0
7697
total covered:  6567 remained:  1130
total covered:  7192 remained:  505
total covered:  7404 remained:  293
total covered:  7565 remained:  132
total covered:  7697 remained:  0
9124
total covered:  8074 remained:  1050
total covered:  8612 remained:  512
total covered:  8825 remained:  299
total covered:  8992 remained:  132
total covered:  9124 remained:  0
9119
total covered:  8079 remained:  1040
total covered:  8613 remained:  506
total covered:  8825 remained:  294
total covered:  8987 remained:  132
total covered:  9119 remained:  0


In [11]:
# Stack the four per-class samples and report score availability on the result.
good_data = pd.concat([c, r, ur, s])
print_missing_things(good_data)

# Distinct proteins represented in each class of the combined sample.
c_set, r_set, ur_set, s_set = (
    set(good_data.loc[good_data["class"] == cls, "prot_acc_version"].unique())
    for cls in ("Common", "Rare", "Ultra-rare", "Singleton")
)

print(len(c_set), len(r_set), len(ur_set), len(s_set))

# Proteins of each non-Common class that are absent from Common.
for non_common_set in (r_set, ur_set, s_set):
    print(non_common_set - c_set)

	Common(9142)	Rare(7697)	Ultra-rare(9124)	Singleton(9119)	
MetaRNN_score	0/9142	0/7697	0/9124	0/9119	
MVP_score	6931/2211	321/7376	191/8933	189/8930	
SIFT_score	639/8503	501/7196	504/8620	500/8619	
Polyphen2_HVAR_score	878/8264	754/6943	782/8342	777/8342	
CADD_raw	0/9142	0/7697	0/9124	0/9119	
REVEL_score	598/8544	484/7213	516/8608	506/8613	
integrated_fitCons_score	345/8797	264/7433	307/8817	295/8824	
phyloP17way_primate	4/9138	4/7693	3/9121	3/9116	
phastCons17way_primate	4/9138	4/7693	3/9121	3/9116	
bStatistic	184/8958	162/7535	179/8945	171/8948	
9142 7697 9124 9119
set()
set()
set()


In [12]:
good_data["is_selected_prev"].value_counts()

True     26029
False     9053
Name: is_selected_prev, dtype: int64

In [13]:
# Write the sampled variants as TSV; the FASTA written below shares the same path stem.
out_filepath = home_dir + "data/datasets_popu_freq/popu_freq_with_dbnsfp_sampled"
good_data.to_csv(f"{out_filepath}.tsv", sep="\t", index=False)

# Create the merged FASTA document for every distinct protein in the sample.
protein_acc_list = list(good_data["prot_acc_version"].unique())
print(len(protein_acc_list))
from utils.ncbi_proteins import create_combined_fasta
create_combined_fasta(protein_acc_list, f"{out_filepath}.fasta", home_dir)

9142
0 NP_000015.2 Already existis
1 NP_000024.2 Already existis
2 NP_000053.2 Already existis
3 NP_000054.2 Already existis
4 NP_000068.1 Already existis
5 NP_000076.2 Already existis
6 NP_000086.2 Already existis
7 NP_000133.1 Already existis
8 NP_000144.2 Already existis
9 NP_000148.2 Already existis
10 NP_000156.1 Already existis
11 NP_000161.2 Already existis
12 NP_000162.2 Already existis
13 NP_000194.2 Already existis
14 NP_000209.2 Already existis
15 NP_000210.2 Already existis
16 NP_000249.1 Already existis
17 NP_000253.1 Already existis
18 NP_000256.4 Already existis
19 NP_000264.2 Already existis
20 NP_000265.1 Already existis
21 NP_000274.3 Already existis
22 NP_000280.1 Already existis
23 NP_000286.3 Already existis
24 NP_000292.1 Already existis
25 NP_000333.1 Already existis
26 NP_000337.1 Already existis
27 NP_000340.2 Already existis
28 NP_000352.1 Already existis
29 NP_000388.2 Already existis
30 NP_000408.1 Already existis
31 NP_000412.4 Already existis
32 NP_000422.

In [2]:
# Reload the sampled-variant TSV from disk.
out_filepath = home_dir + "data/datasets_popu_freq/popu_freq_with_dbnsfp_sampled"
good_data = pd.read_csv(f"{out_filepath}.tsv", sep="\t")

In [4]:
from utils.plots_dicts import popu_freq_class_order
def print_summary(df, classes):
    """Print a tab-separated summary of ``df`` per class, then for the whole frame.

    Each line reports, in order: distinct ``gene_name`` values, distinct
    ``prot_acc_version`` values, the number of rows (labelled
    ``#-protein-variants``) and the number of distinct genomic variants. The last
    two can differ when several rows share the same genomic coordinates.

    Args:
        df: Frame with ``class``, ``gene_name``, ``prot_acc_version``,
            ``prot_variant`` and the four genomic key columns used below.
        classes: Class labels to report, in output order.

    Returns:
        None. The table is written to stdout.
    """
    # Columns identifying a genomic variant (these keywords come from ALFA).
    genomic_key_cols = ["chrom_num", "alfa_chrom_pos", "alfa_ref_allele", "alfa_alt_allele"]

    def _counts(frame):
        # (genes, proteins, rows, distinct genomic variants) for one sub-frame.
        return (
            frame["gene_name"].unique().shape[0],
            frame["prot_acc_version"].unique().shape[0],
            frame["prot_variant"].shape[0],
            frame[genomic_key_cols].drop_duplicates(keep="first").shape[0],
        )

    print("", "#-genes", "#-proteins", "#-protein-variants", "#-unique-genomic-variants", sep="\t")
    for cls in classes:
        print(cls, *_counts(df[df["class"] == cls]), sep="\t")
    print("total", *_counts(df), sep="\t")

print_summary(good_data, popu_freq_class_order)


	#-genes	#-proteins	#-protein-variants	#-unique-genomic-variants
Singleton	9111	9119	9119	9119
Ultra-rare	9116	9124	9124	9123
Rare	7693	7697	7697	7693
Common	9134	9142	9142	9136
total	9134	9142	35082	35071
