In [1]:
import os
import sys
home_dir = "../"
module_path = os.path.abspath(os.path.join(home_dir))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd

In [2]:
patho_with_dbnsfp_df = pd.read_csv(home_dir+f"models/dbnsfp/outputs_postprocessed/patho_and_likelypatho.tsv", sep="\t")
print(patho_with_dbnsfp_df.columns)
print(patho_with_dbnsfp_df.shape)
print(patho_with_dbnsfp_df["class"].value_counts())

Index(['clinvar_id', 'gene_name', 'gene_id', 'snp_id', 'mrna_acc_version',
       'mrna_gi', 'prot_variant', 'prot_acc_version', '1indexed_prot_mt_pos',
       'wt_aa', 'mt_aa', 'wt_aa_1letter', 'mt_aa_1letter', 'chrom_variant',
       'chrom_acc_version', 'chrom_num', 'chrom_pos', 'ref_allele',
       'alt_allele', 'class', 'MetaRNN_score', 'MVP_score', 'SIFT_score',
       'Polyphen2_HVAR_score', 'CADD_raw', 'REVEL_score',
       'integrated_fitCons_score', 'phyloP17way_primate',
       'phastCons17way_primate', 'bStatistic'],
      dtype='object')
(7303, 30)
Likely-pathogenic    4804
Pathogenic           2499
Name: class, dtype: int64


In [3]:
popu_freq_with_dbnsfp_df = pd.read_csv(home_dir+f"models/dbnsfp/outputs_postprocessed/popu_freq.tsv", sep="\t")
print(popu_freq_with_dbnsfp_df.columns)
print(popu_freq_with_dbnsfp_df.shape)
print(popu_freq_with_dbnsfp_df["class"].value_counts())

Index(['snp_id', 'gene_name', 'mane_refseq_prot', 'mane_refseq_nuc',
       'mane_status', 'chrom_acc_version', 'chrom_num', 'source_ref_allele',
       'source_alt_alleles', 'alfa_chrom_pos', 'alfa_ref_allele',
       'alfa_alt_allele', 'alfa_alt_alleles', 'prot_variant',
       'prot_acc_version', '1indexed_prot_mt_pos', 'wt_aa', 'mt_aa',
       'wt_aa_1letter', 'mt_aa_1letter', 'wt_population', 'mt_population',
       'wt_freq', 'mt_freq', 'class', 'MetaRNN_score', 'MVP_score',
       'SIFT_score', 'Polyphen2_HVAR_score', 'CADD_raw', 'REVEL_score',
       'integrated_fitCons_score', 'phyloP17way_primate',
       'phastCons17way_primate', 'bStatistic'],
      dtype='object')
(1027660, 35)
Singleton     578418
Ultra-rare    388752
Rare           36200
Common         24290
Name: class, dtype: int64


In [5]:
patho_prots_set = set(patho_with_dbnsfp_df["prot_acc_version"].dropna().unique())
popufreq_prots_set = set(popu_freq_with_dbnsfp_df["prot_acc_version"].dropna().unique())

a = patho_prots_set.intersection(popufreq_prots_set)
print(len(patho_prots_set), len(popufreq_prots_set), len(a))

1598 16608 1594


In [6]:
# taking variants as neutral set from Common&Rare classes.
x = popu_freq_with_dbnsfp_df[popu_freq_with_dbnsfp_df["prot_acc_version"].isin(patho_prots_set)]
neutral_df = x[(x["class"]=="Common") | (x["class"]=="Rare")]

neutral_prots_set = set(neutral_df["prot_acc_version"].unique())
remaining_prots = patho_prots_set - neutral_prots_set
print("prots covered in Common&Rare: ", len(neutral_prots_set))
print("prots not covered in Common&Rare: ", len(remaining_prots))

print(neutral_df.shape)
print(neutral_df["class"].value_counts())
neutral_df.columns

prots covered in Common&Rare:  1247
prots not covered in Common&Rare:  351
(4960, 35)
Rare      3073
Common    1887
Name: class, dtype: int64


Index(['snp_id', 'gene_name', 'mane_refseq_prot', 'mane_refseq_nuc',
       'mane_status', 'chrom_acc_version', 'chrom_num', 'source_ref_allele',
       'source_alt_alleles', 'alfa_chrom_pos', 'alfa_ref_allele',
       'alfa_alt_allele', 'alfa_alt_alleles', 'prot_variant',
       'prot_acc_version', '1indexed_prot_mt_pos', 'wt_aa', 'mt_aa',
       'wt_aa_1letter', 'mt_aa_1letter', 'wt_population', 'mt_population',
       'wt_freq', 'mt_freq', 'class', 'MetaRNN_score', 'MVP_score',
       'SIFT_score', 'Polyphen2_HVAR_score', 'CADD_raw', 'REVEL_score',
       'integrated_fitCons_score', 'phyloP17way_primate',
       'phastCons17way_primate', 'bStatistic'],
      dtype='object')

In [7]:
cols_renaming_dict = {'alfa_chrom_pos':'chrom_pos', 'alfa_ref_allele':'ref_allele', 'alfa_alt_allele':'alt_allele',
                      'mane_refseq_nuc':'mrna_acc_version'}
neutral_renamed_df = neutral_df.rename(columns=cols_renaming_dict)

cols_from_popu = ['gene_name', 'snp_id', 'mrna_acc_version',
                'prot_variant', 'prot_acc_version', '1indexed_prot_mt_pos',
                'wt_aa', 'mt_aa', 'wt_aa_1letter', 'mt_aa_1letter',
                'chrom_acc_version', 'chrom_num', 'chrom_pos', 'ref_allele',
                'alt_allele', 'class', 'MetaRNN_score', 'MVP_score', 'SIFT_score',
                'Polyphen2_HVAR_score', 'CADD_raw', 'REVEL_score',
                'integrated_fitCons_score', 'phyloP17way_primate',
                'phastCons17way_primate', 'bStatistic']
neutral_selectedcols_df = neutral_renamed_df[cols_from_popu]

In [8]:
z = pd.concat([patho_with_dbnsfp_df, neutral_selectedcols_df])
print(z["class"].value_counts())
z

Likely-pathogenic    4804
Rare                 3073
Pathogenic           2499
Common               1887
Name: class, dtype: int64


Unnamed: 0,clinvar_id,gene_name,gene_id,snp_id,mrna_acc_version,mrna_gi,prot_variant,prot_acc_version,1indexed_prot_mt_pos,wt_aa,...,MetaRNN_score,MVP_score,SIFT_score,Polyphen2_HVAR_score,CADD_raw,REVEL_score,integrated_fitCons_score,phyloP17way_primate,phastCons17way_primate,bStatistic
0,1320032.0,PERM1,84808.0,,NM_001394713.1,2.031836e+09,(p.Val777Ala),NP_001381642.1,777,Val,...,7.783105e-07,,0.024000,,1.476529,0.016,0.615755,0.688,0.633,878.0
1,1452181.0,B3GALT6,126792.0,,NM_080605.4,1.435410e+09,(p.Met1Thr),NP_542172.2,1,Met,...,9.704089e-01,0.498705,0.000000,0.02600,2.745302,0.203,0.437478,0.408,0.737,745.0
2,2151860.0,B3GALT6,126792.0,,NM_080605.4,1.435410e+09,(p.Met1Ile),NP_542172.2,1,Met,...,9.637719e-01,0.639789,0.000000,0.02600,2.915920,0.240,0.437478,0.411,0.754,745.0
3,2506981.0,INTS11,54973.0,,NM_017871.6,1.677557e+09,(p.Val515Met),NP_060341.2,515,Val,...,5.925809e-01,0.654906,0.006000,0.54475,3.357409,0.158,0.706548,0.594,0.668,774.0
4,2506979.0,INTS11,54973.0,,NM_017871.6,1.677557e+09,(p.His414Tyr),NP_060341.2,414,His,...,9.879951e-01,0.945333,0.000000,1.00000,3.542766,0.874,0.671770,0.676,0.612,774.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1027573,,HPS1,,rs7914192,NM_000195.5,,NP_000186.2:p.Glu9Asp,NP_000186.2,9,Glu,...,7.298380e-03,0.252681,0.099333,0.87925,1.909228,0.144,0.706548,0.599,0.902,807.0
1027602,,CHAT,,rs3810948,NM_020549.5,,NP_065574.4:p.Asp47Glu,NP_065574.4,47,Asp,...,3.795713e-03,,0.000000,0.03300,1.426740,0.255,0.597740,0.491,0.098,599.0
1027644,,AIRE,,rs1800520,NM_000383.4,,NP_000374.1:p.Ser278Arg,NP_000374.1,278,Ser,...,4.146248e-03,,0.689000,0.00000,0.719650,0.200,0.580535,-0.181,0.007,929.0
1027651,,GJA3,,rs968566,NM_021954.4,,NP_068773.2:p.Leu299Met,NP_068773.2,299,Leu,...,6.860920e-07,,0.466000,0.00200,-1.319293,0.144,0.597740,-3.083,0.000,758.0


In [18]:
out_filepath = home_dir+f"data/datasets_patho/patho_likelypatho_neutral_dbnsfp"
z.to_csv(out_filepath+".tsv", sep="\t", index=False)

# Creating merged fasta document ...
protein_acc_list = list(z["prot_acc_version"].unique())
print(len(protein_acc_list))
from utils.ncbi_proteins import create_combined_fasta
create_combined_fasta(protein_acc_list, out_filepath+".fasta", home_dir)

1598
0 NP_001381642.1 Already existis
1 NP_542172.2 Already existis
2 NP_060341.2 Already existis
3 NP_001164006.1 Already existis
4 NP_002065.1 Already existis
5 NP_000806.2 Already existis
6 NP_003027.1 Already existis
7 NP_002608.1 Already existis
8 NP_009193.2 Already existis
9 NP_004276.2 Already existis
10 NP_005948.3 Already existis
11 NP_055689.1 Already existis
12 NP_000076.2 Already existis
13 NP_002991.2 Already existis
14 NP_000469.3 Already existis
15 NP_001782.1 Already existis
16 NP_005817.1 Already existis
17 NP_000138.2 Already existis
18 NP_065184.2 Already existis
19 NP_060116.2 Already existis
20 NP_116182.2 Already existis
21 NP_000301.1 Already existis
22 NP_683763.2 Already existis
23 NP_006507.2 Already existis
24 NP_005364.1 Already existis
25 NP_001246.2 Already existis
26 NP_060620.2 Already existis
27 NP_000365.3 Already existis
28 NP_116145.1 Already existis
29 NP_001041639.1 Already existis
30 NP_060209.4 Already existis
31 NP_000089.1 Already existis
32 N

In [9]:
model_names = ['MetaRNN_score', 'MVP_score', 'SIFT_score',
                'Polyphen2_HVAR_score', 'CADD_raw', 'REVEL_score',
                'integrated_fitCons_score', 'phyloP17way_primate',
                'phastCons17way_primate', 'bStatistic']
def print_missing_things(x:pd.DataFrame, classes):
    print("\t", end="")
    for i, cls in enumerate(classes):
        prots = x[(x["class"]==cls)]["prot_acc_version"].dropna().unique().shape[0]
        print(f"{cls}({prots})", end="\t")
    print()
    for model_name in model_names:
        print(model_name, end="\t")
        for i, cls in enumerate(classes):
            missing = x[(x["class"]==cls) & pd.isna(x[model_name])].shape[0]
            not_missing = x[(x["class"]==cls) & ~pd.isna(x[model_name])].shape[0]
            total = x[(x["class"]==cls)].shape[0]
            
            print(f"{missing}/{not_missing}", end="\t")
            if i==len(classes)-1: print()

print_missing_things(z, ["Pathogenic", "Likely-pathogenic", "Common", "Rare"])

	Pathogenic(967)	Likely-pathogenic(1210)	Common(803)	Rare(1102)	
MetaRNN_score	0/2499	0/4804	0/1887	0/3073	
MVP_score	13/2486	11/4793	1541/346	234/2839	
SIFT_score	82/2417	136/4668	55/1832	75/2998	
Polyphen2_HVAR_score	108/2391	211/4593	96/1791	131/2942	
CADD_raw	0/2499	0/4804	0/1887	0/3073	
REVEL_score	48/2451	68/4736	44/1843	62/3011	
integrated_fitCons_score	293/2206	527/4277	73/1814	162/2911	
phyloP17way_primate	0/2499	0/4804	1/1886	1/3072	
phastCons17way_primate	0/2499	0/4804	1/1886	1/3072	
bStatistic	44/2455	74/4730	11/1876	14/3059	


In [10]:

def print_summary(df, classes):
    print("", "#-genes", "#-proteins", "#-protein-variants", "#-unique-genomic-variants", sep="\t")
    for cls in classes:
        cls_df = df[df["class"]==cls]
        n_genes = cls_df["gene_name"].dropna().unique().shape[0]
        n_prots = cls_df["prot_acc_version"].dropna().unique().shape[0]
        n_prot_variants = cls_df["prot_variant"].shape[0] # this and the following line are same
        n_genomic_variants = cls_df[["chrom_num", "chrom_pos", "ref_allele", "alt_allele"]].drop_duplicates(keep="first").shape[0] # these keywords are from ALFAs
        print(cls, n_genes, n_prots, n_prot_variants, n_genomic_variants, sep="\t")

    total_n_genes = df["gene_name"].dropna().unique().shape[0]
    total_n_prots = df["prot_acc_version"].dropna().unique().shape[0]
    total_n_prot_variants = df["prot_variant"].shape[0]
    total_n_genomic_variants = df[["chrom_num", "chrom_pos", "ref_allele", "alt_allele"]].drop_duplicates(keep="first").shape[0] # these keywords are from ALFAs
    print("total", total_n_genes, total_n_prots, total_n_prot_variants, total_n_genomic_variants, sep="\t")

print_summary(z, ["Pathogenic", "Likely-pathogenic", "Common", "Rare"])


	#-genes	#-proteins	#-protein-variants	#-unique-genomic-variants
Pathogenic	967	967	2499	2499
Likely-pathogenic	1207	1210	4804	4803
Common	802	803	1887	1887
Rare	1102	1102	3073	3073
total	1598	1598	12263	12238
