In [1]:
import os
import sys
home_dir = "../../"
module_path = os.path.abspath(os.path.join(home_dir))
if module_path not in sys.path:
    sys.path.append(module_path)

import numpy as np
import pandas as pd

In [2]:
# model_root_and_name_tuple_list = [("vespa_marquet", "vespa"), ("dbnsfp", "sift"), ("dbnsfp", "polyphen2_HVAR"), ("esm_rives", "esm1b_t33_650M_UR50S"), ("bioembeddings_dallago", "prottrans_bert_bfd")]

model_root_and_name_tuple_list = [("dbnsfp", "sift"), ("dbnsfp", "polyphen2_HVAR"), ("dbnsfp", "cadd"), ("dbnsfp", "mvp"), ("dbnsfp", "metarnn"), ("dbnsfp", "revel"),
                                  ("sequnet_dunham", "sequnet"), ("vespa_marquet", "vespa"),
                                  ("tape_rao", "unirep"), ("tape_rao", "protbert"), 
                                  ("esm_rives", "esm1b_t33_650M_UR50S"), ("esm_rives", "esm1v_t33_650M_UR90S"), ("esm_rives", "esm2_t33_650M_UR50D"),
                                  ("bioembeddings_dallago", "plus_rnn"), 
                                  ("bioembeddings_dallago", "prottrans_bert_bfd"), ("bioembeddings_dallago", "prottrans_albert_bfd"),
                                  ("bioembeddings_dallago", "prottrans_xlnet_uniref100"), 
                                  ("bioembeddings_dallago", "prottrans_t5_bfd"), ("bioembeddings_dallago", "prottrans_t5_uniref50"), ("bioembeddings_dallago", "prottrans_t5_xl_u50"),
                                  ("dbnsfp", "integrated_fitCons"), ("dbnsfp", "phyloP17way_primate"), ("dbnsfp", "phastCons17way_primate"), ("dbnsfp", "bStatistic")]

### Helper functions

In [3]:
supervised_method_cols = ['sift_pred', 'polyphen2_HVAR_pred', 'cadd_pred', 'mvp_pred', 'metarnn_pred', 'revel_pred', 'sequnet_pred', 'vespa_pred']
def get_n_supervised_methods_havings_prediction(row):
    # print(row)
    n = 0
    for col in supervised_method_cols:
        # print(pd.isna(row[col]))
        if not pd.isna(row[col]):
            n += 1
    # print(n)
    return n

### CDD Conservation dataframe

In [25]:
conservation_df = pd.read_csv(home_dir+"data/cdd_conservation/cdd_conservationTable_18kHumanProts.csv.gzip", compression='gzip', comment='#')
conservation_df = conservation_df.drop_duplicates(["NPid", "qPos"], keep="first")
print(conservation_df[["NPid", "qPos"]].value_counts())
print(conservation_df.shape)
print(conservation_df.columns)

NPid         qPos
NP_000005.3  129     1
NP_056277.2  237     1
             224     1
             225     1
             226     1
                    ..
NP_001703.2  263     1
             264     1
             265     1
             266     1
NP_999627.2  774     1
Length: 4097087, dtype: int64
(4097087, 36)
Index(['qNo', 'NPid', 'accession', 'que', 'sub', 'ali', 'qPos', 'sPos', 'aPos',
       'qPos_', 'sPos_', 'bitscore', 'A', 'D', 'C', 'E', 'F', 'G', 'H', 'I',
       'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'cons',
       'entropy', 'qcov', 'CScore'],
      dtype='object')


In [26]:
def merge_conservation_df(df:pd.DataFrame):
    # merge conservation_df with the input df
    temp_conservation_df = conservation_df[["NPid", "qPos", "CScore"]].drop_duplicates(keep="first")
    merged_df = pd.merge(df, temp_conservation_df, how="left", left_on=["prot_acc_version", "prot_pos"], right_on=["NPid", "qPos"])
    columns = list(df.columns)
    columns.append("CScore")
    merged_df = merged_df[columns]
    merged_df = merged_df.rename(columns={"CScore": "conservation_pred"})
    return merged_df

### Merging pathogenicity analysis predictions

In [4]:
from models.aa_common.data_loader import get_patho_and_likelypatho_SNVs
patho_and_likelypatho_variants_df = get_patho_and_likelypatho_SNVs(home_dir)
patho_and_likelypatho_variants_df.value_counts()

raw data: (10282, 12)
Index(['clinvar_id', 'gene_symbol', 'gene_id', 'chrom_acc_version',
       'chrom_pos', 'ref_allele', 'alt_allele', 'prot_acc_version', 'prot_pos',
       'wt', 'mut', 'class'],
      dtype='object')

Log: excluding variants corresponding to proteins having seq-len>1022 ...

Log: Loading combined fasta iterator ...
#-protein sequences (seq-len<=1022): 1434
Pathogenic: 2308, Likely pathogenic: 4168, total: (6476, 12)


clinvar_id  gene_symbol  gene_id  chrom_acc_version  chrom_pos  ref_allele  alt_allele  prot_acc_version  prot_pos  wt  mut  class            
492         QDPR         5860.0   NC_000004.12       17509363   A           G           NP_000311.2       36        W   R    pathogenic           1
1722630     G6PD         2539.0   NC_000023.11       154532389  C           A           NP_001346945.1    454       R   L    likely_pathogenic    1
1722650     G6PD         2539.0   NC_000023.11       154536169  C           T           NP_001346945.1    44        A   T    pathogenic           1
1722647     G6PD         2539.0   NC_000023.11       154533077  C           T           NP_001346945.1    306       G   S    pathogenic           1
1722638     G6PD         2539.0   NC_000023.11       154532258  G           A           NP_001346945.1    463       R   C    likely_pathogenic    1
                                                                                                                     

In [7]:
task = "patho_and_likelypatho"

merged_df = patho_and_likelypatho_variants_df.copy(deep=True)
merge_col_list = list(patho_and_likelypatho_variants_df.columns)

for i, (model_root, model_name) in enumerate(model_root_and_name_tuple_list):    
    models_pred_df = pd.read_csv(home_dir+f"models/{model_root}/outputs/{model_name}/{task}/preds_{model_name}.tsv", sep="\t")
    print(model_root, model_name, models_pred_df.shape)
    merged_df = pd.merge(left=merged_df, right=models_pred_df, how="left", on=merge_col_list)
    merged_df = merged_df.rename(columns={"pred": f"{model_name}_pred"})
    print(merged_df.shape)
    # if i==5: break

# merged_df = merge_conservation_df(merged_df)  
# print("conservation_score")  
# print(merged_df.shape)

dbnsfp sift (6476, 13)
(6476, 13)
dbnsfp polyphen2_HVAR (6476, 13)
(6476, 14)
dbnsfp cadd (6476, 13)
(6476, 15)
dbnsfp mvp (6476, 13)
(6476, 16)
dbnsfp metarnn (6476, 13)
(6476, 17)
dbnsfp revel (6476, 13)
(6476, 18)
sequnet_dunham sequnet (6472, 13)
(6476, 19)
vespa_marquet vespa (6476, 13)
(6476, 20)
tape_rao unirep (6476, 13)
(6476, 21)
tape_rao protbert (6476, 13)
(6476, 22)
esm_rives esm1b_t33_650M_UR50S (6476, 13)
(6476, 23)
esm_rives esm1v_t33_650M_UR90S (6476, 13)
(6476, 24)
esm_rives esm2_t33_650M_UR50D (6476, 13)
(6476, 25)
bioembeddings_dallago plus_rnn (6476, 13)
(6476, 26)
bioembeddings_dallago prottrans_bert_bfd (6476, 13)
(6476, 27)
bioembeddings_dallago prottrans_albert_bfd (6476, 13)
(6476, 28)
bioembeddings_dallago prottrans_xlnet_uniref100 (6476, 13)
(6476, 29)
bioembeddings_dallago prottrans_t5_bfd (6476, 13)
(6476, 30)
bioembeddings_dallago prottrans_t5_uniref50 (6476, 13)
(6476, 31)
bioembeddings_dallago prottrans_t5_xl_u50 (6476, 13)
(6476, 32)
dbnsfp integrated_

In [8]:
for i in range(merged_df.shape[0]):
    merged_df.loc[i, "n_methods_having_preds"] = get_n_supervised_methods_havings_prediction(merged_df.loc[i])

merged_df.replace(["pathogenic", "likely_pathogenic"], ["Pathogenic", "Likely-pathogenic"], inplace=True)
merged_df.to_csv(home_dir+f"models/aa_common/merged_predictions/{task}_analysis.csv", sep="\t", index=False, header=True)

### Merging population frequency predictions

In [9]:
from models.aa_common.data_loader import get_population_freq_SNVs
popu_freq_variants_df = get_population_freq_SNVs(home_dir)
popu_freq_variants_df


Log: Loading data ...
(2865836, 14)
Index(['snp_id', 'chrom_acc_version', 'chrom_pos', 'ref_allele', 'alt_allele',
       'prot_acc_version', 'prot_pos', 'wt', 'mut', 'wt_population',
       'mut_poulation', 'wt_freq', 'mt_freq', 'class'],
      dtype='object')
NP_000242.1       1154
NP_000526.2       1056
NP_003493.1        898
NP_000240.1        856
NP_004351.1        830
                  ... 
NP_001345345.1       1
NP_001577.1          1
NP_001005166.3       1
NP_000992.1          1
NP_001091888.1       1
Name: prot_acc_version, Length: 15962, dtype: int64
Common: 18167, rare: 28622, ultra-rare: 314013, singletons: 462444, zero-population: 2042590 and total: 2865836


Unnamed: 0,snp_id,chrom_acc_version,chrom_pos,ref_allele,alt_allele,prot_acc_version,prot_pos,wt,mut,wt_population,mut_poulation,wt_freq,mt_freq,class
0,rs1474861721,NC_000020.11,33659968,A,G,NP_112509.3,187,L,P,10680,0,1.000000,0.000000,Zero-population
1,rs1473838200,NC_000020.11,33660296,C,T,NP_112509.3,163,A,T,10680,0,1.000000,0.000000,Zero-population
2,rs1473404810,NC_000020.11,33657999,G,A,NP_112509.3,369,R,C,14050,0,1.000000,0.000000,Zero-population
3,rs1469318731,NC_000020.11,33669380,T,C,NP_112509.3,128,K,E,35244,1,0.999972,0.000028,Singleton
4,rs1469008200,NC_000020.11,33658753,C,G,NP_112509.3,321,D,H,14050,0,1.000000,0.000000,Zero-population
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2865831,rs201657622,NC_000001.11,144373466,C,T,NP_001137504.2,40,S,N,4294,59,0.986446,0.013554,Common
2865832,rs200452456,NC_000024.10,23229611,A,T,NP_001376232.1,171,Y,F,4262,2,0.999531,0.000469,Ultra-rare
2865833,rs142112856,NC_000004.12,9325189,A,G,NP_001229256.1,9,R,G,11862,3689,0.762781,0.237219,Common
2865834,rs28613881,NC_000004.12,9363153,A,G,NP_001243796.1,9,R,G,11862,6624,0.641675,0.358325,Common


In [12]:
task = "popu_freq"
merged_df = popu_freq_variants_df.copy(deep=True)
merge_col_list = list(popu_freq_variants_df.columns)
# merge_col_list.remove("class")

for i, (model_root, model_name) in enumerate(model_root_and_name_tuple_list):
    models_pred_df = pd.read_csv(home_dir+f"models/{model_root}/outputs/{model_name}/{task}/preds_{model_name}.tsv", sep="\t")
    print(model_root, model_name, models_pred_df.shape)
    merged_df = pd.merge(left=merged_df, right=models_pred_df, how="left", on=merge_col_list)
    merged_df = merged_df.rename(columns={"pred": f"{model_name}_pred"})
    merged_df = merged_df.drop_duplicates(keep="first")
    print(i, merged_df.shape)
    # if i==3: break

# merged_df = merge_conservation_df(merged_df)  
# print("conservation_score")  

dbnsfp sift (2865836, 15)
0 (2865836, 15)
dbnsfp polyphen2_HVAR (2865836, 15)
1 (2865836, 16)
dbnsfp cadd (2865836, 15)
2 (2865836, 17)
dbnsfp mvp (2865836, 15)
3 (2865836, 18)
dbnsfp metarnn (2865836, 15)
4 (2865836, 19)
dbnsfp revel (2865836, 15)
5 (2865836, 20)
sequnet_dunham sequnet (2862363, 15)
6 (2865836, 21)
vespa_marquet vespa (2865836, 15)
7 (2865836, 22)
tape_rao unirep (2865836, 15)
8 (2865836, 23)
tape_rao protbert (2865836, 15)
9 (2865836, 24)
esm_rives esm1b_t33_650M_UR50S (2865836, 15)
10 (2865836, 25)
esm_rives esm1v_t33_650M_UR90S (2865836, 15)
11 (2865836, 26)
esm_rives esm2_t33_650M_UR50D (2865836, 15)
12 (2865836, 27)
bioembeddings_dallago plus_rnn (2865836, 15)
13 (2865836, 28)
bioembeddings_dallago prottrans_bert_bfd (2865836, 15)
14 (2865836, 29)
bioembeddings_dallago prottrans_albert_bfd (2865836, 15)
15 (2865836, 30)
bioembeddings_dallago prottrans_xlnet_uniref100 (2865836, 15)
16 (2865836, 31)
bioembeddings_dallago prottrans_t5_bfd (2865836, 15)
17 (2865836, 

In [13]:
for i in range(merged_df.shape[0]):
    if i%10000==0: print(i)
    merged_df.loc[i, "n_methods_having_preds"] = get_n_supervised_methods_havings_prediction(merged_df.loc[i])
merged_df.to_csv(home_dir+f"models/aa_common/merged_predictions/{task}_analysis.csv", sep="\t", index=False, header=True)
print(merged_df.shape)
merged_df.columns

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000
1180000
1190000
1200000
1210000
1220000
1230000
1240000
1250000
1260000
1270000
1280000
1290000
1300000
1310000
1320000
1330000
1340000
1350000
1360000
1370000
1380000
13

Index(['snp_id', 'chrom_acc_version', 'chrom_pos', 'ref_allele', 'alt_allele',
       'prot_acc_version', 'prot_pos', 'wt', 'mut', 'wt_population',
       'mut_poulation', 'wt_freq', 'mt_freq', 'class', 'sift_pred',
       'polyphen2_HVAR_pred', 'cadd_pred', 'mvp_pred', 'metarnn_pred',
       'revel_pred', 'sequnet_pred', 'vespa_pred', 'unirep_pred',
       'protbert_pred', 'esm1b_t33_650M_UR50S_pred',
       'esm1v_t33_650M_UR90S_pred', 'esm2_t33_650M_UR50D_pred',
       'plus_rnn_pred', 'prottrans_bert_bfd_pred', 'prottrans_albert_bfd_pred',
       'prottrans_xlnet_uniref100_pred', 'prottrans_t5_bfd_pred',
       'prottrans_t5_uniref50_pred', 'prottrans_t5_xl_u50_pred',
       'integrated_fitCons_pred', 'phyloP17way_primate_pred',
       'phastCons17way_primate_pred', 'bStatistic_pred',
       'n_methods_having_preds'],
      dtype='object')

In [14]:
merged_df["class"].value_counts()

Zero-population    2042590
Singleton           462444
Ultra-rare          314013
Rare                 28622
Common               18167
Name: class, dtype: int64