In [12]:
import os
import sys
# Put the repository root (two levels above this notebook) on sys.path so
# that project packages such as `models.*` can be imported further down.
home_dir = "../../"
module_path = os.path.abspath(home_dir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

import numpy as np
import pandas as pd

In [13]:
# model_root_and_name_tuple_list = [("vespa_marquet", "vespal"), ("dbnsfp", "sift"), ("dbnsfp", "polyphen2_HVAR"), ("esm_rives", "esm1b_t33_650M_UR50S"), ("bioembeddings_dallago", "prottrans_bert_bfd")]

# (model_root, model_name) pairs; each pair selects one prediction file under
# models/<model_root>/outputs/<model_name>/... when the merge cells run.
# The order here is the order of the *_pred columns in the merged tables.
model_root_and_name_tuple_list = [
    # dbnsfp
    ("dbnsfp", "sift"),
    ("dbnsfp", "polyphen2_HVAR"),
    ("dbnsfp", "cadd"),
    ("dbnsfp", "mvp"),
    ("dbnsfp", "metarnn"),
    ("dbnsfp", "revel"),
    # tape_rao
    ("tape_rao", "unirep"),
    ("tape_rao", "protbert"),
    # sequnet_dunham
    ("sequnet_dunham", "sequnet"),
    # esm_rives
    ("esm_rives", "esm1b_t33_650M_UR50S"),
    ("esm_rives", "esm1v_t33_650M_UR90S"),
    ("esm_rives", "esm2_t33_650M_UR50D"),
    # bioembeddings_dallago
    ("bioembeddings_dallago", "plus_rnn"),
    ("bioembeddings_dallago", "prottrans_bert_bfd"),
    ("bioembeddings_dallago", "prottrans_albert_bfd"),
    ("bioembeddings_dallago", "prottrans_xlnet_uniref100"),
    ("bioembeddings_dallago", "prottrans_t5_bfd"),
    ("bioembeddings_dallago", "prottrans_t5_uniref50"),
    ("bioembeddings_dallago", "prottrans_t5_xl_u50"),
    # vespa_marquet
    ("vespa_marquet", "vespal"),
]

### CDD Conservation dataframe

In [25]:
# Read the gzip-compressed conservation table (lines starting with '#' are
# skipped) and keep only the first row for each (NPid, qPos) pair.
conservation_df = (
    pd.read_csv(home_dir+"data/cdd_conservationTable_18kHumanProts.csv.gzip", compression='gzip', comment='#')
    .drop_duplicates(["NPid", "qPos"], keep="first")
)

# Sanity output: per-key row counts, overall shape, and the column names.
for summary in (
    conservation_df[["NPid", "qPos"]].value_counts(),
    conservation_df.shape,
    conservation_df.columns,
):
    print(summary)

NPid         qPos
NP_000005.3  129     1
NP_056277.2  237     1
             224     1
             225     1
             226     1
                    ..
NP_001703.2  263     1
             264     1
             265     1
             266     1
NP_999627.2  774     1
Length: 4097087, dtype: int64
(4097087, 36)
Index(['qNo', 'NPid', 'accession', 'que', 'sub', 'ali', 'qPos', 'sPos', 'aPos',
       'qPos_', 'sPos_', 'bitscore', 'A', 'D', 'C', 'E', 'F', 'G', 'H', 'I',
       'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'cons',
       'entropy', 'qcov', 'CScore'],
      dtype='object')


In [26]:
def merge_conservation_df(df: pd.DataFrame, conservation: pd.DataFrame = None) -> pd.DataFrame:
    """Append a per-position conservation score to a variants table.

    Rows of ``df`` are matched on (``prot_acc_version``, ``prot_pos``) against
    (``NPid``, ``qPos``) of the conservation table with a left join, so every
    input row is kept; rows without a match get NaN.

    Args:
        df: Variants table; must contain ``prot_acc_version`` and ``prot_pos``.
        conservation: Table with ``NPid``, ``qPos`` and ``CScore`` columns.
            When omitted, the notebook-level ``conservation_df`` is used.

    Returns:
        A new frame with the columns of ``df`` (same order) followed by
        ``conservation_pred`` (the matched ``CScore``).
    """
    if conservation is None:
        # Fall back to the table loaded earlier in the notebook.
        conservation = conservation_df

    # De-duplicate on the join key itself. De-duplicating on the full
    # (NPid, qPos, CScore) triple would keep two rows for a position that has
    # two different scores, and the left join would then multiply variant rows.
    scores = conservation[["NPid", "qPos", "CScore"]].drop_duplicates(
        subset=["NPid", "qPos"], keep="first"
    )

    merged_df = pd.merge(
        df,
        scores,
        how="left",
        left_on=["prot_acc_version", "prot_pos"],
        right_on=["NPid", "qPos"],
    )

    # Drop the helper key columns brought in from the right-hand side.
    merged_df = merged_df[list(df.columns) + ["CScore"]]
    return merged_df.rename(columns={"CScore": "conservation_pred"})

### Merging pathogenicity analysis predictions

In [27]:
# Project-local loader for the pathogenicity task; what it filters or logs is
# defined in models.aa_common.data_loader, not in this notebook.
from models.aa_common.data_loader import get_patho_and_likelypatho_SNVs
patho_and_likelypatho_variants_df = get_patho_and_likelypatho_SNVs(home_dir)
# Cell output: how many times each distinct row occurs in the returned table
# (presumably a DataFrame, since it is copied and merged as one further down).
patho_and_likelypatho_variants_df.value_counts()

raw data: (10282, 12)
Index(['clinvar_id', 'gene_symbol', 'gene_id', 'chrom_acc_version',
       'chrom_pos', 'ref_allele', 'alt_allele', 'prot_acc_version', 'prot_pos',
       'wt', 'mut', 'class'],
      dtype='object')

Log: excluding variants corresponding to proteins having seq-len>1022 ...

Log: Loading combined fasta iterator ...
#-protein sequences (seq-len<=1022): 1434
Pathogenic: 2308, Likely pathogenic: 4168, total: (6476, 12)


clinvar_id  gene_symbol  gene_id  chrom_acc_version  chrom_pos  ref_allele  alt_allele  prot_acc_version  prot_pos  wt  mut  class            
492         QDPR         5860.0   NC_000004.12       17509363   A           G           NP_000311.2       36        W   R    pathogenic           1
1722630     G6PD         2539.0   NC_000023.11       154532389  C           A           NP_001346945.1    454       R   L    likely_pathogenic    1
1722650     G6PD         2539.0   NC_000023.11       154536169  C           T           NP_001346945.1    44        A   T    pathogenic           1
1722647     G6PD         2539.0   NC_000023.11       154533077  C           T           NP_001346945.1    306       G   S    pathogenic           1
1722638     G6PD         2539.0   NC_000023.11       154532258  G           A           NP_001346945.1    463       R   C    likely_pathogenic    1
                                                                                                                     

In [28]:
task = "patho_and_likelypatho"

# Every column of the variants table is used as the join key, so a prediction
# row only matches when all of those columns agree.
merge_col_list = list(patho_and_likelypatho_variants_df.columns)
merged_df = patho_and_likelypatho_variants_df.copy(deep=True)

for i, (model_root, model_name) in enumerate(model_root_and_name_tuple_list):
    preds_path = home_dir + f"models/{model_root}/outputs/{model_name}/{task}/preds_{model_name}.csv"
    models_pred_df = pd.read_csv(preds_path, sep="\t")
    print(model_root, model_name, models_pred_df.shape)

    # Left join: variants without a prediction from this model are kept (NaN).
    # The generic "pred" column is renamed so each model gets its own column.
    merged_df = (
        pd.merge(left=merged_df, right=models_pred_df, how="left", on=merge_col_list)
        .rename(columns={"pred": f"{model_name}_pred"})
    )
    print(merged_df.shape)

# Add the conservation score as one more *_pred column, then persist.
merged_df = merge_conservation_df(merged_df)
print("conservation_score")
print(merged_df.shape)
output_path = home_dir + f"models/aa_common/merged_predictions/{task}_analysis.csv"
merged_df.to_csv(output_path, sep="\t", index=False, header=True)

dbnsfp sift (6476, 13)
(6476, 13)
dbnsfp polyphen2_HVAR (6476, 13)
(6476, 14)
dbnsfp cadd (6476, 13)
(6476, 15)
dbnsfp mvp (6476, 13)
(6476, 16)
dbnsfp metarnn (6476, 13)
(6476, 17)
dbnsfp revel (6476, 13)
(6476, 18)
tape_rao unirep (6476, 13)
(6476, 19)
tape_rao protbert (6476, 13)
(6476, 20)
sequnet_dunham sequnet (6472, 13)
(6476, 21)
esm_rives esm1b_t33_650M_UR50S (6476, 13)
(6476, 22)
esm_rives esm1v_t33_650M_UR90S (6476, 13)
(6476, 23)
esm_rives esm2_t33_650M_UR50D (6476, 13)
(6476, 24)
bioembeddings_dallago plus_rnn (6476, 13)
(6476, 25)
bioembeddings_dallago prottrans_bert_bfd (6476, 13)
(6476, 26)
bioembeddings_dallago prottrans_albert_bfd (6476, 13)
(6476, 27)
bioembeddings_dallago prottrans_xlnet_uniref100 (6476, 13)
(6476, 28)
bioembeddings_dallago prottrans_t5_bfd (6476, 13)
(6476, 29)
bioembeddings_dallago prottrans_t5_uniref50 (6476, 13)
(6476, 30)
bioembeddings_dallago prottrans_t5_xl_u50 (6476, 13)
(6476, 31)
vespa_marquet vespal (5571, 13)
(6476, 32)
conservation_scor

### Merging population frequency predictions

In [29]:
# Project-local loader for the population-frequency task; its behaviour is
# defined in models.aa_common.data_loader, not in this notebook.
from models.aa_common.data_loader import get_population_freq_SNVs
popu_freq_variants_df = get_population_freq_SNVs(home_dir=home_dir)
# Cell output: rich display of the returned table.
popu_freq_variants_df


Log: Loading data ...
(2865836, 14)
Index(['snp_id', 'chrom_acc_version', 'chrom_pos', 'ref_allele', 'alt_allele',
       'prot_acc_version', 'prot_pos', 'wt', 'mut', 'wt_population',
       'mut_poulation', 'wt_freq', 'mt_freq', 'class'],
      dtype='object')
NP_000242.1       1154
NP_000526.2       1056
NP_003493.1        898
NP_000240.1        856
NP_004351.1        830
                  ... 
NP_001345345.1       1
NP_001577.1          1
NP_001005166.3       1
NP_000992.1          1
NP_001091888.1       1
Name: prot_acc_version, Length: 15962, dtype: int64
Common: 18167, rare: 28622, ultra-rare: 314013, singletons: 462444, zero-population: 2042590 and total: 2865836


Unnamed: 0,snp_id,chrom_acc_version,chrom_pos,ref_allele,alt_allele,prot_acc_version,prot_pos,wt,mut,wt_population,mut_poulation,wt_freq,mt_freq,class
0,rs1474861721,NC_000020.11,33659968,A,G,NP_112509.3,187,L,P,10680,0,1.000000,0.000000,Zero-population
1,rs1473838200,NC_000020.11,33660296,C,T,NP_112509.3,163,A,T,10680,0,1.000000,0.000000,Zero-population
2,rs1473404810,NC_000020.11,33657999,G,A,NP_112509.3,369,R,C,14050,0,1.000000,0.000000,Zero-population
3,rs1469318731,NC_000020.11,33669380,T,C,NP_112509.3,128,K,E,35244,1,0.999972,0.000028,Singleton
4,rs1469008200,NC_000020.11,33658753,C,G,NP_112509.3,321,D,H,14050,0,1.000000,0.000000,Zero-population
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2865831,rs201657622,NC_000001.11,144373466,C,T,NP_001137504.2,40,S,N,4294,59,0.986446,0.013554,Common
2865832,rs200452456,NC_000024.10,23229611,A,T,NP_001376232.1,171,Y,F,4262,2,0.999531,0.000469,Ultra-rare
2865833,rs142112856,NC_000004.12,9325189,A,G,NP_001229256.1,9,R,G,11862,3689,0.762781,0.237219,Common
2865834,rs28613881,NC_000004.12,9363153,A,G,NP_001243796.1,9,R,G,11862,6624,0.641675,0.358325,Common


In [30]:
task = "popu_freq"

# All columns of the variants table, including the `class` label, form the
# join key for the per-model prediction files.
merge_col_list = list(popu_freq_variants_df.columns)
merged_df = popu_freq_variants_df.copy(deep=True)

for i, (model_root, model_name) in enumerate(model_root_and_name_tuple_list):
    preds_path = home_dir + f"models/{model_root}/outputs/{model_name}/{task}/preds_{model_name}.csv"
    models_pred_df = pd.read_csv(preds_path, sep="\t")
    print(model_root, model_name, models_pred_df.shape)

    # Left join keeps every variant; exact duplicate rows produced by the
    # join are removed again before the next model is merged in.
    merged_df = (
        pd.merge(left=merged_df, right=models_pred_df, how="left", on=merge_col_list)
        .rename(columns={"pred": f"{model_name}_pred"})
        .drop_duplicates(keep="first")
    )
    print(i, merged_df.shape)

# Add the conservation score as one more *_pred column, then persist.
merged_df = merge_conservation_df(merged_df)
print("conservation_score")
output_path = home_dir + f"models/aa_common/merged_predictions/{task}_analysis.csv"
merged_df.to_csv(output_path, sep="\t", index=False, header=True)
print(merged_df.shape)
merged_df.columns

dbnsfp sift (2865836, 15)
0 (2865836, 15)
dbnsfp polyphen2_HVAR (2865836, 15)
1 (2865836, 16)
dbnsfp cadd (2865836, 15)
2 (2865836, 17)
dbnsfp mvp (2865836, 15)
3 (2865836, 18)
dbnsfp metarnn (2865836, 15)
4 (2865836, 19)
dbnsfp revel (2865836, 15)
5 (2865836, 20)
tape_rao unirep (2865836, 15)
6 (2865836, 21)
tape_rao protbert (2865836, 15)
7 (2865836, 22)
sequnet_dunham sequnet (2862363, 15)
8 (2865836, 23)
esm_rives esm1b_t33_650M_UR50S (2865836, 15)
9 (2865836, 24)
esm_rives esm1v_t33_650M_UR90S (2865836, 15)
10 (2865836, 25)
esm_rives esm2_t33_650M_UR50D (2865836, 15)
11 (2865836, 26)
bioembeddings_dallago plus_rnn (2865836, 15)
12 (2865836, 27)
bioembeddings_dallago prottrans_bert_bfd (2865836, 15)
13 (2865836, 28)
bioembeddings_dallago prottrans_albert_bfd (2865836, 15)
14 (2865836, 29)
bioembeddings_dallago prottrans_xlnet_uniref100 (2865836, 15)
15 (2865836, 30)
bioembeddings_dallago prottrans_t5_bfd (2865836, 15)
16 (2865836, 31)
bioembeddings_dallago prottrans_t5_uniref50 (28

Index(['snp_id', 'chrom_acc_version', 'chrom_pos', 'ref_allele', 'alt_allele',
       'prot_acc_version', 'prot_pos', 'wt', 'mut', 'wt_population',
       'mut_poulation', 'wt_freq', 'mt_freq', 'class', 'sift_pred',
       'polyphen2_HVAR_pred', 'cadd_pred', 'mvp_pred', 'metarnn_pred',
       'revel_pred', 'unirep_pred', 'protbert_pred', 'sequnet_pred',
       'esm1b_t33_650M_UR50S_pred', 'esm1v_t33_650M_UR90S_pred',
       'esm2_t33_650M_UR50D_pred', 'plus_rnn_pred', 'prottrans_bert_bfd_pred',
       'prottrans_albert_bfd_pred', 'prottrans_xlnet_uniref100_pred',
       'prottrans_t5_bfd_pred', 'prottrans_t5_uniref50_pred',
       'prottrans_t5_xl_u50_pred', 'vespal_pred', 'conservation_pred'],
      dtype='object')

In [31]:
# Count rows per frequency class in one pass; a label that does not occur
# in the table counts as 0.
class_counts = merged_df["class"].value_counts()
n_commnon = int(class_counts.get("Common", 0))
n_rare = int(class_counts.get("Rare", 0))
n_ultra_rare = int(class_counts.get("Ultra-rare", 0))
n_singleton = int(class_counts.get("Singleton", 0))
n_zero_population = int(class_counts.get("Zero-population", 0))
print(f"Common: {n_commnon}, rare: {n_rare}, ultra-rare: {n_ultra_rare}, singletons: {n_singleton}, zero-population: {n_zero_population} and total: {merged_df.shape}")

Common: 18167, rare: 28622, ultra-rare: 314013, singletons: 462444, zero-population: 2042590 and total: (2865836, 35)


### Merging protein mutation dataset (PMD) predictions

In [14]:
# Project-local loader for the PMD task; its behaviour is defined in
# models.aa_common.data_loader, not in this notebook.
from models.aa_common.data_loader import get_pmd_dataset
pmd_variants_df = get_pmd_dataset(home_dir)


Log: Loading Protein Mutation Dataset (PMD) ...


  pmd_df = pd.read_csv(home_dir+"models/aa_common/datasets_pmd_analysis/pmd_data.tsv", sep="\t") # PMD: protein mutation dataset


(66281, 32)
Index(['mut_id', 'pmd_id', 'nr', 'crossref', 'uniprot_id', 'ensembl_id',
       'taxid', 'protein', 'mut_PMD', 'mut_real', 'wt', 'mut', 'prot_pos',
       'function_summarized', 'functional_effect', 'function', 'seq', 'snp_id',
       'mrna_acc', 'mrna_ver', 'mrna_pos', 'allele', 'protein_acc',
       'protein_ver', 'verified', 'chrom', 'chrom_pos', 'variation',
       'variant_type', 'ref_allele', 'alt_allele', 'pmd_nr_id'],
      dtype='object')

Log: excluding variants corresponding to proteins having seq-len>1022 ...

Log: Loading combined fasta iterator ...
#-protein sequences (seq-len<=1022): 10750
(60656, 32)


In [23]:
task = "pmd"

# Every column of the PMD variants table is used as the join key.
merge_col_list = list(pmd_variants_df.columns)
merged_df = pmd_variants_df.copy(deep=True)

for i, (model_root, model_name) in enumerate(model_root_and_name_tuple_list):
    # For this task the "vespal" entry is read from the "vespa" output
    # directory, and its column is therefore named vespa_pred.
    if model_name == "vespal":
        model_name = "vespa"

    preds_path = home_dir + f"models/{model_root}/outputs/{model_name}/{task}/preds_{model_name}.tsv"
    # Exact duplicate prediction rows are dropped before joining.
    models_pred_df = pd.read_csv(preds_path, sep="\t").drop_duplicates(keep="first")
    print(model_root, model_name, models_pred_df.shape)

    # Left join keeps every variant; "pred" becomes a per-model column.
    merged_df = (
        pd.merge(left=merged_df, right=models_pred_df, how="left", on=merge_col_list)
        .rename(columns={"pred": f"{model_name}_pred"})
    )
    print(merged_df.shape)

dbnsfp sift (60656, 33)
(60656, 33)
dbnsfp polyphen2_HVAR (60656, 33)
(60656, 34)
dbnsfp cadd (60656, 33)
(60656, 35)
dbnsfp mvp (60656, 33)
(60656, 36)
dbnsfp metarnn (60656, 33)
(60656, 37)
dbnsfp revel (60656, 33)
(60656, 38)
tape_rao unirep (60656, 33)
(60656, 39)
tape_rao protbert (60656, 33)
(60656, 40)
sequnet_dunham sequnet (60347, 33)
(60656, 41)
esm_rives esm1b_t33_650M_UR50S (60656, 33)
(60656, 42)
esm_rives esm1v_t33_650M_UR90S (60656, 33)
(60656, 43)
esm_rives esm2_t33_650M_UR50D (60656, 33)
(60656, 44)
bioembeddings_dallago plus_rnn (60427, 33)
(60656, 45)
bioembeddings_dallago prottrans_bert_bfd (60427, 33)
(60656, 46)
bioembeddings_dallago prottrans_albert_bfd (60427, 33)
(60656, 47)
bioembeddings_dallago prottrans_xlnet_uniref100 (60427, 33)
(60656, 48)
bioembeddings_dallago prottrans_t5_bfd (60427, 33)
(60656, 49)
bioembeddings_dallago prottrans_t5_uniref50 (60427, 33)
(60656, 50)
bioembeddings_dallago prottrans_t5_xl_u50 (60427, 33)
(60656, 51)
vespa_marquet vespa (6

In [24]:
# Load the SNPdbe geno2func table (tab-separated).
geno2func = pd.read_csv(home_dir+"data/SNPdbe/geno2func.tsv", sep="\t")

# The literal string "\N" marks missing values in this file (presumably the
# NULL marker of the database dump it came from). Replace it with NaN rather
# than None: on pandas < 1.4 an explicit `value=None` is treated as "no value
# given" and matching cells are pad-filled from the previous row, which would
# silently copy a neighbouring variant's score instead of blanking the cell.
geno2func = geno2func.replace("\\N", np.nan)

print(geno2func.shape)
print(geno2func.columns)

# Keep only the join key and the score merged downstream.
geno2func = geno2func[['mut_id', 'SIFT_score']] # if needed, add more columns

  geno2func = pd.read_csv(home_dir+f"data/SNPdbe/geno2func.tsv", sep="\t")


(1748930, 23)
Index(['mut_id', 'md5', 'wt', 'pos', 'mt', 'in_dbSNP', 'in_SP', 'in_PMD',
       'in_1KG', 'SNAP_status', 'SNAP_bin', 'SNAP_score', 'SNAP_ri',
       'SNAP_acc', 'SIFT_bin', 'SIFT_score', 'PERC_wt', 'PERC_mt', 'PSSM_wt',
       'PSSM_mt', 'PSIC_wt', 'PSIC_mt', 'pph2'],
      dtype='object')


In [25]:
# Attach the SNPdbe SIFT score to each variant by `mut_id` (left join, so
# variants without a score are kept) under its own *_pred column name.
merged_df = (
    pd.merge(left=merged_df, right=geno2func, how="left", on=["mut_id"])
    .rename(columns={"SIFT_score": "sift_snpdbe_pred"})
)

for summary in (merged_df.shape, merged_df.columns):
    print(summary)

# Persist the merged table for the current task.
output_path = home_dir + f"models/aa_common/merged_predictions/{task}.tsv"
merged_df.to_csv(output_path, sep="\t", index=False, header=True)

(60656, 53)
Index(['mut_id', 'pmd_id', 'nr', 'crossref', 'uniprot_id', 'ensembl_id',
       'taxid', 'protein', 'mut_PMD', 'mut_real', 'wt', 'mut', 'prot_pos',
       'function_summarized', 'functional_effect', 'function', 'seq', 'snp_id',
       'mrna_acc', 'mrna_ver', 'mrna_pos', 'allele', 'protein_acc',
       'protein_ver', 'verified', 'chrom', 'chrom_pos', 'variation',
       'variant_type', 'ref_allele', 'alt_allele', 'pmd_nr_id', 'sift_pred',
       'polyphen2_HVAR_pred', 'cadd_pred', 'mvp_pred', 'metarnn_pred',
       'revel_pred', 'unirep_pred', 'protbert_pred', 'sequnet_pred',
       'esm1b_t33_650M_UR50S_pred', 'esm1v_t33_650M_UR90S_pred',
       'esm2_t33_650M_UR50D_pred', 'plus_rnn_pred', 'prottrans_bert_bfd_pred',
       'prottrans_albert_bfd_pred', 'prottrans_xlnet_uniref100_pred',
       'prottrans_t5_bfd_pred', 'prottrans_t5_uniref50_pred',
       'prottrans_t5_xl_u50_pred', 'vespa_pred', 'sift_snpdbe_pred'],
      dtype='object')
