In [1]:
import os
import sys
# Relative path from this notebook to the repository root; it is appended to
# sys.path so that project packages can be imported further down.
home_dir = "../../"
module_path = os.path.abspath(home_dir)
if not (module_path in sys.path):
    sys.path.append(module_path)

import numpy as np
import pandas as pd

In [2]:
# (model_root, model_name) pairs, in the order their prediction columns are merged.
# A shorter alternative kept from the original notebook for quick runs:
# model_root_and_name_tuple_list = [("vespa_marquet", "vespal"), ("dbnsfp", "sift"), ("dbnsfp", "polyphen2_HVAR"), ("esm_rives", "esm1b_t33_650M_UR50S"), ("bioembeddings_dallago", "prottrans_bert_bfd")]
model_root_and_name_tuple_list = [
    # dbnsfp
    ("dbnsfp", "sift"),
    ("dbnsfp", "polyphen2_HVAR"),
    ("dbnsfp", "cadd"),
    ("dbnsfp", "mvp"),
    ("dbnsfp", "metarnn"),
    ("dbnsfp", "revel"),
    # tape_rao
    ("tape_rao", "unirep"),
    ("tape_rao", "protbert"),
    # sequnet_dunham
    ("sequnet_dunham", "sequnet"),
    # esm_rives
    ("esm_rives", "esm1b_t33_650M_UR50S"),
    ("esm_rives", "esm1v_t33_650M_UR90S"),
    ("esm_rives", "esm2_t33_650M_UR50D"),
    # bioembeddings_dallago
    ("bioembeddings_dallago", "plus_rnn"),
    ("bioembeddings_dallago", "prottrans_bert_bfd"),
    ("bioembeddings_dallago", "prottrans_albert_bfd"),
    ("bioembeddings_dallago", "prottrans_xlnet_uniref100"),
    ("bioembeddings_dallago", "prottrans_t5_bfd"),
    ("bioembeddings_dallago", "prottrans_t5_uniref50"),
    ("bioembeddings_dallago", "prottrans_t5_xl_u50"),
    # vespa_marquet
    ("vespa_marquet", "vespal"),
]

### CDD Conservation dataframe for PMD sequences

In [3]:
# Read the gzipped CDD conservation table; lines starting with '#' are skipped.
conservation_csv_path = home_dir+"data/cdd_conservation/cdd_conservationTable_pmdSequences.csv.gz"
conservation_df = pd.read_csv(conservation_csv_path, compression='gzip', comment='#')
print(conservation_df.shape)

# Keep only the first row for every (NPid, qPos) pair. qPos is 1-indexed.
conservation_df = conservation_df.drop_duplicates(subset=["NPid", "qPos"], keep="first")

# Sanity output: per-key counts (all 1 after de-duplication), new shape, columns.
key_counts = conservation_df[["NPid", "qPos"]].value_counts()
print(key_counts)
print(conservation_df.shape)
print(conservation_df.columns)
conservation_df

(3689094, 36)
NPid       qPos
A000006_2  45      1
A941326_1  366     1
           479     1
           480     1
           481     1
                  ..
A900216_1  489     1
           490     1
           491     1
           492     1
R950334_1  84      1
Length: 3687781, dtype: int64
(3687781, 36)
Index(['qNo', 'NPid', 'accession', 'que', 'sub', 'ali', 'qPos', 'sPos', 'aPos',
       'qPos_', 'sPos_', 'bitscore', 'A', 'D', 'C', 'E', 'F', 'G', 'H', 'I',
       'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'cons',
       'entropy', 'qcov', 'CScore'],
      dtype='object')


Unnamed: 0,qNo,NPid,accession,que,sub,ali,qPos,sPos,aPos,qPos_,...,R,S,T,V,W,Y,cons,entropy,qcov,CScore
0,1,A000006_2,cd15353,F,F,|,45,1,1,45,...,0.000000,0.0,0.0,0.000000,0.0,0.0,F,0.164362,0.810241,0.835638
1,1,A000006_2,cd15353,V,V,|,46,2,2,46,...,0.000000,0.0,0.0,0.800000,0.0,0.0,V,0.164362,0.810241,0.835638
2,1,A000006_2,cd15353,S,S,|,47,3,3,47,...,0.000000,1.0,0.0,0.000000,0.0,0.0,S,0.000000,0.810241,1.000000
3,1,A000006_2,cd15353,P,P,|,48,4,4,48,...,0.000000,0.0,0.2,0.000000,0.0,0.0,P,0.164362,0.810241,0.835638
4,1,A000006_2,cd15353,E,E,|,49,5,5,49,...,0.000000,0.0,0.0,0.000000,0.0,0.0,E,0.000000,0.810241,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3689089,973,A994631_1,smart00808,D,D,|,1100,122,123,1100,...,0.083333,0.0,0.0,0.000000,0.0,0.0,D,0.236971,0.483696,0.763029
3689090,973,A994631_1,smart00808,I,V,,1101,123,124,1101,...,0.000000,0.0,0.0,0.500000,0.0,0.0,X,0.370150,0.483696,0.629850
3689091,973,A994631_1,smart00808,V,V,|,1102,124,125,1102,...,0.000000,0.0,0.0,0.916667,0.0,0.0,V,0.094214,0.483696,0.905786
3689092,973,A994631_1,smart00808,Q,Q,|,1103,125,126,1103,...,0.000000,0.0,0.0,0.000000,0.0,0.0,Q,0.274916,0.483696,0.725084


In [4]:
def merge_conservation_df(df:pd.DataFrame, conservation:"pd.DataFrame | None"=None):
    """Attach a per-position conservation score to every row of ``df``.

    Rows of ``df`` are matched on (``pmd_nr_id``, ``prot_pos``) against
    (``NPid``, ``qPos``) of the conservation table with a left join, so every
    input row is kept and rows without a match get a missing value. Both
    position columns are 1-indexed.

    Parameters
    ----------
    df : pd.DataFrame
        Variant table containing at least ``pmd_nr_id`` and ``prot_pos``.
    conservation : pd.DataFrame, optional
        Table containing ``NPid``, ``qPos`` and ``CScore``. When omitted, the
        notebook-level ``conservation_df`` is used.

    Returns
    -------
    pd.DataFrame
        The columns of ``df`` (same order) plus ``conservation_pred``.
    """
    if conservation is None:
        conservation = conservation_df

    # De-duplicate on the join key only. De-duplicating on (key, CScore) would
    # leave two rows for a position that carries two different scores, and the
    # left merge would then silently duplicate the corresponding variant rows.
    score_by_position = conservation[["NPid", "qPos", "CScore"]].drop_duplicates(subset=["NPid", "qPos"], keep="first")
    merged_df = pd.merge(df, score_by_position, how="left", left_on=["pmd_nr_id", "prot_pos"], right_on=["NPid", "qPos"]) # prot-pos is 1-indexed

    # Drop the helper key columns brought in from the conservation table.
    columns = list(df.columns) + ["CScore"]
    merged_df = merged_df[columns]
    return merged_df.rename(columns={"CScore": "conservation_pred"})

### Merging protein mutation dataset (PMD) predictions

In [5]:
# Load the PMD variants table through the project's data loader. The import
# relies on the repository root having been appended to sys.path in the first
# cell. get_pmd_dataset is defined outside this notebook; per its log output
# it excludes variants of proteins with seq-len > 1022.
from models.aa_common.data_loader import get_pmd_dataset
pmd_variants_df = get_pmd_dataset(home_dir)


Log: Loading Protein Mutation Dataset (PMD) ...


  pmd_df = pd.read_csv(home_dir+"models/aa_common/datasets_pmd_analysis/pmd_data.tsv", sep="\t") # PMD: protein mutation dataset


(65901, 32)
Index(['mut_id', 'pmd_id', 'nr', 'crossref', 'uniprot_id', 'ensembl_id',
       'taxid', 'protein', 'mut_PMD', 'mut_real', 'wt', 'mut', 'prot_pos',
       'function_summarized', 'functional_effect', 'function', 'seq', 'snp_id',
       'mrna_acc', 'mrna_ver', 'mrna_pos', 'allele', 'protein_acc',
       'protein_ver', 'verified', 'chrom', 'chrom_pos', 'variation',
       'variant_type', 'ref_allele', 'alt_allele', 'pmd_nr_id'],
      dtype='object')

Log: excluding variants corresponding to proteins having seq-len>1022 ...

Log: Loading combined fasta iterator ...
#-protein sequences (seq-len<=1022): 10715
(60306, 32)


In [6]:
task = "pmd"
# Every prediction file repeats all PMD columns, so those columns are the join key.
merge_col_list = list(pmd_variants_df.columns)
merged_df = pmd_variants_df.copy(deep=True)

for i, (model_root, model_name) in enumerate(model_root_and_name_tuple_list):
    # The "vespal" entry is read from the outputs stored under the name "vespa".
    if model_name == "vespal":
        model_name = "vespa"

    preds_path = home_dir+f"models/{model_root}/outputs/{model_name}/{task}/preds_{model_name}.tsv"
    models_pred_df = pd.read_csv(preds_path, sep="\t").drop_duplicates(keep="first")
    print(model_root, model_name, models_pred_df.shape)

    # Left join keeps every PMD variant; the generic "pred" column is then
    # renamed so that each model gets its own "<model_name>_pred" column.
    merged_df = merged_df.merge(models_pred_df, how="left", on=merge_col_list)
    merged_df = merged_df.rename(columns={"pred": f"{model_name}_pred"})
    print(merged_df.shape)

# Finally add the conservation score as one more prediction column.
merged_df = merge_conservation_df(merged_df)
print("conservation_score")
print(merged_df.shape)

dbnsfp sift (60306, 33)
(60306, 33)
dbnsfp polyphen2_HVAR (60306, 33)
(60306, 34)
dbnsfp cadd (60306, 33)
(60306, 35)
dbnsfp mvp (60306, 33)
(60306, 36)
dbnsfp metarnn (60306, 33)
(60306, 37)
dbnsfp revel (60306, 33)
(60306, 38)
tape_rao unirep (60306, 33)
(60306, 39)
tape_rao protbert (60306, 33)
(60306, 40)
sequnet_dunham sequnet (60225, 33)
(60306, 41)
esm_rives esm1b_t33_650M_UR50S (60306, 33)
(60306, 42)
esm_rives esm1v_t33_650M_UR90S (60306, 33)
(60306, 43)
esm_rives esm2_t33_650M_UR50D (60306, 33)
(60306, 44)
bioembeddings_dallago plus_rnn (60306, 33)
(60306, 45)
bioembeddings_dallago prottrans_bert_bfd (60306, 33)
(60306, 46)
bioembeddings_dallago prottrans_albert_bfd (60306, 33)
(60306, 47)
bioembeddings_dallago prottrans_xlnet_uniref100 (60306, 33)
(60306, 48)
bioembeddings_dallago prottrans_t5_bfd (60306, 33)
(60306, 49)
bioembeddings_dallago prottrans_t5_uniref50 (60306, 33)
(60306, 50)
bioembeddings_dallago prottrans_t5_xl_u50 (60306, 33)
(60306, 51)
vespa_marquet vespa (6

In [7]:
# Load the SNPdbe geno2func table (tab-separated).
geno2func = pd.read_csv(home_dir+"data/SNPdbe/geno2func.tsv", sep="\t")
# The literal two-character token "\N" is treated as the missing-value marker.
# Replace it with NaN rather than None: an explicit value=None is handled
# differently across pandas versions (before 1.4 it was ignored and the cells
# were pad-filled from the previous row), whereas np.nan always yields a
# missing value that pd.isna() recognises.
geno2func = geno2func.replace("\\N", np.nan)
print(geno2func.shape)
print(geno2func.columns)
geno2func = geno2func[['mut_id', 'SIFT_score']] # if needed, add more columns

  geno2func = pd.read_csv(home_dir+f"data/SNPdbe/geno2func.tsv", sep="\t")


(1748930, 23)
Index(['mut_id', 'md5', 'wt', 'pos', 'mt', 'in_dbSNP', 'in_SP', 'in_PMD',
       'in_1KG', 'SNAP_status', 'SNAP_bin', 'SNAP_score', 'SNAP_ri',
       'SNAP_acc', 'SIFT_bin', 'SIFT_score', 'PERC_wt', 'PERC_mt', 'PSSM_wt',
       'PSSM_mt', 'PSIC_wt', 'PSIC_mt', 'pph2'],
      dtype='object')


In [8]:
# Left-join the SNPdbe SIFT score onto the merged predictions via mut_id and
# expose it under the same "<name>_pred" naming used by the other methods.
merged_df = (
    merged_df
    .merge(geno2func, how="left", on=["mut_id"])
    .rename(columns={"SIFT_score": "sift_snpdbe_pred"})
)
print(merged_df.shape)
print(merged_df.columns)

(60306, 54)
Index(['mut_id', 'pmd_id', 'nr', 'crossref', 'uniprot_id', 'ensembl_id',
       'taxid', 'protein', 'mut_PMD', 'mut_real', 'wt', 'mut', 'prot_pos',
       'function_summarized', 'functional_effect', 'function', 'seq', 'snp_id',
       'mrna_acc', 'mrna_ver', 'mrna_pos', 'allele', 'protein_acc',
       'protein_ver', 'verified', 'chrom', 'chrom_pos', 'variation',
       'variant_type', 'ref_allele', 'alt_allele', 'pmd_nr_id', 'sift_pred',
       'polyphen2_HVAR_pred', 'cadd_pred', 'mvp_pred', 'metarnn_pred',
       'revel_pred', 'unirep_pred', 'protbert_pred', 'sequnet_pred',
       'esm1b_t33_650M_UR50S_pred', 'esm1v_t33_650M_UR90S_pred',
       'esm2_t33_650M_UR50D_pred', 'plus_rnn_pred', 'prottrans_bert_bfd_pred',
       'prottrans_albert_bfd_pred', 'prottrans_xlnet_uniref100_pred',
       'prottrans_t5_bfd_pred', 'prottrans_t5_uniref50_pred',
       'prottrans_t5_xl_u50_pred', 'vespa_pred', 'conservation_pred',
       'sift_snpdbe_pred'],
      dtype='object')


In [10]:
# Prediction columns that are counted per row by the function below.
supervised_method_cols = ['sift_snpdbe_pred', 'polyphen2_HVAR_pred', 'cadd_pred', 'mvp_pred', 'metarnn_pred', 'revel_pred', 'sequnet_pred', 'vespa_pred']
def get_n_supervised_methods_havings_prediction(row):
    """Return how many columns of ``supervised_method_cols`` are non-missing in ``row``.

    ``row`` is anything indexable by column name (e.g. a DataFrame row); a
    value counts as missing when ``pd.isna`` reports it as such.
    """
    return sum(1 for col in supervised_method_cols if not pd.isna(row[col]))

# Count, for every variant, how many of the supervised_method_cols hold a
# prediction. This is done column-wise in one pass instead of reading and
# writing one row at a time through .loc, which is slow on ~60k rows and only
# works when the index is exactly 0..n-1.
# The cast keeps the float64 dtype that the row-wise assignment produced, so
# the values written to disk later are formatted as before.
merged_df["n_methods_having_preds"] = (
    merged_df[supervised_method_cols].notna().sum(axis=1).astype("float64")
)

In [11]:
# Write the merged prediction table as a tab-separated file named after the task.
merged_predictions_dir = home_dir+"models/aa_common/merged_predictions/"
# to_csv does not create missing parent directories; make sure the target
# directory exists so the write does not fail on a fresh checkout.
os.makedirs(merged_predictions_dir, exist_ok=True)
merged_df.to_csv(merged_predictions_dir+f"{task}.tsv", sep="\t", index=False, header=True)