In [8]:
import os
import sys
# Repository root relative to this notebook; later cells also use it as the prefix for data paths.
home_dir = "../../"

# Register the absolute repository root on sys.path (only once) so that the
# project's own packages can be imported by the cells below.
module_path = os.path.abspath(home_dir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import numpy as np
import pandas as pd

In [2]:
# 
# (model_root, model_name) pairs, listed in the order in which their prediction
# columns are merged by the cells below. Each pair is used to locate a prediction
# file under models/<model_root>/outputs/<model_name>/.
model_root_and_name_tuple_list = [
    # models/dbnsfp
    ("dbnsfp", "sift"),
    ("dbnsfp", "polyphen2_HVAR"),
    ("dbnsfp", "cadd"),
    ("dbnsfp", "mvp"),
    ("dbnsfp", "metarnn"),
    ("dbnsfp", "revel"),
    # models/tape_rao
    ("tape_rao", "unirep"),
    ("tape_rao", "protbert"),
    # models/sequnet_dunham
    ("sequnet_dunham", "sequnet"),
    # models/esm_rives
    ("esm_rives", "esm1b_t33_650M_UR50S"),
    ("esm_rives", "esm1v_t33_650M_UR90S"),
    ("esm_rives", "esm2_t33_650M_UR50D"),
    # models/bioembeddings_dallago
    ("bioembeddings_dallago", "plus_rnn"),
    ("bioembeddings_dallago", "prottrans_bert_bfd"),
    ("bioembeddings_dallago", "prottrans_albert_bfd"),
    ("bioembeddings_dallago", "prottrans_xlnet_uniref100"),
    ("bioembeddings_dallago", "prottrans_t5_bfd"),
    ("bioembeddings_dallago", "prottrans_t5_uniref50"),
    ("bioembeddings_dallago", "prottrans_t5_xl_u50"),
]

### Merging pathogenicity analysis predictions

In [9]:
from models.aa_common.data_loader import get_pathogenicity_analysis_SNVs

# Label set to analyse. Values noted by the author: "pathogenic", "likely_pathogenic".
pathogenicity_type = "likely_pathogenic"

# The result is iterated by the merge cell below, one variants DataFrame per analysis run.
list_of_variants_df = get_pathogenicity_analysis_SNVs(
    home_dir=home_dir,
    pathogenicity_type=pathogenicity_type,
)


Log: Loading combined fasta iterator ...
likely_pathogenic raw data: (6668, 12)
Index(['clinvar_id', 'gene_symbol', 'gene_id', 'chrom_acc_version',
       'chrom_pos', 'ref_allele', 'alt_allele', 'prot_acc_version', 'prot_pos',
       'wt', 'mut', 'class'],
      dtype='object')

Log: excluding variants corresponding to proteins having seq-len>1022 ...

Log: Loading combined fasta iterator ...
#-protein sequences (seq-len<=1022): 1077
(4168, 12)

Log: Loading population freq variants dataset ...
Index(['snp_id', 'chrom_acc_version', 'chrom_pos', 'ref_allele', 'alt_allele',
       'prot_acc_version', 'prot_pos', 'wt', 'mut', 'wt_population',
       'mut_poulation', 'wt_freq', 'mt_freq'],
      dtype='object')
0 (8336, 10)
1 (8336, 10)
2 (8336, 10)
3 (8336, 10)
4 (8336, 10)
5 (8336, 10)
6 (8336, 10)
7 (8336, 10)
8 (8336, 10)
9 (8336, 10)


In [7]:
# For every pathogenicity analysis run, attach each model's prediction column to the
# variants table and write the combined table to disk as a tab-separated file.

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before the (slow) merge loop starts.
merged_out_dir = os.path.join(home_dir, f"models/aa_common/merged_predictions/{pathogenicity_type}_analysis")
os.makedirs(merged_out_dir, exist_ok=True)

for analysis_no, patho_variants_df in enumerate(list_of_variants_df):
    print(analysis_no)
    merged_df = patho_variants_df.copy(deep=True)
    # Join key = every column of the variants table; each prediction file is expected
    # to repeat those columns and add a "pred" column (TODO confirm against the files).
    merge_col_list = list(patho_variants_df.columns)

    for model_root, model_name in model_root_and_name_tuple_list:
        preds_path = os.path.join(home_dir, f"models/{model_root}/outputs/{model_name}/{pathogenicity_type}/{analysis_no}.csv")
        models_pred_df = pd.read_csv(preds_path, sep="\t")
        print(model_root, model_name, models_pred_df.shape)

        # Left join: variants without a prediction from this model are kept (pred is NaN).
        merged_df = pd.merge(left=merged_df, right=models_pred_df, how="left", on=merge_col_list)
        # Make the generic "pred" column model-specific so the next merge does not collide with it.
        merged_df = merged_df.rename(columns={"pred": f"{model_name}_pred"})
        print(merged_df.shape)

    merged_df.to_csv(os.path.join(merged_out_dir, f"{analysis_no}.csv"), sep="\t", index=False, header=True)

0
dbnsfp sift (4586, 11)
(4586, 11)
dbnsfp polyphen2_HVAR (4586, 11)
(4586, 12)
dbnsfp cadd (4586, 11)
(4586, 13)
dbnsfp mvp (4586, 11)
(4586, 14)
dbnsfp metarnn (4586, 11)
(4586, 15)
dbnsfp revel (4586, 11)
(4586, 16)
tape_rao unirep (4586, 11)
(4586, 17)
tape_rao protbert (4586, 11)
(4586, 18)
sequnet_dunham sequnet (4582, 11)
(4586, 19)
esm_rives esm1b_t33_650M_UR50S (4586, 11)
(4586, 20)
esm_rives esm1v_t33_650M_UR90S (4586, 11)
(4586, 21)
esm_rives esm2_t33_650M_UR50D (4586, 11)
(4586, 22)
bioembeddings_dallago plus_rnn (4586, 11)
(4586, 23)
bioembeddings_dallago prottrans_bert_bfd (4586, 11)
(4586, 24)
bioembeddings_dallago prottrans_albert_bfd (4586, 11)
(4586, 25)
bioembeddings_dallago prottrans_xlnet_uniref100 (4586, 11)
(4586, 26)
bioembeddings_dallago prottrans_t5_bfd (4586, 11)
(4586, 27)
bioembeddings_dallago prottrans_t5_uniref50 (4586, 11)
(4586, 28)
bioembeddings_dallago prottrans_t5_xl_u50 (4586, 11)
(4586, 29)
1
dbnsfp sift (4586, 11)
(4586, 11)
dbnsfp polyphen2_HVAR 

### Merging population frequency predictions

In [21]:
from models.aa_common.data_loader import get_population_freq_SNVs

# Load the population-frequency variants table; the bare name on the last line
# makes the notebook display it.
popu_freq_variants_df = get_population_freq_SNVs(
    home_dir=home_dir,
)
popu_freq_variants_df


Log: Loading data ...
(95223, 13)
Index(['snp_id', 'chrom_acc_version', 'chrom_pos', 'ref_allele', 'alt_allele',
       'prot_acc_version', 'prot_pos', 'wt', 'mut', 'wt_population',
       'mut_poulation', 'wt_freq', 'mt_freq'],
      dtype='object')
After combining common (18279), rare (29383) and sampled-singletons (48434), data: (95223, 13)
NP_001277137.1    95
NP_001291317.1    78
NP_002115.2       74
NP_112241.2       68
NP_001075106.2    65
                  ..
NP_037408.2        1
NP_066951.1        1
NP_001013716.2     1
NP_001316544.1     1
NP_065171.2        1
Name: prot_acc_version, Length: 14997, dtype: int64


Unnamed: 0,snp_id,chrom_acc_version,chrom_pos,ref_allele,alt_allele,prot_acc_version,prot_pos,wt,mut,wt_population,mut_poulation,wt_freq,mt_freq
0,rs41288789,NC_000002.12,26944749,G,A,NP_064519.2,512,A,T,218102,10302,0.954896,0.045104
1,rs34603401,NC_000001.11,9245386,A,C,NP_004276.2,151,D,A,225918,32086,0.875638,0.124362
2,rs115026899,NC_000001.11,210683774,C,A,NP_758872.1,826,G,V,211392,6706,0.969252,0.030748
3,rs17260829,NC_000006.12,122451974,T,C,NP_065806.1,225,S,G,301772,13333,0.957687,0.042313
4,rs1438318937,NC_000019.10,54575019,A,G,NP_001124389.2,214,E,G,11862,5489,0.683649,0.316351
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95218,rs1404095987,NC_000007.14,143755938,T,C,NP_848656.2,574,N,S,11862,1,0.999916,0.000084
95219,rs1331626452,NC_000019.10,57328412,G,C,NP_998763.2,317,G,A,4470,1,0.999776,0.000224
95220,rs751861106,NC_000020.11,44921715,G,A,NP_001359108.1,287,R,Q,35412,1,0.999972,0.000028
95221,rs771902243,NC_000008.11,28503161,C,T,NP_059108.1,50,L,M,35424,1,0.999972,0.000028


In [23]:
# Attach each model's prediction column to the population-frequency variants table,
# label every variant by mutant-allele frequency, and save the combined table.
task = "popu_freq"
merged_df = popu_freq_variants_df.copy(deep=True)
# Join key = the original columns only; captured before the "class" column is added below.
merge_col_list = list(popu_freq_variants_df.columns)

# Frequency classes based on mt_freq:
#   >= 0.01           -> "Common"
#   [0.001, 0.01)     -> "Rare"
#   <  0.001          -> "Extremely rare"
merged_df.loc[merged_df["mt_freq"]>=.01, "class"] = "Common"
merged_df.loc[(merged_df["mt_freq"]<.01) & (merged_df["mt_freq"]>=.001), "class"] = "Rare"
merged_df.loc[(merged_df["mt_freq"]<.001), "class"] = "Extremely rare"

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before the merge loop starts.
merged_out_dir = os.path.join(home_dir, "models/aa_common/merged_predictions")
os.makedirs(merged_out_dir, exist_ok=True)

for model_root, model_name in model_root_and_name_tuple_list:
    preds_path = os.path.join(home_dir, f"models/{model_root}/outputs/{model_name}/{task}/preds_{model_name}_(mt-wt).csv")
    models_pred_df = pd.read_csv(preds_path, sep="\t")
    print(model_root, model_name, models_pred_df.shape)

    # Left join: variants without a prediction from this model are kept (pred is NaN).
    merged_df = pd.merge(left=merged_df, right=models_pred_df, how="left", on=merge_col_list)
    # Make the generic "pred" column model-specific so the next merge does not collide with it.
    merged_df = merged_df.rename(columns={"pred": f"{model_name}_pred"})
    # Remove fully identical rows after each merge so they cannot multiply in later merges.
    merged_df = merged_df.drop_duplicates(keep="first")
    print(merged_df.shape)

merged_df.to_csv(os.path.join(merged_out_dir, f"{task}_analysis.csv"), sep="\t", index=False, header=True)

dbnsfp sift (95223, 14)
(95223, 15)
dbnsfp polyphen2_HVAR (95223, 14)
(95223, 16)
dbnsfp cadd (95223, 14)
(95223, 17)
dbnsfp mvp (95223, 14)
(95223, 18)
dbnsfp metarnn (95223, 14)
(95223, 19)
dbnsfp revel (95223, 14)
(95223, 20)
tape_rao unirep (95223, 14)
(95223, 21)
tape_rao protbert (95223, 14)
(95223, 22)
sequnet_dunham sequnet (95110, 14)
(95223, 23)
esm_rives esm1b_t33_650M_UR50S (95223, 14)
(95223, 24)
esm_rives esm1v_t33_650M_UR90S (95223, 14)
(95223, 25)
esm_rives esm2_t33_650M_UR50D (95223, 14)
(95223, 26)
bioembeddings_dallago plus_rnn (95223, 14)
(95223, 27)
bioembeddings_dallago prottrans_bert_bfd (95223, 14)
(95223, 28)
bioembeddings_dallago prottrans_albert_bfd (95223, 14)
(95223, 29)
bioembeddings_dallago prottrans_xlnet_uniref100 (95223, 14)
(95223, 30)
bioembeddings_dallago prottrans_t5_bfd (95223, 14)
(95223, 31)
bioembeddings_dallago prottrans_t5_uniref50 (95223, 14)
(95223, 32)
bioembeddings_dallago prottrans_t5_xl_u50 (95223, 14)
(95223, 33)


### Merging protein mutation dataset (PMD) predictions

In [11]:
from models.aa_common.data_loader import get_pmd_analysis_dataset

# Load the PMD variants table used by the merge cell below.
pmd_variants_df = get_pmd_analysis_dataset(
    home_dir=home_dir,
)

# Displayed result: number of variants per "functional_effect" label.
pmd_variants_df["functional_effect"].value_counts()


Log: Loading Protein Mutation Dataset (PMD) ...
(55465, 15)
Index(['mut_id', 'pmd_id', 'protein', 'mut_PMD', 'mut_real', 'function',
       'taxid', 'function_e', 'function_e2', 'functional_effect', 'seq',
       'protein_id', 'wt', 'mut', 'prot_pos'],
      dtype='object')

Log: excluding variants corresponding to proteins having seq-len>1022 ...

Log: Loading combined fasta iterator ...
#-protein sequences (seq-len<=1022): 7677
(51047, 15)


Knock-out    25116
No effect    13297
Effect       12634
Name: functional_effect, dtype: int64

In [9]:
# Attach each model's prediction column to the PMD variants table and save the
# combined table as a tab-separated file.
task = "pmd_analysis"
merged_df = pmd_variants_df.copy(deep=True)
# Join key = every column of the PMD variants table.
merge_col_list = list(pmd_variants_df.columns)

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before the merge loop starts.
merged_out_dir = os.path.join(home_dir, "models/aa_common/merged_predictions")
os.makedirs(merged_out_dir, exist_ok=True)

for model_root, model_name in model_root_and_name_tuple_list:
    preds_path = os.path.join(home_dir, f"models/{model_root}/outputs/{model_name}/{task}/preds_{model_name}_(mt-wt).csv")
    # NOTE: the PMD prediction files are read as comma-separated, while the merged
    # output below is written tab-separated.
    models_pred_df = pd.read_csv(preds_path, sep=",")
    print(model_root, model_name, models_pred_df.shape)

    # Left join: variants without a prediction from this model are kept (pred is NaN).
    merged_df = pd.merge(left=merged_df, right=models_pred_df, how="left", on=merge_col_list)
    # Make the generic "pred" column model-specific so the next merge does not collide with it.
    merged_df = merged_df.rename(columns={"pred": f"{model_name}_pred"})
    print(merged_df.shape)

merged_df.to_csv(os.path.join(merged_out_dir, f"{task}.csv"), sep="\t", index=False, header=True)

tape_rao unirep (51047, 16)
(51047, 16)
tape_rao protbert (51047, 16)
(51047, 17)
sequnet_dunham sequnet (50994, 16)
(51047, 18)
esm_rives esm1b_t33_650M_UR50S (51047, 16)
(51047, 19)
esm_rives esm1v_t33_650M_UR90S (51047, 16)
(51047, 20)
esm_rives esm2_t33_650M_UR50D (51047, 16)
(51047, 21)
bioembeddings_dallago plus_rnn (51047, 16)
(51047, 22)
bioembeddings_dallago prottrans_bert_bfd (51047, 16)
(51047, 23)
bioembeddings_dallago prottrans_albert_bfd (51047, 16)
(51047, 24)
bioembeddings_dallago prottrans_xlnet_uniref100 (51047, 16)
(51047, 25)
bioembeddings_dallago prottrans_t5_bfd (51047, 16)
(51047, 26)
bioembeddings_dallago prottrans_t5_uniref50 (51047, 16)
(51047, 27)
bioembeddings_dallago prottrans_t5_xl_u50 (51047, 16)
(51047, 28)
