In [None]:
!pip install git+https://github.com/anton-bushuiev/mutils.git

In [15]:
import copy

import esm
import biotite
import numpy as np
import pandas as pd
from tqdm import tqdm
 
from mutils.data import load_SKEMPI2
from mutils.pdb import get_sequences
from mutils.definitions import MUTILS_SKEMPI2_DIR

tqdm.pandas()

In [16]:
model, alphabet = esm.pretrained.esm_if1_gvp4_t16_142M_UR50()
model = model.eval()



In [17]:
def predict_ddg(esm_model, pdb_path, mutation, chain_ids=None):
    """
    See Appendix B in https://arxiv.org/pdf/2310.18515.pdf
    """
    # Load structure and wild-type sequences
    pdb_path = str(pdb_path)
    if chain_ids is None:
        chain_ids = list(get_sequences(pdb_path).keys())
    structure = esm.inverse_folding.util.load_structure(pdb_path, chain_ids)
    structure = biotite.structure.array([atom for atom in structure if not atom.hetero])
    coords, native_seqs = esm.inverse_folding.multichain_util.extract_coords_from_complex(structure)

    # Create mutant sequences
    mutated_seqs = copy.deepcopy(native_seqs)
    for point_mut in mutation.split(','):
        wt, chain, pos, mut = point_mut[0], point_mut[1], int(point_mut[2:-1]), point_mut[-1]
        pos -= 1  # 0-based indexing
        seq_wt = native_seqs[chain]
        assert seq_wt[pos] == wt, f'Wild-type sequence does not match the provided mutation: {seq_wt[pos]} != {wt}'
        seq_mut = mutated_seqs[chain]
        mutated_seqs[chain] = seq_mut[:pos] + mut + seq_mut[pos+1:]

    # Calculate average log likelihood for wild-type and mutant complexes
    ll_wt, ll_mut = [], []
    for chain in native_seqs.keys():
        seq_wt = native_seqs[chain]
        seq_mut = mutated_seqs[chain]
        if seq_wt != seq_mut:
            ll_wt_chain, _ = esm.inverse_folding.multichain_util.score_sequence_in_complex(
                esm_model, alphabet, coords, chain, seq_wt)
            ll_mut_chain, _ = esm.inverse_folding.multichain_util.score_sequence_in_complex(
                esm_model, alphabet, coords, chain, seq_mut)
            ll_wt.append(ll_wt_chain)
            ll_mut.append(ll_mut_chain)
    ll_wt = np.mean(ll_wt)
    ll_wt = ll_wt.round(2)
    ll_mut = np.mean(ll_mut)

    # Calculate predicted ddG
    ddg_pred = ll_wt - ll_mut
    return ddg_pred
    

# To have SKEMPI2 .pdb files in `MUTILS_SKEMPI2_DIR / 'PDBs'` directory,
# clone mutils from github (https://github.com/anton-bushuiev/mutils) and install in editable mode (pip install -e mutils).
# Alternatively, download the files from https://life.bsc.es/pid/skempi2/database/index
# predict_ddg(model, MUTILS_SKEMPI2_DIR / 'PDBs' / '1C4Z.pdb', 'ED90R')
# predict_ddg(model, MUTILS_SKEMPI2_DIR / 'PDBs' / '1KNE.pdb', 'TP2K')
# predict_ddg(model, MUTILS_SKEMPI2_DIR / 'PDBs' / '1KNE.pdb', 'DA40T,TP2L')
predict_ddg(model, MUTILS_SKEMPI2_DIR / 'PDBs' / '2NOJ.pdb', 'RB24A,NB31A')



0.17954417787749177

In [5]:
# Read ids for test complexes
df_ppiformer = pd.read_csv('https://raw.githubusercontent.com/anton-bushuiev/mutils/main/mutils/datasets/SKEMPI2/predictions_test/ppiformer.csv')
test_complexes = df_ppiformer['complex'].unique()
test_complexes

array(['1KNE_A_P', '1C4Z_ABC_D', '5CXB_A_B', '5CYK_A_B', '1BRS_A_D',
       '1B2U_A_D', '1B2S_A_D', '1B3S_A_D', '1X1W_A_D', '1X1X_A_D',
       '2GOX_A_B', '3D5S_A_C', '3D5R_A_C', '2NOJ_A_B'], dtype=object)

In [6]:
# Read dataframe for SKEMPI2 test set
from mutils.data import load_SKEMPI2
df_s2 = load_SKEMPI2()[0]
df_s2_test = df_s2[df_s2['#Pdb'].isin(test_complexes)]
df_s2_test

Unnamed: 0,#Pdb,Mutation(s)_PDB,Mutation(s)_cleaned,iMutation_Location(s),Hold_out_type,Hold_out_proteins,Affinity_mut (M),Affinity_mut_parsed,Affinity_wt (M),Affinity_wt_parsed,...,dS_wt (cal mol^(-1) K^(-1)),Notes,Method,SKEMPI version,dG_mut,dG_wt,ddG,PDB Id,Partner 1,Partner 2
104,1BRS_A_D,KA27A,KA25A,COR,Other,"1BRS_A_D,1B2U_A_D,1B2S_A_D,1B3S_A_D,1X1W_A_D,1...",8.800000e-11,8.800000e-11,1.000000e-14,1.000000e-14,...,-1.01,"Thermodynamic data from 9126847.,,",ITC,1,-13.717446,-19.098395,5.380949,1BRS,A,D
105,1BRS_A_D,RA59A,RA57A,COR,Other,"1BRS_A_D,1B2U_A_D,1B2S_A_D,1B3S_A_D,1X1W_A_D,1...",7.000000e-11,7.000000e-11,1.000000e-14,1.000000e-14,...,-1.01,"Thermodynamic data from 9126847.,,",ITC,1,-13.853024,-19.098395,5.245372,1BRS,A,D
106,1BRS_A_D,RA83Q,RA81Q,COR,Other,"1BRS_A_D,1B2U_A_D,1B2S_A_D,1B3S_A_D,1X1W_A_D,1...",9.400000e-11,9.400000e-11,1.000000e-14,1.000000e-14,...,,,SFFL,1,-13.678369,-19.098395,5.420026,1BRS,A,D
107,1BRS_A_D,RA87A,RA85A,SUP,Other,"1BRS_A_D,1B2U_A_D,1B2S_A_D,1B3S_A_D,1X1W_A_D,1...",1.200000e-10,1.200000e-10,1.000000e-14,1.000000e-14,...,-1.01,"Thermodynamic data from 9126847.,,",ITC,1,-13.533694,-19.098395,5.564701,1BRS,A,D
108,1BRS_A_D,HA102A,HA100A,COR,Other,"1BRS_A_D,1B2U_A_D,1B2S_A_D,1B3S_A_D,1X1W_A_D,1...",3.200000e-10,3.200000e-10,1.000000e-14,1.000000e-14,...,-1.01,"Thermodynamic data from 9126847.,,",ITC,1,-12.952600,-19.098395,6.145795,1BRS,A,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6241,5CYK_A_B,EB486R,EB52R,COR,Other,"5CXB_A_B,5CYK_A_B",2.700000e-09,2.700000e-09,2.490000e-07,2.490000e-07,...,,Crystal structure is one of the mutants in the...,BI,2,-11.689086,-9.008714,-2.680372,5CYK,A,B
6242,5CYK_A_B,"EB486R,EB481D","EB52R,EB47D","COR,COR",Other,"5CXB_A_B,5CYK_A_B",5.000000e-09,5.000000e-09,2.490000e-07,2.490000e-07,...,,Crystal structure is one of the mutants in the...,BI,2,-11.324025,-9.008714,-2.315311,5CYK,A,B
6243,5CYK_A_B,"EB486R,TB484Q","EB52R,TB50Q","COR,COR",Other,"5CXB_A_B,5CYK_A_B",3.000000e-09,3.000000e-09,2.490000e-07,2.490000e-07,...,,Crystal structure is one of the mutants in the...,BI,2,-11.626665,-9.008714,-2.617951,5CYK,A,B
6244,5CYK_A_B,EB486A,EB52A,COR,Other,"5CXB_A_B,5CYK_A_B",7.000000e-09,7.000000e-09,2.490000e-07,2.490000e-07,...,,Crystal structure is one of the mutants in the...,BI,2,-11.124682,-9.008714,-2.115968,5CYK,A,B


In [18]:
# To have SKEMPI2 .pdb files in `MUTILS_SKEMPI2_DIR / 'PDBs'` directory,
# clone mutils from github (https://github.com/anton-bushuiev/mutils) and install in editable mode (pip install -e mutils).
# Alternatively, download the files from https://life.bsc.es/pid/skempi2/database/index
df_s2_test['ddG_pred'] = df_s2_test.progress_apply(
    lambda row: predict_ddg(
        esm_model=model,
        pdb_path=MUTILS_SKEMPI2_DIR / 'PDBs' / f'{row["PDB Id"]}.pdb',
        mutation=row['Mutation(s)_cleaned'],
        chain_ids=list(''.join(row['#Pdb'].split('_')[1:]))
    ),
    axis=1
)

100%|██████████| 219/219 [27:37<00:00,  7.57s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_s2_test['ddG_pred'] = df_s2_test.progress_apply(


In [25]:
df_s2_test.groupby('Protein 1')[['ddG', 'ddG_pred']].corr('spearman')

Unnamed: 0_level_0,Unnamed: 1_level_0,ddG,ddG_pred
Protein 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Barnase,ddG,1.0,0.176488
Barnase,ddG_pred,0.176488,1.0
C. thermophilum YTM1,ddG,1.0,0.085367
C. thermophilum YTM1,ddG_pred,0.085367,1.0
Complement C3d,ddG,1.0,0.339032
Complement C3d,ddG_pred,0.339032,1.0
E6AP,ddG,1.0,0.21321
E6AP,ddG_pred,0.21321,1.0
dHP1 Chromodomain,ddG,1.0,0.104578
dHP1 Chromodomain,ddG_pred,0.104578,1.0
