In [31]:
import os
import subprocess

import pandas as pd
import numpy as np
import torch

from tqdm import tqdm


# clean original skempi_v2: csv & pdb

In [32]:
import pandas as pd

raw_df = pd.read_csv("./datasets/skempi_v2/skempi_v2.csv", sep=';')
raw_df = raw_df[['#Pdb', 'Mutation(s)_cleaned', 'Affinity_wt_parsed', 'Affinity_mut_parsed', 'Temperature']]
col_mapping = {'#Pdb': 'pdb',
               'Mutation(s)_cleaned': 'mutation', 
               'Affinity_wt_parsed': 'affinity_wt', 
               'Affinity_mut_parsed': 'affinity_mut',
               "Temperature": "temp"}
raw_df.rename(columns=col_mapping, inplace=True)
raw_df

Unnamed: 0,pdb,mutation,affinity_wt,affinity_mut,temp
0,1CSE_E_I,LI38G,1.120000e-12,5.260000e-11,294
1,1CSE_E_I,LI38S,1.120000e-12,8.330000e-12,294
2,1CSE_E_I,LI38P,1.120000e-12,1.020000e-07,294
3,1CSE_E_I,LI38I,1.120000e-12,1.720000e-10,294
4,1CSE_E_I,LI38D,1.120000e-12,1.920000e-09,294
...,...,...,...,...,...
7080,3QIB_ABP_CD,KP8R,5.500000e-06,2.400000e-04,298
7081,3QIB_ABP_CD,TP11A,5.500000e-06,1.100000e-03,298
7082,3QIB_ABP_CD,TP11S,5.500000e-06,3.380000e-05,298
7083,3QIB_ABP_CD,TP11N,5.500000e-06,4.340000e-05,298


### compute ddg & daffinity

In [33]:

import numpy as np
df = raw_df
df['temp'] = df['temp'].map(lambda temp: float(temp[:3]) if isinstance(temp, str) else temp)

R = float(8.314/4184)
df['ddg'] = R * df['temp'] * (np.log(df['affinity_mut']) - np.log(df['affinity_wt']))

df['daffinity'] = df['affinity_mut'] - df['affinity_wt']

# df = df[['pdb', 'mutation', 'ddg', 'daffinity', 'temp']]
df

Unnamed: 0,pdb,mutation,affinity_wt,affinity_mut,temp,ddg,daffinity
0,1CSE_E_I,LI38G,1.120000e-12,5.260000e-11,294.0,2.248833,5.148000e-11
1,1CSE_E_I,LI38S,1.120000e-12,8.330000e-12,294.0,1.172229,7.210000e-12
2,1CSE_E_I,LI38P,1.120000e-12,1.020000e-07,294.0,6.671276,1.019989e-07
3,1CSE_E_I,LI38I,1.120000e-12,1.720000e-10,294.0,2.940988,1.708800e-10
4,1CSE_E_I,LI38D,1.120000e-12,1.920000e-09,294.0,4.350434,1.918880e-09
...,...,...,...,...,...,...,...
7080,3QIB_ABP_CD,KP8R,5.500000e-06,2.400000e-04,298.0,2.235909,2.345000e-04
7081,3QIB_ABP_CD,TP11A,5.500000e-06,1.100000e-03,298.0,3.137419,1.094500e-03
7082,3QIB_ABP_CD,TP11S,5.500000e-06,3.380000e-05,298.0,1.075181,2.830000e-05
7083,3QIB_ABP_CD,TP11N,5.500000e-06,4.340000e-05,298.0,1.223219,3.790000e-05


### remove repeated wt-mut data

In [34]:

dataset = {}
for i in range(len(df)):
    wt = df.loc[i, 'pdb']
    mut = df.loc[i, 'mutation']
    if (wt, mut) not in dataset:
        dataset[(wt, mut)] = []
    dataset[(wt, mut)].append(i)
print(f"Num of wt-mut pair: {len(dataset)}")

drop_list = []
for wt_mut_pair in dataset:
    repeat_idxs = dataset[wt_mut_pair]
    if len(repeat_idxs) == 1:
        continue
    idx_298 = []
    for idx in repeat_idxs:
        if df.loc[idx, 'temp'] == 298:
            idx_298.append(idx)
    if len(idx_298) == 0:
        idx_298 = repeat_idxs
    
    ddg = np.mean([df.loc[idx, 'ddg'] for idx in idx_298])
    mean_daffinity = np.mean([df.loc[idx, 'daffinity'] for idx in idx_298])
    df.loc[repeat_idxs[0], 'ddg'] = ddg
    df.loc[repeat_idxs[0], 'daffinity'] = mean_daffinity
    
    drop_list += repeat_idxs[1:]

print(f"Num of redundant data: {len(drop_list)}")
df = df.drop(index=drop_list)
df.reset_index(drop=True, inplace=True)
# df = df[['pdb', 'mutation', 'ddg', 'daffinity']]
df

Num of wt-mut pair: 6193
Num of redundant data: 892


Unnamed: 0,pdb,mutation,affinity_wt,affinity_mut,temp,ddg,daffinity
0,1CSE_E_I,LI38G,1.120000e-12,5.260000e-11,294.0,2.248833,5.148000e-11
1,1CSE_E_I,LI38S,1.120000e-12,8.330000e-12,294.0,1.172229,7.210000e-12
2,1CSE_E_I,LI38P,1.120000e-12,1.020000e-07,294.0,6.671276,1.019989e-07
3,1CSE_E_I,LI38I,1.120000e-12,1.720000e-10,294.0,2.940988,1.708800e-10
4,1CSE_E_I,LI38D,1.120000e-12,1.920000e-09,294.0,4.350434,1.918880e-09
...,...,...,...,...,...,...,...
6188,3QIB_ABP_CD,KP8R,5.500000e-06,2.400000e-04,298.0,2.235909,2.345000e-04
6189,3QIB_ABP_CD,TP11A,5.500000e-06,1.100000e-03,298.0,3.137419,1.094500e-03
6190,3QIB_ABP_CD,TP11S,5.500000e-06,3.380000e-05,298.0,1.075181,2.830000e-05
6191,3QIB_ABP_CD,TP11N,5.500000e-06,4.340000e-05,298.0,1.223219,3.790000e-05


### clean original .pdb file

In [5]:
import os
from tqdm import tqdm
import glob
from Bio.PDB import PDBParser, PDBIO, PPBuilder
from Bio.PDB import Structure, Model, Chain, Residue, Atom

original_pdb_path = "./datasets/skempi_v2/PDBs/"
cleaned_pdb_path = "./datasets/skempi_v2/cleaned_PDBs/"
os.makedirs(cleaned_pdb_path, exist_ok=True)

illegal_original_pdb = {}
original_pdb_files = glob.glob(f"{original_pdb_path}/*.pdb")
for pdb_filepath in tqdm(original_pdb_files):
    pdb_name = os.path.basename(pdb_filepath).split('.')[0]

    parser = PDBParser(QUIET=True)
    structure = parser.get_structure(pdb_name, pdb_filepath)
    new_structure = Structure.Structure(pdb_name)

    model = structure[0]  # only keep one model 
    new_model = Model.Model(model.id)

    for chain in model:
        new_chain = Chain.Chain(chain.id)

        chain = [res for res in chain.get_residues() if res.id[0] == ' ']  # exclude HOH...
        for residue in chain:
            if set(['N', 'CA', 'C', 'O']).issubset(set(residue.child_dict.keys())):
                new_chain.add(residue)
            else:
                if not illegal_original_pdb.get(pdb_filepath):
                    illegal_original_pdb[pdb_filepath] = []
                illegal_original_pdb[pdb_filepath].append(residue.full_id)

                if not (residue.id[1] == 1 or residue.id[1] == len(chain)):
                    print(f"WARNING: delete in the middle {residue.full_id} of total len {len(chain)}")

        if len(new_chain) > 0:
            new_model.add(new_chain)
    if len(new_model) > 0:
        new_structure.add(new_model)

    io = PDBIO()
    io.set_structure(new_structure)
    io.save(f"{cleaned_pdb_path}/{pdb_name}.pdb")

len(illegal_original_pdb)

 10%|▉         | 33/345 [00:03<00:40,  7.78it/s]



 32%|███▏      | 110/345 [00:12<00:22, 10.32it/s]



 51%|█████▏    | 177/345 [00:19<00:16, 10.16it/s]



 63%|██████▎   | 217/345 [00:24<00:13,  9.60it/s]



 78%|███████▊  | 268/345 [00:30<00:08,  9.29it/s]



 83%|████████▎ | 286/345 [00:31<00:05, 10.96it/s]



 95%|█████████▍| 327/345 [00:36<00:01,  9.83it/s]



100%|██████████| 345/345 [00:37<00:00,  9.08it/s]


24

### EvoEF2 generate mutant .pdb

In [9]:
import os
from tqdm import tqdm
import subprocess

EvoEF2_toolpath = "./tools/EvoEF2/EvoEF2"
cleaned_pdb_path = "./datasets/skempi_v2/cleaned_PDBs/"
mut_pdb_path = "./datasets/skempi_v2/mut_PDBs/"
os.makedirs(mut_pdb_path, exist_ok=True)

for i in tqdm(range(len(df))):
    wt = df.loc[i, 'pdb']
    mut_site = df.loc[i, 'mutation']
    name = wt.split('_')[0]
    mut_name = f"{wt}_{mut_site}"
    # print(f"{name} -> {mut_name}")

    with open(f"{mut_pdb_path}/{mut_name}.txt", 'w') as f:
        f.write(mut_site + ';')
    
    cmd = f"{EvoEF2_toolpath} --command=BuildMutant \
                --pdb={cleaned_pdb_path}/{name}.pdb --mutant_file={mut_pdb_path}/{mut_name}.txt && \
            mv {name}_Model_0001.pdb {mut_pdb_path}/{mut_name}.pdb && \
            rm {mut_pdb_path}/{mut_name}.txt"
    # print(cmd)
    subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL)

# besides buffer overflow, 1KBH would also fail because the illegal original .pdb and its' mutation sites.

  7%|▋         | 403/6193 [04:03<54:48,  1.76it/s]  *** buffer overflow detected ***: terminated
Aborted (core dumped)
  7%|▋         | 404/6193 [04:03<45:44,  2.11it/s]*** buffer overflow detected ***: terminated
Aborted (core dumped)
  7%|▋         | 405/6193 [04:03<37:11,  2.59it/s]*** buffer overflow detected ***: terminated
Aborted (core dumped)
  7%|▋         | 407/6193 [04:04<38:28,  2.51it/s]*** buffer overflow detected ***: terminated
Aborted (core dumped)
  7%|▋         | 408/6193 [04:04<31:57,  3.02it/s]*** buffer overflow detected ***: terminated
Aborted (core dumped)
  7%|▋         | 409/6193 [04:04<27:15,  3.54it/s]*** buffer overflow detected ***: terminated
Aborted (core dumped)
  7%|▋         | 411/6193 [04:05<38:47,  2.48it/s]*** buffer overflow detected ***: terminated
Aborted (core dumped)
  7%|▋         | 412/6193 [04:05<32:36,  2.95it/s]*** buffer overflow detected ***: terminated
Aborted (core dumped)
  7%|▋         | 413/6193 [04:05<27:57,  3.45it/s]*** buffer o

### remove data that EvoEF2 failed to generate

In [35]:

import os

mut_pdb_path = "./datasets/skempi_v2/mut_PDBs/"

drop_list = []
for idx in range(len(df)):
    wt = df.loc[idx, 'pdb']
    mut = df.loc[idx, 'mutation']
    mut_pdb_filename = wt + '_' + mut + '.pdb'
    if not os.path.exists(mut_pdb_path + mut_pdb_filename):
        drop_list.append(idx)
        
print(f"Num of EvoEF2-fail-to-generate data: {len(drop_list)}") 
df = df.drop(index=drop_list)
df.reset_index(drop=True, inplace=True)
df

Num of EvoEF2-fail-to-generate data: 166


Unnamed: 0,pdb,mutation,affinity_wt,affinity_mut,temp,ddg,daffinity
0,1CSE_E_I,LI38G,1.120000e-12,5.260000e-11,294.0,2.248833,5.148000e-11
1,1CSE_E_I,LI38S,1.120000e-12,8.330000e-12,294.0,1.172229,7.210000e-12
2,1CSE_E_I,LI38P,1.120000e-12,1.020000e-07,294.0,6.671276,1.019989e-07
3,1CSE_E_I,LI38I,1.120000e-12,1.720000e-10,294.0,2.940988,1.708800e-10
4,1CSE_E_I,LI38D,1.120000e-12,1.920000e-09,294.0,4.350434,1.918880e-09
...,...,...,...,...,...,...,...
6022,3QIB_ABP_CD,KP8R,5.500000e-06,2.400000e-04,298.0,2.235909,2.345000e-04
6023,3QIB_ABP_CD,TP11A,5.500000e-06,1.100000e-03,298.0,3.137419,1.094500e-03
6024,3QIB_ABP_CD,TP11S,5.500000e-06,3.380000e-05,298.0,1.075181,2.830000e-05
6025,3QIB_ABP_CD,TP11N,5.500000e-06,4.340000e-05,298.0,1.223219,3.790000e-05


### remove NaN data

In [38]:

df = df.dropna()
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,pdb,mutation,affinity_wt,affinity_mut,temp,ddg,daffinity
0,1CSE_E_I,LI38G,1.120000e-12,5.260000e-11,294.0,2.248833,5.148000e-11
1,1CSE_E_I,LI38S,1.120000e-12,8.330000e-12,294.0,1.172229,7.210000e-12
2,1CSE_E_I,LI38P,1.120000e-12,1.020000e-07,294.0,6.671276,1.019989e-07
3,1CSE_E_I,LI38I,1.120000e-12,1.720000e-10,294.0,2.940988,1.708800e-10
4,1CSE_E_I,LI38D,1.120000e-12,1.920000e-09,294.0,4.350434,1.918880e-09
...,...,...,...,...,...,...,...
5756,3QIB_ABP_CD,KP8R,5.500000e-06,2.400000e-04,298.0,2.235909,2.345000e-04
5757,3QIB_ABP_CD,TP11A,5.500000e-06,1.100000e-03,298.0,3.137419,1.094500e-03
5758,3QIB_ABP_CD,TP11S,5.500000e-06,3.380000e-05,298.0,1.075181,2.830000e-05
5759,3QIB_ABP_CD,TP11N,5.500000e-06,4.340000e-05,298.0,1.223219,3.790000e-05


In [21]:
print(f"daffinity range: [{df['daffinity'].min()}, {df['daffinity'].max()}]")
print(f"ddg range: [{df['ddg'].min()}, {df['ddg'].max()}]")


daffinity range: [-0.000111, 0.04049885]
ddg range: [-12.221723141911879, 12.221723141911879]


In [None]:
df.iloc[[df['daffinity'].idxmin()]]

In [None]:
df.iloc[[df['daffinity'].idxmax()]]

In [22]:
df.iloc[[df['ddg'].idxmin()]]

Unnamed: 0,pdb,mutation,affinity_wt,affinity_mut,temp,ddg,daffinity
2676,3BTG_E_I,GI13K,6.7e-05,5.88e-14,295.0,-12.221723,-6.7e-05


In [23]:
df.iloc[[df['ddg'].idxmax()]]

Unnamed: 0,pdb,mutation,affinity_wt,affinity_mut,temp,ddg,daffinity
1178,2FTL_E_I,KI15G,5.88e-14,6.7e-05,295.0,12.221723,6.7e-05


In [24]:
df.iloc[[df['ddg'].abs().idxmin()]]

Unnamed: 0,pdb,mutation,affinity_wt,affinity_mut,temp,ddg,daffinity
52,2SIC_E_I,MI67K,1.8e-11,1.8e-11,298.0,0.0,0.0


### save processed dataset

In [25]:

import torch

df.to_csv("./data/info.csv",index=False)
torch.save(df, "./data/dataset.pt")


# prepare feature

In [26]:
import os

import numpy as np
import torch

from tqdm import tqdm

### load processed dataset

In [39]:
info = torch.load("./data/dataset.pt")
info


Unnamed: 0,pdb,mutation,affinity_wt,affinity_mut,temp,ddg,daffinity
0,1CSE_E_I,LI38G,1.120000e-12,5.260000e-11,294.0,2.248833,5.148000e-11
1,1CSE_E_I,LI38S,1.120000e-12,8.330000e-12,294.0,1.172229,7.210000e-12
2,1CSE_E_I,LI38P,1.120000e-12,1.020000e-07,294.0,6.671276,1.019989e-07
3,1CSE_E_I,LI38I,1.120000e-12,1.720000e-10,294.0,2.940988,1.708800e-10
4,1CSE_E_I,LI38D,1.120000e-12,1.920000e-09,294.0,4.350434,1.918880e-09
...,...,...,...,...,...,...,...
5756,3QIB_ABP_CD,KP8R,5.500000e-06,2.400000e-04,298.0,2.235909,2.345000e-04
5757,3QIB_ABP_CD,TP11A,5.500000e-06,1.100000e-03,298.0,3.137419,1.094500e-03
5758,3QIB_ABP_CD,TP11S,5.500000e-06,3.380000e-05,298.0,1.075181,2.830000e-05
5759,3QIB_ABP_CD,TP11N,5.500000e-06,4.340000e-05,298.0,1.223219,3.790000e-05


### collect .pdb file

In [28]:

import subprocess

data_pdb_path = "./data/pdb/"
os.makedirs(data_pdb_path, exist_ok=True)

os.system(f"cp datasets/skempi_v2/cleaned_PDBs/*.pdb {data_pdb_path}")
os.system(f"cp datasets/skempi_v2/mut_PDBs/*.pdb {data_pdb_path}")

print(f"Total .pdb file: {subprocess.check_output(f'ls {data_pdb_path} | wc -w', shell=True)}")


Total .pdb file: b'6372\n'


In [40]:

wt_name_list = [f"{pdb.split('_')[0]}" for pdb in set(info['pdb'])]
mut_name_list = [f"{mut}" for mut in set(info['pdb'] +'_'+ info['mutation'])]
duplicate_names = {name for name in wt_name_list if wt_name_list.count(name) > 1}
print(f"duplicate pdb: {duplicate_names}")

duplicate pdb: {'2C5D', '3SE3', '3SE4'}


### extract sequence & coordinate from .pdb file

In [18]:
from utils import *

from Bio.PDB import PDBParser

data_seq_path = "./data/seq/"
data_coord_path = "./data/coord/"
os.makedirs(data_seq_path, exist_ok=True)
os.makedirs(data_coord_path, exist_ok=True)

for name in tqdm(wt_name_list + mut_name_list):
    
    pdb_filepath = f"{data_pdb_path}/{name}.pdb"
    parser = PDBParser(QUIET=True)
    struct = parser.get_structure(name, pdb_filepath)
    res_list = get_clean_res_list(struct.get_residues(), verbose=False, ensure_ca_exist=True)
    # ensure all res contains N, CA, C and O
    res_list = [res for res in res_list if (('N' in res) and ('CA' in res) and ('C' in res) and ('O' in res))]

    # extract sequence
    seq = "".join([three_to_one.get(res.resname) for res in res_list])

    # extract coordinate
    coord = []
    for res in res_list:
        res_coord = []
        R_group = []
        for atom in res:
            if atom.get_name() in ['N', 'CA', 'C', 'O']:
                res_coord.append(atom.get_coord())
            else:
                R_group.append(atom.get_coord())

        if len(R_group) == 0:
            R_group.append(res['CA'].get_coord())
        R_group = np.array(R_group).mean(axis=0)
        res_coord.append(R_group)
        coord.append(res_coord)
    coord = np.array(coord)  # convert list directly to tensor would be rather slow, suggest use ndarray as transition
    coord = torch.tensor(coord, dtype=torch.float32)

    # save to file
    seq_to_file = f"{data_seq_path}/{name}.txt"
    coord_to_file = f"{data_coord_path}/{name}.pt"
    with open(seq_to_file, "w") as seq_file:
        seq_file.write(seq)
    torch.save(coord, coord_to_file)


100%|██████████| 6191/6191 [09:12<00:00, 11.21it/s]


### extract ProtTrans feature from sequence

In [7]:
import gc
from tqdm import tqdm
import torch
from transformers import T5Tokenizer, T5EncoderModel

In [8]:
data_seq_path = "./data/seq/"
ProtTrans_toolpath = "./tools/Prot-T5-XL-U50/"
gpu = '3'

# Load the vocabulary and ProtT5-XL-UniRef50 Model
tokenizer = T5Tokenizer.from_pretrained(ProtTrans_toolpath, do_lower_case=False)
model = T5EncoderModel.from_pretrained(ProtTrans_toolpath)
gc.collect()

# Load the model into the GPU if avilabile and switch to inference mode
device = torch.device('cuda:' + gpu if torch.cuda.is_available() and gpu else 'cpu')
model = model.to(device)
model = model.eval()


Some weights of the model checkpoint at ./tools/Prot-T5-XL-U50/ were not used when initializing T5EncoderModel: ['decoder.block.0.layer.2.DenseReluDense.wo.weight', 'decoder.block.1.layer.0.SelfAttention.v.weight', 'decoder.block.2.layer.0.SelfAttention.o.weight', 'decoder.block.14.layer.1.layer_norm.weight', 'decoder.block.22.layer.0.layer_norm.weight', 'decoder.block.20.layer.0.SelfAttention.v.weight', 'decoder.block.9.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.0.SelfAttention.q.weight', 'decoder.block.3.layer.0.SelfAttention.v.weight', 'decoder.block.23.layer.0.SelfAttention.k.weight', 'decoder.block.11.layer.2.DenseReluDense.wo.weight', 'decoder.block.19.layer.1.EncDecAttention.o.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.16.layer.0.SelfAttention.q.weight', 'decoder.block.14.layer.0.layer_norm.weight', 'decoder.block.5.layer.2.DenseReluDense.wo.weight', 'decoder.block.12.layer.1.layer_norm.weight', 'decoder.block.2.layer.0.SelfAttentio

In [9]:
data_ProtTrans_raw_path = "./data/ProtTrans_raw/"
os.makedirs(data_ProtTrans_raw_path, exist_ok=True)

for name in tqdm(wt_name_list + mut_name_list):
    with open(f"{data_seq_path}/{name}.txt") as seq_file:
        seq = seq_file.readline()
    batch_name_list = [name]
    batch_seq_list = [" ".join(list(seq))]
    # print(len(seq))
    # print(batch_name_list)
    # print(batch_seq_list)

    # Tokenize, encode sequences and load it into the GPU if possibile
    ids = tokenizer.batch_encode_plus(batch_seq_list, add_special_tokens=True, padding=True)
    input_ids = torch.tensor(ids['input_ids']).to(device)
    attention_mask = torch.tensor(ids['attention_mask']).to(device)

    # Extracting sequences' features and load it into the CPU if needed
    with torch.no_grad():
        embedding = model(input_ids=input_ids,attention_mask=attention_mask)
    embedding = embedding.last_hidden_state.cpu()

    # Remove padding (\<pad>) and special tokens (\</s>) that is added by ProtT5-XL-UniRef50 model
    for seq_num in range(len(embedding)):
        seq_len = (attention_mask[seq_num] == 1).sum()
        seq_emb = embedding[seq_num][:seq_len-1]
        # print(f"truncate padding: {embedding[seq_num].shape} -> {seq_emb.shape}")
        ProtTrans_to_file = f"{data_ProtTrans_raw_path}/{batch_name_list[seq_num]}.npy"
        np.save(ProtTrans_to_file, seq_emb)


100%|██████████| 6191/6191 [28:21<00:00,  3.64it/s]  


#### normalize raw ProtTrans

In [10]:
Max_protrans = []
Min_protrans = []
for name in tqdm(wt_name_list + mut_name_list):
    raw_protrans = np.load(f"{data_ProtTrans_raw_path}/{name}.npy")
    Max_protrans.append(np.max(raw_protrans, axis = 0))
    Min_protrans.append(np.min(raw_protrans, axis = 0))

Min_protrans = np.min(np.array(Min_protrans), axis = 0)
Max_protrans = np.max(np.array(Max_protrans), axis = 0)
print(Min_protrans)
print(Max_protrans)

np.save("./data/Max_ProtTrans_repr.npy", Max_protrans)
np.save("./data/Min_ProtTrans_repr.npy", Min_protrans)

100%|██████████| 6191/6191 [05:15<00:00, 19.62it/s]  


[-1.1695224  -0.67864954 -0.9903961  ... -0.9388567  -0.9177465
 -0.93719786]
[1.1407886 0.7556648 1.0245636 ... 0.9903357 1.1121918 0.9022764]


In [11]:
Max_protrans = np.load("./data/Max_ProtTrans_repr.npy")
Min_protrans = np.load("./data/Min_ProtTrans_repr.npy")

data_ProtTrans_path = "./data/ProtTrans/"
os.makedirs(data_ProtTrans_path, exist_ok=True)

for name in tqdm(wt_name_list + mut_name_list):
    raw_protrans = np.load(f"{data_ProtTrans_raw_path}/{name}.npy")
    protrans = (raw_protrans - Min_protrans) / (Max_protrans - Min_protrans)
    ProtTrans_to_file = f"{data_ProtTrans_path}/{name}.pt"
    torch.save(torch.tensor(protrans, dtype = torch.float32), ProtTrans_to_file)


100%|██████████| 6191/6191 [07:43<00:00, 13.35it/s]  


### extract DSSP feature from .pdb file

#### correct format of mut.pdb: col of Occupancy(55 - 60) should be "{:.2f}"

In [13]:
data_pdb_path = "./data/pdb/"

for name in tqdm(mut_name_list):
    pdb_filepath = f"{data_pdb_path}/{name}.pdb"

    with open(pdb_filepath, "r") as f:
        lines = f.readlines()

    for i in range(len(lines)):
        if lines[i].split()[0] == "REMARK":
            continue
        lines[i] = lines[i][:57] + '.00' + lines[i][60:]

    with open(pdb_filepath, "w") as f:
        f.writelines(lines)


100%|██████████| 5847/5847 [06:25<00:00, 15.17it/s]


In [14]:
from utils import *

data_pdb_path = "./data/pdb/"
data_seq_path = "./data/seq/"
dssp_toolpath = "./tools/mkdssp"

data_DSSP_path = "./data/DSSP"
os.makedirs(data_DSSP_path, exist_ok=True)

for name in tqdm(wt_name_list + mut_name_list):
    pdb_filepath = f"{data_pdb_path}/{name}.pdb"
    with open(f"{data_seq_path}/{name}.txt") as seq_file:
        seq = seq_file.readline()

    DSSP_to_file = f"{data_DSSP_path}/{name}.dssp"
    dssp_cmd = f"{dssp_toolpath} -i {pdb_filepath} -o {DSSP_to_file}"
    os.system(dssp_cmd)

    try:
        dssp_seq, dssp_matrix = process_dssp(DSSP_to_file)
        # dssp_seq: likely equal to original sequence
        # dssp_matrix (list<ndarray>): list of (1, 9) vector, length of dssp_seq
        if dssp_seq != seq:
            dssp_matrix = match_dssp(dssp_seq, dssp_matrix, seq)
        
        DSSP_to_file = f"{data_DSSP_path}/{name}.pt"
        torch.save(torch.tensor(np.array(dssp_matrix), dtype = torch.float32), DSSP_to_file)
        # shape(AA_len, 9)
        # os.system("rm {DSSP_to_file}"")
    except:
        print(f"Wrong entry prompt: $ {dssp_cmd}")
        continue


100%|██████████| 6191/6191 [16:33<00:00,  6.23it/s]  


# validate wt & mut feature match

In [42]:
from utils import match_wt2mut
# 1Y3B @282
# 1Y4A @275
# 2FTL @223

match_wt2mut("1Y3B", "1Y3B_E_I_SI41E", "./data/")
match_wt2mut("1Y4A", "1Y4A_E_I_SI40E,RI39M", "./data/")
match_wt2mut("2FTL", "2FTL_E_I_PI13A", "./data/")



verify sequence whether match: 
wt: 344 - mut: 344
321: S -> E
verify coordinate whether match: 
wt: torch.Size([344, 5, 3]) - mut: torch.Size([344, 5, 3])
verify ProtTrans feature whether match: 
wt: torch.Size([344, 1024]) - mut: torch.Size([344, 1024])
verify DSSP feature whether match: 
wt: torch.Size([344, 9]) - mut: torch.Size([344, 9])
verify sequence whether match: 
wt: 337 - mut: 337
313: R -> M
314: S -> E
verify coordinate whether match: 
wt: torch.Size([337, 5, 3]) - mut: torch.Size([337, 5, 3])
verify ProtTrans feature whether match: 
wt: torch.Size([337, 1024]) - mut: torch.Size([337, 1024])
verify DSSP feature whether match: 
wt: torch.Size([337, 9]) - mut: torch.Size([337, 9])
verify sequence whether match: 
wt: 279 - mut: 279
234: P -> A
verify coordinate whether match: 
wt: torch.Size([279, 5, 3]) - mut: torch.Size([279, 5, 3])
verify ProtTrans feature whether match: 
wt: torch.Size([279, 1024]) - mut: torch.Size([279, 1024])
verify DSSP feature whether match: 
wt: to

# split dataset

In [1]:
import random
import numpy as np
import torch

info = torch.load("./data/dataset.pt")
info


Unnamed: 0,pdb,mutation,affinity_wt,affinity_mut,temp,ddg,daffinity
0,1CSE_E_I,LI38G,1.120000e-12,5.260000e-11,294.0,2.248833,5.148000e-11
1,1CSE_E_I,LI38S,1.120000e-12,8.330000e-12,294.0,1.172229,7.210000e-12
2,1CSE_E_I,LI38P,1.120000e-12,1.020000e-07,294.0,6.671276,1.019989e-07
3,1CSE_E_I,LI38I,1.120000e-12,1.720000e-10,294.0,2.940988,1.708800e-10
4,1CSE_E_I,LI38D,1.120000e-12,1.920000e-09,294.0,4.350434,1.918880e-09
...,...,...,...,...,...,...,...
5756,3QIB_ABP_CD,KP8R,5.500000e-06,2.400000e-04,298.0,2.235909,2.345000e-04
5757,3QIB_ABP_CD,TP11A,5.500000e-06,1.100000e-03,298.0,3.137419,1.094500e-03
5758,3QIB_ABP_CD,TP11S,5.500000e-06,3.380000e-05,298.0,1.075181,2.830000e-05
5759,3QIB_ABP_CD,TP11N,5.500000e-06,4.340000e-05,298.0,1.223219,3.790000e-05


In [20]:

index = list(range(len(info)))
test_size = int(len(index) * 0.1)
index_test = random.sample(index, test_size)

mask_test = np.array([(i in index_test) for i in index])
mask_train = ~mask_test

dataset_test = info[mask_test]
dataset_train = info[mask_train]

dataset_train = dataset_train.sample(frac=1).reset_index(drop=True)
dataset_test = dataset_test.sample(frac=1).reset_index(drop=True)
print(f"train-set: {len(dataset_train)}  test-set: {len(dataset_test)}")

torch.save(dataset_train, "./data/dataset_train.pt")
torch.save(dataset_test, "./data/dataset_test.pt")