In [58]:
import json
import os
import pandas as pd
import numpy as np
from rdkit import Chem

from utils import utils_function
from utils import calculate_descriptors
from utils import generate_conformation
from utils import generate_atom_input
from utils import generate_monomer_input
from utils import generate_peptide_input

import torch
import torch.nn as nn
from model import model_utils
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Torch version: {torch.__version__}')
print(f'Device: {DEVICE}')

Torch version: 2.0.0
Device: cpu


In [33]:
config_path = 'config/CycPeptMP.json'
config = json.load(open(config_path,'r'))

In [3]:
# Example cyclic peptide drugs, without experimentally determined permeability.
# Anidulafungin, Pasireotide
new_data = pd.read_csv('data/new_data/new_data.csv')

# Check duplicates
old_data = pd.read_csv('data/CycPeptMPDB_Peptide_All.csv', low_memory=False)
for i in range(len(new_data)):
    if utils_function.canonicalize_smiles(new_data.iloc[i]['SMILES']) in old_data['SMILES'].to_list():
        print(f'Your peptide: {i} ({new_data.iloc[i]["ID_org"]}) is already in the database.')

### 0. Divide peptide into monomers (substructures)
+ Divides __peptide bond__ and __ester bond__ in the __main chain__ and splits peptide into monomers.
+ The cleaved amide group or O atom of the amide-to-ester substitution was methylated (addition of CH3), and the carboxyl group was converted to an aldehyde (addition of H).
+ __Disulfide bond__ is not included in CycPeptMPDB data, but it may be better to consider it as a division target.
+ __Bonds in side-chain__ are not subject to division to fully represent the side-chain properties.

In [None]:
# Save unique monomers
utils_function.get_unique_monomer(new_data, 'data/new_data/unique_monomer.csv')

### 1. Generate different peptide SMILES representations by SMILES enumeration as atom-level data augmentation

In [40]:
utils_function.enumerate_smiles(new_data, config, 'data/new_data/enum_smiles.csv')

100%|██████████| 2/2 [00:00<00:00, 10.60it/s]


### 2. Generate 3D conformations for peptide and monomer

+ Peptide

In [52]:
os.mkdir('sdf/new_data/')

df_enu = pd.read_csv('data/new_data/enum_smiles.csv')

# WARNING: If there is too much data, you can manually split it into multiple files for parallel computation.
# For example:
# sub_file_num = 10
# sub_file_len = len(df_enu) // sub_file_num
# for i in range(sub_file_num):
#     df_enu.iloc[i*sub_file_len:(i+1)*sub_file_len].to_csv(f'sdf/new_data/peptide_{i}.csv', index=False)

generate_conformation.generate_peptide_conformation(config, df_enu, 'sdf/new_data/peptide.sdf')

  0%|          | 0/120 [00:00<?, ?it/s]

100%|██████████| 120/120 [03:25<00:00,  1.71s/it]


+ Monomer

In [9]:
df_monomer = pd.read_csv('data/new_data/unique_monomer.csv')
generate_conformation.generate_monomer_conformation(config, df_monomer, 'sdf/new_data/monomer.sdf')

100%|██████████| 11/11 [00:36<00:00,  3.34s/it]


### 3. Calculate 2D and 3D descriptors for peptide and monomer

#### 3.1. RDKit (208 types 2D descriptors)

In [133]:
# peptide
calculate_descriptors.calc_rdkit_descriptors(new_data['SMILES'].tolist(), 'desc/new_data/peptide_rdkit.csv')

100%|██████████| 2/2 [00:00<00:00,  6.33it/s]


In [136]:
# monomer
calculate_descriptors.calc_rdkit_descriptors(df_monomer['SMILES'].tolist(), 'desc/new_data/monomer_rdkit.csv')

100%|██████████| 11/11 [00:00<00:00, 37.17it/s]


#### 3.2. Mordred (1275 types 2D descriptors + 51 types 3D descriptors)

+ 2D

In [138]:
# peptide
calculate_descriptors.calc_mordred_2Ddescriptors(new_data['SMILES'].tolist(), 'desc/new_data/peptide_mordred_2D.csv')

100%|██████████| 2/2 [00:01<00:00,  1.05it/s]


In [139]:
# monomer
calculate_descriptors.calc_mordred_2Ddescriptors(df_monomer['SMILES'].tolist(), 'desc/new_data/monomer_mordred_2D.csv')

100%|██████████| 11/11 [00:01<00:00,  8.81it/s]


+ 3D

In [142]:
# peptide
mols = Chem.SDMolSupplier('sdf/new_data/peptide.sdf')
calculate_descriptors.calc_mordred_3Ddescriptors(mols, 'desc/new_data/peptide_mordred_3D.csv')

100%|██████████| 120/120 [00:32<00:00,  3.70it/s]


In [143]:
# monomer
mols = Chem.SDMolSupplier('sdf/new_data/monomer.sdf')
calculate_descriptors.calc_mordred_3Ddescriptors(mols, 'desc/new_data/monomer_mordred_3D.csv')

100%|██████████| 660/660 [00:32<00:00, 20.02it/s] 


#### 3.3. MOE (206 types 2D descriptors + 117 types 3D descriptors)
+ CycPeptMP used the commercial software __MOE__ to calculate some of the descriptors.
+ In particular, many of the selected 3D descriptors were computed by MOE.
+ Please manualy calculate these descriptors. I showed __utils/MOE_3D_descriptors.sh__ as an example.
+ For 2D descriptors:
    + Please wash SMILES and use washed mols for calculation.
        + for GUI: Molecule -> Wash -> Protonation: Dominant
+ For 3D descriptors:
    + First, please calculate the charge for the RDKit conformations.
        + for GUI: Compute -> Molecule -> Partial Charges
    + 21 MOPAC descriptors of the 3D descriptors were not computed due to computational cost (AM_x, MNDO_, PM3_x)

#### 3.4. Concatenation files

In [118]:
calculate_descriptors.merge_descriptors(config, 'desc/new_data/', 'data/new_data/')

### 4. Generate input for three sub-models

In [9]:
folder_path = 'model/input/new_data/'
set_name = 'new'

+ Atom model

In [51]:
df_enu = pd.read_csv('data/new_data/enum_smiles.csv')
mols = Chem.SDMolSupplier('sdf/new_data/peptide.sdf')

generate_atom_input.generate_atom_input(config, new_data, df_enu, mols, folder_path, set_name)

100%|██████████| 120/120 [00:00<00:00, 701.12it/s]
100%|██████████| 120/120 [00:00<00:00, 311.35it/s]
100%|██████████| 120/120 [00:00<00:00, 476.95it/s]
100%|██████████| 120/120 [00:00<00:00, 448.57it/s]
100%|██████████| 120/120 [00:00<00:00, 646.59it/s]


+ Monomer model

In [4]:
df_mono_2D = pd.read_csv('desc/new_data/monomer_2D.csv')
df_mono_3D = pd.read_csv('desc/new_data/monomer_3D.csv')

generate_monomer_input.generate_monomer_input(config, new_data, df_mono_2D, df_mono_3D, folder_path, set_name)

+ Peptide model

In [53]:
df_pep_2D = pd.read_csv('desc/new_data/peptide_2D.csv')
df_pep_3D = pd.read_csv('desc/new_data/peptide_3D.csv')
df_enu = pd.read_csv('data/new_data/enum_smiles.csv')

generate_peptide_input.generate_peptide_input(config, new_data, df_enu, df_pep_2D, df_pep_3D, folder_path, set_name)

100%|██████████| 2/2 [00:00<00:00, 890.51it/s]
100%|██████████| 2/2 [00:00<00:00, 756.75it/s]


### 5. Prediction

In [56]:
MODEL_TYPE = 'Fusion'
# OPTIMIZE: Augmentation times
REPLICA_NUM = 60

# Set random seed for reproducibility
seed = config['data']['seed']
model_utils.set_seed(seed)

# Import input
dataset_new = model_utils.load_dataset('model/input/new_data/', MODEL_TYPE, REPLICA_NUM, 'new')

# Determined hyperparameters
best_trial = config['model']

In [61]:
for cv in range(3):
    # Load trained weights
    model_path = f'weight/{MODEL_TYPE}/{MODEL_TYPE}-{REPLICA_NUM}_cv{cv}.cpt'
    checkpoint = torch.load(model_path)
    model = model_utils.create_model(best_trial, DEVICE, config['model']['use_auxiliary'])
    model_state = checkpoint['model_state_dict']
    model.load_state_dict(model_state)
    model = nn.DataParallel(model)
    model.to(DEVICE)

    dataloader_now = torch.utils.data.DataLoader(dataset_new, batch_size=256, shuffle=False)
    ids, exps, preds = model_utils.predict_valid(DEVICE, model, dataloader_now, None, istrain=False,
                                                 use_auxiliary=config['model']['use_auxiliary'], gamma_layer=config['model']['gamma_layer'], gamma_subout=config['model']['gamma_subout'])
    now_pred = pd.DataFrame(preds, columns=['pred'])
    now_pred['exp'] = exps
    now_pred['ID'] = ids

    # NOTE: Can save all predicted values of all replicas
    # now_pred.to_csv(f'predicted/{MODEL_TYPE}-{REPLICA_NUM}/{set_name}_cv{cv}_allrep.csv')

    # Take the average of all replicas
    now_pred = now_pred.groupby('ID').mean()
    now_pred.to_csv(f'predicted/new_data/{MODEL_TYPE}-{REPLICA_NUM}/{set_name}_cv{cv}.csv')

RuntimeError: Error(s) in loading state_dict for FusionModel:
	Missing key(s) in state_dict: "atom_model.embedding_atoms.weight", "atom_model.embedding_atoms.bias", "atom_model.embedding_graph.weight", "atom_model.embedding_graph.bias", "atom_model.embedding_conf.weight", "atom_model.embedding_conf.bias", "atom_model.embedding_bond.weight", "atom_model.embedding_bond.bias", "atom_model.transformer_encoder_graph.encoder_graph_0.self_attn.in_proj_weight", "atom_model.transformer_encoder_graph.encoder_graph_0.self_attn.in_proj_bias", "atom_model.transformer_encoder_graph.encoder_graph_0.self_attn.out_proj.weight", "atom_model.transformer_encoder_graph.encoder_graph_0.self_attn.out_proj.bias", "atom_model.transformer_encoder_graph.encoder_graph_0.linear1.weight", "atom_model.transformer_encoder_graph.encoder_graph_0.linear1.bias", "atom_model.transformer_encoder_graph.encoder_graph_0.linear2.weight", "atom_model.transformer_encoder_graph.encoder_graph_0.linear2.bias", "atom_model.transformer_encoder_graph.encoder_graph_0.norm1.weight", "atom_model.transformer_encoder_graph.encoder_graph_0.norm1.bias", "atom_model.transformer_encoder_graph.encoder_graph_0.norm2.weight", "atom_model.transformer_encoder_graph.encoder_graph_0.norm2.bias", "atom_model.transformer_encoder_graph.encoder_graph_1.self_attn.in_proj_weight", "atom_model.transformer_encoder_graph.encoder_graph_1.self_attn.in_proj_bias", "atom_model.transformer_encoder_graph.encoder_graph_1.self_attn.out_proj.weight", "atom_model.transformer_encoder_graph.encoder_graph_1.self_attn.out_proj.bias", "atom_model.transformer_encoder_graph.encoder_graph_1.linear1.weight", "atom_model.transformer_encoder_graph.encoder_graph_1.linear1.bias", "atom_model.transformer_encoder_graph.encoder_graph_1.linear2.weight", "atom_model.transformer_encoder_graph.encoder_graph_1.linear2.bias", "atom_model.transformer_encoder_graph.encoder_graph_1.norm1.weight", "atom_model.transformer_encoder_graph.encoder_graph_1.norm1.bias", "atom_model.transformer_encoder_graph.encoder_graph_1.norm2.weight", "atom_model.transformer_encoder_graph.encoder_graph_1.norm2.bias", "atom_model.transformer_encoder_conf.encoder_conf_0.self_attn.in_proj_weight", "atom_model.transformer_encoder_conf.encoder_conf_0.self_attn.in_proj_bias", "atom_model.transformer_encoder_conf.encoder_conf_0.self_attn.out_proj.weight", "atom_model.transformer_encoder_conf.encoder_conf_0.self_attn.out_proj.bias", "atom_model.transformer_encoder_conf.encoder_conf_0.linear1.weight", "atom_model.transformer_encoder_conf.encoder_conf_0.linear1.bias", "atom_model.transformer_encoder_conf.encoder_conf_0.linear2.weight", "atom_model.transformer_encoder_conf.encoder_conf_0.linear2.bias", "atom_model.transformer_encoder_conf.encoder_conf_0.norm1.weight", "atom_model.transformer_encoder_conf.encoder_conf_0.norm1.bias", "atom_model.transformer_encoder_conf.encoder_conf_0.norm2.weight", "atom_model.transformer_encoder_conf.encoder_conf_0.norm2.bias", "atom_model.transformer_encoder_conf.encoder_conf_1.self_attn.in_proj_weight", "atom_model.transformer_encoder_conf.encoder_conf_1.self_attn.in_proj_bias", "atom_model.transformer_encoder_conf.encoder_conf_1.self_attn.out_proj.weight", "atom_model.transformer_encoder_conf.encoder_conf_1.self_attn.out_proj.bias", "atom_model.transformer_encoder_conf.encoder_conf_1.linear1.weight", "atom_model.transformer_encoder_conf.encoder_conf_1.linear1.bias", "atom_model.transformer_encoder_conf.encoder_conf_1.linear2.weight", "atom_model.transformer_encoder_conf.encoder_conf_1.linear2.bias", "atom_model.transformer_encoder_conf.encoder_conf_1.norm1.weight", "atom_model.transformer_encoder_conf.encoder_conf_1.norm1.bias", "atom_model.transformer_encoder_conf.encoder_conf_1.norm2.weight", "atom_model.transformer_encoder_conf.encoder_conf_1.norm2.bias", "atom_model.auxiliary_concat_layers.auxiliary_concat_0.weight", "atom_model.auxiliary_concat_layers.auxiliary_concat_0.bias", "atom_model.auxiliary_concat_layers.auxiliary_concat_1.weight", "atom_model.auxiliary_concat_layers.auxiliary_concat_1.bias", "atom_model.auxiliary_out_layers.auxiliary_out_0.weight", "atom_model.auxiliary_out_layers.auxiliary_out_0.bias", "atom_model.auxiliary_out_layers.auxiliary_out_1.weight", "atom_model.auxiliary_out_layers.auxiliary_out_1.bias", "atom_model.linear_layers.linear_concat.weight", "atom_model.linear_layers.linear_concat.bias", "atom_model.linear_layers.linear_0.weight", "atom_model.linear_layers.linear_0.bias", "atom_model.linear_layers.linear_out.weight", "atom_model.linear_layers.linear_out.bias", "monomer_model.conv_layers.conv1d_0.weight", "monomer_model.conv_layers.conv1d_0.bias", "monomer_model.conv_layers.bn_conv_0.weight", "monomer_model.conv_layers.bn_conv_0.bias", "monomer_model.conv_layers.bn_conv_0.running_mean", "monomer_model.conv_layers.bn_conv_0.running_var", "monomer_model.conv_layers.conv1d_1.weight", "monomer_model.conv_layers.conv1d_1.bias", "monomer_model.conv_layers.bn_conv_1.weight", "monomer_model.conv_layers.bn_conv_1.bias", "monomer_model.conv_layers.bn_conv_1.running_mean", "monomer_model.conv_layers.bn_conv_1.running_var", "monomer_model.conv_layers.conv1d_2.weight", "monomer_model.conv_layers.conv1d_2.bias", "monomer_model.conv_layers.bn_conv_2.weight", "monomer_model.conv_layers.bn_conv_2.bias", "monomer_model.conv_layers.bn_conv_2.running_mean", "monomer_model.conv_layers.bn_conv_2.running_var", "monomer_model.conv_layers.conv1d_3.weight", "monomer_model.conv_layers.conv1d_3.bias", "monomer_model.conv_layers.bn_conv_3.weight", "monomer_model.conv_layers.bn_conv_3.bias", "monomer_model.conv_layers.bn_conv_3.running_mean", "monomer_model.conv_layers.bn_conv_3.running_var", "monomer_model.conv_layers.conv1d_4.weight", "monomer_model.conv_layers.conv1d_4.bias", "monomer_model.conv_layers.bn_conv_4.weight", "monomer_model.conv_layers.bn_conv_4.bias", "monomer_model.conv_layers.bn_conv_4.running_mean", "monomer_model.conv_layers.bn_conv_4.running_var", "monomer_model.conv_layers.conv1d_5.weight", "monomer_model.conv_layers.conv1d_5.bias", "monomer_model.conv_layers.bn_conv_5.weight", "monomer_model.conv_layers.bn_conv_5.bias", "monomer_model.conv_layers.bn_conv_5.running_mean", "monomer_model.conv_layers.bn_conv_5.running_var", "monomer_model.auxiliary_layers.auxiliary_0.weight", "monomer_model.auxiliary_layers.auxiliary_0.bias", "monomer_model.auxiliary_layers.auxiliary_1.weight", "monomer_model.auxiliary_layers.auxiliary_1.bias", "monomer_model.auxiliary_layers.auxiliary_2.weight", "monomer_model.auxiliary_layers.auxiliary_2.bias", "monomer_model.auxiliary_layers.auxiliary_3.weight", "monomer_model.auxiliary_layers.auxiliary_3.bias", "monomer_model.auxiliary_layers.auxiliary_4.weight", "monomer_model.auxiliary_layers.auxiliary_4.bias", "monomer_model.auxiliary_layers.auxiliary_5.weight", "monomer_model.auxiliary_layers.auxiliary_5.bias", "monomer_model.auxiliary_out_layers.auxiliary_out_0.weight", "monomer_model.auxiliary_out_layers.auxiliary_out_0.bias", "monomer_model.auxiliary_out_layers.auxiliary_out_1.weight", "monomer_model.auxiliary_out_layers.auxiliary_out_1.bias", "monomer_model.auxiliary_out_layers.auxiliary_out_2.weight", "monomer_model.auxiliary_out_layers.auxiliary_out_2.bias", "monomer_model.auxiliary_out_layers.auxiliary_out_3.weight", "monomer_model.auxiliary_out_layers.auxiliary_out_3.bias", "monomer_model.auxiliary_out_layers.auxiliary_out_4.weight", "monomer_model.auxiliary_out_layers.auxiliary_out_4.bias", "monomer_model.auxiliary_out_layers.auxiliary_out_5.weight", "monomer_model.auxiliary_out_layers.auxiliary_out_5.bias", "monomer_model.linear_layers.convlinear_0.weight", "monomer_model.linear_layers.convlinear_0.bias", "monomer_model.linear_layers.bn_convlinear_0.weight", "monomer_model.linear_layers.bn_convlinear_0.bias", "monomer_model.linear_layers.bn_convlinear_0.running_mean", "monomer_model.linear_layers.bn_convlinear_0.running_var", "monomer_model.linear_layers.out_conv.weight", "monomer_model.linear_layers.out_conv.bias", "peptide_model.desc_layers.mlp_desc_0.weight", "peptide_model.desc_layers.mlp_desc_0.bias", "peptide_model.fp_layers.mlp_fp_0.weight", "peptide_model.fp_layers.mlp_fp_0.bias", "peptide_model.auxiliary_concat_layers.auxiliary_concat_0.weight", "peptide_model.auxiliary_concat_layers.auxiliary_concat_0.bias", "peptide_model.auxiliary_out_layers.auxiliary_out_0.weight", "peptide_model.auxiliary_out_layers.auxiliary_out_0.bias", "peptide_model.linear_layers.linear_concat.weight", "peptide_model.linear_layers.linear_concat.bias", "peptide_model.linear_layers.out_mlp.weight", "peptide_model.linear_layers.out_mlp.bias". 
	Unexpected key(s) in state_dict: "atoms_model.embedding_atoms.weight", "atoms_model.embedding_atoms.bias", "atoms_model.embedding_graph.weight", "atoms_model.embedding_graph.bias", "atoms_model.embedding_conf.weight", "atoms_model.embedding_conf.bias", "atoms_model.embedding_bond.weight", "atoms_model.embedding_bond.bias", "atoms_model.transformer_encoder_graph.encoder_graph_0.self_attn.in_proj_weight", "atoms_model.transformer_encoder_graph.encoder_graph_0.self_attn.in_proj_bias", "atoms_model.transformer_encoder_graph.encoder_graph_0.self_attn.out_proj.weight", "atoms_model.transformer_encoder_graph.encoder_graph_0.self_attn.out_proj.bias", "atoms_model.transformer_encoder_graph.encoder_graph_0.linear1.weight", "atoms_model.transformer_encoder_graph.encoder_graph_0.linear1.bias", "atoms_model.transformer_encoder_graph.encoder_graph_0.linear2.weight", "atoms_model.transformer_encoder_graph.encoder_graph_0.linear2.bias", "atoms_model.transformer_encoder_graph.encoder_graph_0.norm1.weight", "atoms_model.transformer_encoder_graph.encoder_graph_0.norm1.bias", "atoms_model.transformer_encoder_graph.encoder_graph_0.norm2.weight", "atoms_model.transformer_encoder_graph.encoder_graph_0.norm2.bias", "atoms_model.transformer_encoder_graph.encoder_graph_1.self_attn.in_proj_weight", "atoms_model.transformer_encoder_graph.encoder_graph_1.self_attn.in_proj_bias", "atoms_model.transformer_encoder_graph.encoder_graph_1.self_attn.out_proj.weight", "atoms_model.transformer_encoder_graph.encoder_graph_1.self_attn.out_proj.bias", "atoms_model.transformer_encoder_graph.encoder_graph_1.linear1.weight", "atoms_model.transformer_encoder_graph.encoder_graph_1.linear1.bias", "atoms_model.transformer_encoder_graph.encoder_graph_1.linear2.weight", "atoms_model.transformer_encoder_graph.encoder_graph_1.linear2.bias", "atoms_model.transformer_encoder_graph.encoder_graph_1.norm1.weight", "atoms_model.transformer_encoder_graph.encoder_graph_1.norm1.bias", "atoms_model.transformer_encoder_graph.encoder_graph_1.norm2.weight", "atoms_model.transformer_encoder_graph.encoder_graph_1.norm2.bias", "atoms_model.transformer_encoder_conf.encoder_conf_0.self_attn.in_proj_weight", "atoms_model.transformer_encoder_conf.encoder_conf_0.self_attn.in_proj_bias", "atoms_model.transformer_encoder_conf.encoder_conf_0.self_attn.out_proj.weight", "atoms_model.transformer_encoder_conf.encoder_conf_0.self_attn.out_proj.bias", "atoms_model.transformer_encoder_conf.encoder_conf_0.linear1.weight", "atoms_model.transformer_encoder_conf.encoder_conf_0.linear1.bias", "atoms_model.transformer_encoder_conf.encoder_conf_0.linear2.weight", "atoms_model.transformer_encoder_conf.encoder_conf_0.linear2.bias", "atoms_model.transformer_encoder_conf.encoder_conf_0.norm1.weight", "atoms_model.transformer_encoder_conf.encoder_conf_0.norm1.bias", "atoms_model.transformer_encoder_conf.encoder_conf_0.norm2.weight", "atoms_model.transformer_encoder_conf.encoder_conf_0.norm2.bias", "atoms_model.transformer_encoder_conf.encoder_conf_1.self_attn.in_proj_weight", "atoms_model.transformer_encoder_conf.encoder_conf_1.self_attn.in_proj_bias", "atoms_model.transformer_encoder_conf.encoder_conf_1.self_attn.out_proj.weight", "atoms_model.transformer_encoder_conf.encoder_conf_1.self_attn.out_proj.bias", "atoms_model.transformer_encoder_conf.encoder_conf_1.linear1.weight", "atoms_model.transformer_encoder_conf.encoder_conf_1.linear1.bias", "atoms_model.transformer_encoder_conf.encoder_conf_1.linear2.weight", "atoms_model.transformer_encoder_conf.encoder_conf_1.linear2.bias", "atoms_model.transformer_encoder_conf.encoder_conf_1.norm1.weight", "atoms_model.transformer_encoder_conf.encoder_conf_1.norm1.bias", "atoms_model.transformer_encoder_conf.encoder_conf_1.norm2.weight", "atoms_model.transformer_encoder_conf.encoder_conf_1.norm2.bias", "atoms_model.auxiliary_concat_layers.auxiliary_concat_0.weight", "atoms_model.auxiliary_concat_layers.auxiliary_concat_0.bias", "atoms_model.auxiliary_concat_layers.auxiliary_concat_1.weight", "atoms_model.auxiliary_concat_layers.auxiliary_concat_1.bias", "atoms_model.auxiliary_out_layers.auxiliary_out_0.weight", "atoms_model.auxiliary_out_layers.auxiliary_out_0.bias", "atoms_model.auxiliary_out_layers.auxiliary_out_1.weight", "atoms_model.auxiliary_out_layers.auxiliary_out_1.bias", "atoms_model.linear_layers.linear_concat.weight", "atoms_model.linear_layers.linear_concat.bias", "atoms_model.linear_layers.linear_0.weight", "atoms_model.linear_layers.linear_0.bias", "atoms_model.linear_layers.linear_out.weight", "atoms_model.linear_layers.linear_out.bias", "monomers_model.conv_layers.conv1d_0.weight", "monomers_model.conv_layers.conv1d_0.bias", "monomers_model.conv_layers.bn_conv_0.weight", "monomers_model.conv_layers.bn_conv_0.bias", "monomers_model.conv_layers.bn_conv_0.running_mean", "monomers_model.conv_layers.bn_conv_0.running_var", "monomers_model.conv_layers.bn_conv_0.num_batches_tracked", "monomers_model.conv_layers.conv1d_1.weight", "monomers_model.conv_layers.conv1d_1.bias", "monomers_model.conv_layers.bn_conv_1.weight", "monomers_model.conv_layers.bn_conv_1.bias", "monomers_model.conv_layers.bn_conv_1.running_mean", "monomers_model.conv_layers.bn_conv_1.running_var", "monomers_model.conv_layers.bn_conv_1.num_batches_tracked", "monomers_model.conv_layers.conv1d_2.weight", "monomers_model.conv_layers.conv1d_2.bias", "monomers_model.conv_layers.bn_conv_2.weight", "monomers_model.conv_layers.bn_conv_2.bias", "monomers_model.conv_layers.bn_conv_2.running_mean", "monomers_model.conv_layers.bn_conv_2.running_var", "monomers_model.conv_layers.bn_conv_2.num_batches_tracked", "monomers_model.conv_layers.conv1d_3.weight", "monomers_model.conv_layers.conv1d_3.bias", "monomers_model.conv_layers.bn_conv_3.weight", "monomers_model.conv_layers.bn_conv_3.bias", "monomers_model.conv_layers.bn_conv_3.running_mean", "monomers_model.conv_layers.bn_conv_3.running_var", "monomers_model.conv_layers.bn_conv_3.num_batches_tracked", "monomers_model.conv_layers.conv1d_4.weight", "monomers_model.conv_layers.conv1d_4.bias", "monomers_model.conv_layers.bn_conv_4.weight", "monomers_model.conv_layers.bn_conv_4.bias", "monomers_model.conv_layers.bn_conv_4.running_mean", "monomers_model.conv_layers.bn_conv_4.running_var", "monomers_model.conv_layers.bn_conv_4.num_batches_tracked", "monomers_model.conv_layers.conv1d_5.weight", "monomers_model.conv_layers.conv1d_5.bias", "monomers_model.conv_layers.bn_conv_5.weight", "monomers_model.conv_layers.bn_conv_5.bias", "monomers_model.conv_layers.bn_conv_5.running_mean", "monomers_model.conv_layers.bn_conv_5.running_var", "monomers_model.conv_layers.bn_conv_5.num_batches_tracked", "monomers_model.auxiliary_layers.auxiliary_0.weight", "monomers_model.auxiliary_layers.auxiliary_0.bias", "monomers_model.auxiliary_layers.auxiliary_1.weight", "monomers_model.auxiliary_layers.auxiliary_1.bias", "monomers_model.auxiliary_layers.auxiliary_2.weight", "monomers_model.auxiliary_layers.auxiliary_2.bias", "monomers_model.auxiliary_layers.auxiliary_3.weight", "monomers_model.auxiliary_layers.auxiliary_3.bias", "monomers_model.auxiliary_layers.auxiliary_4.weight", "monomers_model.auxiliary_layers.auxiliary_4.bias", "monomers_model.auxiliary_layers.auxiliary_5.weight", "monomers_model.auxiliary_layers.auxiliary_5.bias", "monomers_model.auxiliary_out_layers.auxiliary_out_0.weight", "monomers_model.auxiliary_out_layers.auxiliary_out_0.bias", "monomers_model.auxiliary_out_layers.auxiliary_out_1.weight", "monomers_model.auxiliary_out_layers.auxiliary_out_1.bias", "monomers_model.auxiliary_out_layers.auxiliary_out_2.weight", "monomers_model.auxiliary_out_layers.auxiliary_out_2.bias", "monomers_model.auxiliary_out_layers.auxiliary_out_3.weight", "monomers_model.auxiliary_out_layers.auxiliary_out_3.bias", "monomers_model.auxiliary_out_layers.auxiliary_out_4.weight", "monomers_model.auxiliary_out_layers.auxiliary_out_4.bias", "monomers_model.auxiliary_out_layers.auxiliary_out_5.weight", "monomers_model.auxiliary_out_layers.auxiliary_out_5.bias", "monomers_model.linear_layers.convlinear_0.weight", "monomers_model.linear_layers.convlinear_0.bias", "monomers_model.linear_layers.bn_convlinear_0.weight", "monomers_model.linear_layers.bn_convlinear_0.bias", "monomers_model.linear_layers.bn_convlinear_0.running_mean", "monomers_model.linear_layers.bn_convlinear_0.running_var", "monomers_model.linear_layers.bn_convlinear_0.num_batches_tracked", "monomers_model.linear_layers.out_conv.weight", "monomers_model.linear_layers.out_conv.bias", "peptides_model.desc_layers.mlp_desc_0.weight", "peptides_model.desc_layers.mlp_desc_0.bias", "peptides_model.fp_layers.mlp_fp_0.weight", "peptides_model.fp_layers.mlp_fp_0.bias", "peptides_model.auxiliary_concat_layers.auxiliary_concat_0.weight", "peptides_model.auxiliary_concat_layers.auxiliary_concat_0.bias", "peptides_model.auxiliary_out_layers.auxiliary_out_0.weight", "peptides_model.auxiliary_out_layers.auxiliary_out_0.bias", "peptides_model.linear_layers.linear_concat.weight", "peptides_model.linear_layers.linear_concat.bias", "peptides_model.linear_layers.out_mlp.weight", "peptides_model.linear_layers.out_mlp.bias". 

In [None]:
df_peptide = pd.read_csv(data_args['org_peptide_path'], low_memory=False)
df_monomer = pd.read_csv(data_args['org_monomer_path'], low_memory=False)

smiles = df_peptide['SMILES'].tolist()
shape = df_peptide['Molecule_Shape'].to_list()
helm = df_peptide['HELM'].to_list()
symbol_to_smiles = dict(zip(df_monomer['Symbol'], df_monomer['capped_SMILES']))
symbol_to_cxsmiles = dict(zip(df_monomer['Symbol'], df_monomer['CXSMILES']))
R3_dict = dict(zip(df_monomer['Symbol'], df_monomer['R3']))
smiles_to_symbol = dict(zip(df_monomer['capped_SMILES'], df_monomer['Symbol']))

In [None]:
substructure_list, substructure_num = [], []

for i in range(len(df_peptide)):

    now_substructure = []
    now_seq = helm[i].split('$')[0].split('{')[1].replace('}', '').replace('[', '').replace(']', '').split('.')

    if shape[i] == 'Circle':
        now_substructure = [symbol_to_smiles[_] for _ in now_seq]
    elif shape[i] == 'Lariat':
        # Lariat peptides, do not divide bonds of side chain
        atts = helm[i].split('$')[1].split(',')[2].split('-')
        atts_num = [int(_.split(':')[0]) for _ in atts]
        atts_R = [_.split(':')[1] for _ in atts]

        # HELM example of this case: PEPTIDE48{A.A.L.[meV].L.F.F.P.I.T.G.D.[-pip]}$PEPTIDE48,PEPTIDE48,1:R1-12:R3$$$
        if atts_num[0] == 1:
            # NOTE: This case were all R1-R3
            # if atts_R[0] != 'R1':
            #     print(f'{i}, 0, {atts_R[0]}')
            # elif atts_R[1] != 'R3':
            #     print(f'{i}, 1, {atts_R[1]}')

            now_substructure = [symbol_to_smiles[_] for _ in now_seq[:atts_num[1]-1]]
            # monomers to combine
            cxsmiles = [symbol_to_cxsmiles[_] for _ in now_seq[atts_num[1]-1:]]
            # NOTE: 第一个cap两处(R1, R3), side chain不cap
            tmp = cxsmiles[0].split(' |')[0]
            for _ in re.findall('_R\d', cxsmiles[0]):
                if _ == '_R1':
                    tmp = tmp.replace('[*]', '[CH3]', 1)
                elif _ == '_R2':
                    tmp = tmp.replace('[*]', '[2C]', 1)
                elif _ == '_R3':
                    if R3_dict[now_seq[atts_num[1]-1]] == 'H':
                        tmp = tmp.replace('[*]', '[CH3]', 1)
                    elif R3_dict[now_seq[atts_num[1]-1]] == 'OH':
                        tmp = tmp.replace('[*]', '[H]', 1)
            cxsmiles[0] = tmp

            combined = utils_function.combine_cxsmiles(cxsmiles, now_seq[atts_num[1]-1:], R3_dict)
            now_substructure.append(combined)

        # HELM example of this case: PEPTIDE959{[Mono22-].G.T.[Mono23].[Mono24].[dLeu(3R-OH)].[dSer(Me)].G.A.[meT].[dTyr(bR-OMe)].[Mono25]}$PEPTIDE959,PEPTIDE959,6:R3-12:R2$$$
        else:
            # NOTE: This case were all R3-R2
            # if atts_R[0] != 'R3':
            #     print(f'{i}, 0, {atts_R[0]}')
            # elif atts_R[1] != 'R2':
            #     print(f'{i}, 1, {atts_R[1]}')
            cxsmiles = [symbol_to_cxsmiles[_] for _ in now_seq[:atts_num[0]]]
            # NOTE: 最后一个cap两处(R2, R3), side chain不cap
            tmp = cxsmiles[-1].split(' |')[0]
            for _ in re.findall('_R\d', cxsmiles[-1]):
                if _ == '_R1':
                    tmp = tmp.replace('[*]', '[1C]', 1)
                elif _ == '_R2':
                    tmp = tmp.replace('[*]', '[H]', 1)
                elif _ == '_R3':
                    if R3_dict[now_seq[atts_num[0]-1]] == 'H':
                        tmp = tmp.replace('[*]', '[CH3]', 1)
                    elif R3_dict[now_seq[atts_num[0]-1]] == 'OH':
                        tmp = tmp.replace('[*]', '[H]', 1)
            cxsmiles[-1] = tmp

            combined = utils_function.combine_cxsmiles(cxsmiles, now_seq[:atts_num[0]], R3_dict)
            now_substructure.append(combined)
            now_substructure += [symbol_to_smiles[_] for _ in now_seq[atts_num[0]:]]

    substructure_num.append(len(now_substructure))
    if len(now_substructure) < data_args['monomer_max_len']:
        now_substructure += [''] * (data_args['monomer_max_len'] - len(now_substructure))
    substructure_list.append(now_substructure)

# check
df_peptide['Monomer_Length_in_Main_Chain'].to_list() == substructure_num

In [None]:
# Save substructure table
if not os.path.exists(data_args['substructures_table_path']):
    pd.concat([df_peptide[['CycPeptMPDB_ID', 'Source', 'Year', 'Original_Name_in_Source_Literature', \
                           'Structurally_Unique_ID', 'Same_Peptides_ID', 'SMILES', 'HELM', \
                           'Monomer_Length', 'Monomer_Length_in_Main_Chain', 'Molecule_Shape', 'Permeability', \
                           'PAMPA', 'Caco2', 'MDCK', 'RRCK']],
               pd.DataFrame(substructure_list, columns=[f'Substructure-{i}' for i in range(1, data_args['monomer_max_len']+1)])], axis=1).to_csv(data_args['substructures_table_path'], index=False)