In [1]:
import json
import os
import pandas as pd
import numpy as np
from rdkit import Chem

from utils import utils_function
from utils import calculate_descriptors
from utils import generate_conformation

In [2]:
# Load the CycPeptMP settings (JSON). `config` is passed to the SMILES
# enumeration, conformation generation and descriptor-merge helpers below.
config_path = 'config/CycPeptMP.json'
# Use a context manager so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open(config_path, 'r') as config_file:
    config = json.load(config_file)

In [3]:
# Example cyclic peptide drugs, without experimentally determined permeability.
# Anidulafungin, Pasireotide
# NOTE: 365, 376
new_data = pd.read_csv('data/new_data/new_data.csv')

# Check duplicates: report every new peptide whose canonicalized SMILES already
# appears in the 'SMILES' column of the CycPeptMPDB table.
old_data = pd.read_csv('data/CycPeptMPDB_Peptide_All.csv', low_memory=False)
# Build the lookup once. The previous version rebuilt a Python list from the
# whole column for every peptide and searched it linearly.
known_smiles = set(old_data['SMILES'])
for i, smiles in enumerate(new_data['SMILES']):
    if utils_function.canonicalize_smiles(smiles) in known_smiles:
        # `i` is the positional row index in new_data; ID_org is the user's own identifier.
        print(f'Your peptide: {i} ({new_data.iloc[i]["ID_org"]}) is already in the database.')

### 0. Divide peptide into monomers (substructures)
+ Cleaves the __peptide bonds__ and __ester bonds__ in the __main chain__ to split the peptide into monomers.
+ The cleaved amide group or O atom of the amide-to-ester substitution was methylated (addition of CH3), and the carboxyl group was converted to an aldehyde (addition of H).
+ __Disulfide bonds__ are not present in the CycPeptMPDB data, but it may be better to treat them as division targets as well.
+ __Bonds in side-chain__ are not subject to division to fully represent the side-chain properties.

In [None]:
# Save unique monomers
# Takes the `new_data` peptide table loaded above and a CSV path.
# 'data/new_data/unique_monomer.csv' is read back as `df_monomer` in the
# monomer conformation step, so this call presumably writes that file
# (confirm in utils_function.get_unique_monomer).
utils_function.get_unique_monomer(new_data, 'data/new_data/unique_monomer.csv')

### 1. Generate different peptide SMILES representations by SMILES enumeration as atom-level data augmentation

In [40]:
# Enumerate alternative SMILES strings for the peptides in `new_data`; `config`
# is passed through to the helper. 'data/new_data/enum_smiles.csv' is loaded as
# `df_enu` in the conformation step below, so this call presumably writes it
# (confirm in utils_function.enumerate_smiles).
utils_function.enumerate_smiles(new_data, config, 'data/new_data/enum_smiles.csv')

100%|██████████| 2/2 [00:00<00:00, 10.60it/s]


### 2. Generate 3D conformations for peptide and monomer

+ Peptide

In [52]:
# Create the SDF output directory. makedirs(..., exist_ok=True) keeps this cell
# re-runnable (os.mkdir raised FileExistsError on the second run) and also
# creates the parent 'sdf/' directory when it does not exist yet.
os.makedirs('sdf/new_data/', exist_ok=True)

# Enumerated peptide SMILES from step 1.
df_enu = pd.read_csv('data/new_data/enum_smiles.csv')

# WARNING: If there is too much data, you can manually split it into multiple files for parallel computation.
# For example:
# sub_file_num = 10
# sub_file_len = len(df_enu) // sub_file_num
# for i in range(sub_file_num):
#     df_enu.iloc[i*sub_file_len:(i+1)*sub_file_len].to_csv(f'sdf/new_data/peptide_{i}.csv', index=False)
# NOTE: with the integer division above, the last len(df_enu) % sub_file_num rows
# are not written to any sub-file; handle the remainder separately.

# Generate peptide conformations for every row of `df_enu`; the resulting
# 'sdf/new_data/peptide.sdf' is read back in the Mordred 3D descriptor step.
generate_conformation.generate_peptide_conformation(config, df_enu, 'sdf/new_data/peptide.sdf')

  0%|          | 0/120 [00:00<?, ?it/s]

100%|██████████| 120/120 [03:25<00:00,  1.71s/it]


+ Monomer

In [122]:
# Load the unique-monomer table saved in step 0. `df_monomer` is reused by the
# monomer descriptor cells below.
df_monomer = pd.read_csv('data/new_data/unique_monomer.csv')
# Generate monomer conformations; 'sdf/new_data/monomer.sdf' is read back in the
# Mordred 3D descriptor step.
# NOTE(review): presumably relies on 'sdf/new_data/' already existing (it is
# created in the peptide cell above) — confirm whether the helper creates it.
generate_conformation.generate_monomer_conformation(config, df_monomer, 'sdf/new_data/monomer.sdf')

100%|██████████| 11/11 [00:38<00:00,  3.46s/it]


### 3. Calculate 2D and 3D descriptors for peptide and monomer

#### 3.1. RDKit (208 types 2D descriptors)

In [133]:
# peptide
# RDKit descriptors for the original peptide SMILES in `new_data`
# (not the enumerated ones); the second argument is the CSV path under 'desc/new_data/'.
# NOTE(review): presumably 'desc/new_data/' must already exist — confirm.
calculate_descriptors.calc_rdkit_descriptors(new_data['SMILES'].tolist(), 'desc/new_data/peptide_rdkit.csv')

100%|██████████| 2/2 [00:00<00:00,  6.33it/s]


In [136]:
# monomer
# RDKit descriptors for the monomer SMILES in `df_monomer` (loaded in the
# monomer conformation cell); the second argument is the CSV path under 'desc/new_data/'.
calculate_descriptors.calc_rdkit_descriptors(df_monomer['SMILES'].tolist(), 'desc/new_data/monomer_rdkit.csv')

100%|██████████| 11/11 [00:00<00:00, 37.17it/s]


#### 3.2. Mordred (1275 types 2D descriptors + 51 types 3D descriptors)

+ 2D

In [138]:
# peptide
# Mordred 2D descriptors for the original peptide SMILES in `new_data`;
# the second argument is the CSV path under 'desc/new_data/'.
calculate_descriptors.calc_mordred_2Ddescriptors(new_data['SMILES'].tolist(), 'desc/new_data/peptide_mordred_2D.csv')

100%|██████████| 2/2 [00:01<00:00,  1.05it/s]


In [139]:
# monomer
# Mordred 2D descriptors for the monomer SMILES in `df_monomer` (loaded in the
# monomer conformation cell); the second argument is the CSV path under 'desc/new_data/'.
calculate_descriptors.calc_mordred_2Ddescriptors(df_monomer['SMILES'].tolist(), 'desc/new_data/monomer_mordred_2D.csv')

100%|██████████| 11/11 [00:01<00:00,  8.81it/s]


+ 3D

In [142]:
# peptide
# Read the peptide conformations generated in step 2 and compute Mordred 3D
# descriptors from them.
# NOTE(review): RDKit's SDMolSupplier yields None for records it cannot parse —
# confirm that calc_mordred_3Ddescriptors tolerates such entries.
mols = Chem.SDMolSupplier('sdf/new_data/peptide.sdf')
calculate_descriptors.calc_mordred_3Ddescriptors(mols, 'desc/new_data/peptide_mordred_3D.csv')

100%|██████████| 120/120 [00:32<00:00,  3.70it/s]


In [143]:
# monomer
# Read the monomer conformations generated in step 2 and compute Mordred 3D
# descriptors from them. This rebinds `mols`, replacing the peptide supplier
# from the previous cell.
mols = Chem.SDMolSupplier('sdf/new_data/monomer.sdf')
calculate_descriptors.calc_mordred_3Ddescriptors(mols, 'desc/new_data/monomer_mordred_3D.csv')

100%|██████████| 660/660 [00:32<00:00, 20.02it/s] 


#### 3.3. MOE (206 types 2D descriptors + 117 types 3D descriptors)
+ CycPeptMP used the commercial software __MOE__ to calculate some of the descriptors.
+ In particular, many of the selected 3D descriptors were computed by MOE.
+ Please calculate these descriptors manually. __utils/MOE_3D_descriptors.sh__ is provided as an example.
+ For 2D descriptors:
    + Please wash SMILES and use washed mols for calculation.
        + for GUI: Molecule -> Wash -> Protonation: Dominant
+ For 3D descriptors:
    + First, please calculate the charge for the RDKit conformations.
        + for GUI: Compute -> Molecule -> Partial Charges
    + 21 MOPAC descriptors of the 3D descriptors were not computed due to computational cost (AM_x, MNDO_, PM3_x)

#### 3.4. Concatenate descriptor files

In [12]:
# Merge the descriptor files produced in sections 3.1-3.3.
# Arguments: `config`, the descriptor directory and the data directory —
# presumably reads from 'desc/new_data/' and writes into 'data/new_data/';
# confirm in calculate_descriptors.merge_descriptors.
# NOTE(review): presumably expects the manually computed MOE descriptor files
# (section 3.3) to be present as well — confirm before running.
calculate_descriptors.merge_descriptors(config, 'desc/new_data/', 'data/new_data/')

### 4. Generate input for three sub-models

+ Atom model

In [None]:
# ! Z-score