In [15]:
import json
import os
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
from rdkit import Chem

from utils import utils_function

In [16]:
import importlib
importlib.reload(utils_function)

<module 'utils.utils_function' from 'c:\\Users\\yoooo\\OneDrive\\桌面\\cycpeptmp\\utils\\utils_function.py'>

In [43]:
config_path = 'config/CycPeptMP.json'
config = json.load(open(config_path,'r'))
data_args = config['data']

### 1. Divide peptide into monomers (substructures)
+  Divides __peptide bonds__ and __ester bonds__ (__disulfide bonds__ are not included in CycPeptMPDB data) in the main chain and splits them into monomers.
+ __Side-chain bonds__ were not subject to division to fully represent the side-chain properties.

In [26]:
df_peptide = pd.read_csv(data_args['org_peptide_path'], low_memory=False)
df_monomer = pd.read_csv(data_args['org_monomer_path'], low_memory=False)

smiles = df_peptide['SMILES'].tolist()
shape = df_peptide['Molecule_Shape'].to_list()
helm = df_peptide['HELM'].to_list()
symbol_to_smiles = dict(zip(df_monomer['Symbol'], df_monomer['capped_SMILES']))
symbol_to_cxsmiles = dict(zip(df_monomer['Symbol'], df_monomer['CXSMILES']))
R3_dict = dict(zip(df_monomer['Symbol'], df_monomer['R3']))
smiles_to_symbol = dict(zip(df_monomer['capped_SMILES'], df_monomer['Symbol']))

In [29]:
import re

substructure_list, substructure_num = [], []

for i in range(len(df_peptide)):

    now_substructure = []
    now_seq = helm[i].split('$')[0].split('{')[1].replace('}', '').replace('[', '').replace(']', '').split('.')

    if shape[i] == 'Circle':
        now_substructure = [symbol_to_smiles[_] for _ in now_seq]
    elif shape[i] == 'Lariat':
        # NOTE: Lariat, do not divide bonds of side chain
        atts = helm[i].split('$')[1].split(',')[2].split('-')
        atts_num = [int(_.split(':')[0]) for _ in atts]
        atts_R = [_.split(':')[1] for _ in atts]

        # PEPTIDE48{A.A.L.[meV].L.F.F.P.I.T.G.D.[-pip]}$PEPTIDE48,PEPTIDE48,1:R1-12:R3$$$
        if atts_num[0] == 1:
            # NOTE: This case were all R1-R3
            # if atts_R[0] != 'R1':
            #     print(f'{i}, 0, {atts_R[0]}')
            # elif atts_R[1] != 'R3':
            #     print(f'{i}, 1, {atts_R[1]}')

            now_substructure = [symbol_to_smiles[_] for _ in now_seq[:atts_num[1]-1]]
            # monomers to combine
            cxsmiles = [symbol_to_cxsmiles[_] for _ in now_seq[atts_num[1]-1:]]
            # NOTE: 第一个cap两处(R1, R3), side chain不cap
            tmp = cxsmiles[0].split(' |')[0]
            for _ in re.findall('_R\d', cxsmiles[0]):
                if _ == '_R1':
                    tmp = tmp.replace('[*]', '[CH3]', 1)
                elif _ == '_R2':
                    tmp = tmp.replace('[*]', '[2C]', 1)
                elif _ == '_R3':
                    if R3_dict[now_seq[atts_num[1]-1]] == 'H':
                        tmp = tmp.replace('[*]', '[CH3]', 1)
                    elif R3_dict[now_seq[atts_num[1]-1]] == 'OH':
                        tmp = tmp.replace('[*]', '[H]', 1)
            cxsmiles[0] = tmp

            combined = utils_function.combine_cxsmiles(cxsmiles, now_seq[atts_num[1]-1:], R3_dict)
            now_substructure.append(combined)

        # PEPTIDE959{[Mono22-].G.T.[Mono23].[Mono24].[dLeu(3R-OH)].[dSer(Me)].G.A.[meT].[dTyr(bR-OMe)].[Mono25]}$PEPTIDE959,PEPTIDE959,6:R3-12:R2$$$
        else:
            # NOTE: This case were all R3-R2
            # if atts_R[0] != 'R3':
            #     print(f'{i}, 0, {atts_R[0]}')
            # elif atts_R[1] != 'R2':
            #     print(f'{i}, 1, {atts_R[1]}')
            cxsmiles = [symbol_to_cxsmiles[_] for _ in now_seq[:atts_num[0]]]
            # NOTE: 最后一个cap两处(R2, R3), side chain不cap
            tmp = cxsmiles[-1].split(' |')[0]
            for _ in re.findall('_R\d', cxsmiles[-1]):
                if _ == '_R1':
                    tmp = tmp.replace('[*]', '[1C]', 1)
                elif _ == '_R2':
                    tmp = tmp.replace('[*]', '[H]', 1)
                elif _ == '_R3':
                    if R3_dict[now_seq[atts_num[0]-1]] == 'H':
                        tmp = tmp.replace('[*]', '[CH3]', 1)
                    elif R3_dict[now_seq[atts_num[0]-1]] == 'OH':
                        tmp = tmp.replace('[*]', '[H]', 1)
            cxsmiles[-1] = tmp

            combined = utils_function.combine_cxsmiles(cxsmiles, now_seq[:atts_num[0]], R3_dict)
            now_substructure.append(combined)
            now_substructure += [symbol_to_smiles[_] for _ in now_seq[atts_num[0]:]]

    substructure_num.append(len(now_substructure))
    if len(now_substructure) < data_args['monomer_max_len']:
        now_substructure += [''] * (data_args['monomer_max_len'] - len(now_substructure))
    substructure_list.append(now_substructure)

# check
df_peptide['Monomer_Length_in_Main_Chain'].to_list() == substructure_num

True

In [38]:
# Save substructure table
if not os.path.exists(data_args['substructures_table_path']):
    pd.concat([df_peptide[['CycPeptMPDB_ID', 'Source', 'Year', 'Original_Name_in_Source_Literature', \
                        'Structurally_Unique_ID', 'Same_Peptides_ID', 'SMILES', 'HELM', \
                            'Monomer_Length', 'Monomer_Length_in_Main_Chain', 'Molecule_Shape', 'Permeability', \
                            'PAMPA', 'Caco2', 'MDCK', 'RRCK']],
        pd.DataFrame(substructure_list, columns=[f'Substructure-{i}' for i in range(1, data_args['monomer_max_len']+1)])], axis=1).to_csv(data_args['substructures_table_path'], index=False)

In [44]:
# Save unique substructures for descriptor calculation

if not os.path.exists(data_args['unique_substructures_path']):
    unique_substructure = list(set(sum(substructure_list, [])))[1:]
    unique_substructure_mw = [Chem.rdMolDescriptors._CalcMolWt(Chem.MolFromSmiles(_)) for _ in unique_substructure]
    df_substructure =  pd.DataFrame([unique_substructure, unique_substructure_mw], index=['SMILES', 'MolWt']).T
    df_substructure = df_substructure.sort_values('MolWt').reset_index(drop=True)

    tmp = []
    i = 1
    for _ in df_substructure['SMILES'].to_list():
        if _ in smiles_to_symbol:
            tmp.append(smiles_to_symbol[_])
        else:
            tmp.append(f'Sub{i}')
            i += 1
    df_substructure.insert(0, 'Symbol', tmp)
    df_substructure.insert(0, 'ID', [i+1 for i in range(len(df_substructure))])
    df_substructure.to_csv(data_args['unique_substructures_path'], index=False)

In [45]:
df_substructure

Unnamed: 0,ID,Symbol,SMILES,MolWt
0,1,G,CNCC=O,73.095
1,2,dA,CN[C@H](C)C=O,87.122
2,3,A,CN[C@@H](C)C=O,87.122
3,4,Sar,CN(C)CC=O,87.122
4,5,Bal,CNCCC=O,87.122
...,...,...,...,...
382,383,Sub87,CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)N(C)C(=O)[C...,828.109
383,384,Sub88,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,832.528
384,385,Sub89,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,850.502
385,386,Sub90,CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H]...,860.582


## 2. Generate 60 3D conformations per peptides/monomers

#### For peptides, different SMILES representations are first generated by __SMILES enumeration__.

In [79]:
config_path = 'config/PPB.json'
config = json.load(open(config_path,'r'))
data_args = config['data']
aug_args = config['augmentation']

REPLICA_NUM = aug_args['replica_num']

df_peptide = pd.read_csv("data/pep_sequence.csv", low_memory=False)

In [81]:
from utils import utils_function
from utils import SmilesEnumerator

sme = SmilesEnumerator.SmilesEnumerator()

id = df_peptide['ID'].tolist()
smiles = df_peptide['SMILES'].tolist()
# canonical smiles
smiles = [utils_function.canonicalize_smiles(_) for _ in smiles]
label = df_peptide[data_args['target_name']].tolist()

enu_id = [_ for _  in id for i in range(REPLICA_NUM)]
enu_label = [_ for _  in label for i in range(REPLICA_NUM)]
enu_smi = []

for i in tqdm(range(len(df_peptide))):
    for j in range(REPLICA_NUM):
        if j == 0:
            enu_smi.append(smiles[i])
        else:
            now_smi = sme.randomize_smiles(smiles[i])
            count = 0
            # NOTE If a new SMILES is not generated after 1000 times, save the duplicated one.
            while now_smi in enu_smi:
                if count >= aug_args['sme_dup_thresh']:
                    break
                now_smi = sme.randomize_smiles(smiles[i])
                count += 1
            enu_smi.append(now_smi)

df_enu = pd.DataFrame([enu_id, enu_smi, enu_label], index=['ID', 'SMILES', 'y']).T
if not os.path.exists('input/enumerated_smiles.csv'):
    df_enu.to_csv('input/enumerated_smiles.csv', index=False)
else:
    print('enumerated_smiles.csv already exists.')

100%|██████████| 380/380 [00:41<00:00,  9.11it/s]


#### Conformation generation using RDKit

In [None]:
# WARNING sdfは公開しない?

##### peptide

In [10]:
# from utils import confgene
# config_path = 'config/PPB.json'
# config = json.load(open(config_path,'r'))

# mol_type = 'peptide'
# sub_file_num = config['conformation']['sub_file_num']

# df_enu = pd.read_csv('input/enumerated_smiles.csv', low_memory=False)

# Split into multiple files for parallel computation.
# sub_file_len = len(df_enu) // sub_file_num
# for i in range(sub_file_num):
#     df_enu.iloc[i*sub_file_len:(i+1)*sub_file_len].to_csv(f'sdf/{mol_type}_{i}.csv', index=False)

In [7]:
# Run conformation generation confgene.py in parallel by yourself.

# for sub in range(sub_file_num):
    # df_exp = pd.read_csv(f'sdf/{mol_type}_{sub}.csv', low_memory=False)
    # confgene.peptide_conformation_genetation(config, df_exp, mol_type, sub)

100%|██████████| 2280/2280 [2:19:05<00:00,  3.66s/it]  
100%|██████████| 2280/2280 [2:10:19<00:00,  3.43s/it]  
100%|██████████| 2280/2280 [26:35<00:00,  1.43it/s]


In [15]:
# # Check conformation nubmer
# from rdkit import Chem

# for sub in range(sub_file_num):
#     with open(f'sdf/{mol_type}_{sub}.sdf', 'rb') as f:
#         fsuppl = Chem.ForwardSDMolSupplier(f, removeHs=True)
#         mols = [mol for mol in fsuppl if mol is not None]
#         del fsuppl
#     f.close()
#     if sub != sub_file_num-1:
#         if len(mols) != sub_file_len:
#             print(sub, len(mols))
#     else:
#         if len(mols) != len(df_enu) - sub_file_len*(sub_file_num-1):
#             print(sub, len(mols))

##### monomer

In [2]:
# from utils import confgene
# config_path = 'config/PPB.json'
# config = json.load(open(config_path,'r'))

# mol_type = 'monomer'

# df = pd.read_csv("data/monomer_list.csv", low_memory=False)

# confgene.monomer_conformation_genetation(config, df, mol_type)

100%|██████████| 126/126 [03:18<00:00,  1.57s/it]


## 3. Calculate 2D and 3D descriptors for peptides and monomers

In [45]:
from utils import utils_function

config_path = 'config/PPB.json'
config = json.load(open(config_path,'r'))
sub_file_num = config['conformation']['sub_file_num']

df_peptide = pd.read_csv("data/pep_sequence.csv", low_memory=False)
df_monomer = pd.read_csv("data/monomer_list.csv", low_memory=False)

#### RDKit (208 types 2D descriptors)

In [12]:
utils_function.calc_rdkit_descriptors(df_peptide['SMILES'].tolist(), 'peptide')

100%|██████████| 380/380 [01:07<00:00,  5.67it/s]


In [13]:
utils_function.calc_rdkit_descriptors(df_monomer['SMILES'].tolist(), 'monomer')

100%|██████████| 126/126 [00:00<00:00, 146.53it/s]


#### Mordred (1275 types 2D descriptors + 51 types 3D descriptors)
+ Some descriptors cannot be computed when using NumPy 1.20 or later versions.

+ 2D

In [15]:
utils_function.calc_mordred_2Ddescriptors(df_peptide['SMILES'].tolist(), 'peptide')

100%|██████████| 380/380 [00:32<00:00, 11.64it/s]


In [16]:
utils_function.calc_mordred_2Ddescriptors(df_monomer['SMILES'].tolist(), 'monomer')

100%|██████████| 126/126 [00:01<00:00, 104.93it/s]


+ 3D

In [18]:
utils_function.calc_mordred_3Ddescriptors('monomer')

100%|██████████| 7560/7560 [00:17<00:00, 430.79it/s]


In [22]:
for sub in range(sub_file_num):
    utils_function.calc_mordred_3Ddescriptors('peptide', sub)

100%|██████████| 2280/2280 [00:56<00:00, 40.31it/s]
100%|██████████| 2280/2280 [00:52<00:00, 43.35it/s]
100%|██████████| 2280/2280 [00:30<00:00, 74.75it/s]
100%|██████████| 2280/2280 [00:56<00:00, 40.11it/s]
100%|██████████| 2280/2280 [00:46<00:00, 48.82it/s]
100%|██████████| 2280/2280 [00:36<00:00, 61.90it/s]
100%|██████████| 2280/2280 [00:44<00:00, 50.96it/s]
100%|██████████| 2280/2280 [00:35<00:00, 64.97it/s]
100%|██████████| 2280/2280 [01:03<00:00, 36.06it/s]
100%|██████████| 2280/2280 [00:37<00:00, 61.25it/s] 


#### MOE (206 types 2D descriptors + 117 types 3D descriptors)
+ CycPeptMP used the commercial software __MOE__ to calculate some of the descriptors. In particular, many of the selected 3D descriptors were computed by __MOE__.
+ Please manualy calculate these descriptors. I showed __MOE_3D_descriptors.sh__ as an example.
+ For 2D descriptors:
    + Please wash SMILES and use washed mols for calculation.
        + for GUI: Molecule -> Wash -> Protonation: Dominant
+ For 3D descriptors:
    + First, please calculate the charge against the RDKit conformation.
        + for GUI: Compute -> Molecule -> Partial Charges
    + 21 MOPAC descriptors of the 3D descriptors were not computed due to computational cost (AM_x, MNDO_x, PM3_x)
+ If you cannot compute them, please exclude the MOE part from the after procedures.

#### Concatenation

+ 2D

In [34]:
for mol_type in ['peptide', 'monomer']:
    df_moe = pd.read_csv(f'desc/{mol_type}_moe_2D.csv')
    df = df_moe.iloc[:, :df_moe.columns.to_list().index('apol')].copy()
    df_moe = df_moe.iloc[:, df_moe.columns.to_list().index('apol'):].select_dtypes('number')

    df_rdkit = pd.read_csv(f'desc/{mol_type}_rdkit.csv').select_dtypes('number')
    name_dup = []
    for _ in df_rdkit.columns:
        if _ in df_moe.columns.to_list():
            name_dup.append(_)
    name_dup = dict(zip(name_dup, [_+'_rdkit' for _ in name_dup]))
    df_rdkit = df_rdkit.rename(columns=name_dup)

    df_mordred = pd.read_csv(f'desc/{mol_type}_mordred_2D.csv').select_dtypes('number')
    name_dup = []
    for _ in df_mordred.columns:
        if _ in df_moe.columns.to_list():
            name_dup.append(_)
    name_dup = dict(zip(name_dup, [_+'_mordred' for _ in name_dup]))
    df_mordred = df_mordred.rename(columns=name_dup)

    df = pd.concat([df, df_moe, df_rdkit, df_mordred], axis=1)
    df.to_csv(f'input/{mol_type}_2D.csv', index=False)

+ 3D

In [4]:
config_path = 'config/PPB.json'
config = json.load(open(config_path,'r'))

for mol_type in ['peptide', 'monomer']:
    if mol_type == 'peptide':
        for sub in range(config['conformation']['sub_file_num']):
            if sub == 0:
                df_moe = pd.read_csv(f'desc/{mol_type}_moe_3D_{sub}.csv')
                df_mordred = pd.read_csv(f'desc/{mol_type}_mordred_3D_{sub}.csv')
            else:
                df_moe = pd.concat([df_moe, pd.read_csv(f'desc/{mol_type}_moe_3D_{sub}.csv')], axis=0)
                df_mordred = pd.concat([df_mordred, pd.read_csv(f'desc/{mol_type}_mordred_3D_{sub}.csv')], axis=0)
        df_moe = df_moe.reset_index(drop=True)
        df_mordred = df_mordred.reset_index(drop=True)
    elif mol_type == 'monomer':
        df_moe = pd.read_csv(f'desc/{mol_type}_moe_3D.csv')
        df_mordred = pd.read_csv(f'desc/{mol_type}_mordred_3D.csv')

    df = df_moe.iloc[:, :df_moe.columns.to_list().index('ASA')].copy()
    df_moe = df_moe.iloc[:, df_moe.columns.to_list().index('ASA'):].select_dtypes('number')

    name_dup = []
    for _ in df_mordred.columns:
        if _ in df_moe.columns.to_list():
            name_dup.append(_)
    name_dup = dict(zip(name_dup, [_+'_mordred' for _ in name_dup]))
    df_mordred = df_mordred.rename(columns=name_dup).select_dtypes('number')

    if mol_type == 'peptide':
        df_enu = pd.read_csv('input/enumerated_smiles.csv', low_memory=False)
        df = pd.concat([df_enu, df, df_moe, df_mordred], axis=1)
    elif mol_type == 'monomer':
        df_monomer = pd.read_csv("data/monomer_list.csv", low_memory=False)
        df_monomer = df_monomer.iloc[df_monomer.index.repeat(config['augmentation']['replica_num'])].reset_index(drop=True)
        df = pd.concat([df_monomer, df, df_moe, df_mordred], axis=1)

    df.to_csv(f'input/{mol_type}_3D.csv', index=False)

### 0. Select only PAMPA entries of CycPeptMPDB and remove duplicate structures

In [None]:
# WARNING ここで重複をなし、目的変数の上限下限を丸める

data_args = config['data']

# All data from CycPeptMPDB
df = pd.read_csv(data_args['org_data_path'], low_memory=False)




In [7]:
config_path = 'config/PPB.json'
config = json.load(open(config_path,'r'))
valid_args = config['validation']

## 4. Split validation and test sets

+ Splie Test set by Kennard–Stone algorithm.

In [8]:
# OPTIMIZE

df = pd.read_csv("data/pep_sequence.csv", low_memory=False)

test_index = [0, 1, 19, 24, 33, 34, 41, 42, 44, 52, 55, 58, 71, 77, 107, 109, 135, 168, 186, 194, 211, 218, 259, 273, 284, 306, 310, 311, 312, 314, 316, 319, 343, 350, 357, 358, 361]
np.save('data/validation/Test_index.npy', test_index)
drugbank_index = [_ for _ in range(363, 380)]
np.save('data/validation/DrugBank_index.npy', drugbank_index)

test_ids = [_+1 for _ in test_index]
np.save('data/validation/Test_ids.npy', test_ids)
drugbank_ids = [_+1 for _ in drugbank_index]
np.save('data/validation/DrugBank_ids.npy', drugbank_ids)

train_valid_index = df.drop(test_index+drugbank_index).index

+ Split validation sets from the rest so that there is no duplication between validation sets.

In [23]:
train_valid_ids = df.iloc[train_valid_index]['ID'].to_list()

for cv, cv_seed in zip(range(valid_args['cv']), valid_args['cv_seed']):
    random.seed(cv_seed)
    if cv == 0:
        tmp_ids = train_valid_ids

    valid_ids = sorted(random.sample(tmp_ids, len(test_ids)))
    train_ids = sorted(list(set(train_valid_ids) - set(valid_ids)))
    np.save(f'data/validation/Train_ids_cv{cv}.npy', train_ids)
    np.save(f'data/validation/Valid_ids_cv{cv}.npy', valid_ids)

    valid_index = df[df['ID'].isin(valid_ids)].index.to_list()
    train_index = df[df['ID'].isin(train_ids)].index.to_list()
    np.save(f'data/validation/Train_index_cv{cv}.npy', train_index)
    np.save(f'data/validation/Valid_index_cv{cv}.npy', valid_index)

    # update tmp_ids
    tmp_ids = sorted(list(set(tmp_ids) - set(valid_ids)))

## 5. Descriptors selection

In [23]:
from utils import utils_function
from sklearn.ensemble import RandomForestRegressor

config_path = 'config/PPB.json'
config = json.load(open(config_path,'r'))

+ 2D

In [16]:
df = pd.read_csv('input/peptide_2D.csv', low_memory=False)
# OPTIMIZE target_name
label_list = df[config['data']['target_name']].to_numpy()
smiles_list = df['SMILES'].to_numpy()
df_2D = df.iloc[:, df.columns.to_list().index('apol'):].copy()

In [21]:
# # Deleted by standard deviation: 307
# # Deleted by similarity: 1048
# # Feature map shape: (380, 335)

# features_delete_std, features_delete_std_R, data_preprocessed = \
#   utils_function.entire_preprocessing(df_2D, label_list, threshold=config['feature_selection']['similarity_thresh'])
# np.savez_compressed('input/peptide_selected_2D.npz',
#                     features_delete_std=features_delete_std,
#                     features_delete_std_R=features_delete_std_R,
#                     features_use=data_preprocessed.columns.to_list(),
#                     data_preprocessed=data_preprocessed.values)

load = np.load('input/peptide_selected_2D.npz')
data_preprocessed_2D = pd.DataFrame(load['data_preprocessed'], columns=load['features_use'])

In [24]:
# Select by RF for 3-cv
importances = []
for cv in range(config['validation']['cv']):
    indices = np.load(f'data/validation/Train_index_cv{cv}.npy')
    x = data_preprocessed_2D.iloc[indices]
    y = label_list[indices]
    RF = RandomForestRegressor(n_estimators=500, random_state=233, n_jobs=12)
    RF.fit(x, y)
    importances.append(RF.feature_importances_)

importances = np.array(importances).mean(axis=0)

df_tmp = pd.DataFrame(importances, index=x.columns, columns=['importances']).sort_values('importances', ascending=False)[:15].reset_index()

In [25]:
df_tmp

Unnamed: 0,index,importances
0,logP(o/w),0.302166
1,PEOE_VSA-1,0.140975
2,AMID_h,0.028466
3,AATSC0c,0.027915
4,BIC1,0.027676
5,AATS4se,0.018925
6,MinEStateIndex,0.018824
7,MolLogP,0.018142
8,IC0,0.016368
9,VSA_EState3,0.01372


In [26]:
use_2D = ['logP(o/w)', 'PEOE_VSA-1']

+ 3D

In [27]:
df_3D = pd.read_csv('input/peptide_3D.csv', low_memory=False)
# Use the top conformation for selection
df_3D = df_3D.iloc[[config['augmentation']['replica_num']*_ for _ in range(len(df))]].reset_index(drop=True)
df
df_3D.shape
# (7337, 175)

(380, 174)

In [32]:
# # Deleted by standard deviation: 1
# # Deleted by similarity: 115
# # Feature map shape: (380, 52)

# features_delete_std, features_delete_std_R, data_preprocessed = \
#     utils_function.entire_preprocessing(df_3D.iloc[:, df_3D.columns.to_list().index('ASA'):].copy(), label_list, threshold=config['feature_selection']['similarity_thresh'])
# np.savez_compressed('input/peptide_selected_3D.npz',
#                     features_delete_std=features_delete_std,
#                     features_delete_std_R=features_delete_std_R,
#                     features_use=data_preprocessed.columns.to_list(),
#                     data_preprocessed=data_preprocessed.values)

load = np.load('input/peptide_selected_3D.npz')
data_preprocessed_3D = pd.DataFrame(load['data_preprocessed'], columns=load['features_use'])

In [34]:
# Select by RF for 3-cv
importances = []
for cv in range(config['validation']['cv']):
    indices = np.load(f'data/validation/Train_index_cv{cv}.npy')
    x = data_preprocessed_3D.iloc[indices]
    y = label_list[indices]
    RF = RandomForestRegressor(n_estimators=500, random_state=233, n_jobs=12)
    RF.fit(x, y)
    importances.append(RF.feature_importances_)

importances = np.array(importances).mean(axis=0)

df_tmp = pd.DataFrame(importances, index=x.columns, columns=['importances']).sort_values('importances', ascending=False)[:15].reset_index()

In [35]:
df_tmp

Unnamed: 0,index,importances
0,vsurf_CW1,0.170764
1,vsurf_CW3,0.160359
2,vsurf_CW2,0.096998
3,RPSA,0.058411
4,vsurf_D5,0.053362
5,vsurf_HL1,0.05222
6,vsurf_IW8,0.030033
7,TASA,0.026045
8,E_vdw,0.02173
9,FASA_P,0.020075


In [36]:
use_3D = ['vsurf_CW1', 'vsurf_CW3']

## 6. Model input generation

+ Atom

In [None]:


folder_path = f'Atom'
os.makedirs(folder_path, exist_ok=True)

## 7. Hyperparameter tuning