# 1. Library import

In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data

import time
import numpy as np
#import gc
#import sys
import pickle
import copy
import pandas as pd
# from AttentiveLayers_Viz import *
from Featurizer import *
from getFeatures import *

from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

from rdkit import Chem
from rdkit.Chem import QED
from numpy.polynomial.polynomial import polyfit
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib
from IPython.display import SVG, display
import seaborn as sns
sns.set(color_codes = True)

In [2]:
import sys
sys.version 

'3.7.16 (default, Jan 17 2023, 22:20:44) \n[GCC 11.2.0]'

# Attentive Layers

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device = torch.device('cuda:2')

class Fingerprint_viz(nn.Module):
    def __init__(self, radius, T, input_feature_dim, input_bond_dim,
                 fingerprint_dim, output_units_num, p_dropout):
        super(Fingerprint_viz, self).__init__()

        # Graph attention for atom embedding
        self.atom_fc = nn.Linear(input_feature_dim, fingerprint_dim)
        self.neighbor_fc = nn.Linear(input_feature_dim+input_bond_dim, fingerprint_dim)
        self.GRUCell = nn.ModuleList([nn.GRUCell(fingerprint_dim, fingerprint_dim) for r in range(radius)])
        self.align = nn.ModuleList([nn.Linear(2*fingerprint_dim, 1) for r in range(radius)])
        self.attend = nn.ModuleList([nn.Linear(fingerprint_dim, fingerprint_dim) for r in range(radius)])

        # Graph attention for molecule embedding
        self.mol_GRUCell = nn.GRUCell(fingerprint_dim, fingerprint_dim)
        self.mol_align = nn.Linear(2*fingerprint_dim, 1)
        self.mol_attend = nn.Linear(fingerprint_dim, fingerprint_dim)

        self.dropout = nn.Dropout(p=p_dropout)
        # self.output = nn.Linear(fingerprint_dim, output_units_num)

        self.radius = radius
        self.T = T

    def forward(self, atom_list, bond_list, atom_degree_list, bond_degree_list, atom_mask):
        atom_list = atom_list.to(device)
        bond_list = bond_list.to(device)
        atom_degree_list = atom_degree_list.to(device)
        bond_degree_list = bond_degree_list.to(device)
        atom_mask = atom_mask.to(device)
        
        atom_mask = atom_mask.unsqueeze(2)
        batch_size, mol_length, num_atom_feat = atom_list.size()
        atom_feature = F.relu(self.atom_fc(atom_list))
        
        atom_feature_viz = []
        atom_feature_viz.append(self.atom_fc(atom_list))
                                
        bond_neighbor = [bond_list[i][bond_degree_list[i]] for i in range(batch_size)]
        bond_neighbor = torch.stack(bond_neighbor, dim=0)
        atom_neighbor = [atom_list[i][atom_degree_list[i]] for i in range(batch_size)]
        atom_neighbor = torch.stack(atom_neighbor, dim=0)

        # Concatenate atom and bond features
        neighbor_feature = torch.cat([atom_neighbor, bond_neighbor], dim=-1)
        neighbor_feature = F.relu(self.neighbor_fc(neighbor_feature))

        # Generate mask to eliminate the influence of blank atoms
        attend_mask = atom_degree_list.clone()
        attend_mask[attend_mask != mol_length-1] = 1
        attend_mask[attend_mask == mol_length-1] = 0
        attend_mask = attend_mask.type(torch.FloatTensor).unsqueeze(-1).to(device)

        softmax_mask = atom_degree_list.clone()
        softmax_mask[softmax_mask != mol_length-1] = 0
        softmax_mask[softmax_mask == mol_length-1] = -9
        softmax_mask = softmax_mask.type(torch.FloatTensor).unsqueeze(-1).to(device)

        batch_size, mol_length, max_neighbor_num, fingerprint_dim = neighbor_feature.shape
        atom_feature_expand = atom_feature.unsqueeze(-2).expand(batch_size, mol_length, max_neighbor_num, fingerprint_dim)
        feature_align = torch.cat([atom_feature_expand, neighbor_feature], dim=-1)

        align_score = F.relu(self.align[0](self.dropout(feature_align)))
        align_score = align_score + softmax_mask
        attention_weight = F.softmax(align_score, -2)
        attention_weight = attention_weight * attend_mask
        
        atom_attention_weight_viz = []
        atom_attention_weight_viz.append(attention_weight)
                                
                                
                                
        neighbor_feature_transform = self.attend[0](self.dropout(neighbor_feature))
        context = torch.sum(torch.mul(attention_weight, neighbor_feature_transform), -2)
        context = F.relu(context)
        context_reshape = context.view(batch_size * mol_length, fingerprint_dim)
        atom_feature_reshape = atom_feature.view(batch_size * mol_length, fingerprint_dim)
        atom_feature_reshape = self.GRUCell[0](context_reshape, atom_feature_reshape)
        atom_feature = atom_feature_reshape.view(batch_size, mol_length, fingerprint_dim)

        activated_features = F.relu(atom_feature)
        atom_feature_viz.append(activated_features)
                                
                                
        for d in range(1, self.radius):
            neighbor_feature = [activated_features[i][atom_degree_list[i]] for i in range(batch_size)]
            neighbor_feature = torch.stack(neighbor_feature, dim=0)
            atom_feature_expand = activated_features.unsqueeze(-2).expand(batch_size, mol_length, max_neighbor_num, fingerprint_dim)

            feature_align = torch.cat([atom_feature_expand, neighbor_feature], dim=-1)

            align_score = F.relu(self.align[d](self.dropout(feature_align)))
            align_score = align_score + softmax_mask
            attention_weight = F.softmax(align_score, -2)
            attention_weight = attention_weight * attend_mask
            
            atom_attention_weight_viz.append(attention_weight)
                                    
            neighbor_feature_transform = self.attend[d](self.dropout(neighbor_feature))
            context = torch.sum(torch.mul(attention_weight, neighbor_feature_transform), -2)
            context = F.relu(context)
            context_reshape = context.view(batch_size * mol_length, fingerprint_dim)
            atom_feature_reshape = self.GRUCell[d](context_reshape, atom_feature_reshape)
            atom_feature = atom_feature_reshape.view(batch_size, mol_length, fingerprint_dim)

            activated_features = F.relu(atom_feature)
            atom_feature_viz.append(activated_features)                    
            
        mol_feature_unbounded_viz = []
        mol_feature_unbounded_viz.append(torch.sum(atom_feature * atom_mask, dim = -2))
                                
        mol_feature = torch.sum(activated_features * atom_mask, dim=-2)
        activated_features_mol = F.relu(mol_feature)
        
        mol_feature_viz = []
        mol_feature_viz.append(mol_feature)
        
                                
        mol_attention_weight_viz = []
        mol_softmax_mask = atom_mask.clone()
        mol_softmax_mask[mol_softmax_mask == 0] = -9e8
        mol_softmax_mask[mol_softmax_mask == 1] = 0
        mol_softmax_mask = mol_softmax_mask.type(torch.FloatTensor).to(device)

        for t in range(self.T):
            mol_prediction_expand = activated_features_mol.unsqueeze(-2).expand(batch_size, mol_length, fingerprint_dim)
            mol_align = torch.cat([mol_prediction_expand, activated_features], dim=-1)
            mol_align_score = F.relu(self.mol_align(mol_align))
            mol_align_score = mol_align_score + mol_softmax_mask
            mol_attention_weight = F.softmax(mol_align_score, -2)
            mol_attention_weight = mol_attention_weight * atom_mask
            mol_attention_weight_viz.append(mol_attention_weight)
                                
            activated_features_transform = self.mol_attend(self.dropout(activated_features))
            mol_context = torch.sum(torch.mul(mol_attention_weight, activated_features_transform), -2)
            mol_context = F.relu(mol_context)
            mol_feature = self.mol_GRUCell(mol_context, mol_feature)
            mol_feature_unbounded_viz.append(mol_feature)

            activated_features_mol = F.relu(mol_feature)
            mol_feature_viz.append(activated_features_mol)

        mol_prediction = mol_feature

        return atom_feature, atom_feature_viz, atom_attention_weight_viz, mol_feature_viz, mol_feature_unbounded_viz, mol_attention_weight_viz, mol_prediction

        
       

In [4]:
class ExtendedFingerprint_viz(Fingerprint_viz):
    def __init__(self, radius, T, input_feature_dim, input_bond_dim, fingerprint_dim, output_units_num, p_dropout, physicochemical_feature_dim, physicochemical_feature_dim_1, physicochemical_feature_dim_2, final1_fc1, final1_fc2):
        super().__init__(radius, T, input_feature_dim, input_bond_dim, fingerprint_dim, output_units_num, p_dropout)
        self.physicochemical_fc = nn.Linear(physicochemical_feature_dim, physicochemical_feature_dim_1)
        self.physicochemical_bn = nn.BatchNorm1d(physicochemical_feature_dim_1)
        self.physicochemical_fc2 = nn.Linear(physicochemical_feature_dim_1, fingerprint_dim)
        self.physicochemical_bn2 = nn.BatchNorm1d(fingerprint_dim)
        self.final1_fc = nn.Linear(fingerprint_dim*2, final1_fc1)
        self.bn_final1 = nn.BatchNorm1d(final1_fc1) # Added BatchNorm Layer

        self.final2_fc = nn.Linear(final1_fc1, final1_fc2)
        self.bn_final2 = nn.BatchNorm1d(final1_fc2) # Added BatchNorm Layer
        self.final3_fc = nn.Linear(final1_fc2, output_units_num)

    def forward(self, atom_list, bond_list, atom_degree_list, bond_degree_list, atom_mask, physicochemical_features):

        physicochemical_features = physicochemical_features.to(device)

        atom_feature, atom_feature_viz, atom_attention_weight_viz, mol_feature_viz, mol_feature_unbounded_viz, mol_attention_weight_viz, mol_prediction = super().forward(atom_list, bond_list, atom_degree_list, bond_degree_list, atom_mask)
        
        processed_physicochemical_features = F.relu(self.physicochemical_fc(physicochemical_features))
        processed_physicochemical_features2 = self.dropout(processed_physicochemical_features)
        processed_physicochemical_features3 = F.relu(self.physicochemical_fc2(processed_physicochemical_features2))
        processed_physicochemical_features4 = self.dropout(processed_physicochemical_features3)
        
        combined_feature_vector = torch.cat([mol_prediction, processed_physicochemical_features4], dim=-1)
        
        #fingerprint_dim => 1024
        final_prediction = F.relu(self.bn_final1(self.final1_fc(combined_feature_vector))) # Applied BatchNorm Layer
        final2_prediction = self.dropout(final_prediction)
        final3_prediction = F.relu(self.bn_final2(self.final2_fc(final2_prediction))) # Applied BatchNorm Layer
        final4_prediction = self.dropout(final3_prediction)
        final5_prediction = self.final3_fc(final4_prediction)
        final5_prediction = F.softmax(final5_prediction, dim = 1)

       
      
        return atom_feature, atom_feature_viz, atom_attention_weight_viz, mol_feature_viz, mol_feature_unbounded_viz, mol_attention_weight_viz, final5_prediction



In [5]:
os.getcwd()

'/data/home/ldhyun7222/hERGAT'

In [6]:
import os
os.getcwd()

'/data/home/ldhyun7222/hERGAT'

In [7]:
import os
os.chdir('/data/home/ldhyun7222/hERGAT/dataset')

# 2.dataset and preprocessing

## Delete the duplicated data standard on Canonical smiles

In [8]:
task_name = 'Class'
tasks = ['Class']
raw_filename = "new_hERGAT_dataset.csv"
feature_filename = raw_filename.replace('.csv','.pickle')
filename = raw_filename.replace('.csv','')
prefix_filename = raw_filename.split('/')[-1].replace('.csv','')
smiles_tasks_df = pd.read_csv(raw_filename)
smilesList = smiles_tasks_df.SMILES.values
print("number of all smiles: ",len(smilesList))

atom_num_dist = []
remained_smiles = []
canonical_smiles_list = []
problematic_smiles = []  # 문제가 되는 SMILES 문자열을 저장할 리스트

for smiles in smilesList:
    try:        
        mol = Chem.MolFromSmiles(smiles)
        Chem.SanitizeMol(mol) # 화합물의 유효성 검증
        atom_num_dist.append(len(mol.GetAtoms()))
        remained_smiles.append(smiles)
        canonical_smiles_list.append(Chem.MolToSmiles(Chem.MolFromSmiles(smiles), isomericSmiles=True))
    except:
        problematic_smiles.append(smiles)
        print("not successfully processed smiles: ", smiles)
        pass
print("number of successfully processed smiles: ", len(remained_smiles))
print("number of problematic smiles: ", len(problematic_smiles))

smiles_tasks_df = smiles_tasks_df[smiles_tasks_df["SMILES"].isin(remained_smiles)]
# print(smiles_tasks_df)
smiles_tasks_df['cano_smiles'] =canonical_smiles_list
assert canonical_smiles_list[8]==Chem.MolToSmiles(Chem.MolFromSmiles(smiles_tasks_df['cano_smiles'][8]), isomericSmiles=True)
smiles_tasks_df.head()

number of all smiles:  23387




number of successfully processed smiles:  23387
number of problematic smiles:  0


Unnamed: 0,SMILES,Class,cano_smiles
0,Cc1ccc(CN2[C@@H]3CC[C@H]2C[C@@H](C3)Oc4cccc(c4...,1,Cc1ccc(CN2[C@@H]3CC[C@H]2C[C@H](Oc2cccc(C(N)=O...
1,COc1nc2ccc(Br)cc2cc1[C@@H](c3ccccc3)[C@@](O)(C...,1,COc1nc2ccc(Br)cc2cc1[C@@H](c1ccccc1)[C@@](O)(C...
2,NC(=O)c1cccc(O[C@@H]2C[C@H]3CC[C@@H](C2)N3CCCc...,1,NC(=O)c1cccc(O[C@@H]2C[C@H]3CC[C@@H](C2)N3CCCc...
3,Cc1ccc(CN2[C@@H]3CC[C@H]2C[C@@H](C3)Oc4cccc(c4...,1,Cc1ccc(CN2[C@@H]3CC[C@H]2C[C@H](Oc2cccc(C(N)=O...
4,NC(=O)c1cccc(O[C@@H]2C[C@H]3CC[C@@H](C2)N3CCc4...,1,NC(=O)c1cccc(O[C@@H]2C[C@H]3CC[C@@H](C2)N3CCc2...


In [9]:
# 문제가 되는 SMILES 출력
for s in problematic_smiles:
    print("Problematic SMILES:", s)

In [10]:
random_seed =100
start_time = str(time.ctime()).replace(':','-').replace(' ','_')
start = time.time()
radius = 3
T = 2
per_task_output_units_num = 2 # for classification model with 2 classes
output_units_num = len(tasks) * per_task_output_units_num # 2 

In [11]:
len(smiles_tasks_df)

23387

In [12]:
# Use the length of SMILES less than 100
smilesList = [smiles for smiles in canonical_smiles_list if len(Chem.MolFromSmiles(smiles).GetAtoms())<101]

uncovered = [smiles for smiles in canonical_smiles_list if len(Chem.MolFromSmiles(smiles).GetAtoms())>100]

# delete the duplicated cano_smiles in uncovered
smiles_tasks_df = smiles_tasks_df[~smiles_tasks_df["cano_smiles"].isin(uncovered)]

feature_dicts = get_smiles_dicts(smilesList)
# keys = smiles values in cano_smiles값
remained_df = smiles_tasks_df[smiles_tasks_df["cano_smiles"].isin(feature_dicts['smiles_to_atom_mask'].keys())] 
uncovered_df = smiles_tasks_df.drop(remained_df.index) 
uncovered_df # 최종적으로 uncovered_df에는 get_smiles_dicts로 바꾼 값과 cano_smiles와 비교하였을 때 삭제



Cc1ncoc1-c1nnc(SCCCN2CC3CC3(c3ccc(S(F)(F)(F)(F)F)cc3)C2)n1C
Cc1ncoc1-c1nnc(SCCCN2CC3CC3(c3cccc(S(F)(F)(F)(F)F)c3)C2)n1C
[Cl-]




FS(F)(F)(F)(F)c1cccc(-c2cnn(Cc3cc[nH]n3)c2)c1
Cc1ncoc1-c1nnc(SCCCN2CC3CC3(c3ccc(S(F)(F)(F)(F)F)cc3)C2)n1C
Cc1ncoc1-c1nnc(SCCCN2CC3CC3(c3cccc(S(F)(F)(F)(F)F)c3)C2)n1C


Unnamed: 0,SMILES,Class,cano_smiles
1240,Cc1ncoc1c2nnc(SCCCN3CC4CC4(C3)c5ccc(cc5)S(F)(F...,1,Cc1ncoc1-c1nnc(SCCCN2CC3CC3(c3ccc(S(F)(F)(F)(F...
1671,Cc1ncoc1c2nnc(SCCCN3CC4CC4(C3)c5cccc(c5)S(F)(F...,1,Cc1ncoc1-c1nnc(SCCCN2CC3CC3(c3cccc(S(F)(F)(F)(...
3704,[Cl-],1,[Cl-]
9717,FS(F)(F)(F)(F)c1cccc(-c2cnn(Cc3cc[nH]n3)c2)c1,1,FS(F)(F)(F)(F)c1cccc(-c2cnn(Cc3cc[nH]n3)c2)c1
9878,Cc1ncoc1-c1nnc(SCCCN2CC3CC3(c3ccc(S(F)(F)(F)(F...,1,Cc1ncoc1-c1nnc(SCCCN2CC3CC3(c3ccc(S(F)(F)(F)(F...
9884,Cc1ncoc1-c1nnc(SCCCN2CC3CC3(c3cccc(S(F)(F)(F)(...,1,Cc1ncoc1-c1nnc(SCCCN2CC3CC3(c3cccc(S(F)(F)(F)(...


Remain 7,930 dataset...
duplicated 3 cano_smiles are deleted|

In [13]:
print(uncovered_df.shape)
print(remained_df.shape)

(6, 3)
(23381, 3)


## Physicochemical properties 생성

In [14]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import rdMolDescriptors

Reset index in remained_df to initialize the index for later concatenation

In [15]:
remained_df = remained_df.reset_index()

In [16]:
# mol 형태로 변환
train_mols = [Chem.MolFromSmiles(smiles) for smiles in remained_df["cano_smiles"]]

# mol 형태로 변환이 되지 않은 경우, none_list에 담는다
none_list = []
for i in range(len(train_mols)):
    if train_mols[i] is None :
        none_list.append(i)
        print('none_list에 추가됨')
    
reg_idx = 0
for i in none_list :
    del train_mols[i - reg_idx]
    reg_idx += 1
    
# none_list가 존재할 경우, 삭제 후 데이터프레임 인덱스 맞춰주기
if len(none_list) != 0 :
    remained_df = remained_df.drop(none_list, axis=0)
    remained_df = remained_df.reset_index(drop = True)




In [17]:
# fingerprint 생성
bit_info_list = [] # bit vector의 설명자 리스트 담기
bit_info = {} #bit vector 설명자
fps = []

b = 0
# mol 파일에서 fingerprint Bit Vector 형태로 변환하기
for a in train_mols :
    fps.append(AllChem.GetMorganFingerprintAsBitVect(a, 3, nBits = 1024, bitInfo = bit_info))
    bit_info_list.append(bit_info.copy()) # bit_info 그대로 가져오면 변수가 변해서 리스트 값이 달라지므로 .copy()
    
# array 변환

arr_list = []
for i in range(len(fps)):
    array = np.zeros((0,), dtype = np.int8)
    arr_list.append(array)
for i in range(len(fps)):
    bit = fps[i]
    DataStructs.ConvertToNumpyArray(bit, arr_list[i])
    
train_x = np.stack([i.tolist() for i in arr_list])
train_finprt = pd.DataFrame(train_x)
train_finprt.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [18]:
import joblib
from sklearn.preprocessing import StandardScaler

# StandardScaler

sds_scaler = StandardScaler()

# molecular physicochemical properties 구하기

from rdkit.Chem import QED

train_qe = [QED.properties(mol) for mol in train_mols]
train_qe = pd.DataFrame(train_qe)
train_qe = train_qe.drop(columns = ['ROTB', 'AROM', 'ALERTS'])
train_qe[['MW', 'ALOGP', 'HBA', 'HBD', 'PSA']] = sds_scaler.fit_transform(train_qe[['MW', 'ALOGP', 'HBA', 'HBD', 'PSA']])
input_df=pd.concat([train_finprt, train_qe], axis=1)
new_remained_df = pd.concat([remained_df, input_df], axis = 1)
new_remained_df = new_remained_df.drop(columns = ['index'])
new_remained_df



Unnamed: 0,SMILES,Class,cano_smiles,0,1,2,3,4,5,6,...,1019,1020,1021,1022,1023,MW,ALOGP,HBA,HBD,PSA
0,Cc1ccc(CN2[C@@H]3CC[C@H]2C[C@@H](C3)Oc4cccc(c4...,1,Cc1ccc(CN2[C@@H]3CC[C@H]2C[C@H](Oc2cccc(C(N)=O...,1,0,0,0,0,0,0,...,1,0,0,0,0,-0.892386,-0.097748,-0.890598,-0.279961,-0.545279
1,COc1nc2ccc(Br)cc2cc1[C@@H](c3ccccc3)[C@@](O)(C...,1,COc1nc2ccc(Br)cc2cc1[C@@H](c1ccccc1)[C@@](O)(C...,0,1,1,1,0,0,0,...,0,0,0,0,0,1.276572,2.236299,-0.407655,-0.279961,-0.852902
2,NC(=O)c1cccc(O[C@@H]2C[C@H]3CC[C@@H](C2)N3CCCc...,1,NC(=O)c1cccc(O[C@@H]2C[C@H]3CC[C@@H](C2)N3CCCc...,0,0,0,0,0,0,0,...,1,0,0,0,0,-0.744015,-0.014032,-0.890598,-0.279961,-0.545279
3,Cc1ccc(CN2[C@@H]3CC[C@H]2C[C@@H](C3)Oc4cccc(c4...,1,Cc1ccc(CN2[C@@H]3CC[C@H]2C[C@H](Oc2cccc(C(N)=O...,0,0,0,0,0,0,0,...,1,1,0,0,0,-0.828614,-0.056288,-0.890598,-0.279961,-0.545279
4,NC(=O)c1cccc(O[C@@H]2C[C@H]3CC[C@@H](C2)N3CCc4...,1,NC(=O)c1cccc(O[C@@H]2C[C@H]3CC[C@@H](C2)N3CCc2...,0,0,0,0,0,0,0,...,1,0,0,0,0,-0.892386,-0.277020,-0.890598,-0.279961,-0.545279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23376,COc1cc(-c2nc3n(n2)CCOC3c2ccccc2C(F)(F)F)ccc1-n...,1,COc1cc(-c2nc3n(n2)CCOC3c2ccccc2C(F)(F)F)ccc1-n...,0,0,0,1,0,0,0,...,1,0,0,0,0,0.218019,0.521059,0.075289,-1.191121,-0.192608
23377,COc1cc(-c2nc3n(n2)CCC[C@H]3c2ccc(F)cc2C)ccc1-n...,1,COc1cc(-c2nc3n(n2)CCC[C@H]3c2ccc(F)cc2C)ccc1-n...,0,0,0,1,1,0,0,...,1,0,0,0,0,-0.183419,0.679364,-0.407655,-1.191121,-0.477398
23378,COc1cc(-c2nc3n(n2)CCC[C@@H]3c2ccc(F)cc2C)ccc1-...,1,COc1cc(-c2nc3n(n2)CCC[C@@H]3c2ccc(F)cc2C)ccc1-...,0,0,0,1,1,0,0,...,1,0,0,0,0,-0.183419,0.679364,-0.407655,-1.191121,-0.477398
23379,COc1cc(-c2nc3n(n2)CCCC3c2ccccc2C(F)(F)F)ccc1-n...,1,COc1cc(-c2nc3n(n2)CCCC3c2ccccc2C(F)(F)F)ccc1-n...,0,0,0,1,1,0,0,...,1,0,0,0,0,0.197160,1.064496,-0.407655,-1.191121,-0.477398


In [19]:
new_remained_df.to_csv('new_remained_df.csv', index = False)

In [21]:
for key, value in feature_dicts.items():
    print(key)

smiles_to_atom_mask
smiles_to_atom_info
smiles_to_bond_info
smiles_to_atom_neighbors
smiles_to_bond_neighbors
smiles_to_rdkit_list


가중치는 전체 샘플수에서 해당 클래스의 샘플 수로 결정 - 소수 클래스에 더 높은 가중치 부여

 => nn.CrossEntropyLoss와 같이 가중치를 사용하면 클래스 불균형 문제 해결
  - 소수 클래스의 손실이 전체 손실에 더 영향을 주게 되어 모델이 소수 클래스에도 더 예측할수 있도록 학습하게 하는 효과

In [22]:
weights = []
for i,task in enumerate(tasks):    
    negative_df = new_remained_df[new_remained_df[task] == 0][["SMILES",task]] # 
    positive_df = new_remained_df[new_remained_df[task] == 1][["SMILES",task]] # 
    weights.append([(positive_df.shape[0]+negative_df.shape[0])/negative_df.shape[0],\
                    (positive_df.shape[0]+negative_df.shape[0])/positive_df.shape[0]])
    
# positive나 negative한 blocker가 데이터의 양이 다르기 때문에 데이터 불균형 문제를 해결해주기 위해서
# 학습할 때에 weights를 이용하여 학습하게  (loss_function에 0적용)
# weights :  [[1.7614393602843181, 2.313302217036173]]


# train/test/valid split
# test_df = new_remained_df.sample(frac=1/10, random_state=random_seed) # test set
# training_data = new_remained_df.drop(test_df.index) # training data
# valid_df = training_data.sample(frac=1/9, random_state=random_seed) # validation set
# train_df = training_data.drop(valid_df.index) # train set
# train_df = train_df.reset_index(drop=True)
# valid_df = valid_df.reset_index(drop=True)
# test_df = test_df.reset_index(drop=True)

train_df = new_remained_df[:20542]
train_df = train_df.reset_index(drop = True)

test_df = new_remained_df[20542:21966]
test_df = test_df.reset_index(drop = True)

valid_df = new_remained_df[21966:]
valid_df = valid_df.reset_index(drop = True)


In [23]:
print(negative_df.shape)
print(positive_df.shape)

(9198, 2)
(14183, 2)


In [24]:
print('weights : ', weights)

weights :  [[2.5419656447053707, 1.648522879503631]]


In [25]:
print(train_df.shape)
print(test_df.shape)
print(valid_df.shape)

(20542, 1027)
(1424, 1027)
(1415, 1027)


In [26]:
print('train_df label의 shape')
print(train_df['Class'].value_counts())
print('test_df label의 shape')
print(test_df['Class'].value_counts())
print('valid_df label의 shape')
print(valid_df['Class'].value_counts())

train_df label의 shape
1    12528
0     8014
Name: Class, dtype: int64
test_df label의 shape
1    807
0    617
Name: Class, dtype: int64
valid_df label의 shape
1    848
0    567
Name: Class, dtype: int64


데이터준비완료

### GNN 사용하기 위해 데이터준비

In [27]:
x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = get_smiles_array([smilesList[0]], feature_dicts)

num_atom_features = x_atom.shape[-1]
num_bond_features = x_bonds.shape[-1]
loss_function = [nn.CrossEntropyLoss(weight=torch.Tensor(weight).to(device), reduction='mean') for weight in weights]

# loss_function = [nn.BCEWithLogitsLoss(pos_weight = pos_weight, weight=torch.Tensor(weight), reduction='mean') for weight in weights]

### model ExtendedFingerprint 선언

{'target': 0.8770887433668285, 'params': {'batch_size': 204.579952450252, 'dropout': 0.6767147751204459, 'fingerprint_dim': 126.00327336765619, 'learning_rate_exp': 4.814956775338501, 'weight_decay_exp': 2.5126322543327806}

In [28]:
################################
# bayesian optimizer로 찾은 최적의 parameter 값 #
physicochemical_feature_dim = 1029


# 이거는 기존에 hERGAT dataset.csv 파일에 해당하는 bayesian optimizer hyperparameter 변수들
T = 2
p_dropout = 0.2
batch_size = 218
final1_fc1 = 214
final1_fc2 = 150
fingerprint_dim = 160
learning_rate = 5.0
physicochemical_feature_dim_1 = 273
physicochemical_feature_dim_2 = 200
weight_decay = 2.13




################################


model = ExtendedFingerprint_viz(radius, T, num_atom_features, num_bond_features, fingerprint_dim, output_units_num, p_dropout, physicochemical_feature_dim, physicochemical_feature_dim_1, physicochemical_feature_dim_2, final1_fc1, final1_fc2)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr = 10**-learning_rate, weight_decay = 10**-weight_decay)

model_parameters = filter(lambda p : p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print(params)

1163469


In [29]:
for name, param in model.named_parameters():
    print(name, param.data.shape)

atom_fc.weight torch.Size([160, 39])
atom_fc.bias torch.Size([160])
neighbor_fc.weight torch.Size([160, 49])
neighbor_fc.bias torch.Size([160])
GRUCell.0.weight_ih torch.Size([480, 160])
GRUCell.0.weight_hh torch.Size([480, 160])
GRUCell.0.bias_ih torch.Size([480])
GRUCell.0.bias_hh torch.Size([480])
GRUCell.1.weight_ih torch.Size([480, 160])
GRUCell.1.weight_hh torch.Size([480, 160])
GRUCell.1.bias_ih torch.Size([480])
GRUCell.1.bias_hh torch.Size([480])
GRUCell.2.weight_ih torch.Size([480, 160])
GRUCell.2.weight_hh torch.Size([480, 160])
GRUCell.2.bias_ih torch.Size([480])
GRUCell.2.bias_hh torch.Size([480])
align.0.weight torch.Size([1, 320])
align.0.bias torch.Size([1])
align.1.weight torch.Size([1, 320])
align.1.bias torch.Size([1])
align.2.weight torch.Size([1, 320])
align.2.bias torch.Size([1])
attend.0.weight torch.Size([160, 160])
attend.0.bias torch.Size([160])
attend.1.weight torch.Size([160, 160])
attend.1.bias torch.Size([160])
attend.2.weight torch.Size([160, 160])
attend

In [30]:
def train(model, dataset, optimizer, loss_function):
    model.train()
    np.random.seed(30)
    valList = np.arange(0,dataset.shape[0])
    #shuffle them
    np.random.shuffle(valList)
    batch_list = []
    for i in range(0, dataset.shape[0], batch_size):
        batch = valList[i:i+batch_size]
        batch_list.append(batch)   
    for counter, train_batch in enumerate(batch_list):
        batch_df = dataset.loc[train_batch,:]
        smiles_list = batch_df.cano_smiles.values
        
        x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = get_smiles_array(smiles_list,feature_dicts)    
        x_atom = torch.Tensor(x_atom).to(device).float()
        x_bonds = torch.Tensor(x_bonds).to(device).float()
        x_atom_index = torch.LongTensor(x_atom_index).to(device)
        x_bond_index = torch.LongTensor(x_bond_index).to(device)
        x_mask = torch.Tensor(x_mask).to(device).float()
        
        selected_rows = batch_df[batch_df['cano_smiles'] == smiles_list].drop(columns=['SMILES', 'Class', 'cano_smiles'])
        physicochemical_features= torch.tensor(selected_rows.to_numpy(dtype=np.float32), dtype=torch.float).to(device)

        atoms_prediction, atom_feature_viz, atom_attention_weight_viz, mol_feature_viz, mol_feature_unbounded_viz, mol_attention_weight_viz, mol_prediction = model(x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, physicochemical_features)
        
        model.zero_grad()
        # Step 4. Compute your loss function. (Again, Torch wants the target wrapped in a variable)
        loss = 0.0
        for i,task in enumerate(tasks):
            y_pred = mol_prediction[:, i * per_task_output_units_num:(i + 1) *
                                    per_task_output_units_num].to(device)
            y_val = batch_df["Class"].values

            validInds = np.where((y_val == 0) | (y_val == 1))[0]
            if len(validInds) == 0:
                continue

            y_val_adjust = np.array([y_val[v] for v in validInds]).astype(float)
            y_val_adjust = torch.LongTensor(y_val_adjust).to(device)
            validInds = torch.LongTensor(validInds).squeeze().to(device)
            y_pred_adjust = torch.index_select(y_pred, 0, validInds)
            y_pred_adjust = y_pred_adjust.to(device)
            loss += loss_function[i](
                y_pred_adjust,y_val_adjust)
        # Step 5. Do the backward pass and update the gradient
#             print(y_val,y_pred,validInds,y_val_adjust,y_pred_adjust)
        loss.backward()
        optimizer.step()
        
        
def eval(model, dataset):
    model.eval()
    y_val_list = {}
    y_pred_list = {}
    losses_list = []

    
    valList = np.arange(0,dataset.shape[0])
    batch_list = []
    for i in range(0, dataset.shape[0], batch_size):
        batch = valList[i:i+batch_size]
        batch_list.append(batch)   
    for counter, test_batch in enumerate(batch_list):
        batch_df = dataset.loc[test_batch,:]
        smiles_list = batch_df.cano_smiles.values
        
        x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = get_smiles_array(smiles_list,feature_dicts)
        x_atom = torch.Tensor(x_atom).to(device).float()
        x_bonds = torch.Tensor(x_bonds).to(device).float()
        x_atom_index = torch.LongTensor(x_atom_index).to(device)
        x_bond_index = torch.LongTensor(x_bond_index).to(device)
        x_mask = torch.Tensor(x_mask).to(device).float()
        
        selected_rows = batch_df[batch_df['cano_smiles'] == smiles_list].drop(columns=['SMILES', 'Class', 'cano_smiles'])
        physicochemical_features = torch.tensor(selected_rows.to_numpy(dtype = np.float32), dtype = torch.float).to(device)
#        physicochemical_features = physicochemical_features.expand(batch_size, -1)
        
        atoms_prediction, atom_feature_viz, atom_attention_weight_viz, mol_feature_viz, mol_feature_unbounded_viz, mol_attention_weight_viz, mol_prediction = model(x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, physicochemical_features)

        atom_pred = atoms_prediction.data[:,:,1].unsqueeze(2).cpu().numpy()
        for i,task in enumerate(tasks):
            y_pred = mol_prediction[:, i * per_task_output_units_num:(i + 1) *
                                    per_task_output_units_num].to(device)
            y_val = batch_df[task].values

            validInds = np.where((y_val==0) | (y_val==1))[0]
            
            if len(validInds) == 0:
                continue
            y_val_adjust = np.array([y_val[v] for v in validInds]).astype(float)
            y_val_adjust = torch.LongTensor(y_val_adjust).to(device)
            validInds = torch.LongTensor(validInds).squeeze().to(device)
            y_pred_adjust = torch.index_select(y_pred, 0, validInds)
            y_pred_adjust = y_pred_adjust.to(device)

            loss = loss_function[i](
                y_pred_adjust,y_val_adjust)
            y_pred_adjust = F.softmax(y_pred_adjust,dim=-1).data.cpu().numpy()[:,1]
            losses_list.append(loss.detach().cpu().numpy())
            try:
                y_val_list[i].extend(y_val_adjust.cpu().numpy())
                y_pred_list[i].extend(y_pred_adjust)
            except:
                y_val_list[i] = []
                y_pred_list[i] = []
                y_val_list[i].extend(y_val_adjust.cpu().numpy())
                y_pred_list[i].extend(y_pred_adjust)
                
    # 시행할때마다 모델에서 최적의 optimal_threshold값을 찾아 성능평가
    
    def find_optimal_threshold(precision, recall, thresholds):
        with np.errstate(divide='ignore', invalid='ignore'):
            f1_scores = 2*((precision*recall)/(precision+recall))
            f1_scores = np.nan_to_num(f1_scores)  # convert NaNs to 0
        index = np.argmax(f1_scores)
        return thresholds[index], f1_scores[index]

    test_prc = []
    test_thresholds = []
    optimal_thresholds = []
    for i in range(len(tasks)):
        precision, recall, thresholds = precision_recall_curve(y_val_list[i], y_pred_list[i])
        optimal_threshold, test_f1_scores = find_optimal_threshold(precision, recall, thresholds)
        optimal_thresholds.append(optimal_threshold)
        test_prc.append(auc(recall, precision))
        test_thresholds.append(thresholds)
        
        
    test_precision = [precision_score(y_val_list[i], (np.array(y_pred_list[i]) > optimal_thresholds[i]).astype(int), zero_division = 1) for i in range(len(tasks))]
    test_recall = [recall_score(y_val_list[i],(np.array(y_pred_list[i]) >optimal_thresholds[i]).astype(int)) for i in range(len(tasks))]
    
    #test_f1_score = [2 * (precision * recall) / (precision + recall) for precision, recall in zip(test_precision, test_recall)]    
    conf_matrices = [confusion_matrix(y_val_list[i], (np.array(y_pred_list[i]) > optimal_thresholds[i]).astype(int)) for i in range(len(tasks))]
    test_sensitivity = [cm[1, 1] / (cm[1, 1] + cm[1, 0]) for cm in conf_matrices]
    test_specificity = [cm[0, 0] / (cm[0, 0] + cm[0, 1]) for cm in conf_matrices]
    test_acc = [accuracy_score(y_val_list[i], (np.array(y_pred_list[i]) > optimal_thresholds[i]).astype(int)) for i in range(len(tasks))]
    fpr_list, tpr_list, thresholds2 = [], [], []
    for i in range(len(tasks)):
        fpr, tpr, thresholds = roc_curve(y_val_list[i], y_pred_list[i])
        fpr_list.append(fpr)
        tpr_list.append(tpr)
    test_roc = [roc_auc_score(y_val_list[i], y_pred_list[i]) for i in range(len(tasks))]
    test_loss = np.array(losses_list).mean()
    
    return test_acc, test_roc, test_prc, test_precision, test_recall, test_f1_scores, test_loss, test_sensitivity, test_specificity, fpr_list, tpr_list, optimal_thresholds, y_val_list, y_pred_list

In [31]:
best_param = {"roc_epoch": 0, "valid_roc": 0, "loss_epoch": 0, "valid_loss": float('inf')}
best_param

{'roc_epoch': 0, 'valid_roc': 0, 'loss_epoch': 0, 'valid_loss': inf}

### Earlystopping

In [32]:
class EarlyStopping:
    def __init__(self, patience=5):
        self.loss = np.inf
        self.counter = 0
        self.patience = patience
        self.early_stop = False

    def step(self, loss):
        if loss < self.loss:
            self.loss = loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

    def should_step(self):
        return self.early_stop
    
early_stopping = EarlyStopping(patience = 15)    

## Model training

In [None]:
import os
os.chdir('/data/home/ldhyun7222/hERGAT')

epochs = 300
train_loss_vis2 = []
valid_loss_vis2 = []
train_roc2 = []
valid_roc2 = []
train_sensitivity2 = []
train_specificity2 = []
valid_precision2 = []
valid_recall2 = []
valid_prc2 = []
valid_optimal_thresholds2=[]
for epoch in range(epochs):    
    start_time = time.time() # epoch의 시작시간
    
    train(model, train_df, optimizer, loss_function)
  
    
    train_acc, train_roc, train_prc, train_precision, train_recall, train_f1_score, train_loss, train_sensitivity, train_specificity, train_fpr, train_tpr, train_optimal_thresholds, train_y_val_list, train_y_pred_list= eval(model, train_df)
    valid_acc, valid_roc, valid_prc, valid_precision, valid_recall, valid_f1_score, valid_loss, valid_sensitivity, valid_specificity, valid_fpr, valid_tpr, valid_optimal_thresholds, valid_y_val_list, valid_y_pred_list= eval(model, valid_df)

    train_loss_vis2.append(train_loss)
    valid_loss_vis2.append(valid_loss)
    train_roc2.append(train_roc)
    valid_roc2.append(valid_roc)
    train_roc_mean = np.array(train_roc2).mean()
    valid_roc_mean = np.array(valid_roc2).mean()
    train_sensitivity2.append(train_sensitivity)
    train_specificity2.append(train_specificity)
    valid_precision2.append(valid_precision)
    valid_recall2.append(valid_recall)
    valid_optimal_thresholds2.append(valid_optimal_thresholds)
    valid_prc2.append(valid_prc)
    
    
    

        
    if valid_roc_mean > 0.60:
        torch.save(model, 'saved_model2/model_'+prefix_filename+'_'+str(epoch)+'.pt')                
        
            
    if max(valid_roc2)[0] > best_param["valid_roc"]:
        best_param["roc_epoch"] = epoch
        best_param["valid_roc"] = max(valid_roc2)
                
    if valid_loss < best_param["valid_loss"]:
        best_param["loss_epoch"] = epoch
        best_param["valid_loss"] = valid_loss

    print("EPOCH:\t"+str(epoch)+'\n'\
        +"train_acc"+":"+str(train_acc)+'\n'\
        +"train_roc"+":"+str(train_roc)+'\n'\
        +"valid_roc"+":"+str(valid_roc)+'\n'\
        +"valid_prc"+":"+str(valid_prc)+'\n'\
        +"train_precision"+":"+str(train_precision)+'\n'
        +"train_recall"+":"+str(train_recall)+'\n'
        +"train_specificity"+":"+str(train_specificity)+'\n'\
        +"train_f1_score"+":"+str(train_f1_score)+'\n'
        +"train_loss"+":"+str(train_loss)+'\n'\
        +"valid_loss"+":"+str(valid_loss)+'\n'
        )
    
#     if (epoch - best_param["roc_epoch"] >18) and (epoch - best_param["loss_epoch"] >28):        
#         break
    early_stopping.step(valid_loss)
    if early_stopping.should_step():
        print('early stopping')
        
        break
    
    
    end_time = time.time() # 각 epoch의 종료시간
    elapsed_time = end_time - start_time # 경과시간
    print(f'Epoch {epoch+1} finished in {elapsed_time:.2f} seconds.')
    

EPOCH:	0
train_acc:[0.648914419238633]
train_roc:[0.6643187241611981]
valid_roc:[0.6243750207979768]
valid_prc:[0.6853272887567738]
train_precision:[0.640308277027027]
train_recall:[0.9682311621966795]
train_specificity:[0.14973795857249814]
train_f1_score:0.7709610802223987
train_loss:0.66913277
valid_loss:0.68047374

Epoch 1 finished in 28.39 seconds.
EPOCH:	1
train_acc:[0.6721351377665271]
train_roc:[0.7163449206943405]
valid_roc:[0.6712962962962963]
valid_prc:[0.740261193269456]
train_precision:[0.6606400088736065]
train_recall:[0.9508301404853129]
train_specificity:[0.2364611929124033]
train_f1_score:0.7796865285821798
train_loss:0.6540178
valid_loss:0.66731393

Epoch 2 finished in 27.68 seconds.
EPOCH:	2
train_acc:[0.6932625839742965]
train_roc:[0.7476320125524266]
valid_roc:[0.6992758144487705]
valid_prc:[0.7744594181772099]
train_precision:[0.6830716763685541]
train_recall:[0.9272828863346104]
train_specificity:[0.3274270027451959]
train_f1_score:0.7867009750812568
train_loss:0

In [None]:
best_param

In [None]:
best_model = torch.load('saved_model2/model'+'_' + prefix_filename + '_' + str(best_param['roc_epoch']) + '.pt')     # best_param['roc_epoch']

test_acc, test_roc, test_prc, test_precision, test_recall, test_f1_score, test_loss, test_sensitivity, test_specificity, test_fpr, test_tpr, test_optimal_thresholds, test_y_val_list, test_y_pred_list= eval(best_model, test_df)

print("test_optimal_thresholds"+str(test_optimal_thresholds)
      +"\n"+"test_acc:"+str(test_acc)
      +"\n"+"test_roc:"+str(test_roc)
      +"\n"+"test_prc:"+str(test_prc)
      +"\n"+"test_precision:"+str(test_precision)
      +"\n"+"test_recall:"+str(test_recall)
      +"\n"+"test_sensitivity:"+str(test_sensitivity)
      +"\n"+"test_specificity:"+str(test_specificity)
      +"\n"+"test_f1_score:"+str(test_f1_score)
      +"\n"+"test_roc_mean:",str(np.array(test_roc).mean())
     )

In [None]:
# train_df.to_csv('hERGAT_train_df.csv', index = False)
# test_df.to_csv('hERGAT_test_df.csv', index = False)
# valid_df.to_csv('hERGAT_valid_df.csv', index = False)

## Bayesian Optimizer

In [144]:
import bayes_opt
from bayes_opt import BayesianOptimization
import torch
epochs = 300

def optimize_function(learning_rate_exp, weight_decay_exp, fingerprint_dim,batch_size, dropout, physicochemical_feature_dim_1,  physicochemical_feature_dim_2, final1_fc1, final1_fc2):
    # radius = int(round(radius))
    # T = int(round(T))
    batch_size = int(round(batch_size))
    fingerprint_dim = int(round(fingerprint_dim))
  
    # Early_stopping 인스턴스 생성
    early_stopping = EarlyStopping(patience = 20)
  
    # 모델 생성 및 학습 파라미터 초기화
    model = ExtendedFingerprint_viz(radius, T, num_atom_features, num_bond_features, fingerprint_dim, output_units_num, p_dropout, physicochemical_feature_dim, physicochemical_feature_dim_1, physicochemical_feature_dim_2, final1_fc1, final1_fc2)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr = 10**-learning_rate_exp, weight_decay = 10**-weight_decay_exp)
    train_loss_vis2 = []
    valid_loss_vis2 = []
    train_roc2 = []
    valid_roc2 = []
    train_sensitivity2 = []
    train_specificity2 = []
    valid_precision2 = []
    valid_recall2 = []
    valid_optimal_thresholds2 = []
#     # 모델 학습 및 검증
    for epoch in range(epochs):
        start_time = time.time() # epoch의 시작시간
        
        
        train(model, train_df, optimizer, loss_function)


        train_acc, train_roc, train_prc, train_precision, train_recall, train_f1_score, train_loss, train_sensitivity, train_specificity, train_fpr, train_tpr, train_optimal_thresholds, train_y_val_list, train_y_pred_list= eval(model, train_df)
        valid_acc, valid_roc, valid_prc, valid_precision, valid_recall, valid_f1_score, valid_loss, valid_sensitivity, valid_specificity, valid_fpr, valid_tpr, valid_optimal_thresholds, valid_y_val_list, valid_y_pred_list= eval(model, valid_df)
        train_loss_vis2.append(train_loss)
        valid_loss_vis2.append(valid_loss)
        train_roc2.append(train_roc)
        valid_roc2.append(valid_roc)
        train_roc_mean = np.array(train_roc2).mean()
        valid_roc_mean = np.array(valid_roc2).mean()
   
        train_sensitivity2.append(train_sensitivity)
        train_specificity2.append(train_specificity)
        valid_precision2.append(valid_precision)
        valid_recall2.append(valid_recall)
        valid_optimal_thresholds2.append(valid_optimal_thresholds)
    
        
        if valid_roc_mean > 0.60:
            torch.save(model, 'saved_model2/model_'+prefix_filename+'_'+str(epoch)+'.pt')                
            
        if valid_roc_mean > best_param["valid_roc"]:
            best_param["roc_epoch"] = epoch
            best_param["valid_roc"] = valid_roc_mean
                
        if valid_loss < best_param["valid_loss"]:
            best_param["loss_epoch"] = epoch
            best_param["valid_loss"] = valid_loss

        print("EPOCH:\t"+str(epoch)+'\n'\
            +"train_acc"+":"+str(train_acc)+'\n'\
            +"train_roc"+":"+str(train_roc)+'\n'\
            +"valid_roc"+":"+str(valid_roc)+'\n'\
            +"train_precision"+":"+str(train_precision)+'\n'
            +"train_recall"+":"+str(train_recall)+'\n'
            +"train_specificity"+":"+str(train_specificity)+'\n'\
            +"train_f1_score"+":"+str(train_f1_score)+'\n'
            +"train_loss"+":"+str(train_loss)+'\n'\
            +"valid_loss"+":"+str(valid_loss)+'\n'
            )
    
#     if (epoch - best_param["roc_epoch"] >18) and (epoch - best_param["loss_epoch"] >28):        
#         break
        early_stopping.step(valid_loss)
        if early_stopping.should_step():
            print('early stopping')
            break
    
        end_time = time.time() # 각 epoch의 종료시간
        elapsed_time = end_time - start_time # 경과시간
        print(f'Epoch {epoch+1} finished in {elapsed_time:.2f} seconds.')
    
            
    return np.mean(valid_roc)  # 예를 들어, 검증 세트의 ROC AUC를 최적화 목표로 설정


In [145]:
bayes_optimizer = BayesianOptimization(
    f=optimize_function,
    pbounds={
        'learning_rate_exp': (2, 5),
        'weight_decay_exp': (2, 5),
        'fingerprint_dim': (50, 300),
        #'radius': (1, 5),
        # 'T': (2, 4),
        'batch_size': (16, 256),
        'dropout': (0.2, 0.7),
        'final1_fc1': (150, 300),
        'final1_fc2': (50, 150),
        'physicochemical_feature_dim_1': (200, 500)
        
    },
    random_state=1
)

In [146]:
 bayes_optimizer.maximize(
    init_points=5,  # 목적함수를 초기에 5번 수행
    n_iter=20,    # 초기단계를 제외하고 optimization을 몇 번 수행할지 결정 즉, 20번 반복
)# 총 25번의 하이퍼파라미터 세트에 대한 목적함수 값을 평가


|   iter    |  target   | batch_... |  dropout  | final1... | final1... | finger... | learni... | physic... | physic... | weight... |
-------------------------------------------------------------------------------------------------------------------------------------
EPOCH:	0
train_acc:[0.7788920260928829]
train_roc:[0.844407299797194]
valid_roc:[0.7841679561412267]
train_precision:[0.788303249097473]
train_recall:[0.8714878671775224]
train_specificity:[0.6341402545545296]
train_f1_score:0.827855491110353
train_loss:0.53756577
valid_loss:0.60644823

Epoch 1 finished in 10.55 seconds.
EPOCH:	1
train_acc:[0.7880440073994742]
train_roc:[0.8526259601253362]
valid_roc:[0.7926524907656983]
train_precision:[0.7947922677437969]
train_recall:[0.8795498084291188]
train_specificity:[0.6449962565510357]
train_f1_score:0.8351140410699402
train_loss:0.5314867
valid_loss:0.60382277

Epoch 2 finished in 13.23 seconds.
EPOCH:	2
train_acc:[0.7814234251776847]
train_roc:[0.8589656399512857]
valid_roc:[0.

In [147]:
print(bayes_optimizer.max) 

{'target': 0.9125881834215168, 'params': {'batch_size': 218.25207613614143, 'dropout': 0.2, 'final1_fc1': 213.92401133311446, 'final1_fc2': 150.0, 'fingerprint_dim': 159.786159469503, 'learning_rate_exp': 5.0, 'physicochemical_feature_dim_1': 273.04078549098057, 'physicochemical_feature_dim_2': 200.0, 'weight_decay_exp': 2.1306937725949666}}


In [None]:
| 20        | 0.9126    | 218.3     | 0.2       | 213.9     | 150.0     | 159.8     | 5.0       | 273.0     | 200.0     | 2.131     |


## Loss function 그래프

In [None]:
def moving_average(data, window_size=5):
    """Calculate the moving average of given data."""
    return np.convolve(data, np.ones(window_size)/window_size, mode='valid')

# 원하는 창의 크기를 설정
window_size = 5  
smooth_train_loss = moving_average(train_loss_vis2, window_size)
smooth_valid_loss = moving_average(valid_loss_vis2, window_size)


In [None]:
x_len_smooth = np.arange(len(smooth_train_loss)) + window_size // 2
fig, ax = plt.subplots()
ax.set_facecolor('white')  # Set the background color

plt.plot(x_len_smooth, smooth_train_loss, 'red', label='Train Loss')
plt.plot(x_len_smooth, smooth_valid_loss, 'blue', label='Valid Loss')

plt.legend(loc='upper right', facecolor = 'white')
plt.grid()
plt.xlabel('epoch')
plt.ylabel('loss')
plt.savefig('smooth_loss_function.png', facecolor = 'white')
plt.show()
