# 필요 패키지

In [136]:
#import tensorflow as tf
import torch

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import torch.nn.functional as F

import numpy as np
import pandas as pd
import os
import time
from tqdm.autonotebook import tqdm

from rdkit import Chem
from rdkit import DataStructs
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')  

from IPython.display import clear_output

from util import *
from models_resnet152_GRU import Encoder, DecoderWithAttention
from torch.nn.utils.rnn import pack_padded_sequence


from torch import nn

In [137]:
# in pytorch, for image
import torchvision
import torchvision.transforms as transforms
from torchvision import models
# cuda
device = torch.device('cuda')

In [138]:
from SmilesDataset import *
import torch.backends.cudnn as cudnn

In [139]:
emb_dim = 512  # dimension of word embeddings
attention_dim = 512  # dimension of attention linear layers
decoder_dim = 512  # dimension of decoder RNN
dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # sets device for model and PyTorch tensors
cudnn.benchmark = True  # set to true only if inputs to model are fixed size; otherwise lot of computational overhead

# Training parameters
start_epoch = 0
epochs = 10  # number of epochs to train for (if early stopping is not triggered)
epochs_since_improvement = 0  # keeps track of number of epochs since there's been an improvement in validation BLEU
batch_size = 5
workers = 0  # for data-loading; right now, only 1 works with h5py
encoder_lr = 1e-4  # learning rate for encoder if fine-tuning
decoder_lr = 4e-4  # learning rate for decoder
grad_clip = 5.  # clip gradients at an absolute value of
alpha_c = 1.  # regularization parameter for 'doubly stochastic attention', as in the paper
best_bleu4 = 0.  # BLEU-4 score right now
print_freq = 100  # print training/validation stats every __ batches
fine_tune_encoder = False  # fine-tune encoder?
checkpoint = None  # path to checkpoint, None if none

In [140]:
data_folder = './'#'/media/ksk/Backup/SMILES dataset/'
data_name = 'train.csv'

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

dataset = SmilesDataset(data_folder, data_name, 'TRAIN', transform=transforms.Compose([normalize]))

100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 99627.17it/s]


In [141]:
vocab_size = len(dataset.get_vocab()[0])

In [142]:
train_set, val_set = torch.utils.data.random_split(dataset, [int(len(dataset) * 0.98), len(dataset) - int(len(dataset) * 0.98)])

train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True)

In [143]:
decoder = DecoderWithAttention(attention_dim=attention_dim,
                                       embed_dim=emb_dim,
                                       decoder_dim=decoder_dim,
                                       vocab_size=len(dataset.get_vocab()[0]),
                                       dropout=dropout)
decoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()),
                                     lr=decoder_lr)
encoder = Encoder()
encoder.fine_tune(fine_tune_encoder)
encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                     lr=encoder_lr) if fine_tune_encoder else None


# Move to GPU, if available
decoder = decoder.to(device)
encoder = encoder.to(device)

# Loss function
criterion = nn.CrossEntropyLoss().to(device)

In [144]:
def train(train_loader, encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, epoch):
    """
    Performs one epoch's training.
    :param train_loader: DataLoader for training data
    :param encoder: encoder model
    :param decoder: decoder model
    :param criterion: loss layer
    :param encoder_optimizer: optimizer to update encoder's weights (if fine-tuning)
    :param decoder_optimizer: optimizer to update decoder's weights
    :param epoch: epoch number
    """

    decoder.train()  # train mode (dropout and batchnorm is used)
    encoder.train()

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss (per word decoded)
    top5accs = AverageMeter()  # top5 accuracy

    start = time.time()

    i = 0;
    # Batches
    for (imgs, caps, caplens) in iter(train_loader):
        data_time.update(time.time() - start)

        # Move to GPU, if available
        imgs = imgs.to(device)
        caps = caps.to(device)
        caplens = caplens.to(device)

        # Forward prop.
        imgs = encoder(imgs)
        scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(imgs, caps, caplens)

        # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
        targets = caps_sorted[:, 1:]

        # Remove timesteps that we didn't decode at, or are pads
        # pack_padded_sequence is an easy trick to do this
        scores = pack_padded_sequence(scores, decode_lengths, batch_first=True)[0] # 5045, 69
        targets = pack_padded_sequence(targets, decode_lengths, batch_first=True)[0] # 5045

        # Calculate loss
        loss = criterion(scores, targets)

        # Add doubly stochastic attention regularization
        loss += alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()

        # Back prop.
        decoder_optimizer.zero_grad()
        if encoder_optimizer is not None:
            encoder_optimizer.zero_grad()
        loss.backward()

        # Clip gradients
        if grad_clip is not None:
            clip_gradient(decoder_optimizer, grad_clip)
            if encoder_optimizer is not None:
                clip_gradient(encoder_optimizer, grad_clip)

        # Update weights
        decoder_optimizer.step()
        if encoder_optimizer is not None:
            encoder_optimizer.step()

        # Keep track of metrics
        top5 = accuracy(scores, targets, 5)
        losses.update(loss.item(), sum(decode_lengths))
        top5accs.update(top5, sum(decode_lengths))
        batch_time.update(time.time() - start)

        start = time.time()

        # Print status
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})'.format(epoch, i, len(train_loader),
                                                                          batch_time=batch_time,
                                                                          data_time=data_time, loss=losses,
                                                                          top5=top5accs))
        i += 1

In [145]:
def validate(val_loader, encoder, decoder, criterion):
    """
    Performs one epoch's validation.
    :param val_loader: DataLoader for validation data.
    :param encoder: encoder model
    :param decoder: decoder model
    :param criterion: loss layer
    :return: BLEU-4 score
    """
    decoder.eval()  # eval mode (no dropout or batchnorm)
    if encoder is not None:
        encoder.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    top5accs = AverageMeter()

    start = time.time()

    references = list()  # references (true captions) for calculating BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)

    # explicitly disable gradient calculation to avoid CUDA memory error
    # solves the issue #57
    with torch.no_grad():
        # Batches
        for i, (imgs, caps, caplens) in enumerate(val_loader):

            # Move to device, if available
            imgs = imgs.to(device)
            caps = caps.to(device)
            caplens = caplens.to(device)

            # Forward prop.
            if encoder is not None:
                imgs = encoder(imgs)
            scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(imgs, caps, caplens)

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets = caps_sorted[:, 1:]

            # Remove timesteps that we didn't decode at, or are pads
            # pack_padded_sequence is an easy trick to do this
            scores_copy = scores.clone()
            scores = pack_padded_sequence(scores, decode_lengths, batch_first=True)[0]
            targets = pack_padded_sequence(targets, decode_lengths, batch_first=True)[0]

            # Calculate loss
            loss = criterion(scores, targets)

            # Add doubly stochastic attention regularization
            loss += alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()

            # Keep track of metrics
            losses.update(loss.item(), sum(decode_lengths))
            top5 = accuracy(scores, targets, 5)
            top5accs.update(top5, sum(decode_lengths))
            batch_time.update(time.time() - start)
            
            # bleu4
            #bleu4 = (references, hypothesis) 

            start = time.time()

            if i % print_freq == 0:
                print('Validation: [{0}/{1}]\t'
                      'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(i, len(val_loader), batch_time=batch_time,
                                                                                loss=losses, top5=top5accs))
            


        print(
            '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}\n'.format( # , BLEU-4 - {bleu}
                loss=losses,
                top5=top5accs)) # ,bleu=bleu4

    return losses

In [146]:
best_loss = np.infty

In [147]:
best_loss = 1.0
best_epoch = -1
#save_dir = './save_model'
for epoch in tqdm(range(start_epoch, epochs)): # epochs 
    # Decay learning rate if there is no improvement for 8 consecutive epochs, and terminate training after 20
    if epochs_since_improvement == 20:
        break
    if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
        adjust_learning_rate(decoder_optimizer, 0.8)
        if fine_tune_encoder:
            adjust_learning_rate(encoder_optimizer, 0.8)

    # One epoch's training
    train(train_loader=train_loader,
          encoder=encoder,
          decoder=decoder,
          criterion=criterion,
          encoder_optimizer=encoder_optimizer,
          decoder_optimizer=decoder_optimizer,
          epoch=epoch)
    
    
        
    recent_loss = validate(val_loader=val_loader,
                            encoder=encoder,
                            decoder=decoder,
                            criterion=criterion)
    '''
    # Check if there was an improvement
    is_best = recent_loss.val < best_loss
    best_loss = max(recent_loss.val, best_loss) # max?
    if not is_best:
        epochs_since_improvement += 1
        print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
    else:
        epochs_since_improvement = 0

    # Save checkpoint
    save_checkpoint(data_name, epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer,
                    decoder_optimizer, recent_loss, is_best)
    '''
    if recent_loss.val < best_loss :
        epochs_since_improvement = 0
        best_loss = recent_loss.val
        best_epoch = epoch
       # torch.save(model, f'{save_dir}/{model_name}.pt') # util.py로 옮겨야하나?
    else :
        epochs_since_improvement += 1
        print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
    print('best loss :')
    print('best epoch :')

  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

Epoch: [0][0/20]	Batch Time 3.065 (3.065)	Data Load Time 0.028 (0.028)	Loss 4.2156 (4.2156)	Top-5 Accuracy 2.449 (2.449)


 10%|████████▎                                                                          | 1/10 [01:01<09:13, 61.48s/it]

Validation: [0/1]	Batch Time 0.754 (0.754)	Loss 2.8662 (2.8662)	Top-5 Accuracy 15.152 (15.152)	

 * LOSS - 2.866, TOP-5 ACCURACY - 15.152


Epochs since last improvement: 1

best loss :
best epoch :
Epoch: [1][0/20]	Batch Time 3.072 (3.072)	Data Load Time 0.009 (0.009)	Loss 2.5383 (2.5383)	Top-5 Accuracy 17.615 (17.615)


 20%|████████████████▌                                                                  | 2/10 [02:03<08:12, 61.58s/it]

Validation: [0/1]	Batch Time 0.654 (0.654)	Loss 2.3272 (2.3272)	Top-5 Accuracy 18.182 (18.182)	

 * LOSS - 2.327, TOP-5 ACCURACY - 18.182


Epochs since last improvement: 2

best loss :
best epoch :
Epoch: [2][0/20]	Batch Time 3.312 (3.312)	Data Load Time 0.008 (0.008)	Loss 2.3104 (2.3104)	Top-5 Accuracy 17.619 (17.619)


 30%|████████████████████████▉                                                          | 3/10 [03:05<07:12, 61.75s/it]

Validation: [0/1]	Batch Time 0.533 (0.533)	Loss 1.9468 (1.9468)	Top-5 Accuracy 19.394 (19.394)	

 * LOSS - 1.947, TOP-5 ACCURACY - 19.394


Epochs since last improvement: 3

best loss :
best epoch :
Epoch: [3][0/20]	Batch Time 2.446 (2.446)	Data Load Time 0.008 (0.008)	Loss 1.9221 (1.9221)	Top-5 Accuracy 19.074 (19.074)


 40%|█████████████████████████████████▏                                                 | 4/10 [03:56<05:51, 58.56s/it]

Validation: [0/1]	Batch Time 0.587 (0.587)	Loss 1.9496 (1.9496)	Top-5 Accuracy 19.394 (19.394)	

 * LOSS - 1.950, TOP-5 ACCURACY - 19.394


Epochs since last improvement: 4

best loss :
best epoch :
Epoch: [4][0/20]	Batch Time 2.808 (2.808)	Data Load Time 0.008 (0.008)	Loss 1.9807 (1.9807)	Top-5 Accuracy 18.553 (18.553)


 50%|█████████████████████████████████████████▌                                         | 5/10 [04:45<04:37, 55.60s/it]

Validation: [0/1]	Batch Time 0.558 (0.558)	Loss 1.8304 (1.8304)	Top-5 Accuracy 19.394 (19.394)	

 * LOSS - 1.830, TOP-5 ACCURACY - 19.394


Epochs since last improvement: 5

best loss :
best epoch :
Epoch: [5][0/20]	Batch Time 1.935 (1.935)	Data Load Time 0.007 (0.007)	Loss 2.0616 (2.0616)	Top-5 Accuracy 20.000 (20.000)


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [05:34<03:35, 53.79s/it]

Validation: [0/1]	Batch Time 0.599 (0.599)	Loss 1.6776 (1.6776)	Top-5 Accuracy 19.394 (19.394)	

 * LOSS - 1.678, TOP-5 ACCURACY - 19.394


Epochs since last improvement: 6

best loss :
best epoch :
Epoch: [6][0/20]	Batch Time 2.492 (2.492)	Data Load Time 0.009 (0.009)	Loss 1.4933 (1.4933)	Top-5 Accuracy 20.000 (20.000)


 70%|██████████████████████████████████████████████████████████                         | 7/10 [06:23<02:37, 52.41s/it]

Validation: [0/1]	Batch Time 0.665 (0.665)	Loss 1.5840 (1.5840)	Top-5 Accuracy 19.394 (19.394)	

 * LOSS - 1.584, TOP-5 ACCURACY - 19.394


Epochs since last improvement: 7

best loss :
best epoch :
Epoch: [7][0/20]	Batch Time 2.080 (2.080)	Data Load Time 0.008 (0.008)	Loss 1.6566 (1.6566)	Top-5 Accuracy 19.506 (19.506)


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [07:14<01:43, 51.71s/it]

Validation: [0/1]	Batch Time 0.727 (0.727)	Loss 1.5407 (1.5407)	Top-5 Accuracy 19.394 (19.394)	

 * LOSS - 1.541, TOP-5 ACCURACY - 19.394


Epochs since last improvement: 8

best loss :
best epoch :

DECAYING learning rate.
The new learning rate is 0.000320

Epoch: [8][0/20]	Batch Time 2.500 (2.500)	Data Load Time 0.009 (0.009)	Loss 1.4295 (1.4295)	Top-5 Accuracy 20.000 (20.000)


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [08:04<00:51, 51.20s/it]

Validation: [0/1]	Batch Time 0.623 (0.623)	Loss 1.5822 (1.5822)	Top-5 Accuracy 19.394 (19.394)	

 * LOSS - 1.582, TOP-5 ACCURACY - 19.394


Epochs since last improvement: 9

best loss :
best epoch :
Epoch: [9][0/20]	Batch Time 2.693 (2.693)	Data Load Time 0.008 (0.008)	Loss 2.2206 (2.2206)	Top-5 Accuracy 18.750 (18.750)


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [08:55<00:00, 53.51s/it]

Validation: [0/1]	Batch Time 0.612 (0.612)	Loss 1.5586 (1.5586)	Top-5 Accuracy 19.394 (19.394)	

 * LOSS - 1.559, TOP-5 ACCURACY - 19.394


Epochs since last improvement: 10

best loss :
best epoch :





In [45]:
def predict_seq(image, beam_size=5):    
    k = beam_size

    # Move to GPU device, if available
    image = image.to(device)  # (1, 3, 256, 256)

    # Encode
    encoder_out = encoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
    enc_image_size = encoder_out.size(1)
    encoder_dim = encoder_out.size(3)

    # Flatten encoding
    encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
    num_pixels = encoder_out.size(1)

    # We'll treat the problem as having a beam size of k
    encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

    # Tensor to store top k previous words at each step; now they're just <start>
    k_prev_words = torch.LongTensor([[0]] * k).to(device)  # (k, 1)

    # Tensor to store top k sequences; now they're just <start>
    seqs = k_prev_words  # (k, 1)

    # Tensor to store top k sequences' scores; now they're just 0
    top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

    # Lists to store completed sequences and scores
    complete_seqs = list()
    complete_seqs_scores = list()

    # Start decoding
    step = 1
    h, c = decoder.init_hidden_state(encoder_out)

    # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
    while True:

        embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)

        awe, _ = decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)

        gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
        awe = gate * awe

        h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

        scores = decoder.fc(h)  # (s, vocab_size)
        scores = F.log_softmax(scores, dim=1)

        # Add
        scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

        # For the first step, all k points will have the same scores (since same k previous words, h, c)
        if step == 1:
            top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
        else:
            # Unroll and find top scores, and their unrolled indices
            top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

        # Convert unrolled indices to actual indices of scores
        prev_word_inds = top_k_words // vocab_size  # (s)
        next_word_inds = top_k_words % vocab_size  # (s)

        # Add new words to sequences
        seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

        # Which sequences are incomplete (didn't reach <end>)?
        incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                           next_word != 1]
        complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

        # Set aside complete sequences
        if len(complete_inds) > 0:
            complete_seqs.extend(seqs[complete_inds].tolist())
            complete_seqs_scores.extend(top_k_scores[complete_inds])
        k -= len(complete_inds)  # reduce beam length accordingly

        # Proceed with incomplete sequences
        if k == 0:
            break
        seqs = seqs[incomplete_inds]
        h = h[prev_word_inds[incomplete_inds]]
        c = c[prev_word_inds[incomplete_inds]]
        encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
        top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
        k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

        # Break if things have been going on too long
        if step > 80:
            break
        step += 1
        
    if len(complete_seqs_scores) == 0:
        print('no sequence left')
        return ''

    i = complete_seqs_scores.index(max(complete_seqs_scores))
    seq = complete_seqs[i]
    #predicted_seq = ''.join([dataset.get_vocab()[1][num] for num in seq ])[1:-1]
    
    return np.array(complete_seqs)[np.argsort(complete_seqs_scores)[::-1]]

In [46]:
val_loader = torch.utils.data.DataLoader(val_set, batch_size=1, shuffle=True, num_workers=workers, pin_memory=True)

predicted_seqs = []
true_seqs = []

beam_size = 20

for imgs, caps, caplens in tqdm(val_loader):
    
    predicted_smiles = predict_seq(imgs, beam_size)
    
    for i in range(len(predicted_smiles)): 
        predicted_smile = ''.join([dataset.get_vocab()[1][num] for num in predicted_smiles[i] ])[1:-1]
        m = Chem.MolFromSmiles(predicted_smile)
        
        if m != None:
            break
    
    predicted_seqs.append(predicted_smile)
    true_seqs.append(''.join([dataset.get_vocab()[1][num.item()] for num in caps.squeeze() if num != 2 ])[1:-1])

  0%|                                                                                            | 0/2 [00:00<?, ?it/s]


TypeError: h5py objects cannot be pickled

In [15]:
count = 0
for rid, pred in enumerate(predicted_seqs):    
    if true_seqs[rid] == pred:
        count+=1
print('val_accuracy : ', count/len(predicted_seqs))

ZeroDivisionError: division by zero

### 검증 데이터셋 Tanimoto Similarity

In [None]:
score = []
for i, pred in enumerate(predicted_seqs):
    m1 = Chem.MolFromSmiles(true_seqs[i])
    m2 = Chem.MolFromSmiles(pred)
    
    if m2 != None:
        fp1 = Chem.RDKFingerprint(m1)
        fp2 = Chem.RDKFingerprint(m2)

        similarity = DataStructs.FingerprintSimilarity(fp1,fp2)
    else:
        similarity = 0
    score.append(similarity)
    
print('val_similarity :', np.mean(score))

In [None]:
error_idx = []
for i, pred in enumerate(preds):
    m = Chem.MolFromSmiles(pred)
    if m == None:
        error_idx.append(i)
error_idx = np.array(error_idx)
error_idx_ = error_idx.copy()

In [None]:
drop_error = []
while True:
    error_idx_dict = {}
    for i, e in enumerate(error_idx_):
        error_idx_dict[i] = e
        
    img_name_test_ = np.array(test_img_path)[error_idx_]
    dataset_test_ = tf.data.Dataset.from_tensor_slices((img_name_test_))
    dataset_test_ = dataset_test_.map(lambda item1: tf.numpy_function(map_func_pred, [item1], [tf.float32]), num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset_test_ = dataset_test_.batch(BATCH_SIZE)
    dataset_test_ = dataset_test_.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    
    test_result_ = []
    for batch in dataset_test_:
        test_result_.extend(predict_(batch[0]).T)
    test_result_ = np.array(test_result_)

    preds_ = []
    for rid in range(test_result_.shape[0]):
        pred = ''.join([tokenizer.index_word[i] for i in test_result_[rid] if i not in [0]])
        pred = pred.split('>')[0]
        preds_.append(pred)
    
    for i, pred in enumerate(preds_):
        m = Chem.MolFromSmiles(pred)
        if m != None:
            preds[error_idx_dict[i]] = pred
            drop_idx = np.where(error_idx==error_idx_dict[i])[0]
            drop_error.append(drop_idx[0])
    error_idx_ = np.delete(error_idx, drop_error)
    clear_output(wait=True)
    print(len(list(drop_error)), '/', error_idx.shape[0])
    
    if error_idx.shape[0]-len(list(drop_error)) < 10 :
        break

### 제출

In [None]:
submission = pd.read_csv('sample_submission.csv')
submission['SMILES'] = np.array(preds)
submission

In [None]:
submission.to_csv('Dacon_baseline.csv', index=False)