# 필요 패키지

In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import numpy as np
import pandas as pd
import os
import time
from tqdm import tqdm

from rdkit import Chem
from rdkit import DataStructs
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')  

from IPython.display import clear_output

# ChemHub에서 수집한 데이터를 이용해 학습

### 제공된 데이터는 ChemHub에서 순서대로 수집한 SMILES로 유사 구조들이 밀집되어 있을 가능성이 높습니다.

In [2]:
PATH = './train/'  # directory prefix prepended to every training image id

In [3]:
# Read the whole training CSV into memory as a single string.
with open('train.csv', 'r') as csv_file:
    data = csv_file.read()

all_captions = []  # one caption string per CSV row
all_img_name_vector = []  # image path for the same row

# [1:-1] skips the first line and the last element produced by the split.
# NOTE(review): this presumably relies on a header row and a trailing newline;
# without a trailing newline the final record would be dropped -- confirm.
for line in data.split('\n')[1:-1]:
    image_id, smiles = line.split(',')  # a row must hold exactly two comma-separated fields
    caption = '<' + smiles + '>'  # wrap the SMILES in '<' / '>' start and end markers
    full_image_path = PATH + image_id  # path of the image on disk

    all_img_name_vector.append(full_image_path)
    all_captions.append(caption)

# Shuffle captions and image paths together; random_state=42 is the seed, not a repeat count.
train_captions, img_name_vector = shuffle(all_captions, all_img_name_vector, random_state=42)

# Number of examples used for training; the baseline used all of the provided data (908765).
num_examples = 10000
train_captions = train_captions[:num_examples]  # keep the first num_examples shuffled items
img_name_vector = img_name_vector[:num_examples]

### 입력에 맞게 이미지를 로드하는 함수 정의

아래 코드는 블라블라...

~~~python
def syntaxHighlight():
    """Return the fixed sample string ``"press"``."""
    return "press"
~~~

In [4]:
def load_image(image_path):
    """Load one image file and prepare it as InceptionV3 input.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A ``(img, image_path)`` pair: the decoded 3-channel image, resized to
        299x299 and passed through InceptionV3's ``preprocess_input``, plus
        the path that was given.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, (299, 299))
    # Apply the preprocessing that belongs to the InceptionV3 model.
    return tf.keras.applications.inception_v3.preprocess_input(resized), image_path

# imagenet으로 pretrain된 InceptionV3를 이용하여 이미지의 feature 추출

In [5]:
# include_top=False leaves out the fully-connected classifier head;
# weights='imagenet' loads ImageNet-pretrained weights (None would mean random initialisation).
image_model = tf.keras.applications.InceptionV3(include_top=False, weights='imagenet')
new_input = image_model.input  # input tensor of the pretrained network
hidden_layer = image_model.layers[-1].output  # output of the network's last layer

# Model that maps an input image batch to the last layer's feature map.
image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

# 빠른 학습을 위해 추출된 feature 로컬디스크에 저장

### InceptionV3에 추출된 feature는 8x8의 feature map이 2048개 존재합니다.
### (8x8x2048)의 feature를 (64x2048)로 reshape하여 sequence data로 사용합니다.

In [6]:
# Unique image paths, in sorted order, whose features will be extracted.
encode_train = sorted(set(img_name_vector))

# Load and preprocess the images in parallel, 16 per batch.
image_dataset = (
    tf.data.Dataset.from_tensor_slices(encode_train)
    .map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(16)
)

for img, path in tqdm(image_dataset):
    batch_features = image_features_extract_model(img)
    # Collapse the two middle axes into one while keeping the first and last axes.
    flat_shape = (batch_features.shape[0], -1, batch_features.shape[3])
    batch_features = tf.reshape(batch_features, flat_shape)

    # np.save appends ".npy", so each feature file is "<image path>.npy"
    # (the same name map_func later loads).
    for bf, p in zip(batch_features, path):
        path_of_feature = p.numpy().decode("utf-8")
        np.save(path_of_feature, bf.numpy())

56798it [40:46:50,  2.58s/it]   


### 예측할 SMILES의 최대 길이 정의
### 학습에 사용하는 SMILES의 길이는 70이하로 샘플링된 상태
### TEST SET은 70이상의 SMILES도 존재합니다.

In [6]:
def calc_max_length(tensor):
    """Return the length of the longest element in *tensor*.

    Args:
        tensor: Iterable of sized items (e.g. caption strings or token lists).

    Returns:
        The largest ``len(item)``, or 0 when *tensor* is empty (a bare
        ``max()`` would raise ``ValueError`` on an empty iterable).
    """
    return max((len(t) for t in tensor), default=0)

In [7]:
# Longest training caption, counted in characters including the '<' and '>' markers.
max_length = calc_max_length(train_captions)

### SMILES 토큰화

In [8]:
# Character-level tokenizer that keeps letter case (lower=False), fitted on the training captions.
tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, char_level=True)
tokenizer.fit_on_texts(train_captions)
top_k = len(tokenizer.word_index)  # number of distinct characters in the fitted vocabulary

In [9]:
# Convert every caption into its sequence of integer token ids.
train_seqs = tokenizer.texts_to_sequences(train_captions)

In [10]:
# Pad the id sequences at the end (padding='post') so they can be stacked into one array.
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')

### 학습, 검증 데이터셋 분리

In [11]:
# Hold out 2% of the (image path, caption) pairs for validation, using a fixed seed.
img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector, cap_vector, test_size=0.02, random_state=42)

In [12]:
# Sanity check: sizes of the training and validation splits.
len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)

(9800, 9800, 200, 200)

### 하이퍼 파라미터 및 학습에 필요한 변수 지정

In [13]:
BATCH_SIZE = 512  # examples per batch in every dataset built below
BUFFER_SIZE = 1000  # shuffle buffer size of the training dataset
embedding_dim = 512  # width of the encoder projection and of the decoder token embedding
units = 1024  # hidden size used by the decoder GRU and the attention layer
vocab_size = top_k + 1  # one more than the tokenizer vocabulary; id 0 is treated as padding by the loss mask
num_steps = len(img_name_train) // BATCH_SIZE  # number of full batches per epoch
features_shape = 2048  # matches the 2048 feature maps mentioned in the notebook text
attention_features_shape = 64  # matches the 64 (8x8) positions mentioned in the notebook text

### 데이터셋 정의 함수

In [14]:
def map_func(img_name, cap):
    """Load the cached feature array stored for one image.

    Args:
        img_name: UTF-8 encoded bytes of the image path; the features are
            read from that path with ``.npy`` appended.
        cap: Caption data, passed through unchanged.

    Returns:
        ``(img_tensor, cap)`` where ``img_tensor`` is the loaded array.
    """
    feature_file = img_name.decode('utf-8') + '.npy'
    return np.load(feature_file), cap

### 학습 데이터셋 준비, BATCH_SIZE만큼 로컬에서 불러와 학습

In [15]:
# Training pipeline: (image path, caption) pairs -> (cached features, caption) batches.
dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))
# map_func uses np.load, so it is wrapped in tf.numpy_function with explicit output dtypes.
dataset = dataset.map(lambda item1, item2: tf.numpy_function(map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

### 검증 데이터셋 준비

In [16]:
# Validation pipeline: same loading as the training dataset, but without shuffling.
dataset_val = tf.data.Dataset.from_tensor_slices((img_name_val, cap_val))
dataset_val = dataset_val.map(lambda item1, item2: tf.numpy_function(map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset_val = dataset_val.batch(BATCH_SIZE)
dataset_val = dataset_val.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# 모델 구축
### Attention Layer 정의

In [17]:
class BahdanauAttention(tf.keras.Model):
    """Additive attention over encoder features, conditioned on a hidden state."""

    def __init__(self, units):
        """Create the three dense projections.

        Args:
            units: Output width of the two projections that are summed before
                scoring.
        """
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Compute the attention-weighted sum of ``features``.

        Args:
            features: Encoder output; presumably (batch, positions, dim) --
                TODO confirm against the caller.
            hidden: State used as the query; presumably (batch, units) --
                TODO confirm.

        Returns:
            ``(context_vector, attention_weights)``: ``features`` weighted and
            summed over axis 1, and the weights produced by a softmax over
            axis 1.
        """
        # Insert an axis at position 1 so `hidden` broadcasts against `features`.
        query = tf.expand_dims(hidden, 1)
        energies = tf.nn.tanh(self.W1(features) + self.W2(query))

        # One score per position, normalised across axis 1.
        attention_weights = tf.nn.softmax(self.V(energies), axis=1)

        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

# CNN_Encoder 정의
### InceptionV3에서 추출된 feature가 입력으로 들어갑니다.

In [18]:
class CNN_Encoder(tf.keras.Model):
    """Project pre-extracted image features with one dense layer followed by ReLU."""

    def __init__(self, embedding_dim):
        """Create the projection layer.

        Args:
            embedding_dim: Output width of the dense projection.
        """
        super().__init__()
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        """Return ``relu(fc(x))``."""
        return tf.nn.relu(self.fc(x))

# RNN_Decoder 정의
### CNN_Encoder의 출력이 Attention Layer를 거쳐 context_vector와 attention_weights로 들어가 SMILES를 예측합니다.
### Decode에는 GRU가 사용되었습니다.

In [19]:
class RNN_Decoder(tf.keras.Model):
    """GRU decoder that attends over encoder features and predicts token logits."""

    def __init__(self, embedding_dim, units, vocab_size):
        """Build the embedding, GRU, output layers and attention module.

        Args:
            embedding_dim: Width of the token embedding.
            units: Hidden size of the GRU, the first dense layer and the attention.
            vocab_size: Number of token ids; also the width of the output logits.
        """
        super().__init__()
        self.units = units

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc1 = tf.keras.layers.Dense(units)
        self.fc2 = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(units)

    def call(self, x, features, hidden):
        """Run one decoding step.

        Args:
            x: Token ids fed to the embedding; presumably (batch, 1) -- TODO confirm.
            features: Encoder output handed to the attention layer.
            hidden: State used as the attention query.

        Returns:
            ``(logits, state, attention_weights)``: the output of ``fc2`` after
            flattening to two dimensions, the GRU's returned state, and the
            attention weights.
        """
        context_vector, attention_weights = self.attention(features, hidden)

        embedded = self.embedding(x)
        # Prepend-by-concatenation: context (with an added axis 1) joined to the
        # embedding along the last axis.
        gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

        # NOTE(review): `hidden` is only used for attention; the GRU is called
        # without an initial state -- confirm this is intended.
        output, state = self.gru(gru_input)

        projected = self.fc1(output)
        flattened = tf.reshape(projected, (-1, projected.shape[2]))

        return self.fc2(flattened), state, attention_weights

    def reset_state(self, batch_size):
        """Return an all-zero state of shape ``(batch_size, units)``."""
        return tf.zeros((batch_size, self.units))

### 모델 생성

In [20]:
# Instantiate the two trainable sub-models with the hyperparameters defined earlier.
encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

# 학습
### Optimizer & Loss Function 정의
Optimizer로는 Adam, Loss Function은 SparseCategoricalCrossentropy를 사용하였습니다.

In [21]:
optimizer = tf.keras.optimizers.Adam()  # Adam with its default settings
# from_logits=True: predictions are raw logits; reduction='none' keeps the
# loss unreduced so loss_function can mask padding positions itself.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy averaged over the batch, with padding targets zeroed out.

    Args:
        real: Target token ids; positions equal to 0 are treated as padding.
        pred: Logits passed to ``loss_object`` together with ``real``.

    Returns:
        The mean of the masked loss values. Masked positions contribute 0 to
        the sum but still count in the mean's denominator.
    """
    per_example = loss_object(real, pred)

    not_padding = tf.math.not_equal(real, 0)
    per_example *= tf.cast(not_padding, dtype=per_example.dtype)

    return tf.reduce_mean(per_example)

### CheckPoint 지정

In [22]:
checkpoint_path = "./checkpoints/train"  # directory managed by the CheckpointManager
# Track encoder, decoder and optimizer together so all three are saved and restored as one unit.
ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder, optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=25)  # keep at most 25 checkpoints

### 저장된 CheckPoint가 있으면 복원합니다.

In [23]:
start_epoch = 0
if ckpt_manager.latest_checkpoint:
    # The text after the last '-' in the checkpoint path is parsed as the epoch to resume from.
    # Presumably this is the save counter, which equals the number of completed
    # epochs because one checkpoint is saved per epoch -- TODO confirm.
    start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
    ckpt.restore(ckpt_manager.latest_checkpoint)

### Loss 기록용 List

In [24]:
loss_plot = []  # mean loss per epoch, appended by the training loop

### 학습 정의

In [25]:
@tf.function
def train_step(img_tensor, target, validation=False):
    """Run one teacher-forced pass over a batch and, unless validating, update the weights.

    Args:
        img_tensor: Batch of cached image features fed to the encoder.
        target: Batch of padded token-id sequences. Column 0 is skipped as a
            prediction target; it is presumably the '<' start token -- TODO confirm.
        validation: Python bool. When True the loss is computed but no
            gradients are applied, so the model is left untouched. Previously
            this flag was accepted but ignored. Pass a plain Python bool (not a
            tensor) so the branch is resolved when the function is traced.

    Returns:
        ``(loss, total_loss)``: the per-step losses summed over the sequence,
        and that sum divided by the sequence length.
    """
    loss = 0

    hidden = decoder.reset_state(batch_size=target.shape[0])

    # Every sequence starts decoding from the '<' token.
    dec_input = tf.expand_dims([tokenizer.word_index['<']] * target.shape[0], 1)

    with tf.GradientTape() as tape:
        features = encoder(img_tensor)

        for i in range(1, target.shape[1]):
            predictions, hidden, _ = decoder(dec_input, features, hidden)

            loss += loss_function(target[:, i], predictions)

            # Teacher forcing: feed the ground-truth token as the next input.
            dec_input = tf.expand_dims(target[:, i], 1)

    if not validation:
        trainable_variables = encoder.trainable_variables + decoder.trainable_variables

        gradients = tape.gradient(loss, trainable_variables)

        optimizer.apply_gradients(zip(gradients, trainable_variables))

    total_loss = (loss / int(target.shape[1]))

    return loss, total_loss

25 EPOCHS 학습 시켰으며 검증 데이터셋을 이용한 확인은 진행되지 않았습니다.

In [26]:
EPOCHS = 25

# Starts at start_epoch, which is non-zero when a checkpoint was restored.
for epoch in range(start_epoch, EPOCHS):
    start = time.time()
    total_loss = 0

    for (batch, (img_tensor, target)) in enumerate(dataset):
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss  # accumulate the length-normalised batch loss

        # Progress line every 100 batches.
        if batch % 100 == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(
              epoch + 1, batch, batch_loss.numpy() / int(target.shape[1])))

    # Mean of the per-batch losses for this epoch.
    # NOTE(review): `batch` is unbound if `dataset` yields nothing -- confirm it is never empty.
    loss_plot.append(total_loss / (batch+1))

    ckpt_manager.save()  # one checkpoint per epoch

    print ('Epoch {} Loss {:.6f}'.format(epoch + 1,
                                         total_loss/(batch+1)))

    print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

KeyboardInterrupt: 

In [None]:
# Plot the per-epoch mean loss collected in loss_plot.
plt.plot(loss_plot, label='loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss Plot')
plt.legend()
plt.show()

# 검증
### Predict 함수 정의

In [None]:
# Greedy decoding: always pick the most probable token.
def predict(img_tensor):
    """Decode a batch of image features by taking the argmax at every step.

    Args:
        img_tensor: Batch of cached image features; its first dimension is
            used as the batch size.

    Returns:
        Array with one row per decoding step (``max_length`` rows) and one
        column per image, holding the chosen token ids.
    """
    n_images = img_tensor.shape[0]
    features = encoder(img_tensor)
    hidden = decoder.reset_state(batch_size=n_images)
    # Decoding starts from the '<' token for every image.
    dec_input = tf.expand_dims([tokenizer.word_index['<']] * n_images, 1)

    steps = []
    for _step in range(max_length):
        logits, hidden, _ = decoder(dec_input, features, hidden)
        token_ids = np.argmax(logits, axis=1)
        steps.append(token_ids)
        # The chosen tokens become the next step's input.
        dec_input = tf.expand_dims(token_ids, 1)

    return np.array(steps)

# Sampled decoding: draw each token from the predicted distribution.
def predict_(img_tensor):
    """Decode a batch of image features by sampling one token per step.

    Args:
        img_tensor: Batch of cached image features; its first dimension is
            used as the batch size.

    Returns:
        Array with one row per decoding step (``max_length`` rows) and one
        column per image, holding the sampled token ids. Results differ
        between calls because each token is drawn with
        ``tf.random.categorical``.
    """
    n_images = img_tensor.shape[0]
    features = encoder(img_tensor)
    hidden = decoder.reset_state(batch_size=n_images)
    # Decoding starts from the '<' token for every image.
    dec_input = tf.expand_dims([tokenizer.word_index['<']] * n_images, 1)

    steps = []
    for _step in range(max_length):
        logits, hidden, _ = decoder(dec_input, features, hidden)
        # One sample per row of logits.
        token_ids = tf.random.categorical(logits, 1)[:, 0].numpy()
        steps.append(token_ids)
        dec_input = tf.expand_dims(token_ids, 1)

    return np.array(steps)

def map_func_pred(img_name):
    """Load the cached feature array for one image (prediction-time variant).

    Args:
        img_name: UTF-8 encoded bytes of the image path; the features are
            read from that path with ``.npy`` appended.

    Returns:
        The loaded array.
    """
    feature_file = img_name.decode('utf-8') + '.npy'
    return np.load(feature_file)

### 가장 높은 확률로 검증셋을 예측

In [None]:
# Greedy predictions for the whole validation set, one row per example.
val_result = []
for batch in tqdm(dataset_val):
    # batch[0] is the feature tensor; predict returns (steps, batch), so transpose before extending.
    val_result.extend(predict(batch[0]).T)
val_result = np.array(val_result)

### 예측 결과 SMILES로 변환

In [None]:
# Decode each predicted id sequence into a SMILES string.
preds = []
for rid in range(cap_val.shape[0]):
    # Id 0 is the padding id and has no entry in tokenizer.index_word, so a
    # predicted 0 would raise KeyError; skip it, as the re-prediction loop does.
    pred = ''.join([tokenizer.index_word[i] for i in val_result[rid] if i != 0])
    pred = pred.split('>')[0]  # keep only the text before the '>' end marker
    preds.append(pred)

### 예측 결과가 SMILES 규칙에 맞는지 검사

In [None]:
# Collect the positions of predictions that RDKit cannot parse.
error_idx = []
for i, pred in enumerate(preds):
    m = Chem.MolFromSmiles(pred)
    if m is None:
        error_idx.append(i)
# Force an integer dtype: np.array([]) would otherwise be float64, and a float
# array cannot be used to index other arrays when no prediction is invalid.
error_idx = np.array(error_idx, dtype=int)
error_idx_ = error_idx.copy()  # working copy that shrinks as predictions get fixed

##### SMILES 규칙에 맞지 않는 데이터는 확률이 높은 다른 값들로 다시 예측
##### 대부분의 결과가 SMILES 규칙에 맞게 예측할 때까지 반복

In [None]:
# Positions within error_idx whose prediction has been replaced by a valid one.
drop_error = []
# NOTE(review): there is no iteration cap; the loop only ends once fewer than
# 10 invalid predictions remain, so it could run indefinitely -- confirm.
while True:
    # Map position within this round's subset -> index into preds.
    error_idx_dict = {}
    for i, e in enumerate(error_idx_):
        error_idx_dict[i] = e

    # Dataset holding only the validation examples that are still invalid.
    img_name_val_, cap_val_ = np.array(img_name_val)[error_idx_], np.array(cap_val)[error_idx_]
    dataset_val_ = tf.data.Dataset.from_tensor_slices((img_name_val_, cap_val_))
    dataset_val_ = dataset_val_.map(lambda item1, item2: tf.numpy_function(map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset_val_ = dataset_val_.batch(BATCH_SIZE)
    dataset_val_ = dataset_val_.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    # Re-predict with sampling (predict_) so a different sequence can come out each round.
    val_result_ = []
    for batch in dataset_val_:
        val_result_.extend(predict_(batch[0]).T)
    val_result_ = np.array(val_result_)

    # Decode the sampled ids, skipping the padding id 0.
    preds_ = []
    for rid in range(val_result_.shape[0]):
        pred = ''.join([tokenizer.index_word[i] for i in val_result_[rid] if i not in [0]])
        pred = pred.split('>')[0]
        preds_.append(pred)

    # Accept every re-prediction that RDKit can parse and mark its entry as fixed.
    for i, pred in enumerate(preds_):
        m = Chem.MolFromSmiles(pred)
        if m != None:
            preds[error_idx_dict[i]] = pred
            drop_idx = np.where(error_idx==error_idx_dict[i])[0]
            drop_error.append(drop_idx[0])
    # Remaining invalid indices = original error list minus everything fixed so far.
    error_idx_ = np.delete(error_idx, drop_error)
    clear_output(wait=True)
    print(len(list(drop_error)), '/', error_idx.shape[0])  # fixed / originally invalid

    # Stop once fewer than 10 invalid predictions remain.
    if error_idx.shape[0]-len(list(drop_error)) < 10 :
        break

### 검증 데이터셋  Accuracy

In [None]:
# Exact-match accuracy: a prediction counts only if it equals the reference string.
answer = []
count = 0
for rid, pred in enumerate(preds):
    # Rebuild the reference caption from its ids, dropping padding (id 0),
    # then strip the first and last characters (the '<' and '>' markers).
    true = ''.join(tokenizer.index_word[i] for i in cap_val[rid] if i != 0)[1:-1]
    answer.append(true)
    count += int(true == pred)
print('val_accuracy : ', count/cap_val.shape[0])

### 검증 데이터셋 Tanimoto Similarity

In [None]:
# Fingerprint similarity between each reference SMILES and its prediction.
score = []
for i, pred in enumerate(preds):
    m1 = Chem.MolFromSmiles(answer[i])
    m2 = Chem.MolFromSmiles(pred)

    # Score 0 when either string cannot be parsed; RDKFingerprint(None) would raise.
    if m1 is not None and m2 is not None:
        fp1 = Chem.RDKFingerprint(m1)
        fp2 = Chem.RDKFingerprint(m2)

        similarity = DataStructs.FingerprintSimilarity(fp1, fp2)
    else:
        similarity = 0
    score.append(similarity)

print('val_similarity :', np.mean(score))

# TestSet 예측
### Test 이미지 위치

In [None]:
TEST_PATH = './test/'  # directory prefix prepended to every test image id

# The sample submission file supplies the list of test image ids.
with open('sample_submission.csv', 'r') as csv_file:
    data = csv_file.read()

test_img_path = []

# [1:-1] skips the first line and the last element produced by the split.
# NOTE(review): this presumably relies on a header row and a trailing newline;
# without a trailing newline the final record would be dropped -- confirm.
for line in data.split('\n')[1:-1]:
    image_id, _ = line.split(',')  # only the first column (image id) is used
    full_image_path = TEST_PATH + image_id

    test_img_path.append(full_image_path)

### 빠른 학습을 위해 InceptionV3에서 추출된 feature를 로컬디스크에 저장

In [None]:
# Unique test image paths, in sorted order, whose features will be extracted.
encode_train = sorted(set(test_img_path))

# Load and preprocess the images in parallel, 16 per batch.
image_dataset = (
    tf.data.Dataset.from_tensor_slices(encode_train)
    .map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(16)
)

for img, path in tqdm(image_dataset):
    batch_features = image_features_extract_model(img)
    # Collapse the two middle axes into one while keeping the first and last axes.
    flat_shape = (batch_features.shape[0], -1, batch_features.shape[3])
    batch_features = tf.reshape(batch_features, flat_shape)

    # np.save appends ".npy", so each feature file is "<image path>.npy"
    # (the same name map_func_pred later loads).
    for bf, p in zip(batch_features, path):
        path_of_feature = p.numpy().decode("utf-8")
        np.save(path_of_feature, bf.numpy())

### 배치사이즈로 데이터셋 준비

In [None]:
# Test pipeline: image path -> cached features, batched for prediction (no captions, no shuffling).
dataset_test = tf.data.Dataset.from_tensor_slices((test_img_path))
dataset_test = dataset_test.map(lambda item1: tf.numpy_function(map_func_pred, [item1], [tf.float32]), num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset_test = dataset_test.batch(BATCH_SIZE)
dataset_test = dataset_test.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

### 결과 예측

In [None]:
# Greedy predictions for the whole test set, one row per image.
test_result = []
for batch in tqdm(dataset_test):
    # batch[0] is the feature tensor; predict returns (steps, batch), so transpose before extending.
    test_result.extend(predict(batch[0]).T)
test_result = np.array(test_result)

### SMILES로 변환

In [None]:
# Decode each predicted id sequence into a SMILES string.
preds = []
for rid in range(len(test_img_path)):
    # Id 0 is the padding id and has no entry in tokenizer.index_word, so a
    # predicted 0 would raise KeyError; skip it, as the re-prediction loop does.
    pred = ''.join([tokenizer.index_word[i] for i in test_result[rid] if i != 0])
    pred = pred.split('>')[0]  # keep only the text before the '>' end marker
    preds.append(pred)

### 검증 데이터셋과 마찬가지로 SMILES 규칙을 만족하지 않은 결과 재예측

In [None]:
# Collect the positions of test predictions that RDKit cannot parse.
error_idx = []
for i, pred in enumerate(preds):
    m = Chem.MolFromSmiles(pred)
    if m is None:
        error_idx.append(i)
# Force an integer dtype: np.array([]) would otherwise be float64, and a float
# array cannot be used to index other arrays when no prediction is invalid.
error_idx = np.array(error_idx, dtype=int)
error_idx_ = error_idx.copy()  # working copy that shrinks as predictions get fixed

In [None]:
# Positions within error_idx whose prediction has been replaced by a valid one.
drop_error = []
# NOTE(review): there is no iteration cap; the loop only ends once fewer than
# 10 invalid predictions remain, so it could run indefinitely -- confirm.
while True:
    # Map position within this round's subset -> index into preds.
    error_idx_dict = {}
    for i, e in enumerate(error_idx_):
        error_idx_dict[i] = e

    # Dataset holding only the test images whose prediction is still invalid.
    img_name_test_ = np.array(test_img_path)[error_idx_]
    dataset_test_ = tf.data.Dataset.from_tensor_slices((img_name_test_))
    dataset_test_ = dataset_test_.map(lambda item1: tf.numpy_function(map_func_pred, [item1], [tf.float32]), num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset_test_ = dataset_test_.batch(BATCH_SIZE)
    dataset_test_ = dataset_test_.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    # Re-predict with sampling (predict_) so a different sequence can come out each round.
    test_result_ = []
    for batch in dataset_test_:
        test_result_.extend(predict_(batch[0]).T)
    test_result_ = np.array(test_result_)

    # Decode the sampled ids, skipping the padding id 0.
    preds_ = []
    for rid in range(test_result_.shape[0]):
        pred = ''.join([tokenizer.index_word[i] for i in test_result_[rid] if i not in [0]])
        pred = pred.split('>')[0]
        preds_.append(pred)

    # Accept every re-prediction that RDKit can parse and mark its entry as fixed.
    for i, pred in enumerate(preds_):
        m = Chem.MolFromSmiles(pred)
        if m != None:
            preds[error_idx_dict[i]] = pred
            drop_idx = np.where(error_idx==error_idx_dict[i])[0]
            drop_error.append(drop_idx[0])
    # Remaining invalid indices = original error list minus everything fixed so far.
    error_idx_ = np.delete(error_idx, drop_error)
    clear_output(wait=True)
    print(len(list(drop_error)), '/', error_idx.shape[0])  # fixed / originally invalid

    # Stop once fewer than 10 invalid predictions remain.
    if error_idx.shape[0]-len(list(drop_error)) < 10 :
        break

### 제출

In [None]:
# Start from the sample submission and overwrite its SMILES column with the predictions.
submission = pd.read_csv('sample_submission.csv')
submission['SMILES'] = np.array(preds)
submission  # display the frame in the notebook

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('Dacon_baseline.csv', index=False)