In [None]:
# Pin the Kaggle CLI to the version this notebook was run with so a fresh runtime
# reproduces the same client; %pip installs into the environment of the running kernel.
%pip install --upgrade --force-reinstall --no-deps kaggle==1.5.12

Processing /root/.cache/pip/wheels/a1/6a/26/d30b7499ff85a4a4593377a87ecf55f7d08af42f0de9b60303/kaggle-1.5.12-cp37-none-any.whl
Installing collected packages: kaggle
  Found existing installation: kaggle 1.5.12
    Uninstalling kaggle-1.5.12:
      Successfully uninstalled kaggle-1.5.12
Successfully installed kaggle-1.5.12


In [None]:
# Copy the API token into ~/.kaggle and restrict it to the owner.
# The same ~-relative path is used throughout so the cell does not depend on running as root.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!ls ~/.kaggle

kaggle.json


In [None]:
# Download the competition archive into ./kaggle.
# NOTE(review): the original command ended with a bare `kaggle` argument; it is treated here
# as the intended download directory (-p). Confirm against where the archive is expected.
!kaggle competitions download -c bms-molecular-translation -p kaggle

In [1]:
import zipfile
# Open the competition zip stored under /content/drive (presumably a mounted Google Drive).
# The handle stays open so later cells can list and extract members from it.
archive = zipfile.ZipFile('/content/drive/MyDrive/bms-molecular-translation.zip')


In [None]:
archive.namelist()

In [4]:
# Unpack only the archive members whose names start with 'test' into ./kaggle.
for member in filter(lambda entry: entry.startswith('test'), archive.namelist()):
    archive.extract(member, path='kaggle')

In [5]:
import tensorflow as tf
import matplotlib.pyplot as plt
import re
import random
import numpy as np
import pandas as pd
import os
import time
from PIL import Image
import glob

In [6]:
# Output vocabulary, in id order: control tokens and the InChI prefix, layer prefixes,
# element symbols (two-letter symbols first), the integers 167 down to 0, then punctuation
# and the isotope letters D/T. A token's position in this list is its integer id.
TOKEN_LIST = [
    "<PAD>", "InChI=1S/", "<START>", "<END>",
    "/c", "/h", "/m", "/t", "/b", "/s", "/i",
    'Si', 'Br', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S', 'C', 'H', 'B',
    *map(str, reversed(range(168))),
    "+", "(", ")", "-", ",", "D", "T",
]

In [None]:
len(TOKEN_LIST)

198

In [7]:
# Forward lookup: token string -> integer id (its position in TOKEN_LIST).
tok_2_int = dict(zip(TOKEN_LIST, range(len(TOKEN_LIST))))
# Reverse lookup: integer id -> token string, derived from the forward table.
int_2_tok = dict(zip(tok_2_int.values(), tok_2_int.keys()))

In [8]:
def _symbol_tokens(text):
  """Tokenize a digit-free run of characters, preferring the longest vocabulary match.

  Two-character tokens ('Si', 'Br', 'Cl') are tried before one-character ones so
  that e.g. 'Cl' is not read as 'C' followed by a dropped 'l'. Characters that do
  not start any vocabulary token are skipped (best effort).
  """
  ids = []
  pos = 0
  while pos < len(text):
    for width in (2, 1):
      piece = text[pos:pos + width]
      if piece in tok_2_int:
        ids.append(tok_2_int[piece])
        pos += len(piece)
        break
    else:
      # No token starts here: skip the character instead of aborting the label.
      pos += 1
  return ids


def convert_to_tensor(label):
  """Convert an InChI string into a list of integer token ids.

  The label is split on '/': the first part is the 'InChI=1S' prefix, the second
  the chemical formula, and every further part a layer introduced by a one-letter
  prefix (looked up as '/c', '/h', ...). The result starts with <START> and ends
  with <END>.

  Args:
    label: InChI string such as 'InChI=1S/C2H5Br/c1-2-3/h2H2,1H3'.

  Returns:
    List of ids taken from tok_2_int.

  Raises:
    KeyError: for a number, layer prefix or single character that is not in the
      vocabulary.
    IndexError: if the label has no formula part or contains an empty layer.
  """
  tokens = [tok_2_int["<START>"]]
  parts = label.split('/')
  tokens.append(tok_2_int[parts[0]+'/'])

  # Formula: alternate between element-symbol runs and atom counts.
  for chunk in re.split(r'(\d+)', parts[1]):
    if chunk.isnumeric():
      tokens.append(tok_2_int[chunk])
    else:
      tokens.extend(_symbol_tokens(chunk))

  # Remaining layers: one-letter prefix followed by numbers, punctuation and symbols.
  for layer in parts[2:]:
    tokens.append(tok_2_int['/'+layer[0]])
    for chunk in re.split(r'(\W+)', layer[1:]):
      if not chunk:
        continue
      if chunk.isnumeric() or len(chunk) < 2:
        # A plain number or a single character is a vocabulary token by itself.
        tokens.append(tok_2_int[chunk])
      elif chunk[0] in '-+,)(':
        # A punctuation run such as '-,' or ')(' : every character is its own token.
        for mark in chunk:
          tokens.append(tok_2_int[mark])
      else:
        # Mixed run such as '11H2': split numbers from symbols.
        for piece in re.split(r'(\d+)', chunk):
          if not piece:
            continue
          if piece.isnumeric() or len(piece) < 2:
            tokens.append(tok_2_int[piece])
          else:
            tokens.extend(_symbol_tokens(piece))
  tokens.append(tok_2_int["<END>"])
  return tokens

In [None]:
import pandas as pd
# Load the training labels table; its columns are image_id and InChI.
labels = pd.read_csv('/content/kaggle/train_labels.csv')

In [None]:
# Order the rows by image_id so they line up with the sorted image paths, and renumber
# the index so that label-based lookups such as names[i] agree with positional order.
labels = labels.sort_values('image_id').reset_index(drop=True)

In [None]:
labels.columns

Index(['image_id', 'InChI'], dtype='object')

In [None]:
# Column 1 holds the InChI strings, column 0 the image ids.
names = labels.iloc[:,1]
# NOTE(review): `id` shadows the Python builtin of the same name from here on.
id = labels.iloc[:,0]

In [None]:
import glob
# Collect every training PNG from the three-level directory tree under train/.
images = glob.glob('/content/kaggle/train/*/*/*/*.png')

In [None]:
# glob returns paths in arbitrary order; sort them so they can be paired with the
# labels, which are sorted by image_id.
images = sorted(images)

In [None]:
# Spot-check that image paths and label rows are aligned.
# .iloc selects by position, matching the positional list index used for `images`.
print(images[1])
print(id.iloc[1])
print(names.iloc[1])

/content/kaggle/train/0/0/0/000019cc0cd2.png
000019cc0cd2
InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(10-14)11-17(23)19-15-4-5-18(24)21(15,3)9-7-16(19)20/h13-16,19H,4-11H2,1-3H3/t13-,14+,15+,16-,19-,20+,21+/m1/s1


In [None]:
# Keep only the first 10000 labels and the first 10000 image paths.
names = names[:10000]
images = images[:10000]

In [None]:
len(images)

10000

In [None]:
len(names)

10000

In [None]:
# Tokenize every label. Iterating over the Series values keeps positional order, so
# name_vectors[k] always belongs to images[k] regardless of the Series index labels.
name_vectors = [convert_to_tensor(name) for name in names]


In [None]:
# Length of the longest token sequence among the tokenized labels.
max_len = max(map(len, name_vectors))
print(max_len)

271


In [None]:
# Pad the token sequences at the end (padding='post'); no maxlen is given, so they are
# padded to the longest sequence. The default pad value 0 is the id of "<PAD>".
name_vectors = tf.keras.preprocessing.sequence.pad_sequences(name_vectors, padding='post')


In [9]:
# InceptionV3 with ImageNet weights and without the classification head.
image_model = tf.keras.applications.InceptionV3(include_top=False,
                                                weights='imagenet')
new_input = image_model.input
# Output of the last remaining layer, used as the image feature map.
hidden_layer = image_model.layers[-1].output

# Model mapping an input image batch to that feature map.
image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
BATCH_SIZE = 8
BUFFER_SIZE = 1000
embedding_dim = 256
units = 512
# Derived from the vocabulary so the embedding and output layers cannot drift out of
# sync with TOKEN_LIST if tokens are added or removed (currently 198).
vocab_size = len(TOKEN_LIST)
num_steps = len(images) // BATCH_SIZE
# Shape of the vector extracted from InceptionV3 is (64, 2048)
# These two variables represent that vector shape
features_shape = 2048
attention_features_shape = 64

In [10]:
def load_image(image_path, name_vectors):
    """Read one image file and return its InceptionV3 feature map with its label.

    The file is decoded to 3 channels, resized to 299x299, run through the
    InceptionV3 preprocessing and then through image_features_extract_model.
    NOTE(review): decoding uses tf.image.decode_jpeg although the notebook globs
    *.png files; confirm this decoder is the intended one.

    Args:
        image_path: path of the image file.
        name_vectors: label for this image; passed through unchanged.

    Returns:
        (features, name_vectors) where features is reshaped to [64, 2048].
    """
    raw_bytes = tf.io.read_file(image_path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    pixels = tf.image.resize(pixels, (299, 299))
    model_input = tf.keras.applications.inception_v3.preprocess_input(pixels)
    # The extractor is called on a batch of one; take that single result back out.
    feature_map = image_features_extract_model(tf.expand_dims(model_input, 0))[0]
    return tf.reshape(feature_map, [64, 2048]), name_vectors

In [None]:
# Pair each image path with its padded token sequence.
image_dataset = tf.data.Dataset.from_tensor_slices((images, name_vectors))
# Replace each path with its extracted feature map (see load_image), in parallel.
image_dataset = image_dataset.map(
  load_image, num_parallel_calls=tf.data.AUTOTUNE)
# Shuffle with a BUFFER_SIZE-element buffer, then group into batches of BATCH_SIZE.
dataset = image_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

In [13]:
class BahdanauAttention(tf.keras.Model):
  """Additive attention: scores each feature location against the decoder state.

  Both inputs are projected to `units` dimensions, summed, passed through tanh
  and reduced to one score per location; the scores are softmax-normalized over
  axis 1 and used to form a weighted sum of the features.
  """

  def __init__(self, units):
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)  # projects the features
    self.W2 = tf.keras.layers.Dense(units)  # projects the decoder state
    self.V = tf.keras.layers.Dense(1)       # one scalar score per location

  def call(self, features, hidden):
    """Return (context_vector, attention_weights).

    Args:
      features: encoder output; axis 1 is the axis attended over
        (assumed (batch_size, 64, embedding_dim) as produced elsewhere in
        this notebook).
      hidden: decoder state (assumed (batch_size, hidden_size)).
    """
    # Insert an axis at position 1 so the state broadcasts against every location.
    query = tf.expand_dims(hidden, 1)

    # Combined representation of each location and the state.
    energies = tf.nn.tanh(self.W1(features) + self.W2(query))

    # Unnormalized score per location, normalized across axis 1.
    attention_weights = tf.nn.softmax(self.V(energies), axis=1)

    # Weighted sum of the features over axis 1.
    context_vector = tf.reduce_sum(attention_weights * features, axis=1)

    return context_vector, attention_weights

In [14]:
class CNN_Encoder(tf.keras.Model):
    """Projects pre-extracted image features to `embedding_dim` dimensions.

    The convolutional features are computed outside this class; this encoder
    only applies a fully connected layer followed by ReLU.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        # Dense acts on the last axis, so the output's last axis is embedding_dim.
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        """Return relu(fc(x))."""
        return tf.nn.relu(self.fc(x))

In [15]:
class RNN_Decoder(tf.keras.Model):
  """Attention-based GRU decoder that predicts one token per call.

  Each call attends over the image features using the previous decoder state,
  concatenates the resulting context vector with the embedding of the previous
  token, advances the GRU by one step and maps its output to vocabulary logits.
  """

  def __init__(self, embedding_dim, units, vocab_size):
    super(RNN_Decoder, self).__init__()
    self.units = units

    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc1 = tf.keras.layers.Dense(self.units)
    self.fc2 = tf.keras.layers.Dense(vocab_size)

    self.attention = BahdanauAttention(self.units)

  def call(self, x, features, hidden):
    """Run one decoding step.

    Args:
      x: ids of the previous token (callers in this notebook pass (batch_size, 1)).
      features: encoder output that is attended over.
      hidden: previous decoder state (reset_state returns (batch_size, units)).

    Returns:
      (logits, state, attention_weights); logits are flattened to two
      dimensions with the vocabulary on the last axis.
    """
    # defining attention as a separate model
    context_vector, attention_weights = self.attention(features, hidden)

    # look up the embedding of the previous token
    x = self.embedding(x)

    # prepend the context vector along the feature axis
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # Continue the GRU from the previous state. Without initial_state the GRU
    # restarts from zeros on every call and the returned state never reflects
    # more than the current step.
    # NOTE(review): weights are unchanged, but a checkpoint trained without the
    # carried state will behave differently at inference; retrain after this fix.
    output, state = self.gru(x, initial_state=hidden)

    x = self.fc1(output)

    # merge the batch and time axes before the output projection
    x = tf.reshape(x, (-1, x.shape[2]))

    # logits over the vocabulary
    x = self.fc2(x)

    return x, state, attention_weights

  def reset_state(self, batch_size):
    """Return an all-zero initial state of shape (batch_size, units)."""
    return tf.zeros((batch_size, self.units))

In [16]:
# Instantiate the two trainable parts of the model with the configured sizes.
encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

In [18]:
# Adam with default settings.
optimizer = tf.keras.optimizers.Adam()
# Cross-entropy on raw logits; reduction='none' keeps one value per example so that
# padded positions can be masked out before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss_function(real, pred):
  """Mean cross-entropy with positions whose target id is 0 zeroed out.

  Args:
    real: target token ids; id 0 is treated as padding.
    pred: logits for those targets.

  Returns:
    Scalar: the mean of the masked per-example losses (masked positions
    contribute 0 to the sum but still count in the mean).
  """
  per_example = loss_object(real, pred)
  is_token = tf.math.logical_not(tf.math.equal(real, 0))
  keep = tf.cast(is_token, dtype=per_example.dtype)
  return tf.reduce_mean(per_example * keep)

In [19]:
# Checkpoints track the encoder, the decoder and the optimizer state.
checkpoint_path = "/content/drive/MyDrive/Kaggle/checkpoints"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer=optimizer)
# Only the most recent checkpoint is kept on disk.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

In [20]:
# Resume from the latest checkpoint if one exists; otherwise start from epoch 0.
start_epoch = 0
if ckpt_manager.latest_checkpoint:
  # The number after the last '-' in the checkpoint path is used as the epoch to resume at.
  start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
  # restoring the latest checkpoint in checkpoint_path
  ckpt.restore(ckpt_manager.latest_checkpoint)

In [None]:
# Per-epoch loss values, appended by the training loop.
loss_plot = []


In [None]:
@tf.function
def train_step(img_tensor, target):
  """Run one teacher-forced optimization step on a batch.

  Args:
    img_tensor: batch of image features fed to the encoder.
    target: batch of padded token id sequences; column 0 is expected to be the
      <START> token since decoding begins from <START> and predicts column 1 onward.

  Returns:
    (loss, total_loss): the loss summed over all decoding steps, and that sum
    divided by the sequence length target.shape[1].
  """
  loss = 0

  # initializing the hidden state for each batch
  # because the captions are not related from image to image
  hidden = decoder.reset_state(batch_size=target.shape[0])

  # First decoder input: one <START> id per example, shaped (batch, 1).
  dec_input = tf.expand_dims([tok_2_int['<START>']] * target.shape[0], 1)

  with tf.GradientTape() as tape:
      features = encoder(img_tensor)

      for i in range(1, target.shape[1]):
          # passing the features through the decoder
          predictions, hidden, _ = decoder(dec_input, features, hidden)

          loss += loss_function(target[:, i], predictions)

          # using teacher forcing
          dec_input = tf.expand_dims(target[:, i], 1)

  total_loss = (loss / int(target.shape[1]))

  trainable_variables = encoder.trainable_variables + decoder.trainable_variables

  # Gradients are taken of the summed loss, not of the length-normalized total_loss.
  gradients = tape.gradient(loss, trainable_variables)

  optimizer.apply_gradients(zip(gradients, trainable_variables))

  return loss, total_loss

In [None]:
EPOCHS = 50

# Train from start_epoch (0, or the epoch recovered from the latest checkpoint) up to EPOCHS.
for epoch in range(start_epoch, EPOCHS):
    start = time.time()
    # `t` times each block of 100 batches; `start` times the whole epoch.
    t = time.time()
    total_loss = 0

    for (batch, (img_tensor, target)) in enumerate(dataset):
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss

        if batch % 100 == 0:
            # Summed step loss divided by the sequence length of this batch.
            average_batch_loss = batch_loss.numpy()/int(target.shape[1])
            print(f'Epoch {epoch+1} Batch {batch} Loss {average_batch_loss:.4f} time {time.time() - t}')
            t = time.time()
    # storing the epoch end loss value to plot later
    loss_plot.append(total_loss / num_steps)
    # `epoch % 1` is always 0, so a checkpoint is written after every epoch.
    if epoch % 1 == 0:
      ckpt_manager.save()

    print(f'Epoch {epoch+1} Loss {total_loss/num_steps:.6f}')
    print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

In [58]:
def evaluate(image, max_length=300):
    """Generate the token sequence for one image file.

    The image is preprocessed like the training images, encoded, and decoded
    one token at a time starting from <START>. Each next token is sampled from
    the predicted distribution with tf.random.categorical, so repeated calls
    may return different results.

    Args:
        image: path of the image file.
        max_length: maximum number of tokens to generate (default 300).

    Returns:
        List of token strings. It ends with '<END>' when the decoder emitted
        it; otherwise it holds exactly max_length tokens without a terminator.
    """
    hidden = decoder.reset_state(batch_size=1)

    img = tf.io.read_file(image)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (299, 299))
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    batch_features = image_features_extract_model(tf.expand_dims(img,0))
    # Flatten the spatial axes: (batch, h, w, channels) -> (batch, h*w, channels).
    img_tensor_val = tf.reshape(batch_features, (batch_features.shape[0],
                                                 -1,
                                                 batch_features.shape[3]))

    features = encoder(img_tensor_val)

    dec_input = tf.expand_dims([tok_2_int['<START>']], 0)
    result = []

    for _ in range(max_length):
        predictions, hidden, _attention = decoder(dec_input,
                                                  features,
                                                  hidden)

        predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()
        result.append(int_2_tok[predicted_id])

        if int_2_tok[predicted_id] == '<END>':
            break

        # Feed the sampled token back in as the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    return result

In [22]:
# Paths of all extracted test PNGs (relative to the working directory).
test_images = glob.glob('kaggle/test/*/*/*/*.png')

In [61]:
# Image id = file name up to its first '.', in the same order as test_images.
image_id = [t.split('/')[-1].split('.')[0] for t in test_images]

In [55]:
# Predicted strings, filled by the inference loop. Re-running this cell clears them.
result = []

In [None]:
# Predict a string for the first 10000 test images, reporting progress every 100.
for image in test_images[:10000]:
  r = evaluate(image)
  # evaluate() can stop at its length limit without emitting <END>; strip the
  # terminator only when it is present so a real final token is never dropped.
  if r and r[-1] == '<END>':
    r = r[:-1]
  result.append(''.join(r))
  if len(result)%100 == 0:
    print(len(result))


In [None]:
# Predictions exist only for the images decoded so far, so pair each with its id by
# position instead of passing the full id list (unequal lengths raise ValueError).
# The column is named 'InChI' to match the label column of train_labels.csv.
df = pd.DataFrame({'image_id': image_id[:len(result)], 'InChI': result})
# Leave out the row index so the file holds only the two named columns.
df.to_csv('submission.csv', index=False)