In [None]:
import os
import pickle as pkl

import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Bidirectional, GRU, Dense, Dropout, Embedding, InputLayer
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

from data import ManyThingsEnFr, create_tokenizer_and_preprocessed_files, create_tf_records
from data import load_datasets, load_tokenizers
from data import zip_directory_and_upload_to_gcs, download_from_gcs_and_unzip_directory

# Widen pandas' column display so long cell values are not truncated.
pd.set_option("display.max_colwidth", 1000)
# tf.test.is_gpu_available() is deprecated; listing the physical GPU devices
# is the supported way to check whether TensorFlow can see a GPU.
print(tf.__version__, bool(tf.config.list_physical_devices("GPU")))

In [None]:
# Project root: the parent of the current working directory at the time this cell runs.
BASE_DIR = os.path.dirname(os.getcwd())
# Local directories (under the project root) for the dataset, tf.summary logs and checkpoints.
DATA_DIRECTORY = os.path.join(BASE_DIR, "data")
LOG_DIRECTORY = os.path.join(BASE_DIR, "logs")
CHECKPOINT_DIRECTORY = os.path.join(BASE_DIR, "checkpoints")
# Prefix handed to the data helpers (presumably a file-name prefix — confirm in data.py).
PREFIX = "manythingsenfr"
# Bucket name handed to the GCS upload/download helpers.
GCS_BUCKET = "nmt-data-store"
# Points GOOGLE_APPLICATION_CREDENTIALS at a JSON key file expected in the project root.
# NOTE: plain assignment overrides any value already present in the environment.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(BASE_DIR, "nmt-gcs-credentials.json")

In [None]:
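# One-off data preparation, intentionally left disabled: uncomment to rebuild the
# tokenizer/preprocessed files and TFRecords locally and re-upload the
# "manythings.enfr.zip" archive that the download cell below fetches.
# create_tokenizer_and_preprocessed_files(ManyThingsEnFr(DATA_DIRECTORY), DATA_DIRECTORY, PREFIX)
# create_tf_records(DATA_DIRECTORY, PREFIX)
# zip_directory_and_upload_to_gcs(DATA_DIRECTORY, GCS_BUCKET, "manythings.enfr.zip")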

In [None]:
# Fetch the dataset archive plus the stored checkpoints and logs of run_1 from
# the bucket, unpacking each archive into its local directory.
for local_directory, archive_name in (
    (DATA_DIRECTORY, "manythings.enfr.zip"),
    (os.path.join(CHECKPOINT_DIRECTORY, "run_1"), "checkpoints.run_1.zip"),
    (os.path.join(LOG_DIRECTORY, "run_1"), "logs.run_1.zip"),
):
    download_from_gcs_and_unzip_directory(local_directory, GCS_BUCKET, archive_name)

In [None]:
class Encoder(Model):
    """Bidirectional GRU encoder that maps a batch of token ids to one context vector.

    Args:
        vocab_size: Size of the source vocabulary (rows of the embedding table).
        embedding_dim: Width of the token embeddings. Defaults to 300.
        units: Hidden units per GRU direction. Defaults to 256. The forward and
            backward outputs are concatenated (Bidirectional's default merge
            mode), so the context vector is ``2 * units`` wide and the decoder's
            GRU must be built with that many units.
    """

    def __init__(self, vocab_size, embedding_dim=300, units=256):
        super().__init__()
        # Single source of truth for the state width used by initial_hidden_state().
        self.units = units
        self.embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.gru = Bidirectional(GRU(units=units))

    def call(self, X, hidden):
        """Encode ``X`` starting from ``hidden``.

        Args:
            X: Batch of source token ids.
            hidden: Initial states for the forward and backward GRU, as
                produced by ``initial_hidden_state``.

        Returns:
            The bidirectional GRU output for the batch (the context vector).
        """
        embedded = self.embedding(X)
        # Pass the state by keyword so it cannot be mistaken for another call argument.
        return self.gru(embedded, initial_state=hidden)

    def initial_hidden_state(self, batch_size):
        """Return zero initial states, one per GRU direction, for ``batch_size`` rows."""
        return [tf.zeros((batch_size, self.units)) for _ in range(2)]

In [None]:
class Decoder(Model):
    """Single-step GRU decoder producing a distribution over the target vocabulary.

    Args:
        vocab_size: Size of the target vocabulary (embedding rows and output classes).
        embedding_dim: Width of the token embeddings. Defaults to 300.
        units: GRU hidden units. Defaults to 512. Must equal the width of the
            state passed to ``call`` (the encoder's context vector on the first step).
    """

    def __init__(self, vocab_size, embedding_dim=300, units=512):
        super().__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(units, return_sequences=False, return_state=False)
        # Softmax output: the loss must be used with from_logits=False (the Keras default).
        self.dense = Dense(vocab_size, activation='softmax')

    def call(self, X, hidden):
        """Run one decoding step.

        Args:
            X: Batch of input token ids for this step.
            hidden: GRU state to start from.

        Returns:
            A tuple ``(probabilities, state)``: the softmax over the vocabulary
            and the GRU state to feed into the next step.
        """
        embedded = self.embedding(X)
        # With return_sequences=False the GRU returns its last output, which for
        # a GRU is also its final hidden state, so it doubles as the next state.
        state = self.gru(embedded, initial_state=hidden)
        return self.dense(state), state

In [None]:
def calculate_loss(loss_fn, y_true, y_pred):
    """Compute the batch-mean loss with padded targets (id 0) contributing zero.

    The padding mask is handed to the loss as ``sample_weight`` rather than
    multiplied in afterwards. Multiplying afterwards only works when
    ``loss_fn`` returns per-example losses. With a Keras loss using its default
    reduction the result is already a scalar that includes the padded
    positions, and the mask would merely rescale it.

    Args:
        loss_fn: A Keras loss object; it must accept ``sample_weight``.
        y_true: Target token ids for one time step; 0 marks padding.
        y_pred: Predicted distribution over the vocabulary for each example.

    Returns:
        Scalar tensor: the masked per-example losses averaged over the whole
        batch, padded rows included in the denominator. This holds for the
        default (sum-over-batch-size) and the ``"none"`` reductions.
    """
    mask = tf.cast(tf.math.not_equal(y_true, 0), dtype=y_pred.dtype)
    loss = loss_fn(y_true, y_pred, sample_weight=mask)
    # No-op for an already reduced scalar; averages the vector when reduction is "none".
    return tf.reduce_mean(loss)

In [None]:
def train_on_batch(en, fr, encoder, decoder, en_tokenizer, fr_tokenizer, loss_fn, optimizer):
    """Run one teacher-forced optimisation step on a batch and return its loss.

    Args:
        en: Batch of source token ids, indexed as (example, position).
        fr: Batch of target token ids, indexed as (example, position); position 0
            is skipped as a target, and decoding starts from ``<sos>``.
        encoder: Model exposing ``initial_hidden_state(batch_size)`` and
            callable as ``encoder(en, state)``.
        decoder: Model callable as ``decoder(step_input, state)`` returning
            ``(predictions, new_state)``.
        en_tokenizer: Unused; kept so existing callers keep working.
        fr_tokenizer: Target tokenizer; its ``word_index`` must contain ``'<sos>'``.
        loss_fn: Loss passed through to ``calculate_loss``.
        optimizer: Optimizer used to apply the gradients.

    Returns:
        The loss averaged over the decoded time steps. Each step's loss is
        already a batch mean, so the sum is divided by the number of steps,
        not by the batch size.
    """
    batch_size = en.shape[0]
    target_length = fr.shape[1]
    encoder_hidden_state = encoder.initial_hidden_state(batch_size)
    loss = 0
    with tf.GradientTape() as tape:
        context = encoder(en, encoder_hidden_state)
        # The encoder's context vector seeds the decoder state.
        decoder_hidden_state = context
        decoder_input = tf.expand_dims([fr_tokenizer.word_index['<sos>']] * batch_size, 1)
        for i in range(1, target_length):
            predictions, decoder_hidden_state = decoder(decoder_input, decoder_hidden_state)
            loss += calculate_loss(loss_fn, fr[:, i], predictions)
            # Teacher forcing: feed the ground-truth token as the next input.
            decoder_input = tf.expand_dims(fr[:, i], 1)
    variables = encoder.trainable_variables + decoder.trainable_variables
    # Gradients are taken on the summed loss; only the reported value is averaged.
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    decoded_steps = max(target_length - 1, 1)
    return loss / decoded_steps

In [None]:
def init_checkpoint(encoder, decoder, optimizer, run_id):
    """Build the checkpoint objects for a run and restore its latest checkpoint.

    The checkpoint tracks a step counter, the optimizer and both models. It is
    managed under ``CHECKPOINT_DIRECTORY/<run_id>``, keeping at most five
    checkpoints. When no checkpoint exists yet, nothing is restored.

    Returns:
        A ``(checkpoint, manager)`` tuple.
    """
    checkpoint_dir = os.path.join(CHECKPOINT_DIRECTORY, run_id)
    checkpoint = tf.train.Checkpoint(
        step=tf.Variable(0),
        optimizer=optimizer,
        encoder=encoder,
        decoder=decoder,
    )
    manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=5)
    latest = manager.latest_checkpoint
    checkpoint.restore(latest)
    if latest:
        print("Restored from {}".format(latest))
    return checkpoint, manager

In [None]:
def train(encoder, decoder, en_tokenizer, fr_tokenizer, dataset_train, dataset_val, run_id, log_directory,
          epochs=10, log_every=1, checkpoint_every=1):
    """Train the encoder/decoder pair, checkpointing and logging as it goes.

    Args:
        encoder: Encoder model.
        decoder: Decoder model.
        en_tokenizer: Source tokenizer, forwarded to ``train_on_batch``.
        fr_tokenizer: Target tokenizer, forwarded to ``train_on_batch``.
        dataset_train: Iterable yielding ``(en, fr)`` batches.
        dataset_val: Currently unused; kept so existing callers keep working.
        run_id: Name of the run; selects the log/checkpoint sub-directories
            and the names of the uploaded archives.
        log_directory: Root directory for summaries; the run's logs are written
            to, and uploaded from, ``log_directory/<run_id>``.
        epochs: Number of passes over ``dataset_train``. Defaults to 10.
        log_every: Print the loss every this many steps; falsy disables it.
            Defaults to 1.
        checkpoint_every: Save a checkpoint every this many steps; falsy
            disables it. Defaults to 1.
    """
    optimizer = Adam()
    # Per-example losses let the padding mask be applied before averaging.
    loss_fn = SparseCategoricalCrossentropy(reduction="none")
    run_log_directory = os.path.join(log_directory, run_id)
    run_checkpoint_directory = os.path.join(CHECKPOINT_DIRECTORY, run_id)
    summary_writer = tf.summary.create_file_writer(run_log_directory)
    ckpt, ckpt_manager = init_checkpoint(encoder, decoder, optimizer, run_id)
    for epoch in range(epochs):
        for batch, (en, fr) in enumerate(dataset_train):
            loss = train_on_batch(en, fr, encoder, decoder, en_tokenizer, fr_tokenizer, loss_fn, optimizer)
            if log_every and int(ckpt.step) % log_every == 0:
                print(f"Epoch: {epoch+1} | Batch: {batch+1} | Loss: {loss}")
            ckpt.step.assign_add(1)
            step = int(ckpt.step)
            if checkpoint_every and step % checkpoint_every == 0:
                save_path = ckpt_manager.save()
                print(f"Saved checkpoint {save_path}")
            with summary_writer.as_default():
                tf.summary.scalar('loss', loss, step=step)
        # Make sure buffered summaries are on disk before the directory is archived.
        summary_writer.flush()
        # Upload the directory the writer actually used, not the global default.
        zip_directory_and_upload_to_gcs(run_log_directory, GCS_BUCKET, f"logs.{run_id}.zip")
        zip_directory_and_upload_to_gcs(run_checkpoint_directory, GCS_BUCKET, f"checkpoints.{run_id}.zip")

In [None]:
# Load the training and validation datasets through the project's data helper.
# NOTE(review): 32 is presumably the batch size — confirm against data.load_datasets.
dataset_train, dataset_val = load_datasets(32, DATA_DIRECTORY, PREFIX)

In [None]:
# Load the source (en) and target (fr) tokenizers via the data helper,
# looked up by DATA_DIRECTORY and PREFIX.
en_tokenizer, fr_tokenizer = load_tokenizers(DATA_DIRECTORY, PREFIX)

In [None]:
# Vocabulary size is the number of known words plus one.
# NOTE(review): the +1 presumably reserves id 0 for padding (the loss masks
# target id 0) — confirm that the tokenizers' word_index starts at 1.
encoder = Encoder(len(en_tokenizer.word_index) + 1)
decoder = Decoder(len(fr_tokenizer.word_index) + 1)

In [None]:
# Train under run id "run_1", the same id whose checkpoints and logs were
# downloaded earlier, so init_checkpoint restores its latest checkpoint if one exists.
train(encoder, decoder, en_tokenizer, fr_tokenizer, dataset_train, dataset_val, "run_1", LOG_DIRECTORY)