##### Copyright 2019 The TensorFlow Authors.

notebook original tutorial: https://www.tensorflow.org/tutorials/text/nmt_with_attention

In [0]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [3]:
!pip install transformers
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
from transformers import RobertaTokenizer, TFRobertaModel


Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |████████████████████████████████| 573kB 2.8MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 9.0MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 16.8MB/s 
[?25hCollecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     

In [4]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
#check that files needed are in the right location
from pathlib import Path

colab_path = '/content/drive'

training_data = '/pairs_fr_eng.txt'

# this roBERTa model was trained on the English unaligned data in pytorch, then converted to tensorflow
roberta_eng_model = '/English_small'

#not used, placeholder if decide to put roBERTa french in GRU decoder
roberta_fr_model = ''

training_data_path = colab_path + "/My Drive/Colab Notebooks/carocode" + training_data
roBERTa_eng_model_path = colab_path + "/My Drive/Colab Notebooks/carocode" + roberta_eng_model

target_gold_path = Path(colab_path + "/My Drive/Colab Notebooks/carocode" + roberta_eng_model + "_target_gold.txt")
target_predicted_path = Path(colab_path + "/My Drive/Colab Notebooks/carocode" + roberta_eng_model + "_target_predicted.txt")
input_gold_path = Path(colab_path + "/My Drive/Colab Notebooks/carocode" + roberta_eng_model + "_input_gold.txt")

In [0]:
# Hyperparameters

num_examples = 11000

# when fine_tuning roBERTa, batch_size is 16. 
# when using roBERTa as a feature extractor, batch_size is 64
BATCH_SIZE = 16
embedding_dim = 100   #hidden dimension size 
units = 252 # to match BERT hidden dim (roBERTa_small = 252, roBERTa_large=768)

EPOCHS = 2

In [0]:
def preprocess_sentence(w):
    w = w.strip()
    w = '<start> ' + w + ' <end>'
    return w

In [0]:
# 1. Clean the sentences
# 2. Return word pairs in the format: [ENGLISH, FRENCH]
def create_dataset(path, num_examples):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]

    return zip(*word_pairs)

In [20]:
fr, en = create_dataset(training_data_path, None)
print(fr[-1])
print(en[-1])

<start> Je conviens que nous avons besoin d' un agenda social ambitieux qui englobera la lutte contre la pauvreté et l' exclusion sociale . <end>
<start> i agree that we need an ambitious social agenda which will include combating poverty and social exclusion <end>


In [0]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [0]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='', oov_token="<oov>", lower=False)
    lang_tokenizer.fit_on_texts(lang) #Updates internal vocabulary based on a list of texts 

    tensor = lang_tokenizer.texts_to_sequences(lang)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post')

    return tensor, lang_tokenizer


In [0]:
def roberta_tokenize(text, roberta_path):
    roberta_tokenizer = RobertaTokenizer.from_pretrained(roberta_path)
    input_ids = roberta_tokenizer.batch_encode_plus(text,pad_to_max_length=True )["input_ids"]  
    return input_ids, roberta_tokenizer

In [0]:
def load_dataset(path, roberta_eng, roberta_fr, num_examples=None):
    # creating cleaned input, output pairs
    # targ -> french, inp -> emglish
    targ_lang, inp_lang = create_dataset(path, num_examples)
    
    input_train, input_val, target_train, target_val = train_test_split(inp_lang, targ_lang, test_size=0.2, random_state=1234)

    input_tensor_train, inp_lang_tokenizer_train = roberta_tokenize(input_train, roberta_eng)
    input_tensor_val, inp_lang_tokenizer_val = roberta_tokenize(input_val, roberta_eng)
    
    target_tensor_train, targ_lang_tokenizer_train = tokenize(target_train)
    target_tensor_val, targ_lang_tokenizer_val = tokenize(target_val)
    
    return input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val, inp_lang_tokenizer_train, inp_lang_tokenizer_val, targ_lang_tokenizer_train, targ_lang_tokenizer_val, input_val, target_val

In [0]:
def convert(lang, tensor):
    for t in tensor:
        print(t)
        if t!=0: #if not a padding token
            print ("%d ----> %s" % (t, lang.index_word[t]))

In [28]:
#inp_lang and targ_lang are tokenizer objects
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val, inp_lang, inp_lang_val,targ_lang, targ_lang_val, input_gold_val, targ_gold_val = load_dataset(training_data_path, roBERTa_eng_model_path, roberta_fr_model, num_examples)

# Calculate max_length of the target tensors
max_length_inp_train, max_length_inp_val = max_length(input_tensor_train), max_length(input_tensor_val)
max_length_targ_train, max_length_targ_val = max_length(target_tensor_train), max_length(target_tensor_val)

max_length_inp = max(max_length_inp_train, max_length_inp_val)

max_length_targ = max(max_length_targ_train, max_length_targ_val)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

8800 8800 2200 2200


In [0]:
BUFFER_SIZE = len(input_tensor_train)
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
vocab_tar_size = len(targ_lang.word_index)+1
vocab_inp_size = inp_lang.vocab_size

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [30]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape
mask = example_input_batch != 0
mask.shape
print("Vocab size: ", vocab_inp_size)

Vocab size:  20000


In [0]:
class EncoderWithRoberta(tf.keras.Model):
    def __init__(self, roberta_path, embedding_dim, enc_units, batch_sz):
        super(EncoderWithRoberta, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        
        self.roberta = TFRobertaModel.from_pretrained(roberta_path)
        
         #self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        mask = x != 0
        #roBERTa fine-tuned
        roberta_hiddens = self.roberta(x, attention_mask=mask)[0] # The last hidden-state is the first element of the output tuple
        #roBERTa for feature extraction
        #roberta_hiddens = tf.stop_gradient(self.roberta(x)[0], name="roberta_hiddens") # The last hidden-state is the first element of the output tuple
        output, state = self.gru(roberta_hiddens, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [34]:
encoder = EncoderWithRoberta(roBERTa_eng_model_path, embedding_dim, units, BATCH_SIZE)
print("made model")
# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

made model
Encoder output shape: (batch size, sequence length, units) (16, 110, 252)
Encoder Hidden state shape: (batch size, units) (16, 252)


In [0]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [36]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (16, 252)
Attention weights shape: (batch_size, sequence_length, 1) (16, 110, 1)


In [0]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

In [38]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (16, 16297)


## Define the optimizer and the loss function

In [0]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_mean(loss_)

## Checkpoints (Object-based saving)

In [0]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [41]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
          # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            
            loss += loss_function(targ[:, t], predictions)

          # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

tf.config.experimental.list_physical_devices('GPU')
print(tf.test.is_gpu_available())
print(tf.test.is_built_with_cuda())

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True
True


In [42]:
# Training
print(tf.config.experimental.list_physical_devices('GPU'))
print(tf.test.is_gpu_available())
print(tf.test.is_built_with_cuda())

for epoch in range(EPOCHS):
    start = time.time()
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                       batch,
                                                       batch_loss.numpy()))
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

checkpoint.save(file_prefix = checkpoint_prefix)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
True
True
Epoch 1 Batch 0 Loss 1.6368
Epoch 1 Batch 100 Loss 1.3989
Epoch 1 Batch 200 Loss 1.4259
Epoch 1 Batch 300 Loss 1.2599
Epoch 1 Batch 400 Loss 1.3759
Epoch 1 Batch 500 Loss 1.2188
Epoch 1 Loss 1.3455
Time taken for 1 epoch 635.9484951496124 sec

Epoch 2 Batch 0 Loss 1.1031
Epoch 2 Batch 100 Loss 1.1485
Epoch 2 Batch 200 Loss 1.3118
Epoch 2 Batch 300 Loss 1.1515
Epoch 2 Batch 400 Loss 1.1731
Epoch 2 Batch 500 Loss 1.3768
Epoch 2 Loss 1.1599
Time taken for 1 epoch 465.74996757507324 sec



'./training_checkpoints/ckpt-1'

In [0]:
def evaluate(inputs):
    sentence = inputs

    result = ''
    inputs = inp_lang.encode(inputs, return_tensors="tf",pad_to_max_length=True )
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))

        predicted_id = tf.argmax(predictions[0]).numpy()

        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence
        
        result += targ_lang.index_word[predicted_id] + ' '
        
        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence

In [0]:
def translate(sentence):
    result, sentence = evaluate(sentence)
    #print("FRENCH TRANSLATION " + result)
    return result

## Restore the latest checkpoint and test

In [45]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fd1b64867b8>

In [46]:
count = 0
with open(target_gold_path, 'w', encoding='utf-8') as target_file_gold, \
open(target_predicted_path, 'w', encoding='utf-8') as target_file_predicted, \
open(input_gold_path, 'w', encoding='utf-8') as input_file_gold:

    for sent in input_gold_val:
        count+=1
        if count%50==0:
            print(count)  
        sent = sent.replace('<start> ', "")
        sent = sent.replace(' <end>', "")
        sent = sent.replace('<oov>', "")
        input_file_gold.write(sent.strip() + '\n') #writing the gold english sentences to file
        res = translate(sent)
        target_file_predicted.write(res.strip() + '\n') #writing the predicted french sentences to file 
    
    for line in targ_gold_val:
        line = line.replace('<start> ', "")
        line = line.replace(' <end>', "")
        line = line.replace('<oov>', "")
        target_file_gold.write(line.strip() + '\n') #writing the gold french sentences to file
        

50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
