In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Main Iterative Back Translation Notebook

## Create Dataset for Training

In [2]:
import pandas as pd
import numpy as np
import re
df = pd.read_parquet("hf://datasets/shahules786/PoetryFoundationData/data/train-00000-of-00001-486832872ed96d17.parquet")

print(f"dataframe columns: {df.columns}")

newyork = df[df['author'].isin(["John Ashbery", "Barbara Guest", "James Schuyler", "Kenneth Koch", "Frank O'Hara"])]
shake = df[df['author'] == 'William Shakespeare']

print(f"Shakespeare: {len(shake)} examples\nNew Yorkers: {len(newyork)} examples")
print(f"Shakespeare avg length: {np.average([len(poem) for poem in shake['content']])}\nNew Yorkers avg length: {np.average([len(poem) for poem in newyork['content']])}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


dataframe columns: Index(['poem name', 'content', 'author', 'type', 'age'], dtype='object')
Shakespeare: 85 examples
New Yorkers: 81 examples
Shakespeare avg length: 1468.5058823529412
New Yorkers avg length: 1810.6049382716049


In [3]:
def process_poem(poem) :
  proc = re.sub(r'[\r\n]+', ' ', poem)
  proc = re.sub(r'\s+', ' ', proc)
  sentences = re.split(r'(?<=[.!?:;])\s+', proc)
  sentences = [sentence for sentence in sentences if len(sentence) > 0]
  return sentences
process_poem("this is a sentence. This is -another :SENTENCE!!!!!\nAND this is a question? again this is a sentence; and another separated by a semicolon: and a last one with a colon.")

['this is a sentence.',
 'This is -another :SENTENCE!!!!!',
 'AND this is a question?',
 'again this is a sentence;',
 'and another separated by a semicolon:',
 'and a last one with a colon.']

In [4]:
# Create Datasets for training


newyork_processed = []
for i in range(len(newyork)) :
   newyork_processed += process_poem(newyork['content'].iloc[i])
newyork_labels = [0 for i in range(len(newyork_processed))]

shake_processed = []
for i in range(len(shake)) :
   shake_processed += process_poem(shake['content'].iloc[i])
shake_labels = [1 for i in range(len(shake_processed))]

processed_poems = newyork_processed + shake_processed
labels = newyork_labels + shake_labels

##

print(f"Number of New Yorker sentences: {len(newyork_processed)} with avg length of {np.mean([len(sentence) for sentence in newyork_processed])} characters")
print(f"eg:")
for i in range(10) :
   print(f"   {newyork_processed[i]}")
print(f"\nNumber of Shakespearean sentences: {len(shake_processed)} with avg length of {np.mean([len(sentence) for sentence in shake_processed])} characters")
print(f"eg:")
for i in range(10) :
   print(f"   {shake_processed[i]}")

Number of New Yorker sentences: 1331 with avg length of 100.68219383921863 characters
eg:
    Is anything central?
   Orchards flung out on the land, Urban forests, rustic plantations, knee-high hills?
   Are place names central?
   Elm Grove, Adcock Corner, Story Book Farm?
   As they concur with a rush at eye level Beating themselves into eyes which have had enough Thank you, no more thank you.
   And they come on like scenery mingled with darkness The damp plains, overgrown suburbs, Places of known civic pride, of civil obscurity.
   These are connected to my version of America But the juice is elsewhere.
   This morning as I walked out of your room After breakfast crosshatched with Backward and forward glances, backward into light, Forward into unfamiliar light, Was it our doing, and was it The material, the lumber of life, or of lives We were measuring, counting?
   A mood soon to be forgotten In crossed girders of light, cool downtown shadow In this morning that has seized us aga

In [5]:
sentence_lengths = [len(poem) for poem in processed_poems]
max_length = max(sentence_lengths)
avg_length = np.mean(sentence_lengths)
print(f"Max Length = {max_length}\nAvg Length = {avg_length}")

Max Length = 2108
Avg Length = 95.30429732868757


In [6]:
perm = np.random.permutation(len(processed_poems))
shuffled_poems = np.array(processed_poems)[perm]
shuffled_labels = np.array(labels)[perm]

training_data = shuffled_poems[:-100]
training_labels = shuffled_labels[:-100]

validation_data = shuffled_poems[-100:]
validation_labels = shuffled_labels[-100:]

## Iterative Back Translation

In [7]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Flatten
from transformers import AutoTokenizer, TFAutoModelForCausalLM, TFAutoModelForSequenceClassification, TFT5ForConditionalGeneration, pipeline, set_seed, AutoModelForSeq2SeqLM, GPT2ForSequenceClassification


In [8]:
gpus = tf.config.experimental.list_physical_devices('GPU')
print(f"GPUS: {gpus}")
if gpus:
    try:
        for gpu in gpus:
            print(f"Setting Memory Growth limits for {gpu}")
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

GPUS: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Setting Memory Growth limits for PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


First initialize the two models (S --> NY) and (NY --> S) to pretrained language models.

In [9]:
# load initial translator models
tokenizer_trans = AutoTokenizer.from_pretrained('/content/drive/MyDrive/cs230projmodels/gemini_small_tokenizer')
tokenizer_trans.pad_token = tokenizer_trans.eos_token
model_s_to_ny = TFT5ForConditionalGeneration.from_pretrained('/content/drive/MyDrive/cs230projmodels/fine_tuned_gemini_small')
model_s_to_ny.config.pad_token_id = model_s_to_ny.config.eos_token_id
model_ny_to_s = TFT5ForConditionalGeneration.from_pretrained('/content/drive/MyDrive/cs230projmodels/fine_tuned_gemini_small')
model_ny_to_s.config.pad_token_id = model_ny_to_s.config.eos_token_id

# load discriminator model
model_disc = TFAutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/cs230projmodels/gpt2_discriminator')
model_disc.config.pad_token_id = model_disc.config.eos_token_id
tokenizer_disc = AutoTokenizer.from_pretrained('/content/drive/MyDrive/cs230projmodels/gpt2_discriminator_tokenizer')
tokenizer_disc.pad_token = tokenizer_disc.eos_token

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
All PyTorch model weights were used when initializing TFGPT2ForSequenceClassification.

All the weights of TFGPT2ForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2ForSequenceClassificatio

In [10]:
print("=================\nS to NY model summary:")
print(model_s_to_ny.summary())
print("\n\n=================\nNY to S model summary:")
print(model_ny_to_s.summary())
print("\n\n=================\nDiscriminator model summary:")
print(model_disc.summary())

S to NY model summary:
Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  24674304  
                                                                 
 encoder (TFT5MainLayer)     multiple                  109628544 
                                                                 
 decoder (TFT5MainLayer)     multiple                  137949312 
                                                                 
Total params: 222903552 (850.31 MB)
Trainable params: 222903552 (850.31 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


NY to S model summary:
Model: "tft5_for_conditional_generation_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)      

Test out initial models on some examples

In [11]:
# testpipe = pipeline('text2text-generation', model=model_ny_to_s, tokenizer=tokenizer_trans, device=0)
# prompt = ["Shall I compare thee to a summer’s day? Thou art more lovely and more", "This is another sentence, my dearest machine."]
# out = [model_output['generated_text'] for model_output in testpipe(prompt)]
# print(out)

In [12]:
# prompts = ["Shall I compare thee to a summer’s day? Thou art more lovely and more", "I look at you and I would rather look at you than all the portraits in the world"]
# inputs = tokenizer_disc(prompts, return_tensors='tf', padding=True, truncation=True)
# logits = model_disc(**inputs).logits
# print(logits)
# predicted_class_id = tf.math.argmax(logits, axis=-1)
# print(predicted_class_id)

Now, define optimizers for the translators

In [13]:
trans_init_lr = 0.001
trans_decay_steps = 1000
trans_decay_alpha = 0.0

# trans_lr_schedule = tf.keras.optimizers.schedules.CosineDecay(trans_init_lr, trans_decay_steps, alpha=trans_decay_alpha)
optimizer_s_to_ny = tf.keras.optimizers.Adam(learning_rate=trans_init_lr) #trans_lr_schedule)
optimizer_ny_to_s = tf.keras.optimizers.Adam(learning_rate=trans_init_lr)

Now, define the training loop of IBT including an adversarial discriminator loss

In [14]:
# define model pipelines
internal_pipe_s_to_ny = pipeline("text2text-generation", model=model_s_to_ny, tokenizer=tokenizer_trans, device=0)
def pipe_s_to_ny(prompts) :
    return [out['generated_text'] for out in internal_pipe_s_to_ny(prompts)]
def pipe_s_to_ny_tensor(prompts) :
    inputs = [s.decode('utf-8') for s in prompts.numpy().tolist()]
    return [out['generated_text'] for out in internal_pipe_s_to_ny(inputs)]

internal_pipe_ny_to_s = pipeline("text2text-generation", model=model_ny_to_s, tokenizer=tokenizer_trans, device=0)
def pipe_ny_to_s(prompts) :
    return [out['generated_text'] for out in internal_pipe_ny_to_s(prompts)]
def pipe_ny_to_s_tensor(prompts) :
    inputs = [s.decode('utf-8') for s in prompts.numpy().tolist()]
    return [out['generated_text'] for out in internal_pipe_ny_to_s(inputs)]

# custom classification pipeline
def pipe_disc(prompts, from_logits=False) :
    inputs = tokenizer_disc(prompts, return_tensors='tf', padding=True, truncation=True)
    logits = model_disc(**inputs).logits
    logits = logits[:, 1]  # Extract logits for expected class
    if from_logits:
        return logits
    else:
        probs = tf.nn.sigmoid(logits)
        return probs

In [15]:
## define function that will construct a mini dataset for the translators to train on
def construct_parallel_corpus_tf(inputs, targets) :
    model_inputs = tokenizer_trans(
        inputs,
        max_length=256,
        truncation=True,
        padding='max_length',
        return_tensors='tf'
    )
    with tokenizer_trans.as_target_tokenizer() :
        model_targets = tokenizer_trans(
            targets,
            max_length=256,
            truncation=True,
            padding='max_length',
            return_tensors='tf'
        )
    ins = model_inputs['input_ids']
    att = model_inputs['attention_mask']
    lab = model_targets['input_ids']
    lab = tf.where(lab == tokenizer_trans.pad_token_id, -100, lab)
    return ins, att, lab

In [16]:
# @tf.function
# @tf.custom_gradient
bce = tf.keras.losses.BinaryCrossentropy(from_logits=True)
def train_step(x_s_to_ny, x_ny_to_s, train_weight=1, adv_weight=1, verbose=False) :
    print(f"\n\nStarting Training Step...\n")
    with tf.GradientTape() as tape_s_to_ny, tf.GradientTape() as tape_ny_to_s :
        tape_s_to_ny.watch(model_s_to_ny.trainable_variables)
        tape_ny_to_s.watch(model_ny_to_s.trainable_variables)
        # forward pass
        # s to ny forward
        # if verbose : print(f"\nStarting Shakespeare to New Yorker forward pass")
        # if verbose : print(f"    input size: {len(x_s_to_ny)} type: {type(x_s_to_ny)}")
        if verbose : print("sny inputs:", *x_s_to_ny, sep="\n    ")
        x_s_to_ny_out = pipe_s_to_ny(x_s_to_ny)
        if verbose : print("sny outputs:", *x_s_to_ny_out, sep="\n    ")
        x_s_to_ny_out_disc = pipe_disc(x_s_to_ny_out, from_logits=True)
        if verbose : print("sny discriminator predictions:", *x_s_to_ny_out_disc, sep="\n    ")
        # ny to s forward
        # if verbose : print(f"Starting New Yorker to Shakespeare forward pass")
        # if verbose : print(f"    input size: {len(x_ny_to_s)} type: {type(x_ny_to_s)}")
        if verbose : print("nys inputs:", *x_ny_to_s, sep="\n    ")
        x_ny_to_s_out = pipe_ny_to_s(x_ny_to_s)
        if verbose : print("nys outputs:", *x_ny_to_s_out, sep="\n    ")
        x_ny_to_s_out_disc = pipe_disc(x_ny_to_s_out, from_logits=True)
        if verbose : print("nys discriminator predictions:", *x_ny_to_s_out_disc, sep="\n    ")

        # compute pseudo parallel corpora to train on
        # if verbose : print(f"Constructing Parallel Corpus for Shakespeare to NY")
        inputs_ids_s_to_ny, attention_mask_s_to_ny, labels_s_to_ny = construct_parallel_corpus_tf(x_ny_to_s_out, x_ny_to_s)
        # if verbose : print(f"Constructing Parallel Corpus for NY to Shakespeare")
        inputs_ids_ny_to_s, attention_mask_ny_to_s, labels_ny_to_s = construct_parallel_corpus_tf(x_s_to_ny_out, x_s_to_ny)

        # train on pseudo parallel corpus data
        # if verbose : print(f"Training S to NY model on pseudo parallel corpus")
        trainout_s_to_ny = model_s_to_ny(
            input_ids = inputs_ids_s_to_ny,
            attention_mask = attention_mask_s_to_ny,
            labels = labels_s_to_ny,
            training=True
        )
        # if verbose : print(f"Training NY to S model on pseudo parallel corpus")
        trainout_ny_to_s = model_ny_to_s(
            input_ids = inputs_ids_ny_to_s,
            attention_mask = attention_mask_ny_to_s,
            labels = labels_ny_to_s,
            training=True
        )

        # losses
        # if verbose : print(f"Starting loss calculations for both models training phases")
        loss_s_to_ny_train = trainout_s_to_ny.loss
        loss_ny_to_s_train = trainout_ny_to_s.loss
        # if verbose : print(f"Starting additional discriminator loss calculations")
        loss_s_to_ny_adv = bce(tf.zeros_like(x_s_to_ny_out_disc), x_s_to_ny_out_disc)
        loss_ny_to_s_adv = bce(tf.ones_like(x_ny_to_s_out_disc), x_ny_to_s_out_disc)
        # total losses
        loss_s_to_ny = train_weight * loss_s_to_ny_train + adv_weight * loss_s_to_ny_adv
        loss_ny_to_s = train_weight * loss_ny_to_s_train + adv_weight * loss_ny_to_s_adv
        if verbose : print(f"Calculated S to NY loss as {tf.round(loss_s_to_ny, 2)} (train loss = {tf.round(loss_s_to_ny_train, 2)}, adv loss = {tf.round(loss_s_to_ny_adv, 2)})")
        if verbose : print(f"Calculated NY to S loss as {tf.round(loss_ny_to_s, 2)} (train loss = {tf.round(loss_ny_to_s_train, 2)}, adv loss = {tf.round(loss_ny_to_s_adv, 2)})")

    # gradients
    # if verbose : print(f"Starting to apply gradients and update parameters for the translator models")
    gradients_s_to_ny = tape_s_to_ny.gradient(loss_s_to_ny, model_s_to_ny.trainable_variables)
    optimizer_s_to_ny.apply_gradients(zip(gradients_s_to_ny, model_s_to_ny.trainable_variables))

    gradients_ny_to_s = tape_ny_to_s.gradient(loss_ny_to_s, model_ny_to_s.trainable_variables)
    optimizer_ny_to_s.apply_gradients(zip(gradients_ny_to_s, model_ny_to_s.trainable_variables))

    if verbose : print(f"Finished updating the translator models")

    return loss_s_to_ny, loss_ny_to_s

### Now, train over some batches for a few epochs

In [17]:
batch_size = 2
raw_x_s_to_ny = shake_processed #[training_data[i] for i in range(len(training_labels)) if labels[i] == 1]
sny_cardinality = len(raw_x_s_to_ny)
raw_x_s_to_ny = raw_x_s_to_ny + raw_x_s_to_ny[:batch_size]
raw_x_ny_to_s = newyork_processed #[training_data[i] for i in range(len(training_labels)) if labels[i] == 0]
nys_cardinality = len(raw_x_ny_to_s)
raw_x_ny_to_s = raw_x_ny_to_s + raw_x_ny_to_s[:batch_size]


def get_training_batch(batch_num, batch_size=8) :
    curr_sny_idx = (batch_num * batch_size) % sny_cardinality
    curr_nys_idx = (batch_num * batch_size) % nys_cardinality
    x_s_to_ny = raw_x_s_to_ny[curr_sny_idx:curr_sny_idx+batch_size]
    x_ny_to_s = raw_x_ny_to_s[curr_nys_idx:curr_nys_idx+batch_size]
    return x_s_to_ny, x_ny_to_s

In [18]:
num_batches_per_epoch = max(nys_cardinality//batch_size, sny_cardinality//batch_size)
print(num_batches_per_epoch)

665


In [19]:
for i in range(5) :
  s, ny = get_training_batch(i, batch_size=batch_size)
  print(f"s @ {i}", *s, sep="\n   ")
  print(f"ny @ {i}", *ny, sep="\n   ")

s @ 0
    Let the bird of loudest lay On the sole Arabian tree Herald sad and trumpet be, To whose sound chaste wings obey.
   But thou shrieking harbinger, Foul precurrer of the fiend, Augur of the fever's end, To this troop come thou not near.
ny @ 0
    Is anything central?
   Orchards flung out on the land, Urban forests, rustic plantations, knee-high hills?
s @ 1
   From this session interdict Every fowl of tyrant wing, Save the eagle, feather'd king;
   Keep the obsequy so strict.
ny @ 1
   Are place names central?
   Elm Grove, Adcock Corner, Story Book Farm?
s @ 2
   Let the priest in surplice white, That defunctive music can, Be the death-divining swan, Lest the requiem lack his right.
   And thou treble-dated crow, That thy sable gender mak'st With the breath thou giv'st and tak'st, 'Mongst our mourners shalt thou go.
ny @ 2
   As they concur with a rush at eye level Beating themselves into eyes which have had enough Thank you, no more thank you.
   And they come on like scen

In [20]:
num_epochs = 1
train_weight = 0
adv_weight = 1
##
tf.config.optimizer.set_jit(True)
for epoch in range(num_epochs) : # num epochs
    print(f"Epoch {epoch + 1} starting...")
    epoch_loss_sny = 0
    epoch_loss_nys = 0
    for batch_num in range(num_batches_per_epoch) :
        x_s_to_ny, x_ny_to_s = get_training_batch(batch_num, batch_size=batch_size)
        (loss_sny, loss_nys) = train_step(x_s_to_ny, x_ny_to_s, train_weight=train_weight, adv_weight=adv_weight, verbose=True)
        epoch_loss_sny += loss_sny
        epoch_loss_nys += loss_nys
        print(f"   Batch S to NY Loss: {loss_sny}\n   Batch NY to S Loss: {loss_nys}")
    print(f"Total Epoch S to NY Loss: {epoch_loss_sny}\nTotal Epoch NY to S Loss: {epoch_loss_nys}\n")

Epoch 1 starting...


Starting Training Step...

sny inputs:
     Let the bird of loudest lay On the sole Arabian tree Herald sad and trumpet be, To whose sound chaste wings obey.
    But thou shrieking harbinger, Foul precurrer of the fiend, Augur of the fever's end, To this troop come thou not near.




sny outputs:
    Let the bird of loudest lay On the sole Arabian tree Herald sad and trumpet be,
    But thou shrieking harbinger, Foul precurrer
sny discriminator predictions:
    tf.Tensor(7.8929796, shape=(), dtype=float32)
    tf.Tensor(6.758224, shape=(), dtype=float32)
nys inputs:
     Is anything central?
    Orchards flung out on the land, Urban forests, rustic plantations, knee-high hills?
nys outputs:
    Is anything central?
    Orchards flung out on the land, Urban forests, rustic plantations,
nys discriminator predictions:
    tf.Tensor(-8.100761, shape=(), dtype=float32)
    tf.Tensor(-7.083472, shape=(), dtype=float32)




Calculated S to NY loss as [7.] (train loss = [2.], adv loss = 7.0)
Calculated NY to S loss as [8.] (train loss = [4.], adv loss = 8.0)
Finished updating the translator models
   Batch S to NY Loss: [7.326369]
   Batch NY to S Loss: [7.5926876]


Starting Training Step...

sny inputs:
    From this session interdict Every fowl of tyrant wing, Save the eagle, feather'd king;
    Keep the obsequy so strict.
sny outputs:
    From this session interdict Every fowl of tyrant wing, Save the
    Keep the obsequy so strict.
sny discriminator predictions:
    tf.Tensor(2.7242296, shape=(), dtype=float32)
    tf.Tensor(-1.6606796, shape=(), dtype=float32)
nys inputs:
    Are place names central?
    Elm Grove, Adcock Corner, Story Book Farm?
nys outputs:
    Are place names central?
    Elm Grove, Adcock Corner, Story Book Farm?
nys discriminator predictions:
    tf.Tensor(-8.901896, shape=(), dtype=float32)
    tf.Tensor(-5.4961195, shape=(), dtype=float32)
Calculated S to NY loss as [1.] (trai

KeyboardInterrupt: 

In [None]:
s, ny = get_training_batch(0, batch_size=4)
sout = pipe_s_to_ny(s)
nyout = pipe_ny_to_s(ny)
spred = pipe_disc(sout)
nypred = pipe_disc(nyout)
sdisc = pipe_disc(sout, from_logits=True)
nydisc = pipe_disc(nyout, from_logits=True)
print(spred, nypred)
print(sdisc, nydisc)

In [None]:
bce = tf.keras.losses.BinaryCrossentropy(from_logits=True)
sloss = bce(tf.zeros_like(sdisc), sdisc)
nyloss = bce(tf.ones_like(nydisc), nydisc)
print(sloss, nyloss)