<a href="https://colab.research.google.com/github/bystrowska/idiom-paraphrasing/blob/main/idiom_paraphrasing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set-up

Clone the git repository and install git-lfs to be able to download model files.

In [None]:
! sudo apt-get install git-lfs

In [None]:
! git clone https://github.com/bystrowska/idiom-paraphrasing.git

In [None]:
%cd idiom-paraphrasing
! git lfs install
! git lfs pull

# Inference

This section uses the pre-trained T5-small model saved in the GitHub repository to generate a paraphrase of a sample sentence.

In [None]:
!pip install datasets transformers

In [None]:
# Checkpoint name: selects the model directory under ./models/ and the tokenizer to load.
checkpoint = "t5-small"
# Sample input; it starts with the same "paraphrase: " prefix that preprocessing prepends.
sentence = "paraphrase: There is Peter with a tray of food, it is a sight for sore eyes!"

In [None]:
from transformers import AutoTokenizer, TFT5ForConditionalGeneration

# Weights are read from the repository's local models/ directory; the
# tokenizer is looked up by checkpoint name.
model = TFT5ForConditionalGeneration.from_pretrained(f"./models/{checkpoint}")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Encode the prompt, let the model generate, then print the decoded text.
input_ids = tokenizer(sentence, return_tensors="tf")["input_ids"]
outputs = model.generate(input_ids)
paraphrase = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(paraphrase)

# Dataset processing and tokenizing

This section loads the PIE dataset from a csv file, creates a train/test/validate split and tokenizes it.

In [None]:
# Checkpoint whose tokenizer is used to tokenize the dataset in this section.
checkpoint = "t5-small"

In [None]:
! pip install datasets transformers

In [None]:
from datasets import load_dataset

# The csv file is loaded into a single "train" split, which is divided up below.
dataset = load_dataset("csv", data_files="data.csv")

Create a 60:20:20 train/test/validate split

In [None]:
# Hold out 20% of all rows as the test split.
split_dataset = dataset["train"].train_test_split(test_size=0.2)
# A quarter of the remaining 80% is 20% of the whole, leaving 60% for training.
train_validate = split_dataset["train"].train_test_split(test_size=0.25)
split_dataset["train"], split_dataset["validate"] = (
    train_validate["train"],
    train_validate["test"],
)

## Tokenizing

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the selected checkpoint; preprocess_function reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


In [None]:
# Task prefix prepended to every idiomatic source sentence.
prefix = "paraphrase: "


def preprocess_function(examples):
    """Tokenize a batch of idiomatic/literal sentence pairs.

    Args:
        examples: batch mapping with the columns "Idiomatic_Sent" (sources)
            and "Literal_Sent" (targets).

    Returns:
        The tokenizer output for the prefixed sources, with the token ids of
        the targets stored under "labels".
    """
    prefixed_sources = [prefix + sentence for sentence in examples["Idiomatic_Sent"]]
    model_inputs = tokenizer(prefixed_sources)

    # Targets are tokenized without the prefix; only their ids are kept.
    target_encoding = tokenizer(examples["Literal_Sent"])
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [None]:
# Drop every original csv column so only the tokenizer outputs are kept.
original_columns = split_dataset["train"].column_names
tokenized_datasets = split_dataset.map(
    preprocess_function, batched=True, remove_columns=original_columns
)

In [None]:
# Persist the tokenized splits; the Training section reloads this directory with load_from_disk.
tokenized_datasets.save_to_disk("tokenized_dataset")

# Training

This section prepares the tokenized dataset and then uses it to train the model with the AdaFactor optimizer. Afterwards it computes the evaluation metrics.

In [None]:
checkpoint = "t5-small" # t5-small, t5-base or t5-large
batch_size = 128  # batch size of the train/validate/test tf datasets
num_epochs = 20  # number of epochs passed to model.fit

In [None]:
!pip install -U nltk

In [None]:
!pip install datasets transformers rouge_score sacrebleu sentencepiece

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the selected checkpoint; used by the data collator and to decode in compute_metrics().
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
from datasets import load_from_disk

# Reload the splits written by save_to_disk in the dataset processing section.
tokenized_dataset = load_from_disk("tokenized_dataset")

In [None]:
from transformers import TFT5ForConditionalGeneration

# Load the TensorFlow T5 model for the selected checkpoint name.
model = TFT5ForConditionalGeneration.from_pretrained(checkpoint)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator passed as collate_fn when the tf datasets are built; returns TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf") 


In [None]:
def _build_tf_dataset(split_name, shuffle):
    """Convert one tokenized split into a batched tf dataset using the shared collator."""
    return tokenized_dataset[split_name].to_tf_dataset(
        columns=["input_ids", "attention_mask", "labels"],
        collate_fn=data_collator,
        shuffle=shuffle,
        batch_size=batch_size,
    )


# Only the training split is shuffled.
tf_train_dataset = _build_tf_dataset("train", shuffle=True)
tf_validate_dataset = _build_tf_dataset("validate", shuffle=False)
tf_test_dataset = _build_tf_dataset("test", shuffle=False)


In [None]:
from datasets import load_metric

# Evaluation metrics; bleu, rouge, meteor and sari are read as globals by compute_metrics().
bleu = load_metric("sacrebleu")
rouge = load_metric("rouge")
meteor = load_metric("meteor")
sari = load_metric("sari")
# NOTE(review): perplexity is loaded here but not referenced by compute_metrics() in this notebook.
perplexity = load_metric("perplexity")


In [None]:
import numpy as np

def compute_metrics():
    """Generate paraphrases for the test split and score them.

    Reads the notebook globals ``model``, ``tokenizer``, ``tf_test_dataset``,
    ``bleu``, ``rouge``, ``meteor`` and ``sari``. Each score is printed in
    the order BLEU, ROUGE, METEOR, SARI.

    Returns:
        dict: the printed scores under the keys "bleu", "rouge", "meteor"
        and "sari" ("rouge" is itself a dict of F-measures in percent).
    """
    # Same task prefix that preprocessing prepends to every source sentence.
    task_prefix = "paraphrase: "

    sources = []
    predictions = []
    references = []
    for batch in tf_test_dataset:
        generated = model.generate(
            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]
        )
        predictions.extend(
            text.strip()
            for text in tokenizer.batch_decode(generated, skip_special_tokens=True)
        )

        # Label positions equal to -100 cannot be decoded; map them to the
        # pad token id, which skip_special_tokens then removes.
        label_ids = batch["labels"].numpy()
        label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
        references.extend(
            text.strip()
            for text in tokenizer.batch_decode(label_ids, skip_special_tokens=True)
        )

        # SARI needs the original source sentence, i.e. the input without
        # the task prefix.
        for text in tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True):
            text = text.strip()
            if text.startswith(task_prefix):
                text = text[len(task_prefix):]
            sources.append(text)

    # sacreBLEU and SARI take a list of references per prediction, whereas the
    # ROUGE and METEOR metric scripts take one reference string per prediction.
    nested_references = [[reference] for reference in references]

    results_bleu = bleu.compute(
        predictions=predictions, references=nested_references
    )["score"]
    print(results_bleu)

    rouge_scores = rouge.compute(predictions=predictions, references=references)
    results_rouge = {
        key: value.mid.fmeasure * 100 for key, value in rouge_scores.items()
    }
    print(results_rouge)

    results_meteor = meteor.compute(
        predictions=predictions, references=references
    )["meteor"]
    print(results_meteor)

    results_sari = sari.compute(
        sources=sources, predictions=predictions, references=nested_references
    )["sari"]
    print(results_sari)

    return {
        "bleu": results_bleu,
        "rouge": results_rouge,
        "meteor": results_meteor,
        "sari": results_sari,
    }

### AdaFactor

In [None]:
!pip install tensor2tensor

In [None]:
from transformers import create_optimizer
import tensorflow as tf
from tensor2tensor.utils.adafactor import AdafactorOptimizer

# Adafactor hyper-parameters, collected in one place and passed as keywords.
adafactor_kwargs = {
    "multiply_by_parameter_scale": False,
    "learning_rate": 0.001,
    "decay_rate": None,
    "beta1": 0.0,
    "clipping_threshold": 1.0,
    "factored": True,
    "simulated_quantize_bits": None,
    "parameter_encoding": None,
    "use_locking": False,
    "epsilon1": 1e-30,
    "epsilon2": 1e-3,
}
optimizer = AdafactorOptimizer(**adafactor_kwargs)

# Total number of optimizer steps; not consumed by the Adafactor set-up in this cell.
num_train_steps = len(tf_train_dataset) * num_epochs

model.compile(optimizer=optimizer)

### AdamW

In [None]:
# from transformers import create_optimizer
# import tensorflow as tf

# num_train_steps = len(tf_train_dataset) * num_epochs

# optimizer, schedule = create_optimizer(
#     init_lr=3e-4,
#     num_warmup_steps=0,
#     num_train_steps=num_train_steps,
#     weight_decay_rate=0.01,
# )
# model.compile(optimizer=optimizer)

### Training loop

In [None]:
# Train the compiled model for num_epochs, passing the validation split as validation_data.
history = model.fit(tf_train_dataset,
                    validation_data=tf_validate_dataset,
                    epochs=num_epochs)

In [None]:
# Show the fit parameters, the recorded metric names, then each metric's values.
print(history.params)
print(history.history.keys())
for metric_name, values in history.history.items():
    print(f"{metric_name}: {values}")

In [None]:
# Score the trained model on the test split.
compute_metrics()