In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
from torch.optim import AdamW
from evaluate import load
from seq2seq import create_transformers_train_data, train_transformer, decode_with_transformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Tab-separated parallel corpus; the columns used below are "Style 1" and "Style 2".
data = pd.read_csv(
    "../yelp_parallel/yelp_parallel/test_en_parallel.txt",
    sep="\t",
)

In [4]:
# "Style 1" is used as the source (negative) side and "Style 2" as the
# target (positive) side; both are materialised as plain Python lists.
negative, positive = (
    data[column].values.tolist() for column in ("Style 1", "Style 2")
)

In [5]:
# Metric objects are loaded once here and reused by every run_experiment call.
bleu, bertscore = map(load, ("bleu", "bertscore"))

In [6]:
def run_experiment(model_name, negative, positive, lr, epochs, batch_size=256, device=None):
    """Fine-tune a pretrained seq2seq checkpoint and score one sample prediction.

    The model is trained on the (negative -> positive) sentence pairs, then the
    first negative sentence is decoded and compared against the first positive
    sentence with BLEU and BERTScore.

    Relies on the module-level ``bleu`` and ``bertscore`` metric objects.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        negative: List of source sentences.
        positive: List of target sentences, aligned with ``negative``.
        lr: Learning rate for AdamW.
        epochs: Number of training epochs.
        batch_size: Batch size for the training DataLoader.
        device: Device to train on. When ``None``, CUDA is used if available,
            otherwise the CPU.

    Returns:
        dict with keys ``"prediction"``, ``"reference"``, ``"bleu"`` and
        ``"bertscore"``.

    NOTE(review): the evaluation uses a single sentence that is also part of
    the training data, so the scores are only a sanity check.
    """
    # Callers that omit `device` would otherwise leave the model wherever
    # from_pretrained put it, ignoring an available GPU.
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print(f"Model: {model_name}, Learning rate: {lr}, Epochs: {epochs}")
    print(f"Learning rate: {lr}")
    print(f"Epochs: {epochs}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

    train_dataset = create_transformers_train_data(negative, positive, tokenizer)

    # The collator pads inputs and labels per batch.
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)

    optimizer = AdamW(model.parameters(), lr=lr)

    train_transformer(model, train_loader, optimizer, epochs, device=device)

    # Make sure dropout is disabled for decoding; harmless if the helper
    # already switches the model to eval mode.
    model.eval()
    predicted_sentence = decode_with_transformer(negative[0], tokenizer, model)
    reference_sentence = positive[0]

    print("\nPrediction:")
    print(predicted_sentence)
    print("\nReference:")
    print(reference_sentence)

    # Metrics
    bleu_score = bleu.compute(predictions=[predicted_sentence],
                              references=[reference_sentence])

    bert_score = bertscore.compute(
        predictions=[predicted_sentence],
        references=[reference_sentence],
        # Using lang="en" because the variant from the lab exercise,
        # model_type='microsoft/deberta-xlarge-mnli', took longer to run.
        lang="en"
    )

    print("\nBLEU:")
    print(bleu_score)

    print("\nBERTScore:")
    print(bert_score)

    return {
        "prediction": predicted_sentence,
        "reference": reference_sentence,
        "bleu": bleu_score,
        "bertscore": bert_score
    }


In [7]:
# Accumulator for the dict returned by each run_experiment call.
results = list()

In [None]:
# (model checkpoint, learning rate, epochs) for each run, executed in order.
experiment_configs = [
    ("t5-small", 0.001, 3),
    ("t5-small", 0.0001, 10),
    ("google/flan-t5-small", 0.0001, 5),
]

for model_name, lr, epochs in experiment_configs:
    # Pass the device selected earlier explicitly; run_experiment's own
    # default is device=None. Appending per run keeps finished results if a
    # later run is interrupted.
    results.append(
        run_experiment(model_name, negative, positive, lr=lr, epochs=epochs, device=device)
    )

Model: t5-small, Learning rate: 0.001, Epochs: 3
Learning rate: 0.001
Epochs: 3




Epoch 1/3, Loss: 3.0141
Epoch 2/3, Loss: 2.3884
Epoch 3/3, Loss: 2.1708

Prediction:
ever since joes has changed hands

Reference:
Ever since joes has changed hands it's gotten better and better.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



BLEU:
{'bleu': 0.27952792741962756, 'precisions': [0.8333333333333334, 0.8, 0.75, 0.6666666666666666], 'brevity_penalty': 0.36787944117144233, 'length_ratio': 0.5, 'translation_length': 6, 'reference_length': 12}

BERTScore:
{'precision': [0.9461432695388794], 'recall': [0.90846848487854], 'f1': [0.9269232153892517], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.57.3)'}
Model: t5-small, Learning rate: 0.0001, Epochs: 10
Learning rate: 0.0001
Epochs: 10
Epoch 1/10, Loss: 3.4837


In [None]:
# Dump the collected result dicts (prediction, reference, BLEU, BERTScore) for every finished run.
print(results)