In [1]:
# ! pip install transformers datasets
# ! pip install transformers datasets evaluate bleu

In [2]:
import re
import string
import pandas
import evaluate
import unicodedata
import numpy as np
from io import open
from datasets import load_dataset, Dataset
from huggingface_hub import notebook_login
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline

In [3]:
checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
source_lang = "en"
# target_lang = "fr"
target_lang = "si"
prefix = "translate English to Sindarin: "
metric = evaluate.load("sacrebleu")
# metric = evaluate.load("bleu")

# notebook_login()
# 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

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=64, truncation=True)
    return model_inputs

In [5]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

In [6]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [7]:
data = pandas.read_csv('data/sindarin-eng.txt' , sep='\t', lineterminator='\n')
data = data.dropna().drop_duplicates()
data = data.reset_index()
data.columns = ['id','si', 'en']

data = data.replace({r'\r': ''}, regex=True)
data = data.replace({r'[^\w\s]': ''}, regex=True)

data['si'] = data['si'].str.lower()
data['en'] = data['en'].str.lower()

data['si'] = data['si'].str.strip()
data['en'] = data['en'].str.strip()

data['si'] = data['si'].str.normalize('NFD').str.encode('ascii', errors='ignore').str.decode('utf-8')
data['en'] = data['en'].str.normalize('NFD').str.encode('ascii', errors='ignore').str.decode('utf-8')

data['translation'] = data[['si', 'en']].apply(dict, axis=1)
data.drop(['si', 'en'], axis=1, inplace=True)

# print(data)

In [8]:
# sind = load_dataset('text', data_files={'train': 'data/sindarin-eng.txt'})
# train_ds = Dataset.from_pandas(data)
# sind['train'] = train_ds
# sind = load_dataset({'train': dataset_train})
# sind = sind.remove_columns(["__index_level_0__"])
# print(sind['train'][0])

In [9]:
# books = load_dataset("opus_books", "en-fr")
# books = books["train"].train_test_split(test_size=0.2)
# print(books["train"][0])

books = load_dataset('text', data_files={'train': 'data/sindarin-eng.txt'})
train_ds = Dataset.from_pandas(data)
books['train'] = train_ds
books = books["train"].train_test_split(test_size=0.2)
print(books['train'][0])

tokenized_books = books.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_sindarin_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_books["train"],
    eval_dataset=tokenized_books["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Found cached dataset text (C:/Users/cpb5867/.cache/huggingface/datasets/text/default-a7038b24a72eb2b5/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

{'id': 576, 'translation': {'en': 'to where do you intend to go', 'si': 'na mhan nidhig mened'}}


Map:   0%|          | 0/3170 [00:00<?, ? examples/s]

Map:   0%|          | 0/793 [00:00<?, ? examples/s]

c:\Users\cpb5867\Documents\CSE-582-FINAL-PROJECT\my_awesome_sindarin_model is already a clone of https://huggingface.co/cpb5867/my_awesome_sindarin_model. Make sure you pull the latest changes with `repo.git_pull()`.


In [10]:
trainer.train()
trainer.push_to_hub()



  0%|          | 0/995 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 3.656026840209961, 'eval_bleu': 0.0612, 'eval_gen_len': 17.4414, 'eval_runtime': 23.6605, 'eval_samples_per_second': 33.516, 'eval_steps_per_second': 2.113, 'epoch': 1.0}


  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 3.4073023796081543, 'eval_bleu': 0.1168, 'eval_gen_len': 12.5788, 'eval_runtime': 23.4388, 'eval_samples_per_second': 33.833, 'eval_steps_per_second': 2.133, 'epoch': 2.0}
{'loss': 3.9239, 'learning_rate': 1.0010050251256282e-05, 'epoch': 2.51}


  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 3.257267951965332, 'eval_bleu': 0.2427, 'eval_gen_len': 12.3178, 'eval_runtime': 23.6963, 'eval_samples_per_second': 33.465, 'eval_steps_per_second': 2.11, 'epoch': 3.0}


  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 3.1766810417175293, 'eval_bleu': 0.2812, 'eval_gen_len': 12.2585, 'eval_runtime': 23.1663, 'eval_samples_per_second': 34.231, 'eval_steps_per_second': 2.158, 'epoch': 4.0}


  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 3.1459407806396484, 'eval_bleu': 0.3288, 'eval_gen_len': 12.348, 'eval_runtime': 23.109, 'eval_samples_per_second': 34.316, 'eval_steps_per_second': 2.164, 'epoch': 5.0}
{'train_runtime': 345.338, 'train_samples_per_second': 45.897, 'train_steps_per_second': 2.881, 'train_loss': 3.690807798759422, 'epoch': 5.0}


Upload file pytorch_model.bin:   0%|          | 1.00/850M [00:00<?, ?B/s]

To https://huggingface.co/cpb5867/my_awesome_sindarin_model
   be37f54..2af01af  main -> main

To https://huggingface.co/cpb5867/my_awesome_sindarin_model
   2af01af..6259b7b  main -> main



'https://huggingface.co/cpb5867/my_awesome_sindarin_model/commit/2af01af7095982405fafe292e69eb552b53da74e'

In [11]:
text = "translate English to Sindarin: well met."

translator = pipeline("translation", model="my_awesome_sindarin_model")
translator(text)

tokenizer = AutoTokenizer.from_pretrained("my_awesome_sindarin_model")
inputs = tokenizer(text, return_tensors="pt").input_ids

model = AutoModelForSeq2SeqLM.from_pretrained("my_awesome_sindarin_model")
outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
tokenizer.decode(outputs[0], skip_special_tokens=True)



'tae henn.'