In [1]:
# ! pip install transformers datasets
# ! pip install transformers datasets evaluate bleu

In [2]:
import re
import string
import pandas
import evaluate
import unicodedata
import numpy as np
from io import open
from datasets import load_dataset
from huggingface_hub import notebook_login
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline

In [3]:
checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
source_lang = "en"
target_lang = "fr"
prefix = "translate English to Sindarin: "
metric = evaluate.load("sacrebleu")
# metric = evaluate.load("bleu")

# notebook_login()
# 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

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

In [5]:
# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [6]:
def pairs_preprocess_function(pairs):
    inputs = []
    targets = []
    for pair in pairs:
        inputs.append(prefix + pair[1])
        targets.append(pair[0])
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [7]:
def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [8]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

In [9]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [10]:
lines = open('data/%s-%s.txt' % ('sindarin', 'eng'), encoding='utf-8').\
        read().strip().split('\n')

# Split every line into pairs and normalize
pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

In [48]:
books = load_dataset("opus_books", "en-fr")
print(books['train']['translation'][0])

Found cached dataset opus_books (C:/Users/cpb5867/.cache/huggingface/datasets/opus_books/en-fr/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf)


  0%|          | 0/1 [00:00<?, ?it/s]

{'en': 'The Wanderer', 'fr': 'Le grand Meaulnes'}


In [49]:
data = pandas.read_csv('data/sindarin-eng.txt' , sep='\t', lineterminator='\n')
# data = data.reset_index(drop=True)
data.columns = ['sindarin', 'english']
data.index.name = 'id'
data = data.replace({r'\r': ''}, regex=True)
data['sindarin'] = data['sindarin'].str.lower()
data['english'] = data['english'].str.lower()
#preprocess
data = data.dropna().drop_duplicates()
print(data)

                                             sindarin  \
id                                                      
0                                                 ae!   
1                                                 ae!   
2                                                 ae!   
3                                                 ae!   
4                                              galdol   
...                                               ...   
3997                 padol rath fair, padol rath fair   
3998  ithil eriol, dolthol raith erib nan genid glass   
3999                                  in elin thinnar   
4000                    i vôr danna, câr 'ardh gostad   
4001                    dartha dínen a mreithad menel   

                                                english  
id                                                       
0                                                hello!  
1                                                  hey!  
2                         

In [13]:
train_pairs = pairs[:round((len(pairs) * .9))]
test_pairs = pairs[round((len(pairs) * .9)):]
train_hold = pairs_preprocess_function(train_pairs)
test_hold = pairs_preprocess_function(test_pairs)


{'input_ids': [[13959, 1566, 12, 17542, 291, 77, 10, 21820, 3, 55, 1], [13959, 1566, 12, 17542, 291, 77, 10, 3, 13133, 3, 55, 1], [13959, 1566, 12, 17542, 291, 77, 10, 3, 32, 107, 3, 55, 1], [13959, 1566, 12, 17542, 291, 77, 10, 3, 9, 107, 3, 55, 1], [13959, 1566, 12, 17542, 291, 77, 10, 2222, 1], [13959, 1566, 12, 17542, 291, 77, 10, 207, 1], [13959, 1566, 12, 17542, 291, 77, 10, 11320, 1], [13959, 1566, 12, 17542, 291, 77, 10, 14276, 1], [13959, 1566, 12, 17542, 291, 77, 10, 10463, 1], [13959, 1566, 12, 17542, 291, 77, 10, 3, 9, 10463, 1], [13959, 1566, 12, 17542, 291, 77, 10, 168, 1736, 1], [13959, 1566, 12, 17542, 291, 77, 10, 168, 1736, 1], [13959, 1566, 12, 17542, 291, 77, 10, 2222, 1], [13959, 1566, 12, 17542, 291, 77, 10, 168, 1107, 1], [13959, 1566, 12, 17542, 291, 77, 10, 44, 336, 3, 55, 1], [13959, 1566, 12, 17542, 291, 77, 10, 73, 9803, 287, 15, 1], [13959, 1566, 12, 17542, 291, 77, 10, 8293, 1107, 1], [13959, 1566, 12, 17542, 291, 77, 10, 18660, 7, 1], [13959, 1566, 12, 17

In [None]:
class Dataset:
    def __init__(self, pairs):
        self.translation = pairs

    def __len__(self):
        return len(self.tokenized_dataset["input_ids"])

    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.tokenized_dataset.items()}

In [14]:
books = load_dataset("opus_books", "en-fr")
books = books["train"].train_test_split(test_size=0.2)
print(books["train"][0])

tokenized_books = books.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_sindarin_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_hold,
    eval_dataset=test_hold,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Found cached dataset opus_books (C:/Users/cpb5867/.cache/huggingface/datasets/opus_books/en-fr/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf)


  0%|          | 0/1 [00:00<?, ?it/s]

{'id': '4710', 'translation': {'en': '"My reasons for marrying are, first, that I think it a right thing for every clergyman in easy circumstances (like myself) to set the example of matrimony in his parish; secondly, that I am convinced that it will add very greatly to my happiness; and thirdly--which perhaps I ought to have mentioned earlier, that it is the particular advice and recommendation of the very noble lady whom I have the honour of calling patroness.', 'fr': '– Les raisons qui me déterminent a me marier, continua-t-il, sont les suivantes : premierement, je considere qu’il est du devoir de tout clergyman de donner le bon exemple a sa paroisse en fondant un foyer. Deuxiemement, je suis convaincu, ce faisant, de travailler a mon bonheur.'}}


Map:   0%|          | 0/101668 [00:00<?, ? examples/s]

Map:   0%|          | 0/25417 [00:00<?, ? examples/s]

c:\Users\cpb5867\Documents\CSE-582-FINAL-PROJECT\my_awesome_sindarin_model is already a clone of https://huggingface.co/cpb5867/my_awesome_sindarin_model. Make sure you pull the latest changes with `repo.git_pull()`.


In [15]:
trainer.train()
trainer.push_to_hub()



  0%|          | 0/2 [00:00<?, ?it/s]

AttributeError: 'tokenizers.Encoding' object has no attribute 'keys'

In [9]:
text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."

translator = pipeline("translation", model="my_awesome_opus_books_model")
translator(text)

tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model")
inputs = tokenizer(text, return_tensors="pt").input_ids

model = AutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model")
outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
tokenizer.decode(outputs[0], skip_special_tokens=True)



"Les lignées partagent les ressources des bactéries qui fixent l'azote."