In [1]:
!pip install transformers SentencePiece pandas datasets tokenizers

Collecting SentencePiece
  Downloading sentencepiece-0.1.97-cp39-cp39-macosx_10_9_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Installing collected packages: SentencePiece
Successfully installed SentencePiece-0.1.97


In [2]:
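# NOTE: disabled experiment kept for reference - trains a BPE tokenizer from scratch on the training file.
# from tokenizers import Tokenizer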
# from tokenizers.models import BPE
# from tokenizers.trainers import BpeTrainer
# from tokenizers.pre_tokenizers import Whitespace

# tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# tokenizer.pre_tokenizer = Whitespace()
# files = [r'/content/drive/MyDrive/Colab Notebooks/NLP_proj/data/train.labeled']
# tokenizer.train(files, trainer)

In [5]:
import os
import pandas as pd

def preprocess_train(filename):
  """Parse a labeled parallel-corpus file into German/English entry lists.

  The file is expected to hold entries separated by a blank line, each of the
  form::

      German:
      <one or more German lines>
      English:
      <one or more English lines>

  Args:
    filename: Path of the labeled file to read (decoded as UTF-8).

  Returns:
    A dict ``{"ger": [...], "eng": [...]}`` with one string per entry in each
    list, in file order. German strings keep the newline that precedes the
    ``English:`` marker; entries without an ``English:`` marker are skipped.
  """
  # Explicit UTF-8 so umlauts/dashes decode the same way on every platform.
  with open(filename, encoding="utf-8") as f:
    text = f.read()
  eng_entries = []
  ger_entries = []
  # Text-mode reads normalise all line endings to "\n", so a blank line is
  # always "\n\n" here - splitting on os.linesep would find nothing on Windows.
  for entry in text.split("\n\n"):
    parts = entry.split("English:\n")
    if len(parts) <= 1:
      # No English section (e.g. trailing empty chunk) - nothing to pair up.
      continue
    ger_entries.append(parts[0].replace('German:\n', ''))
    eng_entries.append(parts[1])
  return {"ger": ger_entries, "eng": eng_entries}

# Parse the training split first, then the validation split, into {"ger": [...], "eng": [...]} dicts.
train_data_dict, val_data_dict = (
    preprocess_train(split_path)
    for split_path in (r'data/train.labeled', r'data/val.labeled')
)

In [6]:
from transformers import AutoConfig
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Checkpoint identifier shared by the tokenizer, the config and the model weights.
model_name = "t5-base"
# Load the three pretrained artefacts in the same order as before: tokenizer, config, model.
tokenizer, config, model = (
    loader.from_pretrained(model_name)
    for loader in (T5Tokenizer, AutoConfig, T5ForConditionalGeneration)
)

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [7]:
from datasets import Dataset
# Wrap the parsed column dicts as datasets: training split, then validation split.
train_ds, eval_ds = (
    Dataset.from_dict(columns)
    for columns in (train_data_dict, val_data_dict)
)

In [8]:
# Sanity check: display the first training example (its 'ger' and 'eng' fields).
train_ds[0]

{'ger': 'Was ist da so falsch gelaufen?\nDie Wirtschaftskrise scheint die naheliegendste Erklärung zu sein, vielleicht zu naheliegend.\n',
 'eng': 'What has gone so wrong?\nThe economic crisis seems to be the most obvious explanation, but perhaps too obvious.'}

In [9]:
# Sanity check: display the first validation example (its 'ger' and 'eng' fields).
eval_ds[0]

{'ger': 'Und weiterreichende Kürzungen wie die von der EU vorgeschlagenen – 20 Prozent unterhalb der Werte von 1990 innerhalb von zwölf Jahren – würden die globalen Temperaturen bis 2100 lediglich um ein Sechzigstel Grad Celsius (ein Dreißigstel Grad Fahrenheit) senken, und das bei Kosten von 10 Billionen Dollar.\nFür jeden ausgegebenen Dollar hätten wir nur eine Wertschöpfung von vier Cent erreicht.\n',
 'eng': 'And deeper emissions cuts like those proposed by the European Union – 20% below 1990 levels within 12 years – would reduce global temperatures by only one-sixtieth of one degree Celsius (one-thirtieth of one degree Fahrenheit) by 2100, at a cost of $10 trillion.\nFor every dollar spent, we would do just four cents worth of good.'}

In [10]:
import torch

# the following 2 hyperparameters are task-specific:
# token-length caps applied (with truncation) to the source and target sequences
max_source_length = 512
max_target_length = 128

# tokenize the input: English sentences are the model input, German the target
# NOTE(review): no task prefix (e.g. "translate English to German: ") is prepended
# to the inputs - confirm this is intended for the t5-base checkpoint.
input_sequences = train_ds["eng"]
output_sequences = train_ds["ger"]

# pad every source sequence to the longest one in the batch, truncating at max_source_length
encoding = tokenizer(
    input_sequences,
    padding="longest",
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
)

input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

# encode the targets the same way, capped at max_target_length
target_encoding = tokenizer(
    output_sequences,
    padding="longest",
    max_length=max_target_length,
    truncation=True,
    return_tensors="pt",
)
# Work on a copy: masking in place on target_encoding.input_ids itself would
# overwrite the real pad ids inside target_encoding with -100.
labels = target_encoding.input_ids.clone()

# replace padding token id's of the labels by -100 so it's ignored by the loss
labels[labels == tokenizer.pad_token_id] = -100



In [12]:
# Inspect the label tensor; positions set to -100 are the masked padding slots.
labels

tensor([[ 2751,   229,   836,  ...,  -100,  -100,  -100],
        [28508,   521,   107,  ...,  -100,  -100,  -100],
        [ 9515, 15690,   266,  ...,  3870,   177,     1],
        ...,
        [ 2215,     6,   319,  ...,  -100,  -100,  -100],
        [  316, 31661,  5754,  ...,  -100,  -100,  -100],
        [  604,     5,   781,  ...,  -100,  -100,  -100]])

In [None]:
# forward pass: passing labels makes the model output carry a .loss
# NOTE(review): input_ids / attention_mask / labels are built from every row of
# train_ds in a single tensor, so this is one full-dataset forward pass -
# confirm it fits in memory, or slice a small batch first.
loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
# show the loss as a plain Python number
loss.item()

In [None]:
# Display the token-id tensor currently held by the target encoding.
target_encoding.input_ids