# Parameters

In [1]:
# Global random seed; applied to torch, transformers and numpy further down
SEED = 1234

# Locations of the train, dev, and test files.
# The gold file is used as the test set: it is the same as the .test file, but with
# the expected output labels attached, in the same format as train and dev.
TRAIN = 'data/deu_600.train'
DEV = 'data/deu.dev'
TEST = 'data/deu.gold'

# MODEL: name of the model on the Hugging Face Hub, or the location of a model on your local device
# OUTPUTNAME: may be left empty; set it to add a custom name to the file name of the saved model
# PREFIX: task prefix put in front of every model input ('<PREFIX>: <lemma> <features>')
# GEN_MODEL_OVERRIDE: set to a saved model to load that one for generation; None uses the model saved by this run
MODEL = 'google/byt5-small'
OUTPUTNAME = ''
PREFIX = 'morph'
GEN_MODEL_OVERRIDE = None

# Training hyperparameters: optimizer learning rate, number of epochs,
# and batch size (BATCHES is the number of examples per batch, not the number of batches)
LEARNRATE = 4e-5
EPOCH = 10
BATCHES = 16

# Code

## Imports, time, and random seed

In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, get_scheduler, T5ForConditionalGeneration, set_seed, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
from torch.optim import AdamW
from pathlib import Path
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Start-of-run timestamp (YYYY-MM-DD_HH-MM-SS); used in the file name of the saved model
TIMESTRING = f'{datetime.now():%Y-%m-%d_%H-%M-%S}'

In [4]:
# Seed every library that draws random numbers with the same global SEED
for seed_fn in (torch.manual_seed, set_seed, np.random.seed):
    seed_fn(SEED)

## Data

In [5]:
header_names = ['lemma', 'labels', 'features']


def read_split(path):
    '''Read one tab-separated split file into a DataFrame of plain strings.

    The file has no header row; its columns are named after header_names.
    dtype=str together with keep_default_na=False keeps every field as the
    literal text found in the file. With the pandas defaults, word forms such
    as "NA", "null" or "nan" would be converted to NaN, and the later string
    concatenation / tokenization would then fail on those rows.
    '''
    return pd.read_csv(path, sep='\t', names=header_names, dtype=str, keep_default_na=False)


train = read_split(TRAIN)
dev = read_split(DEV)
test = read_split(TEST)

In [6]:
# Model input is "<PREFIX>: <lemma> <features>"; afterwards only input and labels are kept
train_inputs = PREFIX + ': ' + train['lemma'] + ' ' + train['features']
train = train.assign(input=train_inputs)[['input', 'labels']]
train.head()

Unnamed: 0,input,labels
0,morph: Plätzchen N;NOM;NEUT;PL,Plätzchen
1,morph: Kastanie N;NOM;FEM;PL,Kastanien
2,morph: Linie N;NOM;FEM;PL,Linien
3,morph: Scherz N;NOM;MASC;PL,Scherze
4,morph: Wiederholung N;NOM;FEM;PL,Wiederholungen


In [7]:
# Model input is "<PREFIX>: <lemma> <features>"; afterwards only input and labels are kept
dev_inputs = PREFIX + ': ' + dev['lemma'] + ' ' + dev['features']
dev = dev.assign(input=dev_inputs)[['input', 'labels']]
dev.head()

Unnamed: 0,input,labels
0,morph: Flitzer N;NOM;MASC;PL,Flitzer
1,morph: Brosche N;NOM;FEM;PL,Broschen
2,morph: Chinese N;NOM;MASC;PL,Chinesen
3,morph: Kloster N;NOM;NEUT;PL,Klöster
4,morph: Urlaub N;NOM;MASC;PL,Urlaube


In [8]:
# Model input is "<PREFIX>: <lemma> <features>"; afterwards only input and labels are kept
test_inputs = PREFIX + ': ' + test['lemma'] + ' ' + test['features']
test = test.assign(input=test_inputs)[['input', 'labels']]
test.head()

Unnamed: 0,input,labels
0,morph: Orgie N;NOM;FEM;PL,Orgien
1,morph: Sieger N;NOM;MASC;PL,Sieger
2,morph: Klotz N;NOM;MASC;PL,Klötze
3,morph: Kalk N;NOM;MASC;PL,Kalke
4,morph: Skelett N;NOM;NEUT;PL,Skelette


In [9]:
from datasets import Dataset

# Turn each pandas split into a datasets.Dataset (same variable names as before)
train, dev, test = (Dataset.from_pandas(frame) for frame in (train, dev, test))

In [10]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and pretrained weights both come from MODEL; the model is moved to the chosen device
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = T5ForConditionalGeneration.from_pretrained(MODEL)
model = model.to(device)

In [11]:
def tokenize_function(examples):
    '''Tokenize one batch of examples for seq2seq training.

    Args:
        examples: batch with an "input" and a "labels" entry, each a list of strings.

    Returns:
        The tokenizer output for the inputs (input_ids, attention_mask), padded
        to the longest input of the batch, plus "input_ids_label": the token ids
        of the labels, padded to the longest label of the batch.

    Padding positions in "input_ids_label" are set to -100 instead of the pad
    token id. -100 is the value the loss ignores, so the model is trained on
    the real label tokens only and not on reproducing padding.
    '''
    tokenized_dict = tokenizer(examples["input"], padding="longest")

    label_ids = tokenizer(examples["labels"], padding="longest")["input_ids"]
    pad_id = tokenizer.pad_token_id
    tokenized_dict["input_ids_label"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in label_ids
    ]

    return tokenized_dict


# batch_size=None hands each split to tokenize_function as one single batch, so
# padding="longest" pads every row of a split to the same length. With the
# library's default map batch size, a split larger than one map batch would get
# rows of different padded lengths, which the DataLoader cannot stack into a batch.
tokenized_train = train.map(tokenize_function, batched=True, batch_size=None)
tokenized_dev = dev.map(tokenize_function, batched=True, batch_size=None)

                                                               

In [12]:
# Make both tokenized splits return torch tensors when indexed
for tokenized_split in (tokenized_train, tokenized_dev):
    tokenized_split.set_format(type='torch')

## Training

In [13]:
num_epochs = EPOCH

# Batched iterators over the tokenized splits; only the training data is shuffled
train_dataloader = DataLoader(tokenized_train, batch_size=BATCHES, shuffle=True)
eval_dataloader = DataLoader(tokenized_dev, batch_size=BATCHES)

# One optimizer/scheduler step is taken per training batch
num_training_steps = num_epochs * len(train_dataloader)

optimizer = AdamW(model.parameters(), lr=LEARNRATE)

# Linear learning-rate schedule over all training steps, without warmup
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [15]:
from tqdm.auto import tqdm

# Only these entries of a batch are tensors the model needs; the batch also
# carries the raw text columns, which cannot be moved to the device.
batch_keys = ('input_ids', 'attention_mask', 'input_ids_label')

progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        input_ids, attention_mask, labels = (batch[key].to(device) for key in batch_keys)

        # Forward pass; passing labels makes the model return the training loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass, parameter update, learning-rate update, gradient reset
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update()

100%|██████████| 380/380 [29:15<00:00,  4.16s/it]

## Model storage

In [16]:
def model_serializer(model, output_name, model_name, timestring, path_folder='./models'):
    '''Save the weights (state dict) of the trained/finetuned model.

    Args:
        model: object whose state_dict() is written to disk.
        output_name: optional custom name; '' leaves it out of the file name.
        model_name: name of the base model, used in the file name.
        timestring: timestamp placed at the start of the file name.
        path_folder: folder the file is written to; created if it is missing.

    Returns:
        The path of the written file, as a string:
        '<path_folder>/<timestring>-pytorch_model[_<output_name>]_<model_name>.bin'
    '''
    Path(path_folder).mkdir(parents=True, exist_ok=True)

    # A '/' inside a name (e.g. the model id 'google/byt5-small') would point the
    # path into a sub-directory that does not exist, so it is replaced by '_'.
    safe_model_name = model_name.replace('/', '_')
    safe_output_name = output_name.replace('/', '_')

    if safe_output_name == '':
        name_suffix = safe_model_name
    else:
        name_suffix = f'{safe_output_name}_{safe_model_name}'

    path_str = f'{path_folder}/{timestring}-pytorch_model_{name_suffix}.bin'

    torch.save(model.state_dict(), path_str)
    return path_str


# A '/' in the model name cannot be part of a file name, so it is replaced by '_'
model_tag = MODEL.replace('/', '_')
tuned_model = model_serializer(model, OUTPUTNAME, model_tag, TIMESTRING)

## Model load and generation

In [48]:
# Upper bound on the number of tokens generated per word form. Without an
# explicit limit the library's short default generation length applies, which
# can cut off long forms (see the generate documentation) -- raise if needed.
GEN_MAX_NEW_TOKENS = 64

# Generate with the model saved by this run, unless another saved model is requested
if GEN_MODEL_OVERRIDE:
    tuned_model = GEN_MODEL_OVERRIDE

gen_model = T5ForConditionalGeneration.from_pretrained(tuned_model, return_dict=True, config=MODEL)
gen_model.to(device)
gen_model.eval()  # inference mode: no dropout

# test['input'] already starts with the task prefix (it is added when the input
# column is built), so it is tokenized as is. Adding PREFIX again would produce
# "morph: morph: ...", which differs from the inputs the model was trained on.
gen_inputs = tokenizer(list(test['input']), return_tensors='pt', padding=True).to(device)

# Generate with the reloaded gen_model (not the in-memory training model), so
# that the saved weights / GEN_MODEL_OVERRIDE are what is actually evaluated.
outputs = gen_model.generate(
    input_ids=gen_inputs['input_ids'],
    attention_mask=gen_inputs['attention_mask'],
    max_new_tokens=GEN_MAX_NEW_TOKENS,
    do_sample=False,  # greedy decoding: deterministic output
)

gen_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)



In [None]:
# Side-by-side view of the first 20 expected and predicted forms
preview_size = 20
gen_comparison = pd.DataFrame({
    'Expected': test['labels'][:preview_size],
    'Predicted': gen_outputs[:preview_size],
})
gen_comparison

## Evaluation

In [None]:
# Exact-match accuracy: share of test items whose prediction equals the expected form
gold_forms = test['labels']
ac_score = sum(gold == gen_outputs[position] for position, gold in enumerate(gold_forms))

print('The accuracy score is {}'.format(ac_score / len(gold_forms)))
    