# Parameters

In [1]:
# Set global random seed
SEED = 1234

# Set the project directories for local and/or Google Colab 
dir_local = '.'
dir_colab = 'drive/MyDrive/NLP_code_notebooks/project/'

# Set the locations of the train, dev, and test files
# Gold is used for test it is the same as .test, but with the actual output labels attached in a similar format to train and dev
file_train = 'data/deu_600.train'
file_validation = 'data/deu.dev'
file_test = 'data/deu.gold'

# Give the model(name) for the Huggingface or the location of a model on your local device
# Outputname can be left blank, unless you want to specify a specific name for the currently trained model
# Prefix is the prefix used for the task that we are finetuning the model on
# GEN_MODEL_OVERRIDE can be used to load a saved model for generation

model_name = 'google/byt5-small'


# Set the important parameters for the model
learning_rate = 1e-4
num_epochs = 20
batch_size = 16

In [2]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

In [None]:
if IN_COLAB:
    print('Running on Google Colab')
    from google.colab import drive

    !pip install transformers datasets
    drive.mount('/content/drive')
    dir_project = dir_colab
else:
    print('Running locally')
    dir_project = dir_local

# Code

## Imports, time, and random seed

In [4]:
import numpy as np
import os
import pandas as pd
import torch
from datetime import datetime
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, T5ForConditionalGeneration, set_seed

In [5]:
time_string = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

In [6]:
torch.manual_seed(SEED)
set_seed(SEED)
np.random.seed(SEED)

## Data

In [7]:
header_names = ['lemma', 'labels', 'features']

df_train = pd.read_csv(os.path.join(dir_project, file_train), sep='\t', names=header_names)
df_valid = pd.read_csv(os.path.join(dir_project, file_validation), sep='\t', names=header_names)
df_test = pd.read_csv(os.path.join(dir_project, file_test), sep='\t', names=header_names)

In [8]:
df_train.head()

Unnamed: 0,lemma,labels,features
0,Plätzchen,Plätzchen,N;NOM;NEUT;PL
1,Kastanie,Kastanien,N;NOM;FEM;PL
2,Linie,Linien,N;NOM;FEM;PL
3,Scherz,Scherze,N;NOM;MASC;PL
4,Wiederholung,Wiederholungen,N;NOM;FEM;PL


In [9]:
for df_variant in (df_train, df_valid, df_test):
    df_variant["inputs"] = df_variant["lemma"] + df_variant["features"]

In [10]:
df_train.head()

Unnamed: 0,lemma,labels,features,inputs
0,Plätzchen,Plätzchen,N;NOM;NEUT;PL,PlätzchenN;NOM;NEUT;PL
1,Kastanie,Kastanien,N;NOM;FEM;PL,KastanieN;NOM;FEM;PL
2,Linie,Linien,N;NOM;FEM;PL,LinieN;NOM;FEM;PL
3,Scherz,Scherze,N;NOM;MASC;PL,ScherzN;NOM;MASC;PL
4,Wiederholung,Wiederholungen,N;NOM;FEM;PL,WiederholungN;NOM;FEM;PL


In [11]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
morph_inflection_model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [12]:
def tokenize_function(df_data, input_column_name="inputs"):
    tokenized_dict = {}
    
    inputs = tokenizer(df_data[input_column_name].to_list(), padding="longest", return_tensors="pt")
    labels = tokenizer(df_data["labels"].to_list(), padding="longest", return_tensors="pt").input_ids

    tokenized_dict["input_ids"] = inputs["input_ids"]
    tokenized_dict["attention_mask"] = inputs["attention_mask"]
    tokenized_dict["labels"] = labels

    return tokenized_dict

In [13]:
tokenized_train = tokenize_function(df_train)

In [14]:
tokenized_valid = tokenize_function(df_valid)

In [15]:
tokenized_test = tokenize_function(df_test)

In [16]:
tokenized_train

{'input_ids': tensor([[ 83, 111, 198,  ...,   0,   0,   0],
         [ 78, 100, 118,  ...,   0,   0,   0],
         [ 79, 108, 113,  ...,   0,   0,   0],
         ...,
         [ 80, 108, 119,  ...,   0,   0,   0],
         [ 86, 119, 100,  ...,   0,   0,   0],
         [ 85, 120, 103,  ...,   0,   0,   0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[ 83, 111, 198,  ...,   0,   0,   0],
         [ 78, 100, 118,  ...,   0,   0,   0],
         [ 79, 108, 113,  ...,   0,   0,   0],
         ...,
         [ 80, 108, 119,  ...,   0,   0,   0],
         [ 86, 119, 100,  ...,   0,   0,   0],
         [ 85, 120, 103,  ...,   0,   0,   0]])}

## Training

In [17]:
class MorphInflectionDataset(Dataset):
    def __init__(self, dict_data):
        self.dict_data = dict_data

    def __len__(self):
        return len(self.dict_data["labels"])

    def __getitem__(self, idx):
        dict_sample = {}
        input_ids = self.dict_data["input_ids"][idx]
        attention_mask = self.dict_data["attention_mask"][idx]
        labels = self.dict_data["labels"][idx]
        return input_ids, attention_mask, labels

In [18]:
#eval_dataloader = DataLoader(tokenized_dev, batch_size=BATCHES)

optimizer = AdamW(morph_inflection_model.parameters(), lr=learning_rate)

In [19]:
train_dataloader = DataLoader(MorphInflectionDataset(tokenized_train), shuffle=True, batch_size=batch_size, num_workers=4)



In [20]:
morph_inflection_model.train()

T5ForConditionalGeneration(
  (shared): Embedding(384, 1472)
  (encoder): T5Stack(
    (embed_tokens): Embedding(384, 1472)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1472, out_features=384, bias=False)
              (k): Linear(in_features=1472, out_features=384, bias=False)
              (v): Linear(in_features=1472, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=1472, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1472, out_features=3584, bias=False)
              (wi_1): Linear(in_features=1472, out_features=3584, bias=False)
              (w

In [21]:
num_train_batches = len(train_dataloader)

In [22]:
for epoch in range(num_epochs):
    loss_for_epoch = 0.0
    for input_ids, attention_mask, labels in train_dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()

        outputs = morph_inflection_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss_for_epoch += loss
        loss.backward()
        optimizer.step()
    loss_for_epoch /= num_train_batches
    print(f"{epoch + 1} / {num_epochs}, loss: {loss:.4f}")

1 / 20, loss: 20.1732
2 / 20, loss: 4.7346
3 / 20, loss: 4.0640
4 / 20, loss: 3.5882
5 / 20, loss: 2.3374
6 / 20, loss: 1.1447
7 / 20, loss: 0.5861
8 / 20, loss: 0.2164
9 / 20, loss: 0.3112
10 / 20, loss: 0.1335
11 / 20, loss: 0.1333
12 / 20, loss: 0.1614
13 / 20, loss: 0.1480
14 / 20, loss: 0.0688
15 / 20, loss: 0.0944
16 / 20, loss: 0.1290
17 / 20, loss: 0.1218
18 / 20, loss: 0.1193
19 / 20, loss: 0.0789
20 / 20, loss: 0.0848


## Model storage

In [23]:
model_filepath = os.path.join(dir_project , "morph_inflection_byt5_small.pth")

In [24]:
torch.save(morph_inflection_model.state_dict(), model_filepath)

## Model load and generation

In [25]:
gen_model = T5ForConditionalGeneration.from_pretrained(model_filepath, return_dict=True, config=model_name)
gen_model.to(device)

gen_inputs = tokenizer([f"{item}" for item in df_test["inputs"]], return_tensors="pt", padding=True).to(device)

outputs = gen_model.generate(
    input_ids=gen_inputs["input_ids"],
    attention_mask=gen_inputs["attention_mask"],
    # max_length=50,
    # num_beams=5,
    # no_repeat_ngram_size=2,
    # early_stopping=True,
    do_sample=False,  # disable sampling to test if batching affects output
)

gen_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)



In [26]:
df_generated_comparison = pd.DataFrame.from_dict({"Expected": df_test["labels"], "Predicted": gen_outputs})

In [27]:
df_generated_comparison.head(20)

Unnamed: 0,Expected,Predicted
0,Orgien,Orgien
1,Sieger,Sieger
2,Klötze,Klotze
3,Kalke,Kalke
4,Skelette,Skelette
5,Flocken,Flocken
6,Schwänze,Schwanze
7,Schwämme,Schwamme
8,Kegel,Kegel
9,Geckos,Geckos


In [28]:
file_csv_generated_output_comparison = os.path.join(dir_project, "generated_words.csv")

In [29]:
df_generated_comparison.to_csv(file_csv_generated_output_comparison)

## Evaluation

In [43]:
def acc_score(pred, gold, dec):
    outcomes = {'correct': [], 'incorrect': []}
    for idx, i in enumerate(pred):
        if i == gold[idx]:
            outcomes['correct'].append([idx, i])
        else:
            outcomes['incorrect'].append([idx, i])
    
    score = round(len(outcomes['correct']) / len(gold), dec)

    print('The accuracy score is {}'.format(score))
    print('\n\nThe incorrect items are:\n')
    print('idx: pred - gold\n')
    for x, y in outcomes['incorrect']:
        print(f'{x}: {y} - {gold[x]}')
    

acc_score(df_generated_comparison['Predicted'].to_list(), df_generated_comparison['Expected'].to_list(), 2)

The accuracy score is 0.76


The incorrect items are:

idx: pred - gold

2: Klotze - Klötze
6: Schwanze - Schwänze
7: Schwamme - Schwämme
14: Köchinen - Köchinnen
15: Vorsprungen - Vorsprünge
21: Flusse - Flüsse
26: Stope - Stops
31: Passagier - Passagiere
35: Arsche - Ärsche
36: Schlamme - Schlämme
49: Komponiste - Komponisten
52: Grasse - Gräser
61: Freilichtmuseume - Freilichtmuseen
63: Kranze - Kränze
69: Wracke - Wracks
77: Dorne - Dornen
80: Ohrenschmälze - Ohrenschmalze
82: Leibe - Leiber
83: Mauseloche - Mauselöcher
90: Leinwanden - Leinwände
95: Schachten - Schächte
97: Gedrängen - Gedränge
100: Gasse - Gase
106: Cabriolete - Cabriolets
111: Wunsche - Wünsche
112: Fledermausen - Fledermäuse
115: Zwiebel - Zwiebeln
116: Gewächshause - Gewächshäuser
120: Organismuse - Organismen
124: Kirchturme - Kirchtürme
125: Pedante - Pedanten
129: Ahne - Ahnen
131: Knurrhahne - Knurrhähne
132: Ebenholze - Ebenhölzer
133: Sehnsuchten - Sehnsüchte
138: Perlhuhne - Perlhühner
146: Idiote - Idi