# Parameters

In [10]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Using cached transformers-4.26.1-py3-none-any.whl (6.3 MB)
Collecting datasets
  Using cached datasets-2.10.1-py3-none-any.whl (469 kB)
Collecting huggingface-hub<1.0,>=0.11.0
  Using cached huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
Collecting aiohttp
  Using cached aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
Collecting aiosignal>=1.1.2
  Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Installing collected packages: aiosignal, huggingface-hub, aiohttp, transformers, datasets
Successfully installed aiohttp-3.8.4 aiosignal-1.3.1 datasets-2.10.1 huggingface-hub-0.13.2 transformers-4.26.1


In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
dir_data_google_drive = "drive/MyDrive/NLP_code_notebooks/project/"

In [13]:
# Set global random seed
SEED = 1234

# Set the locations of the train, dev, and test files
# Gold is used for test it is the same as .test, but with the actual output labels attached in a similar format to train and dev
file_train = 'data/deu_600.train'
file_validation = 'data/deu.dev'
file_test = 'data/deu.gold'

# Give the model(name) for the Huggingface or the location of a model on your local device
# Outputname can be left blank, unless you want to specify a specific name for the currently trained model
# Prefix is the prefix used for the task that we are finetuning the model on
# GEN_MODEL_OVERRIDE can be used to load a saved model for generation

model_name = 'google/byt5-small'


# Set the important parameters for the model
learning_rate = 1e-4
num_epochs = 20
batch_size = 16

# Code

## Imports, time, and random seed

In [14]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, get_scheduler, T5ForConditionalGeneration, set_seed, AutoModelForSeq2SeqLM
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from pathlib import Path
from datetime import datetime

In [15]:
import os

In [16]:
time_string = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

In [17]:
torch.manual_seed(SEED)
set_seed(SEED)
np.random.seed(SEED)

## Data

In [18]:
#dir_data = "drive/MyDrive/NLP_code_notebooks/project/data/"

In [19]:
header_names = ['lemma', 'labels', 'features']

df_train = pd.read_csv(os.path.join(dir_data_google_drive, file_train), sep='\t', names=header_names)
df_valid = pd.read_csv(os.path.join(dir_data_google_drive, file_validation), sep='\t', names=header_names)
df_test = pd.read_csv(os.path.join(dir_data_google_drive, file_test), sep='\t', names=header_names)

In [20]:
df_train.head()

Unnamed: 0,lemma,labels,features
0,Plätzchen,Plätzchen,N;NOM;NEUT;PL
1,Kastanie,Kastanien,N;NOM;FEM;PL
2,Linie,Linien,N;NOM;FEM;PL
3,Scherz,Scherze,N;NOM;MASC;PL
4,Wiederholung,Wiederholungen,N;NOM;FEM;PL


In [21]:
df_train["inputs"] = df_train["lemma"] + df_train["features"]

In [22]:
df_valid["inputs"] = df_valid["lemma"] + df_valid["features"]

In [23]:
df_test["inputs"] = df_test["lemma"] + df_test["features"]

In [24]:
df_train.head()

Unnamed: 0,lemma,labels,features,inputs
0,Plätzchen,Plätzchen,N;NOM;NEUT;PL,PlätzchenN;NOM;NEUT;PL
1,Kastanie,Kastanien,N;NOM;FEM;PL,KastanieN;NOM;FEM;PL
2,Linie,Linien,N;NOM;FEM;PL,LinieN;NOM;FEM;PL
3,Scherz,Scherze,N;NOM;MASC;PL,ScherzN;NOM;MASC;PL
4,Wiederholung,Wiederholungen,N;NOM;FEM;PL,WiederholungN;NOM;FEM;PL


In [25]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
morph_inflection_model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [26]:
def tokenize_function(df_data, input_column_name="inputs"):
    tokenized_dict = {}
    
    inputs = tokenizer(df_data[input_column_name].to_list(), padding="longest", return_tensors="pt")
    labels = tokenizer(df_data["labels"].to_list(), padding="longest", return_tensors="pt").input_ids

    tokenized_dict["input_ids"] = inputs["input_ids"]
    tokenized_dict["attention_mask"] = inputs["attention_mask"]
    tokenized_dict["labels"] = labels

    return tokenized_dict

In [27]:
tokenized_train = tokenize_function(df_train)

In [28]:
tokenized_valid = tokenize_function(df_valid)

In [29]:
tokenized_test = tokenize_function(df_test)

In [30]:
tokenized_train

{'input_ids': tensor([[ 83, 111, 198,  ...,   0,   0,   0],
         [ 78, 100, 118,  ...,   0,   0,   0],
         [ 79, 108, 113,  ...,   0,   0,   0],
         ...,
         [ 80, 108, 119,  ...,   0,   0,   0],
         [ 86, 119, 100,  ...,   0,   0,   0],
         [ 85, 120, 103,  ...,   0,   0,   0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[ 83, 111, 198,  ...,   0,   0,   0],
         [ 78, 100, 118,  ...,   0,   0,   0],
         [ 79, 108, 113,  ...,   0,   0,   0],
         ...,
         [ 80, 108, 119,  ...,   0,   0,   0],
         [ 86, 119, 100,  ...,   0,   0,   0],
         [ 85, 120, 103,  ...,   0,   0,   0]])}

## Training

In [None]:
class MorphInflectionDataset(Dataset):
    def __init__(self, dict_data):
        self.dict_data = dict_data

    def __len__(self):
        return len(self.dict_data["labels"])

    def __getitem__(self, idx):
        dict_sample = {}
        input_ids = self.dict_data["input_ids"][idx]
        attention_mask = self.dict_data["attention_mask"][idx]
        labels = self.dict_data["labels"][idx]
        return input_ids, attention_mask, labels

In [None]:
#eval_dataloader = DataLoader(tokenized_dev, batch_size=BATCHES)

optimizer = AdamW(morph_inflection_model.parameters(), lr=learning_rate)

In [None]:
train_dataloader = DataLoader(MorphInflectionDataset(tokenized_train), shuffle=True, batch_size=batch_size, num_workers=4)

In [None]:
train_dataloader

In [None]:
morph_inflection_model.train()

In [None]:
num_train_batches = len(train_dataloader)

In [None]:
for epoch in range(num_epochs):
    loss_for_epoch = 0.0
    for input_ids, attention_mask, labels in train_dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()

        outputs = morph_inflection_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss_for_epoch += loss
        loss.backward()
        optimizer.step()
    loss_for_epoch /= num_batches
    print(f"{epoch + 1} / {num_epochs}, loss: {loss:.4f}")

## Model storage

In [32]:
file_path_model = os.path.join(dir_data_google_drive , "morph_inflection_byt5_small.pth")

In [123]:
torch.save(morph_inflection_model.state_dict(), file_path_model)

## Model load and generation

In [33]:
gen_model = T5ForConditionalGeneration.from_pretrained(file_path_model, return_dict=True, config=model_name)
gen_model.to(device)

gen_inputs = tokenizer([f"{item}" for item in df_test["inputs"]], return_tensors="pt", padding=True).to(device)

outputs = gen_model.generate(
    input_ids=gen_inputs["input_ids"],
    attention_mask=gen_inputs["attention_mask"],
    # max_length=50,
    # num_beams=5,
    # no_repeat_ngram_size=2,
    # early_stopping=True,
    do_sample=False,  # disable sampling to test if batching affects output
)

gen_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)



In [34]:
df_generated_comparison = pd.DataFrame.from_dict({"Expected": df_test["labels"], "Predicted": gen_outputs})

In [35]:
df_generated_comparison.head(20)

Unnamed: 0,Expected,Predicted
0,Orgien,Orgien
1,Sieger,Sieger
2,Klötze,Klotze
3,Kalke,Kalke
4,Skelette,Skelette
5,Flocken,Flocken
6,Schwänze,Schwanze
7,Schwämme,Schwamme
8,Kegel,Kegel
9,Geckos,Geckos


In [132]:
file_csv_generated_output_comparison = os.path.join(dir_data_google_drive, "generated_words.csv")

In [133]:
df_generated_comparison.to_csv(file_csv_generated_output_comparison)

## Evaluation

In [None]:
# to be done