# Parameters

In [1]:
import os
import sys

In [2]:
# Global random seed, reused by every seeding call in this notebook
SEED = 1234

# Project root: the current directory locally, a Google Drive folder on Colab
dir_local = '.'
dir_colab = 'drive/MyDrive/NLP_code_notebooks/project/'
# Location (inside the Colab project folder) of the helper module byt5_model.py
file_to_import = os.path.join(dir_colab, "byt5_model.py")

# Training hyperparameters
learning_rate, num_epochs, batch_size = 1e-4, 50, 16

In [3]:
# Detect Google Colab by probing for the `google.colab` package, which is
# expected to be importable only inside a Colab runtime.
try:
  import google.colab
  IN_COLAB = True
except ImportError:
  # Only a missing package means "not on Colab"; any other failure should
  # surface instead of being silently swallowed by a bare `except`.
  IN_COLAB = False

In [4]:
if IN_COLAB:
  print('Running on Google Colab')
  from google.colab import drive

  !pip install transformers datasets SentencePiece
  drive.mount('/content/drive')
  dir_project = dir_colab
  !cp $file_to_import .
  #sys.path.append(dir_project)
else:
  print('Running locally')
  dir_project = dir_local

Running on Google Colab
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Code

## Imports, time, and random seed

In [5]:
import pandas as pd
import numpy as np
import torch
from byt5_model import *
from pathlib import Path
from datetime import datetime

In [6]:
from transformers import set_seed

In [7]:
# Dataset folder: the `data` subdirectory of the active project directory
dir_dataset = os.path.join(dir_project, 'data')

In [8]:
# Timestamp of this run, formatted as YYYY-MM-DD_HH-MM-SS
# NOTE(review): not referenced by any cell visible here — confirm it is still needed
time_string = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [9]:
# Seed torch, numpy and the transformers helper with the shared SEED so that
# runs are repeatable.
torch.manual_seed(SEED)
np.random.seed(SEED)
set_seed(SEED)

In [10]:
# Checkpoint identifier passed to get_tokenizer and get_byt5_model below
model_name = 'google/byt5-small'


## Data

In [11]:
# NOTE(review): header_names is not passed to the loader calls below and is not
# used in any cell visible here — confirm it is still needed.
header_names = ["lemma", "labels", "features"]
# Load the train/validation/test splits as DataFrames. The first call relies on
# the loader's default dataset (presumably German — confirm in byt5_model);
# the second explicitly requests the Turkish one.
df_train_ger, df_valid_ger, df_test_ger = load_raw_data_as_df(dir_dataset)
df_train_tur, df_valid_tur, df_test_tur = load_raw_data_as_df(dir_dataset, which_dataset="turkish")

In [12]:
# Preview the first rows of the German training frame
df_train_ger.head()

Unnamed: 0,lemma,labels,features,inputs
0,Plätzchen,Plätzchen,N;NOM;NEUT;PL,Plätzchen N;NOM;NEUT;PL
1,Kastanie,Kastanien,N;NOM;FEM;PL,Kastanie N;NOM;FEM;PL
2,Linie,Linien,N;NOM;FEM;PL,Linie N;NOM;FEM;PL
3,Scherz,Scherze,N;NOM;MASC;PL,Scherz N;NOM;MASC;PL
4,Wiederholung,Wiederholungen,N;NOM;FEM;PL,Wiederholung N;NOM;FEM;PL


In [13]:
# Preview the first rows of the Turkish training frame
df_train_tur.head()

Unnamed: 0,lemma,labels,features,inputs
0,kurtarmak,kurtarmış olacak mıydı,V;PROSP;SG;3;POS;PST;INTR;LGSPEC03,kurtarmak V;PROSP;SG;3;POS;PST;INTR;LGSPEC03
1,inmek,inecek olacak mıyım,V;PROSP;IND;SG;1;POS;FUT;INTR,inmek V;PROSP;IND;SG;1;POS;FUT;INTR
2,kaçmak,kaçmamışlarmış,V;DECL;PL;3;NEG;PST;LGSPEC01;LGSPEC03,kaçmak V;DECL;PL;3;NEG;PST;LGSPEC01;LGSPEC03
3,söz vermek,söz vermeyeceğim,V;DECL;IND;SG;1;NEG;FUT,söz vermek V;DECL;IND;SG;1;NEG;FUT
4,parlatmak,parlatmamış mısın,V;SG;2;NEG;PST;INTR;LGSPEC03,parlatmak V;SG;2;NEG;PST;INTR;LGSPEC03


In [14]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
# One tokenizer object per language; both are created from the same
# checkpoint name (model_name).
tokenizer_ger = get_tokenizer(model_name)
tokenizer_tur = get_tokenizer(model_name)

In [16]:
# Save each tokenizer into its own folder under the project directory.
# The cell displays the return value of the last call only, i.e. the file
# paths written for the German tokenizer.
tokenizer_tur.save_pretrained(os.path.join(dir_project, "saved_tokenizer_tur"))
tokenizer_ger.save_pretrained(os.path.join(dir_project, "saved_tokenizer_ger"))

('drive/MyDrive/NLP_code_notebooks/project/saved_tokenizer_ger/tokenizer_config.json',
 'drive/MyDrive/NLP_code_notebooks/project/saved_tokenizer_ger/special_tokens_map.json',
 'drive/MyDrive/NLP_code_notebooks/project/saved_tokenizer_ger/added_tokens.json')

In [17]:
# Tokenize the German train/validation/test frames with the German tokenizer
tokenized_train_ger, tokenized_valid_ger, tokenized_test_ger = get_tokenized_data(
    tokenizer_ger,
    df_train_ger,
    df_valid_ger,
    df_test_ger,
)

In [18]:
# Tokenize the Turkish train/validation/test frames with the Turkish tokenizer
tokenized_train_tur, tokenized_valid_tur, tokenized_test_tur = get_tokenized_data(
    tokenizer_tur,
    df_train_tur,
    df_valid_tur,
    df_test_tur,
)

In [19]:
# Inspect the tokenized German training data (input_ids, attention_mask and
# labels tensors)
tokenized_train_ger

{'input_ids': tensor([[ 83, 111, 198,  ...,   0,   0,   0],
         [ 78, 100, 118,  ...,   0,   0,   0],
         [ 79, 108, 113,  ...,   0,   0,   0],
         ...,
         [ 80, 108, 119,  ...,   0,   0,   0],
         [ 86, 119, 100,  ...,   0,   0,   0],
         [ 85, 120, 103,  ...,   0,   0,   0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[ 83, 111, 198,  ...,   0,   0,   0],
         [ 78, 100, 118,  ...,   0,   0,   0],
         [ 79, 108, 113,  ...,   0,   0,   0],
         ...,
         [ 80, 108, 119,  ...,   0,   0,   0],
         [ 86, 119, 100,  ...,   0,   0,   0],
         [ 85, 120, 103,  ...,   0,   0,   0]])}

In [20]:
# Inspect the tokenized Turkish training data (input_ids, attention_mask and
# labels tensors)
tokenized_train_tur

{'input_ids': tensor([[110, 120, 117,  ...,   0,   0,   0],
         [108, 113, 112,  ...,   0,   0,   0],
         [110, 100, 198,  ...,   0,   0,   0],
         ...,
         [125, 100, 105,  ...,   0,   0,   0],
         [118, 199, 180,  ...,   0,   0,   0],
         [101, 198, 185,  ...,   0,   0,   0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[110, 120, 117,  ...,   0,   0,   0],
         [108, 113, 104,  ...,   0,   0,   0],
         [110, 100, 198,  ...,   0,   0,   0],
         ...,
         [125, 100, 105,  ...,   0,   0,   0],
         [118, 199, 180,  ...,   0,   0,   0],
         [101, 198, 185,  ...,   0,   0,   0]])}

## Training the model from scratch on the German dataset

In [21]:
import torch
from transformers import T5ForConditionalGeneration, T5Config

In [22]:
# Directory handed to the training loop for the German model. Create it
# (including parents) if it is missing; exist_ok=True keeps the cell safe to
# re-run and avoids the check-then-create race of a separate isdir() test.
dir_path_model_ger = os.path.join(dir_project, "saved_model_scratch_ger_new")
os.makedirs(dir_path_model_ger, exist_ok=True)

In [23]:
# Build the German model on `device` with pretrained=False.
# NOTE(review): presumably this means randomly initialised weights (training
# from scratch) — confirm in byt5_model.get_byt5_model.
model_scratch_ger = get_byt5_model(device, model_name, pretrained=False)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [24]:
# Optimizer for the German model, using the learning rate from the parameters cell
optimizer_scratch_ger = get_optimizer(model_scratch_ger, learning_rate)

In [25]:
# Training batches use the configured batch_size; shuffling is left to
# get_dataloader's default (NOTE(review): confirm that the default shuffles).
train_dataloader_ger = get_dataloader(tokenized_train_ger, batch_size=batch_size)
# Validation is fed one example per batch with shuffling disabled.
valid_dataloader_ger = get_dataloader(tokenized_valid_ger, batch_size=1, shuffle=False)



In [26]:
# Train the German model for `num_epochs` epochs. The loop returns the training
# and validation loss histories (presumably one value per epoch — TODO confirm);
# dir_path_model_ger is passed through, presumably as the checkpoint directory.
# The former `[], []` pre-initialisation was a dead store (overwritten
# unconditionally by this assignment) and has been removed.
list_train_losses_scratch_ger, list_valid_losses_scratch_ger = train_validation_loop(
    model_scratch_ger,
    train_dataloader_ger,
    valid_dataloader_ger,
    optimizer_scratch_ger,
    device,
    dir_path_model_ger,
    num_epochs=num_epochs,
)
# Persist both loss histories as .npy files in the project directory
np.save(os.path.join(dir_project, "train_losses_scratch_ger.npy"), np.array(list_train_losses_scratch_ger))
np.save(os.path.join(dir_project, "valid_losses_scratch_ger.npy"), np.array(list_valid_losses_scratch_ger))

epoch: 1 / 50, train loss: 3.3330, validation loss: 3.1108
epoch: 2 / 50, train loss: 2.4358, validation loss: 2.4808
epoch: 3 / 50, train loss: 1.9812, validation loss: 2.0092
epoch: 4 / 50, train loss: 1.6831, validation loss: 1.7649
epoch: 5 / 50, train loss: 1.4661, validation loss: 1.5621
epoch: 6 / 50, train loss: 1.2881, validation loss: 1.4272
epoch: 7 / 50, train loss: 1.1610, validation loss: 1.3568
epoch: 8 / 50, train loss: 1.0563, validation loss: 1.3115
epoch: 9 / 50, train loss: 0.9587, validation loss: 1.2077
epoch: 10 / 50, train loss: 0.8865, validation loss: 1.1632
epoch: 11 / 50, train loss: 0.8192, validation loss: 1.1580
epoch: 12 / 50, train loss: 0.7424, validation loss: 1.1138
epoch: 13 / 50, train loss: 0.6659, validation loss: 1.1066
epoch: 14 / 50, train loss: 0.6254, validation loss: 1.0718
epoch: 15 / 50, train loss: 0.5571, validation loss: 1.0357
epoch: 16 / 50, train loss: 0.4869, validation loss: 0.9973
epoch: 17 / 50, train loss: 0.4480, validation lo

## Training the model from scratch on the Turkish dataset


In [27]:
# Directory handed to the training loop for the Turkish model. Create it
# (including parents) if it is missing; exist_ok=True keeps the cell safe to
# re-run and avoids the check-then-create race of a separate isdir() test.
dir_path_model_tur = os.path.join(dir_project, "saved_model_scratch_tur_new")
os.makedirs(dir_path_model_tur, exist_ok=True)

In [28]:
# Build the Turkish model on `device` with pretrained=False.
# NOTE(review): presumably this means randomly initialised weights (training
# from scratch) — confirm in byt5_model.get_byt5_model.
model_scratch_tur = get_byt5_model(device, model_name, pretrained=False)

In [29]:
# Optimizer for the Turkish model, using the learning rate from the parameters cell
optimizer_scratch_tur = get_optimizer(model_scratch_tur, learning_rate)

In [30]:
# Training batches use the configured batch_size; shuffling is left to
# get_dataloader's default (NOTE(review): confirm that the default shuffles).
train_dataloader_tur = get_dataloader(tokenized_train_tur, batch_size=batch_size)
# Validation is fed one example per batch with shuffling disabled.
valid_dataloader_tur = get_dataloader(tokenized_valid_tur, batch_size=1, shuffle=False)

In [31]:
# Train the Turkish model for `num_epochs` epochs. The loop returns the training
# and validation loss histories (presumably one value per epoch — TODO confirm);
# dir_path_model_tur is passed through, presumably as the checkpoint directory.
# The former `[], []` pre-initialisation was a dead store (overwritten
# unconditionally by this assignment) and has been removed.
list_train_losses_scratch_tur, list_valid_losses_scratch_tur = train_validation_loop(
    model_scratch_tur,
    train_dataloader_tur,
    valid_dataloader_tur,
    optimizer_scratch_tur,
    device,
    dir_path_model_tur,
    num_epochs=num_epochs,
)
# Persist both loss histories as .npy files in the project directory
np.save(os.path.join(dir_project, "train_losses_scratch_tur.npy"), np.array(list_train_losses_scratch_tur))
np.save(os.path.join(dir_project, "valid_losses_scratch_tur.npy"), np.array(list_valid_losses_scratch_tur))

epoch: 1 / 50, train loss: 3.5470, validation loss: 2.3004
epoch: 2 / 50, train loss: 2.1254, validation loss: 1.5940
epoch: 3 / 50, train loss: 1.6711, validation loss: 1.3628
epoch: 4 / 50, train loss: 1.4596, validation loss: 1.2068
epoch: 5 / 50, train loss: 1.2762, validation loss: 1.0468
epoch: 6 / 50, train loss: 1.1063, validation loss: 0.9149
epoch: 7 / 50, train loss: 0.9497, validation loss: 0.8099
epoch: 8 / 50, train loss: 0.8287, validation loss: 0.7169
epoch: 9 / 50, train loss: 0.7148, validation loss: 0.6418
epoch: 10 / 50, train loss: 0.6413, validation loss: 0.5991
epoch: 11 / 50, train loss: 0.5854, validation loss: 0.5817
epoch: 12 / 50, train loss: 0.5270, validation loss: 0.5471
epoch: 13 / 50, train loss: 0.4734, validation loss: 0.5089
epoch: 14 / 50, train loss: 0.4269, validation loss: 0.4963
epoch: 15 / 50, train loss: 0.3972, validation loss: 0.4817
epoch: 16 / 50, train loss: 0.3698, validation loss: 0.4527
epoch: 17 / 50, train loss: 0.3380, validation lo