# Parameters

In [1]:
import os
import sys

In [2]:
# --- Reproducibility -------------------------------------------------------
# One seed shared by every random number generator seeded later on.
SEED = 1234

# --- Training hyperparameters ----------------------------------------------
batch_size = 16
num_epochs = 50
learning_rate = 1e-4

# --- Project locations -----------------------------------------------------
# Project root when running locally vs. inside a mounted Google Drive on Colab.
dir_local = '.'
dir_colab = 'drive/MyDrive/NLP_code_notebooks/project/'

# Helper module that lives in the Colab project folder and is copied next to
# the notebook before it is imported.
file_to_import = os.path.join(dir_colab, "byt5_model.py")

In [3]:
# Detect Google Colab by trying to import its runtime package.
# Only ImportError is caught: a bare `except:` would also hide unrelated
# failures and swallow KeyboardInterrupt/SystemExit.
try:
  import google.colab  # imported only to probe for availability
  IN_COLAB = True
except ImportError:
  IN_COLAB = False

In [4]:
if IN_COLAB:
  print('Running on Google Colab')
  from google.colab import drive

  !pip install transformers datasets
  drive.mount('/content/drive')
  dir_project = dir_colab
  !cp $file_to_import .
  #sys.path.append(dir_project)
else:
  print('Running locally')
  dir_project = dir_local

Running on Google Colab
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m60.3 MB/s[0m 

# Code

## Imports, time, and random seed

In [5]:
import pandas as pd
import numpy as np
import torch
from byt5_model import *
from pathlib import Path
from datetime import datetime

In [6]:
from transformers import set_seed

In [7]:
# Data directory: the 'data' folder inside the project directory.
# It is the path handed to load_raw_data_as_df when the datasets are read.
dir_dataset = os.path.join(dir_project, 'data')

In [8]:
# Timestamp taken when this cell runs, rendered as YYYY-MM-DD_HH-MM-SS.
time_string = format(datetime.now(), "%Y-%m-%d_%H-%M-%S")

In [9]:
# Seed every random number generator used in this notebook with the same SEED.
set_seed(SEED)           # helper imported from transformers
torch.manual_seed(SEED)  # PyTorch
np.random.seed(SEED)     # NumPy's global generator

In [10]:
# Checkpoint identifier passed to get_tokenizer and get_byt5_model.
model_name = "google/byt5-small"


## Data

In [11]:
# Presumably the column names of the raw data files; the name is not
# referenced again in the visible part of the notebook - TODO confirm it is needed.
header_names = ["lemma", "labels", "features"]

# German splits: no dataset is named, so load_raw_data_as_df's default applies.
df_train_ger, df_valid_ger, df_test_ger = load_raw_data_as_df(dir_dataset)

# Turkish splits, requested explicitly together with turkish_large=True
# (presumably selecting the larger Turkish training set - TODO confirm).
df_train_tur, df_valid_tur, df_test_tur = load_raw_data_as_df(
  dir_dataset,
  which_dataset="turkish",
  turkish_large=True,
)

In [12]:
# Preview the first rows of the German training frame
# (columns shown in the output: lemma, labels, features, inputs).
df_train_ger.head()

Unnamed: 0,lemma,labels,features,inputs
0,Plätzchen,Plätzchen,N;NOM;NEUT;PL,Plätzchen N;NOM;NEUT;PL
1,Kastanie,Kastanien,N;NOM;FEM;PL,Kastanie N;NOM;FEM;PL
2,Linie,Linien,N;NOM;FEM;PL,Linie N;NOM;FEM;PL
3,Scherz,Scherze,N;NOM;MASC;PL,Scherz N;NOM;MASC;PL
4,Wiederholung,Wiederholungen,N;NOM;FEM;PL,Wiederholung N;NOM;FEM;PL


In [13]:
# Preview the first rows of the Turkish training frame
# (columns shown in the output: lemma, labels, features, inputs).
df_train_tur.head()

Unnamed: 0,lemma,labels,features,inputs
0,masör,masörlerdim,N;ARGNO1S;PL;PST,masör N;ARGNO1S;PL;PST
1,kışkırtmak,kışkırtmış mıyım,V;SG;1;POS;PST;INTR;LGSPEC03,kışkırtmak V;SG;1;POS;PST;INTR;LGSPEC03
2,küçük,küçük değildiler,ADJ;DECL;PL;3;NEG;PST;LGSPEC01,küçük ADJ;DECL;PL;3;NEG;PST;LGSPEC01
3,Yahudi,Yahudimizin,N;GEN;SG;PSS1P,Yahudi N;GEN;SG;PSS1P
4,bitirttirmek,bitirttirecek olacak mıymışım,V;PROSP;SG;1;POS;FUT;INTR;LGSPEC03,bitirttirmek V;PROSP;SG;1;POS;FUT;INTR;LGSPEC03


In [14]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
# Each language gets its own tokenizer object; both are created from the same
# checkpoint name (German first, then Turkish).
tokenizer_ger, tokenizer_tur = (
  get_tokenizer(model_name),
  get_tokenizer(model_name),
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

In [16]:
# Write both tokenizers into sub-folders of the project directory.
# The cell output is the return value of the last call, i.e. the files
# written for the German tokenizer.
tokenizer_tur.save_pretrained(os.path.join(dir_project, "saved_tokenizer_tur"))
tokenizer_ger.save_pretrained(os.path.join(dir_project, "saved_tokenizer_ger"))

('drive/MyDrive/NLP_code_notebooks/project/saved_tokenizer_ger/tokenizer_config.json',
 'drive/MyDrive/NLP_code_notebooks/project/saved_tokenizer_ger/special_tokens_map.json',
 'drive/MyDrive/NLP_code_notebooks/project/saved_tokenizer_ger/added_tokens.json')

In [17]:
# Tokenize the German train/validation/test frames with the German tokenizer.
tokenized_train_ger, tokenized_valid_ger, tokenized_test_ger = get_tokenized_data(
  tokenizer_ger,
  df_train_ger,
  df_valid_ger,
  df_test_ger,
)

In [18]:
# Tokenize the Turkish train/validation/test frames with the Turkish tokenizer.
tokenized_train_tur, tokenized_valid_tur, tokenized_test_tur = get_tokenized_data(
  tokenizer_tur,
  df_train_tur,
  df_valid_tur,
  df_test_tur,
)

In [19]:
# Inspect the tokenized German training data; the output shows the
# 'input_ids', 'attention_mask' and 'labels' tensors.
tokenized_train_ger

{'input_ids': tensor([[ 83, 111, 198,  ...,   0,   0,   0],
         [ 78, 100, 118,  ...,   0,   0,   0],
         [ 79, 108, 113,  ...,   0,   0,   0],
         ...,
         [ 80, 108, 119,  ...,   0,   0,   0],
         [ 86, 119, 100,  ...,   0,   0,   0],
         [ 85, 120, 103,  ...,   0,   0,   0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[ 83, 111, 198,  ...,   0,   0,   0],
         [ 78, 100, 118,  ...,   0,   0,   0],
         [ 79, 108, 113,  ...,   0,   0,   0],
         ...,
         [ 80, 108, 119,  ...,   0,   0,   0],
         [ 86, 119, 100,  ...,   0,   0,   0],
         [ 85, 120, 103,  ...,   0,   0,   0]])}

In [20]:
# Inspect the tokenized Turkish training data; the output shows the
# 'input_ids', 'attention_mask' and 'labels' tensors.
tokenized_train_tur

{'input_ids': tensor([[112, 100, 118,  ...,   0,   0,   0],
         [110, 199, 180,  ...,   0,   0,   0],
         [110, 198, 191,  ...,   0,   0,   0],
         ...,
         [106, 198, 185,  ...,   0,   0,   0],
         [101, 120, 117,  ...,   0,   0,   0],
         [110, 100, 111,  ...,   0,   0,   0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[112, 100, 118,  ...,   0,   0,   0],
         [110, 199, 180,  ...,   0,   0,   0],
         [110, 198, 191,  ...,   0,   0,   0],
         ...,
         [106, 198, 185,  ...,   0,   0,   0],
         [101, 120, 117,  ...,   0,   0,   0],
         [110, 100, 111,  ...,   0,   0,   0]])}

## Fine-tuning the pretrained model for the German dataset

In [None]:
import torch
from transformers import T5ForConditionalGeneration

In [None]:
# Folder for the German fine-tuning run; it is passed to train_validation_loop
# (presumably as the place where the model is saved - TODO confirm).
dir_path_model_ger = os.path.join(dir_project, "saved_model_fine_ger_new")
# exist_ok=True replaces the separate isdir() check: the cell stays safe to
# re-run and there is no gap between checking and creating the directory.
os.makedirs(dir_path_model_ger, exist_ok=True)

In [None]:
# Create the model that will be fine-tuned on the German data, from the
# configured checkpoint name and target device.
model_fine_ger = get_byt5_model(device, model_name)

In [None]:
# Optimizer for the German model, using the learning rate from the parameters cell.
optimizer_fine_ger = get_optimizer(model_fine_ger, learning_rate)

In [None]:
# Training batches use the configured batch_size; no shuffle argument is given,
# so get_dataloader's default applies.
train_dataloader_ger = get_dataloader(
  tokenized_train_ger,
  batch_size=batch_size,
)
# Validation runs one example at a time, without shuffling.
valid_dataloader_ger = get_dataloader(
  tokenized_valid_ger,
  batch_size=1,
  shuffle=False,
)



In [None]:
# Bind both loss histories to empty lists before the training call rebinds them.
list_train_losses_fine_ger = []
list_valid_losses_fine_ger = []

# Fine-tune on the German data for num_epochs epochs; the loop returns the
# training and validation losses.
list_train_losses_fine_ger, list_valid_losses_fine_ger = train_validation_loop(
  model_fine_ger,
  train_dataloader_ger,
  valid_dataloader_ger,
  optimizer_fine_ger,
  device,
  dir_path_model_ger,
  num_epochs=num_epochs,
)

# Store both loss histories as .npy files in the project directory.
np.save(
  os.path.join(dir_project, "train_losses_fine_ger.npy"),
  np.array(list_train_losses_fine_ger),
)
np.save(
  os.path.join(dir_project, "valid_losses_fine_ger.npy"),
  np.array(list_valid_losses_fine_ger),
)

epoch: 1 / 50, train loss: 35.1419, validation loss: 17.3433
epoch: 2 / 50, train loss: 12.8769, validation loss: 4.0278
epoch: 3 / 50, train loss: 4.2891, validation loss: 3.5307
epoch: 4 / 50, train loss: 3.6173, validation loss: 2.9870
epoch: 5 / 50, train loss: 3.0373, validation loss: 2.1625
epoch: 6 / 50, train loss: 2.0979, validation loss: 0.8594
epoch: 7 / 50, train loss: 0.8113, validation loss: 0.1864
epoch: 8 / 50, train loss: 0.3339, validation loss: 0.1204
epoch: 9 / 50, train loss: 0.2314, validation loss: 0.0991
epoch: 10 / 50, train loss: 0.1700, validation loss: 0.0877
epoch: 11 / 50, train loss: 0.1565, validation loss: 0.0856
epoch: 12 / 50, train loss: 0.1402, validation loss: 0.0700
epoch: 13 / 50, train loss: 0.1241, validation loss: 0.0608
epoch: 14 / 50, train loss: 0.1068, validation loss: 0.0591
epoch: 15 / 50, train loss: 0.0986, validation loss: 0.0540
epoch: 16 / 50, train loss: 0.0954, validation loss: 0.0499
epoch: 17 / 50, train loss: 0.0878, validation

## Fine-tuning the pretrained model for the Turkish dataset


In [22]:
# Folder for the Turkish fine-tuning run; it is passed to train_validation_loop
# (presumably as the place where the model is saved - TODO confirm).
dir_path_model_tur = os.path.join(dir_project, "saved_model_fine_tur_latest")
# exist_ok=True replaces the separate isdir() check: the cell stays safe to
# re-run and there is no gap between checking and creating the directory.
os.makedirs(dir_path_model_tur, exist_ok=True)

In [24]:
# Create the model that will be fine-tuned on the Turkish data, from the
# configured checkpoint name and target device.
model_fine_tur = get_byt5_model(device, model_name)

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [25]:
# Optimizer for the Turkish model, using the learning rate from the parameters cell.
optimizer_fine_tur = get_optimizer(model_fine_tur, learning_rate)

In [26]:
# Training batches use the configured batch_size; no shuffle argument is given,
# so get_dataloader's default applies.
train_dataloader_tur = get_dataloader(
  tokenized_train_tur,
  batch_size=batch_size,
)
# Validation runs one example at a time, without shuffling.
valid_dataloader_tur = get_dataloader(
  tokenized_valid_tur,
  batch_size=1,
  shuffle=False,
)



In [27]:
# Bind both loss histories to empty lists before the training call rebinds them.
list_train_losses_fine_tur = []
list_valid_losses_fine_tur = []

# Fine-tune on the Turkish data for num_epochs epochs; the loop returns the
# training and validation losses.
list_train_losses_fine_tur, list_valid_losses_fine_tur = train_validation_loop(
  model_fine_tur,
  train_dataloader_tur,
  valid_dataloader_tur,
  optimizer_fine_tur,
  device,
  dir_path_model_tur,
  num_epochs=num_epochs,
)

# Store both loss histories as .npy files in the project directory.
np.save(
  os.path.join(dir_project, "train_losses_fine_tur.npy"),
  np.array(list_train_losses_fine_tur),
)
np.save(
  os.path.join(dir_project, "valid_losses_fine_tur.npy"),
  np.array(list_valid_losses_fine_tur),
)

epoch: 1 / 50, train loss: 5.0631, validation loss: 0.2509
epoch: 2 / 50, train loss: 0.2683, validation loss: 0.1674
epoch: 3 / 50, train loss: 0.1920, validation loss: 0.1522
epoch: 4 / 50, train loss: 0.1653, validation loss: 0.1362
epoch: 5 / 50, train loss: 0.1411, validation loss: 0.1052
epoch: 6 / 50, train loss: 0.1082, validation loss: 0.0806
epoch: 7 / 50, train loss: 0.0804, validation loss: 0.0512
epoch: 8 / 50, train loss: 0.0549, validation loss: 0.0337
epoch: 9 / 50, train loss: 0.0361, validation loss: 0.0195
epoch: 10 / 50, train loss: 0.0256, validation loss: 0.0157
epoch: 11 / 50, train loss: 0.0189, validation loss: 0.0128
epoch: 12 / 50, train loss: 0.0147, validation loss: 0.0111
epoch: 13 / 50, train loss: 0.0117, validation loss: 0.0112
epoch: 14 / 50, train loss: 0.0098, validation loss: 0.0100
epoch: 15 / 50, train loss: 0.0088, validation loss: 0.0083
epoch: 16 / 50, train loss: 0.0074, validation loss: 0.0089
epoch: 17 / 50, train loss: 0.0061, validation lo