# Parameters

In [1]:
import os

In [2]:
# Seed shared by every random number generator used in this notebook
SEED = 1234

# Base model: a Hugging Face model name or the location of a model on your local device
model_name = 'google/byt5-small'

# Project root, either the current directory (local run) or a Google Drive folder (Colab run)
dir_local = '.'
dir_colab = 'drive/MyDrive/NLP_code_notebooks/project/'

# Helper module stored in the Drive project folder; it is copied into the
# working directory when the notebook runs on Colab
file_to_import = os.path.join(dir_colab, "byt5_model.py")

In [3]:
# Probe for the `google.colab` package; being able to import it is used as the
# signal that the notebook is running on Google Colab.
try:
    import google.colab  # noqa: F401 -- imported only to test availability
    IN_COLAB = True
except ImportError:
    # Catch only the failed import: a bare `except` would also swallow
    # KeyboardInterrupt, SystemExit and unrelated errors.
    IN_COLAB = False

In [4]:
if IN_COLAB:
    print('Running on Google Colab')
    from google.colab import drive

    !pip install transformers datasets
    drive.mount('/content/drive')
    dir_project = dir_colab
    !cp $file_to_import .
else:
    print('Running locally')
    dir_project = dir_local

Running on Google Colab
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m85.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m25.0 MB/s[0m eta [

# Code

## Imports, time, and random seed

In [5]:
import os
import numpy as np
import pandas as pd
import torch
from datetime import datetime
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, T5ForConditionalGeneration, set_seed

In [6]:
# Apply the same SEED through PyTorch, the transformers `set_seed` helper and
# NumPy's global generator so that repeated runs start from the same state.
torch.manual_seed(SEED)
set_seed(SEED)
np.random.seed(SEED)

In [7]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
from byt5_model import *

In [9]:
# Dataset folder, expected as `data` directly under the project directory.
dir_dataset = os.path.join(dir_project, 'data')

In [10]:
# Keep only the third object returned by `load_raw_data_as_df` for each language
# (presumably the test split; the helper is assumed to come from byt5_model -- TODO confirm).
# The first call uses the helper's default dataset, which is treated as German here.
_, _, df_test_ger = load_raw_data_as_df(dir_dataset)
_, _, df_test_tur = load_raw_data_as_df(dir_dataset, which_dataset="turkish")

In [11]:
def acc_score_new(pred, gold, dec):
    """Print and return the exact-match accuracy of ``pred`` against ``gold``.

    Items are compared position by position with ``==``. The denominator is
    ``len(gold)``, so gold items that have no prediction count as incorrect,
    and predictions beyond the end of ``gold`` are ignored.

    Args:
        pred: Sequence of predicted items.
        gold: Sequence of expected items.
        dec: Number of decimals the score is rounded to.

    Returns:
        The rounded accuracy; ``0.0`` when ``gold`` is empty.
    """
    if len(gold) == 0:
        # Nothing to evaluate: avoid dividing by zero.
        score = 0.0
    else:
        # zip stops at the shorter sequence, so surplus predictions cannot
        # index past the end of `gold`.
        n_correct = sum(1 for p, g in zip(pred, gold) if p == g)
        score = round(n_correct / len(gold), dec)

    print(f'The accuracy score is {score:.4f}')
    return score

## Evaluation German language

In [12]:
# Directory under the project root from which the German model is loaded below.
dir_model_fine_ger = os.path.join(dir_project, "saved_model_fine_ger")

In [13]:
# Directory under the project root from which the German tokenizer is loaded below.
dir_tokenizer_ger = os.path.join(dir_project, "saved_tokenizer_ger")

In [14]:
# Load the German tokenizer from its saved directory.
tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer_ger)

In [15]:
# Load the German model from its saved directory and move it to the selected device.
# NOTE(review): `config=model_name` names the base checkpoint rather than the
# saved directory -- confirm that overriding the saved config is intended.
gen_model = T5ForConditionalGeneration.from_pretrained(
    dir_model_fine_ger,
    config=model_name,
    return_dict=True,
)
gen_model.to(device)

# Tokenize every German test input as one padded batch.
gen_inputs = tokenizer(
    [f"{item}" for item in df_test_ger["inputs"]],
    padding=True,
    return_tensors="pt",
).to(device)

# Beam search with 5 beams and sampling switched off; max_length is 60.
outputs = gen_model.generate(
    attention_mask=gen_inputs["attention_mask"],
    input_ids=gen_inputs["input_ids"],
    do_sample=False,  # disable sampling to test if batching affects output
    num_beams=5,
    max_length=60,
)

# Decode the generated ids back to text without special tokens.
gen_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

In [16]:
# Side-by-side table of the German test labels ("Expected") and the generated strings ("Predicted").
df_generated_comparison = pd.DataFrame.from_dict({"Expected": df_test_ger["labels"], "Predicted": gen_outputs})

In [17]:
# Preview the first ten German expected/predicted pairs.
df_generated_comparison.head(10)

Unnamed: 0,Expected,Predicted
0,Orgien,Orgien
1,Sieger,Sieger
2,Klötze,Klotze
3,Kalke,Kalke
4,Skelette,Skelette
5,Flocken,Flocken
6,Schwänze,Schwänze
7,Schwämme,Schwämme
8,Kegel,Kegel
9,Geckos,Geckon


In [18]:
# Exact-match accuracy on the German test set, rounded to 4 decimals.
acc_score_new(df_generated_comparison['Predicted'].to_list(), df_generated_comparison['Expected'].to_list(), 4)

The accuracy score is 0.8017


In [None]:
# file_csv_generated_output_comparison = os.path.join(dir_project, "generated_words.csv")
# df_generated_comparison.to_csv(file_csv_generated_output_comparison)

## Evaluation Turkish language

In [21]:
# Load the Turkish tokenizer from its saved directory under the project root.
dir_tokenizer_tur = os.path.join(dir_project, "saved_tokenizer_tur")
tokenizer_tur = AutoTokenizer.from_pretrained(dir_tokenizer_tur)

In [22]:
# Load the Turkish model from its saved directory and move it to the selected device.
dir_model_fine_tur = os.path.join(dir_project, "saved_model_fine_tur_latest")
gen_model_tur = T5ForConditionalGeneration.from_pretrained(dir_model_fine_tur, return_dict=True, config=model_name)
gen_model_tur.to(device)

# Tokenize every Turkish test input as one padded tensor.
gen_inputs_tur = tokenizer_tur([f"{item}" for item in df_test_tur["inputs"]], return_tensors="pt", padding=True).to(device)

# Generate in chunks of 500 rows (presumably to limit memory use). The chunk
# boundaries are derived from the number of tokenized inputs, so every row is
# generated exactly once whatever the size of the test set; a fixed number of
# iterations would truncate larger sets and pass empty batches for smaller ones.
batch_size_tur = 500
num_test_inputs_tur = gen_inputs_tur["input_ids"].shape[0]

outputs_tur_complete_set = []

for num_test_count in range(0, num_test_inputs_tur, batch_size_tur):
    outputs_tur = gen_model_tur.generate(
        input_ids=gen_inputs_tur["input_ids"][num_test_count:num_test_count + batch_size_tur],
        attention_mask=gen_inputs_tur["attention_mask"][num_test_count:num_test_count + batch_size_tur],
        max_length=64,
        num_beams=3,
        do_sample=False,  # disable sampling to test if batching affects output
    )
    # Extending a list with a 2-D tensor appends one id sequence per row.
    outputs_tur_complete_set.extend(outputs_tur)

# Decode all generated sequences and pair them with the expected labels.
gen_outputs_tur = tokenizer_tur.batch_decode(outputs_tur_complete_set, skip_special_tokens=True)
df_generated_comparison_tur = pd.DataFrame.from_dict({"Expected": df_test_tur["labels"], "Predicted": gen_outputs_tur})

In [23]:
# Preview the first ten Turkish expected/predicted pairs.
df_generated_comparison_tur.head(10)

Unnamed: 0,Expected,Predicted
0,satıyor olmalı mıydık,satıyor olmalı mıydık
1,hamızlarından,hamızlarından
2,ovmaz mıydı,ovmaz mıydı
3,yıpranmamalı mıymışsınız,yıpranmamalı mıymışsınız
4,hüzünlerde,hüzünlerde
5,manivelanızda,manivelanızda
6,dokuyor olmalısın,dokuyor olmalısın
7,barışmamalılar mıydı,barışmamalılar mıydı
8,sahipli yazılımlarımızı,sahipli yazılımlarımızı
9,yanıt vermedim mi,yanıt vermedim mi


In [24]:
# Exact-match accuracy on the Turkish test set, rounded to 4 decimals.
acc_score_new(df_generated_comparison_tur['Predicted'].to_list(), df_generated_comparison_tur['Expected'].to_list(), 4)

The accuracy score is 0.9420
