In [22]:
# SOURCE: https://github.com/M-Taghizadeh/flan-t5-base-imdb-text-classification

In [None]:
! pip install transformers pytorch_lightning sentencepiece datasets

In [None]:
! pip install -q tqdm pandas scikit-learn evaluate nltk

In [None]:
! pip install huggingface_hub accelerate

In [None]:
# from huggingface_hub import login
# login()

In [3]:
import os
import shutil
import json
import time
import re
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch

def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [4]:
import datasets
from datasets import Dataset, DatasetDict
from datasets import concatenate_datasets

from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

UKRAINIAN_LETTERS = 'абвгґдеєжзиіїйклмнопрстуфхцчшщьюя'
UKRAINAIN_VOWELS = 'аеєиіїоуюя'
ENGLISH_LETTERS = 'abcdefghijklmnopqrstuvwxyz'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
allowed_punctuation = """ .,!?;:'"«»()+-—–"""
other_punctuation =  """ $%&<>{}[]*"""

voa_df = pd.read_csv('./voa_stressed_cleaned_data.csv')
unique_letters = set(''.join(voa_df['text'].to_list()))

unique_letters = unique_letters - set(UKRAINIAN_LETTERS) \
                                - set(UKRAINIAN_LETTERS.upper()) \
                                - set(allowed_punctuation) \
                                - set(other_punctuation)

df = voa_df[~voa_df['text'].apply(lambda x: any(c in unique_letters for c in x))]
df = df[['text', 'labels']]
df = df.rename(columns={
    'labels': 'label'
})
df.shape

(137078, 2)

In [17]:
def get_data(df):
    train_df, eval_df = train_test_split(df, test_size=0.01, random_state=42)

    train_dataset = Dataset.from_pandas(train_df).remove_columns(['__index_level_0__'])
    eval_dataset = Dataset.from_pandas(eval_df).remove_columns(['__index_level_0__'])
    dataset = DatasetDict({
        "train": train_dataset,
        "eval": eval_dataset
    })
    return dataset


model_id = 'google/byt5-small'

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
dataset = get_data(df)

In [18]:
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["eval"]]).map(lambda x: tokenizer(x["text"], truncation=True), batched=True, remove_columns=['text', 'label'])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])

tokenized_targets = concatenate_datasets([dataset["train"], dataset["eval"]]).map(lambda x: tokenizer(x["label"], truncation=True), batched=True, remove_columns=['text', 'label'])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])

Map:   0%|          | 0/137078 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/137078 [00:00<?, ? examples/s]

In [19]:
def preprocess_function(sample, padding="max_length"):
    # print(sample)
    # add prefix to the input for t5
    inputs = [item for item in sample["text"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["label"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["label"] = labels["input_ids"]
    return model_inputs

In [20]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['text', 'label'])

Map:   0%|          | 0/135707 [00:00<?, ? examples/s]

Map:   0%|          | 0/1371 [00:00<?, ? examples/s]

In [28]:
label_pad_token_id = -100

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

# Hugging Face repository id
repository_id = f"{model_id.split('/')[1]}-accentor-model"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=2,
    # per_device_eval_batch_size=2,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=3e-4,

    num_train_epochs=2,
    max_steps=100,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="epoch",
    # logging_steps=1000,
    # evaluation_strategy="no",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=False,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    # report_to="tensorboard",
    # push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    # eval_dataset=tokenized_dataset["eval"],
    # compute_metrics=compute_metrics,
)

In [29]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacity of 23.53 GiB of which 79.44 MiB is free. Process 3875131 has 23.45 GiB memory in use. Of the allocated memory 22.14 GiB is allocated by PyTorch, and 1.07 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)