In [2]:
!pip install datasets




[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: C:\Python312\python.exe -m pip install --upgrade pip


In [3]:
!pip install --upgrade pip setuptools

Collecting pip
  Using cached pip-24.0-py3-none-any.whl.metadata (3.6 kB)
Collecting setuptools
  Using cached setuptools-69.5.1-py3-none-any.whl.metadata (6.2 kB)
Using cached pip-24.0-py3-none-any.whl (2.1 MB)
Using cached setuptools-69.5.1-py3-none-any.whl (894 kB)


ERROR: To modify pip, please run the following command:
C:\Python312\python.exe -m pip install --upgrade pip setuptools

[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: C:\Python312\python.exe -m pip install --upgrade pip


In [4]:
!pip install evaluate
!pip install sacrebleu




[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: C:\Python312\python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: C:\Python312\python.exe -m pip install --upgrade pip


In [5]:
!pip install huggingface_hub




[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: C:\Python312\python.exe -m pip install --upgrade pip


In [6]:
!pip install ipywidgets




[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: C:\Python312\python.exe -m pip install --upgrade pip


In [6]:
from huggingface_hub import notebook_login

# Renders the Hugging Face login widget in this cell's output.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
from datasets import load_dataset
# Load the Korean-English TED-talks dataset by name. As displayed in the next
# cell, it holds train/validation/test splits with 'korean' and 'english' columns.
kr_dataset = load_dataset("msarmi9/korean-english-multitarget-ted-talks-task")

### Setting up the data

In [8]:
kr_dataset

DatasetDict({
    train: Dataset({
        features: ['korean', 'english'],
        num_rows: 166215
    })
    validation: Dataset({
        features: ['korean', 'english'],
        num_rows: 1958
    })
    test: Dataset({
        features: ['korean', 'english'],
        num_rows: 1982
    })
})

1) Add index column "id"

In [9]:
from datasets import DatasetDict


def add_index_column(dataset):
    """Return ``dataset`` with an extra integer ``id`` column.

    Args:
        dataset: One split of the loaded dataset (e.g. the ``train`` split).

    Returns:
        The mapped split in which every row carries its positional index
        under ``'id'`` in addition to its existing fields.
    """
    def _attach_id(example, idx):
        # Keep every existing field and add the row position as 'id'.
        return {'id': idx, **example}

    return dataset.map(_attach_id, with_indices=True)


# Add the "id" column to each split. The result is wrapped in DatasetDict
# rather than left as a plain dict, so split-wide methods such as ``.map``
# (called on ``kr_dataset`` by a later cell) remain available.
kr_dataset = DatasetDict(
    {split_name: add_index_column(split_data) for split_name, split_data in kr_dataset.items()}
)

2) Add translation column 'translation'

In [10]:
def add_translation_to_row(example, idx):
    """Build the nested ``translation`` field for a single row.

    Args:
        example: Row mapping with ``'english'`` and ``'korean'`` entries.
        idx: Row index supplied because the caller maps with
            ``with_indices=True``; it is not used.

    Returns:
        ``{'translation': {'en': <english text>, 'kr': <korean text>}}``.
    """
    english_text = example['english']
    korean_text = example['korean']
    pair = {'en': english_text, 'kr': korean_text}
    return {'translation': pair}

from datasets import DatasetDict


def add_translation_column(dataset):
    """Return ``dataset`` with a nested ``translation`` column added.

    Args:
        dataset: One split of the dataset; rows must provide the
            ``'english'`` and ``'korean'`` fields read by
            ``add_translation_to_row``.

    Returns:
        The mapped split with the additional ``'translation'`` column.
    """
    # with_indices=True is required because add_translation_to_row takes (example, idx).
    return dataset.map(add_translation_to_row, with_indices=True)


# Apply to every split. The result is wrapped in DatasetDict rather than left
# as a plain dict, so split-wide methods such as ``.map`` (called on
# ``kr_dataset`` by a later cell) remain available.
kr_dataset = DatasetDict(
    {split_name: add_translation_column(split_data) for split_name, split_data in kr_dataset.items()}
)

3) Remove original 'korean' and 'english' columns

In [11]:
from datasets import DatasetDict


def remove_korean_and_english_features(dataset):
    """Return ``dataset`` without its ``'korean'`` and ``'english'`` columns.

    Args:
        dataset: One split of the dataset that still has both columns.

    Returns:
        The split with those two columns removed.
    """
    return dataset.remove_columns(['korean', 'english'])


# Apply to every split. The result is wrapped in DatasetDict rather than left
# as a plain dict, so ``kr_dataset`` prints as a DatasetDict and keeps
# split-wide methods such as ``.map`` that a later cell calls on it.
kr_dataset = DatasetDict(
    {split_name: remove_korean_and_english_features(split_data) for split_name, split_data in kr_dataset.items()}
)


In [14]:
print(kr_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 166215
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 1958
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 1982
    })
})


In [13]:
print(kr_dataset["train"][0])

{'id': 0, 'translation': {'en': "(Applause) David Gallo: This is Bill Lange. I'm Dave Gallo.", 'kr': '(박수) 이쪽은 Bill Lange 이고, 저는 David Gallo입니다'}}


4) Scale down the dataset

In [40]:
def _keep_leading_rows(dataset, divisor):
    """Return the first ``len(dataset) // divisor`` rows of ``dataset``."""
    return dataset.select(range(len(dataset) // divisor))


# Keep the first third of the train split and the first half of the test
# split, printing each split's shape before and after.
# NOTE: each split is replaced by its own subset, so re-running this cell
# without rebuilding ``kr_dataset`` shrinks the splits again.
for split_name, divisor in (("train", 3), ("test", 2)):
    print(kr_dataset[split_name].shape)
    kr_dataset[split_name] = _keep_leading_rows(kr_dataset[split_name], divisor)
    print(kr_dataset[split_name].shape)

(166215, 2)
(55405, 2)
(1982, 2)
(991, 2)


### Building the model

In [4]:
from transformers import AutoTokenizer

# Checkpoint name; reused later to load the model and to re-create the tokenizer.
checkpoint = "google-t5/t5-small"
# NOTE(review): the last cell of this notebook decodes "Hello!" for an
# English-to-Korean prompt. Confirm that this checkpoint's vocabulary can
# represent Hangul before relying on it for Korean targets.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [42]:
# Keys into each row's 'translation' dict: English is the source, Korean the target.
source_lang = "en"
target_lang = "kr"
# Task prefix prepended to every source sentence before tokenization.
prefix = "translate English to Korean: "


def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Args:
        examples: Batch whose ``"translation"`` entry is a sequence of dicts
            keyed by ``source_lang`` and ``target_lang``.

    Returns:
        The tokenizer output for the prefixed source sentences, with the
        target sentences passed as ``text_target``; both are truncated to
        128 tokens.
    """
    sources = []
    targets = []
    for pair in examples["translation"]:
        sources.append(prefix + pair[source_lang])
        targets.append(pair[target_lang])

    return tokenizer(sources, text_target=targets, max_length=128, truncation=True)

In [44]:
# Tokenize the splits in batches. Per the printed result below, this adds the
# 'input_ids', 'attention_mask' and 'labels' columns alongside the existing ones.
tokenized_dataset = kr_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/55405 [00:00<?, ? examples/s]

Map:   0%|          | 0/991 [00:00<?, ? examples/s]

In [45]:
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 55405
    })
    validation: Dataset({
        features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1958
    })
    test: Dataset({
        features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 991
    })
})


In [46]:
from transformers import DataCollatorForSeq2Seq, PreTrainedTokenizerBase
import numpy as np

class CustomDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    """Seq2seq collator that also adds teacher-forcing ``decoder_input_ids``.

    The parent collator builds the padded batch. This subclass then derives
    ``decoder_input_ids`` from ``labels`` by shifting them one position to the
    right, so the decoder input at step ``t`` is the label of step ``t - 1``
    rather than the token it is asked to predict.

    Args:
        tokenizer: Tokenizer forwarded to the parent collator; its
            ``pad_token_id`` also replaces label-padding values inside the
            decoder inputs.
        model: Forwarded unchanged to the parent collator.
        return_tensors: Forwarded unchanged to the parent collator.
        pad_to_multiple_of: Forwarded unchanged to the parent collator.
        decoder_start_token_id: Token placed at position 0 of every decoder
            input. Defaults to ``tokenizer.pad_token_id``, which is presumed
            to be the decoder start token of the T5 checkpoint used in this
            notebook -- confirm before reusing with another architecture.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        model=None,
        return_tensors: str = "tf",
        pad_to_multiple_of: int = 8,
        decoder_start_token_id=None,
    ):
        super().__init__(
            tokenizer=tokenizer,
            model=model,
            return_tensors=return_tensors,
            pad_to_multiple_of=pad_to_multiple_of,
        )
        self.decoder_start_token_id = decoder_start_token_id

    def _shift_right(self, labels):
        """Return ``labels`` shifted one step right as a NumPy array."""
        label_ids = np.asarray(labels)
        pad_id = self.tokenizer.pad_token_id
        start_id = pad_id if self.decoder_start_token_id is None else self.decoder_start_token_id

        shifted = np.empty_like(label_ids)
        shifted[:, 0] = start_id
        shifted[:, 1:] = label_ids[:, :-1]
        # The label padding value is a loss-masking marker, not a vocabulary
        # id, so it must not reach the decoder as an input token.
        shifted[shifted == self.label_pad_token_id] = pad_id
        return shifted

    def __call__(self, features):
        """Collate ``features`` and attach right-shifted ``decoder_input_ids``.

        Batches without ``labels`` are returned as produced by the parent
        collator. If the parent already supplied ``decoder_input_ids``, they
        are kept as they are.
        """
        batch = super().__call__(features)

        if "labels" not in batch or "decoder_input_ids" in batch:
            return batch

        # Previously the labels were copied verbatim, which exposed each target
        # token to the decoder and carried label padding into the inputs.
        batch["decoder_input_ids"] = self._shift_right(batch["labels"])
        return batch

# Create a new instance of the custom data collator.
# NOTE(review): `checkpoint` is the checkpoint name (a string), not a model
# object -- confirm the parent collator treats it as intended.
data_collator = CustomDataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")


In [47]:
import evaluate

# SacreBLEU scorer; compute_metrics (defined in the next cell) calls metric.compute(...).
metric = evaluate.load("sacrebleu")

In [48]:
def postprocess_text(preds, labels):
    """Normalise decoded strings before scoring.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        ``(preds, labels)`` where every prediction is stripped of surrounding
        whitespace and every label is stripped and wrapped in a one-item list.
    """
    cleaned_preds = []
    for pred in preds:
        cleaned_preds.append(pred.strip())

    wrapped_labels = []
    for label in labels:
        wrapped_labels.append([label.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Score predicted token ids against reference token ids with SacreBLEU.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. When
            ``predictions`` is itself a tuple, only its first element is used.

    Returns:
        Dict with ``"bleu"`` (the ``score`` entry of the metric result) and
        ``"gen_len"`` (mean number of non-pad tokens per prediction), both
        rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a vocabulary id, so it is swapped
    # for the pad token before decoding. Labels always needed this; predictions
    # get the same treatment because batches of different lengths may have been
    # padded with -100 upstream (assumption -- confirm against the evaluation
    # callback in use).
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [49]:
from transformers import AdamWeightDecay

# Optimizer later handed to model.compile(...): learning rate 2e-5, weight-decay rate 0.01.
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [5]:
from transformers import TFAutoModelForSeq2SeqLM

# Load the TensorFlow seq2seq model for `checkpoint`. Per the log below, it is
# created as TFT5ForConditionalGeneration and initialized from the PyTorch weights.
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)





All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [51]:
print(tokenized_dataset["train"])

Dataset({
    features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 55405
})


In [52]:
def _as_tf_dataset(split_name, shuffle):
    """Wrap one tokenized split as a TensorFlow dataset in batches of 16."""
    return model.prepare_tf_dataset(
        tokenized_dataset[split_name],
        shuffle=shuffle,
        batch_size=16,
        collate_fn=data_collator,
    )


# Training batches are shuffled; test batches are not.
tf_train_set = _as_tf_dataset("train", shuffle=True)

tf_test_set = _as_tf_dataset("test", shuffle=False)

In [53]:
print(tf_train_set)

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(16, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(16, None), dtype=tf.int64, name=None), 'decoder_input_ids': TensorSpec(shape=(16, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(16, None), dtype=tf.int64, name=None))>


In [54]:
def _drop_decoder_input_ids(features, labels):
    """Return ``(features, labels)`` with 'decoder_input_ids' removed from features."""
    kept = {name: tensor for name, tensor in features.items() if name != 'decoder_input_ids'}
    return kept, labels


# Remove the 'decoder_input_ids' feature from both pipelines and show the
# resulting element specs; the labels are passed through untouched.
tf_train_set = tf_train_set.map(_drop_decoder_input_ids)
print(tf_train_set)

tf_test_set = tf_test_set.map(_drop_decoder_input_ids)
print(tf_test_set)

<_MapDataset element_spec=({'input_ids': TensorSpec(shape=(16, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(16, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(16, None), dtype=tf.int64, name=None))>
<_MapDataset element_spec=({'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>


In [55]:
import tensorflow as tf

# Compile with the optimizer only.
# NOTE(review): with no `loss` given, training presumably relies on the model's
# own internal loss -- confirm against the transformers Keras documentation.
model.compile(optimizer=optimizer)  # No loss argument!

In [56]:
from transformers.keras_callbacks import KerasMetricCallback


# compute_metrics decodes its predictions with tokenizer.batch_decode, so the
# callback has to hand it generated token ids rather than raw model outputs;
# predict_with_generate=True makes the callback obtain them via model.generate().
# NOTE(review): evaluation uses the test split (tf_test_set) even though the
# dataset also has a validation split -- confirm that this is intended.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=tf_test_set,
    predict_with_generate=True,
)

### Train the model

In [60]:
# NOTE(review): `metric_callback` from the previous cell is not passed to fit(),
# so compute_metrics is never invoked during training. To enable it, pass
# callbacks=[metric_callback] after confirming the callback works end to end.
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=2)

Epoch 1/2
Epoch 2/2


<tf_keras.src.callbacks.History at 0x22ab5bd6df0>

### Save the model

In [56]:
# Persist the fine-tuned model to disk.
# NOTE(review): this is an absolute, machine-specific path, and the reload cell
# further down reads the same literal -- keep the two in sync if it changes.
model.save_pretrained("/Users/user/Downloads/test")

### Testing out the model

In [139]:
text = "translate English to Korean: Hello!"

In [140]:
# !pip show sentencepiece

In [141]:
from transformers import AutoTokenizer

# Re-create the tokenizer from `checkpoint` and encode the prompt as TensorFlow input ids.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer(text, return_tensors="tf").input_ids

In [142]:
from transformers import TFAutoModelForSeq2SeqLM

# Reload the model from the local directory it was saved to earlier.
model = TFAutoModelForSeq2SeqLM.from_pretrained("/Users/user/Downloads/test")#, from_pt=True)
# Sampling is enabled (do_sample=True with top-k / top-p limits), so the
# generated text can differ between runs unless a seed is fixed elsewhere.
outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at /Users/user/Downloads/test.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [143]:
print(outputs[0][0])

tf.Tensor(0, shape=(), dtype=int32)


In [145]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

Hello!
