## Токенизация.
Перед обучением модели нужно рассмотреть такой важный этап, как токенизация текста.

In [92]:
# DataFrame to Dataset
import pandas as pd
#import datasets
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer
from transformers import DataCollatorForSeq2Seq

In [93]:
# Load the train/test CSV files and lower-case every column name, so later
# cells can refer to the columns as 'id', 'question' and 'answer'.
train = pd.read_csv('../data/train.csv').rename(columns=str.lower)
test = pd.read_csv('../data/test.csv').rename(columns=str.lower)

In [94]:
# Convert both DataFrames into `Dataset` objects (from the `datasets`
# library) and group them into a single DatasetDict keyed by split name.
train, test = Dataset.from_pandas(train), Dataset.from_pandas(test)
dataset = DatasetDict({'train': train, 'test': test})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer'],
        num_rows: 32520
    })
    test: Dataset({
        features: ['id', 'question', 'answer'],
        num_rows: 5738
    })
})


Загрузим токенизатор и посмотрим, как он обрабатывает текст.

In [95]:
# Checkpoint name on the Hugging Face Hub; only its tokenizer is loaded here.
model_id = "google/flan-t5-xxl"
tokenizer = T5Tokenizer.from_pretrained(model_id)



In [96]:
# Mapping from special-token role (e.g. 'eos_token') to its string form.
special_tokens = tokenizer.special_tokens_map
# Print only the role names to see which special tokens the tokenizer defines.
print(special_tokens.keys())

dict_keys(['eos_token', 'unk_token', 'pad_token', 'additional_special_tokens'])


In [97]:
# Show the text of each special token together with its vocabulary id.
# The id is looked up directly with `convert_tokens_to_ids` rather than by
# taking element 0 of `encode(...)`, which tokenizes the string and also
# appends the end-of-sequence id.
for spec_token in ['eos_token', 'unk_token', 'pad_token']:
    token_text = special_tokens[spec_token]
    token_id = tokenizer.convert_tokens_to_ids(token_text)
    print(f"Special token: {spec_token} has mask: {token_text}, and it token num: {token_id}")

Special token: eos_token has mask: </s>, and it token num: 1
Special token: unk_token has mask: <unk>, and it token num: 2
Special token: pad_token has mask: <pad>, and it token num: 0




In [98]:
# As the output shows, the tokenizer automatically appends the
# end-of-sequence token when encoding.
sample_question = dataset['test']['question'][0]
print(sample_question)
print(tokenizer.encode(sample_question))

What do you get when you inject human DNA into a goat?
[363, 103, 25, 129, 116, 25, 15823, 936, 6642, 139, 3, 9, 18174, 58, 1]


In [104]:
# Encode the same question again, this time truncated/padded to exactly
# 100 token ids.
padded_ids = tokenizer.encode(
    dataset['test']['question'][0],
    padding='max_length',
    truncation=True,
    max_length=100,
)
print(padded_ids)

[363, 103, 25, 129, 116, 25, 15823, 936, 6642, 139, 3, 9, 18174, 58, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [89]:
# Function that prepends the prompt to each input question and tokenizes the
# batch, bringing the dataset into the format needed for modelling.
def preprocess_function(sample):
    """Tokenize one batch of question/answer pairs.

    `sample` is a batch mapping with "question" and "answer" columns; each
    question is concatenated to the prompt, so questions are expected to be
    strings. Returns the tokenizer output for the prompted questions with an
    extra "labels" entry holding the token ids of the answers.
    """
    # Instruction prefix put in front of every question for T5.
    instruction = "I ask a question, you answer as a joke. Question: "
    prompted_questions = [instruction + question for question in sample["question"]]

    # Encode the model inputs; truncation is enabled, no padding is requested.
    encoded = tokenizer(prompted_questions, truncation=True)

    # Answers are passed through `text_target` so they are encoded as targets.
    target_ids = tokenizer(text_target=sample["answer"], truncation=True)["input_ids"]
    encoded["labels"] = target_ids
    return encoded

# Tokenize both splits in batches, dropping the raw columns so that only the
# tokenizer outputs and labels remain, then save each split to disk.
raw_columns = ["question", "answer", "id"]
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)
for split_name, target_dir in (("train", "../data/dt_train"), ("test", "../data/dt_test")):
    tokenized_dataset[split_name].save_to_disk(target_dir)
print(tokenized_dataset)

Map:   0%|          | 0/32520 [00:00<?, ? examples/s]

Map:   0%|          | 0/5738 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/32520 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5738 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 32520
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5738
    })
})


In [90]:
# Let's look at how the data collator works. -100 is the collator's default
# label padding value, spelled out here to make the padded output below
# easier to read.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=-100)

In [91]:
# Compare the raw labels of the first test example with the same labels
# after the whole test split has been collated into one batch: the collated
# row comes back as a tensor padded with -100.
test_split = tokenized_dataset['test']
print(test_split['labels'][0])
collated_batch = data_collator(test_split)
print(collated_batch['labels'][0])

[20759, 15, 26, 91, 13, 8, 158, 6031, 3, 172, 32, 32, 5, 1]
tensor([20759,    15,    26,    91,    13,     8,   158,  6031,     3,   172,
           32,    32,     5,     1,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
    