## Токенизация.
Для обучения модели нужно рассмотреть такую важную часть как токенизация модели.

In [24]:
# DataFrame to Dataset
import pandas as pd
#import datasets
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq

In [61]:
# read the data
train = pd.read_csv('../data/train.csv')
train.columns = map(str.lower, train.columns)
test = pd.read_csv('../data/test.csv')
test.columns = map(str.lower, test.columns)

In [None]:
train.rename({'question':'input_ids', 'answer':'labels'}, axis=1, inplace=True)

In [63]:
# Форимруем Dataset класс из библотеки transformers
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)
dataset = DatasetDict()
dataset['train'] = train
dataset['test'] = test
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer'],
        num_rows: 32520
    })
    test: Dataset({
        features: ['id', 'question', 'answer'],
        num_rows: 5738
    })
})


Загрузим токенайзер, посмотрим на обработку текста.

In [31]:
model_id="google/flan-t5-xxl"
tokenizer = AutoTokenizer.from_pretrained(model_id)



In [37]:
special_tokens = tokenizer.special_tokens_map
print(special_tokens.keys())

dict_keys(['eos_token', 'unk_token', 'pad_token', 'additional_special_tokens'])


In [40]:
for spec_token in ['eos_token', 'unk_token', 'pad_token']:
    print(f"Special token: {spec_token} has mask: {special_tokens[spec_token]}, and it token num: {tokenizer.encode(special_tokens[spec_token])[0]}")

Special token: eos_token has mask: </s>, and it token num: 1
Special token: unk_token has mask: <unk>, and it token num: 2
Special token: pad_token has mask: <pad>, and it token num: 0


In [44]:
# как видно токенайзер автоматически добавляет токен конца последовательности при кодировании
print(tokenizer.encode(dataset['test']['question'][0]))

[363, 133, 14082, 31, 7, 564, 36, 3, 99, 3, 88, 47, 2170, 16, 1894, 58, 1]


In [76]:
# Теперь напишем функцию которая добавялет промт к инпуту, и сделаем токенизацию приведя dataset к формату необходимого для моделирования
def preprocess_function(sample):
    # add prefix to the input for t5
    promt = "I ask a question, you answer as a joke. Question: "
    inputs = [promt + item for item in sample["question"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["answer"], truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["question", "answer", "id"])
tokenized_dataset['train'].save_to_disk("../data/dt_train")
tokenized_dataset['test'].save_to_disk("../data/dt_test")
print(tokenized_dataset)

Map:   0%|          | 0/32520 [00:00<?, ? examples/s]

Map:   0%|          | 0/5738 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/32520 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5738 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 32520
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5738
    })
})


In [71]:
tokenized_dataset['train'].save_to_disk("../data/tokenized_dataset")
#tokenized_dataset["test"].save_to_disk("data/eval")

Saving the dataset (0/1 shards):   0%|          | 0/32520 [00:00<?, ? examples/s]

In [73]:
from datasets import load_from_disk

In [74]:
dt = load_from_disk('../data/tokenized_dataset')

In [75]:
dt

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 32520
})

In [25]:
# Загружаем токенизатор и создаем DataCollator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding='longest', label_pad_token_id = -100)

In [26]:
data_collator(train)

ValueError: type of What's the best anti diarrheal prescription? unknown: <class 'str'>. Should be one of a python, numpy, pytorch or tensorflow object.

In [None]:




# Создаем примеры данных
texts = ["Это пример предложения.", "Еще один пример."]
inputs = tokenizer(texts, padding=True, return_tensors="pt")

# Применяем DataCollator
outputs = data_collator([inputs])

# Визуализируем результаты
print(outputs.keys())  # Выведет ключи в словаре outputs
print(outputs['input_ids'])  # Выведет тензор с входными идентификаторами
print(outputs['attention_mask'])  # Выведет тензор с масками внимания

In [11]:
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)
valid = Dataset.from_pandas(valid)

dataset['validation'] = valid

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'Question', 'Answer'],
        num_rows: 32522
    })
    test: Dataset({
        features: ['ID', 'Question', 'Answer'],
        num_rows: 5739
    })
    validation: Dataset({
        features: ['ID', 'Question', 'Answer'],
        num_rows: 8
    })
})