# Fine-tuning ruBert for SberQuAD

In [None]:
!pip install transformers==4.28.0
!pip install datasets evaluate
!pip install tqdm
!pip install chardet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Загружаем dataset

### Разбиваем на train и test

In [None]:
from tqdm import tqdm # прогресбар

In [None]:
import datasets
from datasets import Dataset, DatasetDict, load_dataset
from datasets import load_dataset

# Fetch SberQuAD by its dataset id; the "train[:]" slice selects the entire train split.
# (An earlier variant read a local file instead:
#  load_dataset('json', data_files='train.json', split="train").)
dataset = load_dataset(
    path="sberquad",
    split="train[:]",
)





In [None]:
# Hold out a fraction of the examples for evaluation.
# A fixed seed keeps the shuffle (and therefore the train/test membership) identical
# across kernel restarts, so reported losses are comparable between runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

# NOTE: this rebinds `dataset` from a single Dataset to a DatasetDict with
# "train" and "test" keys, so the cell is meant to be run once per kernel session.
dataset = dataset.train_test_split(test_size=TEST_FRACTION, seed=SPLIT_SEED)

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 36262
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 9066
    })
})

In [None]:
# Alias the train/test DatasetDict under the name the tokenization cell below uses.
squad = dataset

### Загрузка модели и токенизатора

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the Russian BERT checkpoint
# (an earlier variant used "distilbert-base-uncased" instead).
checkpoint_name = "ai-forever/ruBert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

#### Функция предварительной обработки данных

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and attach answer token spans.

    Each question is stripped of surrounding whitespace and encoded together with
    its context by the module-level ``tokenizer`` (max length 384, only the context
    is truncated, everything is padded to the max length). The character span of
    the first annotated answer is then converted to token indices.

    Args:
        examples: Batch mapping with the keys ``"question"``, ``"context"`` and
            ``"answers"``; every answer is a mapping holding the lists
            ``"answer_start"`` (character offsets) and ``"text"``.

    Returns:
        The tokenizer output with ``"offset_mapping"`` removed and two new lists,
        ``"start_positions"`` and ``"end_positions"``, one entry per example.
        Both are 0 when the example has no answer or when the answer is not
        fully contained in the (possibly truncated) context.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Offsets are only needed to compute the labels, so they are removed from the output.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # No annotated answer: use the (0, 0) "no answer" label instead of failing on [0].
        if not answer["answer_start"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the kept context, label it (0, 0).
        # Both ends are checked: an answer that truncation cut in half must not be
        # given a clipped span (its end would otherwise land past the last context token).
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token whose start offset is <= the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token (scanning backwards) whose end offset is >= the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

Токенизация

In [None]:
# Tokenize both splits in batches. The raw columns are dropped so that only the
# features returned by preprocess_function remain in the mapped dataset.
raw_columns = squad["train"].column_names
tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)

Map:   0%|          | 0/36262 [00:00<?, ? examples/s]

Map:   0%|          | 0/9066 [00:00<?, ? examples/s]

#### Создаем DataCollator (собирает примеры в батчи)

In [None]:
from transformers import DefaultDataCollator

# Collator with default settings; it is passed to the Trainer below.
# Presumably no batch-time padding is needed because preprocess_function already
# pads every example to max_length — TODO confirm.
data_collator = DefaultDataCollator()

## Начинаем тренировать (обучать)

#### Загружаем модель

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load the pretrained checkpoint into a question-answering model class.
# The load log below reports that some BertForQuestionAnswering weights were not
# initialized from the checkpoint, so the model has to be trained before use.
qa_checkpoint = "ai-forever/ruBert-base"
model = AutoModelForQuestionAnswering.from_pretrained(qa_checkpoint)

Some weights of the model checkpoint at ai-forever/ruBert-base were not used when initializing BertForQuestionAnswering: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the mo

In [None]:
# Fine-tuning configuration.
# Evaluation runs once per epoch. Checkpoints are written on the same schedule so
# that the epoch with the lowest validation loss can be restored after training;
# in the recorded run validation loss was lowest after epoch 2 and rose in epoch 3,
# so keeping the final weights would keep a worse model.
training_args = TrainingArguments(
    output_dir="my_awesome_qa_model",
    evaluation_strategy="epoch",
    # Must match evaluation_strategy for load_best_model_at_end to work.
    save_strategy="epoch",
    # Cap the number of checkpoints kept on disk.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

# The Trainer ties together the model, the tokenized splits and the batch collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# After this call `model` holds the weights of the best checkpoint (by eval_loss).
trainer.train()



Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,1.7582,1.647981
2,1.4601,1.622624
3,1.2019,1.692356


TrainOutput(global_step=6801, training_loss=1.5586957444935716, metrics={'train_runtime': 8393.9282, 'train_samples_per_second': 12.96, 'train_steps_per_second': 0.81, 'total_flos': 2.131907173371187e+16, 'train_loss': 1.5586957444935716, 'epoch': 3.0})

In [None]:
# Write the fine-tuned model (config and weights) to a local directory.
model_dir = 'pretrained_ru_bert_sbersquad'
model.save_pretrained(model_dir)

In [None]:
# Write the tokenizer files to their own local directory; the call's return value
# (the written file paths) is displayed as the cell output.
tokenizer_dir = 'tokenizer'
tokenizer.save_pretrained(tokenizer_dir)

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [None]:
# Archive the saved model directory into a zip file under the Drive folder.
# NOTE(review): the relative path drive/MyDrive/... assumes Google Drive is already
# mounted below the current working directory; no visible cell performs the mount —
# confirm before running the notebook top to bottom.
!zip -r drive/MyDrive/Colab\ Notebooks/pretrained_ru_bert_sbersquad.zip pretrained_ru_bert_sbersquad

  adding: pretrained_ru_bert_sbersquad/ (stored 0%)
  adding: pretrained_ru_bert_sbersquad/config.json (deflated 53%)
  adding: pretrained_ru_bert_sbersquad/pytorch_model.bin (deflated 7%)


In [None]:
# Archive the saved tokenizer directory into a zip file under the Drive folder.
# NOTE(review): assumes Google Drive is already mounted at ./drive; no visible cell
# performs the mount — confirm before running the notebook top to bottom.
!zip -r drive/MyDrive/Colab\ Notebooks/tokenizer.zip tokenizer

  adding: tokenizer/ (stored 0%)
  adding: tokenizer/tokenizer_config.json (deflated 45%)
  adding: tokenizer/tokenizer.json (deflated 73%)
  adding: tokenizer/vocab.txt (deflated 66%)
  adding: tokenizer/special_tokens_map.json (deflated 42%)


## Проверяем работу тренированной сети

In [None]:
# Hand-written example for a quick sanity check of the fine-tuned model:
# a question and a one-sentence context that contains its answer.
# NOTE(review): "возлогает" in the question is misspelled ("возлагает"); it is left
# unchanged because the string is model input and the recorded output depends on it.
question = "Кто возлогает цветы у памятника Ленину?"
context = "Пионеры ставят корзины с цветами к подножию монумента В.И.Ленина на площади Ленина у Финляндского вокзала."

In [None]:
from transformers import pipeline

# Reload the fine-tuned weights from the directory they were saved to; this rebinds
# `model`, while the tokenizer object already in memory is reused.
model = AutoModelForQuestionAnswering.from_pretrained("pretrained_ru_bert_sbersquad")

# Wrap model and tokenizer into an extractive question-answering pipeline and
# run it on the example question/context; the result is shown as the cell output.
question_answerer = pipeline(
    task="question-answering",
    model=model,
    tokenizer=tokenizer,
)
question_answerer(question=question, context=context)

{'score': 0.9158473610877991, 'start': 0, 'end': 7, 'answer': 'Пионеры'}