In [None]:
!pip install -U transformers[torch]
!pip install -U datasets
!pip install evaluate

In [6]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, default_data_collator, get_scheduler
from torch.utils.data import DataLoader
from torch.optim import AdamW
import numpy as np
from tqdm import tqdm
import collections
import evaluate
import torch

from utils.utils import *# preprocess_training, 

In [5]:
# Загрузим датасет и посмотрим на формат данных
dataset = load_dataset("sberquad")
dataset['train'][0]

{'id': 62310,
 'title': 'SberChallenge',
 'context': 'В протерозойских отложениях органические остатки встречаются намного чаще, чем в архейских. Они представлены известковыми выделениями сине-зелёных водорослей, ходами червей, остатками кишечнополостных. Кроме известковых водорослей, к числу древнейших растительных остатков относятся скопления графито-углистого вещества, образовавшегося в результате разложения Corycium enigmaticum. В кремнистых сланцах железорудной формации Канады найдены нитевидные водоросли, грибные нити и формы, близкие современным кокколитофоридам. В железистых кварцитах Северной Америки и Сибири обнаружены железистые продукты жизнедеятельности бактерий.',
 'question': 'чем представлены органические остатки?',
 'answers': {'text': ['известковыми выделениями сине-зелёных водорослей'],
  'answer_start': [109]}}

In [2]:
# загрузка пока что необученных моделей
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModelForQuestionAnswering.from_pretrained("cointegrated/rubert-tiny2")
model.cuda();

In [3]:
max_length = 384
stride = 128
# так как внутри функции нужен токенизатор, инициализрующийся в ноутбуке,
# то не получится вынести функцию в отдельный файл

def preprocess_training(data):
    questions = [q.strip() for q in data["question"]]
    inputs = tokenizer(
        questions,
        data["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = data["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:

            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

train_data = dataset["train"].map(
    preprocess_training,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
train_data.set_format("torch")

In [4]:
# def preprocess_validation(data):
#     questions = [q.strip() for q in data["question"]]
#     inputs = tokenizer(
#         questions,
#         data["context"],
#         max_length=max_length,
#         truncation="only_second",
#         stride=stride,
#         return_overflowing_tokens=True,
#         return_offsets_mapping=True,
#         padding="max_length",
#     )

#     sample_map = inputs.pop("overflow_to_sample_mapping")
#     example_ids = []

#     for i in range(len(inputs["input_ids"])):
#         sample_idx = sample_map[i]
#         example_ids.append(data["id"][sample_idx])

#         sequence_ids = inputs.sequence_ids(i)
#         offset = inputs["offset_mapping"][i]
#         inputs["offset_mapping"][i] = [
#             o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
#         ]

#     inputs["example_id"] = example_ids
#     return inputs

validation_data = dataset["validation"].map(
    preprocess_validation,
    batched=True,
    remove_columns=dataset["validation"].column_names,
)
val_data = validation_data.remove_columns(["example_id", "offset_mapping"])
val_data.set_format("torch")

In [5]:
test_dataset = dataset["test"].map(
    preprocess_validation,
    batched=True,
    remove_columns=dataset["test"].column_names,
)
test_data = test_dataset.remove_columns(["example_id", "offset_mapping"])
test_data.set_format("torch")

Map: 100%|██████████| 23936/23936 [00:08<00:00, 2939.19 examples/s]


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
num_train_epochs = 20


train_dataloader = DataLoader(
    train_data,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=120,
)
eval_dataloader = DataLoader(
    val_data,
    shuffle=False,
    collate_fn=default_data_collator,
    batch_size=120,
)
test_loader = DataLoader(
    test_data,
    shuffle=False,
    collate_fn=default_data_collator,
    batch_size=10,
)

optimizer = AdamW(model.parameters(), lr=2e-5)
lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=1,
    num_training_steps=num_train_epochs * len(train_dataloader),
)

In [33]:
metric = evaluate.load("squad")
n_best = 20
max_answer_length = 30

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": str(example_id), "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": str(example_id), "prediction_text": ""})

    theoretical_answers = [{"id": str(ex["id"]), "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers), predicted_answers, theoretical_answers

In [None]:
from tqdm.auto import tqdm
import torch
import os
import numpy as np

MODEL_PATH = '/kaggle/working/torch_pipiline'
os.makedirs(MODEL_PATH, exist_ok=True)

prev_f1_score = 0
for epoch in tqdm(range(num_train_epochs)):
    # Training
    model.train()
    for batch in tqdm(train_dataloader):
        outputs = model(**{k: v.cuda() for k, v in batch.items()})
        loss = outputs.loss
        loss.backward() 
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # Evaluation
    model.eval()
    start_logits = []
    end_logits = []

    print("Evaluation!")
    for batch in eval_dataloader:
        with torch.inference_mode():
            outputs = model(**{k: v.cuda() for k, v in batch.items()})

        start_logits.append(outputs.start_logits.cpu().numpy())
        end_logits.append(outputs.end_logits.cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(validation_data)]
    end_logits = end_logits[: len(validation_data)]

    metrics = compute_metrics(
        start_logits, end_logits, validation_data, dataset["validation"]
    )
    print(f"epoch {epoch}:", metrics)
    
    if metrics['f1'] >= prev_f1_score:
        print(f'{prev_f1_score} -> {metrics["f1"]} SAVIMG')
        model.save_pretrained(MODEL_PATH)
        tokenizer.save_pretrained(MODEL_PATH)

### Протестируем свежеобученную модель

In [30]:
model = AutoModelForQuestionAnswering.from_pretrained("./models")
model.eval()
model.cuda();

In [36]:
start_logits = []
end_logits = []

for batch in test_loader:
    with torch.inference_mode():
        outputs = model(**{k: v.cuda() for k, v in batch.items()})

    start_logits.append(outputs.start_logits.cpu().numpy())
    end_logits.append(outputs.end_logits.cpu().numpy())

start_logits = np.concatenate(start_logits)
end_logits = np.concatenate(end_logits)
# start_logits = start_logits[: len(validation_data)]
# end_logits = end_logits[: len(validation_data)]

metrics, predicted_answers, theoretical_answers = compute_metrics(
    start_logits, end_logits, validation_data, dataset["validation"]
)
print(metrics)

100%|██████████| 5036/5036 [00:06<00:00, 771.21it/s]


{'exact_match': 0.07942811755361398, 'f1': 5.1638270804312985}
