# Data Preprocessing

In [1]:
# !pip3 install -U pandas

In [2]:
import pandas as pd

splits = {"train": "sberquad/train-00000-of-00001.parquet", "validation": "sberquad/validation-00000-of-00001.parquet", "test": "sberquad/test-00000-of-00001.parquet"}
df = pd.read_parquet("hf://datasets/kuznetsoffandrey/sberquad/" + splits["train"])
df.head()

Unnamed: 0,id,title,context,question,answers
0,62310,SberChallenge,В протерозойских отложениях органические остат...,чем представлены органические остатки?,{'text': ['известковыми выделениями сине-зелён...
1,28101,SberChallenge,В протерозойских отложениях органические остат...,что найдено в кремнистых сланцах железорудной ...,"{'text': ['нитевидные водоросли, грибные нити'..."
2,48834,SberChallenge,В протерозойских отложениях органические остат...,что встречается в протерозойских отложениях?,"{'text': ['органические остатки'], 'answer_sta..."
3,83056,SberChallenge,В протерозойских отложениях органические остат...,что относится к числу древнейших растительных ...,{'text': ['скопления графито-углистого веществ...
4,5816,SberChallenge,В протерозойских отложениях органические остат...,как образовалось графито-углистое вещество?,{'text': ['в результате разложения Corycium en...


In [3]:
df.drop(columns=["id", "title"], inplace=True)

In [4]:
df.shape

(45328, 3)

In [5]:
select_idx = [i for i in range(len(df)) if df["answers"][i]["answer_start"] != [-1] and "".join(df["answers"][i]["text"]).strip() != ""]
len(select_idx)

41444

In [6]:
df = df.iloc[select_idx]
df.shape

(41444, 3)

In [7]:
lens = df["context"].map(len)
lens.min(), lens.max(), lens.sum() / len(lens)

(279, 7231, 756.3410867676865)

In [8]:
# !pip3 install -U transformers torch torchvision torchaudio
# !pip3 show torch

In [9]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("KirrAno93/rubert-base-cased-finetuned-squad")
max_length = 192
tokenizer(df["question"][0], df["context"][0], max_length=max_length, truncation="only_second", padding="max_length", return_offsets_mapping=True)

{'input_ids': [101, 3622, 17025, 65361, 24094, 166, 102, 781, 40147, 8380, 1519, 2999, 67595, 65361, 24094, 17943, 24856, 16462, 128, 3622, 845, 30501, 860, 2999, 132, 9621, 17025, 4582, 49911, 37879, 3187, 49335, 130, 46577, 57539, 128, 29666, 3187, 63032, 128, 53445, 40732, 17914, 17995, 868, 1755, 132, 6776, 4582, 17090, 57539, 128, 861, 22579, 41664, 64678, 41845, 16340, 42085, 8067, 22343, 130, 13625, 41502, 20387, 128, 40281, 11157, 845, 6542, 56840, 20346, 33267, 12966, 10622, 114635, 52597, 9481, 132, 781, 87285, 34972, 884, 42584, 858, 20736, 106478, 1700, 64124, 18211, 21831, 25352, 59869, 73376, 128, 25766, 2059, 54248, 851, 14037, 128, 35637, 29542, 24955, 46588, 48833, 12668, 14064, 132, 781, 7162, 25484, 116843, 55214, 12819, 15222, 851, 20234, 18715, 7162, 23939, 24860, 55384, 41106, 132, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [11]:
def preproc_func(row):
    question = row.question.strip()
    inputs = tokenizer(
        question, 
        row.context,
        max_length=max_length,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True
    )

    offset_mapping = inputs.pop("offset_mapping")
    answer = row.answers
    start_pos, end_pos = 0, 0
    start_char = answer["answer_start"][0]
    end_char = answer["answer_start"][0] + len(answer["text"][0])
    
    context_start = inputs["token_type_ids"].index(1)
    context_end = len(inputs["token_type_ids"]) - 1
    while inputs["token_type_ids"][context_end] != 1:
        context_end -= 1
    context_end -= 1
    
    if not (offset_mapping[context_start][0] > end_char or offset_mapping[context_end][1] < start_char):
        idx = context_start
        while idx <= context_end and offset_mapping[idx][0] <= start_char:
            idx += 1
        start_pos = idx - 1

        idx = context_end
        while idx >= context_start and offset_mapping[idx][1] >= end_char:
            idx -= 1
        end_pos = idx + 1

    inputs["start_positions"] = start_pos
    inputs["end_positions"] = end_pos
    # print(tokenizer.decode(inputs["input_ids"][context_start:context_end+1]), tokenizer.decode(inputs["input_ids"][start_pos:end_pos+1]), answer, sep="\n")
    return inputs

preproc_func(df.iloc[0])

{'input_ids': [101, 3622, 17025, 65361, 24094, 166, 102, 781, 40147, 8380, 1519, 2999, 67595, 65361, 24094, 17943, 24856, 16462, 128, 3622, 845, 30501, 860, 2999, 132, 9621, 17025, 4582, 49911, 37879, 3187, 49335, 130, 46577, 57539, 128, 29666, 3187, 63032, 128, 53445, 40732, 17914, 17995, 868, 1755, 132, 6776, 4582, 17090, 57539, 128, 861, 22579, 41664, 64678, 41845, 16340, 42085, 8067, 22343, 130, 13625, 41502, 20387, 128, 40281, 11157, 845, 6542, 56840, 20346, 33267, 12966, 10622, 114635, 52597, 9481, 132, 781, 87285, 34972, 884, 42584, 858, 20736, 106478, 1700, 64124, 18211, 21831, 25352, 59869, 73376, 128, 25766, 2059, 54248, 851, 14037, 128, 35637, 29542, 24955, 46588, 48833, 12668, 14064, 132, 781, 7162, 25484, 116843, 55214, 12819, 15222, 851, 20234, 18715, 7162, 23939, 24860, 55384, 41106, 132, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [12]:
# !pip3 install -U scipy scikit-learn

In [13]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.15, shuffle=True, random_state=42)
train_df.shape, test_df.shape

((35227, 3), (6217, 3))

# Training

In [14]:
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.manual_seed(3407)
torch.cuda.manual_seed_all(3407)

In [15]:
from torch.utils.data import Dataset, DataLoader

class SberquadDs(Dataset):
    def __init__(self, df):
        self.inputs = df.apply(preproc_func, axis=1).tolist()

    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, index):
        return (
            torch.tensor(self.inputs[index]["input_ids"], dtype=torch.int32),
            torch.tensor(self.inputs[index]["attention_mask"], dtype=torch.int8),
            torch.tensor(self.inputs[index]["start_positions"], dtype=torch.int64),
            torch.tensor(self.inputs[index]["end_positions"], dtype=torch.int64)
        )
        
train_ds, test_ds = SberquadDs(train_df), SberquadDs(test_df)

batch_size = 32
train_dl, test_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=4), DataLoader(test_ds, batch_size=batch_size, shuffle=False, pin_memory=True)
len(train_dl), len(test_dl)

(1101, 195)

In [16]:
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained("KirrAno93/rubert-base-cased-finetuned-squad")
model.to(device)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [17]:
import torch.nn.functional as F
from tqdm.notebook import tqdm

def train(model, train_dl, optimizer, scheduler):
    print("=" * 8, "Starting training", "=" * 8)
    model.train()
    train_loss = 0.
    for input_ids, attention_mask, start_pos, end_pos in tqdm(train_dl):
        input_ids, attention_mask, start_pos, end_pos = input_ids.to(device), attention_mask.to(device), start_pos.to(device), end_pos.to(device)
        outs = model(input_ids, attention_mask)
        loss = F.cross_entropy(outs.start_logits, start_pos) + F.cross_entropy(outs.end_logits, end_pos)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        train_loss += loss.item()

    return train_loss / len(train_dl)

In [18]:
def valid(model, val_dl):
    print("=" * 8, "Starting validation", "=" * 8)
    model.eval()
    val_loss, val_em, val_f1 = 0., 0., 0.
    for input_ids, attention_mask, start_pos, end_pos in tqdm(val_dl):
        input_ids, attention_mask, start_pos, end_pos = input_ids.to(device), attention_mask.to(device), start_pos.to(device), end_pos.to(device)

        with torch.no_grad():
            outs = model(input_ids, attention_mask)
            loss = F.cross_entropy(outs.start_logits, start_pos) + F.cross_entropy(outs.end_logits, end_pos)
            
            val_loss += loss.item()
    return val_loss / len(val_dl)

In [19]:
from transformers import get_linear_schedule_with_warmup

epochs = 3
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=len(train_dl) * 0.5, num_training_steps=len(train_dl) * epochs)

train_losses, val_losses = [], []

for epoch in range(epochs):
    print("=" * 8, f"Epoch {epoch + 1}", "=" * 8)

    train_loss = train(model, train_dl, optimizer, scheduler)
    print(f"Train loss {train_loss}")

    val_loss = valid(model, test_dl)
    print(f"Validation loss {val_loss}")
    
    train_losses.append(train_loss)
    val_losses.append(val_loss)



  0%|          | 0/1101 [00:00<?, ?it/s]

Train loss 2.634825864561464


  0%|          | 0/195 [00:00<?, ?it/s]

Validation loss 2.5087086365773126


  0%|          | 0/1101 [00:00<?, ?it/s]

Train loss 1.8447496210521832


  0%|          | 0/195 [00:00<?, ?it/s]

Validation loss 2.3141532039031003


  0%|          | 0/1101 [00:00<?, ?it/s]

Train loss 0.9101986776526466


  0%|          | 0/195 [00:00<?, ?it/s]

Validation loss 2.72524761618712


In [20]:
model.save_pretrained("./flask_app/saved_model/")
tokenizer.save_pretrained("./flask_app/saved_model/")

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

In [36]:
import numpy as np
import string

def normalization(text):
    text = " ".join(text.split()).lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text

def exact_match(pred, true):
    pred = np.vectorize(normalization)(pred)
    true = np.vectorize(normalization)(true)
    return np.mean(pred == true)

def f1_score(pred, true):
    pred = np.vectorize(normalization)(pred)
    true = np.vectorize(normalization)(true)
    res = 0.
    for i in range(len(pred)):
        pred_tokens = pred[i].split()
        true_tokens = true[i].split()
        common_tokens = set(pred_tokens) & set(true_tokens)
        if len(common_tokens) != 0 and len(pred_tokens) != 0 and len(true_tokens) != 0:
            prec = len(common_tokens) / len(pred_tokens)
            rec = len(common_tokens) / len(true_tokens)
            res += 2 * (prec * rec) / (prec + rec)
    return res / len(pred)

def get_answer(context, start, end):
    return [context[i][start[i]:end[i]+1] for i in range(len(context))]

In [37]:
model.eval()

val_loss, val_em, val_f1 = 0., 0., 0.
for input_ids, attention_mask, start_positions, end_positions in tqdm(test_dl):
    input_ids, attention_mask, start_positions, end_positions = input_ids.to(device), attention_mask.to(device), start_positions.to(device), end_positions.to(device)

    with torch.no_grad():
        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )

        loss = outputs.loss
        val_loss += loss.item()

        start_pred = outputs.start_logits.argmax(axis=1).cpu().numpy()
        end_pred = outputs.end_logits.argmax(axis=1).cpu().numpy()
        start_true = start_positions.cpu().numpy()
        end_true = end_positions.cpu().numpy()
        input_ids = input_ids.cpu().tolist()
        ans_pred = get_answer(input_ids, start_pred, end_pred)
        ans_true = get_answer(input_ids, start_true, end_true)
        text_pred = [tokenizer.decode(ans_pred[i]) for i in range(len(ans_pred))]
        text_true = [tokenizer.decode(ans_true[i]) for i in range(len(ans_true))]
        val_em += exact_match(text_pred, text_true)
        val_f1 += f1_score(text_pred, text_true)

val_loss /= len(test_dl)
val_em /= len(test_dl)
val_f1 /= len(test_dl)
print(f"Loss {val_loss} EM {val_em} F1 {val_f1}")

  0%|          | 0/195 [00:00<?, ?it/s]

Loss 1.36262380809356 EM 0.5611289173789173 F1 0.7658744994034126


In [38]:
tokenizer = AutoTokenizer.from_pretrained("./saved_model/", local_files_only=True)
model = AutoModelForQuestionAnswering.from_pretrained("./saved_model/", local_files_only=True)
model.to(device)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [39]:
def predict(question, context):
    inputs = tokenizer(
        question, 
        context,
        max_length=max_length,
        truncation="only_second",
        padding="max_length",
        return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    answer_start_index = outputs.start_logits.argmax()
    answer_end_index = outputs.end_logits.argmax()
    predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
    return tokenizer.decode(predict_answer_tokens)

### Test case 1:

In [41]:
question = "Где встречаются первые упоминания о строении человеческого тела?"
context = "Первые упоминания о строении человеческого тела встречаются в Древнем Египте..."

pred = predict(question, context)
pred

'в Древнем Египте'

### Test case 2:

In [42]:
question = "Кто совершил первый полет в космос?"
context = "12 апреля 1961 года Юрий Гагарин стал первым человеком в мировой истории, совершившим полёт в космическое пространство. Ракета-носитель «Восток» с кораблём «Восток-1», на борту которого находился Гагарин, была запущена с космодрома Байконур, расположенного в Кызылординской области Казахской ССР. После 108 минут полёта Гагарин успешно приземлился в Саратовской области, неподалёку от Энгельса. 12 апреля 1961 года, день полёта Юрия Гагарина в космос, был объявлен праздником — Днём космонавтики."

pred = predict(question, context)
pred

'Юрий Гагарин'

### Test case 3:

In [44]:
question = "Как назывался космический корабль?"
context = "12 апреля 1961 года Юрий Гагарин стал первым человеком в мировой истории, совершившим полёт в космическое пространство. Ракета-носитель «Восток» с кораблём «Восток-1», на борту которого находился Гагарин, была запущена с космодрома Байконур, расположенного в Кызылординской области Казахской ССР. После 108 минут полёта Гагарин успешно приземлился в Саратовской области, неподалёку от Энгельса. 12 апреля 1961 года, день полёта Юрия Гагарина в космос, был объявлен праздником — Днём космонавтики."

pred = predict(question, context)
pred

'« Восток - 1 »'