### Вопросно-ответная модель

In [16]:
import torch
from datasets import Dataset
from transformers import pipeline
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from deeppavlov import build_model
import json
import random

In [3]:
model = build_model(config='squad_bert')
qa_model = list(model)[3].model
toker = list(model)[0].tokenizer
collator = DataCollatorForTokenClassification(toker, pad_to_multiple_of=128)
dev = 'cuda:0'

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and a

### Входные данные

In [5]:
from dataset import DialogueDataset
from dataset.multiwoz import load_multiwoz

In [6]:
MULTIWOZ_PATH = (
    "multiwoz/data/MultiWOZ_2.2"
)


train = DialogueDataset.from_miltiwoz_v22(
    load_multiwoz('train', MULTIWOZ_PATH,
    order=[
    'dialogues_001.json', 'dialogues_011.json', 'dialogues_007.json', 'dialogues_010.json', 
    'dialogues_017.json', 'dialogues_005.json', 'dialogues_015.json', 'dialogues_012.json', 
    'dialogues_016.json', 'dialogues_013.json', 'dialogues_004.json', 'dialogues_009.json', 
    'dialogues_003.json', 'dialogues_006.json', 'dialogues_008.json', 'dialogues_002.json', 
    'dialogues_014.json'
    ])
)

Loading train part of MultiWOZ from multiwoz/data/MultiWOZ_2.2


100%|██████████████████████████████████████████████████████████████████████████████| 17/17 [00:07<00:00,  2.41it/s]


In [None]:
train_set = {'kings lynn': ['i need a train on tuesday out of kings lynn', 'Looking for a train Tuesday leaving kings lynn after 9:30 to go to cambridge.'], 
             'cambridge': ['Please find me a train from cambridge to stansted airport', "Hi, I'd like to book a train to stansted airport from cambridge. Can you help?"]}
slot_type = 'train-departure'

In [7]:
# извлечение вопросов из датасета
with open(f'{MULTIWOZ_PATH}/schema.json', 'r') as f:
  schema = json.load(f)
question_for_slot_type = {}
for service in schema:
  for slot in service['slots']:
    question_for_slot_type[slot['name']] = f"{slot['description'][0].upper()}{slot['description'][1:]}?"


In [8]:
# модифицированные вопросы
m_question_for_slot_type = {'hotel-pricerange': 'What is price budget of the hotel?',
 'hotel-type': 'What is the type of the hotel?',
 'hotel-parking': 'Whether the hotel has parking?',
 'hotel-bookday': 'What is day of the hotel booking?',
 'hotel-bookpeople': 'For how many people to book the hotel?',
 'hotel-bookstay': 'For how many days to stay at the hotel?',
 'hotel-stars': 'What is star rating of the hotel?',
 'hotel-internet': 'Whether the hotel has internet?',
 'hotel-name': 'What is the name of the hotel?',
 'hotel-area': 'Where is the hotel located?',
 'hotel-address': 'What is the address of the hotel?',
 'train-arriveby': 'What time does the train arrive?',
 'train-departure': 'Where is departure location of the train?',
 'train-day': 'What is the day of the train?',
 'train-bookpeople': 'How many train tickets you need?',
 'train-leaveat': 'What time does the train leave?',
 'train-destination': 'What is the destination of the train?',
 'attraction-area': 'What area to search for attractions?',
 'attraction-name': 'What is the name of the attraction?',
 'attraction-type': 'What is the type of the attraction?',
 'restaurant-area': 'Where does the restaurant place?',
 'restaurant-name': 'What is the name of the restaurant?',
 'restaurant-bookday': 'What is the day of the restaurant booking?',
 'restaurant-pricerange': 'What is the price budget for the restaurant?',
 'restaurant-food': 'What the cuisine of the restaurant you are looking for?',
 'restaurant-bookpeople': 'For how many people to reserve the restaurant?',
 'restaurant-booktime': 'What is the time of the restaurant booking?',
 'hospital-department': 'What is the type of medical care?',
 'taxi-leaveat': 'What is the leaving time of taxi?',
 'taxi-destination': 'What is the destination of taxi?',
 'taxi-departure': 'Where is departure location of taxi?',
 'taxi-arriveby': 'What is the arrival time of taxi?', 
 'bus-departure': 'What is the departure location of bus?',
 'bus-destination': 'What is the destination of bus?'}

In [9]:
# преобразование данных
train_data = {'input_ids': [], 'start_positions': [], 'end_positions': []}
keys_in_dataset = ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']
short_keys_in_dataset = ['input_ids', 'token_type_ids', 'attention_mask']
def make_train_dataset(train_set, slot_types, modified_questions=False):
  encoded_dict = {key: [] for key in keys_in_dataset}
  for slot_value, slot_type in zip(train_set.keys(), slot_types):
    for utt in train_set[slot_value]:
      question = question_for_slot_type[slot_type]
      encoded = toker(question, utt.lower())
      encoded_question = toker(question)
      encoded_context = toker(utt.lower())
      encoded_slot = toker(slot_value.lower())['input_ids'][1:-1]
      start_position = -1
      end_position = -1
      for i in range(len(encoded_slot)):
        if encoded_slot[i] in encoded_context['input_ids']:
          start_position = encoded_context['input_ids'].index(encoded_slot[i]) + len(encoded_question['input_ids']) - 1
          break
      for i in reversed(range(len(encoded_slot))):
        if encoded_slot[i] in encoded_context['input_ids']:
          end_position = encoded_context['input_ids'].index(encoded_slot[i]) + len(encoded_question['input_ids']) - 1
          break
      if start_position != -1 and end_position != -1:
        for key in short_keys_in_dataset:
          encoded_dict[key].append(encoded[key])
        encoded_dict['start_positions'].append([start_position])
        encoded_dict['end_positions'].append([end_position])

  return Dataset.from_dict(encoded_dict)

### Разделение типов слотов

In [10]:
from split_slot_types import get_splitted_slot_types
from prepare_data import prepare_data

In [11]:
train_types, dev_types, test_types = get_splitted_slot_types()

  all_types = np.unique(np.hstack(domains.values())).tolist()


In [12]:
train_data = prepare_data(train, train_types)
dev_data = prepare_data(train, dev_types)

In [17]:
train_data_small = random.SystemRandom().sample(train_data, 50)
train_data_small

[{'utterance': "I'm looking for a cheap hotel in the centre.",
  'slots': {'hotel-type': 'hotel'}},
 {'utterance': 'Yes. What can you tell me about cherry hinton hall and grounds? I am interested in visiting it. Do I need to book a tour?',
  'slots': {'attraction-name': 'cherry hinton hall and grounds'}},
 {'utterance': '4 people. Starting Tuesday, 5 nights.',
  'slots': {'hotel-bookstay': '5'}},
 {'utterance': 'book a table for 7 people at 12:15 on thursday and get me the reference number',
  'slots': {'restaurant-bookday': 'thursday'}},
 {'utterance': "I'm looking for attractions in town that are in the architecture category.",
  'slots': {'attraction-type': 'architecture'}},
 {'utterance': 'Yes, are there any 3 star guesthouses in the west that include free wifi?',
  'slots': {'hotel-type': 'guesthouse'}},
 {'utterance': "It doesn't matter. Please make a recommendation and book a table for 8 people at 11:30 on saturday.",
  'slots': {'restaurant-bookday': 'saturday'}},
 {'utterance'

### Предобучение на тренировочных типах

In [18]:
pretrain_set = {} 
slot_types_for_pretrain_dataset = []
for utt_slot in train_data_small:
   utt = utt_slot['utterance']
   for slot_type, slot_value in utt_slot['slots'].items():
    if slot_value in pretrain_set.keys():
        pretrain_set[slot_value].append(utt)
    else:
        pretrain_set[slot_value] = [utt]
        slot_types_for_pretrain_dataset.append(slot_type)

In [19]:
pretrain_set

{'hotel': ["I'm looking for a cheap hotel in the centre.",
  "Sure, let's look for a 4 star hotel instead.",
  "I'm looking for a place to stay in Cambridge, moderate price for a hotel.",
  'If there is no such hotel, how about one that has a star of 4',
  'Lets try a hotel instead.',
  'What expensive hotel do you recommend?'],
 'cherry hinton hall and grounds': ['Yes. What can you tell me about cherry hinton hall and grounds? I am interested in visiting it. Do I need to book a tour?'],
 '5': ['4 people. Starting Tuesday, 5 nights.',
  "Yes, that works. I'd like to book for 4 people for 5 nights, starting on Saturday, please.",
  'i want to leave by 10:45'],
 'thursday': ['book a table for 7 people at 12:15 on thursday and get me the reference number',
  "I'd like to book for 1 person at 1145 on thursday please"],
 'architecture': ["I'm looking for attractions in town that are in the architecture category."],
 'guesthouse': ['Yes, are there any 3 star guesthouses in the west that incl

In [20]:
num_utterances = 0
for utterances in pretrain_set.values():
    num_utterances += len(utterances)

In [21]:
pretrain_dataset = make_train_dataset(pretrain_set, [slot_type]*num_utterances)
pretrain_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 50
})

In [22]:
train_arguments = TrainingArguments(
  output_dir="./models/results",
  overwrite_output_dir = True,
  evaluation_strategy="epoch",
  learning_rate=1e-5,
  weight_decay=0.01,
  logging_steps=1,
  num_train_epochs = 3,
  remove_unused_columns = False
)

In [23]:
path_finetuned_model = './models/checkpoint'

In [24]:
trainer = Trainer(model=qa_model,
                            args=train_arguments,
                            train_dataset=pretrain_dataset,
                            eval_dataset=pretrain_dataset,
                            data_collator=collator
                            )

trainer.train()

qa_model.eval()
qa_model.save_pretrained(path_finetuned_model)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,4.4163,4.110243
2,3.8226,3.446166
3,3.7798,3.15965


### Дообучение

In [33]:
test_slot_type = test_types[0]
test_data = prepare_data(train, test_types)
updated_test_data = []
for utt_slot in test_data:
    if test_slot_type in utt_slot['slots'].keys():
        updated_test_data.append(utt_slot)
test_data_small = random.SystemRandom().sample(updated_test_data, 10)

In [38]:
sup_data, query_data = test_data_small[:5], test_data_small[5:]

In [41]:
sup_set = {} 
slot_types_for_sup_dataset = []
for utt_slot in sup_data:
    utt = utt_slot['utterance']
    for slot_type, slot_value in utt_slot['slots'].items():
        if test_slot_type == slot_type:
            if slot_value in sup_set.keys():
                sup_set[slot_value].append(utt)
            else:
                sup_set[slot_value] = [utt]
                slot_types_for_sup_dataset.append(slot_type)

In [52]:
query_set = {} 
slot_types_for_sup_dataset = []
for utt_slot in sup_data:
    utt = utt_slot['utterance']
    for slot_type, slot_value in utt_slot['slots'].items():
        if test_slot_type == slot_type:
            if slot_value in query_set.keys():
                query_set[slot_value].append(utt)
            else:
                query_set[slot_value] = [utt]
                slot_types_for_sup_dataset.append(slot_type)

In [49]:
sup_dataset = make_train_dataset(sup_set, [test_slot_type]*5)

In [45]:
train_arguments = TrainingArguments(
  output_dir="./models/results",
  overwrite_output_dir = True,
  evaluation_strategy="epoch",
  learning_rate=1e-5,
  weight_decay=0.01,
  logging_steps=1,
  num_train_epochs = 6,
  remove_unused_columns = False
)

In [46]:
path_finetuned_model = './models/checkpoint'

In [47]:
qa_model.from_pretrained(path_finetuned_model)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [58]:
trainer = Trainer(model=qa_model,
                            args=train_arguments,
                            train_dataset=sup_dataset,
                            eval_dataset=sup_dataset,
                            data_collator=collator
                            )

trainer.train()

qa_model.eval()
qa_model.save_pretrained(path_finetuned_model)



Epoch,Training Loss,Validation Loss


### Тестирование

In [54]:
query_set
test_utterances = []
gold_slot_values = []
for slot, utts in query_set.items():
    for utt in utts:
      test_utterances.append(utt)
      gold_slot_values.append(slot)

In [57]:
test_utterances

["I'd like something on the south side, please.",
 'I think maybe the north for this trip.',
 'I am going to the centre and want to go somewhere fun.',
 'I would also like to know about any attractions that is in the centre of town that is a theatre type.',
 'Can you tell me what colleges are open to tourists on the west side of town?']

In [59]:
pipe = pipeline('question-answering', model=qa_model.to(dev), tokenizer=toker, device=0)
res = pipe({'question': len(test_utterances) * [question_for_slot_type[test_slot_type]], 'context': test_utterances})

In [60]:
res

[{'score': 0.022393349558115005, 'start': 26, 'end': 31, 'answer': 'south'},
 {'score': 0.027888767421245575, 'start': 18, 'end': 23, 'answer': 'north'},
 {'score': 0.016384506598114967, 'start': 18, 'end': 24, 'answer': 'centre'},
 {'score': 0.006965253036469221,
  'start': 63,
  'end': 95,
  'answer': 'centre of town that is a theatre'},
 {'score': 0.013118656352162361, 'start': 58, 'end': 62, 'answer': 'west'}]

### Подсчет метрик

In [61]:
from deeppavlov.metrics import accuracy

In [62]:
conf_levels = [0.8, 0.9, 0.95, 0.99]

In [63]:
def get_metrics(gold_slot_values, res, conf_level=0.0):
    updated_predicted_slot_values, updated_gold_slot_values = [], []
    for (answer, gold_slot_value) in zip(res, gold_slot_values):
        if answer['score'] >= conf_level:
            updated_predicted_slot_values.append(answer['answer'])
            updated_gold_slot_values.append(gold_slot_value)
    acc = accuracy.sets_accuracy(updated_gold_slot_values, updated_predicted_slot_values)
    tp = acc * len(updated_predicted_slot_values)
    fp = len(updated_predicted_slot_values) - tp
    fn = len(gold_slot_values) - tp
    f1 = tp/(tp + 1/2 * (fn + fp))
    return acc, f1

In [64]:
acc, f1 = get_metrics(gold_slot_values, res)
acc, f1

(0.8, 0.8)

In [65]:
acc, f1 = get_metrics(gold_slot_values, res, 0.5)
acc, f1

(0, 0.0)