# Default

In [None]:
# Locations of the RuCoS train / validation / test splits. The files are read
# line by line as JSON elsewhere in this notebook. The /content/drive prefix is
# presumably a mounted Google Drive in Colab - adjust when running elsewhere.
path_train = "/content/drive/MyDrive/AINTI/rucos/rucos_train.jsonl"
path_valid = "/content/drive/MyDrive/AINTI/rucos/rucos_val.jsonl"
path_test = "/content/drive/MyDrive/AINTI/rucos/rucos_test.jsonl"

# Train

## Libraries

In [None]:
!pip install jsonlines
!pip install transformers -U
!pip install natasha
!pip install simpletransformers
!pip install pymorphy2

Requirement already up-to-date: transformers in /usr/local/lib/python3.7/dist-packages (4.3.3)


In [None]:
import jsonlines
import transformers
import json
import natasha
import pandas as pd
import sys
import re
from simpletransformers.question_answering import QuestionAnsweringArgs, QuestionAnsweringModel
import logging
import pymorphy2

## Create train dataset

In [None]:
def get_data(data_json_file, extra_json_files=None):
    """Build question-answering training records from JSON Lines files.

    Every non-blank line is parsed as one JSON object. Only the first entry of
    ``item['qas']`` and the first entry of its ``answers`` list are used.

    Args:
        data_json_file: Path of the main JSON Lines file.
        extra_json_files: Optional iterable of further JSON Lines paths whose
            records are appended after the main file. When ``None`` (the
            default) the validation split ``path_valid`` is appended, which is
            what this function has always done.

    Returns:
        A list of dicts of the form
        ``{'context': str, 'qas': [{'id', 'is_impossible', 'question',
        'answers': [{'text', 'answer_start'}]}]}``, one per input record.

    Raises:
        ValueError: If the answer text does not occur in the passage text.
    """
    if extra_json_files is None:
        extra_json_files = [path_valid]

    train_data = []
    for file_name in [data_json_file, *extra_json_files]:
        # Explicit UTF-8: the passages are Russian text, and the platform
        # default encoding is not guaranteed to be UTF-8.
        with open(file_name, 'r', encoding='utf-8') as json_file:
            for json_str in json_file:
                if not json_str.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                item = json.loads(json_str)
                context = item['passage']['text']
                first_qa = item['qas'][0]
                answer_text = first_qa['answers'][0]['text']

                # The offset is the FIRST occurrence of the answer text in the
                # passage, not the offset stored in the dataset.
                answer_start = context.find(answer_text)
                if answer_start < 0:
                    raise ValueError(
                        "answer text %r not found in passage of record idx=%r (%s)"
                        % (answer_text, item['idx'], file_name)
                    )

                train_data.append({
                    'context': context,
                    'qas': [{
                        'id': item['idx'],
                        'is_impossible': False,
                        'question': first_qa['query'],
                        'answers': [{'text': answer_text, 'answer_start': answer_start}],
                    }],
                })
    return train_data

In [None]:
# Build the fine-tuning records from the training split; get_data() also
# appends the records of the validation split (path_valid).
train_data = get_data(path_train)

## Finetune model

In [None]:
# Root logging at INFO, but keep the "transformers" logger down at WARNING.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Options for question-answering fine-tuning, grouped by concern.
model_args = QuestionAnsweringArgs(
    # optimisation
    num_train_epochs=5,
    learning_rate=2e-5,
    train_batch_size=16,
    eval_batch_size=16,
    # sequence limits
    max_seq_length=512,
    max_query_length=64,
    max_answer_length=18,
    # evaluation / checkpointing
    evaluate_during_training=False,
    evaluate_during_training_verbose=False,
    save_model_every_epoch=False,
    save_best_model=True,
    overwrite_output_dir=True,
    output_dir='outputs/',
    # data processing
    reprocess_input_data=True,
    use_multiprocessing=True,
)

# Pretrained model
model = QuestionAnsweringModel(
    "bert",
    "/content/drive/MyDrive/AINTI/PretrainWords",
    args=model_args,
)


Some weights of the model checkpoint at /content/drive/MyDrive/AINTI/PretrainWords were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at /content/drive/MyDrive

In [None]:
# Fine-tune the question-answering model on the records built by get_data().
model.train_model(train_data)

# Create submit

In [None]:
# Build the prediction inputs from the test split. Three parallel lists are
# filled in file order:
#   test_data     - records in the context/qas layout passed to model.predict
#   test_contexts - passage text of every record
#   test_queries  - first query of every record
test_data = []
test_contexts = []
test_queries = []
# Explicit UTF-8: the passages are Russian text and the platform default
# encoding is not guaranteed to be UTF-8. Blank lines are skipped so that a
# trailing newline does not crash json.loads.
with open(path_test, 'r', encoding='utf-8') as json_file:
    json_list = [line for line in json_file if line.strip()]
for json_str in json_list:
    item = json.loads(json_str)
    idx = item['idx']
    text = item['passage']['text']
    # Only the first question of each record is used.
    query = item['qas'][0]['query']
    test_contexts.append(text)
    test_queries.append(query)
    test_data.append({'context': text, 'qas': [{'id': idx, 'question': query}]})

In [None]:
# Run the model on the test records. The post-processing cell further down
# reads answers1[i]['answer'] as a list of candidate answer strings.
answers1, probabilities1 = model.predict(test_data)

INFO:simpletransformers.question_answering.question_answering_model: Converting to features started.
100%|██████████| 7257/7257 [01:07<00:00, 108.31it/s]
add example index and unique id: 100%|██████████| 7257/7257 [00:00<00:00, 1060374.99it/s]


Running Prediction:   0%|          | 0/454 [00:00<?, ?it/s]

In [None]:
def create_entities(test_file=None):
    """Collect the lower-cased entity strings of every passage in a JSONL file.

    Each non-blank line is parsed as JSON; for every entry of
    ``item['passage']['entities']`` the slice ``text[start:end]`` of the
    passage text is taken and lower-cased.

    Args:
        test_file: Path of the JSON Lines file to read. Defaults to the
            notebook-level ``path_test`` when ``None``.

    Returns:
        A list with one inner list per record (in file order) holding that
        record's entity strings, duplicates included.
    """
    if test_file is None:
        test_file = path_test

    list_of_entities = []
    # Explicit UTF-8: the passages are Russian text and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(test_file, 'r', encoding='utf-8') as json_file:
        for json_str in json_file:
            if not json_str.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            passage = json.loads(json_str)['passage']
            text = passage['text']
            list_of_entities.append(
                [text[entity['start']:entity['end']].lower()
                 for entity in passage['entities']]
            )
    return list_of_entities

In [None]:
# One list of lower-cased entity strings per test record, in file order.
list_of_entities = create_entities()

In [None]:
def most_frequent(List):
    """Return the element that occurs most often in ``List``.

    When several elements share the highest count, the one that appears first
    in ``List`` is returned.

    Args:
        List: A non-empty sequence.

    Returns:
        The most frequent element.

    Raises:
        IndexError: If ``List`` is empty.
    """
    from collections import Counter

    if len(List) == 0:
        raise IndexError("most_frequent() called with an empty sequence")

    try:
        # Single counting pass instead of calling List.count() per element.
        counts = Counter(List)
    except TypeError:
        # Unhashable elements cannot be counted in a dict; fall back to the
        # quadratic scan. max() returns the first element with the top count.
        return max(List, key=List.count)

    # Dict keys keep first-seen order, so max() breaks ties by first occurrence.
    return max(counts, key=counts.get)

In [None]:
def get_lem_p(text, morph):
    """Replace every whitespace-separated token of ``text`` by its normal form.

    Each token from ``text.split()`` is passed to ``morph.parse``; the
    ``normal_form`` attribute of the first returned parse is kept. The
    resulting forms are joined with single spaces.
    """
    return " ".join(morph.parse(token)[0].normal_form for token in text.split())


In [None]:
# Post-process the model predictions: prefer a passage entity that is contained
# (after lemmatisation) in one of the top-2 candidate answers; otherwise fall
# back to the model's first candidate.
outputs = []
errors = 0  # never updated in this cell; kept in case another cell reads it
morph = pymorphy2.MorphAnalyzer(lang='ru')

for i in range(len(answers1)):
    # Occasional progress line instead of one printed line per record.
    if i % 500 == 0:
        print(f"{i}/{len(answers1)}")

    answers_arr = answers1[i]['answer']

    # Order-preserving de-duplication. list(set(...)) would make the
    # tie-breaking below depend on string hash randomisation between runs.
    list_of_entities[i] = list(dict.fromkeys(list_of_entities[i]))

    # entity_lemmas[k] is the lemmatised form of list_of_entities[i][k].
    entity_lemmas = [get_lem_p(ent.lower(), morph) for ent in list_of_entities[i]]
    l_e = list(dict.fromkeys(entity_lemmas))
    # The lemmas are lemmatised once more before matching, exactly as before,
    # but only once per record instead of once per candidate answer.
    match_keys = [get_lem_p(e.lower(), morph) for e in l_e]

    # Collect every entity key that is a substring of a lemmatised candidate.
    ans_l = []
    for a in answers_arr[:2]:
        a1 = get_lem_p(a.lower(), morph)
        ans_l.extend(key for key in match_keys if key in a1)

    if not ans_l:
        # No entity matched: keep the model's best candidate as is.
        answer = answers_arr[0] if answers_arr else ''
    else:
        answer = most_frequent(ans_l)
        # Map the winning lemma back to a surface form from the passage.
        # `answer` is reassigned inside the loop, so later comparisons use the
        # updated value (same as the previous implementation).
        for ent, ent_lemma in zip(list_of_entities[i], entity_lemmas):
            if answer == ent_lemma:
                answer = ent.lower()

    # NOTE(review): 'idx' is the position in answers1, which assumes test
    # records are numbered 0..N-1 in file order - confirm against the data.
    outputs.append({'text': answer, 'idx': i, 'start': 0, 'end': 0})

In [None]:
# Write the submission as JSON Lines: exactly one compact JSON object per line.
# Pretty-printing with indent= would spread each record over several lines,
# which line-by-line JSONL readers cannot parse.
with open('submit.jsonl', 'w', encoding='utf-8') as json_file:
    for o in outputs:
        json_file.write(json.dumps(o) + "\n")

Реализация на 0.864 