In [7]:
# # download elasticsearch
# ! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
# ! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
# ! chown -R daemon:daemon elasticsearch-7.6.2

In [9]:
# # collapse-hide
# !pip install elasticsearch
# !pip install tqdm

In [13]:
# !elasticsearch-7.6.2/bin/elasticsearch-plugin install analysis-nori

In [1]:
import os
import re
import json
import torch
import pickle

from konlpy.tag import Mecab
from tqdm.notebook import tqdm
from datasets import load_from_disk
from transformers import AutoTokenizer
from elasticsearch import Elasticsearch
from subprocess import Popen, PIPE, STDOUT
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_metric, load_from_disk, load_dataset, Features, Value, Sequence, DatasetDict, Dataset


ModuleNotFoundError: No module named 'torch'

In [2]:
# Read the train dataset directory once and take both splits from it
# (loading the same directory twice only repeats the disk work).
train_dataset = load_from_disk("/opt/ml/input/data/data/train_dataset")
train_file = train_dataset["train"]
validation_file = train_dataset["validation"]
# The test dataset is read through its "validation" split key.
test_file = load_from_disk("/opt/ml/input/data/data/test_dataset")["validation"]

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open("/opt/ml/new_dataset/preprocess_wiki.json", "r", encoding="utf-8") as f:
    wiki = json.load(f)

# dict.fromkeys removes duplicate passages while keeping first-seen order.
wiki_contexts = list(dict.fromkeys(v['text'] for v in wiki.values()))

In [4]:
# One record per training example. Iterating the dataset directly reads each
# row once instead of indexing train_file[i] four times per example.
qa_records = [{"example_id" : example["id"],
               "document_title" : example["title"],
               "question_text" : example["question"],
               "answer" : example["answers"]} for example in train_file]

# One Elasticsearch document body per unique wiki passage; the key matches the
# "document_text" field used when indexing and searching.
wiki_articles = [{"document_text" : context} for context in wiki_contexts]


In [8]:
es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )
# wait until ES has started
! sleep 30

In [10]:
# Connection settings for the locally started Elasticsearch node.
config = dict(host='localhost', port=9200)
es = Elasticsearch([config])

# Connectivity check; as the cell's last expression its result is displayed.
es.ping()

True

In [14]:
index_name = 'nori-index'

# Drop any previous copy of the index so the cell can be re-run;
# 400/404 responses (e.g. index missing) are ignored.
es.indices.delete(index=index_name, ignore=[400, 404])

index_config = {
        "settings": {
            "analysis": {
                # `decompound_mode` is a parameter of the nori tokenizer, not of
                # a custom analyzer. Placed on the analyzer it is silently
                # ignored, so it is declared on a dedicated tokenizer instead.
                "tokenizer": {
                    "nori_mixed_tokenizer": {
                        "type": "nori_tokenizer",
                        "decompound_mode": "mixed",
                    }
                },
                "analyzer": {
                    # The former analyzer-level `"stopwords": "_korean_"` entry
                    # was likewise not a custom-analyzer parameter and had no
                    # effect, so it is omitted. Add a `nori_part_of_speech`
                    # token filter here if stop-tag filtering is wanted.
                    "nori_analyzer": {
                        "type": "custom",
                        "tokenizer": "nori_mixed_tokenizer",
                    }
                }
            }
        },
        "mappings": {
            # Reject documents that carry fields other than `document_text`.
            "dynamic": "strict",
            "properties": {
                "document_text": {"type": "text", "analyzer": "nori_analyzer"}
                }
            }
        }

# Last expression of the cell, so the create response is displayed.
es.indices.create(index=index_name, body=index_config, ignore=400)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'nori-index'}

In [15]:
def populate_index(es_obj, index_name, evidence_corpus):
    '''
    Loads records into an existing Elasticsearch index

    Each record is indexed under its position in ``evidence_corpus`` as the
    document id. A record that fails to index is reported (with the error)
    and skipped; the remaining records are still loaded. Interrupts such as
    KeyboardInterrupt are not swallowed.

    Args:
        es_obj (elasticsearch.client.Elasticsearch) - Elasticsearch client object
        index_name (str) - Name of index
        evidence_corpus (list) - List of dicts containing data records

    Returns:
        None. Progress and the final document count are printed.

    '''

    failed_ids = []
    for i, rec in enumerate(tqdm(evidence_corpus)):
        try:
            es_obj.index(index=index_name, id=i, body=rec)
        except Exception as err:
            # Best effort: keep loading, but say which document failed and why.
            failed_ids.append(i)
            print(f'Unable to load document {i}: {err!r}')

    # Refresh so documents indexed just above are visible to the count call.
    es_obj.indices.refresh(index=index_name)
    n_records = es_obj.count(index=index_name)['count']
    print(f'Succesfully loaded {n_records} into {index_name}')
    if failed_ids:
        print(f'{len(failed_ids)} document(s) could not be loaded.')

    return

In [16]:
# Index every wiki passage record into the nori-analyzed index.
all_wiki_articles = wiki_articles
populate_index(es, 'nori-index', all_wiki_articles)

HBox(children=(FloatProgress(value=0.0, max=56606.0), HTML(value='')))


Succesfully loaded 56606 into nori-index


In [17]:
# collapse-hide
def search_es(es_obj, index_name, question_text, n_results):
    '''
    Run a full-text `match` query against the `document_text` field

    Args:
        es_obj (elasticsearch.client.Elasticsearch) - Elasticsearch client object
        index_name (str) - Name of index to query
        question_text (str) - Text to match against `document_text`
        n_results (int) - Number of results to return

    Returns
        res - Elasticsearch response object

    '''

    # Query DSL body: a single match clause on the indexed text field.
    match_clause = {'document_text': question_text}
    request_body = {'query': {'match': match_clause}}

    return es_obj.search(index=index_name, body=request_body, size=n_results)

In [42]:
# Shared Mecab tagger instance used by the text helpers in this notebook.
mecab = Mecab()

def morphs_split(text):
    """Return ``text`` as its Mecab morphemes joined by single spaces."""
    return ' '.join(mecab.morphs(text))

def preprocess(text):
    """Normalize whitespace and strip characters outside an allowed set.

    Steps, in order:
      1. real newlines and literal backslash-n sequences become spaces;
      2. runs of whitespace collapse to a single space;
      3. '#' becomes a space;
      4. every character not in the whitelist below is removed.

    Args:
        text (str): raw passage text.

    Returns:
        str: the cleaned text.
    """
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r"\\n", " ", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r'#', ' ', text)
    # The hyphen is escaped on purpose: written as `"-“` inside the class it
    # forms a range from U+0022 to U+201C, which would whitelist almost every
    # character below U+201C (e.g. $, ~, =, [, ]) and defeat the filter.
    text = re.sub(r"[^a-zA-Z0-9가-힣ㄱ-ㅎㅏ-ㅣぁ-ゔァ-ヴー々〆〤一-龥<>()\s\.\?!》《≪≫\'<>〈〉:‘’%,『』「」＜＞・\"\-“”∧]", "", text)
    return text

def context_split(text):
    """Split a passage into sentences, morpheme-tokenize each, join with '[SEP]'.

    Literal backslash-n sequences are replaced by spaces, then the text is
    split on '. ' (the separator itself is discarded by the split). Each piece
    goes through ``preprocess`` and ``mecab.morphs``; its morphemes are joined
    with single spaces, and the pieces are joined with '[SEP]'.

    Args:
        text (str): raw passage text.

    Returns:
        str: '[SEP]'-delimited, morpheme-split text with no trailing '[SEP]'.
    """
    flattened = ' '.join(text.strip().split('\\n')).strip()
    pieces = []
    for sentence in flattened.strip().split('. '):
        morphemes = mecab.morphs(preprocess(sentence))
        pieces.append(' '.join(morphemes))
    # str.split always yields at least one piece, so joining equals
    # "append '[SEP]' after every piece, then drop the last one".
    return '[SEP]'.join(pieces)

def sentence_split(text):
    """Split '[SEP]'-delimited text into stripped, non-empty sentences.

    Args:
        text (str): text whose sentences are separated by '[SEP]'.

    Returns:
        list[str]: the sentences in order, with surrounding whitespace removed
        and empty entries dropped.
    """
    sentences = []
    for chunk in text.split('[SEP]'):
        chunk = chunk.strip()
        if chunk != '':
            sentences.append(chunk)
    return sentences

In [51]:
# Number of candidate passages requested per question (stored under 'top20').
N_CANDIDATES = 20

new_train_file = {'context':[], 'id':[], 'question':[], 'top20':[], 'answer_idx':[], 'answer':[], 'start_idx':[]}
for example in train_file:

    # Morpheme-split form of the gold passage, used to look it up among the hits.
    gold_context = ' '.join(sentence_split(context_split(example["context"])))

    question_text = example['question']
    res = search_es(es_obj=es, index_name='nori-index', question_text=question_text, n_results=N_CANDIDATES)
    top20_list = [hit['_source']['document_text'] for hit in res['hits']['hits']]

    if gold_context not in top20_list:
        # Force the gold passage into the candidates as the last entry. Only
        # drop a retrieved hit when the list is already full, and derive the
        # index from the actual length: Elasticsearch may return fewer than
        # N_CANDIDATES hits, in which case a fixed index of 19 is out of range.
        top20_list = top20_list[:N_CANDIDATES - 1] + [gold_context]
        answer_idx = len(top20_list) - 1
    else:
        answer_idx = top20_list.index(gold_context)

    # Only the first annotated answer is used.
    answer = example['answers']['text'][0]
    new_answer = morphs_split(answer)

    answer_start = example["answers"]["answer_start"][0]
    front_context = example["context"][:answer_start]
    back_context = example["context"][answer_start+len(answer):]

    # Tokenize the text before and after the answer separately so the answer's
    # position in the rebuilt context is known exactly.
    new_front_context = ' '.join(sentence_split(context_split(front_context)))
    new_back_context = ' '.join(sentence_split(context_split(back_context)))
    new_context = ' '.join([new_front_context, new_answer, new_back_context])

    # In new_context the answer follows new_front_context and one joining space.
    start_idx = len(new_front_context) + 1

    new_question = morphs_split(question_text)

    new_train_file['context'].append(new_context)
    new_train_file['id'].append(example['id'])
    new_train_file['question'].append(new_question)
    new_train_file['top20'].append(top20_list)
    new_train_file['answer_idx'].append(answer_idx)
    new_train_file['answer'].append(new_answer)
    new_train_file['start_idx'].append(start_idx)
    

In [52]:
# Number of candidate passages requested per question (stored under 'top20').
N_CANDIDATES = 20

new_valid_file = {'context':[], 'id':[], 'question':[], 'top20':[], 'answer_idx':[], 'answer':[], 'start_idx':[]}
for example in validation_file:

    # Morpheme-split form of the gold passage, used to look it up among the hits.
    gold_context = ' '.join(sentence_split(context_split(example["context"])))

    question_text = example['question']
    res = search_es(es_obj=es, index_name='nori-index', question_text=question_text, n_results=N_CANDIDATES)
    top20_list = [hit['_source']['document_text'] for hit in res['hits']['hits']]

    if gold_context not in top20_list:
        # Force the gold passage into the candidates as the last entry. Only
        # drop a retrieved hit when the list is already full, and derive the
        # index from the actual length: Elasticsearch may return fewer than
        # N_CANDIDATES hits, in which case a fixed index of 19 is out of range.
        top20_list = top20_list[:N_CANDIDATES - 1] + [gold_context]
        answer_idx = len(top20_list) - 1
    else:
        answer_idx = top20_list.index(gold_context)

    # Only the first annotated answer is used.
    answer = example['answers']['text'][0]
    new_answer = morphs_split(answer)

    answer_start = example["answers"]["answer_start"][0]
    front_context = example["context"][:answer_start]
    back_context = example["context"][answer_start+len(answer):]

    # Tokenize the text before and after the answer separately so the answer's
    # position in the rebuilt context is known exactly.
    new_front_context = ' '.join(sentence_split(context_split(front_context)))
    new_back_context = ' '.join(sentence_split(context_split(back_context)))
    new_context = ' '.join([new_front_context, new_answer, new_back_context])

    # In new_context the answer follows new_front_context and one joining space.
    start_idx = len(new_front_context) + 1

    new_question = morphs_split(question_text)

    new_valid_file['context'].append(new_context)
    new_valid_file['id'].append(example['id'])
    new_valid_file['question'].append(new_question)
    new_valid_file['top20'].append(top20_list)
    new_valid_file['answer_idx'].append(answer_idx)
    new_valid_file['answer'].append(new_answer)
    new_valid_file['start_idx'].append(start_idx)
    

In [1]:
def save_pickle(save_path, data_set):
    """Serialize ``data_set`` to ``save_path`` with pickle.

    Args:
        save_path (str): destination file path; overwritten if it exists.
        data_set: any picklable object.
    """
    # The context manager closes the file even if pickle.dump raises.
    with open(save_path, "wb") as file:
        pickle.dump(data_set, file)

def get_pickle(pickle_path):
    """Load and return the object pickled at ``pickle_path``.

    Args:
        pickle_path (str): path of a file written with pickle.

    Returns:
        The unpickled object.
    """
    # WARNING: unpickling can execute arbitrary code; only load files that
    # this pipeline produced itself.
    # The context manager closes the file even if pickle.load raises.
    with open(pickle_path, "rb") as f:
        return pickle.load(f)

In [56]:
# Persist the processed training examples for later stages.
save_pickle(save_path="/opt/ml/new_dataset/preprocess_train.pkl", data_set=new_train_file)

In [57]:
# Persist the processed validation examples for later stages.
save_pickle(save_path="/opt/ml/new_dataset/preprocess_valid.pkl", data_set=new_valid_file)