In [1]:
import logging
import os
import sys
from utils.preprocess import prepare_datasets_with_setting
from typing import List, Callable, NoReturn, NewType, Any
import dataclasses
from datasets import load_metric, load_from_disk, Dataset, DatasetDict
from transformers import AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer
import torch
from transformers import (
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    TrainingArguments,
    set_seed,
)

from tokenizers import Tokenizer
from tokenizers.models import WordPiece

from utils.trainer_qa import QuestionAnsweringTrainer

from arguments import (
    ModelArguments,
    DataTrainingArguments,
)

In [2]:

datasets = load_from_disk("../data/train_dataset")
testsets = load_from_disk("../data/test_dataset")

In [3]:
from retrieval import *

In [4]:
with open("../data/wikipedia_documents.json", "r", encoding="utf-8") as f:
            wiki = json.load(f)

contexts = list(
            dict.fromkeys([v["text"] for v in wiki.values()])
        ) 

In [5]:
model_checkpoint = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
tokenize_fn = tokenizer.tokenize

In [7]:

tokenized_corpus = []
p_embs = []
for p in tqdm(contexts):
    tokenized_corpus.append(tokenize_fn(p, padding="max_length", truncation=True, return_tensors='pt'))
bm25 = MyBm25(tokenized_corpus)

  0%|          | 0/56737 [00:00<?, ?it/s]

In [8]:
def get_sparse_embedding(bm25,contexts, tokenize_fn,data_path ="../",k1=1.5, b=0.75, epsilon=0.25) -> NoReturn:

    """
    Summary:
        Passage Embedding을 만들고
        TFIDF와 Embedding을 pickle로 저장합니다.
        만약 미리 저장된 파일이 있으면 저장된 pickle을 불러옵니다.
    """
    bm25_name = f"bm25.bin"
    bm25_path = os.path.join(data_path, bm25_name)
    if os.path.isfile(bm25_path):
        with open(bm25_path, "rb") as file:
            bm25 = pickle.load(file)
        print("Embedding bm25 pickle load.")
    else:
        print("Building bm25... It may take 1 minute and 30 seconds...")
        # bm25 must tokenizer first 
        # because it runs pool inside and this cuases unexpected result.
        tokenized_corpus = []
        for c in contexts:
            tokenized_corpus.append(tokenize_fn(c))
        bm25 = MyBm25(tokenized_corpus, k1 = k1, b = b, epsilon=epsilon)
        with open(bm25_path, "wb") as file:
            pickle.dump(bm25, file)
        print("bm25 pickle saved.")
    return bm25

In [9]:
bm25 = get_sparse_embedding(bm25,contexts,tokenize_fn)

Token indices sequence length is longer than the specified maximum sequence length for this model (1131 > 512). Running this sequence through the model will result in indexing errors


Building bm25... It may take 1 minute and 30 seconds...
bm25 pickle saved.


In [10]:
k1=1.5
b=0.75
epsilon=0.25
topk=20

In [11]:
data_path = "../"

In [12]:
bm25_name = f"bm25.bin"
bm25_path = os.path.join(data_path, bm25_name)
if os.path.isfile(bm25_path):
    with open(bm25_path, "rb") as file:
        bm25 = pickle.load(file)
    print("Embedding bm25 pickle load.")
else:
    print("Building bm25... It may take 1 minute and 30 seconds...")
    # bm25 must tokenizer first 
    # because it runs pool inside and this cuases unexpected result.
    tokenized_corpus = []
    for c in contexts:
        tokenized_corpus.append(tokenize_fn(c))
    bm25 = MyBm25(tokenized_corpus, k1 = k1, b = b, epsilon=epsilon)
    with open(bm25_path, "wb") as file:
        pickle.dump(bm25, file)
    print("bm25 pickle saved.")

Embedding bm25 pickle load.


In [13]:
datasets['validation']['question'][0]

'처음으로 부실 경영인에 대한 보상 선고를 받은 회사는?'

In [14]:
def get_top_n(bm25, query, documents, n=20):
    assert bm25.corpus_size == len(documents), "The documents given don't match the index corpus!"

    scores = bm25.get_scores(query)

    top_n_idx = np.argsort(scores)[::-1][:n]
    doc_score = scores[top_n_idx]
        
    return doc_score, top_n_idx

In [15]:
plain = datasets['train']['question'][:-1000]
addtext = datasets['train']['question'][-1000:]

In [17]:
queries = addtext + datasets['validation']['question']

In [18]:
ground_truth = datasets['train']['context'][-1000:] + datasets['validation']['context']

In [19]:
len(queries), len(ground_truth)

(1240, 1240)

In [31]:
doc_scores = []
doc_indices = []
class Retriever:
    def __init__(self, tf,bm25,contexts, queries, k =20):
        self.tokenize_fn = tf
        self.bm25 = bm25
        self.contexts = contexts
        self.k = k
    def get_relevant_doc(self, query: str, k: Optional[int] = 1) -> Tuple[List, List]:
        tok_q = self.tokenize_fn(query)
        doc_score, doc_indices = self.bm25.get_top_n(tok_q, self.contexts, n = k)
        return doc_score, doc_indices
# parallel search
# 하나로 쪼개서 안에 들어가서 각각 토크나이즈를 한다.
retriever = Retriever(tokenize_fn, bm25, contexts, queries, topk)


In [32]:
def par_search(retriever, queries, topk):
    # pool.map may put only one argument. We need two arguments: datasets and topk.
    def wrapper(query): 
        rel_doc = retriever.get_relevant_doc(query, k = topk)
        return rel_doc

    pool = Pool()

    pool.restart() 

    rel_docs_score_indices = pool.map(wrapper, queries)
    pool.close()
    pool.join()

    doc_scores = []
    doc_indices = []
    for s,idx in rel_docs_score_indices:
        doc_scores.append( s )
        doc_indices.append( idx )

    return doc_scores, doc_indices


In [17]:
doc_scores, doc_indices = par_search(retriever, queries, topk)

In [19]:
# doc_scores = np.array(doc_scores)
# doc_scores = torch.tensor(doc_scores)
# ranks = torch.argsort(doc_scores, dim=1, descending=True).squeeze()
# k = topk
# context_list = []

# for index in range(len(ranks)):
#     k_list = []
#     for i in range(k):
#         k_list.append(contexts[ranks[index][i]])
#     context_list.append(k_list)
    
# correct= 0
# for index in range(len(context_list)):
#     if datasets['validation']['context'][index] in context_list[index]:
#         correct+=1 
# print(correct/len(context_list))

array([14489,  6795, 20097, 52322,  2269, 20516, 17086, 38020, 51905,
       39217,  1281, 32596,  1738,  4879, 51907, 13987,  9588,  3331,
       51068, 56680])

In [25]:
train_length = len(datasets['train']['question'])
valid_length = len(datasets['validation']['question'])
test_length = len(testsets['validation']['question'])

In [29]:
expanded_train, expanded_valid, expanded_test = doc_scores[:train_length], doc_scores[train_length:train_length+valid_length], doc_scores[-test_length:]

In [18]:
doc_scores=torch.tensor(np.array(doc_scores))

In [21]:
datasets['validation']['question'][0]

'처음으로 부실 경영인에 대한 보상 선고를 받은 회사는?'

In [19]:
ranks = torch.argsort(doc_scores, dim=1, descending=True).squeeze()
k = topk
context_list = []
for index in range(len(ranks)):
    k_list = []
    for i in range(k):
        k_list.append(contexts[ranks[index][i]])
    context_list.append(k_list)

In [36]:
len(context_list[0])

20

In [37]:
len(expanded_train), len(expanded_valid), len(expanded_test)

(3952, 240, 600)

In [38]:
train_list, valid_list, test_list = context_list[:train_length], context_list[train_length:train_length+valid_length], context_list[-test_length:]

In [22]:
from collections import OrderedDict

In [19]:
doc_indices[0][::-1]

array([22832, 54841, 30647, 54679, 11576,  4550, 29719, 42526, 22749,
          80, 55487,  5107, 20106, 55267, 24361, 24080,  3103, 51906,
        5927, 14708, 31664, 14260,  7769, 32597, 49397, 29710,  8538,
        9619,  5154, 17406, 56680, 51068,  3331,  9588, 13987, 51907,
        4879,  1738, 32596,  1281, 39217, 51905, 38020, 17086, 20516,
        2269, 52322, 20097,  6795, 14489])

In [22]:
topk = 4

In [23]:
context_list = []
for idx in tqdm(range(len(doc_indices))):
    doc_nega = doc_indices[idx][::-1]
    toklist = []
    for k in range(topk):
        toklist.append(contexts[doc_nega[k]])
    toklist = list(OrderedDict.fromkeys([ground_truth[idx]]+toklist))
    txt = ' '.join((map(str, toklist)))
    context_list.append(txt)

  0%|          | 0/4192 [00:00<?, ?it/s]

In [24]:
t = context_list[:len(datasets['train']['context'])]

In [25]:
v = context_list[len(datasets['train']['context']):]

In [28]:
traindata = Dataset.from_dict({'answers':datasets['train']['answers'], 
                    'context':t,
                    'id' : datasets['train']['id'],
                    'question':datasets['train']['question']})

validdata = Dataset.from_dict({'answers':datasets['validation']['answers'], 
                    'context':v,
                    'id' : datasets['validation']['id'],
                    'question':datasets['validation']['question']})

top5 = DatasetDict({"train":traindata, "validation":validdata})

top5.save_to_disk("../data/top5_dataset/")

In [29]:
validdata = Dataset.from_dict({'answers':datasets['validation']['answers'], 
                    'context':v,
                    'id' : datasets['validation']['id'],
                    'question':datasets['validation']['question']})


In [30]:
top5 = DatasetDict({"train":traindata, "validation":validdata})

In [31]:
top5.save_to_disk("../data/top5_dataset/")

In [None]:
topkdata

In [4]:
from tqdm import tqdm

In [5]:
incorrect = 0
listtxt = []
train_or_valid = "train"
for idx in tqdm(range(len(datasets[train_or_valid]['context']))):
    if datasets[train_or_valid]['context'][idx] not in topkdata[train_or_valid]['context'][idx]:
        incorrect += 1
        txt = topkdata[train_or_valid]['context'][idx][:-1]
        txt.append(datasets[train_or_valid]['context'][idx])
        listtxt.append(listtxt)
    else:
        listtxt.append(topkdata[train_or_valid]['context'][idx])
print(incorrect)

100%|██████████| 3952/3952 [1:46:48<00:00,  1.62s/it]

372





In [None]:
data = []
for idx in range(len(listtxt)):
    context = ' '.join((map(str, listtxt[idx])))
    data.append(context)

In [None]:
traindata = Dataset.from_dict({'answers':topkdata['train']['answers'], 
                    'context':data,
                    'question':topkdata['train']['question']})


Error: Kernel is dead

In [None]:
validdata = Dataset.from_dict({'answers':topkdata['validation']['answers'], 
                    'context':data,
                    'question':topkdata['validation']['question']})


In [82]:
topkdata.save_to_disk("../data/topk_dataset/")

Re BM

In [20]:
from retrieval import MyBm25

In [21]:
def get_sparse_embedding(bm25,contexts, tokenize_fn,data_path ="../",k1=1.5, b=0.75, epsilon=0.25) -> NoReturn:

    """
    Summary:
        Passage Embedding을 만들고
        TFIDF와 Embedding을 pickle로 저장합니다.
        만약 미리 저장된 파일이 있으면 저장된 pickle을 불러옵니다.
    """
    bm25_name = f"bm25.bin"
    bm25_path = os.path.join(data_path, bm25_name)
    if os.path.isfile(bm25_path):
        with open(bm25_path, "rb") as file:
            bm25 = pickle.load(file)
        print("Embedding bm25 pickle load.")
    else:
        print("Building bm25... It may take 1 minute and 30 seconds...")
        # bm25 must tokenizer first 
        # because it runs pool inside and this cuases unexpected result.
        tokenized_corpus = []
        for c in contexts:
            tokenized_corpus.append(tokenize_fn(c))
        bm25 = MyBm25(tokenized_corpus, k1 = k1, b = b, epsilon=epsilon)
        with open(bm25_path, "wb") as file:
            pickle.dump(bm25, file)
        print("bm25 pickle saved.")
    return bm25

In [22]:
bm25 = MyBm25(tokenized_corpus)

In [23]:
se = get_sparse_embedding(bm25,contexts,tokenize_fn)
bm25 = se

Embedding bm25 pickle load.


In [24]:
doc_scores = []
for q in tqdm(queries):
    tok_q = tokenize_fn(q)
    scores = bm25.get_scores(tok_q)
    doc_scores.append(scores)
print("done!")
#return doc_scores, doc_indices

  0%|          | 0/1240 [00:00<?, ?it/s]

done!


In [25]:
doc_scores = torch.tensor(np.array(doc_scores))

In [26]:
ranks = torch.argsort(doc_scores, dim=1, descending=True).squeeze()

In [81]:
len(doc_scores[-len(testsets['validation']['question']):])

600

In [27]:
len(doc_scores)

1240

In [82]:
doc_scores = doc_scores[-len(testsets['validation']['question']):]

In [30]:

ranks = torch.argsort(doc_scores, dim=1, descending=True).squeeze()

context_list = []

for index in range(len(ranks)):
    k_list = []
    for i in range(topk):
        k_list.append(contexts[ranks[index][i]])
    context_list.append(k_list)

In [31]:
len(context_list)

1240

In [32]:
train_c, valid_c = context_list[:1000], context_list[1000:]

In [38]:
added_train = []
for c in train_c:
    txt = ' '.join((map(str, c)))
    added_train.append(txt)

added_valid = []
for c in valid_c:
    txt = ' '.join((map(str, c)))
    added_valid.append(txt)

In [41]:
len(datasets['train']['answers'][-1000:])

1000

In [42]:
traindata = Dataset.from_dict({'answers':datasets['train']['answers'][-1000:], 
                    'context':added_train,
                    'id' : datasets['train']['id'][-1000:],
                    'question':datasets['train']['question'][-1000:]})

validdata = Dataset.from_dict({'answers':datasets['validation']['answers'], 
                    'context':added_valid,
                    'id' : datasets['validation']['id'],
                    'question':datasets['validation']['question']})

top5 = DatasetDict({"train":traindata, "validation":validdata})

top5.save_to_disk("../data/added_dataset/")

In [49]:
traindata = Dataset.from_dict({'answers':datasets['train']['answers'][:-1000], 
                    'context':datasets['train']['context'][:-1000],
                    'id' : datasets['train']['id'][:-1000],
                    'question':datasets['train']['question'][:-1000]})

validdata = Dataset.from_dict({'answers':datasets['validation']['answers'], 
                    'context':datasets['validation']['context'],
                    'id' : datasets['validation']['id'],
                    'question':datasets['validation']['question']})

top5 = DatasetDict({"train":traindata, "validation":validdata})

top5.save_to_disk("../data/new_train_dataset/")

In [50]:
len(traindata), len(validdata)

(2952, 240)

In [90]:
with open('./new_wiki.txt', 'w') as f:
    for item in context_list:
        f.write("%s\n" % item)