In [13]:
import logging
import os
import sys
from utils.preprocess import prepare_datasets_with_setting
from typing import List, Callable, NoReturn, NewType, Any
import dataclasses
from datasets import load_metric, load_from_disk, Dataset, DatasetDict
from transformers import AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer
import torch
from transformers import (
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    TrainingArguments,
    set_seed,
)
from tqdm import tqdm
from tokenizers import Tokenizer
from tokenizers.models import WordPiece

from utils.trainer_qa import QuestionAnsweringTrainer

from arguments import (
    ModelArguments,
    DataTrainingArguments,
)

In [3]:

# Load the two datasets saved on disk under ../data (train_dataset and test_dataset).
datasets = load_from_disk("../data/train_dataset")
testsets = load_from_disk("../data/test_dataset")

In [16]:
from retrieval import *

In [64]:
# Read the Wikipedia document dump and keep every distinct passage text once,
# in first-seen order (dict keys preserve insertion order).
with open("../data/wikipedia_documents.json", "r", encoding="utf-8") as f:
    wiki = json.load(f)

contexts = list(dict.fromkeys(entry["text"] for entry in wiki.values()))

In [66]:
# Load the tokenizer that belongs to the klue/bert-base checkpoint.
model_checkpoint = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [67]:
# Load the klue/bert-base tokenizer and expose its `tokenize` method as the
# tokenization function used by the BM25 cells.
model_checkpoint = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenize_fn = tokenizer.tokenize

In [132]:

# Tokenize every passage and index the resulting token lists with BM25.
# NOTE(review): padding/truncation/return_tensors are forwarded to `tokenize` here,
# whereas get_sparse_embedding calls tokenize_fn(c) without them — confirm that
# padded/truncated token lists are really what the BM25 index should see.
p_embs = []
tokenized_corpus = [
    tokenize_fn(passage, padding="max_length", truncation=True, return_tensors='pt')
    for passage in tqdm(contexts)
]
bm25 = MyBm25(tokenized_corpus)

  0%|          | 0/14941 [00:00<?, ?it/s]

In [133]:
def get_sparse_embedding(bm25,contexts, tokenize_fn,data_path ="../",k1=1.5, b=0.75, epsilon=0.25) -> "MyBm25":
    """Load a cached BM25 index from disk, or build it and cache it.

    Args:
        bm25: Ignored. Kept only so existing callers keep working; the value
            returned is always the loaded or freshly built index.
        contexts: Iterable of passage strings to index when no cache exists.
        tokenize_fn: Callable applied to each passage to produce its tokens.
        data_path: Directory that holds (or will receive) the ``bm25.bin`` cache.
        k1: Forwarded to ``MyBm25`` when building.
        b: Forwarded to ``MyBm25`` when building.
        epsilon: Forwarded to ``MyBm25`` when building.

    Returns:
        The object unpickled from ``<data_path>/bm25.bin`` when that file
        exists, otherwise a new ``MyBm25`` built from ``contexts`` (which is
        also pickled to that path).
    """
    bm25_path = os.path.join(data_path, "bm25.bin")

    if os.path.isfile(bm25_path):
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # cache files that were produced by this notebook.
        # NOTE: the cached index is returned as-is; contexts / k1 / b / epsilon
        # are not compared against what the cache was built with.
        with open(bm25_path, "rb") as file:
            bm25 = pickle.load(file)
        print("Embedding bm25 pickle load.")
        return bm25

    print("Building bm25... It may take 1 minute and 30 seconds...")
    # Tokenize before constructing the index: per the original note, MyBm25
    # runs a pool internally and tokenizing inside it gave unexpected results.
    tokenized_corpus = [tokenize_fn(c) for c in contexts]
    bm25 = MyBm25(tokenized_corpus, k1 = k1, b = b, epsilon=epsilon)
    with open(bm25_path, "wb") as file:
        pickle.dump(bm25, file)
    print("bm25 pickle saved.")
    return bm25

In [70]:
bm25 = get_sparse_embedding(bm25,contexts,tokenize_fn)

Token indices sequence length is longer than the specified maximum sequence length for this model (1131 > 512). Running this sequence through the model will result in indexing errors


Building bm25... It may take 1 minute and 30 seconds...
bm25 pickle saved.


In [71]:
# k1 / b / epsilon are passed to MyBm25 when the index is built;
# topk is the number of passages kept per query.
k1=1.5
b=0.75
epsilon=0.25
topk=100

In [73]:
data_path = "../"

In [74]:
# Reuse the pickled BM25 index at <data_path>/bm25.bin when it exists;
# otherwise tokenize the corpus, build the index and cache it there.
bm25_name = "bm25.bin"
bm25_path = os.path.join(data_path, bm25_name)
if not os.path.isfile(bm25_path):
    print("Building bm25... It may take 1 minute and 30 seconds...")
    # Tokenize first: per the original note, MyBm25 runs a pool internally
    # and tokenizing inside it caused unexpected results.
    tokenized_corpus = list(map(tokenize_fn, contexts))
    bm25 = MyBm25(tokenized_corpus, k1 = k1, b = b, epsilon=epsilon)
    with open(bm25_path, "wb") as file:
        pickle.dump(bm25, file)
    print("bm25 pickle saved.")
else:
    with open(bm25_path, "rb") as file:
        bm25 = pickle.load(file)
    print("Embedding bm25 pickle load.")

Embedding bm25 pickle load.


In [75]:
datasets['validation']['question'][0]

'처음으로 부실 경영인에 대한 보상 선고를 받은 회사는?'

In [14]:
def get_top_n(bm25, query, documents, n=20):
    """Return the scores and corpus indices of the ``n`` best-scoring documents.

    Args:
        bm25: Index object exposing ``corpus_size`` and ``get_scores(query)``.
        query: Query passed unchanged to ``bm25.get_scores``.
        documents: The indexed documents; only their count is checked.
        n: Maximum number of results to return.

    Returns:
        ``(doc_score, top_n_idx)``: the selected scores and their indices,
        both ordered from highest to lowest score.

    Raises:
        AssertionError: If ``len(documents)`` differs from ``bm25.corpus_size``.
    """
    assert len(documents) == bm25.corpus_size, "The documents given don't match the index corpus!"

    all_scores = bm25.get_scores(query)

    # argsort is ascending, so reverse it before taking the first n entries.
    best_first = np.argsort(all_scores)[::-1]
    top_n_idx = best_first[:n]

    return all_scores[top_n_idx], top_n_idx

In [15]:
# Partition the training questions: all but the last 1000, and the last 1000.
plain = datasets['train']['question'][:-1000]
addtext = datasets['train']['question'][-1000:]

In [17]:
# Queries: the questions in addtext followed by the validation questions.
queries = addtext + datasets['validation']['question']

In [18]:
# Gold contexts in the same order: last 1000 training contexts, then the validation contexts.
ground_truth = datasets['train']['context'][-1000:] + datasets['validation']['context']

In [76]:
# Rebind queries to the questions stored under testsets['validation'].
queries = testsets['validation']['question']

In [19]:
len(queries), len(ground_truth)

(1240, 1240)

In [77]:
doc_scores = []
doc_indices = []
class Retriever:
    """Thin wrapper that tokenizes a query and asks a BM25 index for its top documents."""

    def __init__(self, tf,bm25,contexts, queries, k =20):
        """Store the tokenizer function, the index, the corpus and ``k``.

        ``queries`` is accepted but not stored. ``k`` is stored as ``self.k``
        but ``get_relevant_doc`` uses its own ``k`` argument instead.
        """
        self.k = k
        self.contexts = contexts
        self.bm25 = bm25
        self.tokenize_fn = tf

    def get_relevant_doc(self, query: str, k: Optional[int] = 1) -> Tuple[List, List]:
        """Tokenize ``query`` and return what ``bm25.get_top_n`` yields for it.

        Args:
            query: Raw query text.
            k: Passed to ``bm25.get_top_n`` as ``n``.

        Returns:
            The two-element result of ``bm25.get_top_n`` (named score and
            indices by this code).
        """
        query_tokens = self.tokenize_fn(query)
        scores, indices = self.bm25.get_top_n(query_tokens, self.contexts, n = k)
        return scores, indices
# parallel search
# Split into single items, go inside, and tokenize each one separately.
retriever = Retriever(tokenize_fn, bm25, contexts, queries, topk)


Rebuild the BM25 index

In [135]:
from retrieval import MyBm25
def get_sparse_embedding(bm25,contexts, tokenize_fn,data_path ="../",k1=1.5, b=0.75, epsilon=0.25) -> "MyBm25":
    """Load a cached BM25 index from disk, or build it and cache it.

    Args:
        bm25: Ignored. Kept only so existing callers keep working; the value
            returned is always the loaded or freshly built index.
        contexts: Iterable of passage strings to index when no cache exists.
        tokenize_fn: Callable applied to each passage to produce its tokens.
        data_path: Directory that holds (or will receive) the ``bm25.bin`` cache.
        k1: Forwarded to ``MyBm25`` when building.
        b: Forwarded to ``MyBm25`` when building.
        epsilon: Forwarded to ``MyBm25`` when building.

    Returns:
        The object unpickled from ``<data_path>/bm25.bin`` when that file
        exists, otherwise a new ``MyBm25`` built from ``contexts`` (which is
        also pickled to that path).
    """
    bm25_path = os.path.join(data_path, "bm25.bin")

    if os.path.isfile(bm25_path):
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # cache files that were produced by this notebook.
        # NOTE: the cached index is returned as-is; contexts / k1 / b / epsilon
        # are not compared against what the cache was built with.
        with open(bm25_path, "rb") as file:
            bm25 = pickle.load(file)
        print("Embedding bm25 pickle load.")
        return bm25

    print("Building bm25... It may take 1 minute and 30 seconds...")
    # Tokenize before constructing the index: per the original note, MyBm25
    # runs a pool internally and tokenizing inside it gave unexpected results.
    tokenized_corpus = [tokenize_fn(c) for c in contexts]
    bm25 = MyBm25(tokenized_corpus, k1 = k1, b = b, epsilon=epsilon)
    with open(bm25_path, "wb") as file:
        pickle.dump(bm25, file)
    print("bm25 pickle saved.")
    return bm25

In [136]:
bm25 = MyBm25(tokenized_corpus)

In [137]:
# Build an index from the pre-tokenized corpus, then load-or-build via the helper.
# NOTE(review): get_sparse_embedding reassigns its `bm25` parameter in both of its
# branches, so the index built on the first line is discarded — confirm it is needed.
bm25 = MyBm25(tokenized_corpus)
se = get_sparse_embedding(bm25,contexts,tokenize_fn)
bm25 = se

Building bm25... It may take 1 minute and 30 seconds...
bm25 pickle saved.


In [139]:
# One query list: questions of datasets['train'], datasets['validation'] and testsets['validation'], in that order.
queries = datasets['train']['question'] + datasets['validation']['question'] + testsets['validation']['question']

In [85]:
len(queries)

600

In [86]:
# Tokenize each query and collect the BM25 get_scores result for it.
doc_scores = [bm25.get_scores(tokenize_fn(query)) for query in tqdm(queries)]
print("done!")

  0%|          | 0/600 [00:00<?, ?it/s]

done!


In [87]:
# Convert the collected scores to one tensor (via a NumPy array).
doc_scores = torch.tensor(np.array(doc_scores))

In [88]:
# Along dim 1, order the indices by descending score; squeeze() then drops any size-1 dimension.
ranks = torch.argsort(doc_scores, dim=1, descending=True).squeeze()

In [89]:

# Flatten the first `topk` ranked passages of every row of `ranks` into one
# list, row by row.
topk = 100

context_list = [
    contexts[ranks[row][col]]
    for row in range(len(ranks))
    for col in range(topk)
]


In [90]:
context_list[1]

'갈리프레이 (Gallifrey)는 영국의 SF 텔레비전 드라마 《닥터 후》에서 등장하는 행성이다. 드라마의 주인공인 닥터와 마스터를 비롯한 지금까지 등장한 모든 타임 로드의 고향이다. 카스터보로스 성단 내에서    "은하 중심에서 은하좌표로 10-0-11-0-0 하고도 0-2 지점"에 위치해 있으며   쌍성계를 이루고 있다. \n\n닥터의 고향 행성은 드라마 방영 초반에는 밝혀지지 않다가, 2대 닥터 에피소드인 The War Games (1969)에서 닥터의 고향 행성이 처음으로 등장했다.  닥터의 행성을 \'갈리프레이\'라는 이름으로 확실히 구분하게 된 것은 3대 닥터 시절의 The Time Warrior (1973–74)에 이르러서였다.  갈리프레이가 언제 처음 나타났는지에 대해서는 명확히 밝혀진 바가 없다. 행성 자체가 시간 여행을 통해 접근하는 경우가 많은 만큼, 갈리프레이에서의 \'지금\'은 지구에서는 과거나 미래나 어느 때든지 상대적으로 존재할 수 있다.\n\n2005년 드라마가 부활하면서 갈리프레이에 대한 설정도 옛 시즌의 설정을 조금씩 채워나가고 있다. 시즌 1 에피소드 "The End of the World"에서 9대 닥터는 갈리프레이의 모습을 "그 날이 오기 전까진" "바위와 먼지"밖에 없었다고 말하고, 이후로는 시간 전쟁이 일어나면서 50억년 뒤의 지구처럼 갈리프레이도 타임 로드와 함께 모두 "불타 버렸다"고 언급한다.  이후 갈리프레이는 등장하지 않다가 2006년 크리스마스 스페셜 "The Runaway Bride"에서 다시 언급된다.  시즌 3의 "The Sound of Drums"에서는 회상 장면 중에 처음 그 풍경이 묘사되며  The End of Time (2009–10)에서는 줄거리의 주요 무대로까지 등장한다.  It appeared briefly in the 시즌 7 마지막화 "The Name of the Doctor"에서도 짧게 등장하며, 1대 닥터와 수잔이 타디스를 훔치는 순간까지 등장한다.  "The Day of the D

In [93]:
len(context_list)

26825

In [92]:
# De-duplicate while keeping first-seen order. set() does not preserve insertion
# order (and for str elements its order changes between interpreter runs because
# of hash randomization), which would make the later positional split of
# context_list non-reproducible. dict.fromkeys keeps insertion order.
context_list = list(dict.fromkeys(context_list))

In [None]:
def wiki_preprocess(data_dict):
    """Normalize the ``"text"`` field of ``data_dict`` in place and return the dict.

    Steps, in order:
      1. Replace every literal backslash-n sequence (the two characters
         ``\\`` + ``n``) with a space.
      2. Collapse every whitespace run (including real newlines) into one space.
      3. Replace every ``#`` with a space. This runs after the collapse, so
         a ``#`` next to a space leaves consecutive spaces in the result.

    The original also ran separate passes for doubled newlines and doubled
    literal ``\\n``; those could never match after the single-occurrence passes
    had run, so they are dropped without changing the result.

    Args:
        data_dict: Mapping with a string under the key ``"text"``.

    Returns:
        The same ``data_dict`` object, with ``"text"`` replaced.
    """
    text = data_dict["text"]
    text = text.replace("\\n", " ")
    text = re.sub(r"\s+", " ", text)
    text = text.replace("#", " ")
    data_dict["text"] = text
    return data_dict

In [32]:
# Positional split: the first 1000 entries become train_c, the rest valid_c.
train_c, valid_c = context_list[:1000], context_list[1000:]

In [38]:
# Turn every entry into one space-separated string.
# NOTE(review): this assumes each entry of train_c / valid_c is a sequence of
# passages; if an entry is a plain string, join() puts a space between every
# character — confirm against how context_list was built.
added_train = [' '.join(map(str, entry)) for entry in train_c]

added_valid = [' '.join(map(str, entry)) for entry in valid_c]

In [41]:
len(datasets['train']['answers'][-1000:])

1000

In [42]:
# Assemble and save the "added" dataset: the last 1000 training examples and the
# full validation split, with their contexts replaced by added_train / added_valid.
train_split, valid_split = datasets['train'], datasets['validation']
last_1000 = slice(-1000, None)

traindata = Dataset.from_dict({
    'answers': train_split['answers'][last_1000],
    'context': added_train,
    'id': train_split['id'][last_1000],
    'question': train_split['question'][last_1000],
})

validdata = Dataset.from_dict({
    'answers': valid_split['answers'],
    'context': added_valid,
    'id': valid_split['id'],
    'question': valid_split['question'],
})

top5 = DatasetDict({"train": traindata, "validation": validdata})
top5.save_to_disk("../data/added_dataset/")

In [49]:
# Assemble and save the reduced training set: every training example except the
# last 1000, plus the unchanged validation split.
train_split, valid_split = datasets['train'], datasets['validation']
all_but_last_1000 = slice(None, -1000)
columns = ('answers', 'context', 'id', 'question')

traindata = Dataset.from_dict(
    {column: train_split[column][all_but_last_1000] for column in columns}
)

validdata = Dataset.from_dict(
    {column: valid_split[column] for column in columns}
)

top5 = DatasetDict({"train": traindata, "validation": validdata})
top5.save_to_disk("../data/new_train_dataset/")

In [126]:
# Wrap valid_c in a Dataset with a single 'text' column.
txt = Dataset.from_dict({'text':valid_c})

In [127]:
# Persist txt under ../data/valid_wiki/.
txt.save_to_disk("../data/valid_wiki/")

In [4]:
# Load the dataset stored under ../data/valid_wiki/.
a = load_from_disk("../data/valid_wiki/")

In [51]:
a

Dataset({
    features: ['text'],
    num_rows: 43979
})

In [50]:
len(traindata), len(validdata)

(2952, 240)

In [90]:
# Write every entry of context_list followed by a newline.
# The encoding is set explicitly: without it open() uses the platform default,
# which may not be able to encode the Korean passages (the wiki JSON is read as UTF-8).
# NOTE(review): passages that themselves contain newlines will span several
# lines in this file — confirm that consumers can cope with that.
with open('./new_wiki.txt', 'w', encoding='utf-8') as f:
    for item in context_list:
        f.write("%s\n" % item)

In [54]:
# Concatenate the train contexts, the validation contexts and context_list.
cs = datasets['train']['context'] + datasets['validation']['context'] + context_list

In [56]:
len(list(set(cs)))

43985

In [58]:
# De-duplicate while keeping first-seen order. set() does not preserve insertion
# order (and for str elements its order changes between interpreter runs because
# of hash randomization), so anything that depends on the position of an entry
# would not be reproducible. dict.fromkeys keeps insertion order.
context_list = list(dict.fromkeys(cs))

Validation

In [5]:
from torch.nn.functional import softmax

In [6]:
# Rebind contexts to the 'text' column of `a`; the cells after this one index into this list.
contexts = a['text']

In [7]:
# Gold contexts of the validation split.
ground_truth = datasets['validation']['context']

In [8]:
# Questions of the validation split.
queries = datasets['validation']['question']

BM25

In [9]:
from retrieval import MyBm25
def get_sparse_embedding(bm25,contexts, tokenize_fn,data_path ="../",k1=1.5, b=0.75, epsilon=0.25) -> "MyBm25":
    """Load a cached BM25 index from disk, or build it and cache it.

    Args:
        bm25: Ignored. Kept only so existing callers keep working; the value
            returned is always the loaded or freshly built index.
        contexts: Iterable of passage strings to index when no cache exists.
        tokenize_fn: Callable applied to each passage to produce its tokens.
        data_path: Directory that holds (or will receive) the ``bm25.bin`` cache.
        k1: Forwarded to ``MyBm25`` when building.
        b: Forwarded to ``MyBm25`` when building.
        epsilon: Forwarded to ``MyBm25`` when building.

    Returns:
        The object unpickled from ``<data_path>/bm25.bin`` when that file
        exists, otherwise a new ``MyBm25`` built from ``contexts`` (which is
        also pickled to that path).
    """
    bm25_path = os.path.join(data_path, "bm25.bin")

    if os.path.isfile(bm25_path):
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # cache files that were produced by this notebook.
        # NOTE: the cached index is returned as-is; contexts / k1 / b / epsilon
        # are not compared against what the cache was built with.
        with open(bm25_path, "rb") as file:
            bm25 = pickle.load(file)
        print("Embedding bm25 pickle load.")
        return bm25

    print("Building bm25... It may take 1 minute and 30 seconds...")
    # Tokenize before constructing the index: per the original note, MyBm25
    # runs a pool internally and tokenizing inside it gave unexpected results.
    tokenized_corpus = [tokenize_fn(c) for c in contexts]
    bm25 = MyBm25(tokenized_corpus, k1 = k1, b = b, epsilon=epsilon)
    with open(bm25_path, "wb") as file:
        pickle.dump(bm25, file)
    print("bm25 pickle saved.")
    return bm25

In [14]:
# Load the klue/bert-base tokenizer and tokenize every passage in `contexts`.
# NOTE(review): padding/truncation/return_tensors are forwarded to `tokenize`,
# unlike the plain tokenize_fn(c) call inside get_sparse_embedding — confirm intent.
model_checkpoint = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenize_fn = tokenizer.tokenize
p_embs = []
tokenized_corpus = [
    tokenize_fn(passage, padding="max_length", truncation=True, return_tensors='pt')
    for passage in tqdm(contexts)
]

100%|██████████| 14941/14941 [00:28<00:00, 530.42it/s]


In [17]:
# Build an index from the pre-tokenized corpus, then load-or-build via the helper.
# NOTE(review): get_sparse_embedding reassigns its `bm25` parameter in both of its
# branches, so the index built on the first line is discarded — confirm it is needed.
bm25 = MyBm25(tokenized_corpus)
se = get_sparse_embedding(bm25,contexts,tokenize_fn)
bm25 = se

Embedding bm25 pickle load.


In [18]:
# Collect one bm25.get_scores result per tokenized query.
doc_scores = []
for query in tqdm(queries):
    doc_scores.append(bm25.get_scores(tokenize_fn(query)))
print("done!")

  0%|          | 0/240 [00:00<?, ?it/s]

done!


In [19]:

from transformers import AutoTokenizer,BertPreTrainedModel,BertModel
import numpy as np

#tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

from torch.utils.data import (DataLoader, RandomSampler, TensorDataset)

class BertEncoder(BertPreTrainedModel):
    """BERT wrapper whose forward pass returns element 1 of the BertModel outputs."""

    def __init__(self, config):
        """Create the underlying BertModel from ``config`` and initialise weights."""
        super().__init__(config)

        self.bert = BertModel(config)
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Run BERT on the inputs and return ``outputs[1]``.

        The original code names that element ``pooled_output``.
        """
        bert_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return bert_outputs[1]


# load pre-trained model on cuda (if available)
# The passage encoder and the question encoder both start from model_checkpoint.
p_encoder = BertEncoder.from_pretrained(model_checkpoint)
q_encoder = BertEncoder.from_pretrained(model_checkpoint)

if torch.cuda.is_available():
  p_encoder.cuda()
  q_encoder.cuda()


Some weights of the model checkpoint at klue/bert-base were not used when initializing BertEncoder: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at klue/bert-base were not used when initializing BertEncoder: ['cls.predictions.tr

In [20]:
# Restore the saved passage / question encoder weights.
# map_location="cpu" lets checkpoints that were saved from a GPU load on
# CPU-only machines too; load_state_dict copies the values into the existing
# parameters, so each encoder stays on whatever device it is already on.
# NOTE(security): torch.load unpickles the file — only load trusted checkpoints.
p_encoder.load_state_dict(torch.load("../dpr/neg20_p_encoder_14.pt", map_location="cpu"))
q_encoder.load_state_dict(torch.load("../dpr/neg20_q_encoder_14.pt", map_location="cpu"))

<All keys matched successfully>

In [21]:
# Move both encoders to the GPU when one is available.
if torch.cuda.is_available():
  p_encoder.cuda()
  q_encoder.cuda()


In [22]:
tokenize_fn = tokenizer.tokenize

In [23]:
# Encode every passage with the passage encoder and, in the same pass, collect
# its token list.
# Inputs are sent to the device the encoder lives on instead of a hard-coded
# 'cuda', so the cell also runs on CPU-only machines (the encoders are only
# moved to the GPU when torch.cuda.is_available()).
p_device = next(p_encoder.parameters()).device
with torch.no_grad():
  p_encoder.eval()
  tokenized_corpus = []
  p_embs = []
  for p in tqdm(contexts):
    p_t = tokenizer(p, padding="max_length", truncation=True, return_tensors='pt').to(p_device)
    p_emb = p_encoder(**p_t).to('cpu').numpy()
    p_embs.append(p_emb)
    tokenized_corpus.append(tokenize_fn(p, padding="max_length", truncation=True, return_tensors='pt'))
p_embs = torch.Tensor(p_embs).squeeze()  # (num_passage, emb_dim)


  0%|          | 0/14941 [00:00<?, ?it/s]

In [24]:
# Encode all queries in one batch with the question encoder.
# Inputs are sent to the device the encoder lives on instead of a hard-coded
# 'cuda', so the cell also runs on CPU-only machines (the encoders are only
# moved to the GPU when torch.cuda.is_available()).
q_device = next(q_encoder.parameters()).device
with torch.no_grad():
  q_encoder.eval()
  q_seqs_val = tokenizer(queries, padding="max_length", truncation=True, return_tensors='pt').to(q_device)
  q_emb = q_encoder(**q_seqs_val).to('cpu')  #(num_query, emb_dim)

p_embs = torch.Tensor(p_embs).squeeze()  # (num_passage, emb_dim)

print(p_embs.size(), q_emb.size())

torch.Size([14941, 768]) torch.Size([240, 768])


In [25]:
# Hybrid retrieval evaluation.
# BM25 scores and DPR dot-product scores are each softmax-normalised per query
# and fused as  bm25 + rate * dpr.  For every (k, rate) pair, score_Dict[k]
# receives [label, hit rate, mean 1-based rank of the hits], where a hit means
# the gold context appears among the k best-ranked passages.
doc_scores = torch.tensor(np.array(doc_scores))
dot_prod_scores = torch.matmul(q_emb, torch.transpose(p_embs, 0, 1))
print(dot_prod_scores.size(),len(doc_scores),len(doc_scores[0]))
assert dot_prod_scores.shape == doc_scores.shape, "DPR and BM25 score matrices must have the same shape"

dpr_score = softmax(dot_prod_scores,dim=1)
bm25_score = softmax(doc_scores,dim=1)

k_list = [1,10,15,20,30]
rate_list=[0.0,0.1,0.2,0.5]

score_Dict = {k:[] for k in k_list}

# The ranking depends only on `rate`, so it is computed once per rate and
# reused for every cutoff k. Each score_Dict[k] list is still filled in
# rate_list order.
for rate in tqdm(rate_list):
    total_score = dpr_score * rate + bm25_score
    # No squeeze(): the (num_query, num_passage) layout must survive even
    # when there is a single query.
    ranks = torch.argsort(total_score, dim=1, descending=True)

    for k in k_list:
        hit_ranks = []
        for q_idx, top_ids in enumerate(ranks[:, :k].tolist()):
            for position, passage_idx in enumerate(top_ids, start=1):
                if ground_truth[q_idx] == contexts[passage_idx]:
                    hit_ranks.append(position)

        hit_rate = len(hit_ranks) / len(ranks)
        # With no hit at all the mean rank is undefined; report NaN rather
        # than raising ZeroDivisionError.
        mean_hit_rank = sum(hit_ranks) / len(hit_ranks) if hit_ranks else float("nan")
        score_Dict[k].append([f"dpr-rate : {rate}", hit_rate, mean_hit_rank])

torch.Size([240, 14941]) 240 14941


  0%|          | 0/5 [00:00<?, ?it/s]

In [152]:
# Jaehyung's encoder
score_Dict

{1: [['dpr-rate : 0.0', 0.55, 1.0],
  ['dpr-rate : 0.1', 0.5625, 1.0],
  ['dpr-rate : 0.2', 0.575, 1.0],
  ['dpr-rate : 0.5', 0.6083333333333333, 1.0]],
 10: [['dpr-rate : 0.0', 0.9041666666666667, 1.9262672811059909],
  ['dpr-rate : 0.1', 0.925, 1.9684684684684686],
  ['dpr-rate : 0.2', 0.9291666666666667, 1.9955156950672646],
  ['dpr-rate : 0.5', 0.9125, 1.8447488584474885]],
 15: [['dpr-rate : 0.0', 0.9083333333333333, 1.981651376146789],
  ['dpr-rate : 0.1', 0.9333333333333333, 2.049107142857143],
  ['dpr-rate : 0.2', 0.9333333333333333, 2.0401785714285716],
  ['dpr-rate : 0.5', 0.925, 1.9864864864864864]],
 20: [['dpr-rate : 0.0', 0.9166666666666666, 2.1227272727272726],
  ['dpr-rate : 0.1', 0.9375, 2.1244444444444444],
  ['dpr-rate : 0.2', 0.9375, 2.12],
  ['dpr-rate : 0.5', 0.9375, 2.1911111111111112]],
 30: [['dpr-rate : 0.0', 0.9541666666666667, 3.021834061135371],
  ['dpr-rate : 0.1', 0.9458333333333333, 2.3480176211453743],
  ['dpr-rate : 0.2', 0.9458333333333333, 2.33920704

In [26]:
score_Dict

{1: [['dpr-rate : 0.0', 0.55, 1.0],
  ['dpr-rate : 0.1', 0.5708333333333333, 1.0],
  ['dpr-rate : 0.2', 0.5958333333333333, 1.0],
  ['dpr-rate : 0.5', 0.65, 1.0]],
 10: [['dpr-rate : 0.0', 0.9041666666666667, 1.9262672811059909],
  ['dpr-rate : 0.1', 0.9166666666666666, 1.9272727272727272],
  ['dpr-rate : 0.2', 0.9166666666666666, 1.9227272727272726],
  ['dpr-rate : 0.5', 0.9125, 1.8812785388127853]],
 15: [['dpr-rate : 0.0', 0.9083333333333333, 1.981651376146789],
  ['dpr-rate : 0.1', 0.9208333333333333, 1.9864253393665159],
  ['dpr-rate : 0.2', 0.9208333333333333, 1.9819004524886878],
  ['dpr-rate : 0.5', 0.9208333333333333, 1.9728506787330318]],
 20: [['dpr-rate : 0.0', 0.9166666666666666, 2.1227272727272726],
  ['dpr-rate : 0.1', 0.925, 2.063063063063063],
  ['dpr-rate : 0.2', 0.9208333333333333, 1.9819004524886878],
  ['dpr-rate : 0.5', 0.9208333333333333, 1.9728506787330318]],
 30: [['dpr-rate : 0.0', 0.9541666666666667, 3.021834061135371],
  ['dpr-rate : 0.1', 0.9375, 2.36444444