In [1]:
import json
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from tqdm import trange

from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    BertModel, RobertaModel,
    BertPreTrainedModel,
    AdamW, get_linear_schedule_with_warmup,
    TrainingArguments,
)
from datasets import (
    Dataset,
    load_from_disk,
    concatenate_datasets,
)

from typing import List

In [2]:
# 난수 고정
def set_seed(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)  # if use multi-GPU
    random.seed(random_seed)
    np.random.seed(random_seed)
    
set_seed(42) # magic number :)

In [3]:
print ("PyTorch version:[%s]."%(torch.__version__))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print ("device:[%s]."%(device))

PyTorch version:[1.7.1].
device:[cuda:0].


In [4]:
from adamp import AdamP
class DenseRetrieval:
    def __init__(self,
        args,
        dataset,
        tokenizer,
        p_encoder,
        q_encoder
    ):
        """
        학습과 추론에 사용될 여러 셋업을 마쳐봅시다.
        """

        self.args = args
        self.dataset = dataset

        self.tokenizer = tokenizer
        self.p_encoder = p_encoder
        self.q_encoder = q_encoder

    def train(self, args=None, tokenizer = None):
        if args is None:
            args = self.args
        if tokenizer is None :
            tokenizer = self.tokenizer

        q_seqs = tokenizer(self.dataset['question'], padding="max_length", truncation=True, return_tensors='pt')
        p_seqs = tokenizer(self.dataset['context'], padding="max_length", truncation=True, return_tensors='pt')

        train_dataset = TensorDataset(p_seqs['input_ids'], p_seqs['attention_mask'], p_seqs['token_type_ids'], 
                        q_seqs['input_ids'], q_seqs['attention_mask'], q_seqs['token_type_ids'])
        train_dataloader = DataLoader(train_dataset, batch_size=args.per_device_train_batch_size)

        no_decay = ["bias" ,"LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {"params": [p for n, p in self.p_encoder.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay},
            {"params": [p for n, p in self.p_encoder.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
            {"params": [p for n, p in self.q_encoder.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay},
            {"params": [p for n, p in self.q_encoder.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
        ]
        optimizer = AdamP(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            # eps=args.adam_epsilon
        )

        # t_total = len(train_dataloader) * args.num_train_epochs
        # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
        
        global_step = 0

        self.p_encoder.zero_grad()
        self.q_encoder.zero_grad()
        torch.cuda.empty_cache()

        train_iterator = trange(int(args.num_train_epochs), desc="Epoch")

        for epoch, _ in enumerate(train_iterator):
            epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            # loss_value=0 # Accumulation할 때 진행
            losses = 0
            for step, batch in enumerate(epoch_iterator):
                self.q_encoder.train()
                self.p_encoder.train()
                
                if torch.cuda.is_available():
                    batch = tuple(t.cuda() for t in batch)

                p_inputs = {'input_ids': batch[0],
                            'attention_mask': batch[1],
                            'token_type_ids': batch[2]
                            }
                
                q_inputs = {'input_ids': batch[3],
                            'attention_mask': batch[4],
                            'token_type_ids': batch[5]}
            
                p_outputs = self.p_encoder(**p_inputs)  # (batch_size, emb_dim)
                q_outputs = self.q_encoder(**q_inputs)  # (batch_size, emb_dim)

                # Calculate similarity score & loss
                sim_scores = torch.matmul(q_outputs, torch.transpose(p_outputs, 0, 1))  # (batch_size, emb_dim) x (emb_dim, batch_size) = (batch_size, batch_size)

                # target: position of positive samples = diagonal element 
                targets = torch.arange(0, args.per_device_train_batch_size).long()
                if torch.cuda.is_available():
                    targets = targets.to('cuda')

                # almost same as cross entropy loss
                sim_scores = F.log_softmax(sim_scores, dim=1)
                loss = F.nll_loss(sim_scores, targets)

                losses += loss.item()
                if step % 100 == 0 :
                    print(f'{epoch}epoch loss: {losses/(step+1)}') # Accumulation할 경우 주석처리

                loss.backward()
                #################ACCUMULATION###############################
                # loss_value += loss
                # if (step+1) % args.gradient_accumulation_steps == 0 :
                #     optimizer.step()
                #     scheduler.step()
                #     self.q_encoder.zero_grad()
                #     self.p_encoder.zero_grad()
                #     global_step += 1
                #     print(loss_value/args.gradient_accumulation_steps)
                #     loss_value = 0
                ############################################################
                optimizer.step()
                # scheduler.step()
                self.q_encoder.zero_grad()
                self.p_encoder.zero_grad()
                global_step += 1
                
                torch.cuda.empty_cache()
                del p_inputs, q_inputs



In [5]:
class BertEncoder(BertPreTrainedModel):
    def __init__(self, config):
        super(BertEncoder, self).__init__(config)

        self.bert = BertModel(config)
        self.init_weights()
      
    def forward(
            self,
            input_ids, 
            attention_mask=None,
            token_type_ids=None
        ): 

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        pooled_output = outputs[
            "pooler_output"
        ]  # [CLS] token's hidden featrues(hidden state)

        # pooled_output = outputs[1] # TODO: check if it is 
        return pooled_output

In [6]:
dataset = load_from_disk('/opt/ml/data/train_dataset')
train_dataset = dataset['train']

In [7]:
args = TrainingArguments(
    output_dir="dense_retireval",
    evaluation_strategy="epoch",
    learning_rate=5e-5, # recommended learning rate is 1e-5
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.01
)
model_checkpoint = "klue/bert-base"

# 혹시 위에서 사용한 encoder가 있다면 주석처리 후 진행해주세요 (CUDA ...)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
p_encoder = BertEncoder.from_pretrained(model_checkpoint).to(args.device)
q_encoder = BertEncoder.from_pretrained(model_checkpoint).to(args.device)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertEncoder: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at klue/bert-base were not used when initializing BertEncoder: ['cls.predictions.bi

In [8]:
# Retriever는 아래와 같이 사용할 수 있도록 코드를 짜봅시다.
retriever = DenseRetrieval(
    args=args,
    dataset=train_dataset,
    tokenizer=tokenizer,
    p_encoder=p_encoder,
    q_encoder=q_encoder
)
retriever.train()

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=247.0, style=ProgressStyle(description_wi…

0epoch loss: 45.3947868347168
0epoch loss: 2.176331270301696
0epoch loss: 1.360393577945692


Epoch: 100%|██████████| 1/1 [05:57<00:00, 357.06s/it]







In [25]:
with open('/opt/ml/data/wikipedia_documents.json', "r", encoding="utf-8") as f:
    wiki = json.load(f)

corpus = list(
    dict.fromkeys([v["text"] for v in wiki.values()])
)  # set 은 매번 순서가 바뀌므로

In [26]:
p_encoder = retriever.p_encoder
q_encoder = retriever.q_encoder
with torch.no_grad() :
    p_encoder.eval()

    p_embs = []
    for p in tqdm(corpus[:100]) :
        p = tokenizer([p], padding='max_length', truncation=True, return_tensors='pt').to('cuda')
        p_emb = p_encoder(**p).to('cpu').numpy()
        p_embs.append(p_emb)
p_embs = torch.Tensor(p_embs).squeeze()

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [19]:
# get cosine similarity between two arrays
def cos_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# p_embs[0][0] == p_embs[1][0]

cos_sim(p_embs[5][0], p_embs[0][0])

1.0

In [24]:
tmp = tokenizer([corpus[3]], padding = 'max_length', truncation=True, return_tensors='pt').to('cuda')
tmp_emb = p_encoder(**tmp).to('cpu').detach().numpy()
tmp_emb

tmp2 = tokenizer([corpus[3]], padding = 'max_length', truncation=True, return_tensors='pt').to('cuda')
tmp2_emb = p_encoder(**tmp2).to('cpu').detach().numpy()
tmp2_emb



array([[ 0.7851726 , -0.48778328,  0.82468414, -0.9994512 ,  0.66243756,
         0.56024635, -0.99997944,  0.07688553,  0.9984877 , -0.9993106 ,
         0.01102075,  0.9790746 , -0.53845143, -0.3455528 ,  0.44703203,
        -0.99937475, -0.9993723 , -0.81930596,  0.7370982 ,  0.63658166,
         0.4322106 , -0.99978834, -0.7025193 ,  0.889094  , -0.85656047,
        -0.7376723 , -0.99999547, -0.99673414, -0.97281915, -0.99313813,
        -0.9802016 , -0.9999815 , -0.39314282,  0.99996483,  0.29816154,
        -0.88221055, -0.9993746 , -0.9999418 ,  0.86694795, -0.69002855,
         0.8707746 , -0.7348956 , -0.9999805 ,  0.9999823 ,  0.99774224,
        -0.9352857 , -0.99852043, -0.40910897,  0.8847097 , -0.48227432,
         0.9998008 , -0.39056444,  0.87851715,  0.503014  ,  0.88798594,
         0.35612586,  0.9643091 , -0.99999905, -0.01133974, -0.97378886,
        -0.7819594 , -0.06617206,  0.98293704, -0.94270927, -0.7929846 ,
        -0.15542118, -0.44325835, -0.87392616,  0.9

In [12]:
corpus[0]

'이 문서는 나라 목록이며, 전 세계 206개 나라의 각 현황과 주권 승인 정보를 개요 형태로 나열하고 있다.\n\n이 목록은 명료화를 위해 두 부분으로 나뉘어 있다.\n\n# 첫 번째 부분은 바티칸 시국과 팔레스타인을 포함하여 유엔 등 국제 기구에 가입되어 국제적인 승인을 널리 받았다고 여기는 195개 나라를 나열하고 있다.\n# 두 번째 부분은 일부 지역의 주권을 사실상 (데 팍토) 행사하고 있지만, 아직 국제적인 승인을 널리 받지 않았다고 여기는 11개 나라를 나열하고 있다.\n\n두 목록은 모두 가나다 순이다.\n\n일부 국가의 경우 국가로서의 자격에 논쟁의 여부가 있으며, 이 때문에 이러한 목록을 엮는 것은 매우 어렵고 논란이 생길 수 있는 과정이다. 이 목록을 구성하고 있는 국가를 선정하는 기준에 대한 정보는 "포함 기준" 단락을 통해 설명하였다. 나라에 대한 일반적인 정보는 "국가" 문서에서 설명하고 있다.'

In [11]:
p_embs

tensor([[ 0.9144,  0.0094,  0.6034,  ..., -0.4013,  0.8149, -0.9985],
        [ 0.9143,  0.0093,  0.6033,  ..., -0.4012,  0.8148, -0.9985],
        [ 0.9143,  0.0094,  0.6033,  ..., -0.4011,  0.8148, -0.9985],
        ...,
        [ 0.9144,  0.0094,  0.6035,  ..., -0.4010,  0.8148, -0.9985],
        [ 0.9144,  0.0091,  0.6034,  ..., -0.4011,  0.8149, -0.9985],
        [ 0.9144,  0.0093,  0.6036,  ..., -0.4013,  0.8148, -0.9985]])

In [79]:
check = []
for i, aa in enumerate(corpus) :
    a = tokenizer(aa, padding='max_length', truncation=True, return_tensors='pt').to('cuda')
    check.append(a)
    if i == 2 :
        break

In [27]:
check[0]['input_ids'][0][:10]

NameError: name 'check' is not defined

In [77]:
check[1]['input_ids'][0][:10]

tensor([    2,  1504, 10188,  2170, 11381,  3728,  3872,  2073, 20998,  2440],
       device='cuda:0')

In [28]:
def get_relavant_doc(query, q_encoder, p_embs, k=1) :

    with torch.no_grad() :
        q_encoder.eval()
        
        q_seqs_val = tokenizer(
                    [query],
                    padding="max_length",
                    truncation=True,
                    return_tensors="pt"
        ).to(args.device)
        q_emb = q_encoder(**q_seqs_val).to("cpu")  # (num_query=1, emb_dim)

    dot_prod_scores = torch.matmul(q_emb, torch.transpose(p_embs, 0, 1))
    rank = torch.argsort(dot_prod_scores, dim=1, descending=True).squeeze()

    return dot_prod_scores, rank[:k]

In [29]:
p_embs

tensor([[ 0.9113, -0.4006,  0.8705,  ..., -0.2941,  0.8215,  0.9828],
        [ 0.8962, -0.5467,  0.9159,  ..., -0.1864,  0.7883,  0.9975],
        [ 0.8993, -0.4746,  0.7946,  ..., -0.2417,  0.8237,  0.9943],
        ...,
        [ 0.8058, -0.2533,  0.8524,  ..., -0.2027,  0.8292,  0.9790],
        [ 0.8413, -0.2148,  0.8774,  ..., -0.1120,  0.7612,  0.9868],
        [ 0.7264, -0.1229,  0.8096,  ..., -0.1845,  0.8155,  0.9918]])

In [34]:
dataset['validation']['question'][0]

'처음으로 부실 경영인에 대한 보상 선고를 받은 회사는?'

In [39]:
doc_scores, doc_indices = get_relavant_doc(dataset['validation']['question'][3], q_encoder, p_embs, k = 2)


In [40]:
doc_indices

tensor([98, 97])

In [52]:
def get_relavant_doc(queries: List, q_encoder, p_embs, k=1) :

    with torch.no_grad() :
        q_encoder.eval()
        q_embs = []
        for q in queries :
            q = tokenizer([q], padding='max_length', truncation=True, return_tensors='pt').to('cuda')
            q_emb = q_encoder(**q).to('cpu').numpy()
            q_embs.append(q_emb)
    q_embs = torch.Tensor(q_embs).squeeze()

    result = torch.matmul(q_embs, torch.transpose(p_embs, 0, 1))
    if not isinstance(result, np.ndarray) :
        result = result.cpu().detach().numpy()

    doc_scores = []
    doc_indices = []
    for i in range(result.shape[0]) :
        sorted_result = np.argsort(result[i, :][::-1])
        doc_scores.append(result[i, :][sorted_result].tolist()[:k])
        doc_indices.append(sorted_result.tolist()[:k])

    return result, doc_scores, doc_indices

In [53]:
total = []
result, doc_scores, doc_indices = get_relavant_doc(dataset['validation']['question'], q_encoder, p_embs, k = 2)


In [54]:
doc_indices

[[52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157],
 [52442, 36157

In [22]:
doc_indices

[[52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [35624],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [29703],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [35624],
 [52442],
 [29711],
 [52442],
 [52442],
 [52442],
 [49808],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [48782],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [49808],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],
 [52442],


In [12]:

for idx, example in enumerate(
    tqdm(dataset['validation'], desc="Sparse retrieval: ")
):
    tmp = {
        # Query와 해당 id를 반환합니다.
        "question": example["question"],
        "id": example["id"],
        # Retrieve한 Passage의 id, context를 반환합니다.
        "context_id": doc_indices[idx],
        "context": " ".join(  # 기존에는 ' '.join()
            [corpus[pid] for pid in doc_indices[idx]]
        ),
    }
    if "context" in example.keys() and "answers" in example.keys():
        # validation 데이터를 사용하면 ground_truth context와 answer도 반환합니다.
        tmp["original_context"] = example["context"]
        tmp["answers"] = example["answers"]
    total.append(tmp)
cqas = pd.DataFrame(total)

TypeError: get_relavant_doc() missing 2 required positional arguments: 'q_encoder' and 'p_embs'

In [None]:
from datasets import (
    Sequence,
    Value,
    Features,
    Dataset,
    DatasetDict,
)
f = Features(
    {
        "answers": Sequence(
            feature={
                "text": Value(dtype="string", id=None),
                "answer_start": Value(dtype="int32", id=None),
            },
            length=-1,
            id=None,
        ),
        "context": Value(dtype="string", id=None),
        "id": Value(dtype="string", id=None),
        "question": Value(dtype="string", id=None),
    }
)
datasets = DatasetDict({"validation": Dataset.from_pandas(df, features=f)})

## Reader