In [10]:
# !pip install datasets==1.13.3 -q
# !pip install transformers==4.11.3 -q

In [28]:
import json
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

from datasets import load_dataset, load_from_disk
from transformers import (
    AutoTokenizer,
    BertModel, BertPreTrainedModel,
    AdamW, get_linear_schedule_with_warmup,
    TrainingArguments,
    RobertaModel, RobertaPreTrainedModel

)
import transformers

In [29]:
transformers.__version__

'4.11.3'

In [30]:
# 난수 고정
def set_seed(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)  # if use multi-GPU
    random.seed(random_seed)
    np.random.seed(random_seed)
    
set_seed(42) # magic number :)

In [31]:
print ("PyTorch version:[%s]."%(torch.__version__))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print ("device:[%s]."%(device))

PyTorch version:[1.7.1].
device:[cuda:0].


In [46]:
class DenseRetrieval:
  def __init__(self, args, dataset, num_neg, tokenizer, p_encoder, q_encoder):

    self.args = args
    self.dataset = dataset
    self.num_neg = num_neg

    self.tokenizer = tokenizer
    self.p_encoder = p_encoder
    self.q_encoder = q_encoder

    self.prepare_in_batch_negative(num_neg=num_neg)

  def prepare_in_batch_negative(self, dataset=None, num_neg=2, tokenizer=None):
    if dataset is None:
      dataset = self.dataset
    
    if tokenizer is None:
      tokenizer = self.tokenizer
    
    corpus = np.array(list(set([example for example in dataset['context']])))
    p_with_neg = []

    for c in dataset['context']:
      while True:
        neg_idxs = np.random.randint(len(corpus), size=num_neg)

        if c not in corpus[neg_idxs]:
          p_neg = corpus[neg_idxs]

          p_with_neg.append(c)
          p_with_neg.extend(p_neg)
          break

    q_seqs = tokenizer(
      dataset['question'],
      padding = 'max_length',
      truncation = True,
      return_tensors = 'pt',
      return_token_type_ids=False,
    )

    p_seqs = tokenizer(
      p_with_neg,
      padding = 'max_length',
      truncation=True,
      return_tensors = 'pt',
      return_token_type_ids=False,
    )

    max_len = p_seqs['input_ids'].size(-1)
    p_seqs['input_ids'] = p_seqs['input_ids'].view(-1, num_neg+1, max_len)
    p_seqs['attention_mask'] = p_seqs['attention_mask'].view(-1, num_neg+1, max_len)
    print(len(p_with_neg))
    print(p_seqs['input_ids'].shape)
    print(p_seqs['attention_mask'].shape)
    print(q_seqs['input_ids'].shape)
    print(q_seqs['attention_mask'].shape)

    train_dataset = TensorDataset(
      p_seqs['input_ids'], p_seqs['attention_mask'],
      q_seqs['input_ids'], q_seqs['attention_mask']
    )

    self.train_dataloader = DataLoader(
      train_dataset,
      shuffle=True,
      batch_size = self.args.per_device_train_batch_size
    )

    valid_seqs = tokenizer(
      dataset['context'],
      padding='max_length',
      truncation=True,
      return_tensors = 'pt'
    )

    passage_dataset = TensorDataset(
      valid_seqs['input_ids'],
      valid_seqs['attention_mask']
    )

    self.passage_dataloader = DataLoader(
      passage_dataset,
      batch_size = self.args.per_device_train_batch_size
    )

  def train(self, args=None):
    if args is None:
      args = self.args
    batch_size = args.per_device_train_batch_size

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
      {"params": [p for n, p in self.p_encoder.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay},
      {"params": [p for n, p in self.p_encoder.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
      {"params": [p for n, p in self.q_encoder.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay},
      {"params": [p for n, p in self.q_encoder.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
    ]

    optimizer = AdamW(
      optimizer_grouped_parameters,
      lr=args.learning_rate,
      eps=args.adam_epsilon
    )
    t_total = len(self.train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps = args.warmup_steps,
      num_training_steps = t_total
    )

    global_step = 0

    self.p_encoder.zero_grad()
    self.q_encoder.zero_grad()
    torch.cuda.empty_cache()

    train_iterator = tqdm(range(int(args.num_train_epochs)), desc='Epoch')

    for _ in train_iterator:

      with tqdm(self.train_dataloader, unit='batch') as tepoch:
        for batch in tepoch:
          self.p_encoder.train()
          self.q_encoder.train()

          targets = torch.zeros(batch_size).long()
          targets = targets.to(args.device)

          p_inputs = {
            "input_ids": batch[0].view(batch_size * (self.num_neg + 1), -1).to(args.device),
            "attention_mask": batch[1].view(batch_size * (self.num_neg + 1), -1).to(args.device),
          }

          q_inputs = {
            "input_ids": batch[2].to(args.device),
            "attention_mask": batch[3].to(args.device),
          }

          p_outputs = self.p_encoder(**p_inputs)
          q_outputs = self.q_encoder(**q_inputs)

          p_outputs = p_outputs.view(batch_size, -1, self.num_neg+1)
          q_outputs = q_outputs.view(batch_size, 1, -1)

          sim_scores = torch.bmm(q_outputs, p_outputs).squeeze()
          sim_scores = sim_scores.view(batch_size, -1)
          sim_scores = F.log_softmax(sim_scores, dim=1)

          loss = F.nll_loss(sim_scores, targets)
          tepoch.set_postfix(loss=f'{str(loss.item())}')

          loss.backward()
          optimizer.step()
          scheduler.step()

          self.p_encoder.zero_grad()
          self.q_encoder.zero_grad()

          global_step += 1

          torch.cuda.empty_cache()

          del p_inputs, q_inputs

        print('loss:', loss)
        print('sim_scores:', sim_scores)
        print('targets:', targets)
        print('p_outputs:', p_outputs)
        print('q_outputs:', q_outputs)

  def get_relevant_doc(self, query, k=1, args=None, p_encoder=None, q_encoder=None):
    if args is None:
      args = self.args
    
    if p_encoder is None:
      p_encoder = self.p_encoder
    
    if q_encoder is None:
      q_encoder = self.q_encoder

    with torch.no_grad():
      p_encoder.eval()
      q_encoder.eval()

      q_seqs_val = self.tokenizer(
        [query],
        padding = 'max_length',
        truncation= True,
        return_tensors = 'pt',
        return_token_type_ids=False,
      ).to(args.device)
      q_emb = q_encoder(**q_seqs_val).to('cpu')

      p_embs = []
      for batch in self.passage_dataloader:
        batch = tuple(t.to(args.device) for t in batch)
        p_inputs = {
          'input_ids': batch[0],
          'attention_mask': batch[1]
        }
        p_emb = p_encoder(**p_inputs).to('cpu')
        p_embs.append(p_emb)
    
    p_embs = torch.stack(
      p_embs, dim = 0
    ).view(len(self.passage_dataloader.dataset), -1)

    dot_prod_scores = torch.matmul(q_emb, torch.transpose(p_embs, 0, 1))
    rank = torch.argsort(dot_prod_scores, dim=1, descending=True).squeeze()

    return rank[:k]


In [47]:
class RobertaEncoder(RobertaPreTrainedModel):
  def __init__(self, config):
    super().__init__(config)

    self.roberta = RobertaModel(config)
    self.init_weights()

  def forward(self, input_ids, attention_mask=None):
    outputs = self.roberta(
      input_ids,
      attention_mask = attention_mask
    )

    pooled_output = outputs[1]
    return pooled_output

In [48]:
train_dataset = load_from_disk('../data/train_dataset')['train']

# num_sample = 1500
sample_idx = np.random.choice(range(len(train_dataset)),100)# len(train_dataset))#num_sample)
train_dataset = train_dataset[sample_idx]

args = TrainingArguments(
  output_dir = 'dense_retrieval',
  evaluation_strategy = 'epoch',
  learning_rate=3e-4,
  per_device_train_batch_size=4,
  per_device_eval_batch_size=4,
  num_train_epochs=20,
  weight_decay=0.01
)

model_checkpoint = 'klue/roberta-small'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
p_encoder = RobertaEncoder.from_pretrained(model_checkpoint).to(args.device)
q_encoder = RobertaEncoder.from_pretrained(model_checkpoint).to(args.device)

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaEncoder: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaEncoder were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to 

In [49]:
retriever = DenseRetrieval(
  args=args,
  dataset=train_dataset,
  num_neg=2,
  tokenizer=tokenizer,
  p_encoder=p_encoder,
  q_encoder=q_encoder
)


300
torch.Size([100, 3, 512])
torch.Size([100, 3, 512])
torch.Size([100, 512])
torch.Size([100, 512])


In [51]:
retriever.train()

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?batch/s]

loss: tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
sim_scores: tensor([[  0.0000, -36.2715, -33.5280],
        [  0.0000, -33.6338, -30.8933],
        [  0.0000, -32.9475, -33.4464],
        [  0.0000, -35.5509, -33.0377]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward>)
targets: tensor([0, 0, 0, 0], device='cuda:0')
p_outputs: tensor([[[ 0.7665, -0.3564, -0.3816],
         [-0.0367,  0.2894,  0.3136],
         [ 0.6012, -0.3007, -0.3840],
         ...,
         [ 0.4860, -0.2657, -0.4362],
         [ 0.3290,  0.1557, -0.0745],
         [ 0.0975,  0.3837, -0.0794]],

        [[ 0.7777, -0.3719, -0.3677],
         [-0.0216,  0.3977,  0.2892],
         [ 0.6853, -0.2343, -0.4033],
         ...,
         [ 0.5859, -0.3296, -0.4845],
         [ 0.1809,  0.1992, -0.0325],
         [ 0.1128,  0.3624, -0.0491]],

        [[ 0.7480, -0.1989, -0.1939],
         [-0.0035,  0.4077,  0.2799],
         [ 0.7513, -0.2256, -0.4738],
         ...,
         [ 0.4463, -0.2737, -0.4403],


  0%|          | 0/25 [00:00<?, ?batch/s]

loss: tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
sim_scores: tensor([[  0.0000, -33.9484, -32.3214],
        [  0.0000, -37.2396, -35.1630],
        [  0.0000, -36.4343, -34.8406],
        [  0.0000, -37.7212, -33.5250]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward>)
targets: tensor([0, 0, 0, 0], device='cuda:0')
p_outputs: tensor([[[ 0.7283, -0.2782, -0.3960],
         [-0.1723,  0.3850,  0.3093],
         [ 0.7240, -0.3547, -0.4093],
         ...,
         [ 0.4994, -0.2296, -0.4526],
         [ 0.2878,  0.0827, -0.1738],
         [ 0.0856,  0.3798, -0.1744]],

        [[ 0.7771, -0.2875, -0.3463],
         [ 0.0338,  0.3397,  0.2189],
         [ 0.6516, -0.3365, -0.4305],
         ...,
         [ 0.5655, -0.3576, -0.3055],
         [ 0.3177,  0.2295, -0.2541],
         [ 0.1171,  0.3897, -0.0474]],

        [[ 0.7955, -0.1573, -0.3373],
         [-0.0280,  0.5440,  0.1915],
         [ 0.7047, -0.2997, -0.3354],
         ...,
         [ 0.5413, -0.2755, -0.5031],


  0%|          | 0/25 [00:00<?, ?batch/s]

loss: tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
sim_scores: tensor([[  0.0000, -36.1319, -34.3498],
        [  0.0000, -36.6715, -34.1869],
        [  0.0000, -35.6580, -33.2139],
        [  0.0000, -36.9912, -33.9816]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward>)
targets: tensor([0, 0, 0, 0], device='cuda:0')
p_outputs: tensor([[[ 0.7786, -0.1885, -0.3027],
         [-0.1171,  0.3619,  0.2814],
         [ 0.6890, -0.2983, -0.4287],
         ...,
         [ 0.5079, -0.3488, -0.4871],
         [ 0.2321,  0.1499, -0.1742],
         [ 0.1187,  0.4083, -0.0246]],

        [[ 0.7824, -0.3777, -0.3374],
         [-0.1218,  0.3723,  0.2376],
         [ 0.6819, -0.3006, -0.3569],
         ...,
         [ 0.4983, -0.2839, -0.4383],
         [ 0.2174,  0.2396, -0.0073],
         [ 0.0654,  0.4904, -0.1698]],

        [[ 0.6927, -0.1552, -0.3062],
         [-0.1664,  0.1812,  0.2645],
         [ 0.6922, -0.3630, -0.3428],
         ...,
         [ 0.5324, -0.3162, -0.4934],


  0%|          | 0/25 [00:00<?, ?batch/s]

KeyboardInterrupt: 

In [None]:
query = "해바라기는 무슨꽃일까?"
results = retriever.get_relevant_doc(query=query, k=5)

In [None]:
print(f"[Search Query] {query}\n")

indices = results.tolist()
for i, idx in enumerate(indices):
    print(f"Top-{i + 1}th Passage (Index {idx})")
    pprint(retriever.dataset["context"][idx])

[Search Query] 해바라기는 무슨꽃일까?

Top-1th Passage (Index 1037)
('소비에트 연방은 미하일 고르바초프가 집권하고 난 뒤 대대적인 변화를 맞게 되었다. 우선 정부에 대한 비판을 허가하였으며, 페레스트로이카를 '
 '표방, 미국과의 지속적인 대화를 통해 해빙 조짐을 서서히 보이기 시작하게 되었다. 이 와중에서 고르바초프는 미국의 레이건 대통령과 만나 '
 '핵무기를 대폭 감축하는 데 합의하게 된다. 그리고 소련은 반세기 동안 적국이었던 대한민국과 1990년에 수교했다.\\n\\n한편, 침체된 '
 '자국의 경제를 중흥시키기 위해 소련은 공산주의 종주국으로서의 자리를 포기한다고 선언했고 이는 중앙유럽 공산 국가들의 급속한 붕괴를 '
 '불러왔다.\\n그러나 이러한 고르바초프의 행동에 대해 두려움을 느꼈던 소련의 공산당(볼셰비키)과 국가보안위원회(KGB) 그리고 군과 '
 '군산복합체는 쿠데타를 일으켜 고르바초프를 권좌에서 몰아내려고 하였으나, 소련 국민들의 거센 반대에 부딪히면서 실패하였다.\\n\\n쿠데타 '
 '저지 후, 옐친과 고르바초프는 소련의 미래에 대한 의견을 주고받았다. 옐친은 소련을 해체시키고, 새로운 독립국가들끼리의 연합을 구성하자고 '
 '제안하였으나, 고르바초프는 중앙정부의 힘을 최소화하는 것을 전제로 하여 소련을 존속시키자는 의견으로 맞섰다.\\n\\n한동안 양측은 의견 '
 '차이를 두고 팽팽하게 대립하였으나, 결국 옐친의 뜻대로 되어 고르바초프는 1991년 크리스마스에 대통령 자리에서 물러나게 되었고, '
 '70년간 세계를 호령하던 소비에트 연방은 붕괴되었고 12개 독립 국가로 구성된 독립 국가 연합(CIS)이 탄생하였다.')
Top-2th Passage (Index 1339)
('소비에트 연방은 미하일 고르바초프가 집권하고 난 뒤 대대적인 변화를 맞게 되었다. 우선 정부에 대한 비판을 허가하였으며, 페레스트로이카를 '
 '표방, 미국과의 지속적인 대화를 통해 해빙 조짐을 서서히 보이기 시작하게 되었