In [10]:
# !pip install datasets==1.13.3 -q
# !pip install transformers==4.11.3 -q

In [1]:
import json
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

from datasets import load_dataset, load_from_disk
from transformers import (
    AutoTokenizer,
    BertModel, BertPreTrainedModel,
    AdamW, get_linear_schedule_with_warmup,
    TrainingArguments,
    RobertaModel, RobertaPreTrainedModel

)
import transformers

In [2]:
transformers.__version__

'4.11.3'

In [2]:
from elasticsearch import Elasticsearch
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
es.ping()



True

In [3]:
# 난수 고정
def set_seed(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)  # if use multi-GPU
    random.seed(random_seed)
    np.random.seed(random_seed)
    
set_seed(42) # magic number :)

In [4]:
print ("PyTorch version:[%s]."%(torch.__version__))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print ("device:[%s]."%(device))

PyTorch version:[1.6.0].
device:[cuda:0].


In [5]:
class DenseRetrieval:
  def __init__(self, args, dataset, num_neg, tokenizer, p_encoder, q_encoder):

    self.args = args
    self.dataset = dataset
    self.num_neg = num_neg

    self.tokenizer = tokenizer
    self.p_encoder = p_encoder
    self.q_encoder = q_encoder

    self.prepare_in_batch_negative(num_neg=num_neg)

  def prepare_in_batch_negative(self, dataset=None, num_neg=2, tokenizer=None):
    if dataset is None:
      dataset = self.dataset
    
    if tokenizer is None:
      tokenizer = self.tokenizer
    
    corpus = np.array(list(set([example for example in dataset['context']])))
    p_with_neg = []

    for c in dataset['context']:
      while True:
        neg_idxs = np.random.randint(len(corpus), size=num_neg)

        if c not in corpus[neg_idxs]:
          p_neg = corpus[neg_idxs]

          p_with_neg.append(c)
          p_with_neg.extend(p_neg)
          break

    q_seqs = tokenizer(
      dataset['question'],
      padding = 'max_length',
      truncation = True,
      return_tensors = 'pt',
      return_token_type_ids=False,
    )

    p_seqs = tokenizer(
      p_with_neg,
      padding = 'max_length',
      truncation=True,
      return_tensors = 'pt',
      return_token_type_ids=False,
    )

    # print('dataset["question"]: ', dataset['question'])
    # print('p_with_neg: ', p_with_neg)

    # print('q_seqs: ', q_seqs)
    # print('p_seqs: ', p_seqs)

    max_len = p_seqs['input_ids'].size(-1)
    p_seqs['input_ids'] = p_seqs['input_ids'].view(-1, num_neg+1, max_len)
    p_seqs['attention_mask'] = p_seqs['attention_mask'].view(-1, num_neg+1, max_len)
    # print(len(p_with_neg))
    # print(p_seqs['input_ids'].shape)
    # print(p_seqs['attention_mask'].shape)
    # print(q_seqs['input_ids'].shape)
    # print(q_seqs['attention_mask'].shape)

    train_dataset = TensorDataset(
      p_seqs['input_ids'], p_seqs['attention_mask'],
      q_seqs['input_ids'], q_seqs['attention_mask']
    )

    self.train_dataloader = DataLoader(
      train_dataset,
      shuffle=True,
      batch_size = self.args.per_device_train_batch_size
    )

    valid_seqs = tokenizer(
      dataset['context'],
      padding='max_length',
      truncation=True,
      return_tensors = 'pt'
    )

    passage_dataset = TensorDataset(
      valid_seqs['input_ids'],
      valid_seqs['attention_mask']
    )

    self.passage_dataloader = DataLoader(
      passage_dataset,
      batch_size = self.args.per_device_train_batch_size
    )

  def train(self, args=None):
    if args is None:
      args = self.args
    batch_size = args.per_device_train_batch_size

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
      {"params": [p for n, p in self.p_encoder.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay},
      {"params": [p for n, p in self.p_encoder.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
      {"params": [p for n, p in self.q_encoder.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay},
      {"params": [p for n, p in self.q_encoder.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
    ]

    optimizer = AdamW(
      optimizer_grouped_parameters,
      lr=args.learning_rate,
      eps=args.adam_epsilon
    )
    t_total = len(self.train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps = args.warmup_steps,
      num_training_steps = t_total
    )

    global_step = 0

    self.p_encoder.zero_grad()
    self.q_encoder.zero_grad()
    torch.cuda.empty_cache()

    train_iterator = tqdm(range(int(args.num_train_epochs)), desc='Epoch')

    for _ in train_iterator:

      with tqdm(self.train_dataloader, unit='batch') as tepoch:
        for batch in tepoch:
          self.p_encoder.train()
          self.q_encoder.train()

          targets = torch.zeros(batch_size).long()
          targets = targets.to(args.device)

          print('targets: ', targets)
          

          p_inputs = {
            "input_ids": batch[0].view(batch_size * (self.num_neg + 1), -1).to(args.device),
            "attention_mask": batch[1].view(batch_size * (self.num_neg + 1), -1).to(args.device),
          }

          q_inputs = {
            "input_ids": batch[2].to(args.device),
            "attention_mask": batch[3].to(args.device),
          }

          print('p_inputs: ', p_inputs)
          print('q_inputs: ', q_inputs)

          p_outputs = self.p_encoder(**p_inputs)
          q_outputs = self.q_encoder(**q_inputs)

          print('q_outputs.shape before: ', q_outputs.shape)
          print('p_outputs.shape before: ', p_outputs.shape)

          p_outputs = p_outputs.view(batch_size, -1, self.num_neg+1)
          q_outputs = q_outputs.view(batch_size, 1, -1)

          print('targets.shape: ', targets.shape)
          print('q_outputs.shape: ', q_outputs.shape)
          print('p_outputs.shape: ', p_outputs.shape)

          sim_scores = torch.bmm(q_outputs, p_outputs).squeeze()
          print('sim_scores1: ',sim_scores)
          sim_scores = sim_scores.view(batch_size, -1)
          print('sim_scores2: ',sim_scores)
          sim_scores = F.log_softmax(sim_scores, dim=1)
          print('sim_scores3: ',sim_scores)

          loss = F.nll_loss(sim_scores, targets)
          tepoch.set_postfix(loss=f'{str(loss.item())}')

          loss.backward()
          optimizer.step()
          scheduler.step()

          self.p_encoder.zero_grad()
          self.q_encoder.zero_grad()

          global_step += 1

          torch.cuda.empty_cache()

          del p_inputs, q_inputs

        print('loss:', loss)
        print('sim_scores:', sim_scores)
        print('targets:', targets)
        print('p_outputs:', p_outputs)
        print('q_outputs:', q_outputs)

  def get_relevant_doc(self, query, k=1, args=None, p_encoder=None, q_encoder=None):
    if args is None:
      args = self.args
    
    if p_encoder is None:
      p_encoder = self.p_encoder
    
    if q_encoder is None:
      q_encoder = self.q_encoder

    with torch.no_grad():
      p_encoder.eval()
      q_encoder.eval()

      q_seqs_val = self.tokenizer(
        [query],
        padding = 'max_length',
        truncation= True,
        return_tensors = 'pt',
        return_token_type_ids=False,
      ).to(args.device)
      q_emb = q_encoder(**q_seqs_val).to('cpu')

      p_embs = []
      for batch in self.passage_dataloader:
        batch = tuple(t.to(args.device) for t in batch)
        p_inputs = {
          'input_ids': batch[0],
          'attention_mask': batch[1]
        }
        p_emb = p_encoder(**p_inputs).to('cpu')
        p_embs.append(p_emb)
    
    p_embs = torch.stack(
      p_embs, dim = 0
    ).view(len(self.passage_dataloader.dataset), -1)

    dot_prod_scores = torch.matmul(q_emb, torch.transpose(p_embs, 0, 1))
    rank = torch.argsort(dot_prod_scores, dim=1, descending=True).squeeze()

    return rank[:k]


In [6]:
class RobertaEncoder(RobertaPreTrainedModel):
  def __init__(self, config):
    super().__init__(config)

    self.roberta = RobertaModel(config)
    self.init_weights()

  def forward(self, input_ids, attention_mask=None):
    outputs = self.roberta(
      input_ids,
      attention_mask = attention_mask
    )

    pooled_output = outputs[1]
    return pooled_output

In [7]:
train_dataset = load_from_disk('../../data/train_dataset')['train']

# num_sample = 1500
# sample_idx = np.random.choice(range(len(train_dataset)),100)# len(train_dataset))#num_sample)
sample_idx = np.random.choice(range(len(train_dataset)), len(train_dataset))#num_sample)
train_dataset = train_dataset[sample_idx]

args = TrainingArguments(
  output_dir = 'dense_retrieval',
  evaluation_strategy = 'epoch',
  learning_rate=3e-4,
  per_device_train_batch_size=4,
  per_device_eval_batch_size=4,
  num_train_epochs=20,
  weight_decay=0.01
)

model_checkpoint = 'klue/roberta-small'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
p_encoder = RobertaEncoder.from_pretrained(model_checkpoint).to(args.device)
q_encoder = RobertaEncoder.from_pretrained(model_checkpoint).to(args.device)

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaEncoder: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaEncoder were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to 

In [8]:
retriever = DenseRetrieval(
  args=args,
  dataset=train_dataset,
  # num_neg=12,
  num_neg=5,
  tokenizer=tokenizer,
  p_encoder=p_encoder,
  q_encoder=q_encoder
)


In [9]:
retriever.train()

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/988 [00:00<?, ?batch/s]

targets:  tensor([0, 0, 0, 0], device='cuda:0')
p_inputs:  {'input_ids': tensor([[    0, 20561,  2181,  ...,     1,     1,     1],
        [    0,  1469,  2251,  ...,     1,     1,     1],
        [    0, 24612,  4472,  ...,  3839,  2470,     2],
        ...,
        [    0,  1878,  2067,  ...,     1,     1,     1],
        [    0,  4305,  2170,  ...,    13,  1497,     2],
        [    0,    27,  2429,  ...,     1,     1,     1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}
q_inputs:  {'input_ids': tensor([[    0, 12646,  2116,  ...,     1,     1,     1],
        [    0,  9310,   837,  ...,     1,     1,     1],
        [    0,  3890,  2052,  ...,     1,     1,     1],
        [    0,  1381,  2479,  ...,     1,     1,     1]], device='cuda:0'), 'attention_mask':

KeyboardInterrupt: 

In [12]:
query = "해바라기는 무슨꽃일까?"
# query = '대한민국의 대통령은 누구인가?'
results = retriever.get_relevant_doc(query=query, k=5)

In [13]:
print(f"[Search Query] {query}\n")

indices = results.tolist()
for i, idx in enumerate(indices):
    print(f"Top-{i + 1}th Passage (Index {idx})")
    pprint(retriever.dataset["context"][idx])

[Search Query] 해바라기는 무슨꽃일까?

Top-1th Passage (Index 2346)
('마지막 유격전은 적의 수송대를 공격하는 작전이었다. 생도들은 정보원으로부터 북한군의 UN군의 서울 공격에 대비하여 마을 사람들을 화물차에 '
 '싣고 북으로 올라갈 것이라는 소식을 듣게 되고, 구출 작전을 구상하게 된다. 당시 유격대가 보유한 장비는 개인별 소총 1자루와 실탄 '
 '10여 발이 전부였지만, 국가와 국민을 지키겠다는 신념 하나로 전투에 임하게 된다. 전투는 야간에 시작되었다. 생도들은 적이 통과할 '
 '내곡리 마을 주변에 매복하고 적군을 기다렸다. 23시경 적군의 수송대가 내곡리 마을을 통과하려던 찰나, 생도들은 소총을 쏘며 습격을 '
 '감행하였다. 이때 적군의 혼란을 틈타 조영달 생도가 주민들에게 대피하라고 외친 덕에 많은 주민들이 구출될 수 있었다. 가지고 있던 장비를 '
 '모두 소모한 생도들은 불암산의 기지로 복귀하고자 하였으나 적의 흉탄에 남은 생도 모두가 장렬히 전사하였다. 이로써 국가와 국민을 위해 '
 '수도 서울과 육사를 방어하고자 항쟁한 생도들의 찬란한 유격전도 막을 내렸다. 때는 서울 수복 1주일 전이였다.')
Top-2th Passage (Index 106)
('괘불이란 야외에서 큰 법회나 의식을 열 때 쓰이는 대형불화를 말하며, 이 불화는 보살 형태의 단독상을 화면 전체에 꽉 차게 그려 넣은 '
 '것이다. \\n\\n보살상은 양 손으로 꽃가지를 받치고 서 있는 모습으로 상체를 크게 묘사한 반면 하체는 짧게 나타냈다. 머리에는 '
 '산(山) 모양의 화려한 장식이 달린 보관(寶冠)을 쓰고 있으며 네모진 얼굴을 하고 있다. 양쪽 어깨를 감싼 옷은 다양한 무늬로 장식되어 '
 '있고, 광배(光背)는 머리 광배와 몸광배를 구분하여 큼직하게 그렸다. 몸광배 안에는 꽃무늬, 구름무늬 등을 그려 공간을 채우고 있는데 '
 '옷의 화려한 무늬들과 어우러져 부처님 세계의 정경을 보는 것처럼 느껴진다. 광배 위쪽으로는 구름이 감

In [None]:
next(iter(retriever.train_dataloader))

[tensor([[[    0, 21847,  2181,  ...,  3655, 21847,     2],
          [    0,  7785,  2302,  ...,  2051,   991,     2],
          [    0,  1478,  2878,  ...,  2172,  2446,     2]],
 
         [[    0,  3719, 10695,  ...,     1,     1,     1],
          [    0,  1176,  2489,  ...,  2507,  2062,     2],
          [    0,  5865,  2079,  ...,  1513,  2259,     2]],
 
         [[    0, 15884,  2160,  ...,  2112, 14019,     2],
          [    0,  5352,  2504,  ...,     1,     1,     1],
          [    0,  1183, 11483,  ...,     1,     1,     1]],
 
         [[    0,   812,  3657,  ...,     1,     1,     1],
          [    0,  1504, 16550,  ...,     1,     1,     1],
          [    0,  5034,  2073,  ...,     1,     1,     1]]]),
 tensor([[[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1]],
 
         [[1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1]],
 
         [[1, 1, 1,  ..., 1, 1, 1],
          

In [16]:
import pprint as pp
print("sim_scores1:  tensor([[ 2.0670, -1.8882, -0.4755],\n        [ 1.1081, -0.2872, -0.5352],\n        [ 1.2624, -0.2268, -1.5149],\n        [ 2.0739, -1.5962, -0.9872]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 2.0670, -1.8882, -0.4755],\n        [ 1.1081, -0.2872, -0.5352],\n        [ 1.2624, -0.2268, -1.5149],\n        [ 2.0739, -1.5962, -0.9872]], device='cuda:0', grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[-0.0933, -4.0485, -2.6358],\n        [-0.3654, -1.7607, -2.0087],\n        [-0.2529, -1.7421, -3.0301],\n        [-0.0698, -3.7400, -3.1309]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 18.9575, -13.0442,  -6.3324],\n        [ 18.9219, -13.7115,  -7.6625],\n        [ 18.6899, -14.1833,  -6.6999],\n        [ 19.7926, -14.9210,  -6.0559]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 18.9575, -13.0442,  -6.3324],\n        [ 18.9219, -13.7115,  -7.6625],\n        [ 18.6899, -14.1833,  -6.6999],\n        [ 19.7926, -14.9210,  -6.0559]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -32.0017, -25.2900],\n        [  0.0000, -32.6333, -26.5844],\n        [  0.0000, -32.8732, -25.3898],\n        [  0.0000, -34.7136, -25.8485]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 19.7016, -15.9050,  -7.9630],\n        [ 18.6390, -16.8143,  -6.2301],\n        [ 18.5716, -15.4357,  -5.1185],\n        [ 18.6988, -16.4790,  -6.3037]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 19.7016, -15.9050,  -7.9630],\n        [ 18.6390, -16.8143,  -6.2301],\n        [ 18.5716, -15.4357,  -5.1185],\n        [ 18.6988, -16.4790,  -6.3037]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -35.6066, -27.6647],\n        [  0.0000, -35.4534, -24.8691],\n        [  0.0000, -34.0072, -23.6900],\n        [  0.0000, -35.1778, -25.0025]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 19.3020, -16.3367,  -6.9249],\n        [ 18.0863, -15.6936,  -6.4714],\n        [ 20.8831, -15.9750,  -6.4057],\n        [ 17.5353, -15.9932,  -5.3757]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 19.3020, -16.3367,  -6.9249],\n        [ 18.0863, -15.6936,  -6.4714],\n        [ 20.8831, -15.9750,  -6.4057],\n        [ 17.5353, -15.9932,  -5.3757]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -35.6387, -26.2269],\n        [  0.0000, -33.7799, -24.5577],\n        [  0.0000, -36.8581, -27.2888],\n        [  0.0000, -33.5285, -22.9110]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 19.4612, -18.5311,  -7.0692],\n        [ 17.4514, -16.1776,  -5.9871],\n        [ 17.8875, -18.1035,  -5.6846],\n        [ 19.6938, -18.1141,  -7.2004]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 19.4612, -18.5311,  -7.0692],\n        [ 17.4514, -16.1776,  -5.9871],\n        [ 17.8875, -18.1035,  -5.6846],\n        [ 19.6938, -18.1141,  -7.2004]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -37.9922, -26.5303],\n        [  0.0000, -33.6291, -23.4386],\n        [  0.0000, -35.9910, -23.5721],\n        [  0.0000, -37.8079, -26.8942]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 21.0263, -17.2546,  -7.3807],\n        [ 19.7196, -16.0824,  -5.2096],\n        [ 18.5708, -17.6870,  -6.0913],\n        [ 16.7673, -17.2788,  -5.5635]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 21.0263, -17.2546,  -7.3807],\n        [ 19.7196, -16.0824,  -5.2096],\n        [ 18.5708, -17.6870,  -6.0913],\n        [ 16.7673, -17.2788,  -5.5635]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -38.2809, -28.4070],\n        [  0.0000, -35.8020, -24.9292],\n        [  0.0000, -36.2579, -24.6622],\n        [  0.0000, -34.0461, -22.3308]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 18.3619, -17.5218,  -6.9513],\n        [ 17.7334, -18.8895,  -6.5805],\n        [ 19.0399, -18.0639,  -6.1909],\n        [ 16.6032, -16.8530,  -7.2458]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 18.3619, -17.5218,  -6.9513],\n        [ 17.7334, -18.8895,  -6.5805],\n        [ 19.0399, -18.0639,  -6.1909],\n        [ 16.6032, -16.8530,  -7.2458]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -35.8837, -25.3132],\n        [  0.0000, -36.6229, -24.3139],\n        [  0.0000, -37.1038, -25.2308],\n        [  0.0000, -33.4562, -23.8490]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 19.6403, -19.4150,  -8.1556],\n        [ 20.7628, -20.4260,  -6.9605],\n        [ 20.4725, -19.5822,  -6.1974],\n        [ 20.1756, -19.9032,  -7.3601]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 19.6403, -19.4150,  -8.1556],\n        [ 20.7628, -20.4260,  -6.9605],\n        [ 20.4725, -19.5822,  -6.1974],\n        [ 20.1756, -19.9032,  -7.3601]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -39.0552, -27.7959],\n        [  0.0000, -41.1888, -27.7232],\n        [  0.0000, -40.0548, -26.6699],\n        [  0.0000, -40.0787, -27.5357]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 20.6039, -20.3562,  -7.5317],\n        [ 18.2517, -18.4895,  -7.2396],\n        [ 19.2696, -18.7270,  -5.3625],\n        [ 18.6063, -20.9600,  -7.1433]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 20.6039, -20.3562,  -7.5317],\n        [ 18.2517, -18.4895,  -7.2396],\n        [ 19.2696, -18.7270,  -5.3625],\n        [ 18.6063, -20.9600,  -7.1433]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -40.9601, -28.1357],\n        [  0.0000, -36.7412, -25.4912],\n        [  0.0000, -37.9966, -24.6322],\n        [  0.0000, -39.5663, -25.7496]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 19.1420, -18.6040,  -6.0182],\n        [ 19.4384, -20.3461,  -7.5667],\n        [ 19.9583, -20.1868,  -5.9409],\n        [ 21.3782, -20.0411,  -7.1013]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 19.1420, -18.6040,  -6.0182],\n        [ 19.4384, -20.3461,  -7.5667],\n        [ 19.9583, -20.1868,  -5.9409],\n        [ 21.3782, -20.0411,  -7.1013]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -37.7459, -25.1601],\n        [  0.0000, -39.7845, -27.0051],\n        [  0.0000, -40.1451, -25.8991],\n        [  0.0000, -41.4194, -28.4796]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 19.6004, -20.2478,  -7.4371],\n        [ 21.7364, -21.1459,  -8.1965],\n        [ 19.3092, -19.3821,  -6.4901],\n        [ 21.4933, -19.5955,  -7.5651]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 19.6004, -20.2478,  -7.4371],\n        [ 21.7364, -21.1459,  -8.1965],\n        [ 19.3092, -19.3821,  -6.4901],\n        [ 21.4933, -19.5955,  -7.5651]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -39.8482, -27.0375],\n        [  0.0000, -42.8823, -29.9330],\n        [  0.0000, -38.6913, -25.7992],\n        [  0.0000, -41.0887, -29.0584]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 20.7627, -18.8483,  -7.5668],\n        [ 20.6127, -20.5880,  -7.3777],\n        [ 17.6649, -19.9797,  -6.5389],\n        [ 20.9623, -21.4547,  -7.2272]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 20.7627, -18.8483,  -7.5668],\n        [ 20.6127, -20.5880,  -7.3777],\n        [ 17.6649, -19.9797,  -6.5389],\n        [ 20.9623, -21.4547,  -7.2272]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -39.6110, -28.3295],\n        [  0.0000, -41.2007, -27.9905],\n        [  0.0000, -37.6446, -24.2039],\n        [  0.0000, -42.4171, -28.1895]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 21.2274, -21.3646,  -7.2595],\n        [ 20.4602, -19.8530,  -7.7108],\n        [ 21.4854, -20.9368,  -7.7451],\n        [ 21.9013, -20.5813,  -7.5967]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 21.2274, -21.3646,  -7.2595],\n        [ 20.4602, -19.8530,  -7.7108],\n        [ 21.4854, -20.9368,  -7.7451],\n        [ 21.9013, -20.5813,  -7.5967]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -42.5920, -28.4869],\n        [  0.0000, -40.3132, -28.1710],\n        [  0.0000, -42.4222, -29.2305],\n        [  0.0000, -42.4826, -29.4980]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 20.1489, -22.7956,  -6.4710],\n        [ 20.8912, -21.5647,  -7.8131],\n        [ 21.0385, -21.7423,  -6.8502],\n        [ 20.4188, -21.9876,  -7.7137]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 20.1489, -22.7956,  -6.4710],\n        [ 20.8912, -21.5647,  -7.8131],\n        [ 21.0385, -21.7423,  -6.8502],\n        [ 20.4188, -21.9876,  -7.7137]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -42.9445, -26.6199],\n        [  0.0000, -42.4559, -28.7044],\n        [  0.0000, -42.7808, -27.8887],\n        [  0.0000, -42.4063, -28.1324]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 20.3652, -20.7907,  -7.3486],\n        [ 20.2258, -21.4324,  -8.1898],\n        [ 22.0525, -22.6062,  -8.8879],\n        [ 19.7080, -20.6045,  -6.6445]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 20.3652, -20.7907,  -7.3486],\n        [ 20.2258, -21.4324,  -8.1898],\n        [ 22.0525, -22.6062,  -8.8879],\n        [ 19.7080, -20.6045,  -6.6445]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -41.1559, -27.7138],\n        [  0.0000, -41.6581, -28.4155],\n        [  0.0000, -44.6587, -30.9405],\n        [  0.0000, -40.3125, -26.3525]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 22.4861, -23.1768,  -8.7276],\n        [ 20.8697, -19.7706,  -7.4301],\n        [ 22.5807, -20.7921,  -7.6226],\n        [ 20.5334, -20.4921,  -8.2179]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 22.4861, -23.1768,  -8.7276],\n        [ 20.8697, -19.7706,  -7.4301],\n        [ 22.5807, -20.7921,  -7.6226],\n        [ 20.5334, -20.4921,  -8.2179]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -45.6629, -31.2138],\n        [  0.0000, -40.6404, -28.2998],\n        [  0.0000, -43.3728, -30.2033],\n        [  0.0000, -41.0255, -28.7513]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 19.9769, -22.5745,  -8.8266],\n        [ 21.8273, -20.9343,  -7.7861],\n        [ 20.9987, -21.2951,  -8.9782],\n        [ 20.3307, -21.7390,  -9.4534]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 19.9769, -22.5745,  -8.8266],\n        [ 21.8273, -20.9343,  -7.7861],\n        [ 20.9987, -21.2951,  -8.9782],\n        [ 20.3307, -21.7390,  -9.4534]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -42.5514, -28.8035],\n        [  0.0000, -42.7615, -29.6134],\n        [  0.0000, -42.2938, -29.9769],\n        [  0.0000, -42.0697, -29.7841]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 22.1752, -21.7726,  -8.8740],\n        [ 21.3944, -20.5539,  -8.4125],\n        [ 21.2713, -22.4065,  -8.4852],\n        [ 21.8581, -20.9206,  -8.7656]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 22.1752, -21.7726,  -8.8740],\n        [ 21.3944, -20.5539,  -8.4125],\n        [ 21.2713, -22.4065,  -8.4852],\n        [ 21.8581, -20.9206,  -8.7656]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -43.9478, -31.0493],\n        [  0.0000, -41.9482, -29.8069],\n        [  0.0000, -43.6779, -29.7565],\n        [  0.0000, -42.7787, -30.6237]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 20.8886, -22.0446,  -8.5062],\n        [ 22.2826, -22.1557,  -7.3194],\n        [ 21.3736, -22.4655,  -7.9215],\n        [ 22.0496, -22.2193,  -8.4336]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 20.8886, -22.0446,  -8.5062],\n        [ 22.2826, -22.1557,  -7.3194],\n        [ 21.3736, -22.4655,  -7.9215],\n        [ 22.0496, -22.2193,  -8.4336]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -42.9332, -29.3947],\n        [  0.0000, -44.4383, -29.6019],\n        [  0.0000, -43.8391, -29.2951],\n        [  0.0000, -44.2689, -30.4832]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 21.3588, -22.2370,  -6.8472],\n        [ 19.6908, -21.9729,  -6.7540],\n        [ 22.0681, -22.8626,  -8.4110],\n        [ 23.0115, -23.2496,  -8.9440]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 21.3588, -22.2370,  -6.8472],\n        [ 19.6908, -21.9729,  -6.7540],\n        [ 22.0681, -22.8626,  -8.4110],\n        [ 23.0115, -23.2496,  -8.9440]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -43.5958, -28.2060],\n        [  0.0000, -41.6637, -26.4449],\n        [  0.0000, -44.9307, -30.4791],\n        [  0.0000, -46.2611, -31.9556]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 23.3233, -21.0999,  -8.8561],\n        [ 20.8003, -21.8411,  -7.2726],\n        [ 20.2321, -21.5142,  -6.7270],\n        [ 21.0954, -24.0800,  -8.5347]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 23.3233, -21.0999,  -8.8561],\n        [ 20.8003, -21.8411,  -7.2726],\n        [ 20.2321, -21.5142,  -6.7270],\n        [ 21.0954, -24.0800,  -8.5347]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -44.4232, -32.1794],\n        [  0.0000, -42.6414, -28.0729],\n        [  0.0000, -41.7463, -26.9591],\n        [  0.0000, -45.1754, -29.6301]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 21.5006, -22.2052,  -8.2979],\n        [ 21.1089, -22.8062,  -7.6997],\n        [ 20.6536, -21.2667,  -8.6762],\n        [ 21.8066, -23.0844,  -8.1231]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 21.5006, -22.2052,  -8.2979],\n        [ 21.1089, -22.8062,  -7.6997],\n        [ 20.6536, -21.2667,  -8.6762],\n        [ 21.8066, -23.0844,  -8.1231]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -43.7057, -29.7985],\n        [  0.0000, -43.9151, -28.8086],\n        [  0.0000, -41.9203, -29.3298],\n        [  0.0000, -44.8910, -29.9297]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 18.4972, -22.1835,  -6.9772],\n        [ 21.6914, -21.1846,  -9.2879],\n        [ 20.9040, -22.9383,  -8.7836],\n        [ 21.8061, -23.1066,  -8.4577]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 18.4972, -22.1835,  -6.9772],\n        [ 21.6914, -21.1846,  -9.2879],\n        [ 20.9040, -22.9383,  -8.7836],\n        [ 21.8061, -23.1066,  -8.4577]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -40.6807, -25.4745],\n        [  0.0000, -42.8761, -30.9793],\n        [  0.0000, -43.8423, -29.6877],\n        [  0.0000, -44.9127, -30.2638]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 21.5885, -22.1243,  -7.6789],\n        [ 21.6998, -22.9157,  -7.4915],\n        [ 21.5375, -20.8942,  -7.2593],\n        [ 22.6453, -23.8574,  -7.7890]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 21.5885, -22.1243,  -7.6789],\n        [ 21.6998, -22.9157,  -7.4915],\n        [ 21.5375, -20.8942,  -7.2593],\n        [ 22.6453, -23.8574,  -7.7890]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -43.7128, -29.2674],\n        [  0.0000, -44.6155, -29.1913],\n        [  0.0000, -42.4317, -28.7968],\n        [  0.0000, -46.5027, -30.4342]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nsim_scores1:  tensor([[ 22.3031, -21.2729,  -8.4275],\n        [ 22.1473, -21.8015,  -9.0392],\n        [ 20.1393, -20.6704,  -8.0232],\n        [ 18.1470, -19.8829,  -7.3621]], device='cuda:0',\n       grad_fn=<SqueezeBackward0>)\nsim_scores2:  tensor([[ 22.3031, -21.2729,  -8.4275],\n        [ 22.1473, -21.8015,  -9.0392],\n        [ 20.1393, -20.6704,  -8.0232],\n        [ 18.1470, -19.8829,  -7.3621]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nsim_scores3:  tensor([[  0.0000, -43.5760, -30.7306],\n        [  0.0000, -43.9488, -31.1865],\n        [  0.0000, -40.8096, -28.1624],\n        [  0.0000, -38.0299, -25.5091]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\nloss: tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)\nsim_scores: tensor([[  0.0000, -43.5760, -30.7306],\n        [  0.0000, -43.9488, -31.1865],\n        [  0.0000, -40.8096, -28.1624],\n        [  0.0000, -38.0299, -25.5091]], device='cuda:0',\n       grad_fn=<LogSoftmaxBackward>)\ntargets: tensor([0, 0, 0, 0], device='cuda:0')\np_outputs: tensor([[[-0.2828,  0.2189,  0.3590],\n         [-0.4554, -0.1614,  0.1250],\n         [ 0.0780,  0.0511, -0.1944],\n         ...,\n         [ 0.4404,  0.2783,  0.3426],\n         [-0.3670,  0.0518, -0.1551],\n         [ 0.6381, -0.3617, -0.2321]],\n\n        [[-0.2635,  0.3162,  0.3221],\n         [-0.3746, -0.2633,  0.0335],\n         [ 0.2125,  0.0111, -0.2120],\n         ...,\n         [ 0.5239,  0.1418,  0.3787],\n         [-0.3427,  0.0350, -0.0782],\n         [ 0.5831, -0.3120, -0.3704]],\n\n        [[-0.1935,  0.2067,  0.2202],\n         [-0.3663, -0.2757,  0.0735],\n         [ 0.0890,  0.0329, -0.2128],\n         ...,\n         [ 0.4545,  0.2334,  0.3461],\n         [-0.2655,  0.0806, -0.2142],\n         [ 0.5106, -0.3753, -0.3272]],\n\n        [[-0.3044,  0.1907,  0.2350],\n         [-0.4767, -0.2779,  0.0664],\n         [-0.1426,  0.2462, -0.2211],\n         ...,\n         [ 0.4860,  0.3116,  0.3844],\n         [-0.4297, -0.1095, -0.1866],\n         [ 0.5486, -0.2870, -0.4021]]], device='cuda:0',\n       grad_fn=<ViewBackward>)\nq_outputs: tensor([[[ 0.1808,  0.1537,  0.0327,  ...,  0.2288, -0.3302,  0.5529]],\n\n        [[ 0.0656,  0.2280,  0.0654,  ...,  0.1543, -0.4032,  0.6550]],\n\n        [[ 0.0651,  0.2539, -0.0007,  ...,  0.1649, -0.3408,  0.5014]],\n\n        [[ 0.1571,  0.2728,  0.1050,  ...,  0.1525, -0.4152,  0.5329]]")

sim_scores1:  tensor([[ 2.0670, -1.8882, -0.4755],
        [ 1.1081, -0.2872, -0.5352],
        [ 1.2624, -0.2268, -1.5149],
        [ 2.0739, -1.5962, -0.9872]], device='cuda:0',
       grad_fn=<SqueezeBackward0>)
sim_scores2:  tensor([[ 2.0670, -1.8882, -0.4755],
        [ 1.1081, -0.2872, -0.5352],
        [ 1.2624, -0.2268, -1.5149],
        [ 2.0739, -1.5962, -0.9872]], device='cuda:0', grad_fn=<ViewBackward>)
sim_scores3:  tensor([[-0.0933, -4.0485, -2.6358],
        [-0.3654, -1.7607, -2.0087],
        [-0.2529, -1.7421, -3.0301],
        [-0.0698, -3.7400, -3.1309]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward>)
sim_scores1:  tensor([[ 18.9575, -13.0442,  -6.3324],
        [ 18.9219, -13.7115,  -7.6625],
        [ 18.6899, -14.1833,  -6.6999],
        [ 19.7926, -14.9210,  -6.0559]], device='cuda:0',
       grad_fn=<SqueezeBackward0>)
sim_scores2:  tensor([[ 18.9575, -13.0442,  -6.3324],
        [ 18.9219, -13.7115,  -7.6625],
        [ 18.6899, -14.1833,  -6.6999],
   