In [75]:
# !pip install torch==1.7.1
# !pip install transformers==4.11.3
# !pip install huggingface-hub==0.0.19
# !pip install datasets==1.5.0

In [1]:
import json
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from tqdm import trange

from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    BertModel, RobertaModel,
    BertPreTrainedModel,
    AdamW, get_linear_schedule_with_warmup,
    TrainingArguments,
)
from datasets import (
    Dataset,
    load_from_disk,
    concatenate_datasets,
)

from typing import List
from torch.utils.data import Sampler

In [2]:
# 난수 고정
def set_seed(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)  # if use multi-GPU
    random.seed(random_seed)
    np.random.seed(random_seed)
    
set_seed(42) # magic number :)

In [3]:
print ("PyTorch version:[%s]."%(torch.__version__))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print ("device:[%s]."%(device))

PyTorch version:[1.7.1].
device:[cuda:0].


## Training

In [4]:
from transformers import AutoModel, RobertaPreTrainedModel, RobertaModel

In [5]:
class RoBertaEncoder(RobertaPreTrainedModel):
    def __init__(self, config):
        super(RoBertaEncoder, self).__init__(config)

        self.roberta = RobertaModel(config)
        self.init_weights()
        classifier_dropout=(
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = torch.nn.Dropout(classifier_dropout)
        self.linear = torch.nn.Linear(config.hidden_size, 1)
      
    def forward(
            self,
            input_ids, 
            attention_mask=None,
            #token_type_ids=None
        ): 

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            #token_type_ids=token_type_ids
        )
        
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        output = self.linear(pooled_output)
        return output

In [6]:
dataset = load_from_disk('/opt/ml/data/train_dataset')
train_dataset = dataset['train']

In [7]:
class CustomSampler(Sampler) :
    def __init__(self, data_source, batch_size) :
        self.data_source = data_source
        self.batch_size = batch_size

    def __iter__(self) :
        n = len(self.data_source)
        index_list = []
        while True :
            out = True
            for i in range(self.batch_size) :
                tmp_data = random.randint(0, n-1)
                index_list.append(tmp_data)
            for f, s in zip(index_list, index_list[1:]) :
                if abs(s-f) <= 2 :
                    out = False
            if out == True :
                break

        while True : # 추가 삽입
            tmp_data = random.randint(0, n-1)
            if (tmp_data not in index_list) and \
                (abs(tmp_data-index_list[-i]) > 2 for i in range(1,self.batch_size+1)) \
            : 
                index_list.append(tmp_data)
            if len(index_list) == n :
                break
        return iter(index_list)

    def __len__(self) :
        return len(self.data_source)

In [8]:
# Anwer
class DenseRetrieval:
    def __init__(self,
        args,
        dataset,
        tokenizer,
        cross_encoder,
        sampler
    ):
        """
        학습과 추론에 사용될 여러 셋업을 마쳐봅시다.
        """

        self.args = args
        self.dataset = dataset

        self.tokenizer = tokenizer
        self.cross_encoder = cross_encoder
        self.sampler = sampler

    def train(self, args=None, tokenizer = None):
        if args is None:
            args = self.args
        if tokenizer is None :
            tokenizer = self.tokenizer
        
        tokenized_examples = tokenizer(
            self.dataset['question'],
            self.dataset['context'],
            truncation="only_second",
            max_length=512,
            stride=128,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_token_type_ids=False,  # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다.
            padding="max_length",
            return_tensors='pt'
        )

        train_dataset = TensorDataset(
            tokenized_examples['input_ids'],
            tokenized_examples['attention_mask'],
            # tokenized_examples['token_type_ids']
        )

        sampler = self.sampler(train_dataset, args.per_device_train_batch_size)
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=args.per_device_train_batch_size,
                                      sampler = sampler,
                                      drop_last = True)
                                      
        no_decay = ["bias" ,"LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {"params": [p for n, p in self.cross_encoder.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay},
            {"params": [p for n, p in self.cross_encoder.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            # eps=args.adam_epsilon
        )

        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
        
        self.cross_encoder.zero_grad()
        
        train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
        self.cross_encoder.train()
        for epoch, _ in enumerate(train_iterator) :
            epoch_iterator = tqdm(train_dataloader, desc = 'Iteration')
            losses = 0
            for step, batch in enumerate(epoch_iterator) :
                # if torch.cuda.is_available() :
                #     batch = tuple(t.cuda() for t in batch)
                
                cross_inputs = {
                    'input_ids': batch[0],
                    'attention_mask' : batch[1],
                    # 'token_type_ids' : batch[2]
                }
                for k in cross_inputs.keys() :
                    cross_inputs[k] = cross_inputs[k].tolist()

                new_input_ids = []
                new_attention_mask = []
                # new_token_type_ids = []
                for i in range(len(cross_inputs['input_ids'])) :
                    sep_index = cross_inputs['input_ids'][i].index(tokenizer.sep_token_id) # [SEP] token의 index

                    for j in range(len(cross_inputs['input_ids'])) :
                        query_id = cross_inputs['input_ids'][i][:sep_index]
                        query_att = cross_inputs['attention_mask'][i][:sep_index]
                        # query_tok = cross_inputs['token_type_ids'][i][:sep_index]
        
                        context_id = cross_inputs['input_ids'][j][sep_index:]
                        context_att = cross_inputs['attention_mask'][j][sep_index:]
                        # context_tok = cross_inputs['token_type_ids'][j][sep_index:]
                        query_id.extend(context_id)
                        query_att.extend(context_att)
                        #query_tok.extend(context_tok)
                        new_input_ids.append(query_id)
                        new_attention_mask.append(query_att)
                        #new_token_type_ids.append(query_tok)

                change_cross_inputs = {
                    'input_ids' : torch.tensor(new_input_ids).to('cuda'),
                    'attention_mask' : torch.tensor(new_attention_mask).to('cuda'),
                    #'token_type_ids' : torch.tensor(new_token_type_ids).to('cuda')
                }

                cross_output = self.cross_encoder(**change_cross_inputs)
                cross_output = cross_output.view(-1, args.per_device_train_batch_size)
                targets = torch.arange(0, args.per_device_train_batch_size).long()
                                
                if torch.cuda.is_available():
                    targets = targets.to('cuda')

                score = F.log_softmax(cross_output, dim = 1)
                loss = F.nll_loss(score, targets)
                #########################No ACCUMULATION#########################
                # losses += loss.item()
                # if step % 100 == 0 :
                #     print(f'{epoch}epoch loss: {losses/(step+1)}') # Accumulation할 경우 주석처리
                
                # self.cross_encoder.zero_grad()
                # loss.backward()
                # optimizer.step()
                # scheduler.step()
                #################################################################

                #############################ACCUMULATION#########################
                loss.backward()
                if (step+1) % args.gradient_accumulation_steps == 0 :
                    optimizer.step()
                    scheduler.step()
                    self.cross_encoder.zero_grad()

                losses += loss.item()
                if (step+1) % 100 == 0 :
                    train_loss = losses / 100
                    print(f'training loss: {train_loss:4.4}')
                    losses = 0
                ##################################################################
        
        return self.cross_encoder

In [9]:
args = TrainingArguments(
    output_dir="dense_retireval",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=20,
    weight_decay=0.01
)
model_checkpoint = "klue/roberta-base"

# 혹시 위에서 사용한 encoder가 있다면 주석처리 후 진행해주세요 (CUDA ...)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
cross_encoder = RoBertaEncoder.from_pretrained(model_checkpoint).to('cuda')

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RoBertaEncoder: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RoBertaEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RoBertaEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RoBertaEncoder were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'linear.bias', 'roberta.pooler.dense.weight', 'linear.weight']
You should probably TRAIN this model on a d

In [10]:
# Retriever는 아래와 같이 사용할 수 있도록 코드를 짜봅시다.
retriever = DenseRetrieval(
    args=args,
    dataset=train_dataset,
    tokenizer=tokenizer,
    cross_encoder=cross_encoder,
    sampler = CustomSampler
)
c_encoder = retriever.train()

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 1.354
training loss: 0.5151
training loss: 0.09492
training loss: 0.03967
training loss: 0.03514
training loss: 0.0409
training loss: 0.01585
training loss: 0.01021
training loss: 0.01734
training loss: 0.01556
training loss: 0.01692
training loss: 0.006818
training loss: 0.005023
training loss: 0.006132


Epoch:   5%|▌         | 1/20 [12:08<3:50:37, 728.27s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 0.005817
training loss: 0.003975
training loss: 0.003885
training loss: 0.002478
training loss: 0.01263
training loss: 0.004545
training loss: 0.003227
training loss: 0.00351
training loss: 0.00366
training loss: 0.004688
training loss: 0.001338
training loss: 0.001578
training loss: 0.002402
training loss: 0.002075


Epoch:  10%|█         | 2/20 [24:12<3:38:07, 727.10s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 0.004777
training loss: 0.001582
training loss: 0.0003972
training loss: 0.002369
training loss: 0.002209
training loss: 0.002854
training loss: 0.001803
training loss: 0.00767
training loss: 0.002256
training loss: 0.001261
training loss: 0.006984
training loss: 0.003575
training loss: 0.004674
training loss: 0.005551


Epoch:  15%|█▌        | 3/20 [36:17<3:25:48, 726.39s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 0.001112
training loss: 0.01363
training loss: 0.006942
training loss: 0.005412
training loss: 0.002047
training loss: 0.01009
training loss: 0.004809
training loss: 0.003843
training loss: 0.002755
training loss: 0.01066
training loss: 0.001147
training loss: 0.0122
training loss: 0.003787
training loss: 0.0008652


Epoch:  20%|██        | 4/20 [48:23<3:13:38, 726.17s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 0.002253
training loss: 0.005388
training loss: 0.001857
training loss: 0.000212
training loss: 0.001442
training loss: 0.001998
training loss: 0.0003854
training loss: 0.002335
training loss: 0.0007804
training loss: 0.002007
training loss: 0.0008626
training loss: 0.002727
training loss: 0.0008021
training loss: 0.0009641


Epoch:  25%|██▌       | 5/20 [1:00:28<3:01:27, 725.86s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 0.000129
training loss: 0.0004826
training loss: 0.0001028
training loss: 0.0004118
training loss: 0.0009918
training loss: 0.0003398
training loss: 0.001857
training loss: 0.0002163
training loss: 0.004773
training loss: 0.001507
training loss: 0.001404
training loss: 0.000603
training loss: 0.001065
training loss: 0.0005819


Epoch:  30%|███       | 6/20 [1:12:32<2:49:16, 725.43s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 0.007248
training loss: 0.00392
training loss: 0.00227
training loss: 0.005356
training loss: 0.007034
training loss: 0.0003389
training loss: 0.001003
training loss: 0.000507
training loss: 0.0003259
training loss: 0.0006642
training loss: 0.002294
training loss: 0.001004
training loss: 0.007388
training loss: 0.003037


Epoch:  35%|███▌      | 7/20 [1:24:36<2:37:05, 725.06s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 0.002674
training loss: 0.001258
training loss: 0.002125
training loss: 0.0006727
training loss: 0.0005269
training loss: 0.002655
training loss: 0.0001712
training loss: 0.0008697
training loss: 5.653e-05
training loss: 0.002072
training loss: 0.001205
training loss: 0.0003903
training loss: 0.0008543
training loss: 0.006115


Epoch:  40%|████      | 8/20 [1:36:41<2:24:59, 724.99s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 0.009351
training loss: 0.001778
training loss: 0.002707
training loss: 0.00589
training loss: 0.002226
training loss: 0.0003075
training loss: 0.001037
training loss: 0.001926
training loss: 0.0003024
training loss: 0.0006375
training loss: 5.438e-05
training loss: 0.0005331
training loss: 0.0001851
training loss: 8.405e-05


Epoch:  45%|████▌     | 9/20 [1:49:58<2:16:52, 746.61s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 0.004367
training loss: 0.002008
training loss: 0.004158
training loss: 0.000341
training loss: 0.0001899
training loss: 0.00496
training loss: 0.002712
training loss: 0.001252
training loss: 0.0007909
training loss: 5.746e-05
training loss: 0.0001739
training loss: 0.009995
training loss: 0.001269
training loss: 0.0055


Epoch:  50%|█████     | 10/20 [2:03:17<2:07:01, 762.13s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 0.0008144
training loss: 0.0003125
training loss: 0.00473
training loss: 0.003926
training loss: 0.000619
training loss: 0.0007535
training loss: 0.01494
training loss: 0.003071
training loss: 0.009061
training loss: 0.004217
training loss: 0.002277
training loss: 0.0002549
training loss: 0.001543
training loss: 0.007087


Epoch:  55%|█████▌    | 11/20 [2:25:11<2:19:09, 927.73s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 0.0003072
training loss: 0.0004532
training loss: 0.0001933
training loss: 0.0001448
training loss: 0.0002347
training loss: 0.0001212
training loss: 0.001797
training loss: 8.453e-05
training loss: 0.001962
training loss: 0.01004
training loss: 0.008072
training loss: 0.0004092
training loss: 0.003945
training loss: 0.0003344


Epoch:  60%|██████    | 12/20 [2:39:45<2:01:33, 911.73s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 0.0001777
training loss: 0.0006308
training loss: 0.0004805
training loss: 7.521e-05
training loss: 0.002287
training loss: 4.585e-05
training loss: 0.0004025
training loss: 0.0001202
training loss: 5.318e-05
training loss: 0.001971
training loss: 0.0001054
training loss: 7.598e-05
training loss: 0.0005939
training loss: 3.296e-05


Epoch:  65%|██████▌   | 13/20 [2:56:44<1:50:06, 943.77s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 2.583e-05
training loss: 0.0003872
training loss: 0.000134
training loss: 0.0002094
training loss: 7.105e-05
training loss: 3.888e-05
training loss: 0.0001751
training loss: 0.0001406
training loss: 2.384e-05
training loss: 1.767e-05
training loss: 0.0001308
training loss: 0.005513
training loss: 0.000349
training loss: 0.0005394


Epoch:  70%|███████   | 14/20 [3:14:04<1:37:16, 972.67s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 0.0003621
training loss: 0.0002231
training loss: 0.02473
training loss: 0.005162
training loss: 0.001391
training loss: 0.0004777
training loss: 0.0002516
training loss: 0.00984
training loss: 0.01278
training loss: 0.003715
training loss: 0.0004995
training loss: 0.00438
training loss: 0.0006088
training loss: 0.0002072


Epoch:  75%|███████▌  | 15/20 [3:26:07<1:14:49, 897.95s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 0.0001645
training loss: 0.0001586
training loss: 0.0002989
training loss: 0.0003386
training loss: 0.0002285
training loss: 0.0008457
training loss: 5.035e-05
training loss: 4.157e-05
training loss: 8.652e-05
training loss: 6.556e-05
training loss: 0.0001278
training loss: 0.000269
training loss: 2.526e-05
training loss: 0.000282


Epoch:  80%|████████  | 16/20 [3:38:11<56:22, 845.61s/it]  




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 9.806e-05
training loss: 0.001152
training loss: 5.189e-05
training loss: 0.00363
training loss: 0.0001025
training loss: 0.0001656
training loss: 0.002554
training loss: 2.332e-05
training loss: 0.000737
training loss: 0.0003363
training loss: 4.891e-05
training loss: 0.0002088
training loss: 5.881e-05
training loss: 0.00249


Epoch:  85%|████████▌ | 17/20 [3:50:15<40:27, 809.26s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 5.019e-05
training loss: 2.555e-05
training loss: 0.0001633
training loss: 8.922e-05
training loss: 0.001392
training loss: 5.31e-05
training loss: 3.285e-05
training loss: 0.00102
training loss: 0.005623
training loss: 1.856e-05
training loss: 0.00108
training loss: 0.0003001
training loss: 0.001687
training loss: 0.004968


Epoch:  90%|█████████ | 18/20 [4:02:19<26:07, 783.74s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 0.0003317
training loss: 1.468e-05
training loss: 0.004599
training loss: 2.777e-05
training loss: 3.829e-05
training loss: 0.0001469
training loss: 0.0001087
training loss: 0.0002238
training loss: 0.0002038
training loss: 0.0001487
training loss: 0.0005938
training loss: 5.553e-05
training loss: 0.0009226
training loss: 0.0009183


Epoch:  95%|█████████▌| 19/20 [4:14:23<12:45, 765.73s/it]




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1442.0, style=ProgressStyle(description_w…

training loss: 4.58e-05
training loss: 0.0004138
training loss: 2.674e-05
training loss: 0.0007351
training loss: 4.6e-05
training loss: 0.0002842
training loss: 3.887e-05
training loss: 7.635e-05
training loss: 7.302e-05
training loss: 2.911e-05
training loss: 0.0003286
training loss: 3.881e-05
training loss: 5.031e-05
training loss: 0.001332


Epoch: 100%|██████████| 20/20 [4:26:28<00:00, 799.40s/it]







In [11]:
torch.save(c_encoder, '/opt/ml/custom/c_roberta_encoder_e40_b16.pt')

## 실험

In [11]:
valid_corpus = list(set([example['context'] for example in dataset['validation']]))[:10]
sample_idx = random.choice(range(len(dataset['validation'])))
query = dataset['validation'][sample_idx]['question']
ground_truth = dataset['validation'][sample_idx]['context']

if not ground_truth in valid_corpus:
  valid_corpus.append(ground_truth)

print(query)
print(ground_truth)

볼드윈이 "당신들은 나를 야유합니까?"라는 말을 한 연도는?
퇴임에서 볼드윈의 세월은 조용하였다. 네빌 체임벌린이 사망하면서 전쟁 이전의 유화 정책에서 볼드윈의 지각된 부분은 제2차 세계 대전이 일어난 동안과 그 후에 그를 인기없는 인물로 만들었다. 신문의 캠페인은 그를 전쟁 생산에 자신의 시골 저택의 철문을 기부하지 않은 것으로 사냥하였다. 전쟁이 일어난 동안 윈스턴 처칠은 에이먼 데 벌레라의 아일랜드의 지속적인 중립을 향한 더욱 힘든 경향을 취하는 영국의 조언에 그를 단 한번 상담하였다.\n\n1945년 6월 부인 루시 여사가 사망하였다. 이제 볼드윈 자신은 관절염을 겪어 걸어다는 데 지팡이가 필요하였다. 조지 5세의 동상의 공개식에 1947년 런던에서 자신의 최종 공개적인 출연을 이루었다. 관중들은 전직 총리를 알아주어 그를 응원하였으나 이 당시 볼드윈은 귀머거리였고, 그들에게 "당신들은 나를 야유합니까?"라고 의문하였다. 1930년 케임브리지 대학교의 총장으로 만들어진 그는 1947년 12월 14일 80세의 나이에 우스터셔주 스투어포트온세번 근처 애슬리홀에서 수면 중 자신의 사망까지 이 수용력에 지속하였다. 그는 화장되었고, 그의 재는 우스터 대성당에 안치되었다.


In [12]:
with torch.no_grad() :
    c_encoder.eval()
    
    score_list = []
    for i in range(len(valid_corpus)) :
        passage = valid_corpus[i]
        tokenized_examples = tokenizer(
            query,
            passage,
            truncation="only_second",
            max_length=512,
            stride=128,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            #return_token_type_ids=False,  # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다.
            padding="max_length",
            return_tensors='pt'
        )

        score = 0
        for i in range(len(tokenized_examples['input_ids'])) :
            c_input = {
                'input_ids' : torch.tensor(tokenized_examples['input_ids'][i].unsqueeze(dim=0)).to('cuda'),
                'attention_mask' : torch.tensor(tokenized_examples['attention_mask'][i].unsqueeze(dim=0)).to('cuda'),
                'token_type_ids' : torch.tensor(tokenized_examples['token_type_ids'][i].unsqueeze(dim=0)).to('cuda')
            }
            tmp_score = c_encoder(**c_input).to('cpu')
            score += tmp_score
        score = score / len(tokenized_examples['input_ids'])
        score_list.append(score)
    sort_result = torch.sort(torch.tensor(score_list), descending=True)

    scores, index_list = sort_result[0], sort_result[1]

  'input_ids' : torch.tensor(tokenized_examples['input_ids'][i].unsqueeze(dim=0)).to('cuda'),
  'attention_mask' : torch.tensor(tokenized_examples['attention_mask'][i].unsqueeze(dim=0)).to('cuda'),
  'token_type_ids' : torch.tensor(tokenized_examples['token_type_ids'][i].unsqueeze(dim=0)).to('cuda')


In [13]:
k = 5
print("[Search query]\n", query, "\n")
print("[Ground truth passage]")
print(ground_truth, "\n")

for i in range(k):
  print("Top-%d passage with score %.4f" % (i+1, scores[i]))
  print(valid_corpus[index_list[i]])

[Search query]
 볼드윈이 "당신들은 나를 야유합니까?"라는 말을 한 연도는? 

[Ground truth passage]
퇴임에서 볼드윈의 세월은 조용하였다. 네빌 체임벌린이 사망하면서 전쟁 이전의 유화 정책에서 볼드윈의 지각된 부분은 제2차 세계 대전이 일어난 동안과 그 후에 그를 인기없는 인물로 만들었다. 신문의 캠페인은 그를 전쟁 생산에 자신의 시골 저택의 철문을 기부하지 않은 것으로 사냥하였다. 전쟁이 일어난 동안 윈스턴 처칠은 에이먼 데 벌레라의 아일랜드의 지속적인 중립을 향한 더욱 힘든 경향을 취하는 영국의 조언에 그를 단 한번 상담하였다.\n\n1945년 6월 부인 루시 여사가 사망하였다. 이제 볼드윈 자신은 관절염을 겪어 걸어다는 데 지팡이가 필요하였다. 조지 5세의 동상의 공개식에 1947년 런던에서 자신의 최종 공개적인 출연을 이루었다. 관중들은 전직 총리를 알아주어 그를 응원하였으나 이 당시 볼드윈은 귀머거리였고, 그들에게 "당신들은 나를 야유합니까?"라고 의문하였다. 1930년 케임브리지 대학교의 총장으로 만들어진 그는 1947년 12월 14일 80세의 나이에 우스터셔주 스투어포트온세번 근처 애슬리홀에서 수면 중 자신의 사망까지 이 수용력에 지속하였다. 그는 화장되었고, 그의 재는 우스터 대성당에 안치되었다. 

Top-1 passage with score 9.6913
퇴임에서 볼드윈의 세월은 조용하였다. 네빌 체임벌린이 사망하면서 전쟁 이전의 유화 정책에서 볼드윈의 지각된 부분은 제2차 세계 대전이 일어난 동안과 그 후에 그를 인기없는 인물로 만들었다. 신문의 캠페인은 그를 전쟁 생산에 자신의 시골 저택의 철문을 기부하지 않은 것으로 사냥하였다. 전쟁이 일어난 동안 윈스턴 처칠은 에이먼 데 벌레라의 아일랜드의 지속적인 중립을 향한 더욱 힘든 경향을 취하는 영국의 조언에 그를 단 한번 상담하였다.\n\n1945년 6월 부인 루시 여사가 사망하였다. 이제 볼드윈 자신은 관절염을 겪어 걸어다는 데 지팡이가 필요하였다. 조지 5세의

In [28]:
index_list

tensor([10,  0,  4,  1,  5,  9,  6,  3,  7,  2,  8])

In [27]:
tokenized_examples['input_ids'][0].unsqueeze(dim=0).shape

torch.Size([1, 512])

In [None]:
top_k_index_list = []
for i in range(len(index_list)) :
    temp = index_list[i][:k]
    top_k_index_list.appedn(temp)

## 실제

In [30]:
with open('/opt/ml/data/wikipedia_documents.json', "r", encoding="utf-8") as f:
    wiki = json.load(f)

corpus = list(
    dict.fromkeys([v["text"] for v in wiki.values()])
)  # set 은 매번 순서가 바뀌므로

In [32]:
question_data = dataset['validation']['question']
with torch.no_grad() :
    c_encoder.eval()

    result_scores = []
    result_indices = []
    for i in tqdm(range(len(question_data))) :
        question = question_data[i]

        question_score = []
        for i in tqdm(range(len(corpus))) :
            passage = corpus[i]
            tokenized_examples = tokenizer(
                question,
                passage,
                truncation="only_second",
                max_length=512,
                stride=128,
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
                #return_token_type_ids=False,  # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다.
                padding="max_length",
                return_tensors='pt'
            )

            score = 0
            for i in range(len(tokenized_examples['input_ids'])) :
                c_input = {
                    'input_ids' : torch.tensor(tokenized_examples['input_ids'][i].unsqueeze(dim=0)).to('cuda'),
                    'attention_mask' : torch.tensor(tokenized_examples['attention_mask'][i].unsqueeze(dim=0)).to('cuda'),
                    'token_type_ids' : torch.tensor(tokenized_examples['token_type_ids'][i].unsqueeze(dim=0)).to('cuda')
                }
                tmp_score = c_encoder(**c_input).to('cpu')
                score += tmp_score
            score = score / len(tokenized_examples['input_ids'])
            question_score.append(score)

        sort_result = torch.sort(torch.tensor(question_score), descending=True)
        scores, index_list = sort_result[0], sort_result[1]

        result_scores.append(scores)
        result_indices.append(index_list)

HBox(children=(FloatProgress(value=0.0, max=240.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=56737.0), HTML(value='')))

  'input_ids' : torch.tensor(tokenized_examples['input_ids'][i].unsqueeze(dim=0)).to('cuda'),
  'attention_mask' : torch.tensor(tokenized_examples['attention_mask'][i].unsqueeze(dim=0)).to('cuda'),
  'token_type_ids' : torch.tensor(tokenized_examples['token_type_ids'][i].unsqueeze(dim=0)).to('cuda')


KeyboardInterrupt: 

In [None]:
top_k_index_list = []
for i in range(len(index_list)) :
    temp = index_list[i][:k]
    top_k_index_list.appedn(temp)

In [None]:
total = []
for idx, example in enumerate(
        tqdm(dataset['validation'], desc="Dense retrieval: ")
    ):
        tmp = {
            # Query와 해당 id를 반환합니다.
            "question": example["question"],
            "id": example["id"],
            # Retrieve한 Passage의 id, context를 반환합니다.
            "context_id": top_k_index_list[idx],
            "context": " ".join(  # 기존에는 ' '.join()
                [corpus[pid] for pid in top_k_index_list[idx]]
            ),
        }
        if "context" in example.keys() and "answers" in example.keys():
            # validation 데이터를 사용하면 ground_truth context와 answer도 반환합니다.
            tmp["original_context"] = example["context"]
            tmp["answers"] = example["answers"]
        total.append(tmp)

cqas_100 = pd.DataFrame(total)

## Elastic

In [11]:
dataset = load_from_disk('/opt/ml/data/train_dataset')
train_dataset = dataset['train']

In [12]:
data = pd.read_csv('/opt/ml/custom/top100_wikipedia.csv')

In [13]:
doc_indices = []
for i in range(len(data)) :
    tmp = eval(data['document_id'][i])
    doc_indices.append(tmp)

In [14]:
with open('/opt/ml/data/wikipedia_documents.json', "r", encoding="utf-8") as f:
    wiki = json.load(f)

corpus = []
for v in wiki.values() :
    corpus.append(v['text'])

In [16]:
class BertEncoder(BertPreTrainedModel):
    def __init__(self, config):
        super(BertEncoder, self).__init__(config)

        self.bert = BertModel(config)
        self.init_weights()
        classifier_dropout=(
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = torch.nn.Dropout(classifier_dropout)
        self.linear = torch.nn.Linear(config.hidden_size, 1)
      
    def forward(
            self,
            input_ids, 
            attention_mask=None,
            token_type_ids=None
        ): 

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        output = self.linear(pooled_output)
        return output

In [12]:
c_encoder = torch.load('/opt/ml/custom/c_encoder_e20.pt')

In [15]:
model_checkpoint = "klue/bert-base"
# 혹시 위에서 사용한 encoder가 있다면 주석처리 후 진행해주세요 (CUDA ...)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
question_data = dataset['validation']['question']
with torch.no_grad() : 
    c_encoder.eval()

    result_scores = []
    result_indices = []
    for i in tqdm(range(len(question_data))) :
        question = question_data[i]
        question_score = []
        for indice in tqdm(doc_indices[i]) :
            passage = corpus[indice]
            tokenized_examples = tokenizer(
                question,
                passage,
                truncation="only_second",
                max_length=512,
                stride=128,
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
                #return_token_type_ids=False,  # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다.
                padding="max_length",
                return_tensors='pt'
            )
            score = 0
            for i in range(len(tokenized_examples['input_ids'])) :
                c_input = {
                    'input_ids' : torch.tensor(tokenized_examples['input_ids'][i].unsqueeze(dim=0)).to('cuda'),
                    'attention_mask' : torch.tensor(tokenized_examples['attention_mask'][i].unsqueeze(dim=0)).to('cuda'),
                    'token_type_ids' : torch.tensor(tokenized_examples['token_type_ids'][i].unsqueeze(dim=0)).to('cuda')
                }
                tmp_score = c_encoder(**c_input).to('cpu')
                score += tmp_score
            score = score / len(tokenized_examples['input_ids'])
            question_score.append(score)
        sort_result = torch.sort(torch.tensor(question_score), descending=True)
        scores, index_list = sort_result[0], sort_result[1]

        result_scores.append(scores.tolist())
        result_indices.append(index_list.tolist())        

In [33]:
final_indices = []
for i in range(len(doc_indices)) :
    t_list = [doc_indices[i][result_indices[i][k]] for k in range(7)]
    final_indices.append(t_list)

In [34]:
total = []
for idx, example in enumerate(
        tqdm(dataset['validation'], desc="Dense retrieval: ")
    ):
        tmp = {
            # Query와 해당 id를 반환합니다.
            "question": example["question"],
            "id": example["id"],
            # Retrieve한 Passage의 id, context를 반환합니다.
            "context_id": final_indices[idx],
            "context": " ".join(  # 기존에는 ' '.join()
                [corpus[pid] for pid in final_indices[idx]]
            ),
        }
        if "context" in example.keys() and "answers" in example.keys():
            # validation 데이터를 사용하면 ground_truth context와 answer도 반환합니다.
            tmp["original_context"] = example["context"]
            tmp["answers"] = example["answers"]
        total.append(tmp)

cqas_50 = pd.DataFrame(total)

HBox(children=(FloatProgress(value=0.0, description='Dense retrieval: ', max=240.0, style=ProgressStyle(descri…




In [35]:
correct_length = []
for i in range(len(cqas_50)) :
    if cqas_50['original_context'][i] in cqas_50['context'][i] :
        correct_length.append(i)
print(len(correct_length) / len(dataset['validation']))

0.875


In [36]:
cqas_50.to_csv('b16_special_shuffle_elastic_ce40_t7.csv', index = False)

## Test

In [7]:
dataset = load_from_disk('/opt/ml/data/test_dataset')

In [8]:
data = pd.read_csv('/opt/ml/custom/test_elastic_top100.csv')

doc_indices = []
for i in range(len(data)) :
    tmp = eval(data['document_id'][i])
    doc_indices.append(tmp)

In [9]:
# test에 대해서만 실행
for i in tqdm(range(len(doc_indices))) :
    for j in range(len(doc_indices[i])) :
        doc_indices[i][j] = int(doc_indices[i][j])

HBox(children=(FloatProgress(value=0.0, max=600.0), HTML(value='')))




In [10]:
with open('/opt/ml/data/wikipedia_documents.json', "r", encoding="utf-8") as f:
    wiki = json.load(f)

corpus = []
for v in wiki.values() :
    corpus.append(v['text'])

In [11]:
class BertEncoder(BertPreTrainedModel):
    def __init__(self, config):
        super(BertEncoder, self).__init__(config)

        self.bert = BertModel(config)
        self.init_weights()
        classifier_dropout=(
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = torch.nn.Dropout(classifier_dropout)
        self.linear = torch.nn.Linear(config.hidden_size, 1)
      
    def forward(
            self,
            input_ids, 
            attention_mask=None,
            token_type_ids=None
        ): 

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        output = self.linear(pooled_output)
        return output

In [12]:
c_encoder = torch.load('/opt/ml/custom/c_encoder_e40_b16.pt')
model_checkpoint = "klue/bert-base"
# 혹시 위에서 사용한 encoder가 있다면 주석처리 후 진행해주세요 (CUDA ...)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
question_data = dataset['validation']['question']
with torch.no_grad() : 
    c_encoder.eval()

    result_scores = []
    result_indices = []
    for i in tqdm(range(len(question_data))) :
        question = question_data[i]
        question_score = []
        for indice in tqdm(doc_indices[i]) :
            passage = corpus[int(indice)]
            tokenized_examples = tokenizer(
                question,
                passage,
                truncation="only_second",
                max_length=512,
                stride=128,
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
                #return_token_type_ids=False,  # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다.
                padding="max_length",
                return_tensors='pt'
            )
            score = 0
            for i in range(len(tokenized_examples['input_ids'])) :
                c_input = {
                    'input_ids' : torch.tensor(tokenized_examples['input_ids'][i].unsqueeze(dim=0)).to('cuda'),
                    'attention_mask' : torch.tensor(tokenized_examples['attention_mask'][i].unsqueeze(dim=0)).to('cuda'),
                    'token_type_ids' : torch.tensor(tokenized_examples['token_type_ids'][i].unsqueeze(dim=0)).to('cuda')
                }
                tmp_score = c_encoder(**c_input).to('cpu')
                score += tmp_score
            score = score / len(tokenized_examples['input_ids'])
            question_score.append(score)
        sort_result = torch.sort(torch.tensor(question_score), descending=True)
        scores, index_list = sort_result[0], sort_result[1]

        result_scores.append(scores.tolist())
        result_indices.append(index_list.tolist())        

### result_indices 저장 및 불러오기

In [26]:
import csv
with open('listfile.csv', 'w', newline='') as f: 
    writer = csv.writer(f)
    writer.writerow(result_indices)



In [29]:
with open('listfile.csv', 'r', encoding='utf-8') as f:
    rdr = csv.reader(f)
    for i, line in enumerate(rdr) :
        if i == 0 :
            kk = line

### 끝

In [14]:
final_indices = []
for i in range(len(doc_indices)) :
    t_list = [doc_indices[i][result_indices[i][k]] for k in range(5)]
    final_indices.append(t_list)

In [15]:
total = []
for idx, example in enumerate(
        tqdm(dataset['validation'], desc="Dense retrieval: ")
    ):
        tmp = {
            # Query와 해당 id를 반환합니다.
            "question": example["question"],
            "id": example["id"],
            # Retrieve한 Passage의 id, context를 반환합니다.
            "context_id": final_indices[idx],
            "context": [corpus[pid] for pid in final_indices[idx]]
        }
        if "context" in example.keys() and "answers" in example.keys():
            # validation 데이터를 사용하면 ground_truth context와 answer도 반환합니다.
            tmp["original_context"] = example["context"]
            tmp["answers"] = example["answers"]
        total.append(tmp)

cqas_50 = pd.DataFrame(total)

HBox(children=(FloatProgress(value=0.0, description='Dense retrieval: ', max=600.0, style=ProgressStyle(descri…




In [24]:
cqas_50.to_csv('elastic_crossencoder.csv', index = False)

In [59]:
total = []
for idx, example in enumerate(
        tqdm(dataset['validation'], desc="Dense retrieval: ")
    ):
        tmp = {
            # Query와 해당 id를 반환합니다.
            "question": example["question"],
            "id": example["id"],
            # Retrieve한 Passage의 id, context를 반환합니다.
            "context_id": final_indices[idx],
            "context": " ".join(  # 기존에는 ' '.join()
                [corpus[pid] for pid in final_indices[idx]]
            ),
        }
        if "context" in example.keys() and "answers" in example.keys():
            # validation 데이터를 사용하면 ground_truth context와 answer도 반환합니다.
            tmp["original_context"] = example["context"]
            tmp["answers"] = example["answers"]
        total.append(tmp)

cqas_50 = pd.DataFrame(total)

HBox(children=(FloatProgress(value=0.0, description='Dense retrieval: ', max=600.0, style=ProgressStyle(descri…




In [None]:
correct_length = []
for i in range(len(cqas_50)) :
    if cqas_50['original_context'][i] in cqas_50['context'][i] :
        correct_length.append(i)
print(len(correct_length) / len(dataset['validation']))

In [62]:
cqas_50.to_csv('test_b16_special_shuffle_elastic_ce40_t5.csv', index = False)

In [2]:
import pandas as pd
df = pd.read_csv("/opt/ml/custom/test_b16_special_shuffle_elastic_ce40_t5.csv")

In [None]:
for i in range(len(df)):\
    df["context_id"][i] = eval(df["context_id"][i])
    df["answers"][i] = eval(df["answers"][i])

In [1]:
import pandas as pd
dddd = pd.read_csv('/opt/ml/data/train_dataset/Aug_Encoder.csv')

In [3]:
len(dddd)

32689