In [1]:
import json
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from tqdm import trange

from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    BertModel, RobertaModel,
    BertPreTrainedModel,
    AdamW, get_linear_schedule_with_warmup,
    TrainingArguments,
)
from datasets import (
    Dataset,
    load_from_disk,
    concatenate_datasets,
)

from typing import List
from torch.utils.data import Sampler

In [2]:
# 난수 고정
def set_seed(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)  # if use multi-GPU
    random.seed(random_seed)
    np.random.seed(random_seed)
    
set_seed(42) # magic number :)

In [3]:
print ("PyTorch version:[%s]."%(torch.__version__))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print ("device:[%s]."%(device))

PyTorch version:[1.7.1].
device:[cuda:0].


## Training

In [4]:
class BertEncoder(BertPreTrainedModel):
    def __init__(self, config):
        super(BertEncoder, self).__init__(config)

        self.bert = BertModel(config)
        self.init_weights()
        classifier_dropout=(
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = torch.nn.Dropout(classifier_dropout)
        self.linear = torch.nn.Linear(config.hidden_size, 1)
      
    def forward(
            self,
            input_ids, 
            attention_mask=None,
            token_type_ids=None
        ): 

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        output = self.linear(pooled_output)
        return output

In [5]:
model_checkpoint = "klue/bert-base"

cross_encoder = BertEncoder.from_pretrained(model_checkpoint).to('cuda')
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertEncoder: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertEncoder were not initialized from the model checkpoint at klue/bert-base and are newly initialized: 

In [6]:
dataset = load_from_disk('/opt/ml/data/train_dataset')
train_dataset = dataset['train']

In [7]:
class CustomSampler(Sampler) :
    def __init__(self, data_source, batch_size) :
        self.data_source = data_source
        self.batch_size = batch_size

    def __iter__(self) :
        n = len(self.data_source)
        index_list = []
        while True :
            out = True
            for i in range(self.batch_size) :
                tmp_data = random.randint(0, n-1)
                index_list.append(tmp_data)
            for f, s in zip(index_list, index_list[1:]) :
                if abs(s-f) <= 2 :
                    out = False
            if out == True :
                break

        while True : # 추가 삽입
            tmp_data = random.randint(0, n-1)
            if (tmp_data not in index_list) and \
                (abs(tmp_data-index_list[-i]) > 2 for i in range(1,self.batch_size+1)) \
            : 
                index_list.append(tmp_data)
            if len(index_list) == n :
                break
        return iter(index_list)

    def __len__(self) :
        return len(self.data_source)

In [14]:
tokenized_examples = tokenizer(
    train_dataset['question'][0],
    train_dataset['context'][0],
    truncation="only_second",
    max_length=384,
    stride=128,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    #return_token_type_ids=False,  # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다.
    padding="max_length",
    return_tensors='pt'
)

In [18]:
j = 0
c_input = {
    'input_ids' : tokenized_examples['input_ids'][j],
    'attention_mask' : tokenized_examples['attention_mask'][j],
    'token_type_ids' : tokenized_examples['token_type_ids'][j]
}

In [23]:
a = [10, 7, 3, 98]
aa = torch.tensor(a)

In [28]:
torch.sort(aa, descending = True)

torch.return_types.sort(
values=tensor([98, 10,  7,  3]),
indices=tensor([3, 0, 1, 2]))

In [16]:
len(tokenized_examples)

5

In [8]:
tokenized_examples = tokenizer(
    train_dataset['question'],
    train_dataset['context'],
    truncation="only_second",
    max_length=512,
    stride=128,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    #return_token_type_ids=False,  # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다.
    padding="max_length",
    return_tensors='pt'
)
t_dataset = TensorDataset(
    tokenized_examples['input_ids'],
    tokenized_examples['attention_mask'],
    tokenized_examples['token_type_ids']
)
sampler = CustomSampler(t_dataset, 4)
train_dataloader = DataLoader(
    t_dataset,
    batch_size=4,
    sampler=sampler
)

In [9]:
batch = next(iter(train_dataloader))
# if torch.cuda.is_available():
#     batch = tuple(t.cuda() for t in batch)
cross_inputs = {
    'input_ids' : batch[0],
    'attention_mask' : batch[1],
    'token_type_ids' : batch[2]
}
for k in cross_inputs.keys() :
    cross_inputs[k] = cross_inputs[k].tolist()

In [10]:
new_input_ids = []
new_attention_mask = []
new_token_type_ids = []
targets = []
for i in range(len(cross_inputs['input_ids'])) :
    sep_index = cross_inputs['input_ids'][i].index(3) # [SEP] token의 index

    for j in range(len(cross_inputs['input_ids'])) :
        query_id = cross_inputs['input_ids'][i][:sep_index]
        query_att = cross_inputs['attention_mask'][i][:sep_index]
        query_tok = cross_inputs['token_type_ids'][i][:sep_index]
        if i == j :
            targets.append(1)
        else :
            targets.append(0)
        context_id = cross_inputs['input_ids'][j][sep_index:]
        context_att = cross_inputs['attention_mask'][j][sep_index:]
        context_tok = cross_inputs['token_type_ids'][j][sep_index:]
        query_id.extend(context_id)
        query_att.extend(context_att)
        query_tok.extend(context_tok)
        new_input_ids.append(query_id)
        new_attention_mask.append(query_att)
        new_token_type_ids.append(query_tok)

In [11]:
change_cross_inputs = {
    'input_ids' : torch.tensor(new_input_ids).to('cuda'),
    'attention_mask' : torch.tensor(new_attention_mask).to('cuda'),
    'token_type_ids' : torch.tensor(new_token_type_ids).to('cuda')
}

In [13]:
cross_output = cross_encoder(**change_cross_inputs)

In [14]:
targets = torch.tensor(targets).to('cuda')

In [18]:
cross_output = cross_output.view(-1, 4)
targets = torch.arange(0, 4).long()
                
if torch.cuda.is_available():
    targets = targets.to('cuda')

score = F.log_softmax(cross_output, dim = 1)

loss = F.nll_loss(score, targets)

In [21]:
a = cross_output.view(-1, len(targets))

In [20]:
cross_output.squeeze().shape

torch.Size([16])

In [22]:
a

tensor([[ 0.2821, -1.2217, -0.4544, -0.9775, -0.6063, -1.0076, -0.8091, -0.4097,
         -0.9165, -1.4223, -0.0757,  0.0267, -1.0324, -1.3876, -1.1907,  0.1837]],
       device='cuda:0', grad_fn=<ViewBackward>)

In [19]:
targets.shape

torch.Size([16])

In [23]:
loss = torch.nn.CrossEntropyLoss()
loss(a, targets)

ValueError: Expected input batch_size (1) to match target batch_size (16).

In [32]:
targets.unsqueeze(dim=0)

tensor([[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]], device='cuda:0')

In [33]:
torch.nn.CrossEntropyLoss(a, targets.unsqueeze(dim=0))

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [29]:
print(torch.nn.CrossEntropyLoss(a, targets))

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [17]:
a

tensor([[ 0.2821, -1.2217, -0.4544, -0.9775, -0.6063, -1.0076, -0.8091, -0.4097,
         -0.9165, -1.4223, -0.0757,  0.0267, -1.0324, -1.3876, -1.1907,  0.1837]],
       device='cuda:0', grad_fn=<ViewBackward>)

In [14]:
cross_output.shape

torch.Size([16, 1])

In [18]:
a.dim()

2

In [21]:
a

RuntimeError: CUDA error: device-side assert triggered

In [19]:
targets.shape

torch.Size([16])

In [20]:
loss = F.nll_loss(cross_output, targets)

/pytorch/aten/src/THCUNN/ClassNLLCriterion.cu:108: cunn_ClassNLLCriterion_updateOutput_kernel: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/THCUNN/ClassNLLCriterion.cu:108: cunn_ClassNLLCriterion_updateOutput_kernel: block: [0,0,0], thread: [5,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/THCUNN/ClassNLLCriterion.cu:108: cunn_ClassNLLCriterion_updateOutput_kernel: block: [0,0,0], thread: [10,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/THCUNN/ClassNLLCriterion.cu:108: cunn_ClassNLLCriterion_updateOutput_kernel: block: [0,0,0], thread: [15,0,0] Assertion `t >= 0 && t < n_classes` failed.


In [18]:
loss

RuntimeError: CUDA error: device-side assert triggered

In [31]:
score.shape

torch.Size([16, 1])

In [33]:
cross_output

tensor([[-0.1685],
        [ 0.2362],
        [ 0.2308],
        [ 0.0576],
        [ 0.3606],
        [-0.5582],
        [ 0.3576],
        [ 0.1884],
        [ 0.2598],
        [ 0.3477],
        [-0.0143],
        [ 0.0780],
        [ 0.0130],
        [ 0.0984],
        [ 0.1371],
        [-0.7408]], device='cuda:0', grad_fn=<AddmmBackward>)

In [48]:
sep_index = cross_inputs['input_ids'][1].index(2) # [SEP] token의 index
query = cross_inputs['input_ids'][1][:sep_index]
context = cross_inputs['input_ids'][1][sep_index:]
query.extend(context)

In [49]:
query == cross_inputs['input_ids'][1]

True

In [None]:
str(a_list)

In [41]:
cross_inputs['input_ids'][1].index(2)

AttributeError: 'Tensor' object has no attribute 'index'

In [None]:
with torch.no_grad() :
    c_encoder.eval()
    
    score_list = []
    for i in range(len(valid_corpus)) :
        passage = valid_corpus[i]
        tokenized_examples = tokenizer(
            query,
            passage,
            truncation="only_second",
            max_length=512,
            stride=128,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            #return_token_type_ids=False,  # roberta모델을 사용할 경우 False, bert를 사용할 경우 True로 표기해야합니다.
            padding="max_length",
            return_tensors='pt'
        )

        score = 0
        for i in range(len(tokenized_examples['input_ids'])) :
            c_input = {
                'input_ids' : torch.tensor(tokenized_examples['input_ids'][i].unsqueeze(dim=0)).to('cuda'),
                'attention_mask' : torch.tensor(tokenized_examples['attention_mask'][i].unsqueeze(dim=0)).to('cuda'),
                'token_type_ids' : torch.tensor(tokenized_examples['token_type_ids'][i].unsqueeze(dim=0)).to('cuda')
            }
            tmp_score = c_encoder(**c_input).to('cpu')
            score += tmp_score
        score = score / len(tokenized_examples['input_ids'])
        score_list.append(score)
    sort_result = torch.sort(torch.tensor(score_list), descending=True)

    scores, index_list = sort_result[0], sort_result[1]

In [31]:
(20 * 240) / 60 / 24

3.3333333333333335