# 1. Custom Head Layer Test

## 1.1 준비

### 1.1.1 Requirements

In [32]:
import sys
sys.path.append('..')

import os
import random
import numpy as np
from tqdm import tqdm
from importlib import import_module

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    DataCollatorWithPadding,
    AdamW,
    TrainingArguments,
    HfArgumentParser
)

from datasets import load_from_disk, load_metric

from preprocessor import BaselinePreprocessor
from postprocessor import post_processing_function

### 1.1.2 Tokenizer, Model

In [33]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
# NOTE(review): `device` is only defined here; nothing in this cell moves the model onto it.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Checkpoint name shared by the tokenizer, the config and the model below.
model_name = 'klue/roberta-large'

# The feature-preparation functions request offset mappings from this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
config = AutoConfig.from_pretrained(model_name)
# Extra attribute attached to the config; the custom head reads it as `config.dropout_ratio`.
config.dropout_ratio = 0.5
model = AutoModelForQuestionAnswering.from_pretrained(model_name, config=config)
# NOTE(review): the optimizer captures the parameters of *this* model instance; a later cell
# rebinds `model` and replaces `model.qa_outputs`, so re-create the optimizer before training.
optimizer = AdamW(params=model.parameters(), lr=1e-5)

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a

### 1.1.3 Datasets

In [34]:
# True when the tokenizer pads on the right; the feature-preparation functions use this
# flag to decide whether the question or the context is passed as the first sequence.
pad_on_right = tokenizer.padding_side == "right"
# Column names read from each batch of examples.
question_column_name = "question"
context_column_name = "context"
answer_column_name = "answers"

def prepare_train_features(examples):
    """Tokenize a batch of QA examples and attach start/end token labels.

    Each (question, context) pair is tokenized with truncation and a stride,
    so one long context yields several overlapping features. For every
    feature the answer's character span is converted into token indices;
    features with no answer, or whose window does not fully contain the
    answer, are labelled with the CLS token index for both positions.

    Args:
        examples: Batch indexed by column name. Reads the question, context
            and answers columns; each answers entry provides "answer_start"
            and "text" sequences, of which only the first element is used.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" and
        "offset_mapping" removed and "start_positions" / "end_positions"
        lists added (one label pair per feature).
    """
    # Sequence id that marks context tokens: 1 when the question comes first.
    context_seq_id = 1 if pad_on_right else 0
    if pad_on_right:
        first_texts = examples[question_column_name]
        second_texts = examples[context_column_name]
        truncation_mode = "only_second"
    else:
        first_texts = examples[context_column_name]
        second_texts = examples[question_column_name]
        truncation_mode = "only_first"

    # Truncate only the context, keep overflowing windows (overlapping by
    # `stride` tokens) and pad every feature to max_length.
    tokenized_examples = tokenizer(
        first_texts,
        second_texts,
        truncation=truncation_mode,
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        return_token_type_ids=False,  # False for RoBERTa models, True for BERT models.
        padding="max_length",
    )

    # Maps every produced feature back to the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Per-token (start_char, end_char) pairs used to locate the answer tokens.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Label lists, filled in feature order.
    start_labels = tokenized_examples["start_positions"] = []
    end_labels = tokenized_examples["end_positions"] = []

    for feature_idx, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][feature_idx]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Tells which tokens belong to the question and which to the context.
        sequence_ids = tokenized_examples.sequence_ids(feature_idx)

        # One example can produce several features; fetch its answers.
        answers = examples[answer_column_name][sample_mapping[feature_idx]]

        # No answer available: label the feature with the CLS index.
        if len(answers["answer_start"]) == 0:
            start_labels.append(cls_index)
            end_labels.append(cls_index)
            continue

        # Character span of the answer inside the context text.
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First context token of this feature.
        left = 0
        while sequence_ids[left] != context_seq_id:
            left += 1

        # Last context token of this feature.
        right = len(input_ids) - 1
        while sequence_ids[right] != context_seq_id:
            right -= 1

        # Answer not fully inside this window: label with the CLS index.
        if offsets[left][0] > start_char or offsets[right][1] < end_char:
            start_labels.append(cls_index)
            end_labels.append(cls_index)
            continue

        # Walk both cursors inward to the answer boundaries.
        # Note: the bound check covers the edge case where the answer is the last word.
        while left < len(offsets) and offsets[left][0] <= start_char:
            left += 1
        start_labels.append(left - 1)

        while offsets[right][1] >= end_char:
            right -= 1
        end_labels.append(right + 1)

    return tokenized_examples

def prepare_validation_features(examples):
    """Tokenize a batch of QA examples for prediction / post-processing.

    Uses the same truncation, stride and padding settings as the training
    features, but instead of labels it keeps what is needed to turn
    predictions back into context substrings: the originating example id
    and the offset mapping, with non-context positions set to None.

    Args:
        examples: Batch indexed by column name. Reads the question and
            context columns and the "id" column.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" removed, an
        "example_id" list added, and "offset_mapping" entries replaced by
        None wherever the token is not part of the context.
    """
    # Sequence id that marks context tokens: 1 when the question comes first.
    context_index = 1 if pad_on_right else 0
    if pad_on_right:
        first_texts = examples[question_column_name]
        second_texts = examples[context_column_name]
        truncation_mode = "only_second"
    else:
        first_texts = examples[context_column_name]
        second_texts = examples[question_column_name]
        truncation_mode = "only_first"

    # Truncate only the context, keep overflowing windows (overlapping by
    # `stride` tokens) and pad every feature to max_length.
    tokenized_examples = tokenizer(
        first_texts,
        second_texts,
        truncation=truncation_mode,
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        return_token_type_ids=False,  # False for RoBERTa models, True for BERT models.
        padding="max_length",
    )

    # Maps every produced feature back to the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # Remember which example each feature belongs to (one example can yield several features).
    tokenized_examples["example_id"] = [
        examples["id"][sample_index] for sample_index in sample_mapping
    ]

    # Blank out offsets of non-context tokens so that later code can tell
    # whether a token position lies inside the context by checking for None.
    for feature_idx, offsets in enumerate(tokenized_examples["offset_mapping"]):
        sequence_ids = tokenized_examples.sequence_ids(feature_idx)
        tokenized_examples["offset_mapping"][feature_idx] = [
            (span if sequence_ids[token_idx] == context_index else None)
            for token_idx, span in enumerate(offsets)
        ]

    return tokenized_examples

# Load the saved dataset and pick the splits used in this notebook.
datasets = load_from_disk('/opt/ml/data/train_dataset')
train_dataset = datasets['train']
eval_dataset = datasets['validation']
eval_dataset_for_predict = datasets['validation']

# Raw columns are dropped after tokenization so only model features remain.
column_names = train_dataset.column_names

# Options shared by all three map() calls.
_map_options = dict(
    batched=True,
    num_proc=1,
    remove_columns=column_names,
    load_from_cache_file=False,
)

# Labelled features for training.
train_dataset = train_dataset.map(prepare_train_features, **_map_options)

# Labelled features for the validation split (same preparation as training).
eval_dataset = eval_dataset.map(prepare_train_features, **_map_options)

# Validation features that keep example ids and offsets instead of labels.
eval_dataset_for_predict = eval_dataset_for_predict.map(
    prepare_validation_features, **_map_options
)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [4]:
# Display the tokenized training split (features and row count).
train_dataset

Dataset({
    features: ['attention_mask', 'end_positions', 'input_ids', 'start_positions'],
    num_rows: 7978
})

In [5]:
# Display the labelled validation split (features and row count).
eval_dataset

Dataset({
    features: ['attention_mask', 'end_positions', 'input_ids', 'start_positions'],
    num_rows: 474
})

In [6]:
# Display the validation split prepared for prediction (keeps example_id and offset_mapping).
eval_dataset_for_predict

Dataset({
    features: ['attention_mask', 'example_id', 'input_ids', 'offset_mapping'],
    num_rows: 474
})

### 1.1.4 DataLoaders

In [35]:
# Collator that batches the tokenized features using the tokenizer's padding rules.
data_collator = DataCollatorWithPadding(tokenizer)

# Both loaders share the same collator and batch size.
_loader_options = {"collate_fn": data_collator, "batch_size": 4}

train_dataloader = DataLoader(train_dataset, **_loader_options)
eval_dataloader = DataLoader(eval_dataset, **_loader_options)

In [6]:
# Re-create the model from the checkpoint so the default QA head can be inspected.
# NOTE(review): `optimizer` was built from the previous `model` instance's parameters and is
# not re-created here, so it does not track this new model.
model = AutoModelForQuestionAnswering.from_pretrained(model_name, config=config)
# Last expression of the cell: displays the default head.
model.qa_outputs

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a

Linear(in_features=1024, out_features=2, bias=True)

## 1.2 Custom Head Class Test

### 1.2.1 파라미터 개수 확인

In [20]:
class CustomHeadBase(nn.Module):
    """Baseline QA head: a single linear projection to `config.num_labels`.

    Args:
        config: Object exposing `hidden_size` (input feature size) and
            `num_labels` (output feature size).
    """

    def __init__(self, config):
        super().__init__()
        in_features, out_features = config.hidden_size, config.num_labels
        self.fc = nn.Linear(in_features, out_features)

    def forward(self, hidden_states):
        """Project `hidden_states` along its last dimension and return the logits."""
        logits = self.fc(hidden_states)
        return logits

class CustomHeadCNN(nn.Module):
    """QA head with three parallel 1-D convolutions (kernel sizes 1, 3, 5).

    The branch outputs are passed through ReLU, concatenated along the
    feature axis and projected to `config.num_labels` by a linear layer.
    Padding is chosen so every branch keeps the sequence length.

    The first two branches emit `hidden_size // 3` channels each and the
    kernel-5 branch takes whatever is left, so the concatenation is always
    exactly `hidden_size` wide. For hidden_size=1024 this gives 341/341/342,
    the same widths as the previous hard-coded `hidden_size // 3 + 1`, while
    also working for sizes that are not congruent to 1 modulo 3 (e.g. 768).

    Args:
        config: Object exposing `hidden_size` and `num_labels`.
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        branch_channels = hidden_size // 3
        # The last branch absorbs the remainder so the concat matches fc's input size.
        last_branch_channels = hidden_size - 2 * branch_channels

        self.relu = nn.ReLU()
        self.conv_1 = nn.Conv1d(
            in_channels=hidden_size,
            out_channels=branch_channels,
            kernel_size=1,
            padding=0,  # stride: default 1
        )
        self.conv_3 = nn.Conv1d(
            in_channels=hidden_size,
            out_channels=branch_channels,
            kernel_size=3,
            padding=1,
        )
        self.conv_5 = nn.Conv1d(
            in_channels=hidden_size,
            out_channels=last_branch_channels,
            kernel_size=5,
            padding=2,
        )
        self.fc = nn.Linear(hidden_size, config.num_labels)

    def forward(self, x):
        """Compute logits from hidden states.

        Args:
            x: Tensor laid out as (batch, seq_len, hidden_size); dimensions 1
                and 2 are swapped before the convolutions, which consume
                `hidden_size` input channels.

        Returns:
            Tensor of shape (batch, seq_len, num_labels).
        """
        # Conv1d expects channels in dimension 1.
        channels_first = x.transpose(1, 2).contiguous()
        branch_outputs = [
            self.relu(conv(channels_first).transpose(1, 2).contiguous())
            for conv in (self.conv_1, self.conv_3, self.conv_5)
        ]
        return self.fc(torch.cat(branch_outputs, -1))


def _count_parameters(module):
    """Return the total number of scalar parameters registered on `module`."""
    return sum(param.numel() for _, param in module.named_parameters())


# Replace qa_outputs with the baseline linear head and count its parameters.
custom_head = CustomHeadBase(config)
model.qa_outputs = custom_head
total_params = _count_parameters(model.qa_outputs)

print(f"base total params : {total_params:,d}")

#############################################################

# Replace qa_outputs with the CNN head and count its parameters.
custom_head = CustomHeadCNN(config)
model.qa_outputs = custom_head
total_params = _count_parameters(model.qa_outputs)

print(f"cnn total params : {total_params:,d}")

base total params : 2,050
cnn total params : 3,150,850


In [64]:
class CustomHeadCNNWithMaxPool(nn.Module):
    """CNN QA head with per-branch max-pooling and dropout before the classifier.

    Three parallel 1-D convolutions (kernel sizes 1, 3, 5) are applied to the
    hidden states; each branch goes through ReLU and the max-pool layer, the
    branches are concatenated along the feature axis, dropout is applied and
    a linear layer projects to `config.num_labels`.

    The first two branches emit `hidden_size // 3` channels each and the
    kernel-5 branch takes the remainder, so the concatenation is always
    `hidden_size` wide (341/341/342 for hidden_size=1024, as before).

    Args:
        config: Object exposing `hidden_size`, `num_labels` and
            `dropout_ratio` (dropout probability).
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        branch_channels = hidden_size // 3
        # The last branch absorbs the remainder so the concat matches fc's input size.
        last_branch_channels = hidden_size - 2 * branch_channels

        self.relu = nn.ReLU()
        self.conv_1 = nn.Conv1d(
            in_channels=hidden_size,
            out_channels=branch_channels,
            kernel_size=1,
            padding=0,  # stride: default 1
        )
        self.conv_3 = nn.Conv1d(
            in_channels=hidden_size,
            out_channels=branch_channels,
            kernel_size=3,
            padding=1,
        )
        self.conv_5 = nn.Conv1d(
            in_channels=hidden_size,
            out_channels=last_branch_channels,
            kernel_size=5,
            padding=2,
        )
        # NOTE(review): with kernel_size=1 the pooling window covers a single element,
        # so this layer leaves its input unchanged; kept to preserve the module layout.
        self.maxpool = nn.MaxPool1d(kernel_size=1)
        self.dropout = nn.Dropout(p=config.dropout_ratio)
        self.fc = nn.Linear(hidden_size, config.num_labels)
        # Plain linear projection that forward() used to return instead of the CNN
        # output. It is no longer used in forward(); the attribute is kept so code
        # that accesses `.sample` and saved state dicts keep working.
        self.sample = nn.Linear(hidden_size, config.num_labels)

    def forward(self, x):
        """Compute logits from hidden states through the CNN path.

        Previously the CNN output was computed and then discarded in favour of
        `self.sample(x)`; the CNN/pool/dropout/fc result is now returned.

        Args:
            x: Tensor laid out as (batch, seq_len, hidden_size); dimensions 1
                and 2 are swapped before the convolutions, which consume
                `hidden_size` input channels.

        Returns:
            Tensor of shape (batch, seq_len, num_labels).
        """
        # Conv1d expects channels in dimension 1.
        channels_first = x.transpose(1, 2).contiguous()
        branch_outputs = [
            self.maxpool(self.relu(conv(channels_first).transpose(1, 2).contiguous()))
            for conv in (self.conv_1, self.conv_3, self.conv_5)
        ]
        # Concatenate the branches, apply dropout, then classify.
        features = self.dropout(torch.cat(branch_outputs, -1))
        return self.fc(features)


# Replace qa_outputs with the CNN + max-pool head.
config.dropout_ratio = 0.5
custom_head = CustomHeadCNNWithMaxPool(config)
model.qa_outputs = custom_head

# Check the output size: run one forward pass in training mode on the first batch only.
for batch in train_dataloader:
    model.train()
    outputs = model(**batch)
    #print(outputs.keys())
    break

torch.Size([4, 384, 2])
