<a href="https://colab.research.google.com/github/YoonJiHwan98/Goorm_NLP/blob/main/QA_model_Albert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install wandb


In [None]:

!pip install konlpy

In [None]:

#!wandb login --relogin
#% env WANDB_PROJECT=klue-mrc
#% env WANDB_ENTITY=team5_groom

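# NOTE: an API key was pasted here; credentials must never be committed. Revoke that key and enter a new one interactively via `wandb login`.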


In [None]:
# 라이브러리 임포트

import pandas as pd
import sys
from tqdm import tqdm, trange
import torch
from torch.utils.data import DataLoader
import wandb
from transformers import AdamW
from statistics import mean

구글 드라이브 공유폴더 사용하기

https://sundries-in-myidea.tistory.com/96

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Loading Data Set

In [None]:
#%%
from typing import List, Tuple, Dict, Any
import json
import random
import re


class KoMRC:
    """Flat, index-addressable view over a SQuAD-style MRC JSON document.

    Every sample is addressed by a ``(document, paragraph, question)`` index
    triple into the nested ``data -> paragraphs -> qas`` structure.
    """

    # Compiled once for the class rather than on every ``__getitem__`` call.
    # Each newline becomes exactly one space, so character offsets into the
    # context stay valid (a "\n\n" therefore turns into two spaces).
    _NEWLINE = re.compile(r"\n")

    def __init__(self, data, indices: List[Tuple[int, int, int]]):
        self._data = data
        self._indices = indices

    @classmethod
    def load(cls, file_path: str):
        """Read the JSON file at ``file_path`` and index every question in it."""
        with open(file_path, "r", encoding="utf-8") as fd:
            data = json.load(fd)

        indices = []
        for d_id, document in enumerate(data["data"]):
            for p_id, paragraph in enumerate(document["paragraphs"]):
                for q_id, _ in enumerate(paragraph["qas"]):
                    indices.append((d_id, p_id, q_id))

        return cls(data, indices)

    @classmethod
    def split(cls, dataset, eval_ratio: float = 0.1, seed=42):
        """Shuffle the samples of ``dataset`` and return ``(train, eval)`` views.

        The first ``int(len * eval_ratio)`` shuffled samples form the eval
        split. Both views share the underlying data of ``dataset``.
        """
        indices = list(dataset._indices)
        # A private generator yields the same permutation as seeding the
        # module-level one, but does not reset global `random` state.
        random.Random(seed).shuffle(indices)
        n_eval = int(len(indices) * eval_ratio)

        return cls(dataset._data, indices[n_eval:]), cls(dataset._data, indices[:n_eval])

    def __getitem__(self, index: int) -> Dict[str, Any]:
        """Return the sample at ``index`` as a guid/context/question/answers dict."""
        d_id, p_id, q_id = self._indices[index]
        paragraph = self._data["data"][d_id]["paragraphs"][p_id]
        qa = paragraph["qas"][q_id]

        context = self._NEWLINE.sub(" ", paragraph["context"])

        # Two schemas are supported: paragraphs that carry a "context_id"
        # use it (stringified) as the guid; otherwise the guid comes from
        # the question entry itself.
        if paragraph.get("context_id"):
            guid = str(paragraph["context_id"])
        else:
            guid = qa["guid"]

        # Normalise a single-dict answer to a one-element list holding only
        # the fields used downstream; a list of answers is passed through.
        raw_answers = qa["answers"]
        if isinstance(raw_answers, dict):
            answers = [{
                "text": raw_answers["text"],
                "answer_start": raw_answers["answer_start"],
            }]
        else:
            answers = raw_answers

        return {
            "guid": guid,
            "context": context,
            "question": qa["question"],
            "answers": answers,
        }

    def __len__(self) -> int:
        return len(self._indices)

In [None]:
# Alternative input file, kept for reference:
# file_path = "./drive/MyDrive/구름 자연어처리 프로젝트 공유 폴더/Reading Comprehension/data/TL_span_extraction_short.json"
file_path = "./drive/MyDrive/구름 자연어처리 프로젝트 공유 폴더/Reading Comprehension/data/train.json"

# Parse the JSON file and build the flat (document, paragraph, question) index.
dataset = KoMRC.load(file_path)
# print("Number of Samples:", len(dataset))
# Show the first sample as a quick sanity check.
print(dataset[0])

In [None]:
# Shuffle and split with the defaults of KoMRC.split (eval_ratio=0.1, seed=42).
train_dataset, dev_dataset = KoMRC.split(dataset)
print("Number of Train Samples:", len(train_dataset))
print("Number of Dev Samples:", len(dev_dataset))


In [None]:
train_dataset[0]

# Preprocessing the Training Data

In [None]:
% pip install sentencepiece

In [None]:
from transformers import AlbertTokenizer , AutoTokenizer

# NOTE(review): "albert-base-v2" looks like an English checkpoint while the
# dataset paths and comments suggest Korean text — confirm vocabulary coverage.
model_name = "albert-base-v2" ## try out tokenization
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
# Cap sequences at 512 tokens; later cells rely on this value for padding,
# truncation and as the "answer not in window" label.
tokenizer.model_max_length = 512

In [None]:
# Data preparation.
# Materialise each split once: every dataset[i] access re-runs the newline
# substitution, so indexing three times per sample tripled that work.
train_samples = [train_dataset[i] for i in range(len(train_dataset))]
train_context = [sample['context'] for sample in train_samples]
train_question = [sample['question'] for sample in train_samples]
train_answers = [sample['answers'] for sample in train_samples]

# Question first, context second; only the context may be truncated.
# With no explicit max_length, padding/truncation use tokenizer.model_max_length.
train_encodings = tokenizer(train_question, train_context, truncation="only_second", padding="max_length", return_tensors='pt')


dev_samples = [dev_dataset[i] for i in range(len(dev_dataset))]
dev_context = [sample['context'] for sample in dev_samples]
dev_question = [sample['question'] for sample in dev_samples]
dev_answers = [sample['answers'] for sample in dev_samples]

dev_encodings = tokenizer(dev_question, dev_context, truncation="only_second", padding="max_length", return_tensors='pt')


In [None]:
# 정답에 들어가는 데이터는 start, end position.

def add_end_idx(answers, contexts):
    """Add an exclusive ``answer_end`` character index to each first answer.

    For every ``(answer_list, context)`` pair the first answer's span is
    checked against the context. Gold labels are sometimes off by one or two
    characters, so the span is also tried shifted left by 1 and by 2; the
    first shift that reproduces the gold text wins and ``answer_start`` is
    corrected accordingly.

    Args:
        answers: list of answer lists; each answer is a dict with ``text``
            and ``answer_start``. The dicts are updated in place.
        contexts: list of context strings, aligned with ``answers``.

    Returns:
        The same ``answers`` list, with ``answer_end`` set on every first
        answer. Empty answer lists are skipped.
    """
    for answer, context in zip(answers, contexts):
        if not answer:
            continue

        first = answer[0]
        gold_text = first['text']
        start_idx = first['answer_start']
        span_len = len(gold_text)

        # Default to the unshifted span so that 'answer_end' always exists,
        # even when the gold text cannot be located near 'answer_start'.
        first['answer_end'] = start_idx + span_len

        for shift in (0, 1, 2):
            start = start_idx - shift
            if start < 0:
                # A negative slice start would wrap around to the end of the
                # string and could "match" there, yielding a negative index.
                break
            if context[start:start + span_len] == gold_text:
                first['answer_start'] = start
                first['answer_end'] = start + span_len
                break
    return answers

train_answers = add_end_idx(train_answers, train_context)
dev_answers = add_end_idx(dev_answers, dev_context)

In [None]:
train_answers[:5]

In [None]:
# 기계 독해의 핵심부분.
# 우리가 원하는건 음절단위로 읽는거지만 bert tokenizer는 wordpiece 단위.
# 그래서 음절 단위에 있는 숫자를 tokenindex로 바꿔줘야 함.
# 그래야지 해당 token index가 정답label의 시작이다. 라는 걸 알 수 있다.

def add_token_positions(encodings, answers, max_length=None):
    """Convert character-level answer spans into token-level labels.

    Args:
        encodings: tokenizer output for (question, context) pairs; must
            provide ``char_to_token`` and ``update``.
        answers: list of answer lists; the first answer of each must carry
            ``answer_start`` and the exclusive ``answer_end``.
        max_length: label used when a position cannot be mapped to a token
            (for example because the answer was truncated away). Defaults
            to ``tokenizer.model_max_length``.

    Returns:
        ``encodings`` with ``start_positions`` and ``end_positions`` added.
    """
    if max_length is None:
        max_length = tokenizer.model_max_length

    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        first = answer[0]
        start_char = first['answer_start']
        # 'answer_end' is exclusive, so the last answer character sits at
        # answer_end - 1; never look up a position before the start.
        last_char = max(first['answer_end'] - 1, start_char)

        # The context is the second sequence of each pair, hence
        # sequence_index=1 (the default 0 would search the question).
        start = encodings.char_to_token(i, start_char, sequence_index=1)
        end = encodings.char_to_token(i, last_char, sequence_index=1)

        # None means the character is not inside the encoded window.
        if start is None or start > max_length:
            start = max_length
        if end is None or end > max_length:
            end = max_length

        start_positions.append(start)
        end_positions.append(end)

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    return encodings

train_encodings = add_token_positions(train_encodings, train_answers)
dev_encodings = add_token_positions(dev_encodings, dev_answers)

In [None]:
import torch

class MyDataset(torch.utils.data.Dataset):
    """Dataset wrapper that serves one row of every encoding field as tensors."""

    def __init__(self, encodings):
        # Mapping of field name -> per-sample values (lists or tensors).
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return ``{field: tensor}`` for sample ``idx``; values are copies."""
        item = {}
        for key, val in self.encodings.items():
            value = val[idx]
            if torch.is_tensor(value):
                # Same result as torch.tensor(value), without the
                # copy-construct UserWarning.
                item[key] = value.clone().detach()
            else:
                item[key] = torch.tensor(value)
        return item

    def __len__(self):
        # Key lookup works for plain dicts as well as tokenizer outputs.
        return len(self.encodings["input_ids"])

train_dataset_encoding = MyDataset(train_encodings)
dev_dataset_encoding = MyDataset(dev_encodings)

In [None]:
train_dataset[0]

In [None]:
import torch
torch.manual_seed(42)

In [None]:
# Effective batch size; each loader batch holds batch_size // accumulation
# samples and the training loop steps the optimizer every `accumulation` batches.
batch_size = 32
accumulation = 4

# Training batches are shuffled; dev batches keep dataset order.
train_loader = DataLoader(train_dataset_encoding, batch_size=batch_size//accumulation, shuffle=True, num_workers=0)
dev_loader = DataLoader(dev_dataset_encoding, batch_size=batch_size//accumulation, shuffle=False, num_workers=0)

In [None]:
class TokenizedKoMRC(KoMRC):
    """KoMRC variant whose samples are split into morphemes with Mecab.

    Answers are converted from character offsets into start/end morpheme
    indices of the tokenized context.
    """

    def __init__(self, data, indices: List[Tuple[int, int, int]]) -> None:
        super().__init__(data, indices)
        # Imported here because the notebook installs konlpy but never
        # imports it at module level.
        import konlpy.tag

        self._tagger = konlpy.tag.Mecab()

    def _tokenize_with_position(self, sentence: str) -> List[Tuple[str, Tuple[int, int]]]:
        """Return ``(morph, (start, end))`` pairs with character offsets into ``sentence``."""
        position = 0
        tokens = []
        for morph in self._tagger.morphs(sentence):
            # Search forward from the end of the previous morph.
            # NOTE(review): if a morph is not found verbatim, find() returns
            # -1 and the following offsets become unreliable — confirm this
            # cannot happen for the tagger in use.
            position = sentence.find(morph, position)
            tokens.append((morph, (position, position + len(morph))))
            position += len(morph)
        return tokens

    def __getitem__(self, index: int) -> Dict[str, Any]:
        """Return the tokenized sample at ``index``.

        Raises:
            ValueError: if an answer's start offset falls inside no morpheme,
                or its text cannot be rebuilt from the following morphemes.
        """
        sample = super().__getitem__(index)

        context, position = zip(*self._tokenize_with_position(sample['context']))
        context, position = list(context), list(position)
        question = self._tagger.morphs(sample['question'])

        if sample['answers'] is not None:
            answers = []
            for answer in sample['answers']:
                # Start: the morpheme whose character span contains answer_start.
                for start, (position_start, position_end) in enumerate(position):
                    if position_start <= answer['answer_start'] < position_end:
                        break
                else:
                    print(context, answer)
                    raise ValueError("No mathced start position")

                # End: extend from the start morpheme until the concatenation
                # contains the answer text with its spaces removed.
                target = ''.join(answer['text'].split(' '))
                source = ''
                for end, morph in enumerate(context[start:], start):
                    source += morph
                    if target in source:
                        break
                else:
                    print(context, answer)
                    raise ValueError("No Matched end position")

                answers.append({
                    'start': start,
                    'end': end
                })
        else:
            answers = None

        return {
            'guid': sample['guid'],
            'context_original': sample['context'],
            'context_position': position,
            'question_original': sample['question'],
            'context': context,
            'question': question,
            'answers': answers
        }


# QA Model 학습

In [None]:
from transformers import AlbertConfig, AlbertModel

# # Initializing an ALBERT-base style configuration
# albert_base_configuration = AlbertConfig(
#     hidden_size=768,
#     num_attention_heads=12,
#     intermediate_size=3072,
#     attention_probs_dropout_prob = 0.1
# )

# # Initializing a model from the ALBERT-base style configuration
# #model = AlbertModel(albert_base_configuration)

# torch.manual_seed(42)

# # Accessing the model configuration
# configuration = model.config

In [None]:

# torch.manual_seed(42)

# config = AlbertConfig(  ## 파라미터 변경할때는 여기서 파라미터 변경
#      max_position_embeddings=512,
#      hidden_size=768,
#      num_attention_heads=12,
#      intermediate_size=3072,
#      attention_probs_dropout_prob = 0.1,
#      hidden_dropout_prob   = 0.1
# )
# model = AlbertForQuestionAnswering(config)
# model.to(device)

In [None]:
from transformers import (AlbertConfig, AlbertForQuestionAnswering
                          )

# Earlier experiments with other checkpoints, kept for reference:
# model = RobertaForQuestionAnswering.from_pretrained('xlm-roberta-base', max_length = 1024)
# model = AutoModelForQuestionAnswering.from_pretrained('xlm-roberta-base', max_length = 1024)


#model = AlbertForQuestionAnswering.from_pretrained(model_name, max_length = 512)


# Load the pretrained weights named by `model_name` with explicit dropout rates.
# NOTE(review): `max_length` is presumably just stored on the config and does
# not truncate inputs by itself — TODO confirm.
model = AlbertForQuestionAnswering.from_pretrained(
        pretrained_model_name_or_path=model_name,
        max_length = 512,
        attention_probs_dropout_prob=0.1,
        hidden_dropout_prob=0.1
    )

# Prefer the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:

# Put the model in training mode and create the optimizer over all parameters.
model.train() 
learning_rate = 1e-4 # previously tried: 1e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)

In [None]:
df_loss = pd.DataFrame(columns = ['epoch','train_loss','dev_loss'])
df_loss

In [None]:
# Multiplies the base learning rate by 0.75 ** n after n scheduler steps.
# NOTE: both scheduler.step() calls in the training loop are commented out,
# so as written the learning rate never changes.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 0.75 ** epoch)

In [None]:
from statistics import mean
from torch.nn.utils import clip_grad_norm_

train_epoch = 20
lowest_valid_loss = 9999.

train_losses = []
dev_losses = []

# Counts mini-batches across all epochs; the optimizer steps once every
# `accumulation` mini-batches.
step = 0

for epoch in range(train_epoch):
    print("Epoch", epoch)
    # Re-enable dropout; the evaluation pass below switches it off.
    model.train()
    running_loss = 0.
    losses = []
    progress_bar = tqdm(train_loader, desc='Train')
    for batch in progress_bar:
        # Use the selected device instead of assuming CUDA is present.
        inputs = {key: value.to(device) for key, value in batch.items()}
        output = model(**inputs)

        loss = output.loss
        if not torch.isfinite(loss):
            # Raise instead of exit(): this stops the cell with a traceback
            # rather than terminating the interpreter.
            raise RuntimeError('non-finite loss, ending training')

        # Scale so the accumulated gradient matches one full-size batch.
        (loss / accumulation).backward()
        running_loss += loss.item()

        step += 1

        if step % accumulation:
            continue

        clip_grad_norm_(model.parameters(), max_norm=1.)

        optimizer.step()
        optimizer.zero_grad(set_to_none=True)

        losses.append(running_loss / accumulation)
        running_loss = 0.
        progress_bar.set_description(f"Train - Loss: {losses[-1]:.3f}")

    #scheduler.step() # you can set it like this!
    train_losses.append(mean(losses))
    print(f"train score: {train_losses[-1]:.3f}")

    # Evaluation: dropout must be disabled for a meaningful dev loss.
    model.eval()
    losses = []
    for batch in tqdm(dev_loader, desc="Evaluation"):
        with torch.no_grad():
            inputs = {key: value.to(device) for key, value in batch.items()}
            output = model(**inputs)

            loss = output.loss

        losses.append(loss.item())

    #scheduler.step(mean(losses))
    dev_losses.append(mean(losses))
    lowest_valid_loss = min(lowest_valid_loss, dev_losses[-1])
    print(f"Evaluation score: {dev_losses[-1]:.3f}")


# Saved once, after the final epoch; `epoch` holds the last epoch index.
model.save_pretrained(f'/content/drive/MyDrive/구름 자연어처리과정/프로젝트/Reading Comprehension/dump/Albert_model.{epoch}')

df_loss['epoch'] = list(range(len(train_losses)))
df_loss['train_loss'] = train_losses
df_loss['dev_loss'] = dev_losses
df_loss.to_csv('albert_20_loss_Dropout.csv',index=False) 

In [None]:
df_loss

In [None]:
df_loss.to_csv('/content/drive/MyDrive/구름 자연어처리과정/프로젝트/Reading Comprehension/사전 평가용/albert_30_loss_No_dropout.csv',index=False)

In [None]:
df_loss['train_loss'] = train_losses
df_loss['dev_loss'] = dev_losses

In [None]:
#train_losses
#dev_losses
#train_dataset[0]

## Answer Inference
모델의 Output을 활용해서 질문의 답을 찾는 코드를 작성하자.

In [None]:
# Reload a fine-tuned checkpoint for inference.
# NOTE(review): the training cell saves to ".../dump/Albert_model.{epoch}";
# confirm that this "Albert_model_dropout" directory is the intended checkpoint.
model = AlbertForQuestionAnswering.from_pretrained('/content/drive/MyDrive/구름 자연어처리과정/프로젝트/Reading Comprehension/dump/Albert_model_dropout')
# Select the device here so the cell also runs on CPU-only hosts and does
# not depend on the training cells having been executed.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

In [None]:
dev_dataset[0]

# Test 출력 파일 작성

In [None]:
import os

In [None]:
#%%
from typing import List, Tuple, Dict, Any
import json
import random
import re


class KoMRC_for_test:
    """Flat, index-addressable view over a SQuAD-style MRC JSON document.

    Used for the evaluation/test file. Every sample is addressed by a
    ``(document, paragraph, question)`` index triple into the nested
    ``data -> paragraphs -> qas`` structure.
    """

    # Compiled once for the class rather than on every ``__getitem__`` call.
    # Each newline becomes exactly one space, so character offsets into the
    # context stay valid (a "\n\n" therefore turns into two spaces).
    _NEWLINE = re.compile(r"\n")

    def __init__(self, data, indices: List[Tuple[int, int, int]]):
        self._data = data
        self._indices = indices

    @classmethod
    def load(cls, file_path: str):
        """Read the JSON file at ``file_path`` and index every question in it."""
        with open(file_path, "r", encoding="utf-8") as fd:
            data = json.load(fd)

        indices = []
        for d_id, document in enumerate(data["data"]):
            for p_id, paragraph in enumerate(document["paragraphs"]):
                for q_id, _ in enumerate(paragraph["qas"]):
                    indices.append((d_id, p_id, q_id))

        return cls(data, indices)

    @classmethod
    def split(cls, dataset, eval_ratio: float = 0.1, seed=42):
        """Shuffle the samples of ``dataset`` and return ``(train, eval)`` views.

        The first ``int(len * eval_ratio)`` shuffled samples form the eval
        split. Both views share the underlying data of ``dataset``.
        """
        indices = list(dataset._indices)
        # A private generator yields the same permutation as seeding the
        # module-level one, but does not reset global `random` state.
        random.Random(seed).shuffle(indices)
        n_eval = int(len(indices) * eval_ratio)

        return cls(dataset._data, indices[n_eval:]), cls(dataset._data, indices[:n_eval])

    def __getitem__(self, index: int) -> Dict[str, Any]:
        """Return the sample at ``index`` as a guid/context/question/answers dict."""
        d_id, p_id, q_id = self._indices[index]
        paragraph = self._data["data"][d_id]["paragraphs"][p_id]
        qa = paragraph["qas"][q_id]

        context = self._NEWLINE.sub(" ", paragraph["context"])

        # Two schemas are supported: paragraphs that carry a "context_id"
        # use it (stringified) as the guid; otherwise the guid comes from
        # the question entry itself.
        if paragraph.get("context_id"):
            guid = str(paragraph["context_id"])
        else:
            guid = qa["guid"]

        # Normalise a single-dict answer to a one-element list holding only
        # the fields used downstream; a list of answers is passed through.
        raw_answers = qa["answers"]
        if isinstance(raw_answers, dict):
            answers = [{
                "text": raw_answers["text"],
                "answer_start": raw_answers["answer_start"],
            }]
        else:
            answers = raw_answers

        return {
            "guid": guid,
            "context": context,
            "question": qa["question"],
            "answers": answers,
        }

    def __len__(self) -> int:
        return len(self._indices)

In [None]:
test_data_path = '/content/drive/MyDrive/구름 자연어처리 프로젝트 공유 폴더/Reading Comprehension/data/divided/val.json'

In [None]:
#test_data = dev_dataset 

test_data = KoMRC_for_test.load(test_data_path)

In [None]:
test_data[0]

In [None]:
def preprocess_validation_examples(examples, max_length, stride):
    """Tokenize examples into overlapping windows for inference.

    Args:
        examples: indexable dataset whose items have ``guid``, ``context``
            and ``question`` keys.
        max_length: window length in tokens (windows are padded to it).
        stride: number of tokens shared by consecutive windows of one context.

    Returns:
        The tokenizer output with two changes: ``offset_mapping`` entries
        for tokens outside the context are replaced by ``None``, and an
        ``example_id`` list gives the guid of the example each window
        came from.
    """
    # Fetch every example once; each access re-runs the dataset's own
    # preprocessing.
    samples = [examples[i] for i in range(len(examples))]
    ids = [sample['guid'] for sample in samples]
    contexts = [sample['context'] for sample in samples]
    questions = [sample['question'] for sample in samples]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One entry per window: index of the example that produced it.
    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        example_ids.append(ids[sample_map[i]])

        # Keep offsets only for context tokens (sequence id 1) so that
        # answer spans can never start or end inside the question.
        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    # NOTE(review): guids taken from "context_id" may be shared by several
    # questions of one paragraph — confirm they are unique per example before
    # grouping windows by this id.
    inputs["example_id"] = example_ids
    return inputs

In [None]:
test_encodings = preprocess_validation_examples(test_data, 512, 50)

In [None]:
test_encodings[0]

In [None]:
# Remove the bookkeeping fields from the encodings and keep them as separate
# lists; they are used later to map windows back to examples and characters.
example_id = test_encodings.pop('example_id')
offset_mapping = test_encodings.pop('offset_mapping')

In [None]:
import torch

class MyDataset(torch.utils.data.Dataset):
    """Dataset wrapper that serves one row of every encoding field as tensors."""

    def __init__(self, encodings):
        # Mapping of field name -> per-sample values (lists or tensors).
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return ``{field: tensor}`` for sample ``idx``; values are copies."""
        item = {}
        for key, val in self.encodings.items():
            value = val[idx]
            if torch.is_tensor(value):
                # Same result as torch.tensor(value), without the
                # copy-construct UserWarning.
                item[key] = value.clone().detach()
            else:
                item[key] = torch.tensor(value)
        return item

    def __len__(self):
        # Key lookup works for plain dicts as well as tokenizer outputs.
        return len(self.encodings["input_ids"])

test_dataset = MyDataset(test_encodings)

In [None]:
test_dataset[0]

In [None]:
# Same per-step batch size as in training (batch_size // accumulation).
batch_size = 32
accumulation = 4
# Order must be preserved so that logits line up with example_id / offset_mapping.
test_loader = DataLoader(test_dataset, batch_size=batch_size//accumulation, shuffle=False, num_workers=2)

In [None]:
import numpy as np
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Collect per-batch logits and join them once at the end; np.append inside
# the loop would re-copy the whole array on every batch.
start_chunks = []
end_chunks = []
for batch in tqdm(test_loader, desc="test"):
    with torch.no_grad():
        # Move inputs to the device selected above instead of assuming CUDA.
        inputs = {key: value.to(device) for key, value in batch.items()}
        outputs = model(**inputs)

    start_chunks.append(outputs.start_logits.cpu().numpy())
    end_chunks.append(outputs.end_logits.cpu().numpy())

# One row of logits per tokenized window; None when the loader was empty.
start_logits = np.concatenate(start_chunks, axis=0) if start_chunks else None
end_logits = np.concatenate(end_chunks, axis=0) if end_chunks else None


In [None]:
len(start_logits)

In [None]:
print(len(start_logits))
# print(len(end_logits))
#print(start_logits[21928])
print(end_logits[:5])

In [None]:
#example_id[-10:]

In [None]:
import collections

# Group window indices by the guid of the example they were cut from.
example_to_features = collections.defaultdict(list)
for idx, guid in enumerate(example_id):  # `guid`, not `id`: keep the builtin usable
    example_to_features[guid].append(idx)

In [None]:
example_id[:5]

In [None]:
type(start_logits)
# print(end_logits[0])
# print(offset_mapping[0])

In [None]:
example_to_features

In [None]:
start_logits[0]

In [None]:
offset_mapping[0]

In [None]:
import numpy as np

n_best = 20              # top-scoring start/end candidates kept per window
max_answer_length = 100  # maximum answer length, in tokens
predicted_answers = []

for example in test_data:
    # Local name on purpose: rebinding `example_id` here would overwrite the
    # list of per-window ids that example_to_features was built from.
    guid = example["guid"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[guid]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = offset_mapping[feature_index]

        # Indices of the n_best highest logits, best first.
        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()

        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    if answers:
        best_answer = max(answers, key=lambda x: x["logit_score"])
        prediction_text = best_answer["text"]
    else:
        # No valid span: emit an empty prediction instead of reusing the
        # previous example's answer (or failing on the very first example).
        print('0')
        prediction_text = ""
    predicted_answers.append({"id": guid, "prediction_text": prediction_text})

In [None]:
import pandas as pd

df = pd.DataFrame(predicted_answers)
df

In [None]:
df.to_csv('/content/drive/MyDrive/구름 자연어처리과정/프로젝트/Reading Comprehension/사전 평가용/Albert_apoch20_Nodropout.csv',index=False)

In [None]:
dev_dataset[1]['context']

In [None]:
df_answer = pd.read_csv('/content/drive/MyDrive/구름 자연어처리 프로젝트 공유 폴더/Reading Comprehension/pre-evaluation/devset_정답.csv')
df_answer 

In [None]:
test_answers = [test_data[i]['answers'] for i in range(len(test_data))]

In [None]:
test_answers[0]