In [1]:
Q_TKN = "<usr>"
A_TKN = "<sys>"
BOS = "</s>"
EOS = "</s>"
MASK = "<unused0>"
SENT = "<unused1>"
PAD = "<pad>"

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import torch
from tqdm import tqdm


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2", max_len=1024,
            bos_token=BOS, eos_token=EOS, unk_token='<unk>',
            pad_token=PAD, mask_token=MASK)
model = AutoModelForCausalLM.from_pretrained("skt/kogpt2-base-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import load_dataset

security_news = pd.read_excel('./description1.xlsx')  #보안뉴스 220
naver_news = load_dataset("daekeun-ml/naver-news-summarization-ko") #네이버 뉴스

naver_news = naver_news['train'][:220]   #네이버 뉴스 100개

#뉴스 기사 데이터셋

Found cached dataset csv (/home/user/.cache/huggingface/datasets/daekeun-ml___csv/daekeun-ml--naver-news-summarization-ko-ffd4c4ee3530322a/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)
100%|██████████| 3/3 [00:00<00:00, 732.20it/s]


In [4]:
data = {"Q": [], "A": []}
for i in range(220):        #네이버 뉴스 100개
    data['Q'].append(naver_news['title'][i]+naver_news['document'][i])
    data['A'].append(naver_news['summary'][i])

for i in range(220):        #보안 뉴스 220개
    data['Q'].append(security_news['document'][i])
    data['A'].append(security_news['summary'][i])
    
data = pd.DataFrame(data)
#document랑 summary만 남기고 합치기

In [5]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class GPT2Dataset(Dataset):
    def __init__(self, chat, max_len=1024):
        self.data = chat
        self.max_len = 1024
        self.q_token = Q_TKN
        self.a_token = A_TKN
        self.sent_token = SENT
        self.eos = EOS
        self.mask = MASK
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        turn = self.data.iloc[idx]
        q = turn['Q']
        a = turn['A']
        
        q_toked = self.tokenizer.tokenize(self.q_token + q + self.sent_token)
        q_len = len(q_toked)

        a_toked = self.tokenizer.tokenize(self.a_token + a + self.eos)
        a_len = len(a_toked)
        
        # 질문의 길이가 최대길이보다 크면
        if q_len > self.max_len:
            a_len = self.max_len - q_len  # 답변의 길이를 최대길이 - 질문길이
            if a_len <= 0:  # 질문의 길이가 너무 길어 질문만으로 최대 길이를 초과 한다면
                q_toked = q_toked[-(int(self.max_len / 2)) :]  # 질문길이를 최대길이의 반으로
                q_len = len(q_toked)
                a_len = self.max_len - q_len  # 답변의 길이를 최대길이 - 질문길이
            a_toked = a_toked[:a_len]
            a_len = len(a_toked)

        # 질문의 길이 + 답변의 길이가 최대길이보다 크면
        if q_len + a_len > self.max_len:
            a_len = self.max_len - q_len  # 답변의 길이를 최대길이 - 질문길이
            if a_len <= 0:  # 질문의 길이가 너무 길어 질문만으로 최대 길이를 초과 한다면
                q_toked = q_toked[-(int(self.max_len / 2)) :]  # 질문길이를 최대길이의 반으로
                q_len = len(q_toked)
                a_len = self.max_len - q_len  # 답변의 길이를 최대길이 - 질문길이
            a_toked = a_toked[:a_len]
            a_len = len(a_toked)

        # 답변 labels = [mask, mask, ...., mask, ..., <bos>,..답변.. <eos>, <pad>....]
        labels = [
            self.mask,
        ] * q_len + a_toked[1:]

        # mask = 질문길이 0 + 답변길이 1 + 나머지 0
        mask = [0] * q_len + [1] * a_len + [0] * (self.max_len - q_len - a_len)
        # 답변 labels을 index 로 만든다.
        labels_ids = self.tokenizer.convert_tokens_to_ids(labels)
        # 최대길이만큼 PADDING
        while len(labels_ids) < self.max_len:
            labels_ids += [self.tokenizer.pad_token_id]

        # 질문 + 답변을 index 로 만든다.
        token_ids = self.tokenizer.convert_tokens_to_ids(q_toked + a_toked)
        # 최대길이만큼 PADDING
        while len(token_ids) < self.max_len:
            token_ids += [self.tokenizer.pad_token_id]

        # 질문+답변, 마스크, 답변
        return (token_ids, np.array(mask), labels_ids)


# 데이터셋 만들기
# label이랑 데이터셋은 모두 1024 사이즈로 만들고 패딩

def collate_batch(batch):
    data = [item[0] for item in batch]
    mask = [item[1] for item in batch]
    label = [item[2] for item in batch]
    return torch.LongTensor(data), torch.LongTensor(mask), torch.LongTensor(label)


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_set = GPT2Dataset(data, max_len=1024)

train_dataloader = DataLoader(
    train_set,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_batch,
)

In [10]:

learning_rate = 3e-5
criterion = torch.nn.CrossEntropyLoss(reduction="none")
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

epoch = 100
Sneg = -1e18


for epoch in range(epoch):
    dataloader = tqdm(train_dataloader, desc=f"Epoch {epoch}")
    for batch_idx, samples in enumerate(dataloader):
        optimizer.zero_grad()
        token_ids, mask, label = samples
        token_ids, mask, label = token_ids, mask, label
        out = model(token_ids)
        out = out.logits
        mask_3d = mask.unsqueeze(dim=2).repeat_interleave(repeats=out.shape[2], dim=2)
        mask_out = torch.where(mask_3d == 1, out, Sneg * torch.ones_like(out))
        loss = criterion(mask_out.transpose(2, 1), label)
        avg_loss = loss.sum() / mask.sum()
        avg_loss.backward()
        optimizer.step()
        
        

Epoch 0:   0%|          | 0/220 [00:01<?, ?it/s]


AttributeError: 'NoneType' object has no attribute 'item'

In [None]:
torch.save(
    {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "epoch": epoch,
    },
    "./chatbot_model.pth",
)

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel

# 특별 토큰 정의
Q_TKN = "<usr>"
A_TKN = "<sys>"
SENT = "<unused1>"
EOS = "</s>"
BOS = "</s>"

# 토크나이저와 모델 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token=BOS,
    eos_token=EOS,
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<unused0>",
)

In [None]:
import torch
model_path = "./chatbot_model.pth" 
model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")
checkpoint = torch.load(model_path)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()

In [3]:
import pandas as pd

data = pd.read_excel("./test_data.xlsx")

In [None]:
import torch
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel

Q_TKN = "<usr>"
A_TKN = "<sys>"
SENT = "<unused1>"
EOS = "</s>"
BOS = "</s>"

result_list = []

koGPT2_TOKENIZER = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token=BOS,
    eos_token=EOS,
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<unused0>",
)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)

with torch.no_grad():
    for i in range(len(data)):
        q = data[0][i].strip()
        if q == "quit":
            break
        a = ""
        while 1:
            input_ids = torch.LongTensor(
                koGPT2_TOKENIZER.encode(Q_TKN + q + SENT + A_TKN + a)
            ).unsqueeze(dim=0)
            input_ids = input_ids[0]
            input_ids = input_ids.to(device)
            pred = model(input_ids)
            pred = pred.logits
            gen = koGPT2_TOKENIZER.convert_ids_to_tokens(
                torch.argmax(pred, dim=-1).squeeze()
            )[-1]
            if gen == EOS:
                break
            a += gen.replace("▁", " ")
            
        result_list.append("Chatbot > {}".format(a.strip()))

In [None]:
result_list = pd.DataFrame(result_list)

In [None]:
result_list.to_excel("./test_result.xlsx")

In [None]:
tokenizer.all_special_tokens

In [None]:
tokenizer.all_special_ids