In [1]:
from transformers import AutoTokenizer
from transformers import GPT2LMHeadModel

In [2]:
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', pad_token='<pad>')
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  return torch.load(checkpoint_file, map_location="cpu")


In [3]:
print(tokenizer.bos_token_id)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print('-' * 10)
print(tokenizer.decode(1))
print(tokenizer.decode(2))
print(tokenizer.decode(3))
print(tokenizer.decode(4))

1
1
3
----------
</s>
<usr>
<pad>
<sys>


# 챗봇 데이터 로드

In [4]:
import pandas as pd
import tqdm
import urllib.request

In [5]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv", filename="ChatBotData.csv")
train_data = pd.read_csv('ChatBotData.csv')
train_data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [6]:
print('챗봇 샘플의 개수 :', len(train_data))

챗봇 샘플의 개수 : 11823


# 챗봇 데이터 전처리

In [7]:
batch_size = 32  # 한 번에 처리할 데이터 샘플의 수를 지정

import tqdm  # 학습 진행 상황을 시각적으로 보여주는 모듈
import torch  # PyTorch 라이브러리
from torch.utils.data import Dataset, DataLoader  # 데이터셋과 데이터로더를 다루는 모듈

# 대화 데이터를 위한 사용자 정의 데이터셋 클래스 정의
class ChatDataset(Dataset):
    def __init__(self, train_data, tokenizer):
        self.train_data = train_data  # 학습 데이터를 저장
        self.tokenizer = tokenizer  # 텍스트를 토큰으로 변환할 토크나이저 저장

    def __len__(self):
        return len(self.train_data)  # 데이터셋의 크기(샘플 수)를 반환

    def __getitem__(self, idx):
        question = self.train_data.Q.iloc[idx]  # 인덱스에 해당하는 질문 텍스트 가져오기
        answer = self.train_data.A.iloc[idx]  # 인덱스에 해당하는 답변 텍스트 가져오기
        bos_token = self.tokenizer.bos_token_id  # 문장의 시작을 나타내는 토큰 ID
        eos_token = self.tokenizer.eos_token_id  # 문장의 끝을 나타내는 토큰 ID
        # 질문과 답변을 하나의 문자열로 연결하여 토큰화
        sent = self.tokenizer.encode('' + question + '' + answer, add_special_tokens=False)
        # 시작과 끝 토큰을 포함한 텐서를 반환
        return torch.tensor([bos_token] + sent + [eos_token], dtype=torch.long)

# 배치의 시퀀스를 패딩하여 같은 길이로 맞추는 함수 정의
def collate_fn(batch):
    return torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=tokenizer.pad_token_id)

batch_size = 32  # 배치 크기를 다시 설정
chat_dataset = ChatDataset(train_data, tokenizer)  # 데이터셋 인스턴스 생성
data_loader = DataLoader(chat_dataset, batch_size=batch_size, collate_fn=collate_fn)  # 데이터로더 생성

# 챗봇 학습하기

In [8]:
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5, eps=1e-08)

steps = len(train_data) // batch_size + 1
print(steps)

370


In [9]:
EPOCHS = 3
# GPU가 사용 가능한지 확인하고, 가능하면 "cuda"를 선택, 그렇지 않으면 CPU를 선택
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 모델을 선택된 디바이스(GPU 또는 CPU)로 이동
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=51200, bias=False)
)

In [10]:
# 주어진 에포크 수만큼 학습 루프를 반복
for epoch in range(EPOCHS):
    epoch_loss = 0  # 에포크 손실 초기화

    # 데이터 로더에서 배치를 하나씩 가져와서 학습을 진행
    for batch in tqdm.tqdm(data_loader, total=steps):
        # 배치를 선택한 디바이스로 이동
        batch = batch.to(device)
        # 레이블을 배치와 동일하게 설정 (입력을 그대로 레이블로 사용)
        labels = batch.clone()
        # 옵티마이저의 기울기 초기화
        optimizer.zero_grad()
        # 모델에 입력을 주고, 출력과 손실값을 계산
        result = model(input_ids=batch, labels=labels)
        loss = result.loss  # 계산된 손실값
        batch_loss = loss.mean()  # 배치 손실 계산

        # 손실값에 대해 역전파를 통해 기울기 계산
        batch_loss.backward()
        # 옵티마이저를 통해 가중치 업데이트
        optimizer.step()
        # 에포크 손실에 이번 배치의 손실값을 추가
        epoch_loss += batch_loss.item() / steps

    # 현재 에포크가 끝난 후 평균 손실값 출력
    print('[Epoch: {:>4}] cost = {:>.9}'.format(epoch + 1, epoch_loss))

100%|████████████████████████████████████████████████████████████████████████████████| 370/370 [00:48<00:00,  7.62it/s]


[Epoch:    1] cost = 2.31870702


100%|████████████████████████████████████████████████████████████████████████████████| 370/370 [00:47<00:00,  7.78it/s]


[Epoch:    2] cost = 1.83392793


100%|████████████████████████████████████████████████████████████████████████████████| 370/370 [00:48<00:00,  7.70it/s]

[Epoch:    3] cost = 1.45199434





# 챗봇 실행하기

In [11]:
# '<usr>'는 사용자 입력을 '<sys>'는 시스템 응답을 나타내는 태그로 감싸서 대화 형태로 변환
text = '오늘도 좋은 하루!'
sent = '<usr>' + text + '<sys>'

# 문장의 시작을 알리는 bos_token_id와 토큰화 된 문장을 이어 붙이고 정수 인코딩.
# 즉, </s>를 맨 앞에 부착 후 정수 인코딩.
input_ids = [tokenizer.bos_token_id] + tokenizer.encode(sent)
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)

# 모델을 사용해 주어진 입력에 대한 응답을 생성 (최대 50개의 토큰, 조기 종료 조건 설정)
output = model.generate(input_ids, max_length=50, early_stopping=True, eos_token_id=tokenizer.eos_token_id)
tokenizer.decode(output[0].tolist())

'</s><usr> 오늘도 좋은 하루!<sys><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [12]:
print(decoded_sentence.split('<sys> ')[1].replace('</s>', ''))

NameError: name 'decoded_sentence' is not defined

In [13]:
def return_answer_by_chatbot(user_text):
    sent = '' + user_text + ''
    input_ids = [tokenizer.bos_token_id] + tokenizer.encode(sent, add_special_tokens=False)
    input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
    output = model.generate(input_ids, max_length=50, do_sample=True, top_k=2)
    sentence = tokenizer.decode(output[0].tolist())
    chatbot_response = sentence.split(' ')[1].replace('', '')
    return chatbot_response

In [14]:
return_answer_by_chatbot('안녕! 반가워~')

'안녕!'

In [15]:
return_answer_by_chatbot('너는 누구야?')

'너는'

In [16]:
return_answer_by_chatbot('너무 심심한데 나랑 놀자')

'너무'

In [17]:
return_answer_by_chatbot('영화 해리포터 재밌어?')

'영화'

In [18]:
return_answer_by_chatbot('너 딥 러닝 잘해?')

'너'