In [286]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [287]:
df = pd.read_csv('chatbot_data.csv')
df.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [288]:
print(df['label'].value_counts())

label
0    5290
1    3570
2    2963
Name: count, dtype: int64


In [289]:
features = df['Q'].to_list()
labels = df['label'].to_list()

In [290]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import word_tokenize

In [291]:
corpus =  [word_tokenize(x) for x in features]
# Convert the corpus to word index sequences
word_index = {}
corpus_indices = []
for text in features:
    indices = [word_index.setdefault(word, len(word_index)) for word in word_tokenize(text)]
    corpus_indices.append(torch.tensor(indices))

# Pad the sequences to a fixed length
MAX_SEQ_LEN = 15
padded_seqs = pad_sequence(corpus_indices, batch_first=True, padding_value=0)


In [292]:
print(corpus[0])
print(corpus_indices[0])
print(padded_seqs[0])

['12시', '땡', '!']
tensor([0, 1, 2])
tensor([0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


### Model

In [293]:
class ChatModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_classes, dropout):
        super(ChatModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.conv1 = nn.Conv1d(embed_size, 128, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(128, 128, kernel_size=3, padding=1)
        self.maxpool = nn.MaxPool1d(kernel_size=2)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = F.relu(self.conv1(x))
        x = self.maxpool(x)
        x = F.relu(self.conv2(x))
        x = self.maxpool(x)
        x = x.flatten(start_dim=1)
        x = self.dropout(x)
        x = self.fc(x)
        return x

In [294]:
class ChatDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [295]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ChatModel(len(word_index), 128, 3, 0.2)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

EPOCHS = 10
BATCH_SIZE = 32

In [296]:
dataset = ChatDataset(padded_seqs, torch.tensor(labels))
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [297]:
for epoch in range(EPOCHS):
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        
        output = model(batch_x)
        loss = criterion(output, batch_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f'Epoch: {epoch+1:02}, loss: {loss.item():.3f}')

Epoch: 01, loss: 0.973
Epoch: 02, loss: 0.506
Epoch: 03, loss: 0.441
Epoch: 04, loss: 0.443
Epoch: 05, loss: 0.024
Epoch: 06, loss: 0.006
Epoch: 07, loss: 0.119
Epoch: 08, loss: 0.013
Epoch: 09, loss: 0.011
Epoch: 10, loss: 0.017


In [298]:
text = "안녕"
text = word_tokenize(text)
text = [word_index.get(word, 0) for word in text]  # 0 for unknown words
text += [0] * (MAX_SEQ_LEN - len(text) + 1)
print(text)

text_tensor = torch.tensor(text).unsqueeze(0)  # Add batch dimension
text_tensor = text_tensor.to(device)

model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient calculation
    output = model(text_tensor)

probabilities = torch.softmax(output, dim=1)
predicted_class = torch.argmax(probabilities, dim=1)

print("Predicted class:", predicted_class.item())


[3885, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Predicted class: 0
