In [97]:
import json
import pandas as pd
import numpy as np

In [98]:
with open('train_game.json') as jsonfile:
    json_data = json.load(jsonfile)

In [99]:
datas = []
for item in json_data:
    sentence = item['sentence']
    facet = item['tokens'][0]['facet'] if item['tokens'] else None
    datas.append([sentence, facet])

df = pd.DataFrame(datas, columns=['sentence', 'facet'])
df = df.dropna()

In [100]:
df.head()

Unnamed: 0,sentence,facet
0,"퀘스트 사냥꾼은 대체로 컨트롤 성향 덱에 더 강한 모습을 보이나, 컨트롤 성기사엔 ...",직업
1,"나이트 페이 성약의 단 이동 연결망 1단계를 강화하면 그루터, 숲의 끝자락, 고요한...",소속
2,딜칭호는 아마 설 즈음에 풀릴거 같구요,아이템
3,2년 전 네로제에서 켈트사제를 때려잡았던 것처럼 2년 후 세시키의 활약을 기대해보겠...,캐릭터
4,그럴 경우 토마가 보호막 + 불원소 부여 역할로 쓸만하지 않을까 싶습니다.,시스템용어


In [101]:
print(df['facet'].unique().shape)

(17,)


### Word Embedding

In [102]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import word_tokenize

In [103]:
features = df['sentence'].values.tolist()
labels = df['facet'].values.tolist()

In [104]:
corpus = [word_tokenize(x) for x in features]

word_index = {}
corpus_indices = []
for text in features:
    indices = [word_index.setdefault(word, len(word_index)) for word in word_tokenize(text)]
    corpus_indices.append(torch.tensor(indices))

padded_seqs = pad_sequence(corpus_indices, batch_first=True, padding_value=0)

In [105]:
print(features[0])
print(corpus_indices[0])
print(padded_seqs[0])
MAX_SEQ_LEN = padded_seqs.shape[1]
print(MAX_SEQ_LEN)      # 52

퀘스트 사냥꾼은 대체로 컨트롤 성향 덱에 더 강한 모습을 보이나, 컨트롤 성기사엔 조금 약하며 나가 악마사냥꾼에도 다소 밀리는 편이다.
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  3, 11, 12, 13, 14, 15, 16,
        17, 18, 19])
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  3, 11, 12, 13, 14, 15, 16,
        17, 18, 19,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
52


### Label Encoding

In [106]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(labels)
labels = encoder.transform(labels)
print(labels[:5])

[15  6  9 16  7]


### Model

In [107]:
class ChatModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_classes, dropout):
        super(ChatModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.conv1 = nn.Conv1d(embed_size, 128, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(128, 128, kernel_size=3, padding=1)
        self.maxpool = nn.MaxPool1d(kernel_size=2)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(128 * 13, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = F.relu(self.conv1(x))
        x = self.maxpool(x)
        x = F.relu(self.conv2(x))
        x = self.maxpool(x)
        x = x.flatten(start_dim=1)
        x = self.dropout(x)
        x = self.fc(x)
        return x

In [108]:
class ChatDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [109]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ChatModel(len(word_index), 128, 17, 0.2)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

EPOCHS = 10
BATCH_SIZE = 32

In [110]:
dataset = ChatDataset(padded_seqs, torch.tensor(labels, dtype=torch.long))
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [111]:
for epoch in range(EPOCHS):
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device).long()
        
        output = model(batch_x)
        loss = criterion(output, batch_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f'Epoch: {epoch+1:02}, loss: {loss.item():.3f}')

Epoch: 01, loss: 2.222
Epoch: 02, loss: 1.922
Epoch: 03, loss: 2.112
Epoch: 04, loss: 1.535
Epoch: 05, loss: 1.219
Epoch: 06, loss: 0.840
Epoch: 07, loss: 0.451
Epoch: 08, loss: 0.149
Epoch: 09, loss: 0.060
Epoch: 10, loss: 0.058


### Test

In [119]:
test_sentence = "그래서 드워프가 좋은거임"
test = word_tokenize(test_sentence)
test = [word_index.get(word, 0) for word in test]
test += [0] * (MAX_SEQ_LEN - len(test))
print(test)

test_tensor = torch.tensor(test, dtype=torch.long).unsqueeze(0)
test_tensor = test_tensor.to(device)
print(test_tensor.shape)

model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient calculation
    output = model(test_tensor)

probabilities = torch.softmax(output, dim=1)
predicted_class = torch.argmax(probabilities, dim=1)

print("Predicted class:", predicted_class.item())
print("predicted_class label : ", encoder.inverse_transform([predicted_class.item()]))
print("Predicted probability:", probabilities[0][predicted_class].item())

[523, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
torch.Size([1, 52])
Predicted class: 16
predicted_class label :  ['캐릭터']
Predicted probability: 0.08962780982255936
