In [11]:
import re
import numpy as np

from tensorflow.keras.utils import to_categorical

# Filter out utterances that contain characters the model is not meant to see.
# Allowed characters: spaces, Hangul jamo (ㄱ-ㅣ) and syllables (가-힣), ASCII
# letters, and the literal symbols ^ ☆ ; ~ . ! ?  (inside a character class a
# '^' that is not the first character is a literal caret).
_DISALLOWED_CHARS = re.compile('[^ ㄱ-ㅣ가-힣 ^☆; ^a-zA-Z~.!?]+')


def normalizeString(s):
    """Return ``s`` if it consists only of allowed characters, else ``''``.

    Despite the name, nothing is stripped from the text: a string that
    contains at least one disallowed character (digits, commas, ...) is
    rejected as a whole, and a clean string is returned unchanged.

    Args:
        s: The raw input text.

    Returns:
        ``s`` itself when every character is allowed, otherwise the empty
        string. The result is always a ``str`` (the rejection value used to
        be an empty list), so callers can rely on ``len()`` and iteration
        without caring which branch was taken.
    """
    if _DISALLOWED_CHARS.search(s):
        return ''
    return s

def preprocess_data(data_path, max_lines=2500):
    """Load a tab-separated ``text<TAB>label`` file and encode it for training.

    Each of the first ``max_lines`` lines is split on tab characters (note
    that the file is parsed as tab-separated regardless of its extension).
    The first field is passed through ``normalizeString`` and the second
    field is used as the label. Lines without a tab, and lines whose
    normalized text or label is empty, are skipped.

    Args:
        data_path: Path of the UTF-8 encoded input file.
        max_lines: How many leading lines of the file to consider. ``None``
            means all lines. Defaults to 2500.

    Returns:
        A tuple ``(train_data, train_labels, num_sequence, num_input_tokens,
        input_token_index)`` where

        * ``train_data`` is a float32 array of shape
          ``(num_samples, num_sequence)`` holding one character index per
          position; positions past the end of a text stay 0.
        * ``train_labels`` is the output of ``to_categorical`` applied to the
          label column (labels are presumably integer class ids stored as
          text -- verify against the data file).
        * ``num_sequence`` is the length of the longest kept text.
        * ``num_input_tokens`` is the number of distinct characters seen.
        * ``input_token_index`` maps each character to its index in sorted
          order.

    Raises:
        ValueError: If no usable sample is found in the considered lines.
    """
    # Read the file once; the context manager closes it before parsing starts.
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    input_texts = []
    train_labels = []
    input_chars = set()

    for line in lines[:max_lines]:
        fields = line.split('\t')

        # A line without a label column (e.g. a blank line) must be skipped
        # outright; otherwise the text/label of the previous line would be
        # appended a second time.
        if len(fields) < 2:
            continue

        input_text = normalizeString(fields[0])
        target_label = fields[1]

        # normalizeString yields an empty value for rejected text.
        if len(input_text) == 0 or len(target_label) == 0:
            continue

        input_texts.append(input_text)
        train_labels.append(target_label)
        input_chars.update(input_text)

    if not input_texts:
        raise ValueError('No usable samples found in {}'.format(data_path))

    input_chars = sorted(input_chars)
    num_input_tokens = len(input_chars)
    num_sequence = max(len(txt) for txt in input_texts)

    print('Number of samples:', len(input_texts))
    print('Number of unique input tokens:', num_input_tokens)
    print('Max sequence length for inputs:', num_sequence)

    input_token_index = {char: i for i, char in enumerate(input_chars)}

    # NOTE(review): indices start at 0 and the padding value is also 0, so
    # padded positions are indistinguishable from the first character of the
    # sorted vocabulary. Changing this would alter num_input_tokens and the
    # saved vocabulary, so it is left as is.
    train_data = np.zeros((len(input_texts), num_sequence), dtype='float32')

    for i, input_text in enumerate(input_texts):
        for t, char in enumerate(input_text):
            train_data[i, t] = input_token_index[char]

    train_labels = to_categorical(train_labels)

    return train_data, train_labels, num_sequence, num_input_tokens, input_token_index

In [13]:
# Training file location; the path looks like a Colab Google Drive mount.
# preprocess_data splits each line on tabs, despite the .csv extension.
data_path = '/content/drive/MyDrive/Colab Notebooks/train_data.csv'

# Encode the file into the index matrix, the categorical labels and the
# vocabulary metadata used by the cells below.
(
    train_data,
    train_labels,
    num_sequence,
    num_input_tokens,
    input_token_index,
) = preprocess_data(data_path)

Number of samples: 2452
Number of unique input tokens: 91
Max sequence length for inputs: 15


In [14]:
import json
import os

# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before writing into it.
os.makedirs('./assets', exist_ok=True)

# Character -> index vocabulary produced by preprocess_data.
with open('./assets/input_token_index.json', 'w') as fp:
    json.dump(input_token_index, fp)

# Sizes describing the model input and embedding.
with open('./assets/model_parameter.json', 'w') as fp:
    json.dump({
        'num_input_tokens': num_input_tokens,
        'num_sequence': num_sequence,
        # Keep in sync with the Embedding output size (512) used where the
        # model is defined.
        'embedding_dim': 512,
    }, fp)

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.optimizers import RMSprop
# 4-class classifier over character-index sequences.
model = Sequential()

# Look up a 512-dimensional vector for every character index.
model.add(layers.Embedding(num_input_tokens, 512, input_length=num_sequence))

# Three width-2 convolutions; the first two are each followed by a pooling
# step that halves the sequence length.
model.add(layers.Conv1D(32, 2, activation='relu'))
model.add(layers.MaxPool1D(2))
model.add(layers.Conv1D(32, 2, activation='relu'))
model.add(layers.MaxPool1D(2))
model.add(layers.Conv1D(32, 2, activation='relu'))

# Collapse the remaining time axis, then classify.
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(4, activation='softmax'))

In [21]:
# Use the `learning_rate` keyword: `lr` is only a deprecated alias in tf.keras
# optimizers and is not accepted by newer Keras releases.
optimizer = RMSprop(learning_rate=1e-4)

In [22]:
# Categorical cross-entropy pairs with the softmax output and the
# to_categorical labels; accuracy is reported under the key 'acc'.
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [23]:
# Train for 20 epochs in batches of 100, holding out 20% for validation.
# NOTE(review): Keras documents validation_split as taking the held-out part
# from the end of the arrays before shuffling -- confirm the input file is
# not ordered by label.
model.fit(
    train_data,
    train_labels,
    epochs=20,
    batch_size=100,
    validation_split=0.2,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f5f1a785390>

In [24]:
# Write the trained model next to the JSON files saved earlier in ./assets.
model.save('./assets/intent_model.h5')