<a href="https://colab.research.google.com/github/baeseungyou/study/blob/main/12%EC%A3%BC%EC%B0%A8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import urllib.request
import tensorflow_datasets as tfds
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import re

In [2]:
train_data = pd.read_csv('chatbot.csv')
train_data.head()

Unnamed: 0.1,Unnamed: 0,Q,A,label
0,0,12시 땡!,하루가 또 가네요.,0
1,1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,4,PPL 심하네,눈살이 찌푸려지죠.,0


In [3]:
print('챗봇 샘플의 개수 :', len(train_data))

챗봇 샘플의 개수 : 11823


In [4]:
questions = []
for sentence in train_data['Q']:
    sentence = re.sub(r"([?.!,])", r"\1", sentence)
    sentence = sentence.strip()
    questions.append(sentence)

In [5]:
answers = []
for sentence in train_data['A']:
  sentence = re.sub(r"([?.!,])", r"\1", sentence)
  sentence = sentence.strip()
  answers.append(sentence)

In [6]:
len(questions)

11823

In [7]:
print(questions[:5])
print(answers[:5])

['12시 땡!', '1지망 학교 떨어졌어', '3박4일 놀러가고 싶다', '3박4일 정도 놀러가고 싶다', 'PPL 심하네']
['하루가 또 가네요.', '위로해 드립니다.', '여행은 언제나 좋죠.', '여행은 언제나 좋죠.', '눈살이 찌푸려지죠.']


In [8]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    questions + answers, target_vocab_size=2**13)

START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]

VOCAB_SIZE = tokenizer.vocab_size + 2

In [9]:
print('시작 토근 번호 :', START_TOKEN)
print('종료 토큰 번호 :', END_TOKEN)
print('단어 집합의 크기 : ', VOCAB_SIZE)

시작 토근 번호 : [8170]
종료 토큰 번호 : [8171]
단어 집합의 크기 :  8172


In [10]:
print('Tokenized sample question: {}'. format(tokenizer.encode(questions[20])))

Tokenized sample question: [5759, 607, 3502, 138, 681, 3740, 846]


In [11]:
print('Tokenized sample answer: {}'.format(tokenizer.encode(answers[20])))

Tokenized sample answer: [2337, 510, 7947]


In [12]:
sample_string = questions[20]
tokenized_string = tokenizer.encode(sample_string)
print('정수 인코딩 후의 문장 {}'.format(tokenized_string))
original_string = tokenizer.decode(tokenized_string)
print ('기존 문장: {}'.format(original_string))

정수 인코딩 후의 문장 [5759, 607, 3502, 138, 681, 3740, 846]
기존 문장: 가스비 비싼데 감기 걸리겠어


In [13]:
sample_string = answers[20]
tokenized_string = tokenizer.encode(sample_string)
print ('정수 인코딩 후의 문장 {}'.format(tokenized_string))
original_string = tokenizer.decode(tokenized_string)
print('기존 문장: {}'.format(original_string))

정수 인코딩 후의 문장 [2337, 510, 7947]
기존 문장: 따뜻하게 사세요!


In [14]:
for ts in tokenized_string:
  print (' {} ---> {}'.format(ts, tokenizer.decode([ts])))

 2337 ---> 따뜻하게 
 510 ---> 사세요
 7947 ---> !


In [17]:
MAX_LENGTH = 40

def tokenize_and_filter(inputs, outputs):
  tokenized_inputs, tokenized_outputs = [], []

  for (sentence1, sentence2) in zip(inputs, outputs):
    sentence1 = START_TOKEN + tokenizer.encode(sentence1) + END_TOKEN
    sentence2 = START_TOKEN + tokenizer.encode(sentence2) + END_TOKEN

    tokenized_inputs.append(sentence1)
    tokenized_outputs.append(sentence2)

  tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
      tokenized_inputs, maxlen=MAX_LENGTH, padding='post')
  tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
      tokenized_outputs, maxlen = MAX_LENGTH, padding='post')

  return tokenized_inputs, tokenized_outputs

In [18]:
questions, answers = tokenize_and_filter(questions, answers)

In [19]:
print('질문 데이터의 크기(shape) : ', questions.shape)
print('답변 데이터의 크기(shape) : ', answers.shape)

질문 데이터의 크기(shape) :  (11823, 40)
답변 데이터의 크기(shape) :  (11823, 40)


In [20]:
print(questions[0])
print(answers[0])

[8170 7909 4200 3054 7947 8171    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[8170 3837   71 7888 7960 8171    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]


In [21]:
print('단어 집합의 크기(Vocab size): {}'.format(VOCAB_SIZE))
print('전체 샘플의 수(Number of samples): {}'.format(len(questions)))

단어 집합의 크기(Vocab size): 8172
전체 샘플의 수(Number of samples): 11823


In [22]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

dataset = tf.data.Dataset.from_tensor_slices((
    {
        'inputs': questions,
        'dec_inputs': answers[:, :-1]
    },
    {
        'outputs': answers[:, 1:]
    },
))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [23]:
print(answers[0])
print(answers[:1][:, :-1])
print(answers[:1][:, 1:])

[8170 3837   71 7888 7960 8171    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[[8170 3837   71 7888 7960 8171    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]]
[[3837   71 7888 7960 8171    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]]


In [26]:
!pip install transformers



In [28]:
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer

def transformer(vocab_size, num_layers, dff, d_model, num_heads, dropout):

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, d_model),
        tf.keras.layers.LSTM(dff),
        tf.keras.layers.Dense(vocab_size, activation='softmax')
    ])
    return model

tf.keras.backend.clear_session()

NUM_LAYERS = 2
D_MODEL = 256
NUM_HEADS = 8
DFF = 512
DROPOUT = 0.1

model = transformer(
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    dff=DFF,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout=DROPOUT
)

In [35]:
import tensorflow as tf

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

MAX_LENGTH = 40

learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

def accuracy(y_true, y_pred):
  y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
  y_pred = y_pred[:, :tf.shape(y_true)[1], :]
  # Correct the typo in metrics
  return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

# Define the loss function
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

model.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy])

In [36]:
EPOCHS = 50

model.fit(dataset, epochs=EPOCHS)

Epoch 1/50


AttributeError: Exception encountered when calling Embedding.call().

[1m'dict' object has no attribute 'dtype'[0m

Arguments received by Embedding.call():
  • inputs={'inputs': 'tf.Tensor(shape=(None, 40), dtype=int32)', 'dec_inputs': 'tf.Tensor(shape=(None, 39), dtype=int32)'}

In [2]:
def evaluate(sentence):
  sentence = preprocess_sentence(sentence)

  sentence = tf.expand_dims(
      START_TOKEN + tokenizer.encode(sentence) + END_TOKEN, axis=0)

  output = tf.expand_dims(START_TOKEN, 0)

  for i in range(MAX_LENGTH):
    predictions = model(input=[sentence, output], training=False)

    predictions = predictions[:, -1:, :]
    predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

    if tf.equal(predicted_id, END_TOKEN[0]):
      break

      output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0)

  def predict(sentence):
    prediction = evaluate(sentence)

    predicted_sentence = tokenizer.decode(
        [i for i in prediction if i < tokenizer.vocab_size])

    print('Input: {}'.format(sentence))
    print('Output: {}'.format(predicted_sentence))

    return predicted_sentence

In [3]:
def preprocess_sentence(sentence):
  sentence = re.sub(r"([?.1,])", r" \ 1", sentence)
  sentence = sentence.strip()
  return sentence

In [5]:
output = predict('가스비 비싼데 감기 걸리겠어') # 'predicet'를 'predict'로 변경

NameError: name 'predict' is not defined