In [None]:
# Python ≥3.5 is required
import re
import sys

assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn

# Compare the numeric (major, minor) pair rather than the raw strings: string
# comparison is lexicographic, so e.g. "0.3" >= "0.20" would wrongly pass.
assert tuple(map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())) >= (0, 20)


# Detect Google Colab. The two lines inside the `try` are an IPython magic and
# a shell escape, so this cell only runs under IPython/Jupyter.
try:

# %tensorflow_version only exists in Colab.

  %tensorflow_version 2.x

  !pip install -q -U tensorflow-addons

  IS_COLAB = True

except Exception:

  # Any exception raised in the try body (e.g. the Colab-only magic being
  # unavailable) is treated as "not running on Colab".
  IS_COLAB = False


# TensorFlow ≥2.0 is required
import re

import tensorflow as tf
from tensorflow import keras

# Compare the numeric (major, minor) pair rather than the raw strings: string
# comparison is lexicographic, so e.g. "10.0" >= "2.0" would wrongly fail.
assert tuple(map(int, re.match(r"(\d+)\.(\d+)", tf.__version__).groups())) >= (2, 0)

# Warn early when no GPU is visible to TensorFlow.
if not tf.config.list_physical_devices('GPU'):
  print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
  if IS_COLAB:
    print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")

# Common imports

import numpy as np

import os


# Seed the NumPy and TensorFlow global generators so reruns give stable output
tf.random.set_seed(42)
np.random.seed(42)

# Figure styling: render inline and set the axis / tick label font sizes
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rc('axes', labelsize=14)
mpl.rc(('xtick', 'ytick'), labelsize=12)

# Figures are written to <project root>/images/<chapter id>
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "nlp"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)


def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
  """Save the current pyplot figure as IMAGES_PATH/<fig_id>.<fig_extension>.

  Args:
    fig_id: Base file name without extension; also echoed to stdout.
    tight_layout: When true, call plt.tight_layout() before saving.
    fig_extension: File extension, also passed to savefig as the format.
    resolution: DPI passed to savefig.
  """
  filename = ".".join((fig_id, fig_extension))
  print("Saving figure", fig_id)
  if tight_layout:
    plt.tight_layout()
  plt.savefig(os.path.join(IMAGES_PATH, filename),
              format=fig_extension, dpi=resolution)



No GPU was detected. LSTMs and CNNs can be very slow without a GPU.
Go to Runtime > Change runtime and select a GPU hardware accelerator.


In [None]:
IMAGES_PATH

'./images/nlp'

In [None]:
np.random.seed(42)
tf.random.set_seed(42)

n_steps = 5
# Toy pipeline over the integers 0..14: overlapping windows of n_steps values
# (stride 2), shuffled, split into (inputs, targets shifted by one), batched by 3.
dataset = (
    tf.data.Dataset.from_tensor_slices(tf.range(15))
    .window(n_steps, shift=2, drop_remainder=True)
    .flat_map(lambda window: window.batch(n_steps))
    .shuffle(10)
    .map(lambda window: (window[:-1], window[1:]))
    .batch(3)
    .prefetch(1)
)

# Print every batch so the input/target alignment can be inspected.
for index, (X_batch, Y_batch) in enumerate(dataset):
  print("-" * 20, "Batch", index, "\nX_batch")
  print(X_batch.numpy())
  print("=" * 5, "\nY_batch")
  print(Y_batch.numpy())

-------------------- Batch 0 
X_batch
[[6 7 8 9]
 [2 3 4 5]
 [4 5 6 7]]
===== 
Y_batch
[[ 7  8  9 10]
 [ 3  4  5  6]
 [ 5  6  7  8]]
-------------------- Batch 1 
X_batch
[[ 0  1  2  3]
 [ 8  9 10 11]
 [10 11 12 13]]
===== 
Y_batch
[[ 1  2  3  4]
 [ 9 10 11 12]
 [11 12 13 14]]


## 데이터셋 준비

In [None]:
# Download the tiny-shakespeare corpus via Keras and load it into memory.
shakespeare_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(filepath, encoding="utf-8") as f:
  shakespeare_text = f.read()

In [None]:
print(shakespeare_text[:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



In [None]:
print(type(shakespeare_text))
print(len(shakespeare_text))

<class 'str'>
1115394


In [None]:
"".join(sorted(set(shakespeare_text.lower())))

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

모든 글자를 정수로 인코딩

In [None]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
# char_level: encode at the character level instead of the word level
tokenizer.fit_on_texts(shakespeare_text)

In [None]:
tokenizer.texts_to_sequences(["First"])

[[20, 6, 9, 8, 3]]

In [None]:
tokenizer.sequences_to_texts([[20,6,9,8,3]])

['f i r s t']

In [None]:
type([20,6])

list

In [None]:
type([[20,6]])

list

In [None]:
max_id = len(tokenizer.word_index) # number of distinct characters
max_id

39

In [None]:
# fit_on_texts was given a plain string, so document_count equals the number
# of characters in the text.
dataset_size = tokenizer.document_count # total number of characters
dataset_size

1115394

In [None]:
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text]))-1 # shift ids from 1..39 down to 0..38

훈련, 검증, 테스트 세트로 나누어야 하지만, 텍스트에 있는 글자의 순서를 섞으면 안 됨

-> 

처음에 등장한 로직 이용

In [None]:
train_size = dataset_size * 90 // 100
# Use the first 90% of the text as the training set

In [None]:
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [None]:
dataset

<TensorSliceDataset shapes: (), types: tf.int64>

window() 메서드로 짧은 텍스트 윈도를 갖는 데이터셋 생성

RNN은 이 부분 문자열 길이만큼만 역전파를 위해 펼쳐짐 -> Truncated BackPropagation Through Time

In [None]:
n_steps = 100
window_length = n_steps + 1 # target = input shifted one character ahead
# repeat() makes the stream endless; every window holds window_length ids and
# consecutive windows start one character apart.
dataset = dataset.repeat().window(window_length, shift = 1, 
                         drop_remainder = True)

첫번째 윈도우는 0~100 번째 글자 포함

두번째 윈도우는 1~101 번째 글자 포함

drop_remainder = True 로 해놓으면 모든 윈도우가 동일하게 101개의 글자를 포함

False로 지정하면 100개, 99개, 98개 , ... , 식으로 점점 줄어 마지막 윈도우는 글자 1개만 포함

In [None]:
dataset = dataset.flat_map(lambda window: window.batch(window_length))

window() 메서드는 각각 하나의 데이터셋으로 표현되는 윈도우를 포함하는 데이터셋을 만듦 -> 리스트의 리스트와 비슷한 ***중첩 데이터셋***

하지만 모델은 데이터셋이 아니라 ***텐서***를 기대하기 때문에 중첩 데이터셋을 ***플랫 데이터셋***으로 변환하는 flat_map() 메서드를 호출해야함

ex) {{1,2}, {3,4,5,6}} -> flat_map() -> {1,2,3,4,5,6}

ex) lambda ds: ds.batch(2) 함수를 flat_map()에 전달 -> {{1,2},{3,4,5,6}} -> {{1,2},{3,4},{5,6}} 으로 변환

In [None]:
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
batch_size = 32
# Shuffle the windows (buffer of 1000), then group them into batches.
dataset = dataset.shuffle(1000).batch(batch_size)
# Split each batch of windows into inputs (all but the last character) and
# targets (all but the first character, i.e. the inputs shifted by one).
dataset = dataset.map(lambda windows: (windows[:,:-1], windows[:,1:]))

윈도우를 배치로 만들고 -> ```dataset.shuffle(1000).batch(batch_size)```

입력(처음 100개의 글자)과 타깃(한 글자씩 뒤로 밀린 마지막 100개의 글자) 분리 -> ```dataset.map(lambda windows: (windows[:,:-1], windows[:,1:]))```

In [None]:
# One-hot encode the input character ids (depth = number of distinct
# characters); the targets stay as integer ids.
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch)
)

원-핫 벡터를 사용해 글자 인코딩

In [None]:
dataset = dataset.prefetch(1)

In [None]:
for X_batch, Y_batch in dataset.take(1):
  print(X_batch.shape, Y_batch.shape)

(32, 100, 39) (32, 100)


## Char-RNN 모델 만들고 훈련

이전 글자 100개를 기반해 다음 글자를 예측하기 위해 유닛 128개를 가진 GRU 층 2개, 입력과 은닉에 20% 드롭아웃 사용
(하이퍼 파라미터 수정 가능)

출력층은 TimeDistributed 클래스를 적용한 Dense 층

텍스트의 고유한 글자 수는 39개 이므로 이 층은 39개의 유닛(max_id)를 가져야 함

출력 확률의 합은 1이어야 하므로 Dense 층의 출력은 소프트맥스



In [None]:
# Char-RNN: two stacked GRU layers (128 units each) followed by a
# per-time-step softmax over the max_id character classes.
# NOTE(review): per the Keras GRU docs, a non-zero recurrent_dropout rules out
# the fast cuDNN kernel, which may explain the multi-hour ETA below — confirm.
model = keras.models.Sequential([
  keras.layers.GRU(128, return_sequences=True, input_shape=[None,max_id],
                   dropout=0.2, recurrent_dropout=0.2),
  
  keras.layers.GRU(128, return_sequences=True,
                   dropout=0.2, recurrent_dropout=0.2),
  keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                  activation="softmax"))                                 
])

# Targets are integer character ids (not one-hot), hence the sparse loss.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer = "adam")
# The dataset repeats forever, so steps_per_epoch defines the epoch length.
history = model.fit(dataset, steps_per_epoch=train_size // batch_size,
                    epochs=10)
# Takes a very long time

Epoch 1/10
  147/31370 [..............................] - ETA: 3:45:40 - loss: 2.8027

KeyboardInterrupt: ignored

## 상태가 있는 RNN
- RNN이 한 훈련 배치를 처리한 후에 마지막 상태를 다음 훈련 배치의 초기 상태로 사용
- 역전파는 짧은 시퀀스에서 일어나지만 모델이 장기간 패턴을 학습할 수 있음


dataset을 만들 때 window() 메서드에서 shift = 1 대신에 shift = n_steps를 사용하여 순차적이고 겹치지 않는 입력 시퀀스 생성


## 감성분석

In [None]:
tf.random.set_seed(42)

In [None]:
# load_data() returns (train inputs, train labels), (test inputs, test labels).
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()
# Alias kept for code that refers to the test inputs as X_valid.
X_valid = X_test

In [None]:
print(len(X_train[0]))
print(X_train[0][:10])

218
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]


디코딩

In [None]:
word_index = keras.datasets.imdb.get_word_index()
# Shift every word id up by 3 so that ids 0-2 are free for the special tokens.
id_to_word = {id_+3: word for word, id_ in word_index.items()}
for id_, token in enumerate(("<pad>","<sos>","<unk>")):
  id_to_word[id_] = token
# Decode the first 10 token ids of the first training review.
" ".join([id_to_word[id_] for id_ in X_train[0][:10]])

'<sos> this film was just brilliant casting location scenery story'

In [None]:
" ".join([id_to_word[id_] for id_ in X_train[1][:100]])

"<sos> big hair big boobs bad music and a giant safety pin these are the words to best describe this terrible movie i love cheesy horror movies and i've seen hundreds but this had got to be on of the worst ever made the plot is paper thin and ridiculous the acting is an abomination the script is completely laughable the best is the end showdown with the cop and how he worked out who the killer is it's just so damn terribly written the clothes are sickening and funny in equal measures the hair is big lots of boobs"

In [None]:
X_train[0][:10]

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [None]:
id_to_word[22]

'film'

### 전처리 함수작성
+ 현실에서는 항상 전처리 과정을 거쳐야함
+ 전처리를 모델 자체에 포함시키는 방법

In [None]:
import tensorflow_datasets as tfds

datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

텐서플로 데이터셋의 원본 IMDb리뷰를 텍스트(바이트스트링) 으로 적재

In [None]:
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word

In [None]:
datasets.keys()

dict_keys(['test', 'train', 'unsupervised'])

In [None]:
info.splits["train"].num_examples

25000

In [None]:
train_size = info.splits["train"].num_examples
test_size = info.splits["test"].num_examples

In [None]:
# Peek at the first two training reviews and their labels.
for X_batch, y_batch in datasets["train"].batch(2).take(1):

  for review, label in zip(X_batch.numpy(), y_batch.numpy()):

    # Reviews are byte strings; decode and show only the first 200 characters.
    print("Review:", review.decode("utf-8")[:200], "...")

    print("Label:", label, "= Positive" if label else "= Negative")

    print()



Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label: 0 = Negative

Review: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label: 0 = Negative



In [None]:
X_batch

<tf.Tensor: shape=(2,), dtype=string, numpy=
array([b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
       b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell a

In [None]:
def preprocess(X_batch, y_batch):
  """Turn a batch of raw review strings into a padded tensor of word tokens.

  Each review is cut to its first 300 units (as measured by tf.strings.substr),
  <br> tags and every character other than letters and apostrophes become
  spaces, and the text is split on whitespace. The ragged result is densified
  by padding with b"<pad>".

  Args:
    X_batch: Batch of review strings.
    y_batch: Matching labels, returned untouched.

  Returns:
    A (tokens, y_batch) tuple.
  """
  truncated = tf.strings.substr(X_batch, 0, 300)
  without_breaks = tf.strings.regex_replace(truncated, rb"<br\s*/?>", b" ")
  letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
  tokens = tf.strings.split(letters_only)
  return tokens.to_tensor(default_value=b"<pad>"), y_batch

- 훈련 속도를 높이기 위해 각 리뷰에서 처음 300 글자만 남김
- <br /> 태그를 공백으로 바꿈
- 문자와 작은 따옴표가 아닌 다른 모든 문자를 공백으로 바꿈
- ``` X_batch = tf.strings.split(X_batch) ``` 로 리뷰를 공백으로 나눔
- 이때 ragged tensor 반환
- 이 텐서를 밀집 텐서로 바꾸고 동일한 길이가 되도록 패딩 토큰 pad 로 모든 리뷰를 패딩

In [None]:
preprocess(X_batch, y_batch)

(<tf.Tensor: shape=(2, 53), dtype=string, numpy=
 array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
         b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
         b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
         b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
         b'their', b'worst', b'role', b'in', b'history', b'Even',
         b'their', b'great', b'acting', b'could', b'not', b'redeem',
         b'this', b"movie's", b'ridiculous', b'storyline', b'This',
         b'movie', b'is', b'an', b'early', b'nineties', b'US',
         b'propaganda', b'pi', b'<pad>', b'<pad>', b'<pad>'],
        [b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep',
         b'during', b'films', b'but', b'this', b'is', b'usually', b'due',
         b'to', b'a', b'combination', b'of', b'things', b'including',
         b'really', b'tired', b'being', b'warm', b'and', b'comfortable',
         b'on', b'the', b'sette', b'and', b'having', b'j

In [None]:
from collections import Counter

# One full pass over the preprocessed training set (32 reviews per batch),
# counting how often each token occurs.
vocabulary = Counter()
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
  for review in X_batch:
    vocabulary.update(review.numpy())

어휘사전 구축
+ 전체 훈련셋을 한 번 순회하면서 preprocess 함수를 적용
+ Counter 로 단어의 등장 횟수를 셈

In [None]:
vocabulary.most_common()[:3]

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

In [None]:
len(vocabulary)

53893

In [120]:
vocab_size = 10000
# Keep only the vocab_size most frequent tokens, most frequent first.
truncated_vocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]

In [121]:
truncated_vocabulary[:10]

[b'<pad>', b'the', b'a', b'of', b'and', b'to', b'I', b'is', b'in', b'this']

In [122]:
# Map each kept token to its rank in the truncated vocabulary.
word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}
for word in b"This movie was faaaaaantastic".split():
  # Unknown words fall back to vocab_size. A default is used instead of `or`
  # because id 0 (the first vocabulary entry) is falsy.
  print(word_to_id.get(word, vocab_size))

22
12
11
10000


In [124]:
# Build a static lookup table: known words map to their vocabulary index,
# anything else is assigned to one of num_oov_buckets extra ids.
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [125]:
table.lookup(tf.constant([b"This movie was faaaaantastic".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10791]])>

각 단어에 ID 부여

1000개의 oov(out of vocabulary) 버킷을 사용하는 룩업 테이블을 만듦

In [127]:
def encode_words(X_batch, y_batch):
  """Replace each word token in X_batch by its id from the lookup table.

  Args:
    X_batch: Batch of word tokens.
    y_batch: Matching labels, returned untouched.

  Returns:
    A (word ids, y_batch) tuple.
  """
  word_ids = table.lookup(X_batch)
  return word_ids, y_batch

# Endless stream of training batches (32 reviews each): tokenize, map the
# tokens to ids, and prefetch one batch ahead.
train_set = (
    datasets["train"]
    .repeat()
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [130]:
for X_batch, y_batch in train_set.take(1):
  print(X_batch)
  print(y_batch)

tf.Tensor(
[[  22   11   28 ...    0    0    0]
 [   6   21   70 ...    0    0    0]
 [4099 6881    1 ...    0    0    0]
 ...
 [  22   12  118 ...  331 1047    0]
 [1757 4101  451 ...    0    0    0]
 [3365 4392    6 ...    0    0    0]], shape=(32, 60), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


In [133]:
embed_size = 128
# Sentiment classifier: embedding -> two GRU layers -> one sigmoid unit.
model = keras.models.Sequential([
  # One embedding row per possible id: the vocabulary plus the OOV buckets.
  # NOTE(review): id 0 is b'<pad>' in this vocabulary, so mask_zero=True
  # presumably makes the downstream layers skip padded steps — confirm.
  keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                         mask_zero=True,
                         input_shape=[None]),
  keras.layers.GRU(128, return_sequences=True),
  keras.layers.GRU(128),
  keras.layers.Dense(1, activation="sigmoid")                                                        
])

model.compile(loss="binary_crossentropy", optimizer="adam",
              metrics=["accuracy"])
# train_set repeats forever, so steps_per_epoch sets the epoch length; the 32
# here matches the batch size used to build train_set.
history = model.fit(train_set, steps_per_epoch = train_size // 32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


- 첫 번째 층은 단어 ID를 임베딩으로 변환하는 Embedding 층
- 임베딩 행렬은 단어 ID 당(vocab_size + num_oov_buckets) 하나의 행과 임베딩 차원(128, 하이퍼파라미터) 당 하나의 열을 가짐
- 모델의 입력은 [배치크기, 타임스텝 수] 크기를 가진 2D 텐서 이지만 출력은 [배치 크기, 타임 스텝 수, 임베딩 크기] 의 크기를 가진 3D 텐서가 됨
