# Author
Created by HakHyeon Song.

Artificial Intelligence, Fall Semester, 2021.

# 라이브러리 포함하기

In [None]:
import numpy as np

from tensorflow.keras.layers import Embedding, Flatten, Dense, LSTM, SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# 텍스트를 정수로 변환하기

### Text를 정수로 인코딩: text의 각 단어에 고유한 정수가 할당됨

- Keras Tokenizer를 이용해 단어에서 고유한 정수로 mapping
- fit_on_texts(): text 분석
- texts_to_sequences(): text sequence를 정수 sequence로 변환

In [None]:
# Toy training corpus: the lyrics of "White Christmas".
# Every lyric line is terminated by two newline characters (the original
# triple-quoted literal had an escaped newline plus the physical line break).
text_data = (
    "I am dreaming of a white Christmas\n\n"
    "Just like the ones I used to know\n\n"
    "Where the tree tops glisten\n\n"
    "And children listen\n\n"
    "To hear sleight bells in the snow oh the snow\n\n"
    "I said I am dreaming of a white Christmas\n\n"
    "With every Christmas card I write\n\n"
    "May your days be merry and bright\n\n"
    "And may all your Christmas be white\n\n"
)

In [None]:
# Learn the word -> integer id vocabulary from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text_data])

# texts_to_sequences returns one id sequence per input text; exactly one text
# is passed in, so unpack that single sequence into a flat list of word ids.
(encoded,) = tokenizer.texts_to_sequences([text_data])
print(encoded)

[1, 6, 7, 8, 9, 4, 2, 15, 16, 3, 17, 1, 18, 10, 19, 20, 3, 21, 22, 23, 5, 24, 25, 10, 26, 27, 28, 29, 3, 11, 30, 3, 11, 1, 31, 1, 6, 7, 8, 9, 4, 2, 32, 33, 2, 34, 1, 35, 12, 13, 36, 14, 37, 5, 38, 5, 12, 39, 13, 2, 14, 4]


## 어휘 크기 알아내기

Tokenizer의 word_index를 이용해 단어–정수 매핑과 어휘 크기를 확인 (word_index의 정수는 1부터 시작하고 0은 단어에 할당되지 않으므로, 어휘 크기는 단어 수 + 1)

In [None]:
# Show the word -> id mapping learned by the tokenizer.
print(tokenizer.word_index)

# Word ids start at 1, so one extra slot is reserved for the unused id 0.
vocab_size = 1 + len(tokenizer.word_index)
print('vocab_size', vocab_size)

{'i': 1, 'christmas': 2, 'the': 3, 'white': 4, 'and': 5, 'am': 6, 'dreaming': 7, 'of': 8, 'a': 9, 'to': 10, 'snow': 11, 'may': 12, 'your': 13, 'be': 14, 'just': 15, 'like': 16, 'ones': 17, 'used': 18, 'know': 19, 'where': 20, 'tree': 21, 'tops': 22, 'glisten': 23, 'children': 24, 'listen': 25, 'hear': 26, 'sleight': 27, 'bells': 28, 'in': 29, 'oh': 30, 'said': 31, 'with': 32, 'every': 33, 'card': 34, 'write': 35, 'days': 36, 'merry': 37, 'bright': 38, 'all': 39}
vocab_size 40


## 단어 시퀀스 생성

한 단어를 입력으로 받아 바로 다음 단어를 출력하도록, 연속한 두 단어 [현재 단어, 다음 단어]로 이루어진 sequence 생성

In [None]:
# Pair every word id with the id that immediately follows it, giving one
# [current, next] pair per adjacent word pair in the corpus.
# (The variable name is kept as-is because later cells read it.)
seqences = [list(pair) for pair in zip(encoded, encoded[1:])]
print(seqences)
print('length', len(seqences))

[[1, 6], [6, 7], [7, 8], [8, 9], [9, 4], [4, 2], [2, 15], [15, 16], [16, 3], [3, 17], [17, 1], [1, 18], [18, 10], [10, 19], [19, 20], [20, 3], [3, 21], [21, 22], [22, 23], [23, 5], [5, 24], [24, 25], [25, 10], [10, 26], [26, 27], [27, 28], [28, 29], [29, 3], [3, 11], [11, 30], [30, 3], [3, 11], [11, 1], [1, 31], [31, 1], [1, 6], [6, 7], [7, 8], [8, 9], [9, 4], [4, 2], [2, 32], [32, 33], [33, 2], [2, 34], [34, 1], [1, 35], [35, 12], [12, 13], [13, 36], [36, 14], [14, 37], [37, 5], [5, 38], [38, 5], [5, 12], [12, 39], [39, 13], [13, 2], [2, 14], [14, 4]]
length 61


## 훈련 데이터와 정답 생성하기

생성한 sequence를 이용해 훈련 데이터의 입력(X)과 정답(Y)을 생성

In [None]:
# Convert the list of [current, next] pairs into a two-column array, then
# split it: column 0 is the model input, column 1 is the target word id.
seqences = np.array(seqences)
X = seqences[:, 0]
Y = seqences[:, 1]
print("X: ", X)
print("Y: ", Y)

X:  [ 1  6  7  8  9  4  2 15 16  3 17  1 18 10 19 20  3 21 22 23  5 24 25 10
 26 27 28 29  3 11 30  3 11  1 31  1  6  7  8  9  4  2 32 33  2 34  1 35
 12 13 36 14 37  5 38  5 12 39 13  2 14]
Y:  [ 6  7  8  9  4  2 15 16  3 17  1 18 10 19 20  3 21 22 23  5 24 25 10 26
 27 28 29  3 11 30  3 11  1 31  1  6  7  8  9  4  2 32 33  2 34  1 35 12
 13 36 14 37  5 38  5 12 39 13  2 14  4]


## 신경망 모델 정의

- Embedding: 번호가 붙여진 단어를 입력 받아 실수로 된 단어 벡터를 출력
- LSTM: 50개 유닛 설정
- Dense: softmax 활성화 함수를 사용하여 어휘의 각 단어가 다음 단어일 확률을 출력

In [None]:
# Next-word model: a single word id goes in, and a probability for every
# vocabulary index comes out.
model = Sequential([
    # Maps each word id to a 10-dimensional vector; inputs are one word long.
    Embedding(vocab_size, 10, input_length=1),
    # Recurrent layer with 50 units.
    LSTM(50),
    # One softmax output per vocabulary index.
    Dense(vocab_size, activation='softmax'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 10)             400       
                                                                 
 lstm (LSTM)                 (None, 50)                12200     
                                                                 
 dense (Dense)               (None, 40)                2040      
                                                                 
Total params: 14,640
Trainable params: 14,640
Non-trainable params: 0
_________________________________________________________________


## 컴파일 및 학습
- 다중 범주 분류 문제 => sparse_categorical_crossentropy (정답 Y가 one-hot 벡터가 아닌 정수 레이블이므로 sparse 버전을 사용)

In [None]:
# The targets are integer word ids rather than one-hot vectors, hence the
# sparse variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the (word, next word) pairs for 500 epochs; verbose=2 logs one
# summary line per epoch.
model.fit(x=X, y=Y, epochs=500, verbose=2)

Epoch 1/500
2/2 - 2s - loss: 3.6894 - accuracy: 0.0164 - 2s/epoch - 1s/step
Epoch 2/500
2/2 - 0s - loss: 3.6880 - accuracy: 0.0328 - 17ms/epoch - 9ms/step
Epoch 3/500
2/2 - 0s - loss: 3.6867 - accuracy: 0.0656 - 7ms/epoch - 3ms/step
Epoch 4/500
2/2 - 0s - loss: 3.6853 - accuracy: 0.1148 - 8ms/epoch - 4ms/step
Epoch 5/500
2/2 - 0s - loss: 3.6842 - accuracy: 0.1803 - 8ms/epoch - 4ms/step
Epoch 6/500
2/2 - 0s - loss: 3.6829 - accuracy: 0.1803 - 7ms/epoch - 3ms/step
Epoch 7/500
2/2 - 0s - loss: 3.6817 - accuracy: 0.1967 - 7ms/epoch - 3ms/step
Epoch 8/500
2/2 - 0s - loss: 3.6803 - accuracy: 0.2131 - 9ms/epoch - 5ms/step
Epoch 9/500
2/2 - 0s - loss: 3.6790 - accuracy: 0.1803 - 8ms/epoch - 4ms/step
Epoch 10/500
2/2 - 0s - loss: 3.6778 - accuracy: 0.1967 - 8ms/epoch - 4ms/step
Epoch 11/500
2/2 - 0s - loss: 3.6762 - accuracy: 0.2131 - 7ms/epoch - 4ms/step
Epoch 12/500
2/2 - 0s - loss: 3.6750 - accuracy: 0.2131 - 8ms/epoch - 4ms/step
Epoch 13/500
2/2 - 0s - loss: 3.6735 - accuracy: 0.2295 - 10ms

<keras.callbacks.History at 0x7fe8c28700d0>

## 테스트

In [None]:
# Encode a seed word with the tokenizer that was fitted on the training corpus.
# NOTE: this rebinds `encoded`, which earlier held the encoded training
# corpus; re-run the corpus-encoding cell before rebuilding the training pairs.
test_text = 'White'
encoded = tokenizer.texts_to_sequences([test_text])[0]

# Words the tokenizer has never seen are dropped from the result, which would
# leave an empty sequence that the model cannot consume. Fail early with a
# clear message instead of an obscure shape error inside predict().
if not encoded:
  raise ValueError(
      f"{test_text!r} contains no word from the training vocabulary")

# Wrap in an outer list to add the leading batch dimension.
encoded = np.array([encoded])
print('encoded', encoded)

encoded [[4]]


In [None]:
# Run the trained model on the encoded seed word. The result holds one row
# per input, with one score per vocabulary index.
onehot_output = model.predict(x=encoded)
print('onehot_output=', onehot_output)

onehot_output= [[2.43190379e-06 1.09642526e-04 9.82548177e-01 4.18424461e-05
  9.08993607e-05 1.75916048e-05 2.70592049e-04 5.33498242e-04
  4.58541354e-06 6.56811055e-04 2.13650040e-08 3.72943869e-05
  4.07397374e-06 6.34466574e-07 3.21009965e-03 8.65137190e-05
  8.30893256e-08 1.61437656e-05 1.07132793e-04 1.71174048e-04
  8.64964295e-06 1.21974754e-05 2.20286296e-07 5.02700061e-07
  4.20046433e-07 2.86070212e-06 1.45969010e-04 5.62594050e-06
  6.66869309e-05 9.80421788e-08 7.20282742e-06 1.46743929e-04
  1.57661038e-04 5.27075190e-06 1.45558224e-04 1.29304113e-04
  1.12480782e-02 6.94038090e-06 6.49332321e-07 1.24187764e-07]]


In [None]:
# The position of the highest score is the predicted word id. The argmax is
# taken over the flattened array, which holds a single row here.
output = onehot_output.argmax()
print('output=', output)

output= 2


In [None]:
# Translate the predicted id back into its word. The tokenizer keeps the
# reverse (id -> word) mapping in `index_word`, so a direct lookup replaces
# the linear scan over `word_index`.
predicted_word = tokenizer.index_word.get(int(output))
if predicted_word is not None:
  print(test_text, "=>", predicted_word)
else:
  # No word carries this id (word ids start at 1, so id 0 is one such case).
  print(test_text, "=>", f"<no word for index {output}>")

White => christmas
