# 9-7 글자 단위 RNN(Char RNN)

## 글자 단위 RNN 언어 모델(Char RNNLM)

### 1. 데이터 처리

In [1]:
import numpy as np
import urllib.request
from tensorflow.keras.utils import to_categorical

In [3]:
# Download the Project Gutenberg text of "Alice's Adventures in Wonderland".
urllib.request.urlretrieve('http://www.gutenberg.org/files/11/11-0.txt', filename='11-0.txt')

lines = []

# The context manager closes the file even if processing a line raises.
with open('./11-0.txt', 'rb') as f:
    for line in f:
        # Strip surrounding whitespace (\r, \n), lowercase, then drop any
        # non-ASCII bytes such as \xe2\x80\x99 while decoding.
        line = line.strip().lower().decode('ascii', 'ignore')
        if line:  # keep only non-empty lines
            lines.append(line)

lines[:5]

['the project gutenberg ebook of alices adventures in wonderland, by lewis carroll',
 'this ebook is for the use of anyone anywhere in the united states and',
 'most other parts of the world at no cost and with almost no restrictions',
 'whatsoever. you may copy it, give it away or re-use it under the terms',
 'of the project gutenberg license included with this ebook or online at']

In [4]:
# Merge all lines into a single string, separated by single spaces.
text = ' '.join(lines)
print('문자열의 길이 또는 총 글자의 개수: %d' % len(text))

문자열의 길이 또는 총 글자의 개수: 159484


In [6]:
# Collect the distinct characters of the text, in sorted order.
char_vocab = sorted(set(text))
vocab_size = len(char_vocab)
print('글자 집합의 크기: {}'.format(vocab_size))

# Give every character its position in the sorted vocabulary as its integer index.
char_to_index = {c: i for i, c in enumerate(char_vocab)}
print(char_to_index)

글자 집합의 크기: 56
{' ': 0, '!': 1, '"': 2, '#': 3, '$': 4, '%': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, '[': 27, ']': 28, '_': 29, 'a': 30, 'b': 31, 'c': 32, 'd': 33, 'e': 34, 'f': 35, 'g': 36, 'h': 37, 'i': 38, 'j': 39, 'k': 40, 'l': 41, 'm': 42, 'n': 43, 'o': 44, 'p': 45, 'q': 46, 'r': 47, 's': 48, 't': 49, 'u': 50, 'v': 51, 'w': 52, 'x': 53, 'y': 54, 'z': 55}


In [7]:
# Inverse lookup table: integer index -> character.
index_to_char = {value: key for key, value in char_to_index.items()}

In [8]:
# 1. Choose the length of one training sample.
# 2. Cut the whole text into consecutive chunks of that length.
seq_length = 60  # characters per sample
# One character is subtracted before dividing; the result is the number of
# full-length samples.
n_samples = (len(text) - 1) // seq_length
print('문장 샘플의 수: {}'.format(n_samples))

문장 샘플의 수: 2658


In [14]:
train_X = []
train_y = []

# Walk over the text in strides of seq_length, one stride per sample.
for start in range(0, n_samples * seq_length, seq_length):
    stop = start + seq_length
    # Input: seq_length consecutive characters, integer-encoded.
    train_X.append([char_to_index[c] for c in text[start:stop]])
    # Target: the same window shifted one character to the right.
    train_y.append([char_to_index[c] for c in text[start + 1:stop + 1]])

print(train_X[0], '\n')
print(train_y[0]) # shift

[49, 37, 34, 0, 45, 47, 44, 39, 34, 32, 49, 0, 36, 50, 49, 34, 43, 31, 34, 47, 36, 0, 34, 31, 44, 44, 40, 0, 44, 35, 0, 30, 41, 38, 32, 34, 48, 0, 30, 33, 51, 34, 43, 49, 50, 47, 34, 48, 0, 38, 43, 0, 52, 44, 43, 33, 34, 47, 41, 30] 

[37, 34, 0, 45, 47, 44, 39, 34, 32, 49, 0, 36, 50, 49, 34, 43, 31, 34, 47, 36, 0, 34, 31, 44, 44, 40, 0, 44, 35, 0, 30, 41, 38, 32, 34, 48, 0, 30, 33, 51, 34, 43, 49, 50, 47, 34, 48, 0, 38, 43, 0, 52, 44, 43, 33, 34, 47, 41, 30, 43]


In [15]:
# No Embedding layer is used, so the integer indices are one-hot encoded here.
# num_classes is pinned to vocab_size: without it to_categorical infers the
# width from the largest index present, which could differ between the two
# tensors or from the vocab_size-wide output layer of the model.
train_X = to_categorical(train_X, num_classes=vocab_size)
train_y = to_categorical(train_y, num_classes=vocab_size)
print('train_X의 크기(shape) : {}'.format(train_X.shape)) # one-hot encoded
print('train_y의 크기(shape) : {}'.format(train_y.shape)) # one-hot encoded
'''
    샘플 개수: 2658개
    입력 시퀀스의 길이: 60
    각 벡터의 차원: 56 (원-핫 인코딩)
'''

train_X의 크기(shape) : (2658, 60, 56)
train_y의 크기(shape) : (2658, 60, 56)


### 2. 모델 생성

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed

In [17]:
# Two stacked LSTM layers that return an output at every time step, followed
# by a softmax over the character vocabulary applied to each time step.
model = Sequential([
    LSTM(256, input_shape=(None, train_X.shape[2]), return_sequences=True),
    LSTM(256, return_sequences=True),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_y, epochs=80, verbose=2)

Epoch 1/80
84/84 - 16s - loss: 3.0824 - accuracy: 0.1821
Epoch 2/80
84/84 - 14s - loss: 2.7546 - accuracy: 0.2393
Epoch 3/80
84/84 - 14s - loss: 2.3951 - accuracy: 0.3288
Epoch 4/80
84/84 - 14s - loss: 2.2476 - accuracy: 0.3630
Epoch 5/80
84/84 - 14s - loss: 2.1346 - accuracy: 0.3909
Epoch 6/80
84/84 - 14s - loss: 2.0457 - accuracy: 0.4125
Epoch 7/80
84/84 - 14s - loss: 1.9808 - accuracy: 0.4296
Epoch 8/80
84/84 - 14s - loss: 1.9118 - accuracy: 0.4486
Epoch 9/80
84/84 - 14s - loss: 1.8553 - accuracy: 0.4640
Epoch 10/80
84/84 - 14s - loss: 1.8017 - accuracy: 0.4787
Epoch 11/80
84/84 - 14s - loss: 1.7545 - accuracy: 0.4907
Epoch 12/80
84/84 - 14s - loss: 1.7092 - accuracy: 0.5018
Epoch 13/80
84/84 - 14s - loss: 1.6640 - accuracy: 0.5137
Epoch 14/80
84/84 - 14s - loss: 1.6294 - accuracy: 0.5225
Epoch 15/80
84/84 - 14s - loss: 1.5900 - accuracy: 0.5336
Epoch 16/80
84/84 - 14s - loss: 1.5498 - accuracy: 0.5441
Epoch 17/80
84/84 - 14s - loss: 1.5136 - accuracy: 0.5532
Epoch 18/80
84/84 - 14s

<tensorflow.python.keras.callbacks.History at 0x7f21540b6100>

In [18]:
def sentence_generation(model, length):
    """Generate text character by character, starting from a random character.

    Relies on the module-level ``vocab_size`` and ``index_to_char``.

    Args:
        model: object whose ``predict`` takes a one-hot array of shape
            (1, steps, vocab_size) and returns per-step scores.
        length: number of prediction steps to run.

    Returns:
        The starting character followed by the ``length`` predicted characters.
    """
    current = np.random.randint(vocab_size)  # random index of the first character
    generated = [index_to_char[current]]
    print(current, '번 글자', generated[-1], '로 예측을 시작!')

    # One-hot input buffer, filled one time step per iteration.
    X = np.zeros((1, length, vocab_size))

    for step in range(length):
        # Feed the latest character in as the input for this time step.
        X[0, step, current] = 1
        print(index_to_char[current], end="")
        # Predict on everything generated so far; keep the last step's argmax.
        step_scores = model.predict(X[:, :step + 1, :])[0]
        current = np.argmax(step_scores, 1)[-1]
        generated.append(index_to_char[current])

    return ''.join(generated)

sentence_generation(model, 100)

45 번 글자 p 로 예측을 시작!
plet. his, i do, alice thought this moment the door of the house opened, and the bil the chireer tha

'plet. his, i do, alice thought this moment the door of the house opened, and the bil the chireer that'

## 글자 단위 RNN(Char RNN)으로 텍스트 생성하기

### 1. 데이터 처리

In [None]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [19]:
# Small multi-line corpus used to train the character-level text generator below.
text='''
I get on with life as a programmer,
I like to contemplate beer.
But when I start to daydream,
My mind turns straight to wine.

Do I love wine more than beer?

I like to use words about beer.
But when I stop my talking,
My mind turns straight to wine.

I hate bugs and errors.
But I just think back to wine,
And I'm happy once again.

I like to hang out with programming and deep learning.
But when left alone,
My mind turns straight to wine.
'''

In [20]:
tokens = text.split() # split on any whitespace, which also removes the \n characters
text = ' '.join(tokens) # rejoin the tokens into one space-separated line
print(text)

I get on with life as a programmer, I like to contemplate beer. But when I start to daydream, My mind turns straight to wine. Do I love wine more than beer? I like to use words about beer. But when I stop my talking, My mind turns straight to wine. I hate bugs and errors. But I just think back to wine, And I'm happy once again. I like to hang out with programming and deep learning. But when left alone, My mind turns straight to wine.


In [21]:
# Distinct characters of the text (letters and punctuation), in sorted order.
char_vocab = sorted(set(text))
print(char_vocab)

vocab_size = len(char_vocab)
print('글자 집합의 크기 : {}'.format(vocab_size))

[' ', "'", ',', '.', '?', 'A', 'B', 'D', 'I', 'M', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']
글자 집합의 크기 : 33


In [22]:
# Map each character to its position in the sorted vocabulary.
char_to_index = {c: i for i, c in enumerate(char_vocab)}
print(char_to_index)

{' ': 0, "'": 1, ',': 2, '.': 3, '?': 4, 'A': 5, 'B': 6, 'D': 7, 'I': 8, 'M': 9, 'a': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'i': 18, 'j': 19, 'k': 20, 'l': 21, 'm': 22, 'n': 23, 'o': 24, 'p': 25, 'r': 26, 's': 27, 't': 28, 'u': 29, 'v': 30, 'w': 31, 'y': 32}


stude -> n 예측<br>
tuden -> t 예측

In [32]:
# Each sample holds 11 characters: a 10-character input plus the character to predict.
length = 11
# Slide an 11-character window over the text, one character at a time.
# NOTE(review): the upper bound excludes the window that ends on the final
# character of text, so that character is never used as a target.
sequences = [text[end - length:end] for end in range(length, len(text))]

print('총 훈련 샘플의 수; %d' % len(sequences))
sequences[:10]

총 훈련 샘플의 수; 426


['I get on wi',
 ' get on wit',
 'get on with',
 'et on with ',
 't on with l',
 ' on with li',
 'on with lif',
 'n with life',
 ' with life ',
 'with life a']

In [33]:
# Integer-encode every sequence, character by character.
X = [[char_to_index[char] for char in line] for line in sequences]

for line in X[:5]:
    print(line)

[8, 0, 16, 14, 28, 0, 24, 23, 0, 31, 18]
[0, 16, 14, 28, 0, 24, 23, 0, 31, 18, 28]
[16, 14, 28, 0, 24, 23, 0, 31, 18, 28, 17]
[14, 28, 0, 24, 23, 0, 31, 18, 28, 17, 0]
[28, 0, 24, 23, 0, 31, 18, 28, 17, 0, 21]


In [34]:
# Separate the label: the last column is the target character and the
# remaining columns form the input sequence.
sequences = np.array(X)
X, y = sequences[:, :-1], sequences[:, -1]

for row in X[:5]:
    print(row)
print(y[:5])

[ 8  0 16 14 28  0 24 23  0 31]
[ 0 16 14 28  0 24 23  0 31 18]
[16 14 28  0 24 23  0 31 18 28]
[14 28  0 24 23  0 31 18 28 17]
[28  0 24 23  0 31 18 28 17  0]
[18 28 17  0 21]


In [35]:
# One-hot encode the inputs (one sample at a time) and the labels, using
# vocab_size as the number of classes for both.
sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
X = np.array(sequences)
y = to_categorical(y, num_classes=vocab_size)

print(X.shape)
'''
    샘플 수 : 426개
    입력 시퀀스의 길이: 10
    각 벡터의 차원: 33
'''

(426, 10, 33)


### 2. 모델 설계

In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [37]:
# A single LSTM layer reads the one-hot input sequence; the Dense softmax
# layer outputs a distribution over the character vocabulary.
model = Sequential([
    LSTM(80, input_shape=(X.shape[1], X.shape[2])),
    Dense(vocab_size, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=100, verbose=2)

Epoch 1/100
14/14 - 1s - loss: 3.4576 - accuracy: 0.1526
Epoch 2/100
14/14 - 0s - loss: 3.2922 - accuracy: 0.1972
Epoch 3/100
14/14 - 0s - loss: 3.0505 - accuracy: 0.1972
Epoch 4/100
14/14 - 0s - loss: 2.9816 - accuracy: 0.1972
Epoch 5/100
14/14 - 0s - loss: 2.9433 - accuracy: 0.1972
Epoch 6/100
14/14 - 0s - loss: 2.9259 - accuracy: 0.1972
Epoch 7/100
14/14 - 0s - loss: 2.9177 - accuracy: 0.1972
Epoch 8/100
14/14 - 0s - loss: 2.8888 - accuracy: 0.1972
Epoch 9/100
14/14 - 0s - loss: 2.8698 - accuracy: 0.1972
Epoch 10/100
14/14 - 0s - loss: 2.8388 - accuracy: 0.1972
Epoch 11/100
14/14 - 0s - loss: 2.8045 - accuracy: 0.1972
Epoch 12/100
14/14 - 0s - loss: 2.7640 - accuracy: 0.2019
Epoch 13/100
14/14 - 0s - loss: 2.7170 - accuracy: 0.2089
Epoch 14/100
14/14 - 0s - loss: 2.6814 - accuracy: 0.2300
Epoch 15/100
14/14 - 0s - loss: 2.6442 - accuracy: 0.2512
Epoch 16/100
14/14 - 0s - loss: 2.5779 - accuracy: 0.2512
Epoch 17/100
14/14 - 0s - loss: 2.5259 - accuracy: 0.2535
Epoch 18/100
14/14 - 0s

<tensorflow.python.keras.callbacks.History at 0x7f211019b6d0>

In [38]:
def sentence_generation(model, char_to_index, seq_length, seed_text, n):
    """Extend ``seed_text`` by ``n`` characters predicted one at a time.

    Args:
        model: trained Keras model; ``model.predict`` is called on a one-hot
            batch built from the current text.
        char_to_index: dict mapping each character to its integer index.
        seq_length: number of time steps the current text is padded or
            truncated to before prediction.
        seed_text: initial text used to start generation.
        n: number of characters to generate.

    Returns:
        ``seed_text`` followed by the ``n`` generated characters.
    """
    # Build the inverse map once instead of scanning the dict at every step.
    idx_to_char = {index: char for char, index in char_to_index.items()}
    n_classes = len(char_to_index)

    current_text = seed_text
    generated = []

    for _ in range(n):
        # Integer-encode the current text, fit it to seq_length, then one-hot encode.
        encoded = [char_to_index[char] for char in current_text]
        encoded = pad_sequences([encoded], maxlen=seq_length, padding='pre')
        encoded = to_categorical(encoded, num_classes=n_classes)

        # Sequential.predict_classes() is no longer available in current Keras;
        # take the argmax of the predicted distribution instead.
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])

        char = idx_to_char[predicted_index]
        current_text += char  # the predicted character becomes part of the next input
        generated.append(char)

    return seed_text + ''.join(generated)

print(sentence_generation(model, char_to_index, 10, 'I get on w', 80))



I get on with life as a programmer, I like to use words about beer. But when I stap m o aa
