# LSTM文本生成实战
任务：基于flare文本数据，建立LSTM模型，预测序列文字  
1、完成数据预处理，将文字序列数据转化为可用于LSTM输入的数据  
2、查看文字数据预处理后的数据结构，并进行数据分离操作  
3、针对字符串输入（"flare is a teacher in ai industry.   
   He obtained his phd in Australia."），预测其对应的后续字符  
备注:模型结构：单层LSTM，输出有20个神经元，每次使用前20个字符预测第21个字符


In [26]:
#load the data
data = open('flare').read()
#移除换行符
data = data.replace('\n', '').replace('\r', '')
data[:50]

'flare is a teacher in ai industry. He obtained his'

In [2]:
#字符去重处理
letters = list(set(data))
num_letters = len(letters)
print(letters, '\n', num_letters)

['l', 'f', 'H', 'i', 't', 'c', 's', 'A', 'u', 'e', 'r', 'a', 'n', 'p', 'y', 'b', 'o', 'S', 'm', ' ', 'd', '.', 'h'] 
 23


In [3]:
#建立字典
#int to char
int_to_char = {a: b for a, b in enumerate(letters)}
#char to int
char_to_int = {b: a for a, b in enumerate(letters)}
print(int_to_char, '\n', char_to_int)

{0: 'l', 1: 'f', 2: 'H', 3: 'i', 4: 't', 5: 'c', 6: 's', 7: 'A', 8: 'u', 9: 'e', 10: 'r', 11: 'a', 12: 'n', 13: 'p', 14: 'y', 15: 'b', 16: 'o', 17: 'S', 18: 'm', 19: ' ', 20: 'd', 21: '.', 22: 'h'} 
 {'l': 0, 'f': 1, 'H': 2, 'i': 3, 't': 4, 'c': 5, 's': 6, 'A': 7, 'u': 8, 'e': 9, 'r': 10, 'a': 11, 'n': 12, 'p': 13, 'y': 14, 'b': 15, 'o': 16, 'S': 17, 'm': 18, ' ': 19, 'd': 20, '.': 21, 'h': 22}


In [6]:
import numpy as np
from keras.utils import to_categorical


#滑动窗口提取数据
def extract_data(data, slide):
    x = []
    y = []
    for i in range(len(data) - slide):
        x.append([a for a in data[i:i + slide]])
        y.append(data[i + slide])
    return x, y


#字符到数字的批量转化
def char_to_int_Data(x, y, char_to_int):
    x_to_int = []
    y_to_int = []
    for i in range(len(x)):
        x_to_int.append([char_to_int[char] for char in x[i]])
        y_to_int.append([char_to_int[char] for char in y[i]])
    return x_to_int, y_to_int


#实现输入字符文章的批量处理，输入整个字符，滑动窗口大小，转化字典
def data_preprocessing(data, slide, num_letters, char_to_int):
    char_Data = extract_data(data, slide)
    int_Data = char_to_int_Data(char_Data[0], char_Data[1], char_to_int)
    Input = int_Data[0]
    Output = list(np.array(int_Data[1]).flatten())
    Input_RESHAPED = np.array(Input).reshape(len(Input), slide)
    new = np.random.randint(
        0,
        10,
        size=[Input_RESHAPED.shape[0], Input_RESHAPED.shape[1], num_letters])
    for i in range(Input_RESHAPED.shape[0]):
        for j in range(Input_RESHAPED.shape[1]):
            new[i, j, :] = to_categorical(Input_RESHAPED[i, j],
                                          num_classes=num_letters)
    return new, Output

In [5]:
#extract X and y from text data
time_step = 20
X, y = data_preprocessing(data, time_step, num_letters, char_to_int)

In [19]:
#split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=10)
y_train_category = to_categorical(y_train, num_letters)

In [9]:
#set up the model
from keras.models import Sequential
from keras.layers import Dense, LSTM
model = Sequential()
model.add(
    LSTM(units=20,
         input_shape=(X_train.shape[1], X_train.shape[2]),
         activation='relu'))
model.add(Dense(units=num_letters, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 20)                3520      
_________________________________________________________________
dense_1 (Dense)              (None, 23)                483       
Total params: 4,003
Trainable params: 4,003
Non-trainable params: 0
_________________________________________________________________


In [12]:
#train the model
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
model.fit(X_train, y_train_category, batch_size=1000, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x648607490>

In [20]:
#make prediction based on the training data
y_train_predict = model.predict_classes(X_train)
y_train_predict_char = [int_to_char[i] for i in y_train_predict]
from sklearn.metrics import accuracy_score
accuracy_train = accuracy_score(y_train, y_train_predict)
accuracy_train

0.9914907090416164

In [21]:
y_test_predict = model.predict_classes(X_test)
accuracy_test = accuracy_score(y_text, y_test_predict)
accuracy_test

0.992520035618878

In [25]:
new_letters = 'flare is a teacher in ai industry. He obtained his phd in Australia. Australia is a country in the Southern Hemisphere.flare is a teacher in ai industry. He obtained his phd in Australia. Australia is a country in the Southern Hemisphere.flare is a teacher in ai industry. He obtained his phd in Australia. Australia is a country in the Southern Hemisphere.flare is a teacher in ai industry. He obtained his phd in Australia. Australia is a country in the Southern Hemisphere.'
X_new, y_new = data_preprocessing(new_letters, time_step, num_letters,
                                  char_to_int)
y_new_predict = model.predict_classes(X_new)
y_new_predict_char = [int_to_char[i] for i in y_new_predict]
for i in range(0, X_new.shape[0]):
    print(new_letters[i:i + 20], '--predict next letter is--',
          y_new_predict_char[i])

flare is a teacher i --predict next letter is-- n
lare is a teacher in --predict next letter is--  
are is a teacher in  --predict next letter is-- a
re is a teacher in a --predict next letter is-- i
e is a teacher in ai --predict next letter is-- n
 is a teacher in ai  --predict next letter is-- i
is a teacher in ai i --predict next letter is-- n
s a teacher in ai in --predict next letter is-- d
 a teacher in ai ind --predict next letter is-- u
a teacher in ai indu --predict next letter is-- s
 teacher in ai indus --predict next letter is-- t
teacher in ai indust --predict next letter is-- r
eacher in ai industr --predict next letter is-- y
acher in ai industry --predict next letter is-- .
cher in ai industry. --predict next letter is--  
her in ai industry.  --predict next letter is-- H
er in ai industry. H --predict next letter is-- e
r in ai industry. He --predict next letter is--  
 in ai industry. He  --predict next letter is-- o
in ai industry. He o --predict next letter is-- b


he Southern Hemisphe --predict next letter is-- r
e Southern Hemispher --predict next letter is-- e
 Southern Hemisphere --predict next letter is-- .


LSTM文本生成实战summary：
1、通过搭建LSTM模型，实现了基于文本序列的字符生成功能  
2、学习了文本加载，字典生成方法  
3、掌握了文本的数据预处理方法，并熟悉了转化数据的结构  
4、实现了对新文本数据的字符预测