In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.utils import np_utils

Using TensorFlow backend.


1. 问题描述：对于原始字符集“AB……Z”，根据前面的字符预测下一个字符。
2. 思路：首先定义问题的输入输出，对于序列问题，每一步的输入组成的序列为模型的输入，每一步的输出组成的序列为模型的输出；然后，必须将语义输入输出转换成计算机能够处理的数值，这个过程中用一个字典映射作辅助。

In [2]:
raw_data = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
window = 3  # context length: how many preceding letters predict the next one (like n in an n-gram model)

# Two-way lookup tables between letters and integer codes: the model only computes
# on numbers, and the reverse table turns numeric predictions back into letters.
char_to_int = {letter: code for code, letter in enumerate(raw_data)}
int_to_char = {code: letter for code, letter in enumerate(raw_data)}

# Slide a window over the alphabet: every `window` consecutive letters form one
# input sequence and the letter right after them is the target.
x_data = []
y_data = []
print("window=%s时理想的效果：" % window)
for start in range(len(raw_data) - window):
    context = raw_data[start:start + window]
    target = raw_data[start + window]
    x_data.append([char_to_int[letter] for letter in context])  # encode the input letters as integers
    y_data.append(char_to_int[target])
    print(context, '->', target)
print("total samples: %s" % len(x_data))

window=3时理想的效果：
ABC -> D
BCD -> E
CDE -> F
DEF -> G
EFG -> H
FGH -> I
GHI -> J
HIJ -> K
IJK -> L
JKL -> M
KLM -> N
LMN -> O
MNO -> P
NOP -> Q
OPQ -> R
PQR -> S
QRS -> T
RST -> U
STU -> V
TUV -> W
UVW -> X
VWX -> Y
WXY -> Z
total samples: 23


* 从数据集中抽取了 (DEF,G) (HIJ,K) (LMN,O) (PQR,S) (TUV,W) 五条数据用作测试集，剩下的作为训练集。保证所有字母都是模型见过的。

In [3]:
# Arrange the integer-encoded windows as (samples, time steps, features), the 3-D
# layout handed to the LSTM layer, and scale the letter codes into [0, 1) by
# dividing by the alphabet size. float() keeps this a true division on any
# Python version and matches the scaling used when predicting.
x = np.reshape(x_data, (len(x_data), window, 1)) / float(len(raw_data))

# One-hot encode the targets. The class count is pinned to the alphabet size
# (second positional argument) so the label width never depends on which letters
# happen to occur in y_data.
y = np_utils.to_categorical(y_data, len(raw_data))

# Hold out every fourth window (DEF, HIJ, LMN, PQR, TUV) as the test set; all
# remaining windows are used for training. The training indices are derived from
# the held-out ones so the two sets cannot drift apart.
test_idx = [3, 7, 11, 15, 19]
train_idx = [i for i in range(len(x_data)) if i not in test_idx]
assert len(train_idx) + len(test_idx) == len(x_data)

# Integer-list indexing returns new NumPy arrays for each subset.
x_train, y_train = x[train_idx], y[train_idx]
x_test, y_test = x[test_idx], y[test_idx]

print("train samples count: %s, test samples count: %s"% (len(x_train),len(x_test)))
print("test sample:",x_test,y_test)

train samples count: 18, test samples count: 5
test sample: [[[ 0.11538462]
  [ 0.15384615]
  [ 0.19230769]]

 [[ 0.26923077]
  [ 0.30769231]
  [ 0.34615385]]

 [[ 0.42307692]
  [ 0.46153846]
  [ 0.5       ]]

 [[ 0.57692308]
  [ 0.61538462]
  [ 0.65384615]]

 [[ 0.73076923]
  [ 0.76923077]
  [ 0.80769231]]] [[ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   1.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  1.  0.  0.  0.]]


### 为什么测试准确率为0？为什么模型不能根据规律推测？
* 是不是训练集的知识和测试集的知识有断层（模型都没有见过测试集中的字母'V','W','X','Y','Z'）？
* 注意：按当前的划分，测试集的目标字母 G、K、O、S、W 虽然都在训练输入中出现过，却从未作为训练标签出现；softmax 输出层没有机会学到这些类别，这很可能就是测试准确率为0的原因。

In [4]:
# Single-layer LSTM classifier: 32 recurrent units read one (time steps, features)
# window, and a softmax layer emits one probability per one-hot class.
lstm = Sequential()
lstm.add(LSTM(32, input_shape=(x_train.shape[1], x_train.shape[2])))
lstm.add(Dense(y_train.shape[1], activation="softmax"))
lstm.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])
lstm.fit(x_train, y_train, batch_size=1, epochs=500, verbose=0)

# evaluate() yields [loss, accuracy] here because accuracy is the only metric
# requested in compile(); index 1 is therefore the accuracy.
score = lstm.evaluate(x_train, y_train, verbose=0)
print("train accuracy: %.2f" % (score[1]*100))

# Predict every held-out window in one batch. x_test is already reshaped and
# scaled exactly like the training data, so no preprocessing is repeated here.
predictions = lstm.predict(x_test, verbose=0)
# Undo the scaling (code / alphabet size) to recover the integer letter codes of
# each test window for display; rint absorbs floating-point round-off.
test_codes = np.rint(x_test[:, :, 0] * len(raw_data)).astype(int)
for codes, probs in zip(test_codes, predictions):
    seq_in = [int_to_char[int(code)] for code in codes]
    result = int_to_char[int(np.argmax(probs))]  # most probable class -> letter
    print(seq_in,'->',result)

score2 = lstm.evaluate(x_test, y_test, verbose=0)
print("test accuracy: %.2f" % (score2[1]*100))

train accurancy: 100.00
['D', 'E', 'F'] -> H
['H', 'I', 'J'] -> J
['L', 'M', 'N'] -> N
['P', 'Q', 'R'] -> R
['T', 'U', 'V'] -> V
test accuracy: 0.00
