#DeepLearning

The goal of this notebook is to train an LSTM character model.

Aim:以一个文本中的一个词作为train data，后续的所有词作为train label，从而能够根据一个给定词，预测后续的片段。

In [2]:
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile

In [3]:
# 返回的是字母序列
def read_data(filename):
    """Return the first member of the zip archive `filename` as a string.

    Only the first entry listed by the archive is read; its bytes are
    converted with `tf.compat.as_str`.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
        data = tf.compat.as_str(raw_bytes)
    return data

# Load the whole corpus as one string, then show its length and a short preview.
text = read_data('text8.zip')
print('Data size = {0}'.format(len(text)), text[:20], sep='\n')

Data size = 100000000
 anarchism originate


In [4]:
# Hold out the first `valid_size` characters for validation; train on the rest.
valid_size = 1000
valid_text, train_text = text[:valid_size], text[valid_size:]
train_size = len(train_text)
# Show the size and the first 64 characters of each split.
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


In [5]:
# Vocabulary: the 26 lowercase ASCII letters plus the space character.
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
# Code point of 'a'; used as the base when mapping letters to ids 1..26.
first_letter = ord(string.ascii_lowercase[0])

# 字符转化为编号a为1，b为2...z为26，其余字符为0
def char2id(char):
    """Map a character to its vocabulary id.

    A space maps to 0 and the lowercase letters 'a'..'z' map to 1..26.
    Any other character is reported on stdout and also mapped to 0.
    """
    if char == ' ':
        return 0
    if char in string.ascii_lowercase:
        return ord(char) - first_letter + 1
    # Anything outside the vocabulary is reported and collapsed onto id 0.
    print('Unexpected character: %s' % char)
    return 0

# 编号转化为字符
def id2char(dictid):
    """Map a vocabulary id back to its character.

    Positive ids are offset from `first_letter` (1 -> first letter, and so on);
    zero and negative ids yield a space.
    """
    return chr(dictid + first_letter - 1) if dictid > 0 else ' '

# Sanity check of both mappings; 'ï' is outside the vocabulary and exercises
# the "unexpected character" path.
print(*map(char2id, ['a', 'z', ' ', 'ï']))
print(*map(id2char, [1, 26, 0]))

Unexpected character: ï
1 26 0 0
a z  


In [32]:
# Function to generate a training batch for the LSTM model
class BatchGenerator(object):
    """Generate one-hot encoded character batches for unrolled LSTM training.

    The text is divided into `batch_size` segments of
    `len(text) // batch_size` characters each, and one cursor is kept per
    segment. Every call to `_next_batch` reads the character under each
    cursor and advances that cursor by one (wrapping around at the end of
    the text), so row `b` of consecutive batches walks through segment `b`.

    Args:
        text: the full text to draw characters from.
        batch_size: number of rows per batch, i.e. number of parallel cursors.
        num_unrollings: number of new batches produced by each `next()` call.
    """
    def __init__(self,text,batch_size,num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        # Distance between the start positions of two neighbouring cursors
        # (1562484 for the training text with batch_size=64).
        segment = self._text_size // batch_size
        self._cursor = [offset * segment for offset in range(batch_size)]
        # Prime the generator so that the first next() call already has a
        # "previous" batch to start from.
        self._last_batch = self._next_batch()

    def _next_batch(self):
        """Generate a single batch from the current cursor positions.

        Returns:
            A float array of shape (batch_size, vocabulary_size). Row `b` is
            the one-hot encoding of the character under cursor `b`. Each
            cursor is advanced by one position afterwards.
        """
        # Builtin `float` replaces the `np.float` alias, which was removed
        # in NumPy 1.24; both denote float64.
        batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=float)
        for b in range(self._batch_size):
            batch[b, char2id(self._text[self._cursor[b]])] = 1.0
            # Advance the cursor, wrapping around at the end of the text.
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size
        return batch

    def next(self):
        """Generate the next list of batches from the data.

        The list starts with the last batch of the previous call, followed by
        `num_unrollings` new batches. Batch `i` can thus serve as the input
        whose label is batch `i + 1`.

        Returns:
            A list of `num_unrollings + 1` arrays, each of shape
            (batch_size, vocabulary_size).
        """
        batches = [self._last_batch]
        for _ in range(self._num_unrollings):
            batches.append(self._next_batch())
        # Remember the final batch so the next call continues from it.
        self._last_batch = batches[-1]
        return batches

def characters(probabilites):
    """Return, for each row, the character whose probability is largest.

    `probabilites` is a 2-D array with one row per sample; this also decodes
    one-hot rows back into their characters.
    """
    best_ids = np.argmax(probabilites, axis=1)
    return [id2char(best_id) for best_id in best_ids]

def batches2string(batches):
    """Convert a sequence of batches into one string per batch row.

    Row `r` of the result is the concatenation, over all batches in order,
    of the character decoded from row `r` of each batch.
    """
    # One empty string per row of the first batch, e.g. 64 rows for shape (64, 27).
    rows = [''] * batches[0].shape[0]
    for batch in batches:
        rows = [prefix + ch for prefix, ch in zip(rows, characters(batch))]
    return rows

# Batch geometry for training: 64 parallel cursors, 10 new batches per next().
batch_size=64
num_unrollings=10

# The validation generator uses a single cursor and one new batch per call.
train_batches = BatchGenerator(train_text,batch_size,num_unrollings)
valid_batches = BatchGenerator(valid_text,1,1)

# Decode the first batch of the first training step: one character per row.
batches = train_batches.next()
print(characters(batches[0]))

print("=================================================")
# Raw one-hot arrays: num_unrollings + 1 arrays of shape (batch_size, vocabulary_size).
print(batches)

# Each next() call advances the generator's cursors, so the order of the
# calls below determines what is printed.
print(batches2string(batches))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))



['o', 'w', 'l', ' ', 'm', 'h', 'y', 'a', 't', 'm', 'n', 'h', 'e', 'e', 'o', 'y', 'o', 'a', ' ', 'a', 'i', ' ', 't', 'd', 'f', 'a', 'e', 'e', 'a', 'r', 'i', 'o', 'a', 'g', 'i', 'r', 'c', 'a', ' ', 'm', 't', 'u', 'e', 'o', 'o', 's', 'k', 'e', 'w', 'e', 't', 'e', ' ', 'i', 't', 'd', 't', 'e', 'f', 'd', 't', 'a', 'a', 's']
['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's 

In [29]:
#
def logprob(predictions, labels):
    """Log-probability of the true labels in a predicted batch.

    Args:
        predictions: array of predicted probabilities, one row per sample.
        labels: array of the same shape holding the (one-hot) true labels.

    Returns:
        The sum of `labels * -log(predictions)` divided by the number of
        rows in `labels`.
    """
    # Clamp into a new array so log() never sees values below 1e-10 and the
    # caller's `predictions` array is left unmodified.
    clipped = np.maximum(predictions, 1e-10)
    return np.sum(np.multiply(labels, -np.log(clipped))) / labels.shape[0]

# Draw one index from an array of normalized probabilities (not a normal distribution)
def sample_distribution(distribution):
    """Draw one index from `distribution`, an array of normalized probabilities.

    A uniform random threshold in [0, 1] is drawn and the first index whose
    cumulative probability reaches it is returned. If the cumulative sum
    never reaches the threshold, the last index is returned.
    """
    threshold = random.uniform(0, 1)
    cumulative = 0
    for index, probability in enumerate(distribution):
        cumulative += probability
        if cumulative >= threshold:
            return index
    return len(distribution) - 1

# 取样并返回一个[0,0...1...0,0]的列向量
def sample(prediction):
    """Turn a prediction into a 1-hot encoded sample.

    One index is drawn from the probabilities in `prediction[0]` via
    `sample_distribution`.

    Returns:
        A float array of shape [1, vocabulary_size] that is 1.0 at the
        sampled index and 0.0 elsewhere.
    """
    # Builtin `float` replaces the `np.float` alias, which was removed in
    # NumPy 1.24; both denote float64.
    one_hot = np.zeros(shape=[1, vocabulary_size], dtype=float)
    one_hot[0, sample_distribution(prediction[0])] = 1.0
    return one_hot

#随机生成0到1之间的列向量
def random_distribution():
    """Generate a random array of probabilities.

    Returns an array of shape [1, vocabulary_size] whose entries are drawn
    uniformly from [0, 1) and then divided by their sum, so the row sums to 1.
    """
    weights = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
    row_totals = weights.sum(axis=1, keepdims=True)
    return weights / row_totals



In [None]:
# Width of the LSTM: used below as the size of every gate, of the saved
# output and of the saved cell state.
num_nodes = 64

# Dedicated graph that the model is built into.
graph = tf.Graph()
with graph.as_default():
    # Parameters.
    # NOTE(review): the positional arguments of tf.truncated_normal are
    # (shape, mean, stddev), so these weights are drawn around mean -0.1 with
    # stddev 0.1 - confirm that this is intended.
    # Input gate: input, previous output, and bias.
    # Inputs are one-hot rows of width vocabulary_size; gates have num_nodes units.
    ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ib = tf.Variable(tf.zeros([1, num_nodes]))
    # Forget gate: input, previous output, and bias.
    fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    # The -0.1, 0.1 arguments belong to truncated_normal, not to tf.Variable.
    fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    fb = tf.Variable(tf.zeros([1, num_nodes]))
    # Memory cell: input, previous output, and bias.
    cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    cb = tf.Variable(tf.zeros([1, num_nodes]))
    # Output gate: input, previous output, and bias.
    ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ob = tf.Variable(tf.zeros([1, num_nodes]))
    # Variables saving state across unrollings (not trained).
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
    
    def lstm_cell(i,o,state):
        """
        Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates.

        Args:
            i: input for the current step.
            o: output of the previous step.
            state: cell state of the previous step.

        Returns:
            A tuple (output, new_state) for the current step.
        """
        def _affine(input_weights, output_weights, bias):
            # Shared form of every gate: i * W_x + o * W_m + bias.
            return tf.matmul(i, input_weights) + tf.matmul(o, output_weights) + bias

        # Ops are created in the same order as the gates are listed here.
        input_gate = tf.sigmoid(_affine(ix, im, ib))
        forget_gate = tf.sigmoid(_affine(fx, fm, fb))
        candidate = _affine(cx, cm, cb)
        # Keep part of the old state and add the gated candidate update.
        new_state = forget_gate * state + input_gate * tf.tanh(candidate)
        output_gate = tf.sigmoid(_affine(ox, om, ob))
        return output_gate * tf.tanh(new_state), new_state
    
    # Input data: one placeholder per unrolled step plus one, each holding a
    # batch of one-hot rows of shape [batch_size, vocabulary_size].
    train_data = [
        tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size])
        for _ in range(num_unrollings + 1)
    ]
    # Inputs are the first num_unrollings placeholders.
    train_inputs = train_data[:num_unrollings]
    # Labels are the inputs shifted by one time step.
    train_labels = train_data[1:]
    
        