In [1]:
'''https://www.jianshu.com/p/795a5e2cd10c
http://alwa.info/2016/09/26/Keras-%E5%AE%9E%E7%8E%B0-LSTM/
'''
'''This script loads pre-trained word embeddings (GloVe embeddings)
into a frozen Keras Embedding layer, and uses it to
train a text classification model on the 20 Newsgroup dataset
(classication of newsgroup messages into 20 different categories).
GloVe embedding data can be found at:
http://nlp.stanford.edu/data/glove.6B.zip
(source page: http://nlp.stanford.edu/projects/glove/)
20 Newsgroup data can be found at:
http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
'''
# 多分类问题  序列
from __future__ import print_function
import os
import numpy as np
np.random.seed(1337)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import LSTM, SimpleRNN, GRU
import sys

Using TensorFlow backend.


In [2]:
BASE_DIR = 'D:/data analysis/data archives'
GLOVE_DIR = BASE_DIR + '/glove.6B/'
TEXT_DATA_DIR = BASE_DIR + '/20_newsgroup/'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100   # glove.6B ----->  array(1,100)
VALIDATION_SPLIT = 0.2   # 验证
batch_size = 32

# first, build index mapping words in the embeddings set
# to their embedding vector

# embeddings_index = { "word1": "vector array of (1, 100)", "word2": "vector array of (1, 100)",  ...}   (20001,(1,100))
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='UTF-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))
#Found 400000 word vectors.



# second, prepare text samples and their labels
print('Processing text dataset')

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id   20 个dict
labels = []  # list of label ids   20000个label  [0:1000]=0 [1000:2000]=1 ... （0-19)
# iter 20 messages dir
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        # 20 messages put tag 0-19
        label_id = len(labels_index)
        # 20 messages and its (index)label save to dict
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                # python 版本
                if sys.version_info < (3,):
                    f = open(fpath)
                else:
                    f = open(fpath, encoding='latin-1')
                texts.append(f.read())
                f.close()
                labels.append(label_id)

print('Found %s texts.' % len(texts))
#Found 19997 texts.

Found 400000 word vectors.
Processing text dataset
Found 19997 texts.


# Tokenizer()
### somestr = ['ha ha gua angry',  'howa ha gua excited naive']
### tok = tt.Tokenizer()
### tok.fit_on_texts(somestr)

### tok.word_index
### Out[90]: {'angry': 3, 'excited': 5, 'gua': 2, 'ha': 1, 'howa': 4, 'naive': 6}

### tok.texts_to_sequences(somestr)
### Out[91]: [[1, 1, 2, 3], [4, 1, 2, 5, 6]]


In [4]:
# finally, vectorize the text samples into a 2D integer tensor
# 生成的就是列数为MAX_NB_WORDS的matrix
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
print(len(sequences)) # 19997
# 生成不重复的  (unique) 索引(tokens) Out[90]: {'angry': 3, 'excited': 5, 'gua': 2, 'ha': 1, 'howa': 4, 'naive': 6}
# 出现 21 万个单词
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))



19997
Found 214873 unique tokens.


##### 上面的代码吧所有的单词都转换成了数字

In [6]:
for i in range(100):
    print(len(sequences[0]))# 1655

1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655
1655


In [7]:
# 取19997 篇 message 的前1000 个单词 
# data 是一个长度为 1000 的 array，sequences 中不够长的部分被补0了
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# labels 被转换成了 one-hot 编码的形式
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print(x_train.shape)
print(y_train.shape)
print(nb_validation_samples)

print('Preparing embedding matrix.')

Shape of data tensor: (19997, 1000)
Shape of label tensor: (19997, 20)
(15998, 1000)
(15998, 20)
3999
Preparing embedding matrix.


### embeddings_index  { "word1": "vector array of (1, 100)",...}       (400000)   (1,100)
### word_index        {"word" : int_num }                              (214873)    1-214873
### embedding_matrix  { int_num: "vector array of (1, 100)",...}       1-214873    (1,100)

In [8]:
nb_words = min(MAX_NB_WORDS, len(word_index))  # 20000  214873
#20000
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))  # 20001  100

#  embeddings_index  a dict              {"word1": "vector array of (1, 100)",...}       (400000)   (1,100)
#  word_index       a dict               {"word" : int_num }                              (214873)    1-214873
#  embedding_matrix  matrix   not a dict { int_num: "vector array of (1, 100)",...}       (20001, 100)
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    # get word 对应的 vector  from  embeddings_index (dict)
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)
#(20001, 100)


(20001, 100)


In [15]:
n = 0
for k,v in word_index.items():
    n+=1
    if n<5:
        print(k, v)
    else:
        break

the 1
edu 2
to 3
of 4


In [18]:
# 验证
num = word_index["edu"]
print(num)
print(embedding_matrix[num])
print(embeddings_index["edu"])
print(embedding_matrix[num]==embeddings_index["edu"])


2
[ 0.052258    0.1337     -0.13601001 -0.53100002 -0.33232999  0.61752999
  0.47343001 -0.29181001  0.55712003 -0.18652     0.50238001 -0.15098
 -0.68273002 -0.58508998  0.34746999  0.82177001  0.31852999  0.95371997
  0.24924999  0.38438001 -0.73689002  0.50779998 -0.20254     0.79536998
 -0.26524001 -0.25088    -0.54742998  0.17678    -0.016369   -0.58517998
  0.20823     0.7511     -0.6476      0.76545     0.35394999 -0.63657999
  0.097478    0.55644    -0.48076001  0.50494999  0.67242002  0.33906001
 -0.66140997  0.85762    -0.0035989   0.43921    -0.72952998  0.31665999
 -0.1517     -0.13259999  0.30517     0.18661    -0.97727001 -0.46492001
 -0.55678999  0.11558    -0.29574001  0.35183999 -0.46292999  0.47373
  0.52429998 -0.09943     0.34402001  0.46485999  0.23965999  0.92896998
  0.30223    -0.14928     0.47215     0.21781     0.035236    0.047968
  0.090056   -1.52830005 -0.08229    -0.41813999 -0.0087449   0.28643
  0.037936   -0.44747001  0.38986    -0.48626    -0.11746   

## 2.6 LSTM训练

In [20]:
# data的shape （19997,100）

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(nb_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False,
                            dropout=0.2)

print('Training model.')

# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)

preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
print(x_train.shape)

  # Remove the CWD from sys.path while we load stuff.


Training model.
(15998, 1000)


In [21]:
print(nb_words, MAX_SEQUENCE_LENGTH)
print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)
print(y_train[0])

20000 1000
(15998, 1000) (15998, 20) (3999, 1000) (3999, 20)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [24]:
embedding_layer = Embedding(nb_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False,
                            dropout=0.2)
batch_size = 32

print('Build model...')
# sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# embedded_sequences = embedding_layer()
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(100, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.add(Dense(len(labels_index), activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=5,
          validation_data=(x_val, y_val))
score, acc = model.evaluate(x_val, y_val,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

  


Build model...


  


Train...




Train on 15998 samples, validate on 3999 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 0.18813910955531982
Test accuracy: 0.949999988079071


In [26]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1000, 100)         2000100   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 101       
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
_________________________________________________________________
dense_6 (Dense)              (None, 20)                40        
Total params: 2,080,641
Trainable params: 80,541
Non-trainable params: 2,000,100
_________________________________________________________________


In [25]:
# happy learning!
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=2, batch_size=128)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 15998 samples, validate on 3999 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x25d716a0>