https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [None]:
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

# Root directory holding the GloVe vectors and the 20 Newsgroups corpus.
# The original hard-coded location remains the default; set the
# CLASSIFYPIPE_BASE_DIR environment variable to run the notebook on another
# machine without editing this cell.
BASE_DIR = os.environ.get(
    'CLASSIFYPIPE_BASE_DIR',
    '/home/matthew/Documents/classifypipe/src/python/notebooks/')

# os.path.join(..., '') keeps the trailing separator these constants have
# always carried, and also works when BASE_DIR is supplied without one.
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B', '')
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup', '')

MAX_SEQUENCE_LENGTH = 1000  # length every token sequence is padded/truncated to
MAX_NB_WORDS = 20000  # vocabulary cap handed to the Tokenizer as num_words
EMBEDDING_DIM = 100  # width of each embedding vector (glove.6B.100d file)
VALIDATION_SPLIT = 0.2  # fraction of the samples held out for validation

In [None]:
texts = []  # raw body text of every post, in load order
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # numeric label id of the post at the same position in `texts`

# Each sub-directory of TEXT_DATA_DIR is one class. Directories are visited in
# sorted order so label ids are assigned deterministically.
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if not os.path.isdir(path):
        continue

    label_id = len(labels_index)
    labels_index[name] = label_id

    for fname in sorted(os.listdir(path)):
        # Only purely numeric file names are read; anything else is skipped.
        if not fname.isdigit():
            continue

        fpath = os.path.join(path, fname)
        # The context manager closes the handle even if read() raises.
        with open(fpath, encoding='latin-1') as f:
            t = f.read()

        i = t.find('\n\n')  # first blank line, taken as the end of the header
        if i > 0:
            t = t[i:]
        texts.append(t)
        labels.append(label_id)

print('Found %s texts.' % len(texts))

In [None]:
# Create the tokenizer; num_words=MAX_NB_WORDS caps how many words it will use.

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

# Updates internal vocabulary based on a list of texts.
# Required before using `texts_to_sequences` or `texts_to_matrix`.

tokenizer.fit_on_texts(texts)

# Transforms each text into a sequence of integers.
# Only words the tokenizer knows (learned by tokenizer.fit_on_texts()) are used,
# and of those only the top 'num_words' set on the tokenizer.

sequences = tokenizer.texts_to_sequences(texts)

In [None]:
# Word -> integer index mapping learned by the tokenizer. Later cells use the
# index to pick the embedding-matrix row that receives each word's vector.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries: shorter ones
# are filled with zeros (padding starts on the left) and longer ones are
# truncated to that length.

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# The result is one fixed-length row per text:

print(data.shape)

# Share of all entries that are zeros, as a rounded percentage.
print('Data now contains ', round((data == 0).sum() / data.size * 100), '% zeros.')

In [None]:
# One-hot encode the integer label ids.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape, '(classes)')

# Split the data into a training set and a validation set.
# Samples and labels are shuffled with one shared permutation so that each
# row of `data` stays aligned with its row of `labels`.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Split at an explicit non-negative index. Slicing with
# -nb_validation_samples goes wrong when that count is 0: data[:-0] is empty
# and data[-0:] is everything, which would leave the training set empty and
# put every sample in the validation set.
split_at = data.shape[0] - nb_validation_samples

x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

In [None]:
# Read in the pre-trained GloVe embedding (400000 words, 100 dimensions; see
# the link in the top chunk) as a dict mapping each word to its vector.

embeddings_index = {}
# The encoding is named explicitly so decoding does not depend on the
# platform default; the context manager closes the file even if parsing fails.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # tolerate blank lines instead of failing on values[0]
        # First field is the word, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# embeddings_index now maps word -> float32 array of coefficients.

In [None]:
# Build the embedding matrix: row i receives the pre-trained vector of the
# word whose tokenizer index is i. The matrix has len(word_index) + 1 rows and
# EMBEDDING_DIM columns and starts as all zeros, so a word that is missing
# from embeddings_index simply keeps an all-zero row.

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue  # no pre-trained vector: leave the row as zeros
    embedding_matrix[i] = embedding_vector

In [None]:
from keras.layers import Embedding
from keras.callbacks import TensorBoard
from keras.callbacks import EarlyStopping

# TensorBoard callback logging to ./tf_logs. Histogram, gradient, image and
# embedding options are all switched off (0 / False / None); only the graph
# is written (write_graph=True).
tb = TensorBoard(
    log_dir='./tf_logs', histogram_freq=0, batch_size=32, 
    write_graph=True, write_grads=False, write_images=False, 
    embeddings_freq=0, embeddings_layer_names=None, 
    embeddings_metadata=None)

# Early-stopping callback monitoring the validation loss.
# NOTE(review): with min_delta=0 and patience=0 this presumably stops training
# at the first epoch in which val_loss does not improve -- confirm against the
# Keras EarlyStopping documentation.
es = EarlyStopping(
    monitor='val_loss', min_delta=0, patience=0, 
    verbose=0, mode='auto')

# Create keras embedding layer from pre-trained embedding: its weights are
# taken from embedding_matrix and trainable=False is passed.

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

# Set input size for embedding layer: MAX_SEQUENCE_LENGTH int32 values
# per sample.

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
                            

In [None]:
# Imported here because the notebook's import cell does not bring it in.
from keras.layers import GlobalMaxPooling1D

# Look up the embedding vector of every token in the input sequence.
embedded_sequences = embedding_layer(sequence_input)

# Three Conv1D stages (128 filters, kernel size 5, ReLU). The first two are
# each followed by max pooling with pool size 5.
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
# Global max pooling over whatever sequence length remains. A fixed pool size
# of 35 only spans the whole axis when MAX_SEQUENCE_LENGTH == 1000
# (1000 -> 996 -> 199 -> 195 -> 39 -> 35); GlobalMaxPooling1D stays correct
# if that constant changes, and its output needs no Flatten.
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
# One softmax output per label found while loading the corpus.
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

# happy learning!
# Train for up to 3 epochs, validating on the held-out split; the TensorBoard
# and early-stopping callbacks are attached.
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=3, batch_size=128, callbacks=[tb, es])

In [None]:
# Save the model to convnet.h5 (a relative path, so it is resolved against
# the current working directory).
model.save('convnet.h5')
