Reference for the pre-trained word-embedding approach used in this notebook: https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import sqlalchemy as sa
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from datetime import datetime


# --- Notebook configuration -------------------------------------------------
# Root folder holding the GloVe vectors and the text data. The default is the
# original author's local checkout; set the CLASSIFYPIPE_BASE_DIR environment
# variable to run the notebook from any other machine without editing code.
BASE_DIR = os.getenv(
    'CLASSIFYPIPE_BASE_DIR',
    '/Users/matthewupson/Documents/classifypipe/src/python/notebooks/')
# The trailing '' makes os.path.join keep a trailing separator, so these stay
# usable as string prefixes whether or not BASE_DIR itself ends with one.
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B', '')
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup', '')
MAX_SEQUENCE_LENGTH = 500  # token length every comment is padded/truncated to
MAX_NB_WORDS = 20000       # vocabulary cap handed to the Tokenizer
EMBEDDING_DIM = 50         # width of each word vector / embedding matrix row
VALIDATION_SPLIT = 0.2     # fraction of rows held out for validation
# Database connection URL; passed as `con` to pandas when the data is loaded.
# os.getenv returns None when DATABASE_URL is not set.
ENGINE = os.getenv('DATABASE_URL')

In [None]:
# Fetch each respondent's three free-text answers merged into a single
# `comment_combined` column, together with the vote recorded in `priority`
# (rows with coders only) and the matching entry from `codes`.
query = (
    "select raw.respondent_id, concat_ws(', ', comment_why_you_came,"
    "comment_where_for_help, comment_further_comments) as comment_combined,"
    "vote, code from raw left join (select respondent_id,"
    "vote from priority where coders is not null) p on"
    "(raw.respondent_id = p.respondent_id) "
    "left join (select code_id, code from codes) c on"
    "(p.vote = c.code_id)"
)

# Keep one row per respondent and discard respondents without a vote.
df = (
    pd.read_sql_query(query, con=ENGINE)
    .drop_duplicates(subset='respondent_id')
    .dropna(subset=['vote'])
)

# Restrict to the respondents listed in the saved training-set index file.
training_index = pd.read_csv('../../data/2017-06-24_training_set_indexes.csv')
df = df.loc[df['respondent_id'].isin(training_index['respondent_id'])]

# Sanity check, expected to be a no-op: no combined comment may be null.
assert not df['comment_combined'].isnull().any()

Implement one-versus-all classification: vote code 12 ('ok') is the positive class (1) and every other code is the negative class (0).

In [None]:
# Build the binary one-vs-all target: 1 where the vote equals POSITIVE_CODE,
# 0 for every other code.
#
# `Series.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0, so the
# values are read through `.values`, which exists in every pandas release. The
# comparison is vectorised instead of looping over the array in Python.
POSITIVE_CODE = 12
ova = np.where(df['vote'].values == POSITIVE_CODE, 1, 0)

# Quick look at the target: the array, number of positives, number of rows.
print(ova)
print(ova.sum())
print(len(ova))

# Class balance (last expression, so the notebook displays it).
pd.Series(ova).value_counts()

In [None]:
# Model inputs: the binary targets built above and the raw comment strings,
# both in the row order of `df`.
labels = ova
texts = df.loc[:, 'comment_combined'].tolist()

# Names for some vote codes, kept for reference; no later cell shown in this
# notebook reads this mapping.
labels_index = {
    '0': 'other',
    '4': 'service-problem',
    '12': 'ok',
}

print('Found %s texts.' % len(texts))

In [None]:
# Create the tokenizer, capping the vocabulary it will emit at MAX_NB_WORDS.

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

# Build the tokenizer's internal vocabulary from the comment texts.
# This must run before `texts_to_sequences` or `texts_to_matrix` is used.

tokenizer.fit_on_texts(texts)

# Turn each text into a sequence of integer word ids.
# Only words the tokenizer has seen in `fit_on_texts` are used, and of those
# only the top `num_words` configured on the tokenizer.

sequences = tokenizer.texts_to_sequences(texts)

In [None]:
# Word -> integer id lookup held by the fitted tokenizer. Its (word, id) items
# are used further down to fill the embedding matrix, one row per id.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Bring every sequence to exactly MAX_SEQUENCE_LENGTH ids: shorter ones are
# filled with zeros (padding starts on the left), longer ones are truncated.

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Resulting array shape, followed by the share of cells that are padding or
# otherwise zero.

print(data.shape)
print(
    'Data now contains ',
    round((data == 0).sum() / data.size * 100),
    '% zeros.',
)

In [None]:
# One-hot encode the 0/1 targets so they match the two-unit softmax output and
# the categorical_crossentropy loss used by the model further down.
# NOTE: `labels` is reassigned from itself here, so running this cell twice
# encodes an already-encoded array. Re-run the cell that sets `labels = ova`
# before re-running this one.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape, '(classes)')


In [None]:
# Split the data into a training set and a validation set.
#
# The shuffle draws from its own seeded generator so that restarting the
# kernel and running all cells reproduces the same split.
SPLIT_SEED = 42
indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice on an explicit boundary rather than with negative offsets: when
# nb_validation_samples is 0, `data[:-0]` is empty and `data[-0:]` is the whole
# array, which would silently swap the roles of the two sets.
nb_train_samples = data.shape[0] - nb_validation_samples
x_train, x_val = data[:nb_train_samples], data[nb_train_samples:]
y_train, y_val = labels[:nb_train_samples], labels[nb_train_samples:]

# Column 0 of the one-hot labels marks the negative ("everything else") class.
# Print its count and the set size for training, then for validation.
print(y_train[:, 0].sum())
print(len(y_train))
print(y_val[:, 0].sum())
print(len(y_val))

# Share of negative-class rows in each set; the two should be similar.
print(y_train[:, 0].sum() / len(y_train))
print(y_val[:, 0].sum() / len(y_val))

In [None]:
# Read the pre-trained GloVe embedding (see the link in the top cell) into a
# dict that maps each word to its coefficient vector (a float32 numpy array).
#
# The file name is derived from EMBEDDING_DIM so the vectors always match the
# width of the embedding matrix built from them. The file is opened as UTF-8 in
# a `with` block so the result does not depend on the platform's default
# encoding and the handle is closed even if a line fails to parse.

glove_path = os.path.join(GLOVE_DIR, 'glove.6B.%dd.txt' % EMBEDDING_DIM)
embeddings_index = {}
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Build the weight matrix for the embedding layer: row `row` holds the
# pre-trained vector of the word whose tokenizer id is `row`. Rows for words
# that have no pre-trained vector keep their initial value of all zeros.
# The matrix has one more row than there are words, so id 0 also gets a row.

vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for token, row in word_index.items():
    if token in embeddings_index:
        embedding_matrix[row] = embeddings_index[token]

In [None]:
from keras.layers import Embedding
from keras.callbacks import TensorBoard
from keras.callbacks import EarlyStopping

# Name of this training run; used as the TensorBoard log directory and, later,
# as the stem of the saved model file. The timestamp is formatted explicitly
# because str(datetime.now()) contains a space and colons, which are invalid in
# Windows file names and need quoting in a shell.
run_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
logname = 'tf_logs/govuk_ova_ok_' + run_stamp

# TensorBoard callback: writes the graph only (no histograms, gradients,
# images or embedding projections).
tb = TensorBoard(
    log_dir=logname, histogram_freq=0, batch_size=32,
    write_graph=True, write_grads=False, write_images=False,
    embeddings_freq=0, embeddings_layer_names=None,
    embeddings_metadata=None)

# Early-stopping callback: configured to watch val_loss with a patience of 3.
# NOTE(review): `es` is created here but the fit call in this notebook passes
# only `tb` as a callback; confirm whether that is intentional.
es = EarlyStopping(
    monitor='val_loss', min_delta=0, patience=3,
    verbose=1, mode='auto')

# Create the Keras embedding layer from the pre-trained embedding matrix.
# trainable=False keeps the pre-trained vectors fixed during training.

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

# Model input: one row of MAX_SEQUENCE_LENGTH integer word ids per comment.

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
                            

In [None]:
len(word_index)

In [None]:
# Look up the pre-trained vector for every word id in the input.
embedded_sequences = embedding_layer(sequence_input)

# Three identical convolution stages (128 filters, kernel size 5), each
# followed by max pooling. The last pool of 10 is meant to act as global max
# pooling over what remains of the sequence.
x = embedded_sequences
for pool_size in (5, 5, 10):
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(pool_size)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)

# Output layer: one unit per class, so exactly 2 for one-vs-all.
preds = Dense(2, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(
    optimizer='adamax',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

# Train, reporting validation metrics each epoch and logging to TensorBoard.
model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    epochs=20,
    batch_size=128,
    callbacks=[tb],
)

In [None]:
# Score the trained model. The model was compiled with metrics=['acc'], so
# evaluate() yields the loss followed by the accuracy.
# NOTE: despite the `test_` names, this uses the validation split (x_val,
# y_val), which was also passed to fit() as validation data.
test_loss, test_acc = model.evaluate(x_val, y_val)

print('test_acc:', test_acc)

In [None]:
# Class probabilities for the validation rows, one column per softmax unit.
# (The original called `model.predi`, which does not exist and raised
# AttributeError; the method is `predict`.)
test_pred = model.predict(x_val)


In [None]:
from sklearn.metrics import classification_report

# classification_report needs one class label per row, but `y_val` is one-hot
# and `test_pred` holds one probability per class. Reduce both to the index of
# their largest column (0 = other, 1 = positive class) before comparing them.
print(classification_report(np.argmax(y_val, axis=1),
                            np.argmax(test_pred, axis=1)))


In [None]:
# Persist the trained model as HDF5, reusing the run's TensorBoard log name as
# the file stem so the saved model and its logs sit side by side.
model.save(logname + '.h5')