In [2]:
import numpy as np
import keras

Using TensorFlow backend.


In [3]:
def load_data():
    from keras.preprocessing import sequence
    from keras.datasets import imdb
    (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = 5000)
    X_train = sequence.pad_sequences(X_train, maxlen = 500, padding = "post", truncating = "post")
    X_test = sequence.pad_sequences(X_test, maxlen = 500, padding = "post", truncating = "post")
    
    vocabulary = imdb.get_word_index()
    vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
    vocabulary_inv[0] = '<PAD/>'
    return X_train, y_train, X_test, y_test, vocabulary_inv

In [4]:
X_train, y_train, X_test, y_test, vocabulary_inv = load_data()

In [5]:
X_train.shape, y_train.shape

((25000, 500), (25000,))

In [6]:
def train_word2vec(sentence_matrix, vocabulary_inv, num_features = 300, min_word_count = 1, context = 10):
    from gensim.models import word2vec
    num_workers = 2
    downsampling = 1e-3
    
    print("Training Word2Vec model...")
    sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix]
    embedding_model = word2vec.Word2Vec(sentences, workers = num_workers, size = num_features, 
                                        min_count = min_word_count, window = context, sample = downsampling)
    embedding_model.init_sims(replace = True)
    
    num_words = sentence_matrix.max()
    embedding_weights = []
    print("num_words : {}".format(num_words))
    for i in range(num_words + 1):
        word = vocabulary_inv[i]
        if word in embedding_model:
            embedding_weights.append(embedding_model[word])
        else:
            print("word : {}".format(word))
            embedding_weights.append(np.random.uniform(-0.25, 0.25, embedding_model.vector_size))
    print("embedding_weights len : {}".format(len(embedding_weights)))
    return np.vstack(embedding_weights)
    #return embedding_weights

In [7]:
embedding_weights = train_word2vec(np.vstack((X_train, X_test)), vocabulary_inv, num_features = 50, min_word_count = 1, context = 10)

Training Word2Vec model...
num_words : 4999
word : a
embedding_weights len : 5000




In [9]:
embedding_weights.shape

(5000, 50)

In [11]:
X_train.shape

(25000, 500)

In [None]:
def build_model(embedding_weights):
    from keras.models import Model
    from keras.layers import Input, Embedding, Dropout, Convolution1D, MaxPooling1D, Flatten, Concatenate
    model_input = Input(shape = (500, 50))
    layer = Embedding(5000, 50, input_length = 500, name = 'embedding')(model_input)
    layer = Dropout(0.5)(layer)
    conv_blocks = []
    for sz in (3, 8):
        conv = Convolution1D(filters = 10,
                            kernel_size = sz,
                            padding = "valid",
                            activation = "relu",
                            strides = 1)(layer)
        conv = MaxPooling1D(pool_size = 2)(conv)
        conv = Flatten()(conv)
        conv_blocks.append(conv)
    
    layer = Concatenate()(conv_blocks)
    layer = Dropout(0.8)(layer)
    layer = Dense(50, activation = 'relu')(layer)
    model_output = Dense(1, activation = 'sigmoid')(layer)
    
    model = Model(model_input, model_output)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    
    embedding_layer = model.get_layer("embedding")
    embedding_layer.set_weights([embedding_weights])
    return model