In [1]:
import numpy as np
import keras

Using TensorFlow backend.


In [2]:
def load_data():
    from keras.preprocessing import sequence
    from keras.datasets import imdb
    (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = 5000)
    X_train = sequence.pad_sequences(X_train, maxlen = 500, padding = "post", truncating = "post")
    X_test = sequence.pad_sequences(X_test, maxlen = 500, padding = "post", truncating = "post")
    
    vocabulary = imdb.get_word_index()
    vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
    vocabulary_inv[0] = '<PAD/>'
    return X_train, y_train, X_test, y_test, vocabulary_inv

In [3]:
X_train, y_train, X_test, y_test, vocabulary_inv = load_data()

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


In [4]:
X_train.shape, y_train.shape

((25000, 500), (25000,))

In [6]:
def train_word2vec(sentence_matrix, vocabulary_inv, num_features = 300, min_word_count = 1, context = 10):
    from gensim.models import word2vec
    num_workers = 2
    downsampling = 1e-3
    
    print("Training Word2Vec model...")
    sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix]
    embedding_model = word2vec.Word2Vec(sentences, workers = num_workers, size = num_features, 
                                        min_count = min_word_count, window = context, sample = downsampling)
    embedding_model.init_sims(replace = True)
    
    num_words = sentence_matrix.max()
    embedding_weights = []
    print("num_words : {}".format(num_words))
    for i in range(num_words + 1):
        word = vocabulary_inv[i]
        if word in embedding_model:
            embedding_weights.append(embedding_model[word])
        else:
            print("word : {}".format(word))
            embedding_weights.append(np.random.uniform(-0.25, 0.25, embedding_model.vector_size))
    print("embedding_weights len : {}".format(len(embedding_weights)))
    return np.vstack(embedding_weights)
    #return embedding_weights

In [7]:
embedding_weights = train_word2vec(np.vstack((X_train, X_test)), vocabulary_inv, num_features = 50, min_word_count = 1, context = 10)

Training Word2Vec model...
num_words : 4999
word : a
embedding_weights len : 5000




In [8]:
embedding_weights.shape

(5000, 50)

In [9]:
X_train.shape

(25000, 500)

In [17]:
def build_model(embedding_weights):
    from keras.models import Model
    from keras.layers import Input, Embedding, Dropout, Convolution1D, MaxPooling1D, Flatten, Concatenate, Dense
    model_input = Input(shape = (500, ))
    layer = Embedding(5000, 50, input_length = 500, name = 'embedding')(model_input)
    layer = Dropout(0.5)(layer)
    conv_blocks = []
    for sz in (3, 8):
        conv = Convolution1D(filters = 10,
                            kernel_size = sz,
                            padding = "valid",
                            activation = "relu",
                            strides = 1)(layer)
        conv = MaxPooling1D(pool_size = 2)(conv)
        conv = Flatten()(conv)
        conv_blocks.append(conv)
    
    layer = Concatenate()(conv_blocks)
    layer = Dropout(0.2)(layer)
    layer = Dense(50, activation = 'relu')(layer)
    model_output = Dense(1, activation = 'sigmoid')(layer)
    
    model = Model(model_input, model_output)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    
    embedding_layer = model.get_layer("embedding")
    embedding_layer.set_weights([embedding_weights])
    return model

In [18]:
model = build_model(embedding_weights)

In [19]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 500, 50)      250000      input_4[0][0]                    
__________________________________________________________________________________________________
dropout_5 (Dropout)             (None, 500, 50)      0           embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 498, 10)      1510        dropout_5[0][0]                  
____________________________________________________________________________________________

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size = 0.3)

In [None]:
from keras import backend as K
def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * pred, 0, 1)))
    posible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def calc_f1_score(y_, true):
    precision = precision(y_true, y_pred)

In [None]:
hist = model.fit(X_train, y_train, epochs = 20, batch_size = 32, validation_data = (X_dev, y_dev))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 17500 samples, validate on 7500 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20