In [1]:
import numpy as np
import keras

Using TensorFlow backend.


In [2]:
def load_data():
    from keras.preprocessing import sequence
    from keras.datasets import imdb
    (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = 5000)
    X_train = sequence.pad_sequences(X_train, maxlen = 500, padding = "post", truncating = "post")
    X_test = sequence.pad_sequences(X_test, maxlen = 500, padding = "post", truncating = "post")
    
    vocabulary = imdb.get_word_index()
    vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
    vocabulary_inv[0] = '<PAD/>'
    return X_train, y_train, X_test, y_test, vocabulary_inv

In [3]:
X_train, y_train, X_test, y_test, vocabulary_inv = load_data()

In [4]:
X_train.shape, y_train.shape

((25000, 500), (25000,))

In [19]:
def train_word2vec(sentence_matrix, vocabulary_inv, num_features = 300, min_word_count = 1, context = 10):
    from gensim.models import word2vec
    num_workers = 2
    downsampling = 1e-3
    
    print("Training Word2Vec model...")
    sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix]
    embedding_model = word2vec.Word2Vec(sentences, workers = num_workers, size = num_features, 
                                        min_count = min_word_count, window = context, sample = downsampling)
    embedding_model.init_sims(replace = True)
    
    num_words = len(np.unique(sentence_matrix))
    embedding_weights = []
    print("num_words : {}".format(num_words))
    for i in range(num_words):
        word = vocabulary_inv[i]
        if word in embedding_model:
            embedding_weights.append(embedding_model[word])
        else:
            print("word : {}".format(word))
            embedding_weights.append(np.random.uniform(-0.25, 0.25, embedding_model.vector_size))
    print("embedding_weights len : {}".format(len(embedding_weights)))
    #return np.vstack(embedding_weights)
    return embedding_weights

In [20]:
embedding_weights = train_word2vec(np.vstack((X_train, X_test)), vocabulary_inv, num_features = 50, min_word_count = 1, context = 10)

Training Word2Vec model...
num_words : 4999
word : a
embedding_weights len : 4999




In [18]:
embedding_weights.shape

(4999, 50)

In [12]:
a = np.array([1, 2])

In [13]:
b = np.array([3, 4])

In [14]:
np.vstack([a, b])

array([[1, 2],
       [3, 4]])