In [1]:
from keras import models,layers
import numpy as np
import matplotlib.pyplot as plt

In [2]:
vocab_size = 10000
oov_token = '<UNK>'
max_sentence_length = 120
embedding_dim = 16

In [3]:
from keras.datasets import imdb

(train_data,train_labels),(test_data,test_labels) = imdb.load_data(num_words=vocab_size,oov_char=oov_token)

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [5]:
word_index = imdb.get_word_index()
sentences = []
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
reverse_word_index = dict([(value,key) for key,value in word_index.items()])
for i in range(train_data.shape[0]):
    sentences.append([reverse_word_index.get(j,'?') for j in train_data[i]])

In [6]:
tokenizer = Tokenizer(num_words=vocab_size,oov_token=oov_token)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences,maxlen=max_sentence_length)

In [7]:
print(padded)

[[5232   15  480 ...   18  178   32]
 [   4   89   28 ...   15  145   95]
 [ 112    3 2390 ...    6  129  113]
 ...
 [  97    5   19 ...    3 3564    2]
 [  22    2    4 ...   11    8   22]
 [  50  305   11 ...  204  131    8]]


In [8]:
test_sentences = []
for i in range(test_data.shape[0]):
    test_sentences.append([reverse_word_index.get(j,'?') for j in test_data[i]])

In [9]:
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences,maxlen=max_sentence_length)

In [10]:
padded[0]

array([5232,   15,  480,   66, 3768,   33,    3,  130,   11,   15,   38,
        617,    4,   24,  124,   51,   36,  135,   47,   24, 1411,   33,
          5,   21,   11,  215,   27,   77,   52,    4,   13,  407,   15,
         82,    2,    7,    3,  107,  117, 5927,   14,  256,    3,    2,
          6, 3737,    4,  723,   36,   71,   43,  529,  476,   25,  399,
        317,   46,    6,    3,    2, 1028,   12,  104,   88,    3,  381,
         14,  297,   98,   32, 2067,   56,   25,  141,    5,  194, 7479,
         17,    3,  226,   21,   20,  134,  476,   25,  480,    4,  144,
         29, 5517,   17,   51,   36,   27,  224,   92,   24,  104,    3,
        226,   65,   15,   38, 1334,   88,   11,   15,  283,    4,   15,
       4453,  113,  103,   32,   14,   15, 5343,   18,  178,   32],
      dtype=int32)

In [17]:
network = models.Sequential()
network.add(layers.Embedding(vocab_size,embedding_dim,input_length=max_sentence_length))
network.add(layers.GlobalAveragePooling1D())
network.add(layers.Dense(6,activation='relu'))
network.add(layers.Dense(1,activation='sigmoid'))

In [18]:
network.compile(optimizer='adam',loss='binary_crossentropy',metrics=['acc'])
network.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 102       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 7         
Total params: 160,109
Trainable params: 160,109
Non-trainable params: 0
_________________________________________________________________


In [19]:
history = network.fit(padded,train_labels,epochs=3,validation_data=(test_padded,test_labels))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [20]:
embed_layer = network.layers[0]
weights = embed_layer.get_weights()[0]

In [21]:
import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [16]:
sentence = "I really think this is amazing. honest."
sequence = tokenizer.texts_to_sequences([sentence])
print(sequence)

[[9, 63, 101, 10, 5, 477, 1200]]
