In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
print('tensorflow version ',tf.__version__)

tensorflow version  2.2.0


In [31]:
import numpy as np

In [2]:
imdb,info = tfds.load(name='imdb_reviews',as_supervised=True,with_info=True)

In [3]:
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word

In [4]:
imdb

{'test': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'train': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'unsupervised': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>}

In [5]:
train_data, test_data = imdb['train'],imdb['test']

In [27]:
train_sentences = []
train_labels = []

test_sentences=[]
test_labels=[]
# str(s.tonumpy()) is needed in Python3 instead of just s.numpy()
for sentence,label in train_data:
    train_sentences.append(str(sentence.numpy()))
    train_labels.append(label.numpy())

for sentence,label in test_data:
    test_sentences.append(str(sentence.numpy()))
    test_labels.append(label.numpy())

In [35]:
train_labels_final = np.array(train_labels)
test_labels_final = np.array(test_labels)

In [36]:
train_labels_final

array([0, 0, 0, ..., 0, 0, 1])

In [37]:
train_sentences[0]

'b"This was an absolutely terrible movie. Don\'t be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie\'s ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor\'s like Christopher Walken\'s good name. I could barely sit through it."'

In [29]:
set(train_labels)

{0, 1}

In [38]:
vocab_size=10000
embedding_dim = 16
max_length=120
trunc_type='post'
oov_token = '<OOV>'

In [39]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [40]:
tokenizer = Tokenizer(num_words=vocab_size,oov_token=oov_token)

In [41]:
tokenizer.fit_on_texts(train_sentences)

In [42]:
word_index = tokenizer.word_index

In [43]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)

In [68]:
test_sequences = tokenizer.texts_to_sequences(test_sentences)

In [45]:
# train_sequences[0]

In [48]:
padded_train_sequences = pad_sequences(train_sequences,maxlen=max_length,truncating=trunc_type)

In [69]:
padded_test_sequences = pad_sequences(test_sequences,maxlen=max_length,truncating=trunc_type)

In [49]:
padded_train_sequences[0]

array([   0,    0,   59,   12,   14,   35,  439,  400,   18,  174,   29,
          1,    9,   33, 1378, 3401,   42,  496,    1,  197,   25,   88,
        156,   19,   12,  211,  340,   29,   70,  248,  213,    9,  486,
         62,   70,   88,  116,   99,   24, 5740,   12, 3317,  657,  777,
         12,   18,    7,   35,  406, 8228,  178, 2477,  426,    2,   92,
       1253,  140,   72,  149,   55,    2,    1, 7525,   72,  229,   70,
       2962,   16,    1, 2880,    1,    1, 1506, 4998,    3,   40, 3947,
        119, 1608,   17, 3401,   14,  163,   19,    4, 1253,  927, 7986,
          9,    4,   18,   13,   14, 4200,    5,  102,  148, 1237,   11,
        240,  692,   13,   44,   25,  101,   39,   12, 7232,    1,   39,
       1378,    1,   52,  409,   11,   99, 1214,  874,  145,   10],
      dtype=int32)

In [55]:
reverse_word_index=dict([(v,k) for (k,v) in word_index.items()])

In [60]:
def decode_review(text):
    return " ".join([reverse_word_index.get(i,'?') for i in text])

In [61]:
decode_review(padded_train_sequences[0])

"? ? b this was an absolutely terrible movie don't be <OOV> in by christopher walken or michael <OOV> both are great actors but this must simply be their worst role in history even their great acting could not redeem this movie's ridiculous storyline this movie is an early nineties us propaganda piece the most pathetic scenes were those when the <OOV> rebels were making their cases for <OOV> maria <OOV> <OOV> appeared phony and her pseudo love affair with walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning i am disappointed that there are movies like this ruining <OOV> like christopher <OOV> good name i could barely sit through it"

In [63]:
train_sentences[0]

'b"This was an absolutely terrible movie. Don\'t be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie\'s ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor\'s like Christopher Walken\'s good name. I could barely sit through it."'

## Define Model

In [86]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size,output_dim=embedding_dim,input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=20,activation='relu'),
    tf.keras.layers.Dense(units=1,activation='sigmoid')
])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
flatten_3 (Flatten)          (None, 1920)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 20)                38420     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 21        
Total params: 198,441
Trainable params: 198,441
Non-trainable params: 0
_________________________________________________________________


In [87]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [88]:
num_epochs = 20
model.fit(padded_train_sequences,train_labels_final,epochs=num_epochs,validation_data=[padded_test_sequences,test_labels_final])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x14dca9080>

In [73]:
embedding_layer = model.layers[0]

In [78]:
weights = embedding_layer.get_weights()[0]
weights.shape # vocab_size, embedding_dim

(10000, 16)

In [82]:
import io

In [85]:
vectors = io.open('vecs.tsv','w',encoding='utf-8')
meta = io.open('meta.tsv','w',encoding='utf-8')

for word_num in range(1,vocab_size):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    meta.write(word+"\n")
    vectors.write("\t".join([str(x) for x in embeddings])+"\n")
vectors.close()
meta.close()