In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
print('tensorflow version ',tf.__version__)

tensorflow version  2.2.0


In [2]:
import numpy as np

In [3]:
imdb,info = tfds.load(name='imdb_reviews',as_supervised=True,with_info=True)

In [4]:
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word

In [5]:
imdb

{'test': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'train': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'unsupervised': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>}

In [6]:
train_data, test_data = imdb['train'],imdb['test']

train_sentences = []
train_labels = []

test_sentences=[]
test_labels=[]
# str(s.tonumpy()) is needed in Python3 instead of just s.numpy()
for sentence,label in train_data:
    train_sentences.append(str(sentence.numpy()))
    train_labels.append(label.numpy())

for sentence,label in test_data:
    test_sentences.append(str(sentence.numpy()))
    test_labels.append(label.numpy())

train_labels_final = np.array(train_labels)
test_labels_final = np.array(test_labels)

In [7]:
# import numpy as np

# train_data, test_data = imdb['train'], imdb['test']

# training_sentences = []
# training_labels = []

# testing_sentences = []
# testing_labels = []

# # str(s.tonumpy()) is needed in Python3 instead of just s.numpy()
# for s,l in train_data:
#     training_sentences.append(str(s.numpy()))
#     training_labels.append(l.numpy())

# for s,l in test_data:
#     testing_sentences.append(str(s.numpy()))
#     testing_labels.append(l.numpy())

  
# training_labels_final = np.array(training_labels)
# testing_labels_final = np.array(testing_labels)

In [8]:
train_labels_final

array([0, 0, 0, ..., 0, 0, 1])

In [9]:
train_sentences[0]

'b"This was an absolutely terrible movie. Don\'t be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie\'s ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor\'s like Christopher Walken\'s good name. I could barely sit through it."'

In [10]:
set(train_labels)

{0, 1}

In [12]:
vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type='post'
oov_tok = "<OOV>"

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(train_sentences)
padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(test_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length,truncating=trunc_type)

In [13]:
reverse_word_index=dict([(v,k) for (k,v) in word_index.items()])

def decode_review(text):
    return " ".join([reverse_word_index.get(i,'?') for i in text])

## Define Model

In [14]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size,output_dim=embedding_dim,input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=6,activation='relu'),
    tf.keras.layers.Dense(units=1,activation='sigmoid')
])
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [15]:
num_epochs = 10
model.fit(padded, train_labels_final, epochs=num_epochs, validation_data=(testing_padded, test_labels_final))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x142fa95f8>

In [16]:
test_labels_final

array([1, 1, 0, ..., 0, 1, 1])

In [17]:
embedding_layer = model.layers[0]

In [18]:
weights = embedding_layer.get_weights()[0]
weights.shape # vocab_size, embedding_dim

(10000, 16)

In [20]:
import io

vectors = io.open('vecs.tsv','w',encoding='utf-8')
meta = io.open('meta.tsv','w',encoding='utf-8')

for word_num in range(1,vocab_size):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    meta.write(word+"\n")
    vectors.write("\t".join([str(x) for x in embeddings])+"\n")
vectors.close()
meta.close()