In [2]:
import os
os.environ[ "TF_CPP_MIN_LOG_LEVEL"] = "3"
import io
import numpy as np
import tensorflow as tf
tf.compat.v1.logging.set_verbosity( tf.compat.v1.logging.ERROR)
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Load the IMDB reviews dataset.
#   with_info=True     -> also return the DatasetInfo object (bound to `info`)
#   as_supervised=True -> each example is a (text, label) pair instead of a dict
imdb, info = tfds.load(
	name="imdb_reviews",
	with_info=True,
	as_supervised=True,
)

In [4]:
# Show the dataset metadata (description, features, splits) returned by tfds.load.
print( info)

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset. This is a dataset for binary sentiment
    classification containing substantially more data than previous benchmark
    datasets. We provide a set of 25,000 highly polar movie reviews for training,
    and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_dir='C:\\Users\\amrka\\tensorflow_datasets\\imdb_reviews\\plain_text\\1.0.0',
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
        'text': Text(shape=(), dtype=string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_

In [5]:
# Show the mapping of split name -> tf.data dataset, including each split's element spec.
print( imdb)

{'train': <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>, 'test': <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>, 'unsupervised': <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>}


In [6]:
def _collect_examples(dataset):
	"""Materialize a (text, label) dataset into two parallel Python lists.

	Each text tensor is converted to bytes and decoded as UTF-8; each label
	tensor is converted to its NumPy scalar value.
	"""
	texts = []
	labels = []
	for text_tensor, label_tensor in dataset:
		texts.append(text_tensor.numpy().decode("utf8"))
		labels.append(label_tensor.numpy())
	return texts, labels


train_data = imdb["train"]
test_data = imdb["test"]

training_sentences, training_labels = _collect_examples(train_data)
testing_sentences, testing_labels = _collect_examples(test_data)

# Keras expects array-like labels, so convert the label lists to NumPy arrays.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [7]:
# Text-preprocessing and model hyperparameters shared by the cells below.
vocab_size = 10_000     # passed to the tokenizer as num_words and to Embedding as input size
max_length = 120        # every review is padded/truncated to this many tokens
embedding_dim = 16      # size of each learned word vector
trunc_type = "post"     # truncate over-long reviews at the end
oov_tok = "<OOV>"       # placeholder token for out-of-vocabulary words

In [8]:
# Build the vocabulary from the training reviews only, so the test split
# does not influence which words are known to the tokenizer.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index


def _encode_and_pad(texts):
	"""Turn raw texts into integer sequences plus a fixed-length padded array."""
	token_ids = tokenizer.texts_to_sequences(texts)
	fixed_length = pad_sequences(token_ids, maxlen=max_length, truncating=trunc_type)
	return token_ids, fixed_length


sequences, padded = _encode_and_pad(training_sentences)
testing_sequences, testing_padded = _encode_and_pad(testing_sentences)

In [9]:
# Layer stack: token ids -> learned embeddings -> flattened vector -> small
# dense head with a single sigmoid output for binary sentiment.
# (GlobalAveragePooling1D is a possible alternative to Flatten; it is not used here.)
layer_stack = [
	tf.keras.layers.Embedding(
		input_dim=vocab_size,
		output_dim=embedding_dim,
		input_length=max_length,
	),
	tf.keras.layers.Flatten(),
	tf.keras.layers.Dense(units=6, activation="relu"),
	tf.keras.layers.Dense(units=1, activation="sigmoid"),
]
model = tf.keras.Sequential(layer_stack)

model.compile(
	optimizer="adam",
	loss="binary_crossentropy",
	metrics=["accuracy"],
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 16)           160000    
                                                                 
 flatten (Flatten)           (None, 1920)              0         
                                                                 
 dense (Dense)               (None, 6)                 11526     
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 171533 (670.05 KB)
Trainable params: 171533 (670.05 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
num_epochs = 5

# The padded test split is used as validation data during training.
validation_pair = (testing_padded, testing_labels_final)

model.fit(
	x=padded,
	y=training_labels_final,
	epochs=num_epochs,
	validation_data=validation_pair,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x2b74fffd610>

In [11]:
# The Embedding layer is the first layer of the model; its first weight array
# is the embedding matrix (one row per vocabulary index).
embedding_layer = model.layers[0]
weight_arrays = embedding_layer.get_weights()
embedding_weights = weight_arrays[0]

print(embedding_weights.shape)

(10000, 16)


In [12]:
# Index -> word lookup from the fitted tokenizer; used below to label each embedding row.
reversed_word_index = tokenizer.index_word

In [13]:
# Export the learned embeddings as two parallel TSV files:
#   vecs.tsv - one line per word, the embedding components separated by tabs
#   meta.tsv - one line per word, the word itself (same line order as vecs.tsv)
#
# The files are opened in a `with` block so both handles are flushed and closed
# even if the loop raises (e.g. a KeyError for an index missing from the
# tokenizer), instead of leaking open handles with partially written output.
with io.open("vecs.tsv", "w", encoding="utf-8") as out_v, \
		io.open("meta.tsv", "w", encoding="utf-8") as out_m:
	# Start at 1: the tokenizer's index -> word mapping has no entry for 0
	# (Keras reserves index 0 for padding).
	for word_num in range(1, vocab_size):
		word_name = reversed_word_index[word_num]
		word_embedding = embedding_weights[word_num]

		out_m.write(word_name + "\n")
		out_v.write("\t".join(str(x) for x in word_embedding) + "\n")