In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import load_model
import tensorflow.keras.callbacks as callbacks
import tensorflow_datasets as tfds

import matplotlib.pyplot as plt
from tqdm import tqdm

import numpy as np
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from scipy.spatial.distance import cosine, euclidean

import os
import io
from pprint import pprint

import matplotlib.pyplot as plt

# Report whether TensorFlow detects a usable GPU.
# NOTE(review): tf.test.is_gpu_available() is marked deprecated in later
# TF 2.x releases in favor of tf.config.list_physical_devices('GPU') —
# confirm against the TensorFlow version this notebook targets.
print('GPU:', tf.test.is_gpu_available())

In [None]:
# See all registered datasets
# tfds.list_builders()

In [None]:
# Local directory where TensorFlow Datasets stores/reads the downloaded data.
root_dir = os.path.expanduser('~/workspace/ml-data')

# Bind the train and test splits by name: the batching cell consumes
# `train_data` and `test_data`.
# as_supervised=True makes each element a (text, label) tuple, which is the
# structure the two-element `padded_shapes` in the batching cell expects.
(train_data, test_data), info = tfds.load(
    'imdb_reviews/subwords8k',
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    data_dir=root_dir,
    with_info=True,
    as_supervised=True,
)

In [None]:
# Display the dataset metadata object returned by tfds.load(..., with_info=True).
info

In [None]:
# Each element is a (sequence, scalar) pair: pad the variable-length sequence
# to the longest one in its batch and leave the scalar untouched.
padded_shapes = ([None], ())

# Shuffle with a 1000-element buffer, then form padded batches of 10.
train_batches = (
    train_data
    .shuffle(1000)
    .padded_batch(10, padded_shapes=padded_shapes)
)
test_batches = (
    test_data
    .shuffle(1000)
    .padded_batch(10, padded_shapes=padded_shapes)
)

# Text encoder attached to the dataset's 'text' feature.
text_feature = info.features['text']
encoder = text_feature.encoder

In [None]:
# Size of each learned embedding vector.
embedding_dim = 16

# Standalone embedding layer; it is not added to the model below.
embedding_layer = layers.Embedding(1000, 5)

# Embedding -> average over the sequence axis -> single sigmoid output unit.
model = keras.Sequential()
model.add(layers.Embedding(encoder.vocab_size, embedding_dim))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

In [None]:
# Send training logs to ./tb so they can be inspected with TensorBoard.
tensorboard_callback = keras.callbacks.TensorBoard(log_dir='./tb')

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 10 epochs; validation draws 20 batches from the test set.
history = model.fit(
    train_batches,
    epochs=10,
    validation_data=test_batches,
    validation_steps=20,
    callbacks=[tensorboard_callback],
)

# Persist the trained model in HDF5 format.
model.save('./emb_model.hdf5')

In [None]:
# Per-epoch metrics recorded by model.fit().
history_dict = history.history

acc, val_acc = history_dict['accuracy'], history_dict['val_accuracy']
loss, val_loss = history_dict['loss'], history_dict['val_loss']

# 1-based epoch numbers for the x axis.
epochs = range(1, len(acc) + 1)

# Side-by-side panels: loss on the left, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(16, 6))

loss_ax.plot(epochs, loss, 'bo', label='Training loss')
loss_ax.plot(epochs, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(epochs, acc, 'bo', label='Training acc')
acc_ax.plot(epochs, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')
acc_ax.set_ylim((0.5, 1))

plt.show()

In [None]:
# Reload the model from the HDF5 file written by model.save() in the training cell.
model = load_model('./emb_model.hdf5')

In [None]:
# The embedding is the first layer of the model; the first array returned by
# get_weights() is the one exported below.
e = model.layers[0]
layer_weights = e.get_weights()
weights = layer_weights[0]
print(weights.shape)  # shape: (vocab_size, embedding_dim)

In [None]:
# Export the embedding matrix as two parallel TSV files: one vector per line
# in vecs.tsv and the corresponding subword per line in meta.tsv.
encoder = info.features['text'].encoder

# Make sure the output directory exists even when the training cell (whose
# TensorBoard callback logs to ./tb) was not run in this session.
os.makedirs('./tb', exist_ok=True)

# Context managers guarantee both files are flushed and closed even if the
# loop raises part-way through.
with io.open('./tb/vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('./tb/meta.tsv', 'w', encoding='utf-8') as out_m:
    for num, word in enumerate(encoder.subwords):
        vec = weights[num + 1]  # skip 0, it's padding.
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")

In [None]:
# Shell escape: print the resolved absolute path of the `tb` directory.
!realpath tb