In [None]:
import random
import urllib.request

import tensorflow as tf

## From Tensor Slices

In [None]:
# Hold the integers 0..9 in a variable, then slice it along its first axis so
# the dataset yields one scalar per element.
X = tf.Variable(initial_value=tf.range(10))
dataset = tf.data.Dataset.from_tensor_slices(tensors=X)

In [None]:
# Iterate eagerly over the dataset and show every element.
for element in dataset:
    print(element)

In [None]:
# Square every element, then show the transformed dataset.
dataset = dataset.map(lambda value: value ** 2)
for element in dataset:
    print(element)

In [None]:
# Keep only the even elements, then show what is left.
dataset = dataset.filter(lambda value: value % 2 == 0)
for element in dataset:
    print(element)

## Categorical Data

### One-hot vs. Embedding

As a rule of thumb, if the number of categories is lower than 10,
then one-hot encoding is generally the way to go (but your mileage
may vary!). If the number of categories is greater than 50 (which is
often the case when you use hash buckets), then embeddings are
usually preferable. In between 10 and 50 categories, you may want
to experiment with both options and see which one works best for
your use case.

### One-hot

In [None]:
from sklearn.datasets import fetch_california_housing
import os
import tarfile
import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing tarball and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the gzipped tar archive to retrieve.
        housing_path: Directory that receives ``housing.tgz`` and the
            extracted contents; it is created if it does not exist.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would
            # land outside housing_path (absolute paths, "..", unsafe links).
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Interpreters without extraction filters: plain extraction.
            housing_tgz.extractall(path=housing_path)
# Download and unpack the archive using the module-level defaults.
fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH)
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
# Load the extracted CSV and preview its first two rows.
df = load_housing_data(housing_path=HOUSING_PATH)
df.head(n=2)

In [None]:
## Initializing table
# Number of out-of-vocabulary buckets reserved for unseen categories.
num_oov = 2
# Vocabulary: the distinct values of the ocean_proximity column, each paired
# with a consecutive int64 id.
ocean_list = df['ocean_proximity'].unique().tolist()
indices = tf.range(len(ocean_list), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(keys=ocean_list, values=indices)
table = tf.lookup.StaticVocabularyTable(initializer=table_init, num_oov_buckets=num_oov)

In [None]:
## one-hot
# Draw four categories uniformly at random (with replacement) from the vocabulary.
random_categories = tf.constant(random.choices(ocean_list, k=4))
print(random_categories)
# Translate each category string into its integer id via the lookup table.
cat_indicies = table.lookup(random_categories)
cat_indicies

In [None]:
# One slot per known category plus one per out-of-vocabulary bucket.
one_hot_depth = len(ocean_list) + num_oov
tf.one_hot(cat_indicies, depth=one_hot_depth)

### Embedding

In [None]:
embedding_dim = 2
# One row per category id, including the out-of-vocabulary buckets.
vocab_size = len(ocean_list) + num_oov
embed_init = tf.random.uniform(shape=[vocab_size, embedding_dim])
embed_matrix = tf.Variable(initial_value=embed_init)
embed_matrix

In [None]:
print(random_categories)
# Select the row of embed_matrix that corresponds to each category id.
tf.nn.embedding_lookup(params=embed_matrix, ids=cat_indicies)

In [None]:
## Keras layer: builds its own randomly initialized embedding matrix and is
## called directly with the category indices.
embed_layer = tf.keras.layers.Embedding(
    input_dim=len(ocean_list) + num_oov,
    output_dim=embedding_dim,
)
embed_layer(cat_indicies)

One-hot encoding followed by a Dense layer (with no activation
function and no biases) is equivalent to an Embedding layer. However,
the Embedding layer uses way fewer computations (the performance
difference becomes clear when the size of the embedding
matrix grows). The Dense layer’s weight matrix plays the role of the
embedding matrix. For example, using one-hot vectors of size 20
and a Dense layer with 10 units is equivalent to using an Embedding
layer with input_dim=20 and output_dim=10. As a result, it would
be wasteful to use more embedding dimensions than the number
of units in the layer that follows the Embedding layer.

In [14]:
import tensorflow_datasets as tfds
import tensorflow as tf

In [9]:
# Fetch MNIST through TensorFlow Datasets and pick out the two splits by key.
dataset = tfds.load(name="mnist")
mnist_train = dataset["train"]
mnist_test = dataset["test"]

In [10]:
# Input pipeline: shuffle with a 1000-element buffer, form batches of 32,
# turn each feature dict into an (image, label) pair, and keep one batch
# prefetched.
mnist_train = (
    mnist_train
    .shuffle(1000)
    .batch(32)
    .map(lambda items: (items["image"], items["label"]))
    .prefetch(1)
)

In [13]:
# Small classifier: flatten each 28x28x1 input, one hidden ReLU layer of 30
# units, then a 10-way softmax output.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[28, 28, 1]))
model.add(tf.keras.layers.Dense(30, activation='relu', kernel_initializer='he_normal'))
model.add(tf.keras.layers.Dense(10, activation='softmax'))
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='Adam',
    metrics=["accuracy"],
)
# Each of the 5 epochs draws 32 batches from mnist_train.
model.fit(mnist_train, steps_per_epoch=32, epochs=5)

Train for 32 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x13071648>