In [1]:
import random
import urllib.request

import tensorflow as tf

## From Tensor Slices

In [10]:
# Hold the integers 0..9 in a variable, then slice it along its first axis so
# that every scalar becomes one element of the dataset.
X = tf.Variable(tf.range(start=0, limit=10))
dataset = tf.data.Dataset.from_tensor_slices(tensors=X)

In [11]:
# Iterating a tf.data.Dataset eagerly yields one tensor per element.
for element in dataset:
    print(element)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [12]:
# Square every element. `dataset` is rebound here, so re-running this cell
# without re-running the cell that created `dataset` squares the values again.
dataset = dataset.map(lambda value: value * value)
for element in dataset:
    print(element)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(25, shape=(), dtype=int32)
tf.Tensor(36, shape=(), dtype=int32)
tf.Tensor(49, shape=(), dtype=int32)
tf.Tensor(64, shape=(), dtype=int32)
tf.Tensor(81, shape=(), dtype=int32)


In [13]:
# Keep only the even elements of the (already squared) dataset.
dataset = dataset.filter(lambda value: (value % 2) == 0)
for element in dataset:
    print(element)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(36, shape=(), dtype=int32)
tf.Tensor(64, shape=(), dtype=int32)


## Categorical Data

### One-hot Vs Embedding

As a rule of thumb, if the number of categories is lower than 10,
then one-hot encoding is generally the way to go (but your mileage
may vary!). If the number of categories is greater than 50 (which is
often the case when you use hash buckets), then embeddings are
usually preferable. In between 10 and 50 categories, you may want
to experiment with both options and see which one works best for
your use case.

### One-hot

In [28]:
from sklearn.datasets import fetch_california_housing
import os
import tarfile
import urllib

# Local directory (relative to the working directory) that receives the archive.
HOUSING_PATH = os.path.join("datasets", "housing")

# Remote location of the housing archive.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives the archive and its extracted
            contents; it is created if it does not exist yet.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    # urlretrieve lives in the urllib.request submodule, which a bare
    # `import urllib` does not load on its own.
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall trusts the member paths stored in the
        # archive; only use it with archives from a trusted source.
        housing_tgz.extractall(path=housing_path)


# Fetch the data right away so the extracted files exist for the cells below.
fetch_housing_data()
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a pandas DataFrame.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [33]:
# Load the housing table and preview its first two rows.
df = load_housing_data()
df.head(n=2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY


In [38]:
## Initializing table
# Number of extra buckets reserved for categories missing from the vocabulary.
num_oov = 2

# Vocabulary: the distinct ocean_proximity values, each mapped to an int64 index.
ocean_list = list(df["ocean_proximity"].unique())
indices = tf.range(len(ocean_list), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(keys=ocean_list, values=indices)
table = tf.lookup.StaticVocabularyTable(
    initializer=table_init,
    num_oov_buckets=num_oov,
)

In [50]:
## one-hot
# Draw four categories (with replacement) from the vocabulary. random.choice
# is the direct way to pick one element; the loop variable is unused.
random_categories = tf.constant([random.choice(ocean_list) for _ in range(4)])
print(random_categories)
# Translate the category strings into their integer indices via the lookup table.
cat_indicies = table.lookup(random_categories)
cat_indicies

tf.Tensor([b'INLAND' b'ISLAND' b'NEAR BAY' b'NEAR BAY'], shape=(4,), dtype=string)


<tf.Tensor: id=278, shape=(4,), dtype=int64, numpy=array([2, 4, 0, 0], dtype=int64)>

In [52]:
# One slot per known category plus one per out-of-vocabulary bucket.
one_hot_depth = len(ocean_list) + num_oov
tf.one_hot(cat_indicies, depth=one_hot_depth)

<tf.Tensor: id=282, shape=(4, 6), dtype=float32, numpy=
array([[0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.]], dtype=float32)>

### Embedding

In [55]:
# Build a randomly initialised embedding matrix by hand: one row per known
# category plus one per out-of-vocabulary bucket, `embedding_dim` columns.
embedding_dim = 2
vocab_size = len(ocean_list) + num_oov
embed_init = tf.random.uniform(shape=(vocab_size, embedding_dim))
embed_matrix = tf.Variable(initial_value=embed_init)
embed_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.524539  , 0.40950465],
       [0.91189754, 0.17248845],
       [0.8579438 , 0.7456107 ],
       [0.12113166, 0.8425473 ],
       [0.15396714, 0.22420442],
       [0.16823995, 0.05178714],
       [0.26820529, 0.7380533 ]], dtype=float32)>

In [57]:
# Show the sampled categories, then fetch the embedding row for each index.
print(random_categories)
tf.nn.embedding_lookup(params=embed_matrix, ids=cat_indicies)

tf.Tensor([b'INLAND' b'ISLAND' b'NEAR BAY' b'NEAR BAY'], shape=(4,), dtype=string)


<tf.Tensor: id=311, shape=(4, 2), dtype=float32, numpy=
array([[0.8579438 , 0.7456107 ],
       [0.15396714, 0.22420442],
       [0.524539  , 0.40950465],
       [0.524539  , 0.40950465]], dtype=float32)>

In [58]:
## Keras layer: the Embedding layer creates and holds its own randomly
## initialised embedding matrix, so only the category indices are passed in.
embed_layer = tf.keras.layers.Embedding(
    input_dim=len(ocean_list) + num_oov,
    output_dim=embedding_dim,
)
embed_layer(cat_indicies)

<tf.Tensor: id=325, shape=(4, 2), dtype=float32, numpy=
array([[ 0.01217709,  0.01941189],
       [ 0.0390201 , -0.04843371],
       [ 0.02771847, -0.041774  ],
       [ 0.02771847, -0.041774  ]], dtype=float32)>