In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt


In [4]:
(X_train_full, y_train_full), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

# Sanity-check the raw arrays before any processing.
assert X_train_full.shape == (60000, 28, 28)
assert X_test.shape == (10000, 28, 28)
assert y_train_full.shape == (60000,)
assert y_test.shape == (10000,)

# Normalize the data: convert the pixels to float32 and divide by 255.
X_train_full, X_test = (
    images.astype(np.float32) / 255 for images in (X_train_full, X_test)
)

# The first 5000 samples form the validation set; the remainder is the training set.
X_valid, y_valid = X_train_full[:5000], y_train_full[:5000]
X_train, y_train = X_train_full[5000:], y_train_full[5000:]

In [None]:
# One way to include a preprocessing step directly in the model:
# standardize every input with per-position statistics of the training set.
means = np.mean(X_train, axis=0, keepdims=True)
stds = np.std(X_train, axis=0, keepdims=True)
eps = keras.backend.epsilon()  # added to stds so the division never hits zero
model = keras.models.Sequential([
    # The input shape is declared first so the model is built right away;
    # a Sequential model only picks up a shape given at its very start.
    keras.layers.Input(shape=[28, 28]),
    keras.layers.Lambda(lambda inputs: (inputs - means) / (stds + eps)),
    keras.layers.Flatten(),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(10, activation="softmax")
])
# an example

In [25]:
# self-contained custom layer
class Standardization(keras.layers.Layer):
    """Layer that standardizes its inputs with statistics taken from a data sample.

    ``adapt`` must be called once with a data sample before the layer is used;
    ``call`` then returns ``(inputs - means_) / (stds_ + epsilon)``.
    """

    def __init__(self, **kwargs):
        """Create an un-adapted layer; keyword arguments go to ``keras.layers.Layer``."""
        super().__init__(**kwargs)
        # Filled in by adapt(); None marks the layer as not yet adapted.
        self.means_ = None
        self.stds_ = None

    def adapt(self, data_sample):
        """Compute and store the mean and standard deviation of ``data_sample`` along axis 0.

        ``keepdims=True`` keeps a leading axis of size 1 so the statistics
        broadcast against batched inputs in ``call``.
        """
        self.means_ = np.mean(data_sample, axis=0, keepdims=True)
        self.stds_ = np.std(data_sample, axis=0, keepdims=True)

    def call(self, inputs):
        """Return the standardized ``inputs``.

        Raises:
            RuntimeError: if ``adapt`` has not been called yet.
        """
        if self.means_ is None or self.stds_ is None:
            raise RuntimeError(
                "Standardization layer must be adapted before use: "
                "call adapt(data_sample) first."
            )
        # epsilon guards against division by zero where the std is 0
        return (inputs - self.means_) / (self.stds_ + keras.backend.epsilon())

In [26]:
# The layer's statistics are only set by adapt(), so pass it a data sample
# before using the layer.
std_layer = Standardization()
std_layer.adapt(X_train)

In [27]:
# Build, compile and train a small classifier whose first processing step
# is the adapted Standardization layer.
model = keras.Sequential()
# Declare the input shape first: a Sequential model only picks up a shape
# given at its very start, not one attached to a later layer.
model.add(keras.layers.Input(shape=[28, 28]))
model.add(std_layer)
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(100, activation="relu"))
model.add(keras.layers.Dense(10, activation="softmax"))
# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(learning_rate=1e-3),
              metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, batch_size=32,
                    validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [67]:
# Encoding categorical features using one-hot vectors.
# The examples use the California housing price data.
import os
import pandas as pd
# Path is relative to the notebook's working directory.
HOUSING_PATH = os.path.join("datasets", "housing","housing.csv")
housing = pd.read_csv(HOUSING_PATH)

In [68]:
# Preview the first five rows of the housing data.
housing.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [69]:
# List the distinct values of the categorical column.
housing["ocean_proximity"].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [70]:
# Vocabulary of the known categories, written out as a plain Python list.
vocab = ['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND']
type(vocab)

list

In [71]:
indices = tf.range(len(vocab), dtype=tf.int64)
indices
# create the list of integer indices (one per vocab entry) used by the encoder

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([0, 1, 2, 3, 4], dtype=int64)>

In [72]:
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
# associate each vocab entry with its index
# if the categories were listed in a text file with one category per line,
# TextFileInitializer should be used
num_oov_buckets=2
# a category that is not in vocab is assigned to one of the 2 ids that
# follow the last vocab index (4),
# to avoid collisions or to handle changing datasets
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)

In [73]:
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
cat_indices = table.lookup(categories)
cat_indices
# calling .lookup on the initialized table maps each category string to its index

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([0, 5, 2, 2], dtype=int64)>

In [74]:
cat_one_hot = tf.one_hot(cat_indices, depth=len(vocab)+num_oov_buckets) # one-hot encode the indices
cat_one_hot
# the unknown "DESERT" category lands in the bucket at index 5 (num_oov_buckets=2)

<tf.Tensor: shape=(4, 7), dtype=float32, numpy=
array([[1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0.]], dtype=float32)>

In [75]:
uio = tf.random.uniform(shape=(2, 3))
uio
# an example of tf.random.uniform producing a tensor of shape (2, 3)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[0.803156  , 0.49777734, 0.37054038],
       [0.9118674 , 0.637642  , 0.18209696]], dtype=float32)>

In [76]:
# encoding categorical features using embeddings
# embeddings are preferable on large datasets, especially with more than 50 categories
embedding_dim = 2
# one row per vocab entry and per out-of-vocabulary bucket
embed_init = tf.random.uniform([len(vocab)+num_oov_buckets, embedding_dim])
embedding_matrix = tf.Variable(embed_init) # wrap the initial values in a tf.Variable
embed_init

<tf.Tensor: shape=(7, 2), dtype=float32, numpy=
array([[0.95831835, 0.01680839],
       [0.3156035 , 0.16013157],
       [0.7148702 , 0.7892921 ],
       [0.11484027, 0.33310425],
       [0.21091413, 0.62329304],
       [0.9865029 , 0.12230623],
       [0.20660043, 0.87113273]], dtype=float32)>

In [77]:
# Display the variable that was created from embed_init.
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.95831835, 0.01680839],
       [0.3156035 , 0.16013157],
       [0.7148702 , 0.7892921 ],
       [0.11484027, 0.33310425],
       [0.21091413, 0.62329304],
       [0.9865029 , 0.12230623],
       [0.20660043, 0.87113273]], dtype=float32)>

In [78]:
# Same lookup as in the one-hot example: map the category strings to indices.
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
cat_indices = table.lookup(categories)
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([0, 5, 2, 2], dtype=int64)>

In [79]:
tf.nn.embedding_lookup(embedding_matrix, cat_indices)
# embed cat_indices by looking up the corresponding rows of embedding_matrix

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0.95831835, 0.01680839],
       [0.9865029 , 0.12230623],
       [0.7148702 , 0.7892921 ],
       [0.7148702 , 0.7892921 ]], dtype=float32)>

In [80]:
# keras provides keras.layers.Embedding to handle all of this
embedding = keras.layers.Embedding(input_dim=len(vocab)+num_oov_buckets,
                                   output_dim=embedding_dim)
embedding(cat_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[ 0.02184172,  0.02552453],
       [ 0.01789192,  0.04401754],
       [ 0.03224948, -0.00919038],
       [ 0.03224948, -0.00919038]], dtype=float32)>

In [83]:
# finally create a keras model that can process categorical features
# Two inputs: 8 numeric features, plus one category string per sample.
regular_inputs = keras.layers.Input(shape=[8])
# NOTE: `categories` and `cat_indices` now name symbolic Keras tensors,
# replacing the eager tensors of the same names from the earlier cells.
categories = keras.layers.Input(shape=[], dtype=tf.string)
# Map each category string to its integer id with the lookup table.
cat_indices = keras.layers.Lambda(lambda cats: table.lookup(cats))(categories)
# Size the embedding from the vocabulary and OOV settings rather than
# hard-coding 7 and 2, so it stays in sync with the lookup table.
cat_embed = keras.layers.Embedding(input_dim=len(vocab) + num_oov_buckets,
                                   output_dim=embedding_dim)(cat_indices)
encoded_inputs = keras.layers.concatenate([regular_inputs, cat_embed])
outputs = keras.layers.Dense(1)(encoded_inputs)
model = keras.models.Model(inputs=[regular_inputs, categories],
                           outputs=[outputs])


In [84]:
# Keras preprocessing layers
# normalization = keras.layers.Normalization()
# discretization = keras.layers.Discretization([...])
# pipeline = keras.layers.PreprocessingStage([normalization, discretization])
# pipeline.adapt(data_sample)
# P562

In [None]:
# Tensorflow Transform
#P564-567