# tf.feature_column
Feature columns are the input specification consumed by the tf.estimator.Estimator class<br>
They bridge the data returned by an input_fn and the estimator<br>
<b>API docs:</b> https://www.tensorflow.org/api_docs/python/tf/feature_column <br>
<b>Medium Post:</b> https://medium.com/ml-book/demonstration-of-tensorflow-feature-columns-tf-feature-column-3bfcca4ca5c4 <br>

<img src="./feature_columns.jpeg">



In [130]:
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import pprint
# Turn on eager execution so tensors and datasets can be evaluated immediately
# (later cells call .numpy() on layer outputs and iterate a tf.data.Dataset directly).
# NOTE(review): this looks like a TF 1.x-style call -- confirm the TensorFlow version this notebook targets.
tf.enable_eager_execution()

## Create Input tf.data.Dataset
An estimator requires an input_fn that returns a tf.data.Dataset for consumption. In this example, we'll ingest our CSV data from a URL into a Pandas dataframe and then wrap that dataframe in a dataset.

### Read Data into Pandas Dataframe

In [42]:
# get some data into a Pandas Dataframe: read heart.csv straight from its URL
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
dataframe = pd.read_csv(URL)
# last expression of the cell: display the first rows of the frame
dataframe.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


### Create tf.data.Dataset from Pandas dataframe

In [43]:
# convert features and labels into SERIES objects WITHOUT mutating `dataframe`:
# popping 'target' from the shared frame would make this cell raise KeyError: 'target'
# the second time it is run, so the pop is done on a copy instead.
features_frame = dataframe.copy()
labels = features_frame.pop('target')   #TYPE: pandas.Series
features = dict(features_frame)         #TYPE: dict KEY = column name, VAL = column data (type: pandas.Series)
print("Features: " + str(list(features.keys())))
print("age feature values: " + str(features['age'][0:4].values))

# make dataset from a (dict of Pandas Series, Pandas Series) tuple
dataset = tf.data.Dataset.from_tensor_slices((features,labels))

# make a small batch for looking at
dataset = dataset.batch(5)

Features: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
age feature values: [63 67 67 37]


### Inspect Dataset

In [47]:
# Grab one batch of data from the dataset. The batch size (5) was set on the dataset above.
# Because "features" and "labels" were put inside the Dataset with "from_tensor_slices()":
#   - "label_batch" is a Tensor w shape=(5,)
#   - "feature_batch" is a dict, w column name as the key (see feature_batch.keys() below); each
#     value is a Tensor w shape=(5,) and a dtype corresponding to that column, such as
#     dtype=string for col "thal", dtype=float64 for "oldpeak" and dtype=int32 for the rest

for feature_batch, label_batch in dataset.take(1):
    print('Every feature:', list(feature_batch.keys()))
    print('A batch of ages:', feature_batch['age'])    #NOTE: dict access to get the "age" tensor
    print('A batch of targets:', label_batch )
    break  # redundant with take(1), which already limits the loop to one batch


Every feature: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
A batch of ages: tf.Tensor([63 67 67 37 41], shape=(5,), dtype=int32)
A batch of targets: tf.Tensor([0 1 0 0 0], shape=(5,), dtype=int32)


## Create different feature_column types
First, we'll create a utility to consume and help display each of the feature columns

In [82]:
# Show what a feature column produces for one batch of raw features.
def inspect(feature_column, feature_batch):
  """Print the dense output of `feature_column` applied to `feature_batch`.

  feature_column: the tf.feature_column (or columns) handed to tf.keras.layers.DenseFeatures.
  feature_batch:  the batch passed to that layer; in this notebook a dict of
                  column name -> tf.Tensor.

  Returns nothing; the layer output is converted with .numpy() and printed.
  """
  # Build the layer and call it in one step: the layer picks the features it needs out of the batch.
  dense_output = tf.keras.layers.DenseFeatures(feature_column)(feature_batch)
  print(dense_output.numpy())

def anotherWay(feature_batch, feature_column):
    """Parse examples into tensors for the given feature column(s).

    Thin wrapper around tf.contrib.layers.parse_feature_columns_from_examples.
    The previous body ignored both arguments and read the undefined name
    `serializedData` plus the notebook-global `feature_columns`, so it could
    not run on its own; it now uses only what the caller passes in.

    feature_batch:  the examples to parse.
                    NOTE(review): the wrapped helper parses tf.Example data, so this
                    presumably has to be serialized examples rather than the dict of
                    tensors used with inspect() -- confirm before use.
    feature_column: a single feature column or a list of feature columns.

    Returns whatever parse_feature_columns_from_examples returns (the column tensors).
    """
    # The wrapped helper takes a collection of columns; accept a single column for convenience.
    # (Only `list` is checked on purpose, so a column object is never mistaken for a collection.)
    columns = feature_column if isinstance(feature_column, list) else [feature_column]
    columnTensors = tf.contrib.layers.parse_feature_columns_from_examples(
        feature_batch, columns, name=None, example_names=None)
    return columnTensors


In [63]:
# We will use this batch to demonstrate several types of feature columns.
# next(iter(dataset)) pulls the first batch of the dataset built with dataset.batch(5) above.
(feature_batch, label_batch) = next(iter(dataset))
# "feature_batch" is now a dict, KEY: column name (ie 'age') VALUE: tf.Tensor w shape=(batchSize,)
# "label_batch" is now a Tensor w shape=(5,)
print('A batch of ages:', feature_batch['age'])    #NOTE: dict access to get the "age" tensor
print('A batch of targets:', label_batch )

A batch of ages: tf.Tensor([63 67 67 37 41], shape=(5,), dtype=int32)
A batch of targets: tf.Tensor([0 1 0 0 0], shape=(5,), dtype=int32)


### Numeric columns
The output of a feature column becomes the input to the model<br> 
The <i>inspect()</i> function defined above, will show how each column is transformed<br> 
A numeric_column represents real valued features<br>
The model will receive the column value from the dataframe unchanged<br>
[API doc](https://www.tensorflow.org/api_docs/python/tf/feature_column/numeric_column)

In [68]:
# numeric column keyed on the 'age' feature; later cells reuse `age` as the source of the bucketized column
age = tf.feature_column.numeric_column("age")  #TYPE: tensorflow.python.feature_column.feature_column_v2.NumericColumn
# prints one value per row of the batch: the age itself, as a float
inspect(age, feature_batch)

[[63.]
 [67.]
 [67.]
 [37.]
 [41.]]


### Bucketized columns
Take a numeric-valued feature, determine which bucket the value falls into, and one-hot encode the result<br>
The user defines the bucket edges. A vector with N edges defines N+1 buckets, so you get N+1 one-hot encoded columns<br>
[API doc](https://www.tensorflow.org/api_docs/python/tf/feature_column/bucketized_column)

In [81]:
#here we define 7 buckets using 6 edges in a list
age_buckets = tf.feature_column.bucketized_column(age, boundaries=[40, 45, 50, 55, 60, 65])
# TYPE(age_buckets) tensorflow.python.feature_column.feature_column_v2.BucketizedColumn
# each printed row is one-hot over the 7 buckets, e.g. age 63 (between edges 60 and 65) lights
# up index 5 and age 37 (below the first edge, 40) lights up index 0
inspect(age_buckets, feature_batch)

[[0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]]


### Categorical columns
In this dataset, thal is represented as a string (e.g. 'fixed', 'normal', or 'reversible'). We cannot feed strings directly to a model. Instead, we must first map them to numeric values. The categorical vocabulary columns provide a way to represent strings as a one-hot vector (much like you have seen above with age buckets). The vocabulary can be passed as a list using [categorical_column_with_vocabulary_list](https://www.tensorflow.org/api_docs/python/tf/feature_column/categorical_column_with_vocabulary_list), or loaded from a file using [categorical_column_with_vocabulary_file](https://www.tensorflow.org/api_docs/python/tf/feature_column/categorical_column_with_vocabulary_file).

In [111]:
# categorical column over 'thal' with an explicit vocabulary; in the output below the position of
# a string in this list is the position of its 1 in the one-hot row
thal = tf.feature_column.categorical_column_with_vocabulary_list('thal', ['fixed', 'normal', 'reversible'])
print(type(thal))

# wrap the categorical column in an indicator column to get its one-hot representation
thal_one_hot = tf.feature_column.indicator_column(thal)
print(type(thal_one_hot))
inspect(thal_one_hot, feature_batch)
# print the raw strings so they can be compared row by row with the one-hot output
print(feature_batch['thal'])


<class 'tensorflow.python.feature_column.feature_column_v2.VocabularyListCategoricalColumn'>
<class 'tensorflow.python.feature_column.feature_column_v2.IndicatorColumn'>
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]]
tf.Tensor([b'fixed' b'normal' b'reversible' b'normal' b'normal'], shape=(5,), dtype=string)


### Embedding columns
Suppose instead of having just a few possible strings, we have thousands (or more) values per category. For a number of reasons, as the number of categories grows large, it becomes infeasible to train a neural network using one-hot encodings. We can use an embedding column to overcome this limitation. Instead of representing the data as a one-hot vector of many dimensions, an [embedding column](https://www.tensorflow.org/api_docs/python/tf/feature_column/embedding_column) represents that data as a lower-dimensional, dense vector in which each cell can contain any number, not just 0 or 1. The size of the embedding (8, in the example below) is a parameter that must be tuned.

Key point: using an embedding column is best when a categorical column has many possible values. We are using one here for demonstration purposes, so you have a complete example you can modify for a different dataset in the future.

In [110]:
# same feature column definition as above
thal = tf.feature_column.categorical_column_with_vocabulary_list('thal', ['fixed', 'normal', 'reversible']) 

# dimension is the dimensionality of the embedding vector, i.e. each categorical label will be
# converted to an 8-dimensional vector
thal_embedding = tf.feature_column.embedding_column(thal, dimension=8)
# prints one 8-value row per example; rows that share a label (the three 'normal' rows) are identical
inspect(thal_embedding, feature_batch)
# print the raw strings so they can be matched against the embedding rows
print(feature_batch['thal'])


[[ 0.43152484 -0.33226964 -0.48574808 -0.02084353 -0.26794535 -0.55284816
  -0.50064814 -0.31672466]
 [ 0.03001935 -0.34558374 -0.34236407  0.06355304 -0.6391774  -0.1554181
  -0.20584467 -0.0960248 ]
 [ 0.1448013  -0.38700113  0.04931199 -0.2777156   0.13313629  0.13107201
   0.2579299   0.5306967 ]
 [ 0.03001935 -0.34558374 -0.34236407  0.06355304 -0.6391774  -0.1554181
  -0.20584467 -0.0960248 ]
 [ 0.03001935 -0.34558374 -0.34236407  0.06355304 -0.6391774  -0.1554181
  -0.20584467 -0.0960248 ]]
tf.Tensor([b'fixed' b'normal' b'reversible' b'normal' b'normal'], shape=(5,), dtype=string)


### Hashed feature columns

Another way to represent a categorical column with a large number of values is to use a [categorical_column_with_hash_bucket](https://www.tensorflow.org/api_docs/python/tf/feature_column/categorical_column_with_hash_bucket). This feature column calculates a hash value of the input, then selects one of the `hash_bucket_size` buckets to encode a string. When using this column, you do not need to provide the vocabulary, and you can choose to make the number of hash_buckets significantly smaller than the number of actual categories to save space.

Key point: An important downside of this technique is that there may be collisions in which different strings are mapped to the same bucket. In practice, this can work well for some datasets regardless.

In [109]:
# hash each 'thal' string into one of 10 buckets; no vocabulary is supplied
thal_hashed = tf.feature_column.categorical_column_with_hash_bucket('thal', hash_bucket_size=10)
# indicator column -> one-hot over the 10 hash buckets
thal_hashed_one_hot = tf.feature_column.indicator_column(thal_hashed)
inspect(thal_hashed_one_hot, feature_batch)
print(feature_batch['thal'])

# since vocab size = 3, at most 3 columns are lit up; with collisions there could have been fewer cols
# you can see that label 'fixed' maps to col[0], 'normal' to col[3], and 'reversible' to col[4]

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]
tf.Tensor([b'fixed' b'normal' b'reversible' b'normal' b'normal'], shape=(5,), dtype=string)


In [108]:
# this cell demonstrates collisions in hashing: there are three possible categories, but only two cols light up
# NOTE: this rebinds thal_hashed / thal_hashed_one_hot from the previous cell (3 buckets instead of 10)
thal_hashed = tf.feature_column.categorical_column_with_hash_bucket('thal', hash_bucket_size=3)
thal_hashed_one_hot = tf.feature_column.indicator_column(thal_hashed)
inspect(thal_hashed_one_hot, feature_batch)
print(feature_batch['thal'])
# you can see that label 'fixed' maps to col[1] and both 'reversible' and 'normal' map to col[2]

[[0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]
tf.Tensor([b'fixed' b'normal' b'reversible' b'normal' b'normal'], shape=(5,), dtype=string)


### Crossed feature columns
Combining features into a single feature, better known as [feature crosses](https://developers.google.com/machine-learning/glossary/#feature_cross), enables a model to learn separate weights for each combination of features. Here, we will create a new feature that is the cross of age and thal. Note that `crossed_column` does not build the full table of all possible combinations (which could be very large). Instead, it is backed by a `hashed_column`, so you can choose how large the table is.

In [103]:
# cross the bucketized age with the 'thal' categorical column, hashed into 10 buckets
crossed_feature = tf.feature_column.crossed_column([age_buckets, thal], hash_bucket_size=10)
# indicator column -> one-hot over the 10 hash buckets, one row per example
crossed_feature_one_hot = tf.feature_column.indicator_column(crossed_feature)
inspect(crossed_feature_one_hot, feature_batch)

[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]


## End to End Example
First, we create a list of feature_columns, where each element is one of the several dense feature_column types

In [139]:
# Build the list of feature columns that the model's DenseFeatures layer will consume.
feature_columns = []

# numeric cols
for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
  feature_columns.append(tf.feature_column.numeric_column(header))

# bucketized cols
# Define the source column here instead of relying on the `age` variable left behind by the
# "Numeric columns" demo cell, so this cell does not depend on that cell having been run.
age = tf.feature_column.numeric_column('age')
age_buckets = tf.feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)

# categorical wrapped in indicator column
thal = tf.feature_column.categorical_column_with_vocabulary_list(
      'thal', ['fixed', 'normal', 'reversible'])
thal_one_hot = tf.feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

# embedding cols
thal_embedding = tf.feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# crossed columns wrapped in indicator column
# (the name crossed_feature is rebound: first the crossed column, then its indicator wrapper)
crossed_feature = tf.feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
crossed_feature = tf.feature_column.indicator_column(crossed_feature)
feature_columns.append(crossed_feature)

# visualize the stuff: print each column's name followed by its repr
for fc in feature_columns:
    print(fc.name + ":\n" + str(fc) + "\n")

age:
NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)

trestbps:
NumericColumn(key='trestbps', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)

chol:
NumericColumn(key='chol', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)

thalach:
NumericColumn(key='thalach', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)

oldpeak:
NumericColumn(key='oldpeak', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)

slope:
NumericColumn(key='slope', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)

ca:
NumericColumn(key='ca', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)

age_bucketized:
BucketizedColumn(source_column=NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(18, 25, 30, 35, 40, 45, 50, 55, 60, 65))

thal_indicator:
IndicatorColumn(categorical_column=VocabularyListC

### Make Train Eval Test Dataset

In [140]:
# Reload the raw CSV so this section is self-contained and starts from a frame that has 'target'.
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
dataframe = pd.read_csv(URL)

# Fixed seed so the membership of the train/validation/test splits is the same on every run
# (the split sizes are unaffected by the seed).
SPLIT_SEED = 42

# 80/20 train/test, then 80/20 train/validation on the remaining training rows
train, test = train_test_split(dataframe, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

# Helper that turns a Pandas DataFrame into a batched tf.data dataset of (features, labels)
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Wrap a DataFrame that has a 'target' column in a batched tf.data.Dataset.

  dataframe:  source frame; it is copied first, so the caller's frame keeps its 'target' column.
  shuffle:    when true, shuffle with a buffer as large as the frame before batching.
  batch_size: number of rows per batch.

  Returns a dataset of (dict of column name -> values, labels) batches.
  """
  frame = dataframe.copy()
  targets = frame.pop('target')
  feature_map = dict(frame)
  pipeline = tf.data.Dataset.from_tensor_slices((feature_map, targets))
  if not shuffle:
    return pipeline.batch(batch_size)
  return pipeline.shuffle(buffer_size=len(frame)).batch(batch_size)

# Build the three input pipelines with the same batch size.
# Only the training set is shuffled (df_to_dataset defaults to shuffle=True);
# validation and test keep their row order.
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

193 train examples
49 validation examples
61 test examples


## Define the model
Now that we have defined our feature columns, we will use a [DenseFeatures](https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/layers/DenseFeatures) layer to input them to our Keras model, and regular layers after that

In [141]:
# input layer: applies the feature_columns defined above to the incoming batch of raw features
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
# two hidden layers of 128 ReLU units each
layer1 = tf.keras.layers.Dense(128, activation='relu')
layer2 = tf.keras.layers.Dense(128, activation='relu')
# single output unit with no activation (the loss in the next cell is configured with from_logits=True)
output_layer = tf.keras.layers.Dense(1)

# stack the layers in order: features -> hidden -> hidden -> output
model = tf.keras.Sequential([
  feature_layer,
  layer1,
  layer2,
  output_layer
])

## Train and Eval the model

In [142]:
# Adam optimizer, binary cross-entropy computed on raw logits, accuracy as the reported metric
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

# train for 5 epochs, validating on val_ds
# NOTE(review): the saved output of this cell stops at "Epoch 1/5" with
# "FailedPreconditionError: Table already initialized", raised from the 'thal' lookup table of the
# crossed column. TODO: re-run on a fresh kernel and confirm the TensorFlow version before
# relying on this cell.
model.fit(train_ds,
          validation_data=val_ds,
          epochs=5)

# evaluate on the held-out test set and report its accuracy
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Epoch 1/5


FailedPreconditionError: Table already initialized.
	 [[{{node sequential_2/dense_features_23/age_bucketized_X_thal_indicator/thal_lookup/hash_table/table_init/LookupTableImportV2}}]] [Op:__inference_keras_scratch_graph_7299]