## Getting ready

In [34]:
import tensorflow as tf
import tensorflow.keras as keras
import pandas as pd
import numpy as np

In [8]:
census_dir = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/'
train_path = tf.keras.utils.get_file('adult.data', census_dir + 'adult.data')
test_path = tf.keras.utils.get_file('adult.test', census_dir + 'adult.test')

In [9]:
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation',
          'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
          'income_bracket']

train_data = pd.read_csv(train_path, header=None, names=columns)
test_data = pd.read_csv(test_path, header=None, names=columns, skiprows=1)

## How to do it

In [11]:
predictors = ['age', 'workclass', 'education', 'education_num', 'marital_status', 'occupation', 'relationship',
             'gender']

y_train = (train_data.income_bracket==' >50K').astype(int)
y_test = (test_data.income_bracket==' >50K').astype(int)

train_data = train_data[predictors]
test_data = test_data[predictors]

In [14]:
train_data[['age', 'education_num']] = train_data[['age', 'education_num']].fillna(train_data[['age', 'education_num']]).mean()
test_data[['age', 'education_num']] = test_data[['age', 'education_num']].fillna(test_data[['age', 'education_num']]).mean()

In [15]:
def define_feature_columns(data_df, numeric_cols, categorical_cols, categorical_embeds, dimension=30):
    numeric_columns = []
    categorical_columns = []
    embeddings = []
    
    for feature_name in numeric_cols:
        numeric_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))
        
    for feature_name in categorical_cols:
        vocabulary = data_df[feature_name].unique()
        categorical_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary))
        
    for feature_name in categorical_embeds:
        vocabulary = data_df[feature_name].unique()
        to_categorical = tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary)
        embeddings.append(tf.feature_column.embedding_column(to_categorical, dimension=dimension))
        
    return numeric_columns, categorical_columns, embeddings

In [19]:
def create_interactions(interactions_list, buckets=10):
    feature_columns = []
    
    for (a, b) in interactions_list:
        crossed_feature = tf.feature_column.crossed_column([a, b], hash_bucket_size=buckets)
        crossed_feature_one_hot = tf.feature_column.indicator_column(crossed_feature)
        feature_columns.append(crossed_feature_one_hot)
        
    return feature_columns

In [20]:
numeric_columns, categorical_columns, embeddings = define_feature_columns(train_data, 
                                                                          numeric_cols=['age', 'education_num'],
                                                                          categorical_cols=['gender'],
                                                                          categorical_embeds=['workclass', 'education',
                                                                                            'marital_status', 'occupation',
                                                                                            'relationship'],
                                                                          dimension=32
                                                                         )
interactions = create_interactions([['education', 'occupation']], buckets=10)

In [29]:
estimator = tf.estimator.DNNLinearCombinedClassifier(
# wide settings
linear_feature_columns=numeric_columns+categorical_columns+interactions,
linear_optimizer=keras.optimizers.Ftrl(learning_rate=0.0002),
# deep settings
dnn_feature_columns=embeddings,
dnn_hidden_units=[1024, 256, 128, 64],
dnn_optimizer=keras.optimizers.Adam(learning_rate=0.0001))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpxnk3fqlo', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [30]:
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=256):
    
    def input_function():
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            ds = ds.shuffle(1000)
        ds = ds.batch(batch_size).repeat(num_epochs)
        return ds
    return input_function

In [32]:
train_input_fn = make_input_fn(train_data, y_train, num_epochs=100, batch_size=256)
test_input_fn = make_input_fn(test_data, y_test, num_epochs=1, shuffle=False)
estimator.train(input_fn=train_input_fn, steps=1500)
results = estimator.evaluate(input_fn=test_input_fn)
print(results)

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.


2022-01-24 17:44:47.537824: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2022-01-24 17:44:47.593714: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-24 17:44:47.594434: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:2d:00.0 name: GeForce RTX 2070 SUPER computeCapability: 7.5
coreClock: 1.785GHz coreCount: 40 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 417.29GiB/s
2022-01-24 17:44:47.594449: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2022-01-24 17:44:47.596013: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2022-01-24 17:44:47.596038: I tensorflow/stream_executor/plat

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.


2022-01-24 17:44:48.882814: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-24 17:44:48.883389: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-24 17:44:48.883780: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:2d:00.0 name: GeForce RTX 2070 SUPER computeCapability: 7.5
coreClock: 1.785GHz coreCount: 40 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 417.29GiB/s
2022-01-24 17:44:48.883835: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), 

INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpxnk3fqlo/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...


2022-01-24 17:44:51.863945: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11


INFO:tensorflow:loss = 0.7061246, step = 0


2022-01-24 17:44:52.136953: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11


INFO:tensorflow:global_step/sec: 135.501
INFO:tensorflow:loss = 0.34892508, step = 100 (0.738 sec)
INFO:tensorflow:global_step/sec: 183.084
INFO:tensorflow:loss = 0.4303697, step = 200 (0.546 sec)
INFO:tensorflow:global_step/sec: 180.552
INFO:tensorflow:loss = 0.35352567, step = 300 (0.554 sec)
INFO:tensorflow:global_step/sec: 180.272
INFO:tensorflow:loss = 0.37762165, step = 400 (0.555 sec)
INFO:tensorflow:global_step/sec: 182.87
INFO:tensorflow:loss = 0.34192562, step = 500 (0.547 sec)
INFO:tensorflow:global_step/sec: 180.8
INFO:tensorflow:loss = 0.34803823, step = 600 (0.553 sec)
INFO:tensorflow:global_step/sec: 181.885
INFO:tensorflow:loss = 0.36325437, step = 700 (0.550 sec)
INFO:tensorflow:global_step/sec: 174.791
INFO:tensorflow:loss = 0.36329564, step = 800 (0.572 sec)
INFO:tensorflow:global_step/sec: 178.806
INFO:tensorflow:loss = 0.37925425, step = 900 (0.559 sec)
INFO:tensorflow:global_step/sec: 176.736
INFO:tensorflow:loss = 0.36890337, step = 1000 (0.566 sec)
INFO:tensorfl

2022-01-24 17:45:01.542755: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-24 17:45:01.543001: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:2d:00.0 name: GeForce RTX 2070 SUPER computeCapability: 7.5
coreClock: 1.785GHz coreCount: 40 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 417.29GiB/s
2022-01-24 17:45:01.543047: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-24 17:45:01.543250: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-24 17:45:01.543424: I tensorflow/core/common

INFO:tensorflow:Inference Time : 0.87013s
INFO:tensorflow:Finished evaluation at 2022-01-24-17:45:02
INFO:tensorflow:Saving dict for global step 1500: accuracy = 0.80726, accuracy_baseline = 1.0, auc = 0.0, auc_precision_recall = 0.0, average_loss = 0.37457475, global_step = 1500, label/mean = 0.0, loss = 0.37501976, precision = 0.0, prediction/mean = 0.23987472, recall = 0.0
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1500: /tmp/tmpxnk3fqlo/model.ckpt-1500
{'accuracy': 0.80726, 'accuracy_baseline': 1.0, 'auc': 0.0, 'auc_precision_recall': 0.0, 'average_loss': 0.37457475, 'label/mean': 0.0, 'loss': 0.37501976, 'precision': 0.0, 'prediction/mean': 0.23987472, 'recall': 0.0, 'global_step': 1500}


In [36]:
def predict_proba(predictor):
    preds = list()
    for pred in predictor:
        preds.append(pred['probabilities'])
    return np.array(preds)

predictions = predict_proba(estimator.predict(input_fn=test_input_fn))
print(predictions)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpxnk3fqlo/model.ckpt-1500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


2022-01-24 17:47:59.201848: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-24 17:47:59.202080: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:2d:00.0 name: GeForce RTX 2070 SUPER computeCapability: 7.5
coreClock: 1.785GHz coreCount: 40 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 417.29GiB/s
2022-01-24 17:47:59.202127: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-24 17:47:59.202331: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-24 17:47:59.202506: I tensorflow/core/common

[[0.99828005 0.00171994]
 [0.86335665 0.1366433 ]
 [0.3650979  0.63490206]
 ...
 [0.24605854 0.7539415 ]
 [0.91424894 0.0857511 ]
 [0.19365892 0.80634105]]
