In [39]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [40]:
column_names = [
    'MPG', 'Cylinders', 'Displacement',
    'Horsepower', 'Weight', 'Acceleration',
    'ModelYear', 'Origin'
]

df = pd.read_csv('dataset/auto-mpg.data', names=column_names, na_values='?', comment='\t', sep=' ', skipinitialspace=True)

In [41]:
df.dropna()
df.reset_index(drop=True)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,ModelYear,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1
394,44.0,4,97.0,52.0,2130.0,24.6,82,2
395,32.0,4,135.0,84.0,2295.0,11.6,82,1
396,28.0,4,120.0,79.0,2625.0,18.6,82,1


In [42]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, train_size=0.8)
train_stats = df_train.describe().transpose()

train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MPG,318.0,23.462264,7.910658,9.0,17.5,22.0,29.0,46.6
Cylinders,318.0,5.493711,1.704963,3.0,4.0,4.0,8.0,8.0
Displacement,318.0,194.677673,104.339294,68.0,101.75,151.0,265.75,455.0
Horsepower,313.0,104.952077,38.651727,48.0,76.0,95.0,129.0,230.0
Weight,318.0,2974.600629,838.339274,1613.0,2220.0,2860.0,3573.0,4997.0
Acceleration,318.0,15.558805,2.757308,8.0,13.9,15.5,17.275,24.6
ModelYear,318.0,76.006289,3.705222,70.0,73.0,76.0,79.0,82.0
Origin,318.0,1.556604,0.795256,1.0,1.0,1.0,2.0,3.0


**SETTING UP FEATURE COLUMNS**

In [43]:
numeric_column_names = [
    'Cylinders', 'Displacement',
    'Horsepower', 'Weight',
    'Acceleration'
]

df_train_norm, df_test_norm = df_train.copy(), df_test.copy()

for col_name in numeric_column_names:
    mean = train_stats.loc[col_name, 'mean']
    std = train_stats.loc[col_name, 'std']
    df_train_norm.loc[:, col_name] = (df_train_norm.loc[:, col_name] - mean) / std
    df_test_norm.loc[:, col_name] = (df_test_norm.loc[:, col_name] - mean) / std

df_train_norm.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,ModelYear,Origin
191,22.0,0.29695,0.290613,-0.12812,0.308228,-0.057594,76,1
303,31.8,-0.876095,-1.051164,-1.033643,-1.138681,1.320562,79,3
33,19.0,0.29695,0.357702,-0.12812,-0.40628,-0.928009,71,1
344,39.0,-0.876095,-1.04158,-1.059515,-1.311642,0.305078,81,1
242,21.5,-0.876095,-0.706135,0.1306,-0.446837,-1.000543,77,2


In [44]:
# these features constitute the continuous features

numeric_features = []

for col_name in numeric_column_names:
    numeric_features.append(tf.feature_column.numeric_column(key=col_name))

In [45]:
# we now have *modelYear* feature which we will bucketize
# year < 73 -> 0
# 73 <= year < 76 -> 1
# 76 <= year < 79 -> 2
# year >= 79 -> 3

feature_year = tf.feature_column.numeric_column(key='ModelYear')
bucketized_features = []

bucketized_features.append(tf.feature_column.bucketized_column(
    source_column=feature_year,
    boundaries=[73, 76, 79]
))

In [46]:
feature_origin = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Origin',
    vocabulary_list=[1, 2, 3]
)

In [47]:
categorical_indicator_features = []
categorical_indicator_features.append(
    tf.feature_column.indicator_column(feature_origin)
)

**Defining the input function**

In [48]:
def train_input_fn(df_train, batch_size=8):
    df = df_train.copy()
    train_x, train_y = df, df.pop('MPG')
    dataset = tf.data.Dataset.from_tensor_slices((dict(train_x), train_y))

    # shuffle, repeat, and batch the examples
    return dataset.shuffle(1000).repeat().batch(batch_size)
    # dict(..) used to convert pd.DataFrame to Py. Dict.

In [49]:
ds = train_input_fn(df_train_norm)
batch = next(iter(ds))
print('Keys:', batch[0].keys())
print('Batch Model Years:', batch[0]['ModelYear'])

Keys: dict_keys(['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'ModelYear', 'Origin'])
Batch Model Years: tf.Tensor([72 70 79 72 78 81 77 70], shape=(8,), dtype=int64)


Input for test dataset

In [50]:
def eval_input_fn(df_test, batch_size=8):
    df = df.copy()
    test_x, test_y = df, df.pop('MPG')
    dataset = tf.data.Dataset.from_tensor_slices(
        dict(test_x), test_y
    )
    return dataset.batch(batch_size)

In [51]:
# defining the feature columns now

all_feature_columns = (
    numeric_features + 
    bucketized_features + 
    categorical_indicator_features
)

Instantiate the DNNRegressor

In [56]:
regressor = tf.estimator.DNNRegressor(
    feature_columns=all_feature_columns,
    hidden_units=[32, 10],
    model_dir='models/autompg-dnnregressor/'
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'models/autompg-dnnregressor/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


*train(), evaluate() & predict()*

In [None]:
EPOCHS = 1000
BATCH_SIZE = 8
total_steps = EPOCHS * int(np.ceil(len(df_train) / BATCH_SIZE))
print('Training Steps:', total_steps)

regressor.train(
    input_fn=lambda:train_input_fn(df_train_norm, batch_size=BATCH_SIZE),
    steps=total_steps)

###### NAN error

In [59]:
reloaded_regressor = tf.estimator.DNNRegressor(
    feature_columns=all_feature_columns,
    hidden_units=[32, 10],
    warm_start_from='models/autompg-dnnregressor/',
    model_dir='models/autompg-dnnregressor/')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'models/autompg-dnnregressor/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [None]:
eval_results = reloaded_regressor.evaluate(
    input_fn=lambda:eval_input_fn(df_test_norm, batch_size=8))
print('Average-Loss {:.4f}'.format(eval_results['average_loss']))

#### ERROR
pred_res = regressor.predict(
    input_fn=lambda: eval_input_fn(
        df_test_norm, batch_size=8))
print(next(iter(pred_res)))

*BOOSTED TREE REGRESSOR*

In [None]:
boosted_tree = tf.estimator.BoostedTreesRegressor(
    feature_columns=all_feature_columns,
    n_batches_per_layer=20,
    n_trees=200)

boosted_tree.train(
    input_fn=lambda:train_input_fn(df_train_norm, batch_size=BATCH_SIZE))

eval_results = boosted_tree.evaluate(
    input_fn=lambda:eval_input_fn(df_test_norm, batch_size=8))

print(eval_results)

print('Average-Loss {:.4f}'.format(eval_results['average_loss']))

# DOES NOT WORK (probably deprecated)