In [8]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
import_data = pd.read_csv("datasets/imports/imports.csv", skipinitialspace=True, skiprows=1, na_values="?")
import_data.sample(10)

Unnamed: 0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.60,...,130,mpfi,3.47,2.68,9.00,111,5000,21,27,16500
21,1,118.0,dodge,gas,turbo,two,hatchback,fwd,front,93.7,...,98,mpfi,3.03,3.39,7.6,102.0,5500.0,24,30,7957.0
187,3,,volkswagen,gas,std,two,convertible,fwd,front,94.5,...,109,mpfi,3.19,3.4,8.5,90.0,5500.0,24,29,11595.0
76,2,161.0,mitsubishi,gas,std,two,hatchback,fwd,front,93.7,...,92,2bbl,2.97,3.23,9.4,68.0,5500.0,31,38,6669.0
118,1,154.0,plymouth,gas,std,four,hatchback,fwd,front,93.7,...,90,2bbl,2.97,3.23,9.4,68.0,5500.0,31,38,6229.0
96,2,168.0,nissan,gas,std,two,hardtop,fwd,front,95.1,...,97,2bbl,3.15,3.29,9.4,69.0,5200.0,31,37,8249.0
140,0,102.0,subaru,gas,std,four,sedan,fwd,front,97.2,...,108,2bbl,3.62,2.64,9.5,82.0,4400.0,28,33,7775.0
174,-1,65.0,toyota,gas,std,four,sedan,fwd,front,102.4,...,122,mpfi,3.31,3.54,8.7,92.0,4200.0,27,32,10898.0
189,0,,volkswagen,gas,std,four,sedan,fwd,front,100.4,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,24,13295.0
172,-1,65.0,toyota,diesel,turbo,four,sedan,fwd,front,102.4,...,110,idi,3.27,3.35,22.5,73.0,4500.0,30,33,10698.0
114,0,161.0,peugot,diesel,turbo,four,sedan,rwd,front,107.9,...,152,idi,3.7,3.52,21.0,95.0,4150.0,28,33,17950.0


In [3]:
# set missing column names
import_data.columns = [
    "symboling",
    "normalized-losses",
    "make",
    "fuel-type",
    "aspiration",
    "num-of-doors",
    "body-style",
    "drive-wheels",
    "engine-location",
    "wheel-base",
    "length", 
    "width", 
    "height", 
    "curb-weight", 
    "engine-type",
    "num-of-cylinders",
    "engine-size", 
    "fuel-system",
    "bore", 
    "stroke", 
    "compression-ratio", 
    "horsepower",
    "peak-rpm",
    "city-mpg",
    "highway-mpg",
    "price"
]

In [4]:
# split train data into validation data
train_data = import_data.sample(frac=0.7, replace=False)
validation_data = import_data[len(train_data):]

# cleanup data
train_data = train_data[np.isfinite(train_data['price'])]

In [5]:
train_data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
3,2,,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,15250.0
67,0,93.0,mercedes-benz,diesel,turbo,two,hardtop,rwd,front,106.7,...,183,idi,3.58,3.64,21.5,123.0,4350.0,22,25,28176.0
185,2,94.0,volkswagen,diesel,turbo,four,sedan,fwd,front,97.3,...,97,idi,3.01,3.4,23.0,68.0,4500.0,37,42,9495.0
21,1,118.0,dodge,gas,turbo,two,hatchback,fwd,front,93.7,...,98,mpfi,3.03,3.39,7.6,102.0,5500.0,24,30,7957.0
44,2,,isuzu,gas,std,two,hatchback,rwd,front,96.0,...,119,spfi,3.43,3.23,9.2,90.0,5000.0,24,29,11048.0


In [6]:
# build train input function
# input_fn_train = tf.estimator.inputs.pandas_input_fn(
#     x = pd.DataFrame({
#         "make" : train_data["make"].values,
#         "highway-mpg" : train_data["highway-mpg"].values,
#         "curb-weight" : train_data["curb-weight"].values,
#         "body-style" : train_data["body-style"].values
#     }),
#     y = pd.Series(train_data["price"].values),
#     shuffle=True,
#     batch_size=128
# )

# Build the training input_fn.
def input_fn(data):
    data_set = tf.data.Dataset.from_tensor_slices(({
        "make" : data["make"].values,
        "highway-mpg" : data["highway-mpg"].values,
        "curb-weight" : data["curb-weight"].values,
        "body-style" : data["body-style"].values
    }, data["price"].values))
    return (
        # Shuffling with a buffer larger than the data set ensures
        # that the examples are well mixed.
        data_set.shuffle(1000).batch(128)
        # Repeat forever
        .repeat().make_one_shot_iterator().get_next())

def input_fn_train():
    return input_fn(train_data)

def input_fn_validation():
    return input_fn(validation_data)

feature_columns = [
    tf.feature_column.numeric_column("highway-mpg"),
    tf.feature_column.numeric_column("curb-weight"),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(key="body-style", vocabulary_list=[
            "hardtop", "wagon", "sedan", "hatchback", "convertible"
        ])
    ),
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_hash_bucket(key="make", hash_bucket_size=50),
        dimension=3
    )
]
estimator = tf.estimator.DNNRegressor(feature_columns=feature_columns, hidden_units=[20, 20])
estimator.train(input_fn=input_fn_train, steps=5000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x122601748>, '_evaluation_master': '', '_session_config': None, '_keep_checkpoint_every_n_hours': 10000, '_is_chief': True, '_task_id': 0, '_task_type': 'worker', '_model_dir': '/var/folders/55/qkrngbr949x1jzs0c0dt70wc0000gn/T/tmpoqfls3c6', '_save_summary_steps': 100, '_log_step_count_steps': 100, '_service': None, '_save_checkpoints_secs': 600, '_tf_random_seed': None, '_save_checkpoints_steps': None, '_master': '', '_global_id_in_cluster': 0, '_keep_checkpoint_max': 5, '_num_worker_replicas': 1, '_num_ps_replicas': 0}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/55/qkrngbr949x1jzs0c0dt70wc000

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x122601400>

In [7]:
eval_result = estimator.evaluate(input_fn=input_fn_validation, steps=100)

average_loss = eval_result["average_loss"]

# Convert MSE to Root Mean Square Error (RMSE).
print("\n" + 80 * "*")
print("\nRMS error for the test set: ${:.0f}"
    .format(average_loss**0.5))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-04-20-17:10:55
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/55/qkrngbr949x1jzs0c0dt70wc0000gn/T/tmpoqfls3c6/model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Evaluation [20/100]
INFO:tensorflow:Evaluation [30/100]
INFO:tensorflow:Evaluation [40/100]
INFO:tensorflow:Evaluation [50/100]
INFO:tensorflow:Evaluation [60/100]
INFO:tensorflow:Evaluation [70/100]
INFO:tensorflow:Evaluation [80/100]
INFO:tensorflow:Evaluation [90/100]
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Finished evaluation at 2018-04-20-17:10:55
INFO:tensorflow:Saving dict for global step 5000: average_loss = 2336150.8, global_step = 5000, loss = 142505200.0

********************************************************************************

RMS error f