In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
import_data = pd.read_csv("datasets/imports/imports.csv", skipinitialspace=True, skiprows=1, na_values="?")

In [3]:
import_data.sample(10)

Unnamed: 0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.60,...,130,mpfi,3.47,2.68,9.00,111,5000,21,27,16500
121,-1,74.0,plymouth,gas,std,four,wagon,fwd,front,103.3,...,122,2bbl,3.35,3.46,8.5,88.0,5000.0,24,30,8921.0
39,0,85.0,honda,gas,std,four,sedan,fwd,front,96.5,...,110,mpfi,3.15,3.58,9.0,101.0,5800.0,24,28,12945.0
47,0,,jaguar,gas,std,two,sedan,rwd,front,102.0,...,326,mpfi,3.54,2.76,11.5,262.0,5000.0,13,17,36000.0
179,-1,,toyota,gas,std,four,wagon,rwd,front,104.5,...,161,mpfi,3.27,3.35,9.2,156.0,5200.0,19,24,15750.0
113,0,161.0,peugot,gas,std,four,sedan,rwd,front,107.9,...,120,mpfi,3.46,3.19,8.4,97.0,5000.0,19,24,16630.0
61,0,,mazda,diesel,std,,sedan,fwd,front,98.8,...,122,idi,3.39,3.39,22.7,64.0,4650.0,36,42,10795.0
140,0,102.0,subaru,gas,std,four,sedan,fwd,front,97.2,...,108,2bbl,3.62,2.64,9.5,82.0,4400.0,28,33,7775.0
126,3,,porsche,gas,std,two,convertible,rwd,rear,89.5,...,194,mpfi,3.74,2.9,9.5,207.0,5900.0,17,25,37028.0
148,1,87.0,toyota,gas,std,two,hatchback,fwd,front,95.7,...,92,2bbl,3.05,3.03,9.0,62.0,4800.0,35,39,5348.0
37,0,85.0,honda,gas,std,four,sedan,fwd,front,96.5,...,110,1bbl,3.15,3.58,9.0,86.0,5800.0,27,33,8845.0


In [4]:
# set missing column names
import_data.columns = [
    "symboling",
    "normalized-losses",
    "make",
    "fuel-type",
    "aspiration",
    "num-of-doors",
    "body-style",
    "drive-wheels",
    "engine-location",
    "wheel-base",
    "length", 
    "width", 
    "height", 
    "curb-weight", 
    "engine-type",
    "num-of-cylinders",
    "engine-size", 
    "fuel-system",
    "bore", 
    "stroke", 
    "compression-ratio", 
    "horsepower",
    "peak-rpm",
    "city-mpg",
    "highway-mpg",
    "price"
]

In [5]:
# split train data into validation data
train_data = import_data.sample(frac=0.8, replace=False)
validation_data = import_data[len(train_data):]

# cleanup data
train_data = train_data[np.isfinite(train_data['price'])]

In [6]:
train_data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
176,3,197.0,toyota,gas,std,two,hatchback,rwd,front,102.9,...,171,mpfi,3.27,3.35,9.3,161.0,5200.0,20,24,16558.0
120,1,154.0,plymouth,gas,std,four,sedan,fwd,front,93.7,...,98,2bbl,2.97,3.23,9.4,68.0,5500.0,31,38,7609.0
94,1,122.0,nissan,gas,std,four,sedan,fwd,front,94.5,...,97,2bbl,3.15,3.29,9.4,69.0,5200.0,31,37,7499.0
136,2,83.0,subaru,gas,std,two,hatchback,fwd,front,93.7,...,97,2bbl,3.62,2.36,9.0,69.0,4900.0,31,36,5118.0
84,1,125.0,mitsubishi,gas,std,four,sedan,fwd,front,96.3,...,122,2bbl,3.35,3.46,8.5,88.0,5000.0,25,32,8189.0


In [37]:
# build train input function
input_fn_train = tf.estimator.inputs.pandas_input_fn(
    x = pd.DataFrame({
        "make" : train_data["make"].values,
        "highway-mpg" : train_data["highway-mpg"].values,
        "curb-weight" : train_data["curb-weight"].values,
        "body-style" : train_data["body-style"].values
    }),
    y = pd.Series(train_data["price"].values),
    shuffle=False
)

In [43]:
feature_columns = [
    tf.feature_column.numeric_column("highway-mpg"),
    tf.feature_column.numeric_column("curb-weight"),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(key="body-style", vocabulary_list=[
            "hardtop", "wagon", "sedan", "hatchback", "convertible"
        ])
    ),
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_hash_bucket(key="make", hash_bucket_size=50),
        dimension=3
    )
]
estimator = tf.estimator.LinearRegressor(feature_columns=feature_columns, hidden_units=[20, 20])
estimator.train(input_fn=input_fn_train, steps=5000)

TypeError: __init__() got an unexpected keyword argument 'hidden_units'

In [44]:
input_fn_validation = tf.estimator.inputs.pandas_input_fn(
    x = pd.DataFrame({
        "make" : validation_data["make"].values,
        "highway-mpg" : validation_data["highway-mpg"].values,
        "curb-weight" : validation_data["curb-weight"].values,
        "body-style" : validation_data["body-style"].values
    }),
    y = pd.Series(validation_data["price"].values),
    shuffle=False
)

eval_result = estimator.evaluate(input_fn=input_fn_validation)

average_loss = eval_result["average_loss"]

# Convert MSE to Root Mean Square Error (RMSE).
print("\n" + 80 * "*")
print("\nRMS error for the test set: ${:.0f}"
    .format(average_loss**0.5))

print()

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-04-20-06:54:04
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpsoqg3zmi/model.ckpt-2
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-04-20-06:54:04
INFO:tensorflow:Saving dict for global step 2: average_loss = 1.69299e+08, global_step = 2, loss = 6.94126e+09

********************************************************************************

RMS error for the test set: $13011

