In [7]:
import tensorflow as tf

import collections

import numpy as np
import pandas as pd

In [30]:
list(defaults.keys())

['symboling',
 'normalized-losses',
 'make',
 'fuel-type',
 'aspiration',
 'num-of-doors',
 'body-style',
 'drive-wheels',
 'engine-location',
 'wheel-base',
 'length',
 'width',
 'height',
 'curb-weight',
 'engine-type',
 'num-of-cylinders',
 'engine-size',
 'fuel-system',
 'bore',
 'stroke',
 'compression-ratio',
 'horsepower',
 'peak-rpm',
 'city-mpg',
 'highway-mpg',
 'price']

In [33]:
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

# Order is important for the csv-readers, so we use an OrderedDict here.
defaults = collections.OrderedDict([
    ("symboling", [0]),
    ("normalized-losses", [0.0]),
    ("make", [""]),
    ("fuel-type", [""]),
    ("aspiration", [""]),
    ("num-of-doors", [""]),
    ("body-style", [""]),
    ("drive-wheels", [""]),
    ("engine-location", [""]),
    ("wheel-base", [0.0]),
    ("length", [0.0]),
    ("width", [0.0]),
    ("height", [0.0]),
    ("curb-weight", [0.0]),
    ("engine-type", [""]),
    ("num-of-cylinders", [""]),
    ("engine-size", [0.0]),
    ("fuel-system", [""]),
    ("bore", [0.0]),
    ("stroke", [0.0]),
    ("compression-ratio", [0.0]),
    ("horsepower", [0.0]),
    ("peak-rpm", [0.0]),
    ("city-mpg", [0.0]),
    ("highway-mpg", [0.0]),
    ("price", [0.0])
])  # pyformat: disable

pd.set_option('display.max_columns', 500)

pd.read_csv(URL, header=None, names=list(defaults.keys()))

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.00,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.00,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.00,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.40,10.00,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.40,8.00,115,5500,18,22,17450
5,2,?,audi,gas,std,two,sedan,fwd,front,99.8,177.3,66.3,53.1,2507,ohc,five,136,mpfi,3.19,3.40,8.50,110,5500,19,25,15250
6,1,158,audi,gas,std,four,sedan,fwd,front,105.8,192.7,71.4,55.7,2844,ohc,five,136,mpfi,3.19,3.40,8.50,110,5500,19,25,17710
7,1,?,audi,gas,std,four,wagon,fwd,front,105.8,192.7,71.4,55.7,2954,ohc,five,136,mpfi,3.19,3.40,8.50,110,5500,19,25,18920
8,1,158,audi,gas,turbo,four,sedan,fwd,front,105.8,192.7,71.4,55.9,3086,ohc,five,131,mpfi,3.13,3.40,8.30,140,5500,17,20,23875
9,0,?,audi,gas,turbo,two,hatchback,4wd,front,99.5,178.2,67.9,52.0,3053,ohc,five,131,mpfi,3.13,3.40,7.00,160,5500,16,22,?


In [19]:





types = collections.OrderedDict((key, type(value[0]))
                                for key, value in defaults.items())


def _get_imports85():
  path = tf.contrib.keras.utils.get_file(URL.split("/")[-1], URL)
  return path


def dataset(y_name="price", train_fraction=0.7):
  """Load the imports85 data as a (train,test) pair of `Dataset`.
  Each dataset generates (features_dict, label) pairs.
  Args:
    y_name: The name of the column to use as the label.
    train_fraction: A float, the fraction of data to use for training. The
        remainder will be used for evaluation.
  Returns:
    A (train,test) pair of `Datasets`
  """
  # Download and cache the data
  path = _get_imports85()

  # Define how the lines of the file should be parsed
  def decode_line(line):
    """Convert a csv line into a (features_dict,label) pair."""
    # Decode the line to a tuple of items based on the types of
    # csv_header.values().
    items = tf.decode_csv(line, list(defaults.values()))

    # Convert the keys and items to a dict.
    pairs = zip(defaults.keys(), items)
    features_dict = dict(pairs)

    # Remove the label from the features_dict
    label = features_dict.pop(y_name)

    return features_dict, label

  def has_no_question_marks(line):
    """Returns True if the line of text has no question marks."""
    # split the line into an array of characters
    chars = tf.string_split(line[tf.newaxis], "").values
    # for each character check if it is a question mark
    is_question = tf.equal(chars, "?")
    any_question = tf.reduce_any(is_question)
    no_question = ~any_question

    return no_question

  def in_training_set(line):
    """Returns a boolean tensor, true if the line is in the training set."""
    # If you randomly split the dataset you won't get the same split in both
    # sessions if you stop and restart training later. Also a simple
    # random split won't work with a dataset that's too big to `.cache()` as
    # we are doing here.
    num_buckets = 1000000
    bucket_id = tf.string_to_hash_bucket_fast(line, num_buckets)
    # Use the hash bucket id as a random number that's deterministic per example
    return bucket_id < int(train_fraction * num_buckets)

  def in_test_set(line):
    """Returns a boolean tensor, true if the line is in the training set."""
    # Items not in the training set are in the test set.
    # This line must use `~` instead of `not` because `not` only works on python
    # booleans but we are dealing with symbolic tensors.
    return ~in_training_set(line)

  base_dataset = (
      tf.data
      # Get the lines from the file.
      .TextLineDataset(path)
      # drop lines with question marks.
      .filter(has_no_question_marks))

  train = (base_dataset
           # Take only the training-set lines.
           .filter(in_training_set)
           # Decode each line into a (features_dict, label) pair.
           .map(decode_line)
           # Cache data so you only decode the file once.
           .cache())

  # Do the same for the test-set.
  test = (base_dataset.filter(in_test_set).cache().map(decode_line))

  return train, test


def raw_dataframe():
  """Load the imports85 data as a pd.DataFrame."""
  # Download and cache the data
  path = _get_imports85()

  # Load it into a pandas dataframe
  df = pd.read_csv(path, names=types.keys(), dtype=types, na_values="?")

  return df


def load_data(y_name="price", train_fraction=0.7, seed=None):
  """Get the imports85 data set.
  A description of the data is available at:
    https://archive.ics.uci.edu/ml/datasets/automobile
  The data itself can be found at:
    https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
  Args:
    y_name: the column to return as the label.
    train_fraction: the fraction of the dataset to use for training.
    seed: The random seed to use when shuffling the data. `None` generates a
      unique shuffle every run.
  Returns:
    a pair of pairs where the first pair is the training data, and the second
    is the test data:
    `(x_train, y_train), (x_test, y_test) = get_imports85_dataset(...)`
    `x` contains a pandas DataFrame of features, while `y` contains the label
    array.
  """
  # Load the raw data columns.
  data = raw_dataframe()

  # Delete rows with unknowns
  data = data.dropna()

  # Shuffle the data
  np.random.seed(seed)

  # Split the data into train/test subsets.
  x_train = data.sample(frac=train_fraction, random_state=seed)
  x_test = data.drop(x_train.index)

  # Extract the label from the features dataframe.
  y_train = x_train.pop(y_name)
  y_test = x_test.pop(y_name)

  return (x_train, y_train), (x_test, y_test)

train, test = dataset()

In [20]:
PRICE_NORM_FACTOR = 1000

# Switch the labels to units of thousands for better convergence.
def to_thousands(features, labels):
  return features, labels / PRICE_NORM_FACTOR

train = train.map(to_thousands)
test = test.map(to_thousands)


In [17]:

STEPS = 1000



### Define the input train and test callback functions

These are the callback functions for input rain and test. They will be creating batches for our input.

In [None]:

# Build the training input_fn.
def input_train():
  return (
      # Shuffling with a buffer larger than the data set ensures
      # that the examples are well mixed.
      train.shuffle(1000).batch(128)
      # Repeat forever
      .repeat().make_one_shot_iterator().get_next())

  # Build the validation input_fn.
def input_test():
  return (test.shuffle(1000).batch(128)
          .make_one_shot_iterator().get_next())


## Train A Linear Regression Model on 2 Numeric Inputs

Here we will train our linear

In [21]:

feature_columns = [
    # "curb-weight" and "highway-mpg" are numeric columns.
    tf.feature_column.numeric_column(key="curb-weight"),
    tf.feature_column.numeric_column(key="highway-mpg"),
]

# Build the Estimator.
model = tf.estimator.LinearRegressor(feature_columns=feature_columns)

# Train the model.
# By default, the Estimators log output every 100 steps.
model.train(input_fn=input_train, steps=STEPS)

# Evaluate how the model performs on data it has not yet seen.
eval_result = model.evaluate(input_fn=input_test)

# The evaluation returns a Python dictionary. The "average_loss" key holds the
# Mean Squared Error (MSE).
average_loss = eval_result["average_loss"]

# Convert MSE to Root Mean Square Error (RMSE).
print("\n" + 80 * "*")
print("\nRMS error for the test set: ${:.0f}"
      .format(PRICE_NORM_FACTOR * average_loss**0.5))


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmprpa25oud', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff183598630>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph wa

### Run a Prediction

Try inputting a couple inputs



In [None]:
# Run a Prediction

# Run the model in prediction mode.
input_dict = {
    "curb-weight": np.array([2000, 3000]),
    "highway-mpg": np.array([30, 40])
}
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    input_dict, shuffle=False)
predict_results = model.predict(input_fn=predict_input_fn)

  # Print the prediction results.
print("\nPrediction results:")
for i, prediction in enumerate(predict_results):
  msg = ("Curb weight: {: 4d}lbs, "
         "Highway: {: 0d}mpg, "
         "Prediction: ${: 9.2f}")
  msg = msg.format(input_dict["curb-weight"][i], input_dict["highway-mpg"][i],
                   PRICE_NORM_FACTOR * prediction["predictions"][0])

  print("    " + msg)
print()



### Do the same for two categorical columns



In [22]:

# The following code demonstrates two of the ways that `feature_columns` can
# be used to build a model with categorical inputs.

# The first way assigns a unique weight to each category. To do this, you must
# specify the category's vocabulary (values outside this specification will
# receive a weight of zero).
# Alternatively, you can define the vocabulary in a file (by calling
# `categorical_column_with_vocabulary_file`) or as a range of positive
# integers (by calling `categorical_column_with_identity`)
body_style_vocab = ["hardtop", "wagon", "sedan", "hatchback", "convertible"]
body_style_column = tf.feature_column.categorical_column_with_vocabulary_list(
    key="body-style", vocabulary_list=body_style_vocab)

# The second way, appropriate for an unspecified vocabulary, is to create a
# hashed column. It will create a fixed length list of weights, and
# automatically assign each input category to a weight. Due to the
# pseudo-randomness of the process, some weights may be shared between
# categories, while others will remain unused.
make_column = tf.feature_column.categorical_column_with_hash_bucket(
    key="make", hash_bucket_size=50)

feature_columns = [
    tf.feature_column.numeric_column(key="curb-weight"),
    tf.feature_column.numeric_column(key="highway-mpg"),
    # This model adds two categorical colums that will adjust the price based
    # on "make" and "body-style".
    body_style_column,
    make_column,
]


In [23]:
# Run the training
# Build the Estimator.
model = tf.estimator.LinearRegressor(feature_columns=feature_columns)

# Train the model.
# By default, the Estimators log output every 100 steps.
model.train(input_fn=input_train, steps=STEPS)

# Evaluate how the model performs on data it has not yet seen.
eval_result = model.evaluate(input_fn=input_test)

# The evaluation returns a Python dictionary. The "average_loss" key holds the
# Mean Squared Error (MSE).
average_loss = eval_result["average_loss"]

# Convert MSE to Root Mean Square Error (RMSE).
print("\n" + 80 * "*")
print("\nRMS error for the test set: ${:.0f}"
      .format(PRICE_NORM_FACTOR * average_loss**0.5))

print()


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp8lfk73hv', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff1924d6908>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph wa

In [39]:
#Run a Prediction

# Run the model in prediction mode.
input_dict = {
    "body-style": np.array(["hatchback", "sedan"]),
    "make": np.array(["Honda", "Ford"]),
    "curb-weight": np.array([2000, 3000]),
    "highway-mpg": np.array([30, 40])
}
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    input_dict, shuffle=False)
predict_results = model.predict(input_fn=predict_input_fn)

  # Print the prediction results.
print("\nPrediction results:")
for i, prediction in enumerate(predict_results):
  msg = ("Curb weight: {: 4d}lbs, "
         "Highway: {: 0d}mpg, "
         "Make: {}, "
         "Body Style: {}, "
         "Prediction: ${: 9.2f}")
  msg = msg.format(input_dict["curb-weight"][i], input_dict["highway-mpg"][i], input_dict["make"][i], input_dict["body-style"][i],
                   PRICE_NORM_FACTOR * prediction["predictions"][0])

  print("    " + msg)
print()




Prediction results:
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp8lfk73hv/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
    Curb weight:  2000lbs, Highway:  30mpg, Make: Honda, Body Style: hatchback, Prediction: $  8541.20
    Curb weight:  3000lbs, Highway:  40mpg, Make: Ford, Body Style: sedan, Prediction: $ 14507.09



# Add Some more features

Try adding some more features to your model.  DOes it give you a more accurate result?  Make sure if the feature is categorical, that you handle it appropriately.