In [None]:
# install libs

!pip install pandas
!pip install seaborn
!pip install tensorflow

In [None]:
# import modules

from __future__ import absolute_import, division, print_function

import pathlib

import pandas as pd
import seaborn as sns
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

# Show which TensorFlow version this kernel imported.
print(tf.__version__)


In [None]:
# get dataset
# Fetch the Auto MPG data file through Keras and print the local path it returns.
AUTO_MPG_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

dataset_path = keras.utils.get_file(fname="auto-mpg.data", origin=AUTO_MPG_URL)
print(dataset_path)

In [None]:
# read dataset
# Column labels are supplied explicitly through `names`.
column_names = [
    "MPG",
    "Cylinders",
    "Displacement",
    "Horsepower",
    "Weight",
    "Acceleration",
    "Model Year",
    "Origin",
]

# Parsing options: space-separated fields (extra spaces after a separator are
# skipped), "?" marks a missing value, and a tab starts a trailing comment.
read_options = dict(
    names=column_names,
    na_values="?",
    comment="\t",
    sep=" ",
    skipinitialspace=True,
)
raw_dataset = pd.read_csv(dataset_path, **read_options)

# Work on a copy so the parsed frame itself stays untouched.
dataset = raw_dataset.copy()

dataset.tail()

In [None]:
# clean
# Report the number of missing values per column. A bare expression in the
# middle of a cell is evaluated and then discarded, so the counts have to be
# printed explicitly to appear in the output.
print(dataset.isna().sum())

# Drop every row that contains at least one missing value.
dataset = dataset.dropna()
dataset.info()

In [None]:
# feature engineering

# one-hot
# Replace the numeric "Origin" code with one 0.0/1.0 indicator column per region.
origin = dataset.pop("Origin")

print(origin.unique())

for origin_code, region in ((1, "USA"), (2, "Europe"), (3, "Japan")):
    dataset[region] = (origin == origin_code) * 1.0


In [None]:
# split data
# Draw a seeded 80% sample of the rows for training; every row that was not
# sampled becomes part of the test set.
TRAIN_FRACTION = 0.8
SPLIT_SEED = 0

train_dataset = dataset.sample(frac=TRAIN_FRACTION, random_state=SPLIT_SEED)
test_dataset = dataset.drop(index=train_dataset.index)

In [None]:
# inspect
# Pairwise plots of a few training columns, with KDE curves on the diagonal.
pairplot_columns = ["MPG", "Cylinders", "Displacement", "Weight"]
sns.pairplot(train_dataset[pairplot_columns], diag_kind="kde")

In [None]:
# Summary statistics of the training columns, transposed so that each row is
# one column of the data. "MPG" is left out of the table.
train_stats = train_dataset.describe().drop(columns="MPG").transpose()
train_stats

In [None]:
# split labels from features

# pop() removes the target column from each frame and returns it, so the
# frames keep only the remaining columns afterwards.
LABEL_COLUMN = "MPG"

train_labels = train_dataset.pop(LABEL_COLUMN)
test_labels = test_dataset.pop(LABEL_COLUMN)

In [None]:
# normalize
# ranges of the features are too different

def normalize(x, stats=None):
    """Standardize `x` as (x - mean) / std using per-feature statistics.

    Args:
        x: Data to scale; in this notebook a DataFrame whose columns match
            the row labels of the statistics table.
        stats: Optional table exposing "mean" and "std" entries per feature.
            Defaults to the notebook-level `train_stats`, so existing
            single-argument calls behave exactly as before.

    Returns:
        The standardized data.
    """
    if stats is None:
        # Resolved at call time: the cell defining `train_stats` only has to
        # run before the first call, not before this definition.
        stats = train_stats
    return (x - stats["mean"]) / stats["std"]

# Both splits go through the same normalize() call, so the test data is scaled
# with the training statistics as well.
normalized_train_data, normalized_test_data = (
    normalize(split) for split in (train_dataset, test_dataset)
)

In [None]:
# build model

def build_model():
    """Create and compile the regression network.

    Two hidden Dense layers of 64 ReLU units feed a single-unit output layer.
    The input width is read from the notebook-level `train_dataset`, so that
    frame must already hold exactly the feature columns when this is called.

    Returns:
        A compiled `keras.Sequential` model with "mse" loss, an RMSprop(0.001)
        optimizer, and "mae"/"mse" as tracked metrics.
    """
    n_features = len(train_dataset.keys())

    hidden_in = layers.Dense(64, activation=tf.nn.relu, input_shape=[n_features])
    hidden_mid = layers.Dense(64, activation=tf.nn.relu)
    output = layers.Dense(1)
    network = keras.Sequential([hidden_in, hidden_mid, output])

    network.compile(
        loss="mse",
        optimizer=tf.keras.optimizers.RMSprop(0.001),
        metrics=["mae", "mse"],
    )
    return network

# Create a fresh, not-yet-trained model instance.
model = build_model()

In [None]:
# inspect model
# Print Keras' textual summary of the model held in `model`.

model.summary()

In [None]:
# Smoke test: run the model on the first ten normalized training rows and
# display the raw predictions.
small_batch = normalized_train_data.head(10)
small_result = model.predict(small_batch)
small_result

In [None]:
# train model

EPOCHS = 1000

# Train silently (verbose=0), with 20% of the training data set aside for
# validation.
history = model.fit(
    x=normalized_train_data,
    y=train_labels,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
)

# Per-epoch metrics as a table, with the epoch number alongside.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

In [None]:
import matplotlib.pyplot as plt

def _plot_metric_curves(hist, metric, ylabel, y_max):
    """Plot the training and validation curves of one metric on a new figure."""
    plt.figure()
    plt.xlabel("epoch")
    plt.ylabel(ylabel)
    plt.plot(hist["epoch"], hist[metric], label="Train Error")
    plt.plot(hist["epoch"], hist["val_" + metric], label="Val Error")
    plt.legend()
    plt.ylim([0, y_max])


def plot_history(history):
    """Plot per-epoch MAE and MSE for the training and validation data.

    Args:
        history: Object returned by `model.fit`. Its `history` dict must
            contain "mae", "val_mae", "mse" and "val_mse", and its `epoch`
            attribute supplies the x-axis values.

    Two figures are created: mean absolute error (y-axis limited to 0-5) and
    mean squared error (y-axis limited to 0-20).
    """
    hist = pd.DataFrame(history.history)
    hist["epoch"] = history.epoch

    _plot_metric_curves(hist, "mae", "mean abs error (MPG)", 5)
    # "mse" is the mean squared error, so its axis label must not read "abs".
    _plot_metric_curves(hist, "mse", "mean square error (MPG^2)", 20)
    
# Plot the training curves recorded in `history`.
plot_history(history)

In [None]:
# Rebuild the model so training starts from fresh weights, then train with
# early stopping on the validation loss.
model = build_model()

# The validation loss is logged under the key "val_loss". With the misspelled
# "var_loss" the callback never finds its metric, so it cannot stop training
# early and all EPOCHS are run.
early_stop = keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)

history = model.fit(normalized_train_data, train_labels, epochs=EPOCHS,
                    validation_split=0.2, verbose=0, callbacks=[early_stop])

plot_history(history)

In [None]:
# Score the model on the test split. The result unpacks into the loss followed
# by the two compiled metrics.
test_scores = model.evaluate(normalized_test_data, test_labels, verbose=0)
loss, mae, mse = test_scores

print("testing set mean abs error {:5.2f}".format(mae))

In [None]:
# Predicted vs. true MPG on the test split.
test_predictions = model.predict(normalized_test_data).flatten()

plt.scatter(test_labels, test_predictions)
plt.xlabel("true values")
plt.ylabel("predictions")
plt.axis("equal")
plt.axis("square")

# Start both axes at zero while keeping their current upper bounds.
x_upper = plt.xlim()[1]
plt.xlim([0, x_upper])
y_upper = plt.ylim()[1]
plt.ylim([0, y_upper])

# Reference diagonal: points on this line are perfect predictions.
diagonal = [-100, 100]
_ = plt.plot(diagonal, diagonal)


In [None]:
# Histogram of the signed prediction errors (prediction minus true value).
error = test_predictions - test_labels

N_BINS = 25
plt.hist(error, bins=N_BINS)
plt.xlabel("prediction error")
_ = plt.ylabel("count")
