# Week 13
# Deep Learning for Regression

So far, we have seen how neural networks are used for classification tasks, such as image classification, text classification, and text generation. Today we will apply a neural network model to a regression task.

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

# Print the version string of the imported TensorFlow package.
print(tf.__version__)

2.1.0


## Load Auto MPG Dataset

This dataset is available from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/auto+mpg)

**Attribute Information**:

1. mpg: continuous
2. cylinders: multi-valued discrete
3. displacement: continuous
4. horsepower: continuous
5. weight: continuous
6. acceleration: continuous
7. model year: multi-valued discrete
8. origin: multi-valued discrete
9. car name: string (unique for each instance)

The goal of this project is to train a model that predicts "MPG" using other features.

In [4]:
# Download the Auto MPG file and read it into a DataFrame.
# The file has no header row, so the column names are supplied here.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]

raw_dataset = pd.read_csv(
    url,
    names=column_names,
    sep=" ",                # fields are separated by spaces ...
    skipinitialspace=True,  # ... and spaces following a separator are skipped
    comment='\t',           # the rest of a line after a tab is ignored
    na_values="?",          # "?" entries are read as NaN
)

# Work on a copy so the frame as downloaded stays available in raw_dataset.
dataset = raw_dataset.copy()
dataset.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


## Data Preprocessing
- Check for missing values
- Handle categorical features
- Split into training and test set

In [None]:
# Are there any missing values?
# If so, how should we handle them?



In [5]:
# The "Origin" column is really categorical, not numeric.
# replace() is used rather than map() so this cell is safe to run more than
# once: map() turns every value that is not a dictionary key into NaN, which
# would blank out the column if it already held the country names, whereas
# replace() leaves such values untouched.
origin_names = {1: 'USA', 2: 'Europe', 3: 'Japan'}
dataset['Origin'] = dataset['Origin'].replace(origin_names)
dataset.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,USA
1,15.0,8,350.0,165.0,3693.0,11.5,70,USA
2,18.0,8,318.0,150.0,3436.0,11.0,70,USA
3,16.0,8,304.0,150.0,3433.0,12.0,70,USA
4,17.0,8,302.0,140.0,3449.0,10.5,70,USA


In [6]:
# Convert "Origin" to a one-hot vector (one indicator column per country,
# named after the country itself because prefix and separator are empty).
# The dtype is pinned explicitly: the default of get_dummies differs between
# pandas releases (uint8 before 2.0, bool from 2.0 onwards), and the later
# steps of this notebook (descriptive statistics, (x - mean) / std
# normalization) expect numeric 0/1 indicator columns.
dataset = pd.get_dummies(dataset, prefix='', prefix_sep='', dtype=np.uint8)
dataset.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Europe,Japan,USA
0,18.0,8,307.0,130.0,3504.0,12.0,70,0,0,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,0,0,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,0,0,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,0,0,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,0,0,1


In [None]:
# Split the data into training set (80%) and test set (20%)



## Inspect the Data
- Inspect the distribution of each individual feature:
    - descriptive statistics
    - histogram
- Inspect the relationship between features:
    - correlation coefficients
    - scatter plots

In [None]:
# Compute descriptive statistics: min, max, mean, median, standard deviation



In [None]:
# Plot histogram and pairwise scatter plots for numerical variables



In [None]:
# Calculate the correlation coefficient between MPG and every other feature



## Build the Model
- Transform data into proper format
- Normalize data
- Build neural network
- Specify loss function and training method

In [None]:
# Separate the "MPG" feature from the rest.



In [None]:
# Normalize the data by applying the following transformation:
# x <-- (x - mean) / std



In [None]:
# Build a 3-layer neural network:
# 1. input layer (what is the input shape?)
# 2. hidden layer with 64 nodes and ReLU activation
# 3. output layer (what is the output shape?)



In [None]:
# Use model.compile() to specify:
# 1. loss = 'mse'
# 2. optimizer = tf.keras.optimizers.RMSprop(0.001)
# 3. metrics = ['mae', 'mse']



In [None]:
# Display a summary of the model



## Train the Model
- Train the model
- Analyze the loss curve
- Improve the model

In [None]:
# Number of full passes over the training data.
EPOCHS = 1000

# Fit the model on the normalized training features and their labels.
# A fraction of 0.2 of the training data is set aside for validation, and
# verbose=0 turns off the per-epoch progress output.
history = model.fit(
    x=normed_train_data,
    y=train_labels,
    validation_split=0.2,
    epochs=EPOCHS,
    verbose=0,
)

In [None]:
# Collect the statistics recorded in the "history" object into a table
# and append the epoch index as an extra column, then preview the
# first few rows.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.head()

In [None]:
# Plot training MAE and validation MAE against epochs



In [None]:
# Plot training MSE and validation MSE against epochs



In [None]:
# Re-train the model with 100 epochs



## Evaluate the Model
- Evaluate the model on the test set
- Visualize the predictions

In [7]:
# Evaluate the performance on the test set



In [8]:
# Visualize model predictions against true values



In [None]:
# Plot the histogram of prediction errors

