# Week 15
# Deep Learning for Regression

So far, we have seen how neural networks are used for tasks such as image classification, text classification, and text generation. Today we will apply a neural network model to a regression task.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

2.4.1


## Load Auto MPG Dataset

This dataset is available from the [Kaggle.com](https://www.kaggle.com/uciml/autompg-dataset)

**Please follow the steps below to download this dataset to the Colab environment:**

1. Go to your Kaggle account, scroll to the API section, and click **Expire API Token** to remove previous tokens.

2. Click on **Create New API Token**. This downloads a `kaggle.json` file to your machine.

In [None]:
# 3. Install the kaggle API
! pip install kaggle



In [None]:
# 4. Upload the kaggle.json file
from google.colab import files

# Keep the return value in a variable: files.upload() returns the uploaded
# files' contents, and leaving the call as the cell's last expression would
# echo the API token into the notebook output.
uploaded = files.upload()

# Stop here with a clear message if the upload was cancelled or a different
# file was chosen; the following steps cannot work without kaggle.json.
if 'kaggle.json' not in uploaded:
    raise FileNotFoundError(
        "kaggle.json was not uploaded. Re-run this cell and select the "
        "kaggle.json file downloaded from your Kaggle account."
    )

# Show only the file names, never the contents.
print('Uploaded:', ', '.join(uploaded))

{}

In [None]:
# 5. Make a directory named kaggle and copy kaggle.json file there
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

# Change the permission of the file
! chmod 600 ~/.kaggle/kaggle.json

cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [None]:
# 6. Download and unzip the dataset
!kaggle datasets download -d uciml/autompg-dataset
!unzip autompg-dataset.zip

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python2.7/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python2.7/dist-packages/kaggle/api/kaggle_api_extended.py", line 146, in authenticate
    self.config_file, self.config_dir))
IOError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.
unzip:  cannot find or open autompg-dataset.zip, autompg-dataset.zip.zip or autompg-dataset.zip.ZIP.


## Attribute Information

1. mpg: continuous
2. cylinders: multi-valued discrete
3. displacement: continuous
4. horsepower: continuous
5. weight: continuous
6. acceleration: continuous
7. model year: multi-valued discrete
8. origin: multi-valued discrete
9. car name: string (unique for each instance)

The goal of this project is to train a model that predicts "MPG" using other features.

In [None]:
# Read the Auto MPG CSV file into a DataFrame.
# "?" entries are parsed as missing values (NaN).
filename = 'auto-mpg.csv'
read_options = dict(sep=",", na_values="?", comment='\t', skipinitialspace=True)
dataset = pd.read_csv(filename, **read_options)

# Preview the first rows
dataset.head()

FileNotFoundError: ignored

In [None]:
# Remove the "car name" column: the model should learn the general
# relationship between mpg and the other factors, not individual cars.
dataset = dataset.drop(columns=['car name'])
# Equivalent alternatives (each returns a new DataFrame that must be assigned):
# dataset = dataset.drop(['car name'], axis=1)
# dataset = dataset.iloc[:, :8]   # assumes "car name" is the 9th and last column
dataset.head()

## Data Preprocessing
- Check for missing values
- Handle categorical features
- Split into training and test set

In [None]:
# Are there any missing values?
# If so, how should we handle them?
# isnull is a method and has to be called: isnull() flags the missing entries
# and sum() then counts them per column.
missing_counts = dataset.isnull().sum().sort_values(ascending=False)
# Only the last expression of a cell is displayed automatically, so the
# counts are printed explicitly.
print(missing_counts)
dataset.describe()

In [None]:
# Fill the missing horsepower values with the column mean
# (the median is a reasonable alternative).
# The mean is computed from the data rather than typed in as the constant
# 104.47, so it stays correct if the dataset changes.
horsepower_mean = dataset['horsepower'].mean()
dataset['horsepower'] = dataset['horsepower'].fillna(horsepower_mean)
# Re-count the missing values per column to check the result
dataset.isnull().sum().sort_values(ascending=False)

In [None]:
# Look at how the data is distributed,
# e.g. a histogram of the mpg column
dataset['mpg'].plot(kind='hist')

In [None]:
# "origin" is stored as numeric codes but is really a categorical feature,
# so each code is replaced by a region name.
origin_names = {1: 'USA', 2: 'Europe', 3: 'Japan'}
dataset['origin'] = dataset['origin'].map(origin_names)
dataset.head()

In [None]:
# One-hot encode the categorical columns; the empty prefix and separator
# leave each new column named after the bare category value.
dataset = pd.get_dummies(
    data=dataset,
    prefix='',
    prefix_sep='',
)
dataset.head()

In [None]:
# Split the data into df_train (60%), df_validation (20%), and df_test (20%)
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle, and therefore the three subsets,
# the same on every run.
df_train, temp = train_test_split(dataset, test_size=0.4, random_state=42)
# Halve the held-out 40% into the validation set and the test set
df_validation, df_test = train_test_split(temp, test_size=0.5, random_state=42)
print(df_train.shape, df_test.shape, df_validation.shape)

## Inspect the Data
- Inspect the distribution of each individual feature:
    - descriptive statistics
    - histogram
- Inspect the relationship between features:
    - correlation coefficients
    - scatter plots

In [None]:
# Compute descriptive statistics: min, max, mean, median, standard deviation
# (describe() reports the median as the 50% row)
df_train.describe()


In [None]:
# Scatter-plot matrix of the training data: a histogram for each numerical
# variable and a scatter plot for each pair of variables
pd.plotting.scatter_matrix(frame=df_train, figsize=(15, 15))
plt.show()

In [None]:
# Calculate the correlation coefficient between MPG and every other feature



## Build the Model
- Transform data into proper format
- Normalize data
- Build neural network
- Specify loss function and training method

In [None]:
# Separate the "mpg" target from the input features.
df_train_labels = df_train['mpg']
# Select the features by dropping the target column by name rather than by
# position, so the result does not depend on where "mpg" sits in the frame.
df_train_data = df_train.drop(columns=['mpg'])
df_train_data.head()

In [None]:
# Perform this transformation to the validation set and the test set.




In [None]:
# Standardize the features with the transformation
#   x <-- (x - mean) / std
# After this transform, each training column has:
#   1. an average value of 0
#   2. a standard deviation of 1
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Learn the per-column statistics from the training features, then apply them
scaler.fit(df_train_data)
df_train_scaled = scaler.transform(df_train_data)
df_train_scaled

In [None]:
# Don't forget to use the same scaler to transform the validation set and the test set
# Features are selected by dropping the "mpg" target by name, so the result
# does not depend on the column order.
df_validation_data = df_validation.drop(columns=['mpg'])
df_validation_label = df_validation['mpg']
# transform() only (no fit): the validation data is scaled with the statistics
# already learned by the fitted `scaler` object.
df_validation_scaled = scaler.transform(df_validation_data)
df_validation_scaled

In [None]:
# Build a 3-layer neural network:
# 1. input layer (what is the input shape?)
# 2. hidden layer with 64 nodes and ReLU activation
# 3. output layer (what is the output shape?)



In [None]:
# Display a summary of the model



In [None]:
# Use model.compile() to specify:
# 1. loss = 'mse'
# 2. optimizer = tf.keras.optimizers.RMSprop(0.001)
# 3. metrics = ['mae', 'mse']



## Train the Model
- Train the model
- Analyze the loss curve
- Improve the model

In [None]:
# Number of passes over the training data
EPOCHS = 1000

# Fit the model on the scaled training features.
# validation_split=0.2 has Keras set aside 20% of the supplied training data
# for validation; verbose=0 turns off the per-epoch log.
history = model.fit(
    x=df_train_scaled,
    y=df_train_labels,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
)

In [None]:
# Gather the statistics stored in the "history" object into a DataFrame,
# with the epoch number as an extra column, to inspect training progress
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.head()

In [None]:
# Plot training MAE and validation MAE against epochs



In [None]:
# Plot training MSE and validation MSE against epochs



## Evaluate the Model
- Evaluate the model on the test set
- Visualize the predictions

In [None]:
# Evaluate the performance on the test set



In [None]:
# Visualize model predictions against true values



In [None]:
# Plot the histogram of prediction errors

