<a href="https://colab.research.google.com/github/cagBRT/Machine-Learning/blob/master/boston_housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Clone the entire repo; later cells read boston_housing.csv by relative path.
# NOTE(review): -l/-s are git's local-clone options; presumably they have no
# effect on an https URL -- confirm before relying on them.
!git clone -l -s https://github.com/cagBRT/Machine-Learning.git cloned-repo
# Make the clone the working directory for the rest of the notebook.
# NOTE(review): re-running this cell in the same session presumably fails the
# clone (target directory already exists) and nests the %cd -- run it once.
%cd cloned-repo
# List the directory contents to confirm the checkout.
!ls

# **Can the median home value (medv) in the Boston housing market be predicted from the given feature set?**

# **The dataset definitions**

CRIM: This column represents per capita crime rate by town<br>
ZN: This column represents the proportion of residential land zoned for lots larger than 25,000 sq.ft.<br>
INDUS: This column represents the proportion of non-retail business acres per town.<br>
CHAS: This column represents the Charles River dummy variable (this is equal to 1 if tract bounds river; 0 otherwise)<br>
NOX: This column represents the nitric oxides concentration (parts per 10 million)<br>
RM: This column represents the average number of rooms per dwelling<br>
AGE: This column represents the proportion of owner-occupied units built prior to 1940<br>
DIS: This column represents the weighted distances to five Boston employment centers<br>
RAD: This column represents the index of accessibility to radial highways<br>
TAX: This column represents the full-value property-tax rate per \$10,000 <br>
PTRATIO: This column represents the pupil-teacher ratio by town <br>
B: This is calculated as 1000(Bk - 0.63)², where Bk is the proportion of people of African American descent by town<br>
LSTAT: This is the percentage of the population classified as lower status<br>
MEDV: This is the median value of owner-occupied homes in \$1000s<br>

# **Load the libraries**

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Request TensorFlow 2.x via the Colab-only magic (nothing is installed here).
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  # Best-effort: if the magic is unavailable, fall through and use whatever
  # TensorFlow the environment already provides.
  pass

import tensorflow as tf
import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

from keras.utils import to_categorical
from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

# **Get the data**

In [0]:
# Load the dataset from the current working directory into a DataFrame.
boston_housing_data = pd.read_csv("boston_housing.csv", sep=',')

In [0]:
# Show the last rows to sanity-check that the columns were parsed as expected.
boston_housing_data.tail()

In [0]:
# Count missing values per column.
boston_housing_data.isna().sum()

# **Data Correlation**

In [0]:
# Pairwise correlation between every column of the full dataset (medv included).
corr = boston_housing_data.corr()

# Use the column names as tick labels on both axes of the heatmap.
heatmap_options = dict(xticklabels=corr.columns.values,
                       yticklabels=corr.columns.values)
sns.heatmap(corr, **heatmap_options)
plt.show()

In [0]:
# Randomly sample 80% of the rows for training; random_state=0 fixes the sample.
train_dataset = boston_housing_data.sample(frac=0.8,random_state=0)
# The rows whose index labels were not sampled form the test set.
test_dataset = boston_housing_data.drop(train_dataset.index)
print("done")

In [0]:
# Summary statistics of the training split, one row per feature. norm() reads
# the 'mean' and 'std' columns of this table.
# drop(..., errors="ignore") replaces pop(): the cell no longer raises KeyError
# when it is re-run after 'medv' has already been popped from train_dataset.
train_stats = (
    train_dataset.describe()
    .drop(columns="medv", errors="ignore")  # the label is not a feature to normalize
    .transpose()  # rows = features, columns = statistics
)
train_stats

In [0]:
# Summary statistics of the test split, one row per feature, shown for
# comparison with the training split.
# drop(..., errors="ignore") replaces pop(): the cell no longer raises KeyError
# when it is re-run after 'medv' has already been popped from test_dataset.
test_stats = (
    test_dataset.describe()
    .drop(columns="medv", errors="ignore")  # the label is not a feature
    .transpose()  # rows = features, columns = statistics
)
test_stats

In [0]:
# Separate the target column from the features. pop() removes 'medv' from the
# DataFrames in place, so running this cell a second time without re-creating
# train_dataset/test_dataset raises KeyError.
train_labels = train_dataset.pop('medv')
test_labels = test_dataset.pop('medv')
print("done")

# **Normalize the data**

In [0]:
def norm(x):
  """Standardize ``x`` column-wise with the statistics held in ``train_stats``.

  Args:
    x: DataFrame whose columns match the row labels of the module-level
      ``train_stats`` table.

  Returns:
    ``(x - mean) / std`` computed per column, where ``mean`` and ``std`` are
    the 'mean' and 'std' columns of ``train_stats``.
  """
  center = train_stats['mean']
  spread = train_stats['std']
  return (x - center) / spread
# Both splits are scaled with the statistics of the *training* split (norm()
# reads train_stats), so the test data does not influence preprocessing.
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)
print("done")

# **The model**

In [0]:
# Number of columns currently in train_dataset (features only, once 'medv' has
# been popped); this is the width of the model's input layer.
inputs = len(train_dataset.keys())
print("number of inputs to the model = " + str(inputs))

def build_model():
  """Build and compile the regression network used to predict medv.

  Architecture: two hidden Dense layers of 8 ReLU units followed by a single
  output unit without activation. The input width is read from the
  module-level ``train_dataset`` at call time.

  Returns:
    A compiled ``keras.Sequential`` model using RMSprop (0.001), mean squared
    error as the loss, and MAE/MSE as reported metrics.
  """
  n_features = len(train_dataset.keys())

  model = keras.Sequential([
    layers.Dense(8, activation=tf.nn.relu, input_shape=[n_features]),
    # Optional layers to experiment with:
    #layers.Dropout(0.2),
    #layers.Dense(8, activation=tf.nn.relu),
    layers.Dense(8, activation=tf.nn.relu),
    layers.Dense(1)
  ])

  optimizer = tf.keras.optimizers.RMSprop(0.001)

  # The metric names are spelled out in full because plot_history() looks the
  # curves up under exactly these keys.
  model.compile(loss='mean_squared_error',
                optimizer=optimizer,
                metrics=['mean_absolute_error', 'mean_squared_error'])
  return model

In [0]:
# Instantiate the network once to check that build_model() runs.
# NOTE(review): the training cell calls build_model() again, so this instance
# is replaced there.
model = build_model()
print("done")

# **Train the model**

In [0]:
# Train silently (verbose=0); progress is inspected afterwards through `history`.

# Start from a freshly built model so that re-running this cell restarts training.
model = build_model()
# Upper bound on the number of epochs; early stopping may end training sooner.
EPOCHS = 1000

# Stop once val_loss has gone `patience` epochs without improving.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

# validation_split=0.2 holds back part of the training data to compute val_loss.
history = model.fit(normed_train_data, train_labels, epochs=EPOCHS,
                    validation_split = 0.2, verbose=0, callbacks=[early_stop])


In [0]:
def plot_history(history):
  """Plot training and validation error per epoch for a finished fit.

  Draws two figures, one for mean absolute error and one for mean squared
  error, each showing the training curve and the validation curve.

  Args:
    history: object exposing ``history`` (a dict of per-epoch metric lists with
      the keys 'mean_absolute_error', 'mean_squared_error' and their 'val_'
      counterparts) and ``epoch`` (the epoch indices), as returned by
      ``model.fit``.
  """
  hist = pd.DataFrame(history.history)
  hist['epoch'] = history.epoch

  # The errors are expressed in the units of the target column, medv.
  plt.figure()
  plt.xlabel('Epoch')
  plt.ylabel('Mean Abs Error [medv]')
  plt.plot(hist['epoch'], hist['mean_absolute_error'],
           label='Train Error')
  plt.plot(hist['epoch'], hist['val_mean_absolute_error'],
           label = 'Val Error')
  # Fixed y-range: larger errors (e.g. in the first epochs) are clipped.
  plt.ylim([0,5])
  plt.legend()

  plt.figure()
  plt.xlabel('Epoch')
  plt.ylabel('Mean Square Error [$medv^2$]')
  plt.plot(hist['epoch'], hist['mean_squared_error'],
           label='Train Error')
  plt.plot(hist['epoch'], hist['val_mean_squared_error'],
           label = 'Val Error')
  # Fixed y-range: larger errors (e.g. in the first epochs) are clipped.
  plt.ylim([0,20])
  plt.legend()
  plt.show()


# Visualise the learning curves recorded during training.
plot_history(history)

In [0]:
# Evaluate on the held-out test split. The returned values follow the compile()
# configuration: the loss first, then the metrics in the order listed (MAE, MSE).
loss, mae, mse = model.evaluate(normed_test_data, test_labels, verbose=1)

print("Testing set Mean Abs Error: {:5.2f} medv".format(mae))

In [0]:
# Predicted vs. true medv on the test split. flatten() turns the model output
# into a 1-D array so it lines up with test_labels.
test_predictions = model.predict(normed_test_data).flatten()

plt.scatter(test_labels, test_predictions)
# Both axes are in the units of the target column, medv.
plt.xlabel('True Values [medv]')
plt.ylabel('Predictions [medv]')
plt.axis('equal')
plt.axis('square')
# Start both axes at 0 while keeping the automatically chosen upper limits.
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.ylim()[1]])
# Identity line: points lying on it are perfect predictions.
_ = plt.plot([-100, 100], [-100, 100])
plt.show()

In [0]:
# Distribution of the signed prediction error (prediction minus true value)
# on the test split.
error = test_predictions - test_labels
plt.hist(error, bins = 25)
plt.xlabel("Prediction Error [medv]")
# Assigning to _ keeps the returned Text object from being echoed as cell output.
_ = plt.ylabel("Count")
plt.show()