<a href="https://colab.research.google.com/github/akchen1/CMPUT466-Project/blob/NN/NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q seaborn
!pip install -q tensorflow
!pip install -U scikit-learn
!pip install -U lime
!pip install -U pandas

Requirement already up-to-date: scikit-learn in /usr/local/lib/python3.7/dist-packages (0.24.1)
Requirement already up-to-date: lime in /usr/local/lib/python3.7/dist-packages (0.2.0.1)
Requirement already up-to-date: pandas in /usr/local/lib/python3.7/dist-packages (1.2.4)


In [27]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import tensorflow as tf
from sklearn import datasets, linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# Neural Network


In [42]:
# Columns kept for modelling: the target (Price) plus the predictors.
cols = [
    'Rooms', 'Type', 'Regionname', 'Price', 'Bedroom2', 'Bathroom',
    'Car', 'Landsize', 'YearBuilt', 'BuildingArea', 'Distance',
]

# Read the full CSV; the first row of the file holds the column names.
raw_dataset = pd.read_csv("./Melbourne_housing_FULL.csv", header=0)

In [43]:
# Work on an independent (deep) copy so raw_dataset stays untouched,
# then summarise column dtypes and non-null counts.
dataset = raw_dataset.copy(deep=True)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         34857 non-null  object 
 1   Address        34857 non-null  object 
 2   Rooms          34857 non-null  int64  
 3   Type           34857 non-null  object 
 4   Price          27247 non-null  float64
 5   Method         34857 non-null  object 
 6   SellerG        34857 non-null  object 
 7   Date           34857 non-null  object 
 8   Distance       34856 non-null  float64
 9   Postcode       34856 non-null  float64
 10  Bedroom2       26640 non-null  float64
 11  Bathroom       26631 non-null  float64
 12  Car            26129 non-null  float64
 13  Landsize       23047 non-null  float64
 14  BuildingArea   13742 non-null  float64
 15  YearBuilt      15551 non-null  float64
 16  CouncilArea    34854 non-null  object 
 17  Lattitude      26881 non-null  float64
 18  Longti

In [44]:
# Keep only the modelling columns, then drop every ROW that has a missing
# value in any of them. Starting from raw_dataset (rather than the already
# transformed `dataset`) keeps this cell safe to re-run.
dataset = raw_dataset[cols].dropna()

# One hot encode feature Type (H: houses, U: units, T: townhouses).
# The dtype is pinned because newer pandas versions (2.0+) emit bool dummies
# by default; uint8 keeps the frame purely numeric for the model.
type_dummies = pd.get_dummies(dataset['Type'], prefix='Type', dtype='uint8')
dataset = pd.concat([dataset, type_dummies], axis=1).drop(columns=['Type'])

# One hot encode feature Regionname (columns are named after the region itself)
regionname_dummies = pd.get_dummies(dataset['Regionname'], dtype='uint8')
df_region = pd.concat([dataset, regionname_dummies], axis=1).drop(columns=['Regionname'])
print(df_region.head())

    Rooms      Price  ...  Western Metropolitan  Western Victoria
2       2  1035000.0  ...                     0                 0
4       3  1465000.0  ...                     0                 0
6       4  1600000.0  ...                     0                 0
11      3  1876000.0  ...                     0                 0
14      2  1636000.0  ...                     0                 0

[5 rows x 20 columns]


In [45]:
# Code snippet from https://www.kaggle.com/medeirox/melbourne-house-pricing
# Inclusive upper bounds, in decades, of the buckets a building age is mapped to.
age_groups = [0, 1, 2, 3, 5, 10, 20, 100]

# Supporting function to one hot encode the year built feature into decade groups
def divide_data_by_age_groups(year_built, reference_year=None):
    """Map a construction year to an age bucket measured in decades.

    The age (``reference_year - year_built``) is rounded to the nearest decade
    (a remainder of 5 or more years rounds up) and then snapped to the first
    entry of ``age_groups`` that is greater than or equal to it.

    Parameters
    ----------
    year_built : number or null
        Year the building was constructed.
    reference_year : int, optional
        Year the age is measured from. Defaults to the current calendar year,
        which means the result changes over time; pass a fixed year for
        reproducible output.

    Returns
    -------
    number
        ``9999`` when ``year_built`` is null; otherwise the matching value from
        ``age_groups``, or the rounded age in decades itself when it exceeds
        the largest group.
    """
    if pd.isnull(year_built):
        return 9999  # sentinel bucket for an unknown construction year

    if reference_year is None:
        reference_year = datetime.datetime.now().year

    age = reference_year - year_built
    # Round to the nearest decade, with halves rounding up.
    age_decades = (age // 10) + (1 if (age % 10) >= 5 else 0)

    for group in age_groups:
        if age_decades <= group:
            return group
    # Older than the largest bucket: hand back the un-bucketed decade count.
    return age_decades

In [46]:
# One hot encode YearBuilt feature and group into decades
df_date = df_region.copy()
df_date['AgeInDecades'] = df_date['YearBuilt'].apply(divide_data_by_age_groups)

# Every row must fall into one of the known buckets; anything else (e.g. the
# null sentinel or an un-bucketed age) would otherwise end up mislabelled.
unexpected_groups = set(df_date['AgeInDecades'].unique()) - set(age_groups)
assert not unexpected_groups, 'Unexpected age groups: {}'.format(sorted(unexpected_groups))

# Reindex to the full bucket list so the set and order of columns never
# depends on which buckets happen to occur in the data (an empty bucket
# becomes an all-zero column instead of shifting the names).
decades_dummies = (
    pd.get_dummies(df_date['AgeInDecades'], dtype='uint8')
    .reindex(columns=age_groups, fill_value=0)
)
column_names = ['{}_Decades_Old'.format(group) for group in age_groups]
decades_dummies.columns = column_names

df_date = df_date.drop(['AgeInDecades', 'YearBuilt'], axis = 1)

df_date = pd.concat([df_date, decades_dummies], axis = 1)

# Drop the rows in the 100 decade bucket and then the (now all-zero) column.
# Found to improve results
df_date = df_date[df_date['100_Decades_Old'] != 1]
df_date = df_date.drop(['100_Decades_Old'], axis=1)

In [47]:
# Standardize the features Landsize, BuildingArea, and Distance so that their
# relative weights are similar.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed in a later cell, so test-set statistics influence the scaling.
col_names = ['Landsize', 'BuildingArea', 'Distance']
scaled_features = df_date.copy()

scaler = StandardScaler()
scaler.fit(scaled_features[col_names].values)

# `features` ends up as the scaled NumPy array that is written back in place
# of the original columns.
features = scaler.transform(scaled_features[col_names].values)
scaled_features[col_names] = features



In [49]:
# Separate the target (Price) from the predictors.
y = scaled_features['Price'].copy()
df = scaled_features.drop(columns=['Price'])

# 80% of the rows go to training, the remainder to testing; the fixed
# random_state makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    df, y, train_size=0.8, random_state=11
)
print(test_X.head())

       Rooms  Bedroom2  Bathroom  ...  5_Decades_Old  10_Decades_Old  20_Decades_Old
14726      3       3.0       3.0  ...              0               0               0
5237       3       3.0       1.0  ...              0               0               1
9697       3       3.0       1.0  ...              0               1               0
4965       4       3.0       1.0  ...              0               1               0
33794      2       2.0       1.0  ...              1               0               0

[5 rows x 25 columns]


In [51]:
# Seed TensorFlow's global random generator so weight initialisation and
# dropout are repeatable between runs (11 matches the random_state used for
# the train/test split).
tf.random.set_seed(11)

# Hand Keras plain float32 arrays: this fails early and clearly if a
# non-numeric column slipped through preprocessing, and avoids object-dtype
# conversion errors when the frame mixes column dtypes.
train_features = train_X.to_numpy(dtype='float32')
test_features = test_X.to_numpy(dtype='float32')

model = keras.Sequential()

# The input width follows the data instead of being hard-coded, so changes to
# the preprocessing (more or fewer dummy columns) do not break the model.
model.add(layers.Dropout(0.1, input_shape=(train_features.shape[1], )))
model.add(layers.Dense(100, activation='relu'))
model.add(layers.Dropout(0.1))
model.add(layers.Dense(50, activation='relu'))
# Single linear output unit: the predicted price.
model.add(layers.Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(train_features, train_y, epochs=200, shuffle=True, verbose=2)

predictions = model.predict(test_features)

Epoch 1/200
223/223 - 1s - loss: 1662725914624.0000
Epoch 2/200
223/223 - 0s - loss: 1643686526976.0000
Epoch 3/200
223/223 - 0s - loss: 1573969985536.0000
Epoch 4/200
223/223 - 0s - loss: 1433559367680.0000
Epoch 5/200
223/223 - 0s - loss: 1227397005312.0000
Epoch 6/200
223/223 - 0s - loss: 979211780096.0000
Epoch 7/200
223/223 - 0s - loss: 739540533248.0000
Epoch 8/200
223/223 - 0s - loss: 546099101696.0000
Epoch 9/200
223/223 - 0s - loss: 425064202240.0000
Epoch 10/200
223/223 - 0s - loss: 359035273216.0000
Epoch 11/200
223/223 - 0s - loss: 330431102976.0000
Epoch 12/200
223/223 - 0s - loss: 326844973056.0000
Epoch 13/200
223/223 - 0s - loss: 315949580288.0000
Epoch 14/200
223/223 - 0s - loss: 304867868672.0000
Epoch 15/200
223/223 - 0s - loss: 305156751360.0000
Epoch 16/200
223/223 - 0s - loss: 299042308096.0000
Epoch 17/200
223/223 - 0s - loss: 288349192192.0000
Epoch 18/200
223/223 - 0s - loss: 288759119872.0000
Epoch 19/200
223/223 - 0s - loss: 282312736768.0000
Epoch 20/200
223