## 1. Import libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Any results you write to the current directory are saved as output.

In [2]:
# Importing tensorflow and keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense, Softmax
from tensorflow.keras import optimizers

In [3]:
# Checking the tensorflow libraries  
print(tf.__version__)


2.3.0


# 2. Data exploration

In [5]:
#Loading the dataset

housing_df=pd.read_csv("housing.csv")

In [6]:
housing_df.shape

(20640, 10)

In [7]:
# Displaying the data 

housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [8]:
# Checking for the null values 

housing_df.isna().any()

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms         True
population            False
households            False
median_income         False
median_house_value    False
ocean_proximity       False
dtype: bool

In [9]:
#Count of the null values in each columns

housing_df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [10]:
# We can drop the null values as their count is less than 5 %

housing_df.dropna(inplace=True)

In [11]:
housing_df.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [12]:
housing_df.shape

(20433, 10)

In [13]:
# Dividing the dataset into independant and dependant variables 
X=pd.DataFrame(columns=['longitude','latitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','ocean_proximity'],data=housing_df)
y=pd.DataFrame(columns=['median_house_value'],data=housing_df)

In [14]:
X.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY


In [15]:
y.head()

Unnamed: 0,median_house_value
0,452600.0
1,358500.0
2,352100.0
3,341300.0
4,342200.0


In [16]:
#Creating the dummy values for ocean_proximity

X = pd.get_dummies(data = X, columns = ['ocean_proximity'] , prefix = ['ocean_proximity'] , drop_first = True)

In [17]:
X.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,0,0,1,0


# 3. Feature Scaling and test train split

In [18]:
#Dividing the training data into test and train 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)


In [19]:
X_train.shape

(14303, 12)

In [20]:
#Feature Standardization

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
X_train

array([[-1.14653761,  0.44712894, -0.68468262, ..., -0.01182583,
        -0.35564951, -0.38507396],
       [ 0.85461023, -0.83616485, -0.92275856, ..., -0.01182583,
        -0.35564951, -0.38507396],
       [ 0.58013858, -0.68629112,  0.42633841, ..., -0.01182583,
        -0.35564951, -0.38507396],
       ...,
       [-1.34615336,  1.18713047, -0.44660669, ..., -0.01182583,
         2.81175703, -0.38507396],
       [ 0.84961983, -0.88300039, -1.55762772, ..., -0.01182583,
        -0.35564951, -0.38507396],
       [ 2.03234312, -1.38414066, -0.36724804, ..., -0.01182583,
        -0.35564951, -0.38507396]])

# ANN

In [22]:
model = Sequential()

#Input Layer
model.add(Dense(X.shape[1], activation='relu', input_dim = X.shape[1]))

#Hidden Layer
model.add(Dense(512,kernel_initializer='normal', activation='relu'))
model.add(Dense(512,kernel_initializer='normal', activation='relu'))
model.add(Dense(256,kernel_initializer='normal', activation='relu'))
model.add(Dense(128,kernel_initializer='normal', activation='relu'))
model.add(Dense(64,kernel_initializer='normal', activation='relu'))
model.add(Dense(32,kernel_initializer='normal', activation='relu'))
#Output Layer
model.add(Dense(1,kernel_initializer='normal', activation = 'relu'))


In [23]:
X.shape[1]


12

In [24]:
#Compile the network 

model.compile(loss='mse', optimizer='adam', metrics=['mse','mae'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 12)                156       
_________________________________________________________________
dense_1 (Dense)              (None, 512)               6656      
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_3 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_4 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_5 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_6 (Dense)              (None, 32)                2

In [25]:
history = model.fit(X_train, y_train.to_numpy(), batch_size = 10, epochs = 10, verbose = 1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
y_pred = model.predict(X_test)


In [26]:
y_pred


array([[150001.45 ],
       [182087.66 ],
       [110598.73 ],
       ...,
       [143184.38 ],
       [ 76335.414],
       [199055.4  ]], dtype=float32)

In [27]:
y_test


Unnamed: 0,median_house_value
14185,98900.0
6125,153000.0
14095,91300.0
14359,345200.0
18004,344100.0
...,...
20450,280200.0
14681,264000.0
2639,76800.0
2935,42700.0


In [28]:
model.evaluate(X_test, y_test)




[3609943222.353018, 3609943800.0, 41766.19]