In [32]:
# import required libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score

In [3]:
# read the Melbourne housing CSV (bare filename, so it is resolved against the
# current working directory) and preview its first two rows
csv_path = 'Melbourne_housing_FULL.csv'
dataset = pd.read_csv(csv_path)
dataset.head(n=2)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0


In [4]:
# list the name of every column in the loaded frame
dataset.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [6]:
# narrow the raw frame to the predictor columns plus the 'Price' column
predictor_columns = [
    'Distance', 'Bathroom', 'Bedroom2', 'Car',
    'Landsize', 'BuildingArea', 'Propertycount', 'YearBuilt',
]
final_dataset = dataset[predictor_columns + ['Price']]
final_dataset

Unnamed: 0,Distance,Bathroom,Bedroom2,Car,Landsize,BuildingArea,Propertycount,YearBuilt,Price
0,2.5,1.0,2.0,1.0,126.0,,4019.0,,
1,2.5,1.0,2.0,1.0,202.0,,4019.0,,1480000.0
2,2.5,1.0,2.0,0.0,156.0,79.0,4019.0,1900.0,1035000.0
3,2.5,2.0,3.0,1.0,0.0,,4019.0,,
4,2.5,2.0,3.0,0.0,134.0,150.0,4019.0,1900.0,1465000.0
...,...,...,...,...,...,...,...,...,...
34852,6.3,1.0,4.0,3.0,593.0,,6543.0,,1480000.0
34853,6.3,2.0,2.0,1.0,98.0,104.0,6543.0,2018.0,888000.0
34854,6.3,1.0,2.0,2.0,220.0,120.0,6543.0,2000.0,705000.0
34855,6.3,,,,,,6543.0,,1140000.0


In [7]:
# count the missing (NaN) entries in each selected column
final_dataset.isna().sum()

Distance             1
Bathroom          8226
Bedroom2          8217
Car               8728
Landsize         11810
BuildingArea     21115
Propertycount        3
YearBuilt        19306
Price             7610
dtype: int64

In [8]:
# discard every row that has at least one missing value, then report the
# (rows, columns) that remain
final_dataset = final_dataset.dropna(axis='index', how='any')
final_dataset.shape

(8895, 9)

In [9]:
# preview the first five rows of the frame
final_dataset.head(n=5)

Unnamed: 0,Distance,Bathroom,Bedroom2,Car,Landsize,BuildingArea,Propertycount,YearBuilt,Price
2,2.5,1.0,2.0,0.0,156.0,79.0,4019.0,1900.0,1035000.0
4,2.5,2.0,3.0,0.0,134.0,150.0,4019.0,1900.0,1465000.0
6,2.5,1.0,3.0,2.0,120.0,142.0,4019.0,2014.0,1600000.0
11,2.5,2.0,4.0,0.0,245.0,210.0,4019.0,1910.0,1876000.0
14,2.5,1.0,2.0,2.0,256.0,107.0,4019.0,1890.0,1636000.0


In [10]:
# pairwise correlation matrix of all columns, reduced to the 'Price' column:
# one coefficient per column, including Price against itself
price_correlations = final_dataset.corr()['Price']
price_correlations

Distance        -0.231367
Bathroom         0.463153
Bedroom2         0.460658
Car              0.209128
Landsize         0.058423
BuildingArea     0.507194
Propertycount   -0.059389
YearBuilt       -0.313820
Price            1.000000
Name: Price, dtype: float64

In [13]:
# split the frame into the response (y = 'Price') and the predictors
# (x = every other column)
target_column = 'Price'
y = final_dataset[target_column]
x = final_dataset.drop(columns=[target_column])

In [16]:
# Use the first 85% of rows (rounded down) for training and the remainder for
# testing. The cut is purely positional: rows keep their existing order and
# nothing is shuffled beforehand.
# NOTE(review): if the rows are grouped or sorted in the source file, the two
# parts may not be comparable samples — confirm an ordered split is intended.
train_fraction = 0.85
training_size = int(len(x) * train_fraction)

x_train = x.iloc[:training_size]
x_test = x.iloc[training_size:]
for part in (x_train, x_test):
    print(part.shape)

(7560, 8)
(1335, 8)


In [18]:
# Split the target at the same positional cut (training_size) that was used
# for x, so the rows of x_train/x_test line up with y_train/y_test.
# .iloc is used on purpose: y carries integer index labels, and a bare
# y[a:b] slice on an integer-labelled Series is easy to misread as
# label-based. .iloc states the positional intent explicitly and matches
# how x is split.
y_train = y.iloc[:training_size]
y_test = y.iloc[training_size:]
print(y_train.shape)
print(y_test.shape)

(7560,)
(1335,)


In [20]:
# plain linear regression, fitted on the training split
lr = LinearRegression()
lr.fit(X=x_train, y=y_train)

In [21]:
# predicted prices for the rows of the test split
y_pred = lr.predict(X=x_test)
y_pred

array([ 829710.98203876, 1231760.74174598,  973304.54625116, ...,
        753954.42417132,  694704.98788951,  918372.67201374])

In [29]:
# R^2 (coefficient of determination) of the predictions on the test split;
# note that this is a goodness-of-fit score, not a mean squared error
r2_score(y_true=y_test, y_pred=y_pred)

0.5323599974682857

In [30]:
# score of the linear model on the training split
lr.score(X=x_train, y=y_train)

0.525646499204478

In [31]:
# score of the linear model on the test split
lr.score(X=x_test, y=y_test)

0.5323599974682857

In [33]:
# Lasso regression on the same training split; the last line reports its
# score on the test split
lasso = Lasso(alpha=50, max_iter=100, tol=0.1).fit(x_train, y_train)
lasso.score(X=x_test, y=y_test)

0.5323749690970537

In [34]:
# score of the Lasso model on the training split
lasso.score(X=x_train, y=y_train)

0.5256464782804249

In [35]:
# Ridge regression on the same training split; the last line reports its
# score on the test split
ridge = Ridge(alpha=50, max_iter=100, tol=0.1).fit(x_train, y_train)
ridge.score(X=x_test, y=y_test)

0.5326571462025974

In [36]:
# score of the Ridge model on the training split
ridge.score(X=x_train, y=y_train)

0.5256298641913028