In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load melb_data.csv (path relative to the notebook's working directory) into a DataFrame.
dt = pd.read_csv(filepath_or_buffer='melb_data.csv')

In [3]:
# List the column labels of the loaded frame.
dt.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [4]:
# Per-column dtype and non-null count; reveals which columns contain missing values.
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [5]:
# Number of distinct values in each column.
dt.nunique()

Suburb             314
Address          13378
Rooms                9
Type                 3
Price             2204
Method               5
SellerG            268
Date                58
Distance           202
Postcode           198
Bedroom2            12
Bathroom             9
Car                 11
Landsize          1448
BuildingArea       602
YearBuilt          144
CouncilArea         33
Lattitude         6503
Longtitude        7063
Regionname           8
Propertycount      311
dtype: int64

In [6]:
# Keep only the rows that have no missing value in any column. Rebinding `dt`
# discards the unfiltered frame until the read_csv cell is run again.
dt = dt.dropna(axis='index', how='any')

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
# Encoder used in the next cell to turn the Regionname strings into integer codes.
label = LabelEncoder()

In [9]:
# Replace the Regionname strings with LabelEncoder's integer codes.
# `assign` builds a new frame instead of writing a column into the frame that
# dropna() returned, which can trigger pandas' SettingWithCopyWarning.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for targets (y);
# its integer codes also impose an arbitrary ordering on the regions. Consider
# OrdinalEncoder / OneHotEncoder for input features.
dt = dt.assign(Regionname=label.fit_transform(dt['Regionname']))

In [10]:
# Prediction target: the Price column of the cleaned frame.
y = dt.Price                                  # same Series as dt['Price']

In [11]:
# Columns fed to the models. 'Lattitude' / 'Longtitude' are spelled exactly as
# they appear in the CSV header.
features = [
    'Lattitude', 'Longtitude',
    'Rooms', 'Bathroom',
    'Landsize', 'Bedroom2',
    'YearBuilt', 'BuildingArea',
    'Regionname',  # label-encoded to integers earlier in the notebook
]

In [12]:
# Feature matrix: every row of the cleaned frame, restricted to the chosen columns.
x = dt.loc[:, features]

In [13]:
# Display the feature matrix (the rendered output elides the middle rows).
x

Unnamed: 0,Lattitude,Longtitude,Rooms,Bathroom,Landsize,Bedroom2,YearBuilt,BuildingArea,Regionname
1,-37.80790,144.99340,2,1.0,156.0,2.0,1900.0,79.00,2
2,-37.80930,144.99440,3,2.0,134.0,3.0,1900.0,150.00,2
4,-37.80720,144.99410,4,1.0,120.0,3.0,2014.0,142.00,2
6,-37.80240,144.99930,3,2.0,245.0,4.0,1910.0,210.00,2
7,-37.80600,144.99540,2,1.0,256.0,2.0,1890.0,107.00,2
...,...,...,...,...,...,...,...,...,...
12205,-37.51232,145.13282,3,2.0,972.0,3.0,1996.0,149.00,3
12206,-37.86558,144.90474,3,1.0,179.0,3.0,1890.0,115.00,6
12207,-37.85588,144.89936,1,1.0,0.0,1.0,1967.0,35.64,6
12209,-37.85581,144.99025,2,1.0,0.0,2.0,2012.0,61.60,5


In [14]:
from sklearn.tree import DecisionTreeRegressor

In [15]:
# Baseline model: an unconstrained decision tree fitted on every row of x / y.
# A fixed random_state makes the fitted tree, and every number derived from it
# in later cells, repeatable across kernel restarts.
data = DecisionTreeRegressor(random_state=42)
data.fit(x, y)

DecisionTreeRegressor()

In [16]:
# Show the first five feature rows followed by the tree's predictions for them.
first_rows = x.head()
print("Making predictions for the following 5 houses:")
print(first_rows)
print("The predictions are")
print(data.predict(first_rows))

Making predictions for the following 5 houses:
   Lattitude  Longtitude  Rooms  Bathroom  Landsize  Bedroom2  YearBuilt  \
1   -37.8079    144.9934      2       1.0     156.0       2.0     1900.0   
2   -37.8093    144.9944      3       2.0     134.0       3.0     1900.0   
4   -37.8072    144.9941      4       1.0     120.0       3.0     2014.0   
6   -37.8024    144.9993      3       2.0     245.0       4.0     1910.0   
7   -37.8060    144.9954      2       1.0     256.0       2.0     1890.0   

   BuildingArea  Regionname  
1          79.0           2  
2         150.0           2  
4         142.0           2  
6         210.0           2  
7         107.0           2  
The predictions are
[1035000. 1465000. 1600000. 1876000. 1636000.]


## Validation of the model

In [17]:
from sklearn.metrics import mean_absolute_error

In [18]:
# In-sample predictions: `data` was fitted on this same x, so these are not
# predictions for rows the model has never seen.
predicted_price = data.predict(x)

In [19]:
# Mean absolute error of the baseline tree on the very rows it was trained on;
# an optimistic figure, not an estimate of error on new data.
mean_absolute_error(y_true=y, y_pred=predicted_price)

434.0703679793415

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Hold out a test set for evaluation. random_state pins the shuffle so the
# split, and every score computed from it, is reproducible.
# NOTE(review): test_size=.02 reserves only 2% of the rows for testing, which
# makes the test scores noisy; confirm this was not meant to be 0.2.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=.02, random_state=42)

In [22]:
# Sanity-check the split: print the shape of each piece, features first, then targets.
for part in (xtrain, xtest, ytrain, ytest):
    print(part.shape)

(6072, 9)
(124, 9)
(6072,)
(124,)


In [23]:
# Depth-limited tree (max_depth=20) with criterion and splitter spelled out.
# random_state is fixed so repeated fits on the same split give the same tree.
data_2 = DecisionTreeRegressor(criterion='squared_error', splitter='best', max_depth=20, random_state=42)

In [24]:
# Fit the depth-limited tree on the training split only.
data_2.fit(xtrain, ytrain)

DecisionTreeRegressor(max_depth=20)

In [25]:
# Score on the held-out split, scaled to a percentage
# (score() is R^2 for regressors according to the scikit-learn docs).
data_2.score(xtest, ytest) * 100

85.43827012855351

In [26]:
from sklearn.ensemble import RandomForestRegressor

In [27]:
# Random forest with default hyper-parameters. random_state is fixed so the
# bootstrap samples and feature draws, and therefore the scores below, are
# reproducible from run to run.
rdregr = RandomForestRegressor(random_state=42)

In [28]:
# Fit the forest on the training split only.
rdregr.fit(xtrain, ytrain)

RandomForestRegressor()

In [29]:
# Score on the training split (as a percentage); compare with the test-split
# score computed further down to gauge over-fitting.
rdregr.score(xtrain, ytrain) * 100

97.13530066617052

In [30]:
# Forest predictions for the held-out rows.
rdtr_predictions = rdregr.predict(xtest)

In [31]:
# Mean absolute error of the forest on the held-out split, in the same units as Price.
mean_absolute_error(y_true=ytest, y_pred=rdtr_predictions)

225562.97118279568

In [32]:
# Score of the forest on the held-out split, scaled to a percentage.
rdregr.score(xtest, ytest) * 100

81.02311434368674