In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
dataset =pd.read_csv('melb_data.csv')
dataset.head(20)

In [None]:
dataset.describe().T

In [None]:
dataset.isnull().values.any()

In [13]:
dataset.isnull().values.sum()

13256

In [16]:
dataset.columns
dataset.shape

(13580, 21)

In [17]:
dataset =dataset.dropna(axis =0)

In [18]:
dataset.isnull().values.any()
dataset.isnull().values.sum()
dataset.head()
dataset.shape
dataset.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0
6,Abbotsford,124 Yarra St,3,h,1876000.0,S,Nelson,7/05/2016,2.5,3067.0,...,2.0,0.0,245.0,210.0,1910.0,Yarra,-37.8024,144.9993,Northern Metropolitan,4019.0
7,Abbotsford,98 Charles St,2,h,1636000.0,S,Nelson,8/10/2016,2.5,3067.0,...,1.0,2.0,256.0,107.0,1890.0,Yarra,-37.806,144.9954,Northern Metropolitan,4019.0


# building the first model

In [19]:
# Defining the prediction target/ dependent variable
y =dataset.Price

In [20]:
#defining the independent variables/ features
dataset_features =['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
x =dataset[dataset_features]

In [21]:
x.describe()


Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


In [22]:
x.head()


Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


The steps to building and using a model are:

Define: What type of model will it be? A decision tree? Some other type of model? Some other parameters of the model type are specified too.
Fit: Capture patterns from provided data. This is the heart of modeling.
Predict: Just what it sounds like
Evaluate: Determine how accurate the model's predictions are.

In [1]:
from sklearn.tree import DecisionTreeRegressor
# Define model. Specify a number for random_state to ensure same results each run

melbourne_model = DecisionTreeRegressor(random_state=1)
  

In [24]:
#fit model

melbourne_model.fit(x,y)

DecisionTreeRegressor(random_state=1)

In [25]:
print("Making predictions for the following 5 houses:")
print(x.head())
print('The predictions are: ')
print(melbourne_model.predict(x.head()))

Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
The predictions are: 
[1035000. 1465000. 1600000. 1876000. 1636000.]


# Model validation


In [26]:
# Calculating mean_absolute_error
from sklearn.metrics import mean_absolute_error
predicted_home_prices =melbourne_model.predict(x)
mean_absolute_error(y, predicted_home_prices)

1115.7467183128902

The measure we just computed can be called an "in-sample" score. 
We used a single "sample" of houses for both building the model and 
evaluating it. Here's why this is bad.

In [18]:
#using validation data and training data which is effective method

In [28]:
from sklearn.model_selection import train_test_split
# split data into training and validation data, for both features and target
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.

train_x, val_x, train_y, val_y =train_test_split(x, y,random_state = 0)

# Define model
melbourne_model = DecisionTreeRegressor(random_state =1)
#fit model
melbourne_model.fit(train_x, train_y)

# get predicted prices on validation data
val_predictions =melbourne_model.predict(val_x)
print(mean_absolute_error(val_y, val_predictions))

273518.01872175594


# Underfitting and Overfitting

In [30]:
# We can use a utility function to help compare MAE scores from different values for max_leaf_nodes:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_x, val_x, train_y, val_y):
    model =DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_x, train_y)
    preds_val = model.predict(val_x)
    mae =mean_absolute_error(val_y, preds_val)
    return mae


In [None]:

y = filtered_melbourne_data.Price
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 
                        'YearBuilt', 'Lattitude', 'Longtitude']
x = filtered_melbourne_data[melbourne_features]

from sklearn.model_selection import train_test_split

# split data into training and validation data, for both features and target
train_x, val_x, train_y, val_y = train_test_split(x, y,random_state = 0)

In [35]:
#The data is loaded into train_X, val_X, train_y and val_y using the code above
# We can use a for-loop to compare the accuracy of models built with different values for max_leaf_nodes.
# compare MAE with differing values of max_leaf_nodes

for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae =get_mae(max_leaf_nodes, train_x, val_y, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))
    



ValueError: Expected 2D array, got 1D array instead:
array=[ 815000.  655000.  957500. ...  200000. 1002000. 1710000.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

# Random Forests

We build a random forest model similarly to how we built a decision tree in scikit-learn - this time using the RandomForestRegressor class instead of DecisionTreeRegressor.

In [37]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_x, train_y)
melb_preds = forest_model.predict(val_x)
print(mean_absolute_error(val_y, melb_preds))

207190.6873773146
