## The data analysis of the Melbourne House Prices
  1. First step is to load the data

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
data=pd.read_csv('melb_data.csv')
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [7]:
data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


  ### In this next segment we deal with the missing values in our data
     The steps to be followed are as below:
      1. Removing data with no target variable
      2. Imputing data with missing values. For categorical data, the data should not be too skewed
      
      
  ### The categorical data in the dataset shall then be encoded for easier evaluation of the model. 
      Data with low cardinality in the categorical data shall be used alongside data with numerical data

In [8]:
#removerows with missing target price 
data.dropna(axis=0,subset=['Price'], inplace=True)
y=data.Price
X=data.drop(['Price'], axis=1)
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.8,test_size=0.2, random_state=0)



In [9]:
data.isnull().sum(axis=0)


Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [10]:
data.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

In [11]:
#Define a function that train_tests and finds the MAE of model chosen.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,accuracy_score
def mean_abs_error(X_train,X_test,y_train,y_test):
    model=RandomForestRegressor(n_estimators=10,random_state=0)
    model.fit(X_train,y_train)
    predictions=model.predict(X_test)
    return mean_absolute_error(y_test,predictions)

    

In [12]:
from sklearn.preprocessing import OneHotEncoder

#we keep a duplicate copy of the data to avoid changing it after encoding

label_X_train=X_train.copy()
label_X_test=X_test.copy()

#We encode the categorical data with low cardinality. 
object_columns=[col for col in label_X_train.columns if 
               label_X_train[col].dtype=='object']

low_cardinality_cols=[col for col in object_columns
                        if label_X_train[col].nunique()<10]
encoder=OneHotEncoder(handle_unknown='ignore',sparse=False)



label_X_train=pd.DataFrame(encoder.fit_transform(X_train[low_cardinality_cols]))
label_X_test=pd.DataFrame(encoder.transform(X_test[low_cardinality_cols]))

label_X_train.index=X_train.index
label_X_test.index=X_test.index

# Use categorical data with low cardinality and concatenate with numerical data to be our predictors

num_X_train=X_train.drop(object_columns,axis=1)
num_X_test=X_test.drop(object_columns,axis=1)

# ADDITION OF THE NUMERICAL DATA AND THE ENCODED COLUMNS
final_X_train=pd.concat([num_X_train,label_X_train],axis=1, join='inner')
final_X_test=pd.concat([num_X_test,label_X_test], axis=1, join='inner')




## Validation of the training and the test set of data
Display the first five rows of the training and the test set for the data

In [13]:
final_X_train.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,...,6,7,8,9,10,11,12,13,14,15
12167,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6524,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.858,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8413,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.7988,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2919,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.7083,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6043,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
final_X_test.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,...,6,7,8,9,10,11,12,13,14,15
8505,4,8.0,3016.0,4.0,2.0,2.0,450.0,190.0,1910.0,-37.861,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5523,2,6.6,3011.0,2.0,1.0,0.0,172.0,81.0,1900.0,-37.81,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12852,3,10.5,3020.0,3.0,1.0,1.0,581.0,,,-37.7674,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4818,3,4.5,3181.0,2.0,2.0,1.0,128.0,134.0,2000.0,-37.8526,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12812,3,8.5,3044.0,3.0,2.0,2.0,480.0,,,-37.72523,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [15]:
from sklearn.impute import SimpleImputer

#Imputing values of the predictors with missing values and have low cardinality
imputer=SimpleImputer()
Imputed_X_train=pd.DataFrame(imputer.fit_transform(final_X_train))
Imputed_X_test=pd.DataFrame(imputer.transform(final_X_test))
#imputation removes the column names. Re-introduce them 

Imputed_X_train.columns=final_X_train.columns
Imputed_X_test.columns=final_X_test.columns



## Validation of the imputed training and test set data

In [16]:
Imputed_X_train.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,...,6,7,8,9,10,11,12,13,14,15
0,1.0,5.0,3182.0,1.0,1.0,1.0,0.0,153.764119,1940.0,-37.85984,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2.0,8.0,3016.0,2.0,2.0,1.0,193.0,153.764119,1964.839866,-37.858,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3.0,12.6,3020.0,3.0,1.0,1.0,555.0,153.764119,1964.839866,-37.7988,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3.0,13.0,3046.0,3.0,1.0,1.0,265.0,153.764119,1995.0,-37.7083,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,3.0,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## The calculation of how our model performs. We calculate the Mean Absolute Error


In [17]:
Imputed_X_test.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,...,6,7,8,9,10,11,12,13,14,15
0,4.0,8.0,3016.0,4.0,2.0,2.0,450.0,190.0,1910.0,-37.861,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2.0,6.6,3011.0,2.0,1.0,0.0,172.0,81.0,1900.0,-37.81,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3.0,10.5,3020.0,3.0,1.0,1.0,581.0,153.764119,1964.839866,-37.7674,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3.0,4.5,3181.0,2.0,2.0,1.0,128.0,134.0,2000.0,-37.8526,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,3.0,8.5,3044.0,3.0,2.0,2.0,480.0,153.764119,1964.839866,-37.72523,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [18]:
print ('The mean absolute error is ',mean_abs_error(Imputed_X_train,Imputed_X_test,y_train,y_test))

The mean absolute error is  170566.4818360334
