# Melbourne Housing Prices

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import catboost
import shap
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

## Import Data

In [None]:
# Load the Melbourne housing price dataset from the local Data directory.
csv_path = './Data/MELBOURNE_HOUSE_PRICES_LESS.csv'
df = pd.read_csv( csv_path )

## Explore Data

In [None]:
# Show the dimensions of the raw data as a (rows, columns) tuple.
df.shape

In [None]:
# Preview the first rows of the raw data (head() defaults to five rows).
df.head()

In [None]:
# Plot the distribution of a single column of the raw data.
col = 'Price'

# Drop missing values explicitly so the binning never depends on how the
# installed matplotlib / numpy versions treat NaN; at this point in the
# notebook no rows with missing values have been removed yet.
values = df[ col ].dropna()

plt.hist( values, bins = 50 )
plt.title( f'Histogram of {col} Values' )
plt.show()

## Features to Use

In [None]:
# Columns fed to the model as inputs.
# A wider candidate set kept for reference:
#   ['Suburb', 'Method', 'CouncilArea', 'Rooms', 'Distance']
features = [
    'Rooms',
    'Distance',
]

# Column(s) the model should predict.
target_cols = [ 'Price' ]

In [None]:
# Keep only the model inputs plus the target column.
selected_columns = features + target_cols
df_sifted = df[ selected_columns ]

In [None]:
# Display the selected feature and target columns.
df_sifted

## Clean Data

In [None]:
# Drop every row that has a missing value in any of the selected columns.
# Reassigning the result (instead of inplace = True) avoids mutating a frame
# that was derived from a column selection of df, which is what triggers
# pandas' SettingWithCopyWarning; the remaining rows are the same.
df_sifted = df_sifted.dropna()

In [None]:
# Display the frame again after the rows with missing values were dropped.
df_sifted

## Separate Features and Target

In [None]:
# Split df_sifted into the model inputs (a DataFrame of the feature columns)
# and the target (a single Series taken from the first target column).
target_name = target_cols[ 0 ]

df_features = df_sifted[ features ]
df_target = df_sifted[ target_name ]

In [None]:
# Preview the first rows of the feature columns.
df_features.head()

In [None]:
# Preview the first values of the target column.
df_target.head()

## Split into Train / Test

In [None]:
# Hold out 25% of the rows for testing. A fixed random_state keeps the split
# identical across reruns, so the train / test errors computed from it are
# comparable from one run of the notebook to the next.
X_train, X_test, y_train, y_test = train_test_split(
    df_features, df_target, test_size = .25, random_state = 42
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Preview the first values of the training target.
y_train.head()

## Select a Model

In [None]:
# CatBoost regressor constructed with no arguments, i.e. library defaults.
model = catboost.CatBoostRegressor()

In [None]:
# Fit the model on the training split only; verbose = False turns off the
# training log output.
model.fit( X_train, y_train, verbose = False )

In [None]:
# Wrap the model's predictions for each split in a one-column frame that
# reuses the split's index, so the estimates can be joined back to their rows.
prediction_column = 'Estimated Home Value'

df_train_results = pd.DataFrame(
    model.predict( X_train ),
    index = X_train.index,
    columns = [ prediction_column ],
)
df_test_results = pd.DataFrame(
    model.predict( X_test ),
    index = X_test.index,
    columns = [ prediction_column ],
)


In [None]:
# Assemble the test split into one frame: features, then the actual price,
# then the model's estimate, all aligned on the row index.
df_test = X_test.join( y_test ).join( df_test_results )

In [None]:
# Display the test rows with their actual and estimated prices.
df_test

In [None]:
# Assemble the training split into one frame: features, then the actual
# price, then the model's estimate, all aligned on the row index.
df_train = X_train.join( y_train ).join( df_train_results )

In [None]:
# Display the training rows with their actual and estimated prices.
df_train

In [None]:
# Sanity check of mean_absolute_error on a tiny hand-made example: the
# absolute differences are 7, 5 and 3, so the expected result is 5.0.
sample_actual = [ 10, 9, 8 ]
sample_predicted = [ 3, 4, 5 ]
mean_absolute_error( sample_actual, sample_predicted )

## Check the Model Fit

In [None]:
# Report the mean absolute error between actual and estimated prices for the
# training split and then the test split.
for label, frame in ( ( 'TRAIN', df_train ), ( 'TEST', df_test ) ):
    error = mean_absolute_error( frame['Price'], frame['Estimated Home Value'] )
    # print() separates its arguments with one space and applies str() to
    # each, which yields the "<LABEL>: <value>" line.
    print( f'{label}:', error )

## SHAP

In [None]:
# Explain the fitted model with SHAP: compute SHAP values over the full
# feature frame (df_features, not just one split) and draw the summary plot.
explainer = shap.TreeExplainer( model )
shap_values = explainer.shap_values( df_features )

# show = False is passed so the current figure can be resized before it is
# displayed by plt.show().
shap.summary_plot( shap_values, df_features, show = False )
summary_figure = plt.gcf()
summary_figure.set_size_inches( 12.0, 12.0 )
plt.show()

