In [3]:
droppedColumns = ['id','lowBand','highBand','price_bands','date','car_parks','bathrooms','address', 'suburb', 'suburb_property_count',
                              'council_area', 'method']

In [4]:
# ---------- IMPORT LIBRARIES ----------------------

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn

from sklearn import tree
from sklearn import preprocessing
from sklearn import metrics
from sklearn import model_selection

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score 

import graphviz

In [6]:
# ---------- FUNCTIONS ----------------------

In [7]:
def preprocess(dataTarget):
    le = preprocessing.LabelEncoder()
    le.fit(dataTarget)
    class_labels = le.transform(dataTarget)
    
    return class_labels

In [8]:
def preprocessData(data):
    data = data.replace('Unknown', np.nan)
    data.dropna(inplace=True)
    
    le = preprocessing.LabelEncoder()
    for column_name in data.columns:
        if data[column_name].dtype == object:
            data[column_name] = le.fit_transform(data[column_name])
        else:
            pass
    
    return data

In [9]:
def buildDt(dataAttrs, dataTarget):
    #construct decision tree
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf = clf.fit(dataAttrs, dataTarget)
    
    return clf

In [10]:
def printPred(df, testX, testY):
    predictions = df.predict(testX)
    print(metrics.classification_report(testY, predictions))

In [11]:
def drawGraph(clf, X, Y):
    dot_data = tree.export_graphviz(clf, out_file = None,
                                feature_names =X.columns,
                                class_names= Y,
                                filled = True,
                                rounded= False,
                                special_characters = True
                               )
    graph = graphviz.Source(dot_data)
    return graph

In [12]:
def predict(df, testX):
    prediction = df.predict(testX)
    return prediction

In [13]:
def printAccuracy(clf, testX, testY):
    prediction = predict(clf, testX)
    accuracy = accuracy_score(testY, prediction)
    print(accuracy)

In [14]:
def getAccuracy(clf, testX, testY):
    prediction = predict(clf, testX)
    accuracy = accuracy_score(testY, prediction)
    return accuracy

In [15]:
# ---------- SET UP ----------------------

In [16]:
propertyData = pd.read_csv("property_prices.csv")

propertyData['lowBand'], propertyData['highBand'] = propertyData['price_bands'].str.split('-', 1).str
propertyData['lowBand'] = propertyData['lowBand'].str[:-1]
propertyData['highBand'] = propertyData['highBand'].str[:-1]

propertyData = preprocessData(propertyData)

In [17]:
Y = propertyData['lowBand']
X = propertyData.drop(columns=droppedColumns)
trainX, testX, trainY, testY = train_test_split(np.array(X), np.array(Y), test_size=0.2)

clf = buildDt(trainX, trainY)
printPred(clf, testX, testY)
printAccuracy(clf, testX, testY)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       0.73      0.76      0.75       595
          2       0.69      0.71      0.70       146
          3       0.63      0.51      0.56        93
          4       0.58      0.60      0.59       272
          5       0.52      0.54      0.53       350
          6       0.46      0.40      0.43       321

avg / total       0.61      0.61      0.61      1778

0.6107986501687289


In [19]:
# ---------- DECISION TREE CLASSIFIER ----------------------

In [21]:
#TESTING PARAMETER: MAX DEPTH OF TREE + MIN SAMPLES LEAF

Y = propertyData['lowBand']
X = propertyData.drop(columns=droppedColumns)

trainX, testX, trainY, testY = train_test_split(np.array(X), np.array(Y), test_size=0.2)

le = preprocessing.LabelEncoder()
le.fit(trainY)
class_labels = le.inverse_transform([0,1,2,3,4,5,6])
print(le.transform(trainY))
print(class_labels)

max_depth = [1,2,3,4,5,6,7,8,9,10,15,20]
min_samples = [1,2,3,4,5,6,7,8,9,10,15,20]
accuracy = []

for depth in max_depth:
    for samples in min_samples:
        lf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=depth, min_samples_leaf=samples)
        clf.fit(trainX, trainY)
        prediction = predict(clf,testX)
        acc = accuracy_score(testY, prediction)
        accuracy.append(acc)
        
#printPred(clf, testX, testY)
#printAccuracy(clf, testX, testY)



  if diff:


[3 4 6 ... 1 1 1]
[0 1 2 3 4 5 6]
[0.608548931383577, 0.6147356580427447, 0.6282339707536558, 0.6130483689538808, 0.609673790776153, 0.6276715410573678, 0.6107986501687289, 0.6214848143982002, 0.6079865016872891, 0.6192350956130483, 0.6046119235095613, 0.6220472440944882, 0.610236220472441, 0.6186726659167604, 0.6152980877390326, 0.6136107986501688, 0.6197975253093363, 0.623734533183352, 0.624859392575928, 0.608548931383577, 0.6203599550056242, 0.6169853768278966, 0.6152980877390326, 0.6164229471316085, 0.6181102362204725, 0.6136107986501688, 0.6130483689538808, 0.6203599550056242, 0.6175478065241845, 0.6197975253093363, 0.6220472440944882, 0.6186726659167604, 0.6181102362204725, 0.6271091113610798, 0.6209223847019123, 0.6147356580427447, 0.6186726659167604, 0.6226096737907761, 0.6186726659167604, 0.623734533183352, 0.6164229471316085, 0.6186726659167604, 0.6186726659167604, 0.6181102362204725, 0.6136107986501688, 0.6119235095613048, 0.6124859392575928, 0.6164229471316085, 0.6226096737

In [26]:
linked_accuracy = []
for idx, elem in enumerate(max_depth):
    for jdx, elem2 in enumerate(min_samples):
        linked_accuracy.append((elem, elem2, accuracy[idx * len(max_depth) + jdx]))

linked_accuracy = sorted(linked_accuracy, key=lambda x: x[2], reverse=True)

for elem in linked_accuracy:
    print elem

(6, 15, 0.6287964004499438)
(7, 5, 0.6287964004499438)
(1, 3, 0.6282339707536558)
(20, 9, 0.6282339707536558)
(1, 6, 0.6276715410573678)
(3, 10, 0.6271091113610798)
(8, 6, 0.6259842519685039)
(2, 7, 0.624859392575928)
(6, 3, 0.624859392575928)
(5, 4, 0.6242969628796401)
(8, 2, 0.6242969628796401)
(2, 6, 0.623734533183352)
(4, 4, 0.623734533183352)
(10, 6, 0.623734533183352)
(15, 15, 0.623734533183352)
(20, 20, 0.623734533183352)
(5, 20, 0.6231721034870641)
(8, 1, 0.6231721034870641)
(8, 10, 0.6231721034870641)
(4, 2, 0.6226096737907761)
(5, 1, 0.6226096737907761)
(5, 3, 0.6226096737907761)
(9, 3, 0.6226096737907761)
(1, 20, 0.6220472440944882)
(3, 7, 0.6220472440944882)
(1, 8, 0.6214848143982002)
(5, 5, 0.6214848143982002)
(20, 8, 0.6214848143982002)
(3, 15, 0.6209223847019123)
(6, 1, 0.6209223847019123)
(6, 20, 0.6209223847019123)
(7, 8, 0.6209223847019123)
(2, 9, 0.6203599550056242)
(3, 4, 0.6203599550056242)
(6, 5, 0.6203599550056242)
(9, 6, 0.6203599550056242)
(9, 8, 0.620359955005

In [31]:
#USING RESULTS FROM PREVIOUS

Y = propertyData['lowBand']
X = propertyData.drop(columns=droppedColumns)

trainX, testX, trainY, testY = train_test_split(np.array(X), np.array(Y), test_size=0.2)

le = preprocessing.LabelEncoder()
le.fit(trainY)
class_labels = le.inverse_transform([0,1,2,3,4,5,6])
print(le.transform(trainY))
print(class_labels)

lf = tree.DecisionTreeClassifier(criterion="entropy", max_depth = 6, min_samples_leaf = 15)
clf.fit(trainX, trainY)
printPred(clf, testX, testY)
printAccuracy(clf, testX, testY)



[4 4 6 ... 1 5 1]
[0 1 2 3 4 5 6]
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       0.75      0.73      0.74       615
          2       0.60      0.65      0.62       145
          3       0.63      0.53      0.58        75
          4       0.61      0.68      0.65       287
          5       0.54      0.53      0.53       360
          6       0.45      0.44      0.44       296

avg / total       0.62      0.62      0.62      1778

0.6164229471316085


  if diff:
