In [None]:
# ---------- IMPORT LIBRARIES ----------------------

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn

from sklearn import tree
from sklearn import preprocessing
from sklearn import metrics
from sklearn import model_selection

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score 

import graphviz

In [None]:
# ---------- FUNCTIONS ----------------------

In [None]:
def preprocess(dataTarget):
    """Encode the target labels as integer class codes.

    A fresh ``LabelEncoder`` is fitted on ``dataTarget`` and then used to
    transform that same data.

    Parameters
    ----------
    dataTarget : array-like
        Labels to encode.

    Returns
    -------
    The result of ``LabelEncoder.transform`` applied to ``dataTarget``.
    """
    encoder = preprocessing.LabelEncoder()
    # fit() hands back the fitted encoder, so both steps can be chained
    return encoder.fit(dataTarget).transform(dataTarget)

In [None]:
def preprocessData(data):
    """Return a cleaned copy of ``data`` with incomplete rows removed.

    Every cell equal to the string 'Unknown' is first turned into NaN, then
    all rows that contain at least one NaN are discarded. The frame passed in
    is left untouched because ``replace`` produces a new object.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table to clean.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, keeping their original index labels.
    """
    with_missing = data.replace('Unknown', np.nan)
    return with_missing.dropna()

In [None]:
def buildDt(dataAttrs, dataTarget, random_state=None):
    """Fit a decision tree classifier that splits on entropy.

    Parameters
    ----------
    dataAttrs : array-like
        Training feature matrix.
    dataTarget : array-like
        Training labels, one per row of ``dataAttrs``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``DecisionTreeClassifier``. The default (``None``) keeps
        the previous unseeded behaviour; pass an int to make repeated fits on
        the same data reproducible.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.
    """
    clf = tree.DecisionTreeClassifier(criterion="entropy", random_state=random_state)
    # fit() returns the estimator itself
    return clf.fit(dataAttrs, dataTarget)

In [None]:
def printPred(df, testX, testY):
    """Print a classification report for the model's predictions on ``testX``.

    Parameters
    ----------
    df : object
        Despite the name, this is the fitted model: anything exposing
        ``predict(testX)``.
    testX : array-like
        Features to predict on.
    testY : array-like
        True labels the predictions are compared against.
    """
    report = metrics.classification_report(testY, df.predict(testX))
    print(report)

In [None]:
def drawGraph(clf, X, Y):
    """Render a fitted tree as a ``graphviz.Source`` object.

    Parameters
    ----------
    clf : fitted tree estimator
        Passed to ``tree.export_graphviz``.
    X : object with a ``columns`` attribute
        ``X.columns`` is used for the feature names shown in the nodes.
    Y : sequence
        Passed through unchanged as ``class_names``.

    Returns
    -------
    graphviz.Source
        Built from the DOT text produced by ``export_graphviz``.
    """
    export_options = dict(
        out_file=None,  # return the DOT text instead of writing a file
        feature_names=X.columns,
        class_names=Y,
        filled=True,
        rounded=False,
        special_characters=True,
    )
    return graphviz.Source(tree.export_graphviz(clf, **export_options))

In [None]:
def predict(df, testX):
    """Return the model's predictions for ``testX``.

    ``df`` is the fitted model (any object with a ``predict`` method), not a
    DataFrame; the call is delegated to it unchanged.
    """
    return df.predict(testX)

In [None]:
def printAccuracy(clf, testX, testY):
    """Print the accuracy of ``clf`` on the given test set.

    Predictions come from ``predict(clf, testX)`` and are scored against
    ``testY`` with ``accuracy_score``.
    """
    print(accuracy_score(testY, predict(clf, testX)))

In [None]:
def getAccuracy(clf, testX, testY):
    """Return the accuracy of ``clf`` on the given test set.

    Predictions come from ``predict(clf, testX)`` and are scored against
    ``testY`` with ``accuracy_score``; the score is returned, not printed.
    """
    predicted = predict(clf, testX)
    return accuracy_score(testY, predicted)

In [None]:
# ---------- TRYING OUT SCRIPTS ----------------------

In [None]:
#SCRIPT FOR BACKWARD SUBSET FEATURE SELECTION - ONE AT A TIME
# For each attribute in turn: drop it, one-hot encode what is left, train a tree
# and record the held-out accuracy. acc_score[i] is the accuracy WITHOUT attribute[i].
attribute = ['address', 'rooms', 'type', 'method', 'realestate_agent', 'date', 'distance', 'postcode', 'bedrooms',
             'bathrooms', 'car_parks', 'landsize', 'building_area', 'year_built',
            'council_area', 'lattitude', 'longtitude', 'region_name', 'suburb_property_count']

acc_score = []

# A fixed seed gives every attribute the same train/test rows, so the scores
# are comparable instead of differing by which random split each one got.
SPLIT_SEED = 0

propertyData = pd.read_csv("property_prices.csv")
propertyData = preprocessData(propertyData)

# Split price_bands once on '-' into its two ends and strip the last character
# of each. `n` is passed by keyword and the parts are taken with .str[i]:
# pandas >= 2.0 rejects a positional `n` and unpacking the `.str` accessor.
band_parts = propertyData['price_bands'].str.split('-', n=1)
propertyData['lowBand'] = band_parts.str[0].str[:-1]
propertyData['highBand'] = band_parts.str[1].str[:-1]

# The target does not depend on which attribute is dropped
Y = propertyData['lowBand']

for attr in attribute:
    X = propertyData.drop(columns=['id', 'lowBand', 'highBand', 'price_bands', attr])

    XE = pd.get_dummies(X)
    trainX, testX, trainY, testY = train_test_split(
        np.array(XE), np.array(Y), test_size=0.2, random_state=SPLIT_SEED
    )

    le = preprocessing.LabelEncoder()
    le.fit(trainY)
    # classes_ lists every label seen in trainY; the previous hard-coded
    # inverse_transform([0..6]) raised whenever fewer than 7 classes were present
    class_labels = le.classes_

    clf = buildDt(trainX, trainY)
    accuracy = getAccuracy(clf, testX, testY)
    acc_score.append(accuracy)


In [None]:
#PLOTTING THE ACCURACY SCORE FOR EACH ATTRIBUTE, IF HIGH = REMOVE
# Each point is the accuracy obtained with that attribute left out, so a high
# value marks an attribute the model does fine without.
attribute = ['address', 'rooms', 'type', 'method', 'realestate_agent', 'date', 'distance', 'postcode', 'bedrooms',
             'bathrooms', 'car_parks', 'landsize', 'building_area', 'year_built',
            'council_area', 'lattitude', 'longtitude', 'region_name', 'suburb_property_count']

# acc_score is produced by the single-pass cell; the iterative cell reuses the
# name, so fail with a clear message if the two lists no longer line up.
if len(acc_score) != len(attribute):
    raise ValueError(
        "acc_score has %d entries but there are %d attributes; "
        "re-run the single-pass feature selection cell first" % (len(acc_score), len(attribute))
    )

# Three-letter prefixes keep the tick labels readable (all prefixes are distinct)
attr = [name[:3] for name in attribute]

# Keep `fig` bound to the Figure (it used to be overwritten by the plotted lines)
fig, ax = plt.subplots()
ax.plot(attr, acc_score, marker='o')
ax.set_xlabel("attribute left out (first 3 letters)")
ax.set_ylabel("test accuracy")
ax.set_title("Accuracy after removing a single attribute")

In [None]:
#SCRIPT FOR SEVERAL ITERATIONS OF BACKWARD SUBSET FEATURE SELECTION
# Greedy backward elimination. Each round scores the removal of every remaining
# attribute, then permanently drops the one whose removal gives the highest
# accuracy. The loop stops before any removal that would lower the accuracy, so
# `attribute`, `droppedColumns` and `previous_accuracy` all describe the same
# final feature set.
import copy  # no longer needed by this cell; kept so the name stays defined for any cell that relies on it
attribute = ['address', 'rooms', 'type', 'method', 'realestate_agent', 'date', 'distance', 'postcode', 'bedrooms',
             'bathrooms', 'car_parks', 'landsize', 'building_area', 'year_built',
            'council_area', 'lattitude', 'longtitude', 'region_name', 'suburb_property_count']

droppedColumns = ['id','lowBand', 'highBand','price_bands']

# Starts at 0, so the best removal of the first round is always accepted
previous_accuracy = 0

# A fixed seed scores every candidate in every round on the same train/test
# rows; with a fresh random split per candidate the ranking is mostly noise.
SPLIT_SEED = 0

propertyData = pd.read_csv("property_prices.csv")
propertyData = preprocessData(propertyData)

# Split price_bands once on '-' into its two ends and strip the last character
# of each. `n` is passed by keyword and the parts are taken with .str[i]:
# pandas >= 2.0 rejects a positional `n` and unpacking the `.str` accessor.
band_parts = propertyData['price_bands'].str.split('-', n=1)
propertyData['lowBand'] = band_parts.str[0].str[:-1]
propertyData['highBand'] = band_parts.str[1].str[:-1]

Y = propertyData['lowBand']

# Stop when nothing is left to remove (the old `while True` ran into an
# IndexError once `attribute` was empty).
while attribute:
    acc_score = []

    for attr in attribute:
        X = propertyData.drop(columns=droppedColumns + [attr])
        XE = pd.get_dummies(X)

        if XE.shape[1] == 0:
            # Removing this attribute would leave no features to train on;
            # score it so that it can never be selected.
            acc_score.append(float('-inf'))
            continue

        trainX, testX, trainY, testY = train_test_split(
            np.array(XE), np.array(Y), test_size=0.2, random_state=SPLIT_SEED
        )

        clf = buildDt(trainX, trainY)
        accuracy = getAccuracy(clf, testX, testY)
        acc_score.append(accuracy)

    # First index holding the best score (ties go to the earlier attribute)
    max_index = max(range(len(acc_score)), key=acc_score.__getitem__)
    current_accuracy = acc_score[max_index]

    # Check the stop condition BEFORE committing the removal, otherwise the
    # attribute whose removal hurt accuracy would still end up dropped.
    if current_accuracy < previous_accuracy:
        break

    droppedColumns.append(attribute.pop(max_index))
    previous_accuracy = current_accuracy
    


In [None]:
# Display the attributes that are still in `attribute` after the selection loop
# (the loop removes entries from this list as it runs)
attribute

In [None]:
# Display droppedColumns: the columns that are always excluded from the features
# ('id', 'lowBand', 'highBand', 'price_bands') followed by the attributes the
# selection loop appended
droppedColumns

In [None]:
# Display previous_accuracy: the best accuracy of the last round whose result was
# kept, i.e. the value recorded before the loop hit its stop condition
previous_accuracy