In [None]:
# ---------- IMPORT LIBRARIES ----------------------

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn

from sklearn import tree
from sklearn import preprocessing
from sklearn import metrics
from sklearn import model_selection

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score 

import graphviz

In [None]:
# ---------- FUNCTIONS ----------------------

In [None]:
def preprocess(dataTarget):
    """Encode the target labels as integer class indices.

    A fresh ``LabelEncoder`` is fitted on ``dataTarget`` and the encoded
    version of that same input is returned.
    """
    encoder = preprocessing.LabelEncoder()
    return encoder.fit_transform(dataTarget)

In [None]:
def preprocessData(data):
    """Return a copy of ``data`` without rows that have missing values.

    Cells equal to the string 'Unknown' are treated as missing before the
    incomplete rows are dropped. The frame passed in is left untouched.
    """
    withMissing = data.replace('Unknown', np.nan)
    return withMissing.dropna()

In [None]:
def buildDt(dataAttrs, dataTarget, criterion="entropy", random_state=None):
    """Fit a decision tree classifier and return it.

    Parameters
    ----------
    dataAttrs : feature matrix passed to ``fit``.
    dataTarget : target labels passed to ``fit``.
    criterion : split quality measure handed to ``DecisionTreeClassifier``;
        defaults to "entropy", which is what this function always used.
    random_state : optional seed handed to ``DecisionTreeClassifier`` so a
        run can be repeated; the default ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.
    """
    clf = tree.DecisionTreeClassifier(criterion=criterion,
                                      random_state=random_state)
    clf = clf.fit(dataAttrs, dataTarget)

    return clf

In [None]:
def printPred(df, testX, testY):
    """Print a classification report for ``df`` evaluated on a test split.

    ``df`` is the fitted model (any object exposing ``predict``), ``testX``
    the test features and ``testY`` the true labels.
    """
    report = metrics.classification_report(testY, df.predict(testX))
    print(report)

In [None]:
def drawGraph(clf, X, Y):
    """Build a graphviz ``Source`` object picturing the decision tree ``clf``.

    ``X.columns`` supplies the feature names and ``Y`` is forwarded as the
    class names of the exported graph.
    """
    exportOptions = dict(
        out_file=None,              # return the DOT text instead of writing a file
        feature_names=X.columns,
        class_names=Y,
        filled=True,
        rounded=False,
        special_characters=True,
    )
    return graphviz.Source(tree.export_graphviz(clf, **exportOptions))

In [None]:
def predict(df, testX):
    """Return whatever ``df.predict`` yields for ``testX``.

    ``df`` is the fitted model (any object exposing ``predict``).
    """
    return df.predict(testX)

In [None]:
def printAccuracy(clf, testX, testY):
    """Print the accuracy score of ``clf``'s predictions for ``testX`` against ``testY``."""
    print(accuracy_score(testY, predict(clf, testX)))

In [None]:
def getAccuracy(clf, testX, testY):
    """Return the accuracy score of ``clf``'s predictions for ``testX`` against ``testY``."""
    predicted = predict(clf, testX)
    return accuracy_score(testY, predicted)

In [None]:
# ---------- VISUALIZING DATA ----------------------

In [None]:
#visualizing correlation plot
correlations = propertyData.corr()
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1, cmap=plt.cm.PuBu)
fig.colorbar(cax)
ticks = np.arange(0,13,1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(propertyData.columns, minor = True)
ax.set_yticklabels(propertyData.columns)
plt.show()

In [None]:
propertyData = pd.read_csv("property_prices.csv")

propertyData = preprocessData(propertyData)

propertyData['lowBand'], propertyData['highBand'] = propertyData['price_bands'].str.split('-', 1).str
propertyData['lowBand'] = propertyData['lowBand'].str[:-1]
propertyData['highBand'] = propertyData['highBand'].str[:-1]

In [None]:
# Show summary statistics of the loaded data as this cell's output.
propertyData.describe()

In [None]:
# ---------- PREPROCESS DATA ----------------------

In [None]:
# Newer scikit-learn releases provide SimpleImputer in sklearn.impute instead
# of preprocessing.Imputer; fall back to the legacy class when it is the only
# one available so the cell runs on either.
try:
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
except ImportError:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

propertyData = pd.read_csv("property_prices.csv")

# Split "low-high" price bands at the first '-' and strip the trailing
# character from each part. n= and expand= are passed by keyword; unpacking
# the .str accessor is not supported by current pandas.
priceBandParts = propertyData['price_bands'].str.split('-', n=1, expand=True)
propertyData['lowBand'] = priceBandParts[0].str[:-1]
propertyData['highBand'] = priceBandParts[1].str[:-1]

# Rows are kept here (no dropna): 'Unknown' becomes NaN and is imputed below.
propertyData = propertyData.replace('Unknown', np.nan)

Y = propertyData['lowBand']
X = propertyData.drop(columns=['id','lowBand', 'highBand', 'price_bands'])

# One-hot encode the text columns, then fill remaining gaps with column means.
XE = pd.get_dummies(X)
XE = imp.fit_transform(XE)

# Fixed seed so the split (and the scores printed below) can be reproduced.
trainX, testX, trainY, testY = train_test_split(np.array(XE), np.array(Y),
                                                test_size=0.2, random_state=42)

le = preprocessing.LabelEncoder()
le.fit(trainY)
# classes_ lists every label seen during fit, in encoded order, so this no
# longer assumes exactly seven classes.
class_labels = le.classes_
print(le.transform(trainY))
print(class_labels)

clf = buildDt(trainX, trainY)
printPred(clf, testX, testY)
printAccuracy(clf, testX, testY)

In [None]:
# _______________________________

In [None]:
# Reload the raw file and drop every row with an 'Unknown'/missing value
# (see preprocessData) for the feature-selection experiments below.
propertyData = pd.read_csv("property_prices.csv")

propertyData = preprocessData(propertyData)

# Split "low-high" price bands at the first '-' and strip the trailing
# character from each part. n= and expand= are passed by keyword; unpacking
# the .str accessor is not supported by current pandas.
priceBandParts = propertyData['price_bands'].str.split('-', n=1, expand=True)
propertyData['lowBand'] = priceBandParts[0].str[:-1]
propertyData['highBand'] = priceBandParts[1].str[:-1]

In [None]:
# ---------- FEATURE SELECTION ----------------------

In [None]:
#BASIC: NO FEATURES SELECTED

In [None]:
from sklearn.preprocessing import robust_scale

# Features: every column except the identifier and the price-band columns.
X = propertyData.drop(['id', 'lowBand', 'highBand', 'price_bands'], axis=1)
# Target: the 'lowBand' column.
Y = propertyData['lowBand']

In [None]:
Y = propertyData['lowBand']
X = propertyData.drop(columns=['id','lowBand', 'highBand', 'price_bands', 'suburb','rooms','type','method', 'lattitude','longtitude'])
XE = pd.get_dummies(X)
X = robust_scale(XE)

# Split the scaled matrix: the scaled values used to be computed and then
# ignored because the split was made on the unscaled XE.
# Fixed seed so the split (and the scores printed below) can be reproduced.
trainX, testX, trainY, testY = train_test_split(np.array(X), np.array(Y),
                                                test_size=0.2, random_state=42)

le = preprocessing.LabelEncoder()
le.fit(trainY)
# classes_ lists every label seen during fit, in encoded order, so this no
# longer assumes exactly seven classes.
class_labels = le.classes_
print(le.transform(trainY))
print(class_labels)

clf = buildDt(trainX, trainY)
printPred(clf, testX, testY)
printAccuracy(clf, testX, testY)

In [None]:
#VARIANCE THRESHOLD

In [None]:
Y = propertyData['lowBand']
X = propertyData.drop(columns=['id','lowBand', 'highBand', 'price_bands'])

XE = pd.get_dummies(X)

# Keep only columns whose variance exceeds .8 * (1 - .8). The reduced matrix
# is stored and used for the split; it used to be thrown away, so the tree
# was trained on all columns. XE itself stays the full one-hot frame.
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
XE_selected = sel.fit_transform(XE)

# Fixed seed so the split (and the scores printed below) can be reproduced.
trainX, testX, trainY, testY = train_test_split(np.array(XE_selected), np.array(Y),
                                                test_size=0.2, random_state=42)

le = preprocessing.LabelEncoder()
le.fit(trainY)
# classes_ lists every label seen during fit, in encoded order, so this no
# longer assumes exactly seven classes.
class_labels = le.classes_
print(le.transform(trainY))
print(class_labels)

clf = buildDt(trainX, trainY)
printPred(clf, testX, testY)
printAccuracy(clf, testX, testY)

In [None]:
#FEATURE IMPORTANCE + RANDOM FOREST REGRESSOR

In [None]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor

# Importances come back one per column of the matrix the forest was fitted
# on, so label them with XE's (one-hot) columns rather than the raw columns
# of propertyData.
names = XE.columns
# NOTE(review): Y holds the 'lowBand' strings; a regressor needs them to be
# convertible to numbers -- confirm this holds for the data.
rf = RandomForestRegressor()
rf.fit(XE, Y)
# print(...) with a single argument behaves the same on Python 2 and 3.
print("Features sorted by their score:")
print(sorted(zip([round(score, 4) for score in rf.feature_importances_], names),
             reverse=True))

In [None]:
#SELECT K BEST - CHI2

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

Y = propertyData['lowBand']
X = propertyData.drop(columns=['id','lowBand', 'highBand', 'price_bands', 'longtitude','lattitude', 'building_area'])

XE = pd.get_dummies(X)

# Fixed seed so the split (and the scores printed below) can be reproduced.
trainX, testX, trainY, testY = train_test_split(np.array(XE), np.array(Y),
                                                test_size=0.2, random_state=42)

# scores_ has one entry per column of the one-hot matrix, so the names must
# come from XE; the raw propertyData columns do not line up with them.
names = XE.columns

select_feature = SelectKBest(chi2, k=10).fit(trainX, trainY)
print('Score list:', select_feature.scores_)
print('Feature list:', names)

In [None]:
# Pair every name with its score, order the pairs by score and flip the
# result so the highest score is listed first.
joined = sorted(zip(names, select_feature.scores_), key=lambda pair: pair[1])[::-1]

for name, score in joined:
    print(name, score)

In [None]:
#SELECT K BEST - F_CLASSIF (ANOVA F-TEST)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

Y = propertyData['lowBand']
X = propertyData.drop(columns=['id','lowBand', 'highBand', 'price_bands', 'longtitude','lattitude'])

XE = pd.get_dummies(X)

# Fixed seed so the split (and the scores printed below) can be reproduced.
trainX, testX, trainY, testY = train_test_split(np.array(XE), np.array(Y),
                                                test_size=0.2, random_state=42)

# scores_ has one entry per column of the one-hot matrix, so the names must
# come from XE; the raw propertyData columns do not line up with them.
names = XE.columns

select_feature = SelectKBest(score_func=f_classif, k=5).fit(trainX, trainY)
print('Score list:', select_feature.scores_)
print('Feature list:', names)

In [None]:
# Pair every name with its score, order the pairs by score and flip the
# result so the highest score is listed first.
joined = sorted(zip(names, select_feature.scores_), key=lambda pair: pair[1])[::-1]

for name, score in joined:
    print(name, score)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE


# Recursive feature elimination around a random forest: one feature is
# removed per step until five remain. fit() returns the fitted selector.
clf_rf_3 = RandomForestClassifier()
rfe = RFE(estimator=clf_rf_3, step=1, n_features_to_select=5).fit(trainX, trainY)


In [None]:
# support_ holds one flag per column of the matrix RFE was fitted on (the
# one-hot frame XE), so the names are taken from XE rather than from the raw
# propertyData columns.
names = XE.columns

# print(...) with a single argument behaves the same on Python 2 and 3.
print(sorted(zip(rfe.support_, names), reverse=True))
print(list(names[rfe.support_]))

In [None]:
Y = propertyData['lowBand']
# Train on a hand-picked subset of columns.
X = propertyData[['realestate_agent','postcode','method','distance']]
XE = pd.get_dummies(X)

# Fixed seed so the split (and the scores printed below) can be reproduced.
trainX, testX, trainY, testY = train_test_split(np.array(XE), np.array(Y),
                                                test_size=0.2, random_state=42)

le = preprocessing.LabelEncoder()
le.fit(trainY)
# classes_ lists every label seen during fit, in encoded order, so this no
# longer assumes exactly seven classes.
class_labels = le.classes_
print(le.transform(trainY))
print(class_labels)

clf = buildDt(trainX, trainY)
printPred(clf, testX, testY)
printAccuracy(clf, testX, testY)

In [None]:
#SELECT FROM MODEL

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Importances come back one per column of the matrix the forest was fitted
# on, so label them with XE's (one-hot) columns rather than the raw columns
# of propertyData.
names = XE.columns
# NOTE(review): Y holds the 'lowBand' strings; a regressor needs them to be
# convertible to numbers -- confirm this holds for the data.
rfr= RandomForestRegressor()
rfr.fit(XE, Y)
# print(...) with a single argument behaves the same on Python 2 and 3.
print("Features sorted by their score:")
print(sorted(zip([round(score, 4) for score in rfr.feature_importances_], names),
             reverse=True))

# prefit=True reuses the forest fitted above instead of fitting a new one.
model = SelectFromModel(rfr, prefit=True)
X_new = model.transform(XE)
print(X_new.shape)

In [None]:

# Evaluate a tree on the columns kept by SelectFromModel (X_new).
# Fixed seed so the split (and the scores printed below) can be reproduced.
trainX, testX, trainY, testY = train_test_split(np.array(X_new), np.array(Y),
                                                test_size=0.2, random_state=42)

le = preprocessing.LabelEncoder()
le.fit(trainY)
# classes_ lists every label seen during fit, in encoded order, so this no
# longer assumes exactly seven classes.
class_labels = le.classes_
print(le.transform(trainY))
print(class_labels)

clf = buildDt(trainX, trainY)
printPred(clf, testX, testY)
printAccuracy(clf, testX, testY)

In [None]:
#TRYING FEATURES DETERMINED BY KBEST

In [None]:
Y = propertyData['lowBand']
# Train on a hand-picked subset of columns.
X = propertyData[['rooms','date','type','distance','suburb','method', 'year_built','bedrooms']]
XE = pd.get_dummies(X)

# Fixed seed so the split (and the scores printed below) can be reproduced.
trainX, testX, trainY, testY = train_test_split(np.array(XE), np.array(Y),
                                                test_size=0.2, random_state=42)

le = preprocessing.LabelEncoder()
le.fit(trainY)
# classes_ lists every label seen during fit, in encoded order, so this no
# longer assumes exactly seven classes.
class_labels = le.classes_
print(le.transform(trainY))
print(class_labels)

clf = buildDt(trainX, trainY)
printPred(clf, testX, testY)
printAccuracy(clf, testX, testY)