In [2]:
# ---------- IMPORT LIBRARIES ----------------------

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn

from sklearn import tree
from sklearn import preprocessing
from sklearn import metrics
from sklearn import model_selection

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score 

import graphviz

In [4]:
# ---------- FUNCTIONS ----------------------

In [5]:
def preprocess(dataTarget):
    le = preprocessing.LabelEncoder()
    le.fit(dataTarget)
    class_labels = le.transform(dataTarget)
    
    return class_labels

In [3]:
def preprocessData(data):
    data = data.replace('Unknown', np.nan)
    data.dropna(inplace=True)
    
    le = preprocessing.LabelEncoder()
    for column_name in data.columns:
        if data[column_name].dtype == object:
            data[column_name] = le.fit_transform(data[column_name])
        else:
            pass
    
    return data

In [7]:
def buildDt(dataAttrs, dataTarget):
    #construct decision tree
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf = clf.fit(dataAttrs, dataTarget)
    
    return clf

In [8]:
def printPred(df, testX, testY):
    predictions = df.predict(testX)
    print(metrics.classification_report(testY, predictions))

In [9]:
def drawGraph(clf, X, Y):
    dot_data = tree.export_graphviz(clf, out_file = None,
                                feature_names =X.columns,
                                class_names= Y,
                                filled = True,
                                rounded= False,
                                special_characters = True
                               )
    graph = graphviz.Source(dot_data)
    return graph

In [10]:
def predict(df, testX):
    prediction = df.predict(testX)
    return prediction

In [11]:
def printAccuracy(clf, testX, testY):
    prediction = predict(clf, testX)
    accuracy = accuracy_score(testY, prediction)
    print(accuracy)

In [12]:
def getAccuracy(clf, testX, testY):
    prediction = predict(clf, testX)
    accuracy = accuracy_score(testY, prediction)
    return accuracy

In [4]:
propertyData = pd.read_csv("property_prices.csv")

propertyData = preprocessData(propertyData)

propertyData['lowBand'], propertyData['highBand'] = propertyData['price_bands'].str.split('-', 1).str
propertyData['lowBand'] = propertyData['lowBand'].str[:-1]
propertyData['highBand'] = propertyData['highBand'].str[:-1]

NameError: name 'pd' is not defined

In [5]:
Y = propertyData['lowBand']

X = propertyData.drop(columns=['id','lowBand','highBand','price_bands','date','car_parks','bathrooms','address', 'suburb', 'suburb_property_count',
                              'council_area', 'method'])
trainX, testX, trainY, testY = train_test_split(np.array(X), np.array(Y), test_size=0.2)

'''le = preprocessing.LabelEncoder()
le.fit(trainY)
class_labels = le.inverse_transform([0,1,2,3,4,5,6])
print(le.transform(trainY))
print(class_labels)'''

clf = buildDt(trainX, trainY)
printPred(clf, testX, testY)
printAccuracy(clf, testX, testY)

NameError: name 'propertyData' is not defined

In [None]:
# ---------- FEATURE SELECTION ----------------------

In [None]:
#BASIC: NO FEATURES SELECTED

In [None]:
from sklearn.preprocessing import robust_scale

Y = propertyData['lowBand']
X = propertyData.drop(columns=['id','lowBand', 'highBand', 'price_bands'])

In [None]:
Y = propertyData['lowBand']
X = propertyData.drop(columns=['id','lowBand', 'highBand', 'price_bands', 'suburb','rooms','type','method', 'lattitude','longtitude'])
#X = propertyData[['suburb','rooms','type','method','date']]
XE = pd.get_dummies(X)
X = robust_scale(XE)

trainX, testX, trainY, testY = train_test_split(np.array(XE), np.array(Y), test_size=0.2)

le = preprocessing.LabelEncoder()
le.fit(trainY)
class_labels = le.inverse_transform([0,1,2,3,4,5,6])
print(le.transform(trainY))
print(class_labels)

clf = buildDt(trainX, trainY)
printPred(clf, testX, testY)
printAccuracy(clf, testX, testY)

In [None]:
#VARIANCE THRESHOLD

In [None]:
Y = propertyData['lowBand']
X = propertyData.drop(columns=['id','lowBand', 'highBand', 'price_bands'])

XE = pd.get_dummies(X)

sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit_transform(XE)

trainX, testX, trainY, testY = train_test_split(np.array(XE), np.array(Y), test_size=0.2)

le = preprocessing.LabelEncoder()
le.fit(trainY)
class_labels = le.inverse_transform([0,1,2,3,4,5,6])
print(le.transform(trainY))
print(class_labels)

clf = buildDt(trainX, trainY)
printPred(clf, testX, testY)
printAccuracy(clf, testX, testY)

In [None]:
#FEATURE IMPORTANCE + RANDOM FOREST REGRESSOR

In [None]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor

names = propertyData[:1]
rf = RandomForestRegressor()
rf.fit(XE, Y)
print "Features sorted by their score:"
print sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), names), 
             reverse=True)

In [None]:
#SELECT K BEST - CHI2

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

Y = propertyData['lowBand']
X = propertyData.drop(columns=['id','lowBand', 'highBand', 'price_bands', 'longtitude','lattitude', 'building_area'])

XE = pd.get_dummies(X)

trainX, testX, trainY, testY = train_test_split(np.array(XE), np.array(Y), test_size=0.2)

names = propertyData.columns[1:]

select_feature = SelectKBest(chi2, k=10).fit(trainX, trainY)
print('Score list:', select_feature.scores_)
print('Feature list:', names)

In [None]:
joined = zip(names, select_feature.scores_)
joined = sorted(joined, key=lambda x: x[1])
joined.reverse()

for entry in joined:
    print(entry[0], entry[1])

In [None]:
#CLASSIF

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

Y = propertyData['lowBand']
X = propertyData.drop(columns=['id','lowBand', 'highBand', 'price_bands', 'longtitude','lattitude'])

XE = pd.get_dummies(X)

trainX, testX, trainY, testY = train_test_split(np.array(XE), np.array(Y), test_size=0.2)

names = propertyData.columns[1:]

select_feature = SelectKBest(score_func=f_classif, k=5).fit(trainX, trainY)
print('Score list:', select_feature.scores_)
print('Feature list:', names)

In [None]:
joined = zip(names, select_feature.scores_)
joined = sorted(joined, key=lambda x: x[1])
joined.reverse()

for entry in joined:
    print(entry[0], entry[1])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE


clf_rf_3 = RandomForestClassifier()      
rfe = RFE(estimator=clf_rf_3, n_features_to_select=5, step=1)
rfe = rfe.fit(trainX, trainY)


In [None]:
names = propertyData[:1]

print sorted(zip(rfe.support_, names), reverse=True)
#print('Chosen best 5 feature by rfe:',names.[rfe.support_])

In [None]:
Y = propertyData['lowBand']
#X = propertyData.drop(columns=['id','lowBand', 'highBand', 'price_bands', 
                               #'bathrooms','longtitude', 'lattitude'])
X = propertyData[['realestate_agent','postcode','method','distance']]
XE = pd.get_dummies(X)

trainX, testX, trainY, testY = train_test_split(np.array(XE), np.array(Y), test_size=0.2)

le = preprocessing.LabelEncoder()
le.fit(trainY)
class_labels = le.inverse_transform([0,1,2,3,4,5,6])
print(le.transform(trainY))
print(class_labels)

clf = buildDt(trainX, trainY)
printPred(clf, testX, testY)
printAccuracy(clf, testX, testY)

In [None]:
#SELECT FROM MODEL

In [172]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

names = propertyData[:1]
rfr= RandomForestRegressor()
rfr.fit(X, Y)
print "Features sorted by their score:"
print sorted(zip(map(lambda x: round(x, 4), rfr.feature_importances_), names), 
             reverse=True)

model = SelectFromModel(rfr, prefit=True)
X_new = model.transform(XE)
print(X_new.shape)

Features sorted by their score:
[(0.2021, 'realestate_agent'), (0.1439, 'distance'), (0.135, 'date'), (0.1238, 'address'), (0.1219, 'postcode'), (0.1002, 'method'), (0.0572, 'rooms'), (0.0415, 'price_bands'), (0.0339, 'suburb'), (0.0189, 'type'), (0.0167, 'id'), (0.0048, 'bedrooms')]
(8887L, 6L)


In [None]:

trainX, testX, trainY, testY = train_test_split(np.array(X_new), np.array(Y), test_size=0.2)

le = preprocessing.LabelEncoder()
le.fit(trainY)
class_labels = le.inverse_transform([0,1,2,3,4,5,6])
print(le.transform(trainY))
print(class_labels)

clf = buildDt(trainX, trainY)
printPred(clf, testX, testY)
printAccuracy(clf, testX, testY)

In [None]:
#TRYING FEATURES DETERMINED BY KBEST

In [170]:
Y = propertyData['lowBand']
#X = propertyData.drop(columns=['id','lowBand', 'highBand', 'price_bands', 
                               #'bathrooms','longtitude', 'lattitude'])
X = propertyData.drop(columns=['id','lowBand','highBand','price_bands','date','car_parks','bathrooms','address', 'suburb', 'suburb_property_count',
                              'council_area', 'method'])
XE = pd.get_dummies(X)

trainX, testX, trainY, testY = train_test_split(np.array(XE), np.array(Y), test_size=0.2)

le = preprocessing.LabelEncoder()
le.fit(trainY)
class_labels = le.inverse_transform([0,1,2,3,4,5,6])
print(le.transform(trainY))
print(class_labels)

clf = buildDt(trainX, trainY)
printPred(clf, testX, testY)
printAccuracy(clf, testX, testY)

[4 5 4 ... 2 5 1]
[0 1 2 3 4 5 6]
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       0.77      0.73      0.75       626
          2       0.57      0.63      0.60       127
          3       0.55      0.56      0.55        59
          4       0.65      0.62      0.64       317
          5       0.52      0.55      0.53       365
          6       0.43      0.46      0.44       283

avg / total       0.62      0.62      0.62      1778

0.6181102362204725


  if diff:
