In [22]:
from sklearn import datasets
from sklearn.decomposition import PCA
import pandas as pd
from collections import Counter
from numpy import *
import sklearn
from sklearn import tree

In [23]:
dataset = pd.read_excel('real_estate.xlsx')

In [24]:
dataset.sample(frac = 1)

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
392,393,2013.083333,42.7,443.80200,6,24.97927,121.53874,35.3
12,13,2012.916667,13.0,492.23130,5,24.96515,121.53737,39.3
75,76,2013.500000,12.3,1360.13900,1,24.95204,121.54842,29.5
0,1,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
307,308,2012.833333,10.3,3079.89000,0,24.95460,121.56627,24.7
379,380,2013.333333,0.0,292.99780,6,24.97744,121.54458,69.7
33,34,2013.250000,16.5,323.65500,6,24.97841,121.54281,49.3
346,347,2013.416667,13.2,1712.63200,2,24.96412,121.51670,30.8
262,263,2012.916667,15.9,289.32480,5,24.98203,121.54348,53.0
173,174,2013.083333,41.3,401.88070,4,24.98326,121.54460,35.1


In [5]:
def gini_cost(rows):
    headers = rows.columns
    last_col = headers[-1]
    row = rows[last_col]
    mean = 1.0*sum(row)/len(row)
    variance = 1.0*sum([(i-mean)**2 for i in row])/len(row)
    return variance
    
    

In [6]:
def info_gain(upper,lower,current): 
    len_upper = len(upper)
    len_lower = len(lower)
    
    p = len_lower*1.0/(len_lower+len_upper)
    cost = p*gini_cost(lower)+(1-p)*gini_cost(upper)
    return current-cost

In [7]:
def unique_values(row):
    l = []
    for i in row: 
        if i not in l: 
            l +=[i]
    return asarray(l)
        

In [8]:
def best_split(dataset):
    
    atlist = dataset.columns[:-1]
    atlist = [str(i) for i in atlist]
    
    last_col = dataset.columns[-1]
    current = gini_cost(dataset)
    best_gain = -1000
    best_attribute = 0
    best_cutoff = 0
    
    for attr in atlist:
        cutoff = unique_values(dataset[attr])
        
        for cut in cutoff:
            upper,lower = part(dataset,attr,cut)
            if(len(upper)<1 or len(lower)<1): 
                continue
                
            gain=info_gain(upper,lower,current)
            
            if(gain>=best_gain): 
                best_gain,best_attribute,best_cutoff = gain,attr,cut
            
    
    return best_gain,best_attribute,best_cutoff
        
    

In [25]:
class Decision: 
    def __init__(self,true,false,cutoff,attribute): 
        self.true = true
        self.false = false
        self.cutoff = cutoff
        self.attribute = attribute

In [26]:
def predict(dataset):
    last_col = dataset.columns[-1]
    return sum(dataset[last_col])/len(dataset[last_col]) 

In [27]:
class Output: 
    def __init__(self,dataset):
        self.prediction = predict(dataset)

In [28]:
def part(dataset,attr,cutoff):
    
    upper = dataset[dataset[attr]>cutoff]
    lower = dataset[dataset[attr]<=cutoff]
    return upper,lower

In [29]:
def build_tree(dataset): 
    
    best_gain, best_attr,best_cutoff = best_split(dataset)
    
    if(best_gain<=0):
        return Output(dataset)
    
    upper,lower = part(dataset,best_attr,best_cutoff)
    
    true_branch = build_tree(upper)
    
    false_branch = build_tree(lower)
    
    return Decision(true_branch,false_branch,best_cutoff,best_attr)
    
    
    

In [30]:
def comp(row,node):
    at = node.attribute
    
    if(row[at]>node.cutoff): 
        return True
    else:
        return False

In [31]:
def data_split(dataset):
    size = len(dataset)
    thresh = 8*size/10
    train = dataset[:thresh]
    test = dataset[thresh:]
    return train,test

In [32]:
def classify(row,node): 
    if isinstance(node,Output):
        return node.prediction
    else:
        result = comp(row,node)
        if(result):
            return classify(row,node.true)
        else: 
            return classify(row,node.false)
    
        
        

In [33]:
#Q2(a) Build Regression tree

train,test = data_split(dataset)
tree_train = build_tree(train) 

In [34]:
#Q3 and Q4. Displaying the performance of regressor and comparing with scikit-learn.  

clf = sklearn.tree.DecisionTreeRegressor()

x_data = asarray(train.values.tolist())[:,1:-1]
y_data = asarray(train.values.tolist())[:,-1]

clf.fit(x_data,y_data)

train,test = data_split(dataset)

tree_train = build_tree(train)

last_col = dataset.columns[-1]

error = 0

scikit_error = 0

for index,row in test.iterrows():
    
    scikit_output = clf.predict([row[1:-1]])[0]
    
    predicted_output = classify(row,tree_train)
    
    actual_output = row[last_col]
    
    error+= (predicted_output-actual_output)**2
    scikit_error += (scikit_output-actual_output)**2

print("My model mean square error is", (1.0*error/(len(test)))**0.5)
print("the scikit learn mean square error is",(1.0*scikit_error/len(test))**0.5)
        




('My model mean square error is', 11.348164757946284)
('the scikit learn mean square error is', 10.639445530892072)


In [10]:
 =