In [2]:
import numpy as np
import pandas as pd

In [3]:
data = pd.read_csv("HousingData.csv")
data.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [4]:
#Attribute Information (in order):
#- CRIM per capita crime rate by town
#- ZN proportion of residential land zoned for lots over 25,000 sq.ft.
#- INDUS proportion of non-retail business acres per town
#- CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
#- NOX nitric oxides concentration (parts per 10 million)
#- RM average number of rooms per dwelling
#- AGE proportion of owner-occupied units built prior to 1940
#- DIS weighted distances to five Boston employment centres
#- RAD index of accessibility to radial highways
#- TAX full-value property-tax rate per  10,000
# - PTRATIO  pupil-teacher ratio by town
# - B  1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
# - LSTAT    % lower status of the population
#- MEDV     Median value of owner-occupied homes 

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     486 non-null    float64
 1   ZN       486 non-null    float64
 2   INDUS    486 non-null    float64
 3   CHAS     486 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      486 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    486 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [6]:
## checking the percentage of nan values present in each feature

features_with_na=[features for features in data.columns if data[features].isnull().sum()>1]

for feature in features_with_na:
    print(feature, np.round(data[feature].isnull().mean(), 4),  ' % missing values')

CRIM 0.0395  % missing values
ZN 0.0395  % missing values
INDUS 0.0395  % missing values
CHAS 0.0395  % missing values
AGE 0.0395  % missing values
LSTAT 0.0395  % missing values


In [7]:
data.isnull().sum()

CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
B           0
LSTAT      20
MEDV        0
dtype: int64

In [8]:
data = data.fillna(data.mean())

In [9]:
data.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [10]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, var_red=None, value=None):
        ''' constructor ''' 
        
        # for decision node
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.var_red = var_red
        
        # for leaf node
        self.value = value

In [11]:

import numpy as np
class DecisionTreeRegressor():
   # called every time an object is created from a class
    def __init__(self, min_samples_split=2, max_depth=2):
        ''' constructor '''
       
        # initialize the root of the tree 
        self.root = None
        
        # stopping conditions
        self.min_samples_split = min_samples_split #specifies the minimum number of samples required to split an internal node
        self.max_depth = max_depth  #determines the maximum depth of the decision tree that will be constructed
    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split '''
        
        # dictionary to store the best split
        best_split = {}
        max_var_red = -float("inf")
        # loop over all the features in the dataset
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            #it will assighn the unique values in the dataset
            possible_thresholds = np.unique(feature_values)
            # loop over all the unique feature values present in the data
            for threshold in possible_thresholds:
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if childs are not null 
                if len(dataset_left)>0 and len(dataset_right)>0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute varience reduction for target variable
                    curr_var_red = self.variance_reduction(y, left_y, right_y)
                    # update the best split if needed
                    # if curr_var_red is greater than max_var_red then it will update the best split as this value
                    if curr_var_red>max_var_red:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["var_red"] = curr_var_red
                        max_var_red = curr_var_red
                        
        # return best split
        return best_split
    
    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''
        # if feature value or index is less than or equal to the threshold then the value is assighned to left 
        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        # if feature value or index is greater than  to the threshold then the value is assighned to right 
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        # after splitiing it will return the result
        return dataset_left, dataset_right
    
    def variance_reduction(self, parent, l_child, r_child):
        ''' function to compute variance reduction gfeature'''
        
        weight_l = len(l_child) / len(parent)# it will calculate the varience of the left chaild
        weight_r = len(r_child) / len(parent)# it will calculate the varience of the right chld
        #after calculating varience of left and right child then using this we are going to calculate the varience reduction using this formula 
        #taking sum of the right and left child and subtracting with the varience of parent 
        reduction = np.var(parent) - (weight_l * np.var(l_child) + weight_r * np.var(r_child))
        #return the varience reduction
        return reduction
    
    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''
        #it is used to find value of leaf node
        val = np.mean(Y)
        return val  

    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree '''
        #it will separate the data into independent and dependet 
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        #this dictionory will store best split value
        best_split = {}
        # split until stopping conditions are met
        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if information gain is positive
            if best_split["var_red"]>0:
              #if the best split varience reduction is greater than 0 it will build left and right subtree and increases the size of the depth +1
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"], 
                            left_subtree, right_subtree, best_split["var_red"])
        
        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)
    
    
    #it will shows the how the tree will be build 
    def print_tree(self, tree=None, indent=" "):
        ''' function to print the tree '''
        
        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.var_red)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)
    
    def fit(self, X, Y):
        ''' function to train the tree '''
        
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)
        
    def make_prediction(self, x, tree):
        ''' function to predict new dataset '''
        
        if tree.value!=None: return tree.value
        feature_val = x[tree.feature_index]
        #the feature value is less than or equal to the threshol value of the tree it will make prediction on left tree else make prediction on right tree
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)
    
    def predict(self, X):
        ''' function to predict a single data point '''
         # Traverse the decision tree to make a prediction for a single instance
        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions


       
    def mean_squared_error(self,y_true, y_pred):
   
      # Check if the lengths of both arrays are equal
      if len(y_true) != len(y_pred):
          raise ValueError("Length of y_true and y_pred should be the same.")
      
      # Calculate the squared differences between the true and predicted values
      squared_differences = [(y_true[i] - y_pred[i])**2 for i in range(len(y_true))]
      
      # Calculate the mean of the squared differences
      mse = sum(squared_differences) / len(squared_differences)
      
      return mse


    

In [12]:
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values.reshape(-1,1)
#from sklearn.model_selection import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=41)
#print(Y)

In [13]:
import random

def train_test_split(X, y, test_size=0.2, random_state=None):
    if random_state:
        random.seed(random_state)
    
    n = len(X)
    test_data = set(random.sample(range(n), int(n * test_size)))
    train_data = set(range(n)) - test_data
    
    X_train = [X[i] for i in train_data]
    X_test = [X[i] for i in test_data]
    y_train = [y[i] for i in train_data]
    y_test = [y[i] for i in test_data]
    return X_train, X_test, y_train, y_test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=41)

In [14]:
regressor = DecisionTreeRegressor(min_samples_split=3, max_depth=3)
regressor.fit(X_train,Y_train)
regressor.print_tree()


X_5 <= 6.833 ? 35.04811008514187
 left:X_12 <= 14.37 ? 15.549253772248505
  left:X_7 <= 1.3567 ? 7.254110053155017
    left:50.0
    right:X_5 <= 6.54 ? 4.9401685306475525
        left:21.50297619047619
        right:27.333333333333332
  right:X_0 <= 6.80117 ? 6.497678954506002
    left:X_9 <= 296.0 ? 2.3464273212690916
        left:19.966666666666665
        right:16.118032786885248
    right:X_4 <= 0.597 ? 4.23368327765078
        left:16.7
        right:10.863829787234042
 right:X_5 <= 7.42 ? 38.442443376711296
  left:X_0 <= 4.64689 ? 19.825400000000002
    left:X_12 <= 5.39 ? 4.261275510204079
        left:34.31428571428572
        right:30.185714285714283
    right:X_0 <= 19.6091 ? 8.0
        left:16.4
        right:10.4
  right:X_4 <= 0.647 ? 22.533632067365524
    left:X_0 <= 0.57529 ? 6.176879192999509
        left:43.82941176470588
        right:49.760000000000005
    right:21.9


In [15]:
y_pred = regressor.predict(X_test)

In [16]:
y_pred


[30.185714285714283,
 16.118032786885248,
 16.118032786885248,
 21.50297619047619,
 21.50297619047619,
 21.50297619047619,
 21.50297619047619,
 16.118032786885248,
 27.333333333333332,
 16.118032786885248,
 21.50297619047619,
 27.333333333333332,
 21.50297619047619,
 19.966666666666665,
 21.50297619047619,
 16.118032786885248,
 19.966666666666665,
 27.333333333333332,
 30.185714285714283,
 21.50297619047619,
 21.50297619047619,
 21.50297619047619,
 21.50297619047619,
 21.50297619047619,
 27.333333333333332,
 21.50297619047619,
 21.50297619047619,
 21.50297619047619,
 16.118032786885248,
 21.50297619047619,
 16.118032786885248,
 16.118032786885248,
 21.50297619047619,
 16.118032786885248,
 16.118032786885248,
 16.118032786885248,
 16.118032786885248,
 16.118032786885248,
 16.118032786885248,
 49.760000000000005,
 49.760000000000005,
 16.118032786885248,
 21.50297619047619,
 43.82941176470588,
 34.31428571428572,
 30.185714285714283,
 43.82941176470588,
 43.82941176470588,
 21.5029761904

In [18]:
np.sqrt(regressor.mean_squared_error(Y_test,y_pred))

array([6.26163758])