In [200]:
import numpy as np

In [None]:
from sklearn.datasets import fetch_california_housing# california dataset is a popular dataset of house prices 
from sklearn.model_selection import train_test_split


In [202]:
# Load the dataset once and unpack features and targets together; calling
# fetch_california_housing() separately for .data and .target loads it twice.
x, y = fetch_california_housing(return_X_y=True)

In [None]:
class Node:
    """One node of the regression tree.

    Internal nodes carry ``threshold``/``feature`` and two children
    (``left``/``right``) with ``value`` set to ``None``; leaves carry only
    ``value`` (the prediction) and ``None`` everywhere else.
    """

    def __init__(self, threshold, feature, left, right, value):
        # Split rule: samples with x[feature] < threshold go to `left`.
        self.threshold, self.feature = threshold, feature
        # Child subtrees.
        self.left, self.right = left, right
        # Prediction stored in a leaf.
        self.value = value

In [None]:
class decisionTreeRegressor:
    """Regression tree grown greedily by minimising the weighted MSE of each split.

    Parameters
    ----------
    max_depth : int
        Depth at which growth stops (the root is at depth 0).
    max_leaf_samp : int
        A node holding fewer samples than this becomes a leaf instead of
        being split further.
    """

    def __init__(self,max_depth,max_leaf_samp):
        self.max_depth = max_depth
        self.max_leaf_samp = max_leaf_samp
        self.root = None  # top node of the tree; populated by fit()

    def mse(self,x):
        """Return the mean squared deviation of ``x`` from its own mean."""
        x_mean = np.mean(x)
        return np.mean((x-x_mean)**2)

    def best_split(self,x,y):
        """Find the (threshold, feature) pair with the lowest weighted MSE.

        Every unique value of every column is tried as a threshold; rows with
        ``x[:, feature] < threshold`` form the left part and all remaining
        rows form the right part.

        Returns
        -------
        tuple
            ``(best_thresh, best_feat)``, or ``(None, None)`` when no
            threshold separates the rows into two non-empty parts.
        """
        _, n_feat = x.shape
        n_samp = len(y)

        min_mse = float(np.inf)
        best_thresh = None
        best_feat = None
        for feat in range(n_feat):
            col = x[:,feat]  # loop-invariant for all thresholds of this feature
            for thresh in np.unique(col):
                # One comparison per threshold; the right part is the
                # complement, so rows that fail the `<` test (including NaN)
                # are kept on the right, matching the routing in traverse().
                left_mask = col < thresh
                n_left = int(np.count_nonzero(left_mask))
                n_right = n_samp - n_left
                if n_left == 0 or n_right == 0:
                    # a split that leaves one side empty is not a split
                    continue
                y_left = y[left_mask]
                y_right = y[~left_mask]
                # weighted mse of the two parts
                error = (n_left*self.mse(y_left)+n_right*self.mse(y_right))/n_samp
                if error<min_mse:
                    min_mse = error
                    best_thresh = thresh
                    best_feat = feat
        return best_thresh,best_feat

    def grow(self,x,y,depth):
        """Recursively build the subtree for rows ``x`` / targets ``y``.

        Returns a leaf ``Node`` holding ``np.mean(y)`` when the node has fewer
        than ``max_leaf_samp`` rows, when ``depth`` equals ``max_depth``, or
        when no valid split exists; otherwise an internal ``Node``.
        """
        if len(x)<self.max_leaf_samp or depth==self.max_depth:
            return Node(None,None,None,None,np.mean(y))
        best_thresh,best_feat = self.best_split(x,y)
        if best_thresh is None:
            return Node(None,None,None,None,np.mean(y))
        # Same partition rule as best_split: `<` goes left, everything else right.
        left_mask = x[:,best_feat] < best_thresh
        right_mask = ~left_mask
        left_node = self.grow(x[left_mask],y[left_mask],depth+1)
        right_node = self.grow(x[right_mask],y[right_mask],depth+1)
        return Node(best_thresh,best_feat,left_node,right_node,None)

    def traverse(self,x,node):
        """Return the leaf value reached by sample ``x`` starting at ``node``."""
        # Walk down until a leaf (a node with a value) is reached.
        while node.value is None:
            node = node.left if x[node.feature]<node.threshold else node.right
        return node.value

    def predict(self,X):
        """Predict a target for every row of ``X``.

        Raises
        ------
        AttributeError
            If the model has not been fitted yet.
        """
        if self.root is None:
            raise AttributeError(
                "decisionTreeRegressor is not fitted yet; call fit() before predict()"
            )
        return np.array([self.traverse(x,self.root) for x in np.asarray(X)])

    def fit(self,X,Y):
        """Grow the tree from features ``X`` and targets ``Y``.

        Inputs are converted with ``np.asarray`` so array-likes such as nested
        lists are accepted. Returns ``self`` so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` and ``Y`` have a different number of rows.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        if len(X) != len(Y):
            raise ValueError(
                f"X and Y must have the same number of rows, got {len(X)} and {len(Y)}"
            )
        self.root = self.grow(X,Y,0)
        return self
    

        


                    



In [205]:
# Depth-limited tree: growth stops at depth 9, and nodes with fewer than
# 10 samples are turned into leaves.
model = decisionTreeRegressor(max_depth=9, max_leaf_samp=10)

In [206]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=33,
)

In [207]:
# Grow the tree on the training portion only.
model.fit(X=x_train, Y=y_train)

In [216]:
# Predictions for the held-out rows, used by the metrics further down.
pred = model.predict(X=x_test)

In [208]:
from sklearn.metrics import root_mean_squared_error,r2_score

In [209]:
# rm = 100
# rm_list = []
# dep = None
# n_sam = None
# for i in range(1,11):
#     for j in range(1,11):
#         model_test = decisionTreeRegressor(i,j)
#         model_test.fit(x_train,y_train)
#         pred = model_test.predict(x_test)
#         curr_rm = root_mean_squared_error(y_test,pred)
#         rm_list.append(curr_rm)
#         if rm>curr_rm:
#             rm = curr_rm
#             dep = i
#             n_sam = j
# print("min rmse :",rm)            
# print("rm list:\n",rm_list)
# print("depth:",dep)
# print("n samples per leaf:",n_sam)

            


In [217]:
# Report hold-out error and goodness of fit for the predictions computed above.
for label, metric in (("rmse:", root_mean_squared_error), ("r2 score:", r2_score)):
    print(label, metric(y_test, pred))

rmse: 0.6156932174917196
r2 score: 0.7211987733914427


In [None]:
def print_tree(node, depth=0):
    """Print the tree rooted at ``node`` as an indented outline.

    Each level is indented by two spaces, starting from ``depth``. Internal
    nodes print their split rule; leaves print their stored value. The left
    subtree is printed before the right one.
    """
    # Helper adapted from a ChatGPT suggestion.
    # Explicit stack instead of recursion; the right child is pushed first so
    # the left child is popped (and printed) first.
    pending = [(node, depth)]
    while pending:
        current, level = pending.pop()
        pad = "  " * level
        if current.value is None:
            print(f"{pad}X[{current.feature}] < {current.threshold}")
            pending.append((current.right, level + 1))
            pending.append((current.left, level + 1))
        else:
            print(f"{pad}Leaf: value = {current.value}")
