In [1]:
import numpy as np
import pandas as pd
from preprocessing import Preprocessor
from asserts import asserts

# Load the raw dataset from the CSV file (path is relative to the notebook's working directory).
df = pd.read_csv('./data/student_habits_performance.csv')
# Features: every column except the first and the last.
# NOTE(review): the first column is presumably a row/student identifier — confirm against the CSV.
X = df.iloc[:, 1: -1]
# Target: the last column of the file.
y = df.iloc[:, -1]



In [2]:
# Project-local preprocessing helper; this same instance is reused further down
# (preprocess(...) and transform_input(...)), so it must be created before those cells run.
preprocessor = Preprocessor()

In [3]:
# Run the project's preprocessing on the raw features/target and unpack the four results
# as train/test features and train/test targets.
# NOTE(review): the exact transformations and split ratio live in preprocessing.Preprocessor — not visible here.
X_train, X_test, y_train, y_test = preprocessor.preprocess(X, y)

In [4]:
import numpy as np

class TreeNode():
    """A single node of the regression tree.

    An internal node carries a split (``feature_index``, ``threshold``), the
    variance reduction achieved by that split, and its two children.
    A leaf carries only ``value`` (the prediction); every other field stays
    ``None``. Code walking the tree distinguishes the two cases by checking
    whether ``value`` is ``None``.
    """

    def __init__(self, left=None, right=None, feature_index=None, threshold=None, reduction_in_var=None, value=None):
        # Children: subtree for rows with feature <= threshold, and the rest.
        self.left, self.right = left, right
        # Split definition (internal nodes only).
        self.feature_index, self.threshold = feature_index, threshold
        # Split quality (internal nodes) and prediction (leaf nodes only).
        self.reduction_in_var, self.value = reduction_in_var, value

class DecisionTreeRegressor():
    """Regression tree grown greedily by minimising weighted child variance.

    Every public method converts its inputs with ``np.asarray`` so that pandas
    DataFrames/Series and plain lists are handled positionally. Without that,
    a Series target whose index was shuffled (e.g. by a train/test split) is
    indexed by *label* in ``y[indices]`` and the targets end up mis-aligned or
    raise ``KeyError``, and DataFrame features fail on ``X[:, i]``.

    Parameters
    ----------
    min_samples_split : int, default 2
        A node holding fewer samples than this becomes a leaf.
    max_depth : int, default 5
        A node at this depth (root is depth 0) becomes a leaf.
    """

    def __init__(self, min_samples_split=2, max_depth=5):
        self.root = None  # set by fit(); None means "not fitted"
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def get_best_split(self, X, y):
        """Find the (feature, threshold) pair with the lowest weighted child variance.

        Candidate thresholds are the midpoints between consecutive sorted
        values of each feature. A candidate is ignored unless both children
        receive at least two samples.

        Returns
        -------
        tuple
            ``(feature_index, threshold, reduction_in_var)`` for the best
            split, or ``(None, None, None)`` when no candidate is valid.
        """
        # No-op for ndarrays; makes all indexing below positional for pandas input.
        X = np.asarray(X)
        y = np.asarray(y)

        n_samples, n_features = X.shape
        best_weighted_child_var = float('inf')
        best_feature_index = None
        best_threshold = None

        for i in range(n_features):
            column = X[:, i]
            sorted_col_indices = column.argsort()
            sorted_col = column[sorted_col_indices]
            sorted_y = y[sorted_col_indices]

            for j in range(n_samples - 1):
                threshold = (sorted_col[j] + sorted_col[j + 1]) / 2
                left_y = sorted_y[sorted_col <= threshold]
                right_y = sorted_y[sorted_col > threshold]

                # Require at least two samples on each side of the split.
                if len(left_y) < 2 or len(right_y) < 2:
                    continue

                weighted_left_var = np.var(left_y) * len(left_y) / n_samples
                weighted_right_var = np.var(right_y) * len(right_y) / n_samples
                weighted_child_var = weighted_left_var + weighted_right_var

                # Strict '<' keeps the first candidate found among ties.
                if weighted_child_var < best_weighted_child_var:
                    best_weighted_child_var = weighted_child_var
                    best_feature_index = i
                    best_threshold = threshold

        if best_feature_index is None:  # no valid split found
            return None, None, None

        reduction_in_var = np.var(y) - best_weighted_child_var
        return best_feature_index, best_threshold, reduction_in_var

    def build_tree(self, X, y, depth=0):
        """Recursively grow the tree for the samples ``(X, y)``.

        Returns a leaf holding ``mean(y)`` when the node has fewer than
        ``min_samples_split`` samples, when ``depth`` has reached
        ``max_depth``, or when no valid split exists. Otherwise returns an
        internal node whose left subtree receives the rows with
        ``feature <= threshold`` and whose right subtree receives the rest.
        """
        # No-op on recursive calls (already ndarrays); protects direct callers.
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]

        if n_samples < self.min_samples_split or depth >= self.max_depth:
            return TreeNode(value=np.mean(y))

        feature_index, threshold, reduction_in_var = self.get_best_split(X, y)
        if feature_index is None:  # no valid split found
            return TreeNode(value=np.mean(y))

        left_mask = X[:, feature_index] <= threshold
        right_mask = ~left_mask

        X_left, y_left = X[left_mask], y[left_mask]
        X_right, y_right = X[right_mask], y[right_mask]

        left_child = self.build_tree(X_left, y_left, depth + 1)
        right_child = self.build_tree(X_right, y_right, depth + 1)

        return TreeNode(left_child, right_child, feature_index, threshold, reduction_in_var)

    def fit(self, X_train, y_train):
        """Validate the training data with the project's ``asserts`` helper, then grow the tree.

        The fitted tree is stored in ``self.root``.
        """
        asserts(X_train, y_train)

        # Strip any pandas index before building so sample/target alignment is positional.
        self.root = self.build_tree(np.asarray(X_train), np.asarray(y_train))

    def predict(self, X_test):
        """Predict one value per row of ``X_test`` by walking from the root to a leaf.

        Raises
        ------
        ValueError
            If the tree has not been fitted.
        """
        if self.root is None:
            raise ValueError("The tree is not fitted yet")

        X_test = np.asarray(X_test)
        preds = np.zeros(len(X_test))
        for i in range(len(X_test)):
            node = self.root
            # Internal nodes have value None; leaves carry the prediction.
            while node.value is None:
                if X_test[i, node.feature_index] <= node.threshold:
                    node = node.left
                else:
                    node = node.right
            preds[i] = node.value
        return preds

    def score(self, X_test, y_test):
        """Return R² = 1 - RSS/TSS of the predictions on ``X_test``.

        Returns 0.0 when ``y_test`` is constant (TSS == 0), where R² is undefined.
        """
        y_test = np.asarray(y_test)
        y_pred = self.predict(X_test)
        rss = np.sum((y_test - y_pred) ** 2)
        tss = np.sum((y_test - np.mean(y_test)) ** 2)
        return 1 - (rss / tss) if tss != 0 else 0.0

In [5]:
# Train the from-scratch tree with its default hyperparameters (min_samples_split=2, max_depth=5)
# and display its R² on the held-out split as the cell output.
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)
regressor.score(X_test, y_test)

TypeError: '<' not supported between instances of 'OneHotEncoder' and 'OneHotEncoder'

In [None]:
# Hand-written raw rows in the dataset's feature layout (one list per student).
# Named `sample_rows` rather than `input` so the built-in input() is not shadowed
# for the rest of the kernel session.
sample_rows = [
    [23, 'Female', 0.0, 1.2, 1.1, 'No', 85.0, 8.0, 'Fair', 6, 'Master', 'Average', 8, 'Yes'],
    [20, 'Female', 6.9, 2.8, 2.3, 'No', 97.3, 4.6, 'Good', 6, 'High School', 'Average', 8, 'No'],
    [21, 'Male', 1.4, 3.1, 1.3, 'No', 94.8, 8.0, 'Poor', 1, 'High School', 'Poor', 1, 'No'],
    [23, 'Female', 1.0, 3.9, 1.0, 'No', 71.0, 9.2, 'Poor', 4, 'Master', 'Good', 1, 'Yes'],
    [19, 'Female', 5.0, 4.4, 0.5, 'No', 90.9, 4.9, 'Fair', 3, 'Master', 'Good', 1, 'No'],
    [24, 'Male', 7.2, 1.3, 0.0, 'No', 82.9, 7.4, 'Fair', 1, 'Master', 'Average', 4, 'No'],
    [21, 'Female', 5.6, 1.5, 1.4, 'Yes', 85.8, 6.5, 'Good', 2, 'Master', 'Poor', 4, 'No'],
    [21, 'Female', 4.3, 1.0, 2.0, 'Yes', 77.7, 4.6, 'Fair', 0, 'Bachelor', 'Average', 8, 'No'],
    [23, 'Female', 4.4, 2.2, 1.7, 'No', 100.0, 7.1, 'Good', 3, 'Bachelor', 'Good', 1, 'No'],
    [18, 'Female', 4.8, 3.1, 1.3, 'No', 95.4, 7.5, 'Good', 5, 'Bachelor', 'Good', 10, 'Yes'],
    [19, 'Female', 4.6, 3.7, 0.8, 'No', 77.6, 5.8, 'Fair', 1, 'None', 'Good', 3, 'No'],
]

# Keep raw and transformed rows under separate names so re-running this cell
# never feeds already-transformed data back into transform_input.
sample_rows_transformed = preprocessor.transform_input(sample_rows)

# NOTE(review): the transformed sample rows are not passed to the model here;
# `my_pred` holds predictions on X_test, which is what the comparison against
# scikit-learn further down uses.
my_pred = regressor.predict(X_test)



In [None]:
from sklearn.metrics import r2_score
# Cross-check: R² of the custom tree's test-set predictions as computed by scikit-learn,
# for comparison with the value returned by regressor.score(X_test, y_test).
r2_score(y_test, regressor.predict(X_test))

0.7519831168449562

In [None]:
# Import the submodule explicitly: a bare `import sklearn` is not guaranteed to
# make `sklearn.tree` available as an attribute on every scikit-learn version.
import sklearn.tree

# scikit-learn baseline. random_state is fixed because the estimator permutes
# features when searching for splits, so unseeded runs can give different scores.
# NOTE(review): this baseline uses scikit-learn's default depth settings, while the
# custom tree above defaults to max_depth=5 — the two scores are not like-for-like.
dtr = sklearn.tree.DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)
dtr.score(X_test, y_test)

0.7000335660050903

In [None]:
# scikit-learn baseline predictions on the same test rows used for `my_pred`.
sklearn_pred = dtr.predict(X_test)

In [None]:
# Largest absolute gap (in target units) at which two predictions still count as
# "in agreement". One constant drives both the comparison and the printed
# message so the two cannot drift apart.
TOLERANCE = 10

# Element-wise gap between the custom tree's and scikit-learn's predictions.
abs_diff = np.abs(np.asarray(my_pred) - np.asarray(sklearn_pred))
count = int(np.sum(abs_diff <= TOLERANCE))

print(f"Number of predictions within {TOLERANCE} distance: {count}")
# Fraction of test rows on which the two models agree within TOLERANCE.
print(count/len(my_pred))

Number of predictions within 15 distance: 210
0.84
