In [15]:
import numpy as np
import pandas as pd
from collections import Counter

In [9]:
#creating sample data
X = np.random.rand(10, 4) * 10; #10 samples with 4 features
y = np.random.randint(0, 2, size=10) #2 labels of 10 samples

In [95]:
def euclidean_distance(a, b):
    return np.sqrt(np.sum((a-b)**2))

class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, x, y):
        self.x_train = x
        self.y_train = y

    def predict(self, x):
        #compute distance
        distances = [euclidean_distance(x, x_train) for x_train in self.x_train]

        #get k neighbors
        k_indices = np.argsort(distances)[:self.k]

        # majority vote
        k_neigbors_label = [self.y_train[i] for i in k_indices]
        most_common = Counter(k_neigbors_label).most_common(1)[0][0]
        return most_common

    def _predict(self, test):
        return [self.predict(x) for x in test]
        

In [109]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=42)
knn = KNN()
knn.fit(X_train, y_train)
y_pred = knn._predict(X_test)

accuracy = sum(1 for a, b in zip(y_pred, y_test) if a == b) / len(y_test)
print("Train-Test Split Accuracy:", accuracy)

Train-Test Split Accuracy: 1.0


In [112]:
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()

for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    knn.fit(X_train, y_train)
    y_pred = knn._predict(X_test)

    accuracies.append(int(y_pred[0] == y_test[0]))

print("LOOCV Accuracy:", sum(accuracies) / len(accuracies))

LOOCV Accuracy: 0.9600000000000001


In [114]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    knn.fit(X_train, y_train)
    y_pred = knn._predict(X_test)

    acc = sum(1 for a, b in zip(y_pred, y_test) if a == b) / len(y_test)
    accuracies.append(acc)

print("5-Fold Cross Validation Accuracy:", sum(accuracies) / len(accuracies))

5-Fold Cross Validation Accuracy: 0.9602083333333334


In [55]:
def entropy(feature):
    counts = Counter(feature) #frequency of each label
    total = len(feature)
    entropy_ = 0.0
    for c in counts.values():
        p = c/total
        if p > 0:
            entropy_ -= p * np.log2(p)
    return entropy_

'''
def info_gain(X_column, y, split_value=None): #X_column are the values of a feature, y is the label of each value, and split_value defines whether the feature is categorical or numerical
    parent_entropy = entropy(y)
    n = len(y)
    total_entropy = 0.0
    if split_value is None: #categorical feature
        values = set(X_column) #it gives us the unique values eg feature has_job=['yes', 'no', 'no', 'yes', 'yes'] ---> with set: values=['yes', 'no'], removes duplicates
        for v in values:
            y_subset = [y[i] for i in range(len(y)) if X_column[i] == v] 
            total_entropy += (len(y_subset)/n) * entropy(y_subset)
    else: #binary split when split_value is given
        if isinstance(split_value, (int, float)): #if true then numerical feature
            left_y = [y[i] for i in range(n) if X_column[i] <= split_value]
            right_y = [y[i] for i in range(n) if X_column[i] > split_value]
        else: #catagorical value
            left_y = [y[i] for i in range(n) if X_column[i] == split_value]
            right_y = [y[i] for i in range(n) if X_column[i] != split_value]

        #if one side is empty
        if len(left_y) == 0 or len(right_y) == 0:
            return 0.0
            
        total_entropy += ((len(left_y)/n) * entropy(left_y)) + ((len(right_y)/n) * entropy(right_y))

    return parent_entropy - total_entropy
'''

def info_gain(X_column, y, split_value):
    parent_entropy = entropy(y)
    n = len(y)

    if isinstance(split_value, (int, float)):
        # numeric split
        left_y = [y[i] for i in range(n) if X_column[i] <= split_value]
        right_y = [y[i] for i in range(n) if X_column[i] > split_value]
    else:
        # categorical split
        left_y = [y[i] for i in range(n) if X_column[i] == split_value]
        right_y = [y[i] for i in range(n) if X_column[i] != split_value]

    # if one branch is empty, no information gain
    if len(left_y) == 0 or len(right_y) == 0:
        return 0.0

    weighted_entropy = (len(left_y)/n) * entropy(left_y) + (len(right_y)/n) * entropy(right_y)
    return parent_entropy - weighted_entropy


#  -----Decision Tree-----
class Node:
    def __init__(self, feature=None, threshold=None, right=None, left=None, *, value=None):
        self.right = right
        self.left = left
        self.value = value
        self.feature = feature
        self.threshold = threshold

    def is_leaf(self):
        return self.value is not None


class DecisionTree:
    def __init__(self, max_depth=5):
        self.max_depth = max_depth
        self.root = None

    def fit(self, x, y):
        self.root = self.grow_tree(x, y)

    '''
    def grow_tree(self, x, y, depth=0): #recursive function
        #if empty dataset
        if len(x) == 0:
            return Node(value=None)
        
        #stopping condition
        if len(set(y)) == 1: #all labels are the same i.e pure node, one unique label
            return Node(value=y[0]) 

        num_features = len(x[0]) if len(x) > 0 else 0
        if depth >= self.max_depth or num_features == 0: #if we reached max_depth or no more features to split
            return Node(value=Counter(y).most_common(1)[0][0]) #returning leaf node with the majority class

        # ---- choosing best feature to split -----
        best_gain = -1.0
        split_index, split_threshold = None, None

         #split_index --> stores the index of the feature ;feature on which split has to be done eg:has job splits into yes and no
         #Example: if your data x = [[Age, Color, Student], …], and the best split is on "Color", then split_index = 1 (the column number).
         #split_threshold --> stores actual value of the feature on which we are splitting
         #Example: If the feature is numeric (e.g., Age=30), then threshold is that number → split into ≤30 vs >30.
                   If the feature is categorical (e.g., Color="Red"), then threshold is that category → split into "Red" vs "Not Red".

        for feature in range(num_features):
            #col = [row[feature] for row in x] --> list of features
            #value = set(col) --> values of each features
            values = set(row[feature] for row in x) #equivalent to value=sorted(set(col)
            for val in values:
                gain = info_gain([row[feature] for row in x], y, val if isinstance(val, (int, float)) else val) #check
                if gain > best_gain:
                    best_gain = gain
                    split_index = feature
                    split_threshold = val
                    
        if best_gain <= 0:
            return Node(value=Counter(y).most_common(1)[0][0])

        #splitting dataset
        if isinstance(split_threshold, (int, float)):
            left_index = [i for i in range(len(x)) if x[i][split_index] <= split_threshold]
            right_index = [i for i in range(len(x)) if x[i][split_index] > split_threshold]

        else: #categorical
            left_index = [i for i in range(len(x)) if x[i][split_index] == split_threshold]
            right_index = [i for i in range(len(x)) if x[i][split_index] != split_threshold]

        left = self.grow_tree([x[i] for i in left_index], [y[i] for i in left_index], depth+1)
        right = self.grow_tree([x[i] for i in right_index], [y[i] for i in right_index], depth+1)

        #this is equivalent to the above left and right recursive calls
        #X_left  = [X[i] for i in left_idx]
        #y_left  = [y[i] for i in left_idx]
        #X_right = [X[i] for i in right_idx]
        #y_right = [y[i] for i in right_idx]

        # recursive calls
        #left_child  = self.grow_tree(X_left, y_left, depth + 1)
        #right_child = self.grow_tree(X_right, y_right, depth + 1)
        
        return Node(split_index, split_threshold, left, right)

        '''


    def grow_tree(self, x, y, depth=0):
        # base cases
        if len(x) == 0:
            return Node(value=None)
        if len(set(y)) == 1:
            return Node(value=y[0])
        if depth >= self.max_depth:
            return Node(value=Counter(y).most_common(1)[0][0])
    
        num_features = len(x[0])
        best_gain = -1.0
        split_index, split_threshold = None, None
    
        # loop through features
        for feature in range(num_features):
            values = sorted(set(row[feature] for row in x))
    
            # decide candidate thresholds
            if all(isinstance(v, (int, float)) for v in values):
                # numeric -> use midpoints
                candidates = [(values[i] + values[i+1]) / 2 for i in range(len(values)-1)]
            else:
                # categorical -> use unique values
                candidates = values
    
            # test each candidate
            for val in candidates:
                gain = info_gain([row[feature] for row in x], y, val)
                if gain > best_gain:
                    best_gain = gain
                    split_index = feature
                    split_threshold = val
    
        # if no useful split, make leaf
        if best_gain <= 0:
            return Node(value=Counter(y).most_common(1)[0][0])
    
        # split dataset
        if isinstance(split_threshold, (int, float)):
            left_index = [i for i in range(len(x)) if x[i][split_index] <= split_threshold]
            right_index = [i for i in range(len(x)) if x[i][split_index] > split_threshold]
        else:
            left_index = [i for i in range(len(x)) if x[i][split_index] == split_threshold]
            right_index = [i for i in range(len(x)) if x[i][split_index] != split_threshold]
    
        left = self.grow_tree([x[i] for i in left_index], [y[i] for i in left_index], depth+1)
        right = self.grow_tree([x[i] for i in right_index], [y[i] for i in right_index], depth+1)
    
        return Node(feature=split_index, threshold=split_threshold, left=left, right=right)


    def predict_one(self, x):
            node = self.root
            while not node.is_leaf():
                if isinstance(node.threshold, (int, float)):
                    if x[node.feature] <= node.threshold:
                        node = node.left
                    else:
                        node = node.right
                else: #categorical
                    if x[node.feature] == node.threshold:
                        node = node.left
                    else:
                        node = node.right
            return node.value

    def predict(self, X):
            return [self.predict_one(x) for x in X]
                

In [61]:
# Small dataset: [Age, Has_Job, Owns_House] -> Buy?
X = [
    [25, "No", "No"],
    [30, "Yes", "No"],
    [40, "Yes", "Yes"],
    [35, "Yes", "Yes"],
    [22, "No", "No"]
]
y = ["No", "Yes", "Yes", "Yes", "No"]

tree = DecisionTree(max_depth=3)
tree.fit(X, y)

print(tree.predict([[28, "Yes", "No"], [23, "No", "No"]]))
# Output: ['Yes', 'No']


['Yes', 'No']


In [71]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, LeaveOneOut
import pandas as pd

# load iris
iris = load_iris()
X = iris.data        # numeric features
y = iris.target      # labels (0,1,2)

# convert to list of lists (since your tree uses Python lists)
X = X.tolist()
y = y.tolist()

# train your custom tree
tree = DecisionTree(max_depth=3)
tree.fit(X, y)

# predict a few samples
print(tree.predict(X[30:]))
print("Actual:", y[30:])


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Actual: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


In [81]:
from sklearn.model_selection import train_test_split
iris = load_iris()
x = iris.data
y = iris.target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=42)

tree = DecisionTree(max_depth=3)
tree.fit(x_train.tolist(), y_train.tolist())

#predict
y_pred = tree.predict(x_test.tolist())

accuracy = sum(1 for a, b in zip(y_pred, y_test) if a == b) / len(y_test)
print("Train-Test Split Accuracy:", accuracy)

print(y_pred)
print("Actual:", y_test[:])

Train-Test Split Accuracy: 0.9666666666666667
[1, 0, 2, 1, 2, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]
Actual: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]


In [87]:
from sklearn.model_selection import LeaveOneOut

X = iris.data
Y = iris.target

loo = LeaveOneOut()
accuracies = []

for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    tree = DecisionTree(max_depth=3)
    tree.fit(X_train.tolist(), Y_train.tolist())
    y_pred = tree.predict(X_test.tolist())
    accuracies.append(int(y_pred[0] == Y_test[0]))

print("LOOCV Accuracy:", sum(accuracies) / len(accuracies))

LOOCV Accuracy: 0.9533333333333334


In [89]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    tree = DecisionTree(max_depth=3)
    tree.fit(X_train.tolist(), y_train.tolist())

    y_pred = tree.predict(X_test.tolist())
    
    acc = sum(1 for a, b in zip(y_pred, y_test) if a == b) / len(y_test)
    accuracies.append(acc)

print("5-Fold Cross Validation Accuracy:", sum(accuracies) / len(accuracies))

5-Fold Cross Validation Accuracy: 0.96
