In [28]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [55]:
import numpy as np
from collections import Counter
import pandas as pd

class Node:
    """A single vertex of the decision tree.

    A decision vertex is created with ``feature``, ``threshold``, ``left`` and
    ``right``; a leaf is created with the keyword-only ``value`` argument.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Split rule: column index and cut-off used to route a sample.
        self.feature, self.threshold = feature, threshold
        # Subtrees reached when the sample is <= threshold (left) or > threshold (right).
        self.left, self.right = left, right
        # Prediction stored at a leaf; stays None on decision vertices.
        self.value = value

    def is_leaf_node(self):
        """Return True when this node stores a prediction value."""
        if self.value is None:
            return False
        return True

class DecisionTree:
    """Binary classification tree grown greedily on entropy-based information gain.

    Parameters
    ----------
    min_samples_split : int, default 2
        A node holding fewer samples than this becomes a leaf.
    max_depth : int, default 100
        Nodes reaching this depth become leaves.
    n_feature : int or None, default None
        Number of features drawn (without replacement) as split candidates at
        each node. A falsy value means "all features". ``fit`` overwrites this
        attribute with the resolved count.
    random_state : int or None, default None
        Seed for the per-node feature draw. ``None`` keeps drawing from NumPy's
        global ``np.random`` state, so ``np.random.seed`` still applies.

    Notes
    -----
    Class labels must be non-negative integers because class counts are
    computed with ``np.bincount``.
    """

    def __init__(self, min_samples_split=2, max_depth=100, n_feature=None, random_state=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feature = n_feature
        self.random_state = random_state
        self.root = None
        # Source of the feature draw; replaced in fit() when a seed is given.
        self._rng = np.random

    def fit(self, X, Y):
        """Grow the tree from a 2-D feature matrix ``X`` and integer labels ``Y``.

        Inputs are converted to NumPy arrays first: the tree indexes rows by
        position, and a pandas Series with an integer index would treat those
        positions as labels instead.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``X`` and ``Y`` have different lengths.
        """
        X = np.asarray(X)
        Y = np.asarray(Y).ravel()
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if X.shape[0] != Y.shape[0]:
            raise ValueError(
                f"X and Y must have the same number of samples, got {X.shape[0]} and {Y.shape[0]}"
            )

        self.n_feature = X.shape[1] if not self.n_feature else min(X.shape[1], self.n_feature)
        if self.random_state is None:
            self._rng = np.random
        else:
            self._rng = np.random.RandomState(self.random_state)
        self.root = self.grow_tree(X, Y)

    def grow_tree(self, X, Y, depth=0):
        """Recursively build and return the subtree for the samples ``X``/``Y``."""
        X = np.asarray(X)
        Y = np.asarray(Y)
        n_samples, n_features = X.shape
        n_labels = len(np.unique(Y))

        # Stopping criteria: depth limit, pure node, or too few samples to split.
        if depth >= self.max_depth or n_labels == 1 or n_samples < self.min_samples_split:
            return Node(value=self.most_common_label(Y))

        # Candidate features for this node, drawn without replacement.
        feat_indexes = self._rng.choice(n_features, self.n_feature, replace=False)
        best_feature, best_threshold = self.best_split(X, Y, feat_indexes)

        # No candidate feature has two distinct values: nothing left to split on.
        if best_feature is None or best_threshold is None:
            return Node(value=self.most_common_label(Y))

        left_idxs, right_idxs = self.split(X[:, best_feature], best_threshold)
        left = self.grow_tree(X[left_idxs, :], Y[left_idxs], depth + 1)
        right = self.grow_tree(X[right_idxs, :], Y[right_idxs], depth + 1)
        return Node(best_feature, best_threshold, left, right)

    def best_split(self, X, Y, feature_indexes):
        """Return ``(feature_index, threshold)`` with the highest information gain.

        Returns ``(None, None)`` when none of the candidate features has at
        least two distinct values, i.e. no threshold separates the samples.
        """
        X = np.asarray(X)
        best_gain = -1
        split_index, split_threshold = None, None

        for feat_index in feature_indexes:
            X_column = X[:, feat_index]
            # split() sends values <= threshold to the left, so the column
            # maximum would leave the right child empty; skip it. Every other
            # unique value yields two non-empty children.
            for thr in np.unique(X_column)[:-1]:
                gain = self.information_gain(Y, X_column, thr)
                if gain > best_gain:
                    best_gain = gain
                    split_index = feat_index
                    split_threshold = thr

        return split_index, split_threshold

    def information_gain(self, Y, X_column, threshold):
        """Return parent entropy minus the size-weighted entropy of both children.

        Returns 0 when the threshold leaves one side empty.
        """
        Y = np.asarray(Y)
        parent_entropy = self.entropy(Y)
        left_idxs, right_idxs = self.split(X_column, threshold)

        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        n = len(Y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = self.entropy(Y[left_idxs]), self.entropy(Y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r
        return parent_entropy - child_entropy

    def split(self, X_column, splitting_threshold):
        """Return positions with value ``<=`` threshold and positions with value ``>`` it."""
        X_column = np.asarray(X_column)
        left_idxs = np.argwhere(X_column <= splitting_threshold).flatten()
        right_idxs = np.argwhere(X_column > splitting_threshold).flatten()
        return left_idxs, right_idxs

    def entropy(self, Y):
        """Return the Shannon entropy (natural log) of the integer labels ``Y``."""
        counts = np.bincount(Y)
        # Classes with zero count contribute nothing and would produce log(0).
        prob = counts[counts > 0] / len(Y)
        return -np.sum(prob * np.log(prob))

    def most_common_label(self, Y):
        """Return the most frequent label in ``Y``, or 0 when ``Y`` is empty."""
        cnt = Counter(Y)
        if not cnt:
            return 0
        return cnt.most_common(1)[0][0]

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        # Iterating a DataFrame would yield column names, so convert to rows first.
        X = np.asarray(X)
        return np.array([self.traverse_tree(x, self.root) for x in X])

    def traverse_tree(self, x, node):
        """Follow the split rules from ``node`` down to a leaf and return its value."""
        while not node.is_leaf_node():
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

In [25]:
# Load the iris measurements from a CSV file into a DataFrame.
iris_df = pd.read_csv('/content/iris.csv')
# NOTE(review): only the last expression of a cell is rendered, so this head()
# call produces no visible output here; only tail() below is displayed.
iris_df.head()
iris_df.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [20]:
# (number of rows, number of columns) of the loaded frame.
iris_df.shape

(150, 5)

In [21]:
# Count missing values per column.
iris_df.isna().sum()

Unnamed: 0,0
sepal_length,0
sepal_width,0
petal_length,0
petal_width,0
species,0


In [22]:
# Number of samples per species, to check how balanced the classes are.
iris_df['species'].value_counts()

Unnamed: 0_level_0,count
species,Unnamed: 1_level_1
setosa,50
versicolor,50
virginica,50


In [23]:
# Separate the label column from the measurement columns.
y = iris_df['species']
x = iris_df.drop(columns='species')
print(x)
print(y)

     sepal_length  sepal_width  petal_length  petal_width
0             5.1          3.5           1.4          0.2
1             4.9          3.0           1.4          0.2
2             4.7          3.2           1.3          0.2
3             4.6          3.1           1.5          0.2
4             5.0          3.6           1.4          0.2
..            ...          ...           ...          ...
145           6.7          3.0           5.2          2.3
146           6.3          2.5           5.0          1.9
147           6.5          3.0           5.2          2.0
148           6.2          3.4           5.4          2.3
149           5.9          3.0           5.1          1.8

[150 rows x 4 columns]
0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object


In [26]:
# Map the species names to integer class codes.
encoder = LabelEncoder()
iris_labels = encoder.fit_transform(y)
# Show the encoded labels; printing `y` would show the untouched input instead.
print(iris_labels)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


0 -> iris setosa

1 -> iris versicolor

2 -> iris virginica

In [27]:
# Replace the species column with the integer codes held in iris_labels.
# NOTE(review): this mutates iris_df in place, so any earlier display of the
# frame no longer reflects its current contents.
iris_df['species'] = iris_labels
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


Data Standardization

In [29]:
# Fit the scaler on the feature table and transform it in one step.
# NOTE(review): the scaler is fit on all rows before train_test_split is
# called further down, so the test rows influence the scaling statistics.
scaler = StandardScaler()
standardized_data = scaler.fit_transform(x)
print(standardized_data)

[[-9.00681170e-01  1.03205722e+00 -1.34127240e+00 -1.31297673e+00]
 [-1.14301691e+00 -1.24957601e-01 -1.34127240e+00 -1.31297673e+00]
 [-1.38535265e+00  3.37848329e-01 -1.39813811e+00 -1.31297673e+00]
 [-1.50652052e+00  1.06445364e-01 -1.28440670e+00 -1.31297673e+00]
 [-1.02184904e+00  1.26346019e+00 -1.34127240e+00 -1.31297673e+00]
 [-5.37177559e-01  1.95766909e+00 -1.17067529e+00 -1.05003079e+00]
 [-1.50652052e+00  8.00654259e-01 -1.34127240e+00 -1.18150376e+00]
 [-1.02184904e+00  8.00654259e-01 -1.28440670e+00 -1.31297673e+00]
 [-1.74885626e+00 -3.56360566e-01 -1.34127240e+00 -1.31297673e+00]
 [-1.14301691e+00  1.06445364e-01 -1.28440670e+00 -1.44444970e+00]
 [-5.37177559e-01  1.49486315e+00 -1.28440670e+00 -1.31297673e+00]
 [-1.26418478e+00  8.00654259e-01 -1.22754100e+00 -1.31297673e+00]
 [-1.26418478e+00 -1.24957601e-01 -1.34127240e+00 -1.44444970e+00]
 [-1.87002413e+00 -1.24957601e-01 -1.51186952e+00 -1.44444970e+00]
 [-5.25060772e-02  2.18907205e+00 -1.45500381e+00 -1.31297673e

In [32]:
# Features: the standardized measurement array.
x = standardized_data
# Labels as a plain NumPy array. DecisionTree indexes labels by position
# (Y[left_idxs]); a pandas Series with an integer index would interpret those
# positions as index labels after the shuffled train/test split.
y = iris_df['species'].to_numpy()

In [34]:
# Hold out 20% of the rows for testing; stratify keeps the class proportions
# of y in both parts, and random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=7,
    stratify=y,
)

In [36]:
# Shapes of the full, training and test feature arrays, in that order.
print(*(part.shape for part in (x, x_train, x_test)))

(150, 4) (120, 4) (30, 4)


In [56]:
# DecisionTree draws its per-node feature order from NumPy's global RNG, so
# seed it to make the grown tree (and the predictions below) repeatable.
np.random.seed(7)
model = DecisionTree()
model.fit(x_train, y_train)
test_data_prediction = model.predict(x_test)

In [57]:
# Predicted class codes for the rows of x_test.
print(test_data_prediction)

[2 0 0 1 2 1 2 0 2 2 1 0 0 1 1 1 0 0 1 1 2 0 1 0 2 2 1 1 0 2]


In [58]:
def accuracy(y_test, y_pred):
  """Return the fraction of positions where ``y_pred`` equals ``y_test``.

  Both inputs are converted to flat NumPy arrays first, so lists, arrays and
  pandas Series are compared element by element and by position.

  Returns NaN for empty input.

  Raises:
    ValueError: if the two inputs do not contain the same number of labels.
  """
  y_true = np.asarray(y_test).ravel()
  y_hat = np.asarray(y_pred).ravel()
  if y_true.shape != y_hat.shape:
    raise ValueError(
      f"y_test and y_pred must have the same length, got {y_true.size} and {y_hat.size}"
    )
  if y_true.size == 0:
    # Accuracy over zero samples is undefined.
    return float("nan")
  return float(np.mean(y_true == y_hat))

# Share of test labels that the tree predicted correctly.
acc = accuracy(y_test, test_data_prediction)
print(acc)

0.9666666666666667
