## **Lab 05: Decision Trees (Entropy, Gini, and scikit-learn)**

DECISION TREE USING ENTROPY

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
# Location of the diabetes CSV on the mounted Drive.
DATASET_CSV = '/content/drive/MyDrive/My_ML_Labs/Lab_5/Diabetes_Dataset.csv'

# Read the CSV into a DataFrame and preview the first rows.
dataset = pd.read_csv(DATASET_CSV)
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
def entropy(y):
    """Return the Shannon entropy, in bits, of the class labels in ``y``.

    Parameters
    ----------
    y : array-like
        Class labels. Any values ``np.unique`` can sort are accepted
        (integers, negative integers, floats, strings), not only the
        non-negative integers that ``np.bincount`` requires.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed classes; ``0.0`` for an
        empty input.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        # No samples means no uncertainty; also avoids dividing by zero.
        return 0.0

    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / labels.size
    # Every count returned by np.unique is >= 1, so log2 never sees a zero.
    return -np.sum(probabilities * np.log2(probabilities))

In [7]:
def split_dataset(X, y, feature, threshold):
    """Partition ``X`` and ``y`` on a single feature threshold.

    Rows whose value in column ``feature`` is ``<= threshold`` go to the
    left partition; all remaining rows go to the right partition.

    Returns
    -------
    tuple
        ``(X_left, y_left, X_right, y_right)``.
    """
    goes_left = X[:, feature] <= threshold
    goes_right = ~goes_left

    left_part = (X[goes_left], y[goes_left])
    right_part = (X[goes_right], y[goes_right])
    return (*left_part, *right_part)

In [8]:
def best_split(X, y, criterion='entropy'):
    """Find the (feature, threshold) pair with the largest impurity reduction.

    Every unique value of every column of ``X`` is tried as a ``<=``
    threshold. Candidates that leave one side empty are skipped.

    Parameters
    ----------
    X : 2-D array of feature values (rows are samples).
    y : 1-D array of labels aligned with the rows of ``X``.
    criterion : 'entropy' selects ``entropy``; any other value selects ``gini``.

    Returns
    -------
    tuple
        ``(feature_index, threshold)``, or ``(None, None)`` when no candidate
        yields a strictly positive gain.
    """
    # The conditional expression looks up `gini` only when it is actually
    # requested, so the entropy criterion works even if `gini` is not defined yet.
    impurity = entropy if criterion == 'entropy' else gini

    n_samples = len(y)
    parent_impurity = impurity(y)

    top_gain = 0
    chosen = (None, None)

    for col in range(X.shape[1]):
        for cut in np.unique(X[:, col]):
            _, y_lo, _, y_hi = split_dataset(X, y, col, cut)
            n_lo, n_hi = len(y_lo), len(y_hi)
            if n_lo == 0 or n_hi == 0:
                continue

            # Size-weighted average impurity of the two children.
            children_impurity = (n_lo / n_samples * impurity(y_lo)) + (n_hi / n_samples * impurity(y_hi))
            gain = parent_impurity - children_impurity

            # Strict comparison: the first candidate reaching a given gain wins.
            if gain > top_gain:
                top_gain = gain
                chosen = (col, cut)

    return chosen

In [9]:
def build_tree(X, y, depth=0, max_depth=5, criterion='entropy'):
    """Recursively grow a decision tree.

    A node becomes a leaf (the most common label in ``y``) when all labels
    are identical, when ``depth`` has reached ``max_depth``, or when
    ``best_split`` finds no useful split. Otherwise the data is split and
    both halves are grown one level deeper.

    Returns
    -------
    A leaf label, or a dict with keys ``"feature"``, ``"threshold"``,
    ``"left"`` and ``"right"`` where the last two are subtrees.
    """
    def majority_label():
        # Most frequent label among the samples that reached this node.
        return Counter(y).most_common(1)[0][0]

    is_pure = len(set(y)) == 1
    if is_pure or depth >= max_depth:
        return majority_label()

    feature, threshold = best_split(X, y, criterion)
    if feature is None:
        return majority_label()

    X_lo, y_lo, X_hi, y_hi = split_dataset(X, y, feature, threshold)

    next_depth = depth + 1
    node = {"feature": feature, "threshold": threshold}
    node["left"] = build_tree(X_lo, y_lo, next_depth, max_depth, criterion)
    node["right"] = build_tree(X_hi, y_hi, next_depth, max_depth, criterion)
    return node

In [10]:
def predict(tree, sample):
    """Return the label the tree assigns to a single sample.

    Starting at ``tree``, walk down through dict nodes: go left when
    ``sample[feature] <= threshold``, otherwise right. The first non-dict
    value reached is the leaf label and is returned as-is.
    """
    node = tree
    while isinstance(node, dict):
        feature, threshold = node["feature"], node["threshold"]
        branch = "left" if sample[feature] <= threshold else "right"
        node = node[branch]
    return node

In [12]:
# Separate predictors from the target. The CSV carries a saved row-index column
# ('Unnamed: 0'); it holds no information about the patient, so it must not be
# offered to the trees as a feature. errors='ignore' keeps this cell working if
# that column is absent from the loaded frame.
X = dataset.drop(columns=['Outcome', 'Unnamed: 0'], errors='ignore')
y = dataset['Outcome']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# The hand-written tree indexes with X[:, feature], so convert the pandas
# objects to plain NumPy arrays first.
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()
X_test_np = X_test.to_numpy()
y_test_np = y_test.to_numpy()

# Grow the tree with the entropy criterion, then classify each test row.
tree_entropy = build_tree(X=X_train_np, y=y_train_np, criterion='entropy')
y_pred_entropy = [predict(tree_entropy, row) for row in X_test_np]

In [14]:
# Confusion matrix and per-class metrics for the entropy-based tree on the test set.
print("Entropy Decision Tree Performance:")
print(confusion_matrix(y_true=y_test_np, y_pred=y_pred_entropy))
print(classification_report(y_true=y_test_np, y_pred=y_pred_entropy))

Entropy Decision Tree Performance:
[[83 16]
 [17 38]]
              precision    recall  f1-score   support

           0       0.83      0.84      0.83        99
           1       0.70      0.69      0.70        55

    accuracy                           0.79       154
   macro avg       0.77      0.76      0.77       154
weighted avg       0.78      0.79      0.79       154



DECISION TREE USING GINI

In [15]:
def gini(y):
    """Return the Gini impurity of the class labels in ``y``.

    Parameters
    ----------
    y : array-like
        Class labels. Any values ``np.unique`` can sort are accepted
        (integers, negative integers, floats, strings), not only the
        non-negative integers that ``np.bincount`` requires.

    Returns
    -------
    float
        ``1 - sum(p ** 2)`` over the observed classes; ``0.0`` for an empty
        input, since an empty set contains no mixed labels.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        # Nothing to be impure about; also avoids dividing by zero.
        return 0.0

    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / labels.size
    return 1 - np.sum(probabilities ** 2)

In [16]:
# Grow a second tree on the same training arrays using the Gini criterion,
# then classify each test row.
tree_gini = build_tree(X=X_train_np, y=y_train_np, criterion='gini')
y_pred_gini = [predict(tree_gini, row) for row in X_test_np]

In [17]:
# Confusion matrix and per-class metrics for the Gini-based tree on the test set.
print("\nGini Decision Tree Performance:")
print(confusion_matrix(y_true=y_test_np, y_pred=y_pred_gini))
print(classification_report(y_true=y_test_np, y_pred=y_pred_gini))


Gini Decision Tree Performance:
[[88 11]
 [20 35]]
              precision    recall  f1-score   support

           0       0.81      0.89      0.85        99
           1       0.76      0.64      0.69        55

    accuracy                           0.80       154
   macro avg       0.79      0.76      0.77       154
weighted avg       0.80      0.80      0.79       154



DECISION TREE USING SCIKIT LEARN

In [18]:
# Reference model: scikit-learn's tree with the same criterion and depth limit
# as the hand-written entropy tree.
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)
clf.fit(X=X_train, y=y_train)
y_pred_sklearn = clf.predict(X=X_test)

print("\nScikit-Learn Decision Tree Performance:")
print(confusion_matrix(y_true=y_test, y_pred=y_pred_sklearn))
print(classification_report(y_true=y_test, y_pred=y_pred_sklearn))



Scikit-Learn Decision Tree Performance:
[[83 16]
 [17 38]]
              precision    recall  f1-score   support

           0       0.83      0.84      0.83        99
           1       0.70      0.69      0.70        55

    accuracy                           0.79       154
   macro avg       0.77      0.76      0.77       154
weighted avg       0.78      0.79      0.79       154



In [19]:
# Restrict tree growth by capping the depth at 4 (Gini criterion).
clf_depth = DecisionTreeClassifier(criterion="gini", max_depth=4, random_state=42)
clf_depth.fit(X=X_train, y=y_train)

# Classify the held-out rows.
y_pred_depth = clf_depth.predict(X=X_test)

# Fraction of test rows classified correctly.
accuracy_depth = accuracy_score(y_true=y_test, y_pred=y_pred_depth)
print("Accuracy using Max Depth Restriction (Depth=4):", accuracy_depth)

Accuracy using Max Depth Restriction (Depth=4): 0.6948051948051948


In [20]:
# Restrict tree growth by requiring at least 10 samples before a node may split
# (Gini criterion, no depth cap).
clf_min_samples = DecisionTreeClassifier(criterion="gini", min_samples_split=10, random_state=42)
clf_min_samples.fit(X=X_train, y=y_train)

# Classify the held-out rows.
y_pred_min_samples = clf_min_samples.predict(X=X_test)

# Fraction of test rows classified correctly.
accuracy_min_samples = accuracy_score(y_true=y_test, y_pred=y_pred_min_samples)
print("Accuracy using Min Samples Split Restriction (Min Samples=10):", accuracy_min_samples)

Accuracy using Min Samples Split Restriction (Min Samples=10): 0.7597402597402597
