## Decision Tree 🌴 from scratch with full "explanation and logic" (at the bottom⬇️⬇️⬇️⬇️⬇️)

In [31]:
## Import important librarires
import pandas as pd
import numpy as np

In [32]:
## Load the dataset
df=pd.read_csv(r"C:\Users\ASUS\Desktop\cleaned_DataAnalysis\SVMtrain.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,Male,22.0,1,0,7.25,3
1,2,1,1,female,38.0,1,0,71.2833,1
2,3,1,3,female,26.0,0,0,7.925,3
3,4,1,1,female,35.0,1,0,53.1,3
4,5,0,3,Male,35.0,0,0,8.05,3


In [33]:
## Check if there is null values in the dataset
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [34]:
## Select relevant features and drop rows with missing values
df=df[['Survived','Pclass','Sex','Age','Fare','Embarked']].dropna()

In [35]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,Male,22.0,7.25,3
1,1,1,female,38.0,71.2833,1
2,1,3,female,26.0,7.925,3
3,1,1,female,35.0,53.1,3
4,0,3,Male,35.0,8.05,3


In [36]:
## Encode categorical variables
df['Sex']=df['Sex'].map({'Male':0,'Female':1})
df['Embarked']=df['Embarked'].map({'C':0,'Q':1,'S':2})

In [37]:
## Define feature and target
X=df.drop('Survived',axis=1)      ## Drop "Survived" from input features, so that it can be used as target value(Output)
y=df['Survived']

from sklearn.model_selection import train_test_split
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Gini Impurity:-it helps in deciding how to split the data at each node 🤔😲

In [38]:
##1. Gini Impurity
def gini(y):
    classes = np.unique(y)
    impurity = 1
    for cls in classes:
        p = np.sum(y == cls) / len(y)
        impurity -= p ** 2
    return impurity

## Let's split the dataset according to the threshold(used to create the split in the feature at each node) ⛳

In [39]:
## 2.Split the dataset
def split_dataset(X, y, feature_index, threshold):
    left_mask = X.iloc[:, feature_index] <= threshold
    right_mask = X.iloc[:, feature_index] > threshold
    return X[left_mask], y[left_mask], X[right_mask], y[right_mask]

## Now we will iterate through each feature and samples of each feature to find the most "informative feature" and the most "appropriate thresold" 🏃

In [40]:
##3.Find best split
def best_split(X, y):
    m, n = X.shape
    best_gain = 0
    best_index, best_thresh = None, None
    parent_impurity = gini(y)

    for feature_index in range(n):
        thresholds = X.iloc[:, feature_index].unique()
        for threshold in thresholds:
            _, y_left, _, y_right = split_dataset(X, y, feature_index, threshold)
            if len(y_left) == 0 or len(y_right) == 0:
                continue

            p = len(y_left) / len(y)
            gain = parent_impurity - (p * gini(y_left) + (1 - p) * gini(y_right))

            if gain > best_gain:
                best_gain = gain
                best_index = feature_index
                best_thresh = threshold

    return best_index, best_thresh

## Now we will build the tree according to the calculated Gini "Impurity" , "threshold" and "Information gain" 🏃

In [41]:
##4.Build the tree (Recursion)
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value  # Leaf value (class)

def build_tree(X, y, depth=0, max_depth=3):
    num_samples_per_class = [np.sum(y == i) for i in np.unique(y)]
    predicted_class = np.argmax(num_samples_per_class)

    # Stop if pure or max depth
    if len(np.unique(y)) == 1 or depth == max_depth:
        return Node(value=predicted_class)

    feature_idx, threshold = best_split(X, y)
    if feature_idx is None:
        return Node(value=predicted_class)

    X_left, y_left, X_right, y_right = split_dataset(X, y, feature_idx, threshold)

    left_child = build_tree(X_left, y_left, depth + 1, max_depth)
    right_child = build_tree(X_right, y_right, depth + 1, max_depth)
    return Node(feature_idx, threshold, left_child, right_child)

In [42]:
##5.Predictiion
def predict_tree(X, tree):
    if tree.value is not None:
        return tree.value
    if X[tree.feature] <= tree.threshold:
        return predict_tree(X, tree.left)
    else:
        return predict_tree(X, tree.right)

def predict(X, tree):
    return np.array([predict_tree(X, tree) for X in X.values])

In [43]:
## Now evaluate on titanic dataset
tree = build_tree(X_train, y_train, max_depth=3)
y_pred = predict(X_test, tree)

# Accuracy
acc = np.mean(y_pred == y_test)
print("Accuracy:", acc)

Accuracy: 0.6685393258426966


In [44]:
## Hence , the accuracy of our model is 66.85%

## Now let's implement "Confusion Metrics" in our "decision Tree Model" 👍

In [45]:
#NOTE:- we apply confusion metrics because it's crucial for understanding and improving the model performance (Especially when dealing with "Imbalanced Data") 😲 🫡

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build tree
tree_root = build_tree(X_train, y_train, max_depth=3)

# Predictions
y_pred = predict(X_test, tree_root)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6685393258426966
              precision    recall  f1-score   support

           0       0.70      0.81      0.75       109
           1       0.60      0.45      0.51        69

    accuracy                           0.67       178
   macro avg       0.65      0.63      0.63       178
weighted avg       0.66      0.67      0.66       178



## Some important points about Decision Tree to keep in mind 🧠:-
#1: Gini Impurity hepls in reducing the randomness or impurity of the datas🤔  ⏭️
#2: Gini Impurity ranges from 0 to 0.5👍  ⏭️
#3: Lesser the Gini Impurity⬇️⬇️ - Lesser the randomness⬇️⬇️ - More the data belong to its class⬆️⬆️😲  ⏭️
#3: More the Gini Impurity⬆️⬆️ - More the randomness⬆️⬆️ - Less the data belong to its class⬇️⬇️😲  ⏭️
#4: Threshold hepls in dividing the datas into two nodes(classes): (1) Class having the values less than or equal to threshold. (2) Class having values greater than the threshold🏷️👌  ⏭️
#5: We should take the appropriate threshold to maintain balance between the two classes🫡  ⏭️
#6: Information Gain hepls in picking up the most relevant and important feature to split the dataset at each node  ⏭️
#7: Feature having the higher information gain is given priority to spli the datset  ⏭️