# **Decision Tree Algorithm - Entropy, Information Gain and Gini Impurity**

## **Written by:** Aarish Asif Khan

## **Date:** 17 February 2024

In [26]:
# Import library
import math 

In [27]:
# Toy dataset: number of samples belonging to class A and class B
n_A, n_B = 4, 6

# Overall sample count across both classes
total_number = sum((n_A, n_B))
total_number

10

In [28]:
# Share of the dataset that falls into each class
proportion_A, proportion_B = (count / total_number for count in (n_A, n_B))

# Report both class proportions
print("Proportion of A:", proportion_A)
print("Proportion of B:", proportion_B)

Proportion of A: 0.4
Proportion of B: 0.6


In [29]:
# Calculate the Shannon entropy (in bits): H = -sum(p * log2(p)) over the classes.
# Classes with zero probability are skipped: math.log2(0) raises ValueError,
# and the 0 * log2(0) term is taken as 0, so a pure node yields an entropy of 0.0.
entropy = 0.0
for proportion in (proportion_A, proportion_B):
    if proportion > 0:
        entropy -= proportion * math.log2(proportion)

# Print Entropy
print("Entropy:", entropy)

Entropy: 0.9709505944546686


# **Gini Impurity**

In [30]:
# Gini impurity: one minus the sum of squared class proportions
squared_proportion_sum = proportion_A ** 2 + proportion_B ** 2
gini_impurity = 1 - squared_proportion_sum

# Show the resulting impurity
print("Gini Impurity:", gini_impurity)

Gini Impurity: 0.48


# **Information Gain**

In [31]:
def binary_entropy(count_a, count_b):
    """Return the Shannon entropy (in bits) of a two-class node.

    Args:
        count_a: Number of samples of the first class in the node.
        count_b: Number of samples of the second class in the node.

    Returns:
        The entropy as a float; 0.0 for an empty node or a pure node.
    """
    node_size = count_a + count_b
    if node_size == 0:
        return 0.0  # empty node: nothing to measure, and avoids dividing by zero

    node_entropy = 0.0
    for count in (count_a, count_b):
        if count > 0:  # math.log2(0) raises ValueError; the term is taken as 0
            share = count / node_size
            node_entropy -= share * math.log2(share)
    return node_entropy


# Class counts (A, B) in the two child nodes produced by the split
n_1_A, n_1_B = 2, 3
n_2_A, n_2_B = 2, 3

entropy_1 = binary_entropy(n_1_A, n_1_B)
entropy_2 = binary_entropy(n_2_A, n_2_B)

# The parent node is the union of both children, so its size and entropy are
# derived from the child counts; the weights therefore always sum to 1.
n_1 = n_1_A + n_1_B
n_2 = n_2_A + n_2_B
n_parent = n_1 + n_2
parent_entropy = binary_entropy(n_1_A + n_2_A, n_1_B + n_2_B)

# Calculating information gain: parent entropy minus the size-weighted child entropies
info_gain = parent_entropy - (n_1 / n_parent * entropy_1 + n_2 / n_parent * entropy_2)
print("Information Gain: ", info_gain)

Information Gain:  0.0


# **Decision Tree on Titanic dataset**

In [32]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [33]:
# Fetch seaborn's "titanic" example dataset and preview the first five rows
df = sns.load_dataset(name="titanic")
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [34]:
# Missing values per column, with column names in reverse alphabetical order
(
    df.isna()
    .sum()
    .sort_index(ascending=False)
)

who              0
survived         0
sibsp            0
sex              0
pclass           0
parch            0
fare             0
embarked         2
embark_town      2
deck           688
class            0
alone            0
alive            0
age            177
adult_male       0
dtype: int64

In [35]:
# Drop the deck column (688 missing values). Reassigning instead of using
# inplace=True, together with errors="ignore", keeps this cell safe to re-run:
# a second execution no longer raises KeyError for the already-removed column.
df = df.drop(columns="deck", errors="ignore")

# Impute missing values of columns age and fare using the median with SimpleImputer.
# NOTE(review): the imputer is fitted on the full dataset before the train/test
# split, so test rows influence the median — consider fitting on the training
# split only.
numeric_columns = ["age", "fare"]
imputer = SimpleImputer(strategy="median")
df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

In [36]:
# Fill the gaps in embarked and embark_town with each column's most frequent value
mode_columns = ["embarked", "embark_town"]
imputer = SimpleImputer(strategy="most_frequent")
df[mode_columns] = imputer.fit_transform(df[mode_columns])

In [37]:
# Verify that no missing values remain (columns listed alphabetically)
(
    df.isna()
    .sum()
    .sort_index()
)

adult_male     0
age            0
alive          0
alone          0
class          0
embark_town    0
embarked       0
fare           0
parch          0
pclass         0
sex            0
sibsp          0
survived       0
who            0
dtype: int64

In [38]:
# Encode the categorical and object columns as integer codes with LabelEncoder.
# A fresh encoder is fitted for every column and stored in `label_encoders`,
# keyed by column name, so each column's codes can be mapped back later with
# inverse_transform. A single shared encoder would only remember the last
# column it was fitted on.
# Note: after the first run no category/object columns remain, so re-running
# this cell leaves df unchanged and `label_encoders` empty.
label_encoders = {}
for col in df.select_dtypes(include=['category', 'object']):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [39]:
# Separate the features from the target; `alive` is dropped together with the
# target column (it appears to restate `survived` — see the preview rows above)
target_column = 'survived'
X = df.drop(columns=[target_column, 'alive'])
y = df[target_column]

# Hold out 20% of the rows as a test set, using a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [40]:
# Create a decision tree that splits on the entropy criterion (fixed seed),
# then train it on the training split
model = DecisionTreeClassifier(
    criterion='entropy',
    random_state=42,
)
model.fit(X_train, y_train)

In [41]:
# Predict labels for the held-out test set
y_pred = model.predict(X_test)

# Evaluate: print the confusion matrix first, then the classification report
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

[[82 23]
 [21 53]]
              precision    recall  f1-score   support

           0       0.80      0.78      0.79       105
           1       0.70      0.72      0.71        74

    accuracy                           0.75       179
   macro avg       0.75      0.75      0.75       179
weighted avg       0.76      0.75      0.75       179



In [42]:
# Export the fitted tree as a Graphviz DOT file for visualisation
# (this writes a drawing of the tree, not a reloadable copy of the model)
from sklearn.tree import export_graphviz

export_graphviz(
    model,
    out_file='./titanic.dot',
    feature_names=X.columns,
    filled=True,
    rounded=True,
)