<a href="https://colab.research.google.com/github/blendaguedes/ml_analysis/blob/main/Entropy_and_Information_gain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
import numpy as np
import pandas as pd

# Defining the calculation of the **entropy**


$$H(p_1) = -p_1 \text{log}_2(p_1) - (1- p_1) \text{log}_2(1- p_1)$$

# And **information gain** is

$$\text{Information Gain} = H(p_1^\text{node})- \left(w^{\text{left}}H\left(p_1^\text{left}\right) + w^{\text{right}}H\left(p_1^\text{right}\right)\right),$$


> Being as small as the entropy is better

> As big as the information gain is better

In [53]:
def entropy(p):
    if p == 0 or p == 1:
        return 0.
    else:
        return -p * np.log2(p) - (1 - p) * np.log2(1 - p)

In [54]:
# Test
entropy(0.5), entropy(0), entropy(1), entropy(0.99)

(1.0, 0.0, 0.0, 0.08079313589591124)

In [55]:
# Generating a small dataset for example

X_train = np.array([[1, 1, 1],
[0, 0, 1],
[0, 1, 0],
[1, 0, 1],
[1, 1, 1],
[1, 1, 0],
[0, 0, 0],
[1, 1, 0],
[0, 1, 0],
[0, 1, 0]])

y_train = np.array([1, 1, 0, 0, 1, 1, 0, 1, 0, 0])

In [56]:
def split_indices(X, index_feature):
    """Given a dataset and a index feature, return two lists for the two split nodes, the left node has the animals that have
    that feature = 1 and the right node those that have the feature = 0
    index feature = 0 => ear shape
    index feature = 1 => face shape
    index feature = 2 => whiskers
    """
    left_indices = np.where(X[:, index_feature] == 1)[0].tolist()
    right_indices = np.where(X[:, index_feature] != 1)[0].tolist()

    return left_indices, right_indices

In [57]:
def weighted_entropy(X,y,left_indices,right_indices):
    """
    This function takes the splitted dataset, the indices we chose to split and returns the weighted entropy.
    """
    w_left = len(left_indices)/len(X)
    w_right = len(right_indices)/len(X)
    p_left = sum(y[left_indices])/len(left_indices)
    p_right = sum(y[right_indices])/len(right_indices)

    weighted_entropy = w_left * entropy(p_left) + w_right * entropy(p_right)
    return weighted_entropy

In [58]:
left_indices, right_indices = split_indices(X_train, 0)

In [59]:
def information_gain(X, y, left_indices, right_indices):
    """
    Here, X has the elements in the node and y is theirs respectives classes
    """
    p_node = sum(y)/len(y)
    h_node = entropy(p_node)
    w_entropy = weighted_entropy(X,y,left_indices,right_indices)
    return h_node - w_entropy

In [60]:
information_gain(X_train, y_train, left_indices, right_indices)

0.2780719051126377

In [61]:
for i, feature_name in enumerate(['Ear Shape', 'Face Shape', 'Whiskers']):
    left_indices, right_indices = split_indices(X_train, i)
    i_gain = information_gain(X_train, y_train, left_indices, right_indices)
    print(f"Feature: {feature_name}, information gain if we split the root node using this feature: {i_gain:.2f}")


Feature: Ear Shape, information gain if we split the root node using this feature: 0.28
Feature: Face Shape, information gain if we split the root node using this feature: 0.03
Feature: Whiskers, information gain if we split the root node using this feature: 0.12


In [64]:
X_train_num = np.array([[1, 1, 1, 11.],
[0, 0, 1, 12.],
[0, 1, 0, 13.],
[1, 0, 1, 9.],
[1, 1, 1, 8.],
[1, 1, 0, 11.],
[0, 0, 0, 12.5],
[1, 1, 0, 10.],
[0, 1, 0, 14.],
[0, 1, 0, 9.5]])

In [65]:
X_num = X_train_num[:, 3]
X_num

array([11. , 12. , 13. ,  9. ,  8. , 11. , 12.5, 10. , 14. ,  9.5])

In [67]:
np.unique(np.sort(X_num))

array([ 8. ,  9. ,  9.5, 10. , 11. , 12. , 12.5, 13. , 14. ])

In [75]:
X_num[X_num>10].mean()

12.25

# Node **variance** is

$$\text{Information gain in variance} = Var^\text{node}- (w^{\text{left}}Var^{\text{left}} + w^{\text{right}}Var^{\text{right}})$$

In [88]:
def choose_num_split_point(X, y):
    root_var = np.var(X)
    X_sort = np.unique(np.sort(X))

    res_var = 10e100
    split_point = 0
    for v in X_sort:
        right_indices = np.where(X<=v)[0]

        if sum(y[right_indices]) == 0 or len(right_indices) == 0 :
            continue

        p_right = sum(y[right_indices])/len(right_indices)
        right_var = X[right_indices].var()

        left_indices = np.where(X>v)[0]
        if sum(y[left_indices]) == 0 or len(left_indices) == 0 :
            continue

        p_left = sum(y[left_indices])/len(left_indices)
        left_var = X[left_indices].var()

        local_var = root_var - (p_right * right_var + p_left * left_var)

        if local_var < res_var:
            res_var = local_var
            split_point = v

    return split_point, res_var


In [89]:
split_point_num, var = choose_num_split_point(X_num, y_train)
X_num[X_num<=split_point_num], X_num[X_num>split_point_num]

(array([9., 8.]), array([11. , 12. , 13. , 11. , 12.5, 10. , 14. ,  9.5]))

In [90]:
var

2.1015625