In [1]:
from IPython.display import display, HTML
# Inject a CSS rule into the notebook output that forces elements with the
# `.container` class to 85% width (`!important` overrides the default rule).
display(HTML("<style>.container { width:85% !important; }</style>"))

In [None]:
import pandas as pd
import numpy as np
from math import log2

def entropy(target_col):
    """
    Calculate the Shannon entropy (in bits, i.e. log base 2) of a target column.

    Parameters:
    1. target_col = array-like of class labels (e.g. a pandas Series or a list)

    Returns the entropy as a NumPy float. A column with a single distinct
    value, or an empty column, has an entropy of 0.
    """
    # Only the frequency of each distinct label matters, not the labels themselves.
    _, counts = np.unique(target_col, return_counts=True)

    # Relative frequency of every class; computed once for the whole column.
    # Every count returned by np.unique is >= 1, so log2 never sees a zero.
    probabilities = counts / counts.sum()

    return -np.sum(probabilities * np.log2(probabilities))

def InfoGain(data, split_attribute_name, target_name="class"):
    """
    Calculate the information gain obtained by splitting a dataset on one feature.

    Parameters:
    1. data = the dataset (pandas DataFrame) containing the feature and the target
    2. split_attribute_name = the name of the feature to split on
    3. target_name = the name of the target feature. The default value is "class"

    Returns the entropy of the target column minus the weighted average of the
    target entropies of the subsets produced by each distinct feature value.
    """
    # Entropy of the target before any split
    total_entropy = entropy(data[target_name])

    # Distinct values of the split attribute and how often each one occurs
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    total_count = np.sum(counts)

    # Weighted entropy of the target after the split. Rows are selected with a
    # boolean mask so that each subset contains exactly the rows counted in
    # `counts`; masking with where() followed by dropna() would also discard
    # rows that merely have a missing value in some unrelated column.
    Weighted_Entropy = np.sum([
        (count / total_count)
        * entropy(data.loc[data[split_attribute_name] == val, target_name])
        for val, count in zip(vals, counts)
    ])

    # Information gain = reduction in entropy achieved by the split
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain

# Example usage (Note: this will not run here as we don't have the actual dataset loaded)
# Assuming df is our DataFrame and 'class' is the target column
# info_gain_example = InfoGain(df, 'some_feature', 'class')
# print(info_gain_example)

# This code defines the entropy and information gain calculation. 
# The next step will be to use these in the actual ID3 algorithm implementation to build the decision tree.


In [None]:
def _majority_class(target_values):
    """Return the most frequent value in target_values (np.argmax picks the first maximum on ties)."""
    classes, counts = np.unique(target_values, return_counts=True)
    return classes[np.argmax(counts)]


def ID3(data, originaldata, features, target_attribute_name="class", parent_node_class = None):
    """
    ID3 Algorithm: Builds the decision tree based on maximizing information gain.

    - data: the current subset of the dataset being processed (Pandas DataFrame).
    - originaldata: the original dataset before any splits (Pandas DataFrame).
    - features: the list of feature names (columns) that are candidates for splitting.
    - target_attribute_name: the name of the target attribute (string).
    - parent_node_class: the majority class of the parent node for the current node being processed.

    Returns either a leaf (a single target value) or a nested dict of the form
    {feature_name: {feature_value: subtree_or_leaf, ...}}.
    """

    # If the dataset is empty, return the mode target feature value in the original dataset.
    # This has to be tested before the purity check below: an empty target column
    # has zero unique values, and indexing its first element would raise IndexError.
    if len(data) == 0:
        return _majority_class(originaldata[target_attribute_name])

    # If all target_values have the same value, return this value
    target_classes = np.unique(data[target_attribute_name])
    if len(target_classes) <= 1:
        return target_classes[0]

    # If the feature space is empty, return the mode target feature value of the direct parent node
    if len(features) == 0:
        return parent_node_class

    # Set the default value for the children of this node --> The mode target feature value of the current node
    parent_node_class = _majority_class(data[target_attribute_name])

    # Select the feature which best splits the dataset (highest information gain)
    item_values = [InfoGain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(item_values)]

    # Create the tree structure. The root gets the name of the best feature
    tree = {best_feature: {}}

    # Remove the feature with the best info gain from the feature space
    remaining_features = [i for i in features if i != best_feature]

    # Grow a branch under the root node for each possible value of the root node feature
    for value in np.unique(data[best_feature]):
        # Boolean indexing keeps the column dtypes and keeps rows that have
        # missing values in other columns; where() + dropna() would cast
        # integer columns to float and drop any row containing a NaN.
        sub_data = data[data[best_feature] == value]

        # Call the ID3 algorithm for each of those sub_datasets with the new parameters
        # and add the resulting subtree under the root node
        tree[best_feature][value] = ID3(
            sub_data, originaldata, remaining_features, target_attribute_name, parent_node_class
        )

    return tree

# Note: This code is a template and won't run here as it requires a dataset (df) and a list of features.
# To use it, you would need to call it like this:
# tree = ID3(df, df, list(df.columns[:-1]), 'class')
# This assumes your DataFrame is named df and the last column is the target variable.
