In [1]:
#Importing Libraries
import pandas as pd
import numpy as np
from collections import Counter
from graphviz import Digraph


In [2]:
# Reading the data from 'play_tennis.csv' into a DataFrame named df_tennis
df_tennis=pd.read_csv('play_tennis.csv')

In [3]:
# Display the first few rows of the DataFrame
df_tennis.head()

Unnamed: 0,day,outlook,temp,humidity,wind,play
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes


In [4]:
# Entropy=-Σ(p_i * log2(p_i))
# Function to calculate the entropy of a list
def entropy_list(a_list):
    
     # Count the occurrences of each unique value in the list using Counter
    cnt=Counter(x for x in a_list)
    # Calculate the total number of instances in the list
    num_instance=len(a_list)*1.0
     # Calculate the probabilities of each unique value in the list
    probs=[x/num_instance for x in cnt.values()]
        
    # Calculate the entropy using the formula: entropy = -Σ(p_i * log2(p_i))
    entropy = -sum(p * np.log2(p) for p in probs if p > 0)
    return entropy(probs)



In [5]:
#  We can also calculate math by using this method
import math
def entropy(probs):
    return sum([-prob*math.log(prob,2)] for prob in probs)

In [6]:
# Information Gain: Gain(T,X)=Entropy(T)-Entropy(T,X)
# Function to calculate the Information Gain of a split in a DataFrame
def info_gain(df, split, target, trace=0):
    # Group the DataFrame by the given 'split' attribute
    df_split = df.groupby(split)
    
    # Calculate the total number of observations in the DataFrame
    nobs = len(df.index) * 1.0
    
    # Calculate the entropy and proportion of observed instances for each group after the split
    df_agg_ent = df_split.agg({target: [entropy_list, lambda x: len(x) / nobs]})
    
    # Rename the columns to more meaningful names
    df_agg_ent.columns = ['Entropy', 'PropObserved']
    
    # Calculate the new entropy after the split by summing the product of entropy and proportion for each group
    new_entropy = sum(df_agg_ent['Entropy'] * df_agg_ent["PropObserved"])
    
    # Calculate the entropy of the target variable before the split
    old_entropy = entropy_list(df[target])
    
    # Calculate and return the Information Gain by subtracting the new entropy from the old entropy
    return old_entropy - new_entropy


In [7]:
# Function to build the ID3 decision tree
def id3(df, target, attribute_name, default_class=None):
    # Count the occurrences of each class (target) in the DataFrame
    cnt = Counter(x for x in df[target])
    
    # If there is only one class in the DataFrame, return that class as the leaf node
    if len(cnt) == 1:
        return next(iter(cnt))
    
    # If the DataFrame is empty or there are no attributes left to consider, return the default class
    elif df.empty or (not attribute_name):
        return default_class
    
    # Otherwise, proceed with building the decision tree
    else:
        # Set the default class to the class with the maximum occurrences
        default_class = max(cnt.keys())
        
        # Calculate the Information Gain for each attribute in the attribute_name list
        gains = [info_gain(df, attr, target) for attr in attribute_name]
        
        # Find the index of the attribute with the maximum Information Gain
        index_max = gains.index(max(gains))
        
        # Select the attribute with the maximum Information Gain as the best attribute for the current node
        best_attr = attribute_name[index_max]
        
        # Initialize the decision tree with the best attribute as the root node
        tree = {best_attr: {}}
        
        # Remove the best attribute from the list of remaining attributes
        remaining_attribute = {x for x in attribute_name if x != best_attr}
        
        # Split the data based on the values of the best attribute and build subtrees recursively
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset, target, remaining_attribute, default_class)
            tree[best_attr][attr_val] = subtree
        
        # Return the constructed decision tree
        return tree


In [8]:
# Function to classify an instance using the decision tree
def classify(instance, tree, default=None):
    # Get the attribute of the current node (root node of the subtree)
    attribute = next(iter(tree))
    
    # Check if the value of the instance for the current attribute is present in the tree
    if instance[attribute] in tree[attribute].keys():
        # If the value is present, move to the corresponding subtree
        result = tree[attribute][instance[attribute]]
        
        # Check if the result is a dictionary, indicating there are more nodes in the tree
        if isinstance(result, dict):
            # Recursively call the classify function to move further down the tree
            return classify(instance, result)
        else:
            # If the result is not a dictionary, it is the class label (leaf node)
            return result
    else:
        # If the value is not present in the tree, return the default class label (if provided)
        return default


In [9]:
# Convert all column names in the DataFrame to lowercase for case insensitivity
df_tennis.columns = [col.lower() for col in df_tennis.columns]

# Check if the 'PlayTennis' attribute is present in the DataFrame columns
if 'playtennis' in df_tennis.columns:
    attribute_names = list(df_tennis.columns)
    attribute_names.remove('playtennis')  # Remove the class attribute
    tree = id3(df_tennis, 'playtennis', attribute_names)

    print("\n\n The Resultant Decision Tree is:\n")
    print(tree)
else:
    print("The 'PlayTennis' attribute is not present in the DataFrame columns.")


The 'PlayTennis' attribute is not present in the DataFrame columns.


The 'playtennis' attribute is not present in the DataFrame columns.
