In [1]:
import numpy as np
from math import log2

def calculate_entropy(labels):
    """Return the Shannon entropy, in bits, of a collection of class labels.

    Parameters
    ----------
    labels : array-like
        Class labels, one per example.

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` taken over the relative frequency ``p`` of each
        distinct label.
    """
    # np.unique only reports labels that actually occur, so every frequency
    # is strictly positive and log2 is never evaluated at zero.
    _, occurrences = np.unique(labels, return_counts=True)
    frequencies = occurrences / len(labels)
    return -np.sum(frequencies * np.log2(frequencies))

def calculate_information_gain(examples, attribute_index, labels):
    """Return the entropy reduction obtained by splitting on one attribute.

    Parameters
    ----------
    examples : numpy.ndarray
        2-D array of examples; column ``attribute_index`` holds the attribute.
    attribute_index : int
        Column of ``examples`` to split on.
    labels : numpy.ndarray
        Class label of each row of ``examples``.

    Returns
    -------
    numpy.float64
        Entropy of ``labels`` minus the size-weighted entropy of the label
        subsets induced by each distinct value of the attribute.
    """
    column = examples[:, attribute_index]
    total = len(labels)

    # Entropy left over after the split: every branch contributes its own
    # entropy, scaled by the fraction of examples that fall into it.
    remaining_entropy = 0.0
    for branch_value in np.unique(column):
        members = np.where(column == branch_value)[0]
        branch_entropy = calculate_entropy(labels[members])
        remaining_entropy += (len(members) / total) * branch_entropy

    return calculate_entropy(labels) - remaining_entropy

def choose_best_attribute(examples, attributes, labels):
    """Pick the candidate attribute whose split yields the largest information gain.

    Parameters
    ----------
    examples : numpy.ndarray
        2-D array of examples.
    attributes : sequence of int
        Candidate column indices of ``examples``.
    labels : numpy.ndarray
        Class label of each row of ``examples``.

    Returns
    -------
    NumPy integer
        Position *within* ``attributes`` (not the column index itself) of the
        candidate with the highest gain.
    """
    gains = []
    for column_index in attributes:
        gains.append(calculate_information_gain(examples, column_index, labels))
    return np.argmax(gains)

def create_decision_tree(examples, attributes, labels, attribute_names=None, verbose=True):
    """Recursively build an ID3 decision tree as nested dictionaries.

    A leaf is ``{'label': <class>}``. An internal node is
    ``{'attribute': <name>, 'children': {<value>: <subtree>, ...}}`` with one
    child per attribute value observed in ``examples``.

    Parameters
    ----------
    examples : array-like
        2-D table of examples, one row per example.
    attributes : array-like of int
        Column indices of ``examples`` still available for splitting.
    labels : array-like
        Class label of each row of ``examples``.
    attribute_names : mapping or sequence, optional
        Display name for each column index. When omitted, a notebook-level
        ``att_names`` mapping is used if one exists (the behaviour earlier
        callers rely on); otherwise the column index itself is stored.
    verbose : bool, optional
        Print the node under construction after each child is attached.
        Defaults to True, which reproduces the trace this function has always
        emitted; pass False to build silently.

    Returns
    -------
    dict
        Root node of the (sub)tree.

    Raises
    ------
    ValueError
        If ``labels`` is empty.
    """
    # Coerce up front so positional indexing below behaves the same for
    # arrays, lists, and pandas objects.
    examples = np.asarray(examples)
    labels = np.asarray(labels)

    if labels.size == 0:
        raise ValueError("cannot build a decision tree from an empty set of examples")

    if attribute_names is None:
        # Backward compatibility: fall back to the notebook-level mapping
        # instead of failing with NameError when it has not been defined.
        attribute_names = globals().get('att_names')

    distinct_labels, label_counts = np.unique(labels, return_counts=True)

    # Pure node: every example carries the same label.
    if len(distinct_labels) == 1:
        return {'label': labels[0]}

    # Nothing left to split on: fall back to the majority label.
    if len(attributes) == 0:
        return {'label': distinct_labels[np.argmax(label_counts)]}

    # choose_best_attribute returns a position inside `attributes`, which is
    # then translated into the actual column index.
    best_position = choose_best_attribute(examples, attributes, labels)
    best_attribute = attributes[best_position]

    node = {}
    if attribute_names is None:
        node['attribute'] = best_attribute
    else:
        node['attribute'] = attribute_names[best_attribute]
    node['children'] = {}

    # np.delete returns a new array, so the caller's `attributes` is left
    # untouched and the same remainder can be shared by all children.
    remaining_attributes = np.delete(attributes, best_position)

    column = examples[:, best_attribute]
    for value in np.unique(column):
        indices = np.where(column == value)[0]
        node['children'][value] = create_decision_tree(
            examples[indices],
            remaining_attributes,
            labels[indices],
            attribute_names,
            verbose,
        )
        if verbose:
            print(node)
    return node

### Processing data from a CSV file

In [2]:
import pandas as pd

In [3]:
# Load the training table; the relative path is resolved against the
# notebook's working directory.
data=pd.read_csv("ID3.csv")

In [4]:
# Show the whole table as plain text.
print(data)

     Outlook Temperature Humidity    Wind Target
0      Sunny         Hot     High    Weak     No
1      Sunny         Hot     High  Strong     No
2   Overcast         Hot     High    Weak    Yes
3      Rainy        Mild     High    Weak    Yes
4      Rainy        Cool   Normal    Weak    Yes
5      Rainy        Cool   Normal  Strong     No
6   Overcast        Cool   Normal  Strong    Yes
7      Sunny        Mild     High    Weak     No
8      Sunny        Cool   Normal    Weak    Yes
9      Rainy        Mild   Normal    Weak    Yes
10     Sunny        Mild   Normal  Strong    Yes
11  Overcast        Mild     High  Strong    Yes
12  Overcast         Hot   Normal    Weak    Yes
13     Rainy        Mild     High  Strong     No


In [5]:
# Lookup table: column position -> column name (the target column is included).
att_names = dict(enumerate(data.columns))
att_names

{0: 'Outlook', 1: 'Temperature', 2: 'Humidity', 3: 'Wind', 4: 'Target'}

In [6]:
# Get the attributes (feature indices)
# The last column holds the target, so every earlier column position is a
# candidate attribute to split on.
feature_count = len(data.columns) - 1
attributes = np.arange(feature_count)

# Class labels, one per row.
labels = np.array(data["Target"])

# Grow the tree from the complete table. The target column stays in the
# example matrix but is never offered as a split, because its position is
# absent from `attributes`.
decision_tree = create_decision_tree(np.array(data), attributes, labels)

# Show the finished tree.
print("\nThe Final Decision Tree is:\n")
print(decision_tree)

{'attribute': 'Outlook', 'children': {'Overcast': {'label': 'Yes'}}}
{'attribute': 'Wind', 'children': {'Strong': {'label': 'No'}}}
{'attribute': 'Wind', 'children': {'Strong': {'label': 'No'}, 'Weak': {'label': 'Yes'}}}
{'attribute': 'Outlook', 'children': {'Overcast': {'label': 'Yes'}, 'Rainy': {'attribute': 'Wind', 'children': {'Strong': {'label': 'No'}, 'Weak': {'label': 'Yes'}}}}}
{'attribute': 'Humidity', 'children': {'High': {'label': 'No'}}}
{'attribute': 'Humidity', 'children': {'High': {'label': 'No'}, 'Normal': {'label': 'Yes'}}}
{'attribute': 'Outlook', 'children': {'Overcast': {'label': 'Yes'}, 'Rainy': {'attribute': 'Wind', 'children': {'Strong': {'label': 'No'}, 'Weak': {'label': 'Yes'}}}, 'Sunny': {'attribute': 'Humidity', 'children': {'High': {'label': 'No'}, 'Normal': {'label': 'Yes'}}}}}

The Final Decision Tree is:

{'attribute': 'Outlook', 'children': {'Overcast': {'label': 'Yes'}, 'Rainy': {'attribute': 'Wind', 'children': {'Strong': {'label': 'No'}, 'Weak': {'lab

In [7]:
# Inverse lookup: column name -> column position, used to find a sample's
# value for the attribute named in a tree node.
att_index = {name: position for position, name in enumerate(data.columns)}
att_index

{'Outlook': 0, 'Temperature': 1, 'Humidity': 2, 'Wind': 3, 'Target': 4}

In [8]:
# Display the learned tree (nested dicts) using the notebook's pretty repr.
decision_tree

{'attribute': 'Outlook',
 'children': {'Overcast': {'label': 'Yes'},
  'Rainy': {'attribute': 'Wind',
   'children': {'Strong': {'label': 'No'}, 'Weak': {'label': 'Yes'}}},
  'Sunny': {'attribute': 'Humidity',
   'children': {'High': {'label': 'No'}, 'Normal': {'label': 'Yes'}}}}}

In [25]:
def predict(sample, tree=None, attribute_index=None):
    """Classify one sample by walking a decision tree from root to leaf.

    Parameters
    ----------
    sample : sequence
        Attribute values of the sample, addressed by column position.
    tree : dict, optional
        Tree of nested dicts: internal nodes carry ``'attribute'`` and
        ``'children'``, leaves carry ``'label'``. Defaults to the
        notebook-level ``decision_tree``.
    attribute_index : mapping, optional
        Maps the attribute name stored in a node to its position in
        ``sample``. Defaults to the notebook-level ``att_index``.

    Returns
    -------
    object
        The label stored in the leaf that the sample reaches.

    Raises
    ------
    KeyError
        If the sample holds an attribute value that has no branch in the tree.
    """
    if tree is None:
        tree = decision_tree
    if attribute_index is None:
        attribute_index = att_index

    node = tree
    # A leaf is recognised by its 'label' key rather than by comparing against
    # particular label values, so any label set works and a tree consisting
    # of a single leaf is handled without ever looking up an attribute.
    while 'label' not in node:
        branch_value = sample[attribute_index[node['attribute']]]
        node = node['children'][branch_value]
    return node['label']

In [35]:
# Plain NumPy view of the table so rows can be passed to predict one by one.
data_arr = data.to_numpy()
data_arr

array([['Sunny', 'Hot', 'High', 'Weak', 'No'],
       ['Sunny', 'Hot', 'High', 'Strong', 'No'],
       ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
       ['Rainy', 'Mild', 'High', 'Weak', 'Yes'],
       ['Rainy', 'Cool', 'Normal', 'Weak', 'Yes'],
       ['Rainy', 'Cool', 'Normal', 'Strong', 'No'],
       ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
       ['Sunny', 'Mild', 'High', 'Weak', 'No'],
       ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
       ['Rainy', 'Mild', 'Normal', 'Weak', 'Yes'],
       ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
       ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],
       ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
       ['Rainy', 'Mild', 'High', 'Strong', 'No']], dtype=object)

In [40]:
# Classify every row of the training table.
ypred = [predict(row) for row in data_arr]
ypred

['No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No']

In [41]:
# Ground-truth labels as a plain Python list, for comparison with ypred.
truth = data['Target'].tolist()
truth

['No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No']

In [42]:
# List equality: True only when both lists have the same length and every
# prediction equals the corresponding ground-truth label.
truth==ypred

True

In [43]:
# Classify a single hand-written sample given as feature values only
# (Outlook, Temperature, Humidity, Wind), with no target entry.
predict(["Overcast", "Mild", "High", "Strong"])

'Yes'