In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [9]:
def calculate_entropy(labels):
    """Return the Shannon entropy (in bits) of a collection of class labels.

    Parameters
    ----------
    labels : pandas.Series or any 1-D sequence
        Class labels. Lists and NumPy arrays are accepted as well as Series.
        Missing labels (NaN/None) are ignored: probabilities are computed over
        the observed labels only, so they always sum to 1.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the classes that actually occur.
        ``0.0`` when there are no observed labels or only a single class.
    """
    counts = pd.Series(labels).value_counts()
    # Categorical data reports unobserved categories with a count of 0;
    # 0 * log2(0) evaluates to NaN and would poison the whole sum.
    counts = counts[counts > 0]

    total = counts.sum()
    if total == 0:
        return 0.0

    probabilities = counts.to_numpy(dtype=float) / total
    # Adding 0.0 turns the -0.0 produced by a single-class input into 0.0.
    return float(-np.sum(probabilities * np.log2(probabilities)) + 0.0)

In [10]:
def calculate_information_gain(data, feature, target):
    """Information gain obtained by splitting ``data`` on ``feature``.

    Computed as the entropy of ``data[target]`` minus the entropy of the
    target within each distinct value of ``feature``, weighted by the share
    of rows holding that value.
    """
    entropy_before_split = calculate_entropy(data[target])

    distinct_values, occurrences = np.unique(data[feature], return_counts=True)
    n_rows = occurrences.sum()

    entropy_after_split = 0
    for level, n_level in zip(distinct_values, occurrences):
        # Target labels of the rows that take this value of the feature.
        partition_labels = data[data[feature] == level][target]
        entropy_after_split += (n_level / n_rows) * calculate_entropy(partition_labels)

    return entropy_before_split - entropy_after_split

In [11]:
def find_best_feature(data, features, target):
    """Pick the feature whose split yields the largest information gain.

    When several features tie, the one that appears first in ``features``
    wins. Raises ``ValueError`` (from ``max``) if ``features`` is empty.
    """
    gain_by_feature = {}
    for candidate in features:
        gain_by_feature[candidate] = calculate_information_gain(data, candidate, target)

    winner, _ = max(gain_by_feature.items(), key=lambda pair: pair[1])
    return winner

In [12]:
def id3(data, features, target, tree=None):
    """Recursively build an ID3 decision tree.

    Returns a bare label for a leaf, otherwise a nested dict of the form
    ``{feature: {value: subtree_or_label, ...}}``.

    Parameters
    ----------
    data : DataFrame holding the feature columns and the target column.
    features : names of the columns still available for splitting.
    target : name of the label column.
    tree : optional dict to add the chosen split to; a fresh dict is used
        when omitted. Recursive calls always start from a fresh dict.
    """
    labels = data[target]

    # Leaf: every remaining row carries the same label.
    if len(np.unique(labels)) == 1:
        return labels.iloc[0]

    # Leaf: nothing left to split on, so fall back to the most common label.
    if not len(features):
        return labels.mode()[0]

    split_feature = find_best_feature(data, features, target)
    tree = {} if tree is None else tree
    branches = {}
    tree[split_feature] = branches

    unused_features = [name for name in features if name != split_feature]
    split_column = data[split_feature]
    for level in np.unique(split_column):
        branches[level] = id3(data[split_column == level], unused_features, target)

    return tree

In [13]:
def predict(tree, sample, default=None):
    """Classify ``sample`` by walking the decision tree.

    Parameters
    ----------
    tree : dict or label
        Nested ``{feature: {value: subtree_or_label}}`` structure; anything
        that is not a dict is treated as a leaf label.
    sample : mapping
        Feature name -> value. Only ``.get`` is used, so a dict or a pandas
        Series both work.
    default : object, optional
        Returned when the tree cannot classify the sample: the sample lacks
        the feature, holds a value that has no branch, or the tree is an
        empty dict. Defaults to ``None``.

    Returns
    -------
    The leaf label reached, or ``default``.
    """
    node = tree
    # Iterating instead of recursing keeps deep trees clear of the recursion limit.
    while isinstance(node, dict):
        if not node:
            return default
        feature = next(iter(node))  # each internal node holds a single feature key
        branches = node[feature]
        value = sample.get(feature)
        if value not in branches:
            return default
        node = branches[value]
    return node

In [14]:
def print_tree(tree, indent=""):
    """Print the decision tree as an indented outline.

    Each branch is written as ``feature = value`` and each leaf as
    ``-> label``; every level of nesting adds four spaces to ``indent``.
    """
    if isinstance(tree, dict):
        child_indent = indent + "    "
        for feature in tree:
            for value, subtree in tree[feature].items():
                print(f"{indent}{feature} = {value}")
                print_tree(subtree, child_indent)
    else:
        # Leaf node: show the predicted label.
        print(indent + "->", tree)

In [15]:
# Load the dataset and hold out 20% of the rows for evaluation
# (fixed seed so the split is reproducible).
data = pd.read_csv('PlayTennis.csv')
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Every column other than the target is a candidate split feature.
features = list(filter(lambda col: col != 'Play Tennis', train_data.columns))

# Fit the tree on the training rows only, then show it.
tree = id3(train_data, features, 'Play Tennis')
print("Decision Tree:")
print_tree(tree)

Decision Tree:
Outlook = Overcast
    -> Yes
Outlook = Rain
    Wind = Strong
        -> No
    Wind = Weak
        -> Yes
Outlook = Sunny
    Humidity = High
        -> No
    Humidity = Normal
        -> Yes


In [16]:
def accuracy(tree, data, target):
    """Fraction of rows in ``data`` whose predicted label equals ``data[target]``.

    Each row, minus the target column, is converted to a dict and passed to
    ``predict``. Raises ``ZeroDivisionError`` when ``data`` has no rows.
    """
    def is_hit(row):
        # Hide the answer from the tree before asking for a prediction.
        sample = row.drop(target).to_dict()
        return predict(tree, sample) == row[target]

    n_correct = sum(1 for _, row in data.iterrows() if is_hit(row))
    return n_correct / len(data)

# Store the score under its own name: rebinding `accuracy` would replace the
# function with a float and break any later call to accuracy(...).
test_accuracy = accuracy(tree, test_data, 'Play Tennis')
# The score is computed on the held-out test split, so label it as such.
print("Accuracy on test data:", test_accuracy)

Accuracy on training data: 1.0
