# **Dhruv Karmokar**
# **21BAI1604**

### ID3 Decision Trees (Applied to the Abalone Dataset)

### Read and Parse The CSV Dataset

In [None]:
import csv
from collections import Counter
import random
import graphviz
import math

def parse(filename):
    """Load a CSV file into a list of row dictionaries.

    The first row of the file is used as the header. Every following row
    becomes a ``dict`` pairing header names with that row's string values
    (pairing is done with ``zip``, so cells beyond the header are dropped).

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one ``{header: value}`` dict per data row. Blank lines are
        skipped and an empty file yields an empty list.
    """
    # newline='' hands newline handling to the csv module, as its docs
    # require, so line breaks embedded in quoted fields are kept intact.
    with open(filename, 'r', newline='') as file:
        csv_file = csv.reader(file)
        headers = next(csv_file, None)
        if headers is None:
            # Completely empty file: no header, hence no records.
            return []
        # csv.reader yields [] for blank lines; zipping those would create
        # empty records with no 'Class' key, so they are filtered out.
        return [dict(zip(headers, row)) for row in csv_file if row]

# Load the dataset once and echo it for a quick visual sanity check.
name_of_file = "/content/abalone.csv"
data = parse(name_of_file)
# One record per line, then a blank separator line, then the record count.
print("\n".join(str(record) for record in data))
print()
print(len(data))

{'Class': '2', 'Sex': 'M', 'Length': '2', 'Diameter': '2', 'Height': '1', 'Whole Weight': '2', 'Shucked Weight': '2', 'Viscera Weight': '2', 'Shell Weight': '2'}
{'Class': '0', 'Sex': 'M', 'Length': '1', 'Diameter': '1', 'Height': '1', 'Whole Weight': '1', 'Shucked Weight': '1', 'Viscera Weight': '1', 'Shell Weight': '1'}
{'Class': '1', 'Sex': 'F', 'Length': '2', 'Diameter': '2', 'Height': '2', 'Whole Weight': '2', 'Shucked Weight': '2', 'Viscera Weight': '2', 'Shell Weight': '2'}
{'Class': '1', 'Sex': 'M', 'Length': '1', 'Diameter': '2', 'Height': '2', 'Whole Weight': '2', 'Shucked Weight': '2', 'Viscera Weight': '2', 'Shell Weight': '2'}
{'Class': '0', 'Sex': 'I', 'Length': '1', 'Diameter': '1', 'Height': '1', 'Whole Weight': '1', 'Shucked Weight': '1', 'Viscera Weight': '1', 'Shell Weight': '1'}
{'Class': '0', 'Sex': 'I', 'Length': '1', 'Diameter': '1', 'Height': '1', 'Whole Weight': '1', 'Shucked Weight': '1', 'Viscera Weight': '1', 'Shell Weight': '1'}
{'Class': '2', 'Sex': 'F', '

Five-fold stratified cross-validation

In [None]:
import csv
import random
from collections import Counter

class Node:
    """Tree node holding a label plus split, parent and pruning bookkeeping.

    Only ``label`` is supplied by the caller; every other attribute starts
    out empty/unset and is expected to be filled in by whoever builds the
    tree.
    """

    def __init__(self, label):
        # Class label stored on this node.
        self.label = label

        # Split description: attribute name, its values, and the child node
        # reached for each value.
        self.attribute = None
        self.attribute_values = list()
        self.children = dict()

        # NOTE(review): presumably the training instances that reached this
        # node -- nothing in this file populates it; confirm before relying.
        self.instances_labeled = list()

        # Link back to the split in the parent that leads to this node.
        self.parent_attribute = self.parent_attribute_value = None

        # Pruning flag, initially off.
        self.pruned = False

def parse(filename):
    """Read a CSV file and return its rows as dictionaries keyed by header.

    Args:
        filename: Path of the CSV file to read. Its first row is treated as
            the header row.

    Returns:
        A list of ``{header: value}`` dicts, one per non-blank data row, with
        all values left as strings. An empty file produces ``[]``.
    """
    # The csv docs require newline='' so the reader sees raw line endings and
    # can keep newlines that appear inside quoted fields.
    with open(filename, 'r', newline='') as file:
        csv_file = csv.reader(file)
        headers = next(csv_file, None)
        if headers is None:
            # No header row at all, so there is nothing to parse.
            return []
        data = []
        for row in csv_file:
            if not row:
                # Blank line: csv.reader reports it as []; skipping avoids an
                # empty record that has no 'Class' key.
                continue
            data.append(dict(zip(headers, row)))
    return data

def get_five_folds(instances):
    """Deal ``instances`` into five class-stratified folds.

    The input list is shuffled in place first. Then, class by class (in order
    of first appearance after the shuffle), instances are dealt round-robin
    into the five folds, so each fold receives a near-equal share of every
    class. Finally each fold is shuffled on its own.

    Args:
        instances: List of dicts, each with a ``'Class'`` key. The list is
            reordered in place by the initial shuffle.

    Returns:
        A tuple ``(fold0, fold1, fold2, fold3, fold4)`` of lists.
    """
    random.shuffle(instances)

    # Bucket the shuffled instances by class; dict insertion order keeps the
    # classes in order of first appearance.
    by_class = {}
    for row in instances:
        by_class.setdefault(row['Class'], []).append(row)

    folds = ([], [], [], [], [])
    for members in by_class.values():
        # Round-robin restarts at fold 0 for every class.
        for position, row in enumerate(members):
            folds[position % 5].append(row)

    for fold in folds:
        random.shuffle(fold)

    return folds

ID3 code:

In [None]:
import math
from collections import Counter

class Node:
    """Decision-tree node used by the ID3 builder in this file.

    A node always carries a ``label``; nodes that split additionally get an
    ``attribute`` and one entry in ``children`` per attribute value.
    """

    def __init__(self, label=None):
        # Class label stored on this node (None when not supplied).
        self.label = label
        # Name of the attribute this node splits on; None until assigned.
        self.attribute = None
        # Values of the split attribute for which a child exists.
        self.attribute_values = []
        # Maps attribute value -> child Node.
        self.children = {}
        # prune() toggles this flag on nodes; initialising it here makes it
        # safe to read on any node, pruned or not.
        self.pruned = False

def ID3(instances, default):
    """Recursively build a decision tree from ``instances``.

    Args:
        instances: List of dicts that each contain a ``'Class'`` key plus the
            attribute columns.
        default: Label used for the returned leaf when ``instances`` is empty.

    Returns:
        The root ``Node`` of the (sub)tree. A leaf is returned when there are
        no instances, when all instances share one class, when no attribute
        columns exist, or when ``most_informative_attribute`` returns
        ``None``. Otherwise the node is labelled with the majority class,
        records the chosen attribute, and gets one child per value of that
        attribute.
    """
    if not instances:
        return Node(default)

    classes = [instance['Class'] for instance in instances]
    if len(set(classes)) == 1:
        return Node(classes[0])

    # Majority class is needed for every remaining outcome; compute it once
    # instead of once per child.
    majority_class = mode_class(instances)

    attributes = list(instances[0].keys())
    attributes.remove('Class')
    if not attributes:
        return Node(majority_class)

    best_attribute = most_informative_attribute(instances)
    if best_attribute is None:
        return Node(majority_class)

    # Partition the instances by the chosen attribute in a single pass. The
    # dict keeps values in order of first appearance, so the child order is
    # reproducible from run to run (a set would not guarantee that).
    partitions = {}
    for instance in instances:
        partitions.setdefault(instance[best_attribute], []).append(instance)

    tree = Node(majority_class)
    tree.attribute = best_attribute
    tree.attribute_values = list(partitions)

    for attr_value, instances_i in partitions.items():
        tree.children[attr_value] = ID3(instances_i, majority_class)

    return tree


def entropy(instances, attribute, attribute_value):
    """Class entropy (base 2) of the instances with a given attribute value.

    Args:
        instances: List of dicts containing ``'Class'`` and ``attribute``.
        attribute: Name of the attribute to filter on.
        attribute_value: Only instances whose ``attribute`` equals this value
            are considered.

    Returns:
        ``0`` when the selected instances share a single class (or when none
        are selected); otherwise the entropy of their class distribution.
    """
    label_counts = Counter(
        row['Class'] for row in instances if row[attribute] == attribute_value
    )
    if len(label_counts) == 1:
        return 0

    selected_total = sum(label_counts.values())
    weighted_logs = 0
    for occurrences in label_counts.values():
        share = occurrences / selected_total
        weighted_logs += share * math.log(share, 2)
    return -weighted_logs

def prior_entropy(instances):
    """Base-2 entropy of the class distribution over all ``instances``.

    Args:
        instances: List of dicts that each contain a ``'Class'`` key.

    Returns:
        ``0`` when only one class is present (or the list is empty);
        otherwise the entropy of the class frequencies.
    """
    tally = Counter(row['Class'] for row in instances)
    if len(tally) == 1:
        return 0

    n_rows = sum(tally.values())
    running = 0
    for hits in tally.values():
        fraction = hits / n_rows
        running += fraction * math.log(fraction, 2)
    return -running

def gain_ratio(instances, attribute):
    """Gain ratio obtained by splitting ``instances`` on ``attribute``.

    The information gain (prior entropy minus the frequency-weighted entropy
    of each attribute value) is divided by the split information of the
    attribute's value distribution.

    Args:
        instances: List of dicts containing ``'Class'`` and ``attribute``.
        attribute: Name of the attribute to evaluate.

    Returns:
        The gain ratio, or the sentinel ``-1000`` when the split information
        is zero.
    """
    baseline = prior_entropy(instances)
    value_counts = Counter(row[attribute] for row in instances)
    n_rows = sum(value_counts.values())

    conditional = 0
    split_sum = 0
    for value, occurrences in value_counts.items():
        weight = occurrences / n_rows
        conditional += weight * entropy(instances, attribute, value)
        split_sum += weight * math.log(weight, 2)

    split_information = -split_sum
    if split_information == 0:
        # Dividing is impossible; report a score lower than any real ratio.
        return -1000
    return (baseline - conditional) / split_information

def most_informative_attribute(instances):
    """Pick the attribute with the highest gain ratio.

    Candidate attributes are the keys of the first instance, minus
    ``'Class'``.

    Args:
        instances: Non-empty list of dicts sharing the same keys.

    Returns:
        The name of the first attribute whose gain ratio is the largest one
        above ``-1``, or ``None`` when no attribute scores above ``-1``.
    """
    candidates = list(instances[0].keys())
    candidates.remove('Class')

    best_name, best_score = None, -1
    for name in candidates:
        score = gain_ratio(instances, name)
        # Strict comparison: on ties the earlier attribute is kept.
        if score > best_score:
            best_name, best_score = name, score
    return best_name

def mode_class(instances):
    """Return the most frequent ``'Class'`` value among ``instances``.

    Args:
        instances: Non-empty list of dicts that each contain a ``'Class'``
            key.

    Returns:
        The class label with the highest count, as reported by
        ``Counter.most_common``.
    """
    tally = Counter(row['Class'] for row in instances)
    top_entry = tally.most_common(1)[0]
    return top_entry[0]

def accuracy(trained_tree, test_instances):
    """Fraction of ``test_instances`` that ``trained_tree`` classifies correctly.

    Args:
        trained_tree: Root node passed through to ``predict``.
        test_instances: List of dicts that each contain a ``'Class'`` key
            holding the true label.

    Returns:
        A float in ``[0, 1]``. An empty test set yields ``0.0`` rather than
        raising ``ZeroDivisionError``.
    """
    if not test_instances:
        # Nothing to score: avoid dividing by zero.
        return 0.0
    no_of_correct_predictions = sum(
        1
        for test_instance in test_instances
        if predict(trained_tree, test_instance) == test_instance['Class']
    )
    return no_of_correct_predictions / len(test_instances)

def predict(tree, instance):
    """Classify ``instance`` by walking the decision tree from ``tree``.

    Every node built by ``ID3`` carries a label (internal nodes hold the
    majority class), so a non-None label cannot be used to detect a leaf:
    doing so would always answer with the root's label. Instead the walk
    stops at a node that has no children or is marked as pruned.

    Args:
        tree: Node with ``label``, ``attribute`` and ``children`` attributes,
            and optionally a ``pruned`` flag.
        instance: Dict mapping attribute names to values.

    Returns:
        The label of the node where the walk stops. If the instance has an
        attribute value for which the current node has no child, that node's
        own label is returned as the fallback.
    """
    # Leaves and pruned nodes answer with their own label. getattr is used
    # because not every Node definition initialises ``pruned``.
    if not tree.children or getattr(tree, 'pruned', False):
        return tree.label

    child_node = tree.children.get(instance[tree.attribute])
    if child_node is None:
        # Attribute value not seen when this node was built: fall back to
        # the label stored on this node.
        return tree.label
    return predict(child_node, instance)

def prune(node, val_instances):
    """Bottom-up pruning pass driven by validation accuracy.

    Children are visited before their parent. For every node that has
    children, the node is tentatively marked ``pruned`` and the mark is kept
    only if validation accuracy strictly improves; otherwise it is reverted.

    Accuracy is measured on the tree rooted at ``node`` (the argument given
    to this function) instead of a module-level ``TREE`` global, so the
    function works on whatever tree it is handed.

    NOTE(review): marking a node only changes accuracy if ``predict`` treats
    nodes with ``pruned == True`` as leaves -- confirm against ``predict``.

    Args:
        node: Root node of the tree to prune; ``pruned`` flags are set on its
            nodes in place.
        val_instances: Validation instances passed to ``accuracy``.
    """
    root = node

    def prune_node(node, val_instances):
        if not node.children:
            # A leaf already answers with its own label, so marking it can
            # never raise accuracy; leave it unpruned without re-scoring.
            node.pruned = False
            return
        for child_node in node.children.values():
            prune_node(child_node, val_instances)
        accuracy_before_pruning = accuracy(root, val_instances)
        node.pruned = True
        # Keep the prune only on a strict improvement.
        if accuracy_before_pruning >= accuracy(root, val_instances):
            node.pruned = False

    prune_node(node, val_instances)


Driver program: fit a scikit-learn decision tree on the Abalone data and save it as a PNG

In [15]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import OneHotEncoder
import pydot

def plot_decision_tree(tree, output_path='decision_tree.png'):
    """Render a fitted scikit-learn tree to a PNG file.

    The tree is exported to DOT text with ``export_graphviz``, parsed with
    ``pydot``, rendered to PNG bytes and written to ``output_path``.

    Args:
        tree: Estimator accepted by ``sklearn.tree.export_graphviz``.
        output_path: Destination of the PNG file. Defaults to
            ``'decision_tree.png'``, the name that used to be hard-coded.
    """
    dot_data = export_graphviz(
        tree,
        out_file=None,  # return the DOT source as a string
        filled=True,
        rounded=True,
        special_characters=True,
    )
    # graph_from_dot_data returns a list of graphs; the first one is rendered.
    graphs = pydot.graph_from_dot_data(dot_data)
    image = graphs[0].create_png()
    with open(output_path, 'wb') as f:
        f.write(image)

# Load the dataset and separate the target column from the predictors.
data = pd.read_csv('/content/abalone.csv')
y = data['Class']
X = data.drop(columns=['Class'])

# One-hot encode the 'Sex' column. drop='first' omits the first category, so
# the generated column names skip categories_[0][0] as well.
encoder = OneHotEncoder(drop='first')
X_encoded = encoder.fit_transform(X[['Sex']])
feature_names = ['Sex_' + category for category in encoder.categories_[0][1:]]
X_encoded = pd.DataFrame(X_encoded.toarray(), columns=feature_names)

# Swap the raw 'Sex' column for its encoded columns.
X = pd.concat([X.drop(columns=['Sex']), X_encoded], axis=1)

# Fit a depth-limited tree and write its diagram to disk.
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(X, y)

plot_decision_tree(tree)
