In [2]:

import pandas as pd
import numpy as np
from math import log2
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score
)

# Load dataset
# The CSV is read without a header row; the names below are assigned to its
# 23 columns in order, with the target label ('class') first.
file_path = "../mushroom.csv"
columns = [
    'class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
    'stalk-surface-below-ring', 'stalk-color-above-ring',
    'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
    'ring-type', 'spore-print-color', 'population', 'habitat'
]
df = pd.read_csv(file_path, header=None, names=columns)
# Drop rows whose stalk-root is '?' (presumably the dataset's missing-value
# marker -- confirm against the data description). The explicit copy makes the
# filtered frame independent of the original, so the per-column assignments
# below do not write into a slice (SettingWithCopyWarning).
df = df[df['stalk-root'] != '?'].copy()

# Encode categorical data
# Every column, including the target, is replaced by integer codes. The fitted
# encoders are kept per column so codes can be mapped back to the raw symbols
# with ``label_encoders[col].inverse_transform``.
label_encoders = {}
for col in df.columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# 70/30 train/test split with a fixed seed for reproducibility.
X = df.drop('class', axis=1)
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# The tree is fitted on a single frame that holds features and target, so the
# training labels are re-attached (aligned on the row index).
train_df = X_train.copy()
train_df['class'] = y_train

# C4.5 Algorithm
class C45Tree:
    """Decision tree that chooses splits by information gain ratio.

    The fitted tree is stored in ``self.tree`` as nested dicts of the form
    ``{attribute: {value: subtree_or_label}}``; a leaf is a bare class label.
    Each split is a multiway split on the distinct values of one attribute,
    and an attribute is used at most once on any root-to-leaf path.
    """

    def __init__(self):
        self.tree = None
        # Most frequent training label, recorded by fit(); returned when a row
        # carries an attribute value that the matching node never saw.
        self.default_class = None

    def fit(self, data, target_attr):
        """Build the tree from ``data``.

        Args:
            data: DataFrame holding the feature columns and the target column.
            target_attr: Name of the target (label) column in ``data``.
        """
        # mode() has no first element on an empty frame, so skip it there.
        self.default_class = (
            self._majority_class(data, target_attr) if len(data) else None
        )
        self.tree = self._build_tree(data, target_attr)

    def _entropy(self, data, target_attr):
        """Return the Shannon entropy (base 2) of ``target_attr`` in ``data``."""
        values = data[target_attr].value_counts(normalize=True)
        return -sum(values * np.log2(values))

    def _info_gain_ratio(self, data, attr, target_attr):
        """Return information gain of splitting on ``attr`` divided by split info.

        Returns 0 when the split information is zero (``attr`` takes a single
        value in ``data``), which avoids a division by zero.
        """
        total_entropy = self._entropy(data, target_attr)
        # Split information is the entropy of the attribute's own distribution.
        values = data[attr].value_counts(normalize=True)
        split_info = -sum(values * np.log2(values))

        # Weighted average of the target entropy inside each partition.
        subset_entropy = 0
        for val in data[attr].unique():
            subset = data[data[attr] == val]
            weight = len(subset) / len(data)
            subset_entropy += weight * self._entropy(subset, target_attr)

        info_gain = total_entropy - subset_entropy
        return info_gain / split_info if split_info != 0 else 0

    def _best_attr(self, data, attributes, target_attr):
        """Return the attribute with the highest gain ratio (first one on ties)."""
        return max(
            attributes,
            key=lambda attr: self._info_gain_ratio(data, attr, target_attr),
        )

    def _majority_class(self, data, target_attr):
        """Return the most frequent label in ``data[target_attr]``."""
        return data[target_attr].mode()[0]

    def _build_tree(self, data, target_attr, attributes=None):
        """Recursively build and return the (sub)tree for ``data``.

        Args:
            data: Rows that reach this node.
            target_attr: Name of the target column.
            attributes: Index of attributes still available for splitting;
                defaults to every column of ``data`` except the target.

        Returns:
            A class label for a leaf, otherwise ``{attribute: {value: subtree}}``.
        """
        if attributes is None:
            attributes = data.columns.drop(target_attr)

        # Pure node: every row carries the same label.
        if len(data[target_attr].unique()) == 1:
            return data[target_attr].iloc[0]

        # Nothing left to split on: settle for the most frequent label.
        if len(attributes) == 0:
            return self._majority_class(data, target_attr)

        best = self._best_attr(data, attributes, target_attr)
        remaining = attributes.drop(best)
        branches = {}
        for val in data[best].unique():
            subset = data[data[best] == val]
            branches[val] = self._build_tree(
                subset.drop(columns=[best]), target_attr, remaining
            )
        return {best: branches}

    def predict_one(self, input_row, tree=None):
        """Return the predicted label for one row.

        Args:
            input_row: Mapping-like row (e.g. a Series) indexed by attribute name.
            tree: Subtree to start from; defaults to the fitted tree.

        Returns:
            The label of the leaf the row reaches. If the row holds a value
            that has no branch at some node, the majority class recorded by
            ``fit`` is returned; when no such class is recorded (the tree was
            assigned by hand), the fallback is ``0``.
        """
        node = self.tree if tree is None else tree
        while isinstance(node, dict):
            attr = next(iter(node))  # an internal node holds exactly one key
            branches = node[attr]
            value = input_row[attr]
            if value not in branches:
                fallback = getattr(self, "default_class", None)
                return fallback if fallback is not None else 0
            node = branches[value]
        return node

    def predict(self, df):
        """Return the predicted label for every row of ``df``."""
        return df.apply(self.predict_one, axis=1)

# Train and predict
c45 = C45Tree()
c45.fit(train_df, 'class')
y_pred = c45.predict(X_test)

# Evaluation
# Pin the label order so the matrix is always 2x2 (and the four-way unpacking
# below always succeeds), even if one class is absent from both the test
# labels and the predictions.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Specificity = TN / (TN + FP); report 0.0 when the test set has no negatives
# instead of dividing by zero.
negatives = tn + fp
specificity = tn / negatives if negatives else 0.0
f1 = f1_score(y_test, y_pred)
# Youden's J = sensitivity + specificity - 1.
j_index = recall + specificity - 1

# Show results
print("Confusion Matrix:")
print(f"TP: {tp}, FP: {fp}, FN: {fn}, TN: {tn}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Sensitivity (Recall): {recall:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Youden’s J-index (SPB): {j_index:.4f}")


Confusion Matrix:
TP: 636, FP: 0, FN: 0, TN: 1058
Accuracy: 1.0000
Precision: 1.0000
Sensitivity (Recall): 1.0000
Specificity: 1.0000
F1 Score: 1.0000
Youden’s J-index (SPB): 1.0000
