In [270]:
# cell 1: imports and data
!mkdir -p datasets
!wget -q -O datasets/mushroom.csv https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data
!wget -q -O datasets/votes.csv https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data

import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import CategoricalNB
from sklearn.tree import DecisionTreeClassifier

In [271]:
# cell 2: encodes each column in the dataset and returns splits

def splits(data):
    for c in data.columns: # goes through every column
        var = data[c].astype(str) # changes them to strings
        data[c] = LabelEncoder().fit_transform(var) # encodes them

    X = data.drop(columns=[0])  # all but first column
    y = data[0]  # first column is label

    # 80% for training, 20 is then split into 10 for dev and 10 for test
    D_80, D_20, Y_80, Y_20 = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)
    D_dev, D_test, Y_dev, Y_test = train_test_split(D_20, Y_20, test_size=0.5, stratify=Y_20, random_state=1)
    return D_80, D_dev, D_test, Y_80, Y_dev, Y_test

In [272]:
# cell 3: load mushroom and vote datasets and prep splits

mushroom = pd.read_csv("datasets/mushroom.csv", header=None).fillna("missing")
votes = pd.read_csv("datasets/votes.csv", header=None).replace("?", "missing")

# performs splits function on training, test, dev sets
mushroomTrain, mushroomDev, mushroomTest, mushroomY_Train, mushroomY_Dev, mushroomY_Test = splits(mushroom)
votesTrain, votesDev, votesTest, votesY_Train, votesY_Dev, votesY_Test = splits(votes)

# testing purposes
# print("mushroom split:", mushroomTrain.shape, mushroomDev.shape, mushroomTest.shape)
# print("vote split:", votesTrain.shape, votesDev.shape, votesTest.shape)


In [273]:
# cell 4: Naive Bayes --> influenced by lecture pseudocode (lec 10)

# estimate P(y) for each class y using laplace smoothing
def prior_prob(Y, alpha=1.0):
    #labels and count each class
    cY = list(Counter(Y).keys())
    cC = list(Counter(Y).values())

    laplace = len(Y) + alpha * len(cY)
    res = dict(zip(cY, [(num + alpha)/ laplace for num in cC]))
    return res, cY

# estimate P(x_j | y) for each feature j and class y using laplace smoothing
def like(D, Y, labels, alpha=1.0):
    n, d = D.shape
    chance = {}
    for label in labels:
        rows = D[Y == label]  # get rows with the label
        chance[label] = []
        for j in range(d):
            column = rows[:, j]  # get column j for label
            num = Counter(column)  # count each value in column
            origin = set(D[:, j])  # all values in the original feature

            laplace = len(column) + alpha * len(origin)
            # use laplace
            sol = dict(zip(origin, [(num[v] + alpha) / laplace for v in origin]))
            chance[label].append(sol)
    return chance

# Step 3: Prediction function
def NB(D, res, like, labels):
    D = np.array(D)
    result = []

    for row in D:  # for each example in the dataset
        num = {}
        for label in labels:  # try each label
            score = np.log(res[label])  # start with log of prior probability
            for j in range(len(row)):  # check each feature
                if row[j] in like[label][j]:  # if value was seen in training
                    chance = like[label][j][row[j]]
                    score += np.log(chance)  #  add log likelihood
            num[label] = score  #save total score for this label
        result.append(max(num, key=num.get))  # add best label

    return np.array(result)


In [274]:
# cell 5: Decision Tree --> influenced by Decision Tree slides (lecs 12/13)

# entropy of a label distribution
def entropy(y):
    e = 0
    for i in Counter(y).values():
        p = i / len(y)
        if p > 0:
            e -= p * np.log2(p)
    return e

# information gain for splitting on a feature
def info_gain(y, column):
    split = sum((len(y[column == v]) / len(y)) * entropy(y[column == v]) for v in set(column))
    return entropy(y) - split

# recursive function to build decision tree (slide 19 lec 13)
def induce_tree(D, Y, depth, max_depth):
    # if all labels are the same, return that label
    if len(set(Y)) == 1 or depth == max_depth:
        return Counter(Y).most_common(1)[0][0]

    # get the best feature (f*) to split on
    gain = [info_gain(Y, D[:, f]) for f in range(D.shape[1])]
    f_star = np.argmax(gain)

    # build children for each value k in Values(f*)
    tree = (f_star, {})
    for k in np.unique(D[:, f_star]):
        x = D[:, f_star] == k
        D_k, Y_k = D[x], Y[x]
        tree[1][k] = induce_tree(D_k, Y_k, depth + 1, max_depth)
    return tree

def get_tree(D, tree, labels):
    results = []
    for row in D:  # go through each input row
        node = tree
        while isinstance(node, tuple):
            column, path = node  # split on f
            node = path.get(row[column], labels[0])  # go to branch or label
        results.append(node)  # add
    return np.array(results)

In [275]:
# cell 6: train models and results

# naive bayes training/predicting
p_mushroom, labels_mushroom = prior_prob(mushroomY_Train.values)
l_mushroom = like(mushroomTrain.values, mushroomY_Train.values, labels_mushroom)
res_nb_mushroom = NB(mushroomTest.values, p_mushroom, l_mushroom, labels_mushroom)
p_votes, labels_votes = prior_prob(votesY_Train.values)
l_votes = like(votesTrain.values, votesY_Train.values, labels_votes)
res_nb_votes = NB(votesTest.values, p_votes, l_votes, labels_votes)

print("Naive Bayes --> Mushroom Accuracy:", round(accuracy_score(mushroomY_Test, res_nb_mushroom) * 100, 2), "%")
print(confusion_matrix(mushroomY_Test, res_nb_mushroom))
print("Naive Bayes --> Vote Accuracy:", round(accuracy_score(votesY_Test, res_nb_votes) * 100, 2), "%")
print(confusion_matrix(votesY_Test, res_nb_votes))
print("\n")

# decision Tree: find best depth ---
d = 2
best_depth = None
highest_acc = 0
while True:
    t_mushroom = induce_tree(mushroomTrain.values, mushroomY_Train.values, 0, d)
    res_dev = get_tree(mushroomDev.values, t_mushroom, labels_mushroom)
    a = accuracy_score(mushroomY_Dev, res_dev)
    print("Depth =", d, "Accuracy =", round(a * 100, 2), "%")
    if a > highest_acc:
        highest_acc = a
        best_depth = d
        d += 1  # try next depth
    else:
        break  # stop if accuracy stops getting better

print("Best depth:", best_depth)
print("\n")


# decision trees trained on best depth ---
t_mushroom = induce_tree(mushroomTrain.values, mushroomY_Train.values, 0, best_depth)
t_votes = induce_tree(votesTrain.values, votesY_Train.values, 0, best_depth)
res_dt_mushroom = get_tree(mushroomTest.values, t_mushroom, labels_mushroom)
res_dt_votes = get_tree(votesTest.values, t_votes, labels_votes)

print("Decision Tree --> Mushroom Accuracy:", round(accuracy_score(mushroomY_Test, res_dt_mushroom) * 100, 2), "%")
print(confusion_matrix(mushroomY_Test, res_dt_mushroom))
print("Decision Tree --> Vote Accuracy:", round(accuracy_score(votesY_Test, res_dt_votes) * 100, 2), "%")
print(confusion_matrix(votesY_Test, res_dt_votes))


Naive Bayes --> Mushroom Accuracy: 95.82 %
[[419   2]
 [ 32 360]]
Naive Bayes --> Vote Accuracy: 88.64 %
[[23  4]
 [ 1 16]]


Depth = 2 Accuracy = 99.26 %
Depth = 3 Accuracy = 99.63 %
Depth = 4 Accuracy = 100.0 %
Depth = 5 Accuracy = 100.0 %
Best depth: 4


Decision Tree --> Mushroom Accuracy: 100.0 %
[[421   0]
 [  0 392]]
Decision Tree --> Vote Accuracy: 95.45 %
[[26  1]
 [ 1 16]]


In [276]:
# sklearn models for mushroom
sk_nb_m = CategoricalNB().fit(mushroomTrain, mushroomY_Train)
sk_dt_m = DecisionTreeClassifier(max_depth=best_depth).fit(mushroomTrain, mushroomY_Train)

# sklearn models for Votes
sk_nb_v = CategoricalNB().fit(votesTrain, votesY_Train)
sk_dt_v = DecisionTreeClassifier(max_depth=best_depth).fit(votesTrain, votesY_Train)

print("Sklearn Naive-Bayes (Mushroom) Accuracy:", round(accuracy_score(mushroomY_Test, sk_nb_m.predict(mushroomTest)) * 100, 2), "%")
print("Sklearn Decision Tree (Mushroom) Accuracy:", round(accuracy_score(mushroomY_Test, sk_dt_m.predict(mushroomTest)) * 100, 2), "%")
print("\n")
print("Sklearn Naive-Bayes (Voting) Accuracy:", round(accuracy_score(votesY_Test, sk_nb_v.predict(votesTest)) * 100, 2), "%")
print("Sklearn Decision Tree (Voting) Accuracy:", round(accuracy_score(votesY_Test, sk_dt_v.predict(votesTest)) * 100, 2), "%")

Sklearn Naive-Bayes (Mushroom) Accuracy: 95.82 %
Sklearn Decision Tree (Mushroom) Accuracy: 97.79 %


Sklearn Naive-Bayes (Voting) Accuracy: 88.64 %
Sklearn Decision Tree (Voting) Accuracy: 95.45 %


In [277]:
readme = """
Naive Bayes and Decision Trees
By: Amir Noori

Overview:
This homework uses two classification models being Naive Bayes and Decision Trees to test their accuracies with two different datasets. The datasets being a mushroom dataset to classify edibility and a congressional voting record dataset to classify political party.

Key Features:
First, the Naive Bayes classifier is implemented using Laplace smoothing.
Then the mushroom and vote datasets are loaded and split with the Naive Bayes model tested on both.
The same process is repeated using a Decision Tree classifier built from scratch using entropy and information gain.
Each custom model is then compared to their sklearn equivalents to see how they match up.
This is done by training the data under the classification models and printing and tracking their results.

Files:
- `mushroom.csv`: Mushroom classification dataset (UCI)
- `votes.csv`: Voting records dataset (UCI)

To run this:
1. Open the Colab notebook.
2. Run each cell from top to bottom (or all at once using the Runtime --> Run All shortcut).
3. All files are automatically downloaded and preprocessed during execution.
4. View resulting model accuracy and confusion matrices for both tasks.

"""
with open("README.txt", "w") as f:
    f.write(readme)
