In [110]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier

# House dataset

In [111]:
column_names_house = [
    "Class Name",
    "handicapped-infants",
    "water-project-cost-sharing",
    "adoption-of-the-budget-resolution",
    "physician-fee-freeze",
    "el-salvador-aid",
    "religious-groups-in-schools",
    "anti-satellite-test-ban",
    "aid-to-nicaraguan-contras",
    "mx-missile",
    "immigration",
    "synfuels-corporation-cutback",
    "education-spending",
    "superfund-right-to-sue",
    "crime",
    "duty-free-exports",
    "export-administration-act-south-africa"
]

In [112]:
with open("house-votes-84.data", "r") as f:
    data = f.read().splitlines()
    data = [line.split(",") for line in data]
    data = pd.DataFrame(data, columns = column_names_house)



X_house = data.drop("Class Name", axis = 1)
Y_house = data["Class Name"]

data.head()

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [113]:
# Needs one hot encoding and for class to be 0 and 1's
# Though unknown values are inputed as "?" in the df, I shall ignore unknowns to prevent bugs due to unknown/ extra categories.
encoder = OneHotEncoder(sparse_output= False, handle_unknown= "ignore")
X_house_encoded = encoder.fit_transform(X_house)
# Convert to pandas dataframe.
X_house_encoded = pd.DataFrame(X_house_encoded)
X_house_encoded.head()

Y_house = Y_house.replace({'democrat': 1, 'republican': 0}).astype(int)


  Y_house = Y_house.replace({'democrat': 1, 'republican': 0}).astype(int)


In [114]:
# Splitting into test, train and dev set.
X_train_h, X_testdev_h, Y_train_h, Y_testdev_h = train_test_split(
    X_house_encoded,
    Y_house,
    train_size = 0.8,
    test_size = 0.2,
    random_state = 42,
    stratify= Y_house)

X_test_h, X_dev_h, Y_test_h, Y_dev_h = train_test_split(
    X_testdev_h,
    Y_testdev_h,
    train_size = 0.5,
    test_size = 0.5,
    random_state = 42,
    stratify= Y_testdev_h)

print(f"Train: {len(X_train_h)} samples")
print(f"Dev:   {len(X_dev_h)} samples")
print(f"Test:  {len(X_test_h)} samples")


Train: 348 samples
Dev:   44 samples
Test:  43 samples


In [115]:
# turning sets into numpy array for classification models
X_train_h = X_train_h.to_numpy()
X_dev_h = X_dev_h.to_numpy()
X_test_h = X_test_h.to_numpy()

Y_train_h = Y_train_h.to_numpy()
Y_dev_h = Y_dev_h.to_numpy()
Y_test_h = Y_test_h.to_numpy()

# Mushroom dataset

In [116]:
# Added extra column name for class.
column_names_mushroom = [
    "class(poison/edible)",
    "cap-shape",
    "cap-surface",
    "cap-color",
    "bruises?",
    "odor",
    "gill-attachment",
    "gill-spacing",
    "gill-size",
    "gill-color",
    "stalk-shape",
    "stalk-root",
    "stalk-surface-above-ring",
    "stalk-surface-below-ring",
    "stalk-color-above-ring",
    "stalk-color-below-ring",
    "veil-type",
    "veil-color",
    "ring-number",
    "ring-type",
    "spore-print-color",
    "population",
    "habitat"
]

In [117]:
with open("agaricus-lepiota.data", "r") as f:
    data = f.read().splitlines()
    data = [line.split(",") for line in data]
    data = pd.DataFrame(data, columns = column_names_mushroom)

X_mushroom = data.drop("class(poison/edible)", axis = 1)
Y_mushroom = data["class(poison/edible)"]
data.head()


Unnamed: 0,class(poison/edible),cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [118]:
encoder = OneHotEncoder(sparse_output = False, handle_unknown= "ignore")
X_mushroom_encoded = encoder.fit_transform(X_mushroom)
X_mushroom_encoded = pd.DataFrame(X_mushroom_encoded)
X_mushroom_encoded.head()

Y_mushroom = Y_mushroom.replace({'p': 1, 'e': 0}).astype(int)



  Y_mushroom = Y_mushroom.replace({'p': 1, 'e': 0}).astype(int)


In [119]:
# Splitting into test, train and dev sets
X_train_m, X_testdev_m, Y_train_m, Y_testdev_m = train_test_split(
    X_mushroom_encoded,
    Y_mushroom,
    train_size= 0.8,
    test_size = 0.2,
    random_state = 42,
    stratify = Y_mushroom)
X_test_m, X_dev_m, Y_test_m, Y_dev_m = train_test_split(
    X_testdev_m,
    Y_testdev_m,
    train_size = 0.5,
    test_size = 0.5,
    random_state = 42,
    stratify = Y_testdev_m
)

print(f"Train: {len(X_train_m)} samples")
print(f"Dev:   {len(X_dev_m)} samples")
print(f"Test:  {len(X_test_m)} samples")

Train: 6499 samples
Dev:   813 samples
Test:  812 samples


In [120]:
# turn to numpy array, because the classifiers expect numpy
X_train_m = X_train_m.to_numpy()
X_dev_m = X_dev_m.to_numpy()
X_test_m = X_test_m.to_numpy()

Y_train_m = Y_train_m.to_numpy()
Y_dev_m = Y_dev_m.to_numpy()
Y_test_m = Y_test_m.to_numpy()


# My model of Naive Bayes



In [121]:
class NaiveBayes ():
  def fit(self, X, y):
    n_samples, n_features = X.shape
    self._classes = np.unique(y) # number of unique classes. Here it would be 0 and 1
    n_classes = len(self._classes) # number of classes

    # claculate mean, var, and prior for each class
    self._mean = np.zeros((n_classes, n_features), dtype = np.float64)
    self._var = np.zeros((n_classes, n_features), dtype = np.float64)
    self._prior = np.zeros((n_classes), dtype = np.float64)

    for index, c in enumerate(self._classes):
      X_c = X[y==c]
      self._mean[index, :] = X_c.mean(axis = 0)
      self._var[index, :] = X_c.var(axis = 0) + 1e-9 # avoid divide-by-zero

      self._prior[index] = X_c.shape[0]/float(n_samples)

  def predict(self, X):
    y_pred = [self._predict(x) for x in X]
    return np.array(y_pred)

  def _predict(self, x):
    posts_list = []
    for index, c in enumerate(self._classes):
      prior = np.log(self._prior[index])

      # clipping values, to prevent small values or log(0)
      pdf_values = self._pdf(index, x)
      pdf_values = np.clip(pdf_values, 1e-10, None)

      conditional = np.sum(np.log(pdf_values))
      post = prior + conditional
      posts_list.append(post)
    # returns class with highest posterior.
    return self._classes[np.argmax(posts_list)]

  def _pdf(self, class_index, x):
    mean = self._mean[class_index]
    var = self._var[class_index]
    numerator = np.exp(-((x-mean) ** 2) / (2* var))
    denominator = np.sqrt( 2 * np.pi * var)
    return numerator / denominator

## Naive Bayes for Mushroom dataset

In [122]:
# Initialize and train the model.
nb_mushroom = NaiveBayes()
nb_mushroom.fit(X_train_m, Y_train_m) # train instead

# Getting predictions for each set.
Y_train_pred_m = nb_mushroom.predict(X_train_m)
Y_dev_pred_m = nb_mushroom.predict(X_dev_m)
Y_test_pred_m = nb_mushroom.predict(X_test_m)

print("Accuracy of test set: ", accuracy_score(Y_test_m, Y_test_pred_m))
print("Accuracy of dev set: ", accuracy_score(Y_dev_m, Y_dev_pred_m))
print("Accuracy of train set: ", accuracy_score(Y_train_m, Y_train_pred_m))
print("Confusion matrix \n", confusion_matrix(Y_test_m, Y_test_pred_m))

Accuracy of test set:  0.9482758620689655
Accuracy of dev set:  0.9434194341943419
Accuracy of train set:  0.9412217264194491
Confusion matrix 
 [[387  34]
 [  8 383]]


### Sklearn models run on Mushroom dataset for comparison

In [123]:
# Bernoulli Model from sklearn
bnb_mushroom = BernoulliNB()
bnb_mushroom.fit(X_train_m, Y_train_m)

print("Accuracy of test set: ", bnb_mushroom.score(X_test_m, Y_test_m))
print("Accuracy of dev set: ", bnb_mushroom.score(X_dev_m, Y_dev_m))
print("Accuracy of train set: ", bnb_mushroom.score(X_train_m, Y_train_m))
print("Confusion matrix \n", confusion_matrix(Y_test_m, bnb_mushroom.predict(X_test_m)))

Accuracy of test set:  0.9433497536945813
Accuracy of dev set:  0.922509225092251
Accuracy of train set:  0.9421449453762117
Confusion matrix 
 [[417   4]
 [ 42 349]]


In [124]:
# Guassian Model from sklearn
gnb_mushroom = GaussianNB()
gnb_mushroom.fit(X_train_m, Y_train_m)

print("Accuracy of test set: ", gnb_mushroom.score(X_test_m, Y_test_m))
print("Accuracy of dev set: ", gnb_mushroom.score(X_dev_m, Y_dev_m))
print("Accuracy of train set: ", gnb_mushroom.score(X_train_m, Y_train_m))
print("Confusion matrix \n", confusion_matrix(Y_test_m, gnb_mushroom.predict(X_test_m)))

Accuracy of test set:  0.9605911330049262
Accuracy of dev set:  0.9630996309963099
Accuracy of train set:  0.956916448684413
Confusion matrix 
 [[390  31]
 [  1 390]]


## Naive Bayes for House Dataset

In [125]:
# Initialize and train the model.
nb_house = NaiveBayes()
nb_house.fit(X_train_h, Y_train_h)

# Getting predictions for each set.
Y_train_pred_h = nb_house.predict(X_train_h)
Y_dev_pred_h   = nb_house.predict(X_dev_h)
Y_test_pred_h  = nb_house.predict(X_test_h)

# Accuracy
print("Accuracy of test set:  ", accuracy_score(Y_test_h, Y_test_pred_h))
print("Accuracy of dev set:   ", accuracy_score(Y_dev_h, Y_dev_pred_h))
print("Accuracy of train set: ", accuracy_score(Y_train_h, Y_train_pred_h))
print("Confusion matrix \n", confusion_matrix(Y_test_h, Y_test_pred_h))

Accuracy of test set:   0.8604651162790697
Accuracy of dev set:    0.9772727272727273
Accuracy of train set:  0.9568965517241379
Confusion matrix 
 [[14  3]
 [ 3 23]]


### Sklearn models run on house dataset for comparison

In [126]:
# Bernoulli Model from sklearn
bnb_house = BernoulliNB()
bnb_house.fit(X_train_h, Y_train_h)

print("Accuracy of test set: ", bnb_house.score(X_test_h, Y_test_h))
print("Accuracy of dev set: ", bnb_house.score(X_dev_h, Y_dev_h))
print("Accuracy of train set: ", bnb_house.score(X_train_h, Y_train_h))
print("Confusion matrix \n", confusion_matrix(Y_test_h, bnb_house.predict(X_test_h)))

Accuracy of test set:  0.8372093023255814
Accuracy of dev set:  0.9318181818181818
Accuracy of train set:  0.9109195402298851
Confusion matrix 
 [[15  2]
 [ 5 21]]


In [127]:
# Guassian Model from sklearn
gnb_house = GaussianNB()
gnb_house.fit(X_train_h, Y_train_h)

print("Accuracy of test set: ", gnb_house.score(X_test_h, Y_test_h))
print("Accuracy of dev set: ", gnb_house.score(X_dev_h, Y_dev_h))
print("Accuracy of train set: ", gnb_house.score(X_train_h, Y_train_h))
print("Confusion matrix \n", confusion_matrix(Y_test_h, gnb_house.predict(X_test_h)))

Accuracy of test set:  0.8604651162790697
Accuracy of dev set:  0.9545454545454546
Accuracy of train set:  0.9540229885057471
Confusion matrix 
 [[14  3]
 [ 3 23]]


# Desision tree

In [128]:
class Node():
  def __init__(self, feature=None, threshold=None, left=None, right=None, gain=None, value=None):
    # for decision node
    self.feature = feature
    self.threshold = threshold
    # assesing right and left child.
    self.left = left
    self.right = right
    # Stores the information gain by the split caused by this specific node.
    self.gain = gain

    # For leaf node. Value is just the majority class of the node
    self.value = value

In [129]:
class DecisionTree():
  def __init__(self, min_samples_split = 2, max_depth = 100, criterion="gini"):
    # if in a paticular node, the number of samples becomes
    # less then the min samples(2) then that node won't be split.
    self.criterion = criterion  # "gini" or "entropy"
    self.min_samples_split = min_samples_split
    self.max_depth = max_depth

    # Initializing the root/head of the tree.
    self.root = None

  def buildtree(self, X, Y, depth=0):
    n_samples, n_features = X.shape

    if n_samples >= self.min_samples_split and depth <= self.max_depth:
        best_split = self.get_best_split(X, Y, n_samples, n_features)

        if best_split and best_split["gain"] > 0:
            left_subtree = self.buildtree(best_split["X_left"], best_split["Y_left"], depth + 1)
            right_subtree = self.buildtree(best_split["X_right"], best_split["Y_right"], depth + 1)
            return Node(best_split["feature"], best_split["threshold"], left_subtree, right_subtree, best_split["gain"])

    leaf_value = self.get_leaf_values(Y)
    return Node(value=leaf_value)

  def get_best_split(self, X, Y, n_samples, n_features):
    best_split = {}
    max_gain = -float("inf")

    for feature in range(n_features):
        feature_values = X[:, feature]
        thresholds = np.unique(feature_values)

        for threshold in thresholds:
            X_left, X_right, Y_left, Y_right = self.split(X, Y, feature, threshold)

            if len(X_left) > 0 and len(X_right) > 0:
                curr_gain = self.info_gain(Y, Y_left, Y_right, mode=self.criterion)

                if curr_gain > max_gain:
                    max_gain = curr_gain
                    best_split = {
                        "feature": feature,
                        "threshold": threshold,
                        "X_left": X_left,
                        "X_right": X_right,
                        "Y_left": Y_left,
                        "Y_right": Y_right,
                        "gain": curr_gain
                    }
    return best_split


  def split(self, X, Y, feature, threshold):
    left_indices = X[:, feature] <= threshold
    right_indices = X[:, feature] > threshold

    X_left = X[left_indices]
    Y_left = Y[left_indices]
    X_right = X[right_indices]
    Y_right = Y[right_indices]

    return X_left, X_right, Y_left, Y_right

  def info_gain(self, parent, left_child, right_child, mode="gini"):
    weight_left = len(left_child) / len(parent)
    weight_right = len(right_child) / len(parent)

    if mode == "gini":
        gain = self.gini_index(parent) - (weight_left*self.gini_index(left_child) + weight_right*self.gini_index(right_child))
    else:
        gain = self.entropy(parent) - (weight_left*self.entropy(left_child) + weight_right*self.entropy(right_child))
    return gain


  def entropy(self, Y):
    class_labels = np.unique(Y)
    entropy = 0
    for label in class_labels:
      # get probability of class
      p_class = len(Y[Y == label])/len(Y)
      entropy += -p_class * np.log2(p_class)
    return entropy

  def gini_index(self,Y):
    class_labels = np.unique(Y)
    gini = 0
    for label in class_labels:
      # get probability of class
      p_class = len(Y[Y == label])/len(Y)
      gini += p_class**2
    return 1 - gini

  def get_leaf_values(self, Y):
    return np.bincount(Y).argmax()  # returns most common class


  def print_tree(self, tree = None, indent = " "):
    if not tree:
      tree = self.root
    if tree.value is not None:
      print(tree.value)
    else:
      print("X_" + str(tree.feature), "<=", tree.threshold, "?", tree.gain)
      print("%sleft:" % (indent), end = "")
      self.print_tree(tree.left, indent + indent)
      print("%sright:" % (indent), end = "")
      self.print_tree(tree.right, indent + indent)

  def fit(self, X, Y):
    self.n_features = X.shape[1]
    self.root = self.buildtree(X, Y)

  def predict(self, X):
      return np.array([self.make_prediction(x, self.root) for x in X])

  def make_prediction(self, x, tree):
      if tree.value is not None:
          return tree.value

      if x[tree.feature] <= tree.threshold:
          return self.make_prediction(x, tree.left)
      else:
          return self.make_prediction(x, tree.right)






## Decision Tree for mushroom dataset

In [130]:
dt_mushroom = DecisionTree(max_depth=5)
dt_mushroom.fit(X_train_m, Y_train_m)
Y_test_pred_m = dt_mushroom.predict(X_test_m)
Y_dev_pred_m = dt_mushroom.predict(X_dev_m)
Y_train_pred_m = dt_mushroom.predict(X_train_m)

print("Accuracy of test set:", accuracy_score(Y_test_m, Y_test_pred_m))
print("Accuracy of dev set:", accuracy_score(Y_dev_m, Y_dev_pred_m))
print("Accuracy of train set:", accuracy_score(Y_train_m, Y_train_pred_m))
print("Confusion matrix \n", confusion_matrix(Y_test_m, Y_test_pred_m))


Accuracy of test set: 0.9987684729064039
Accuracy of dev set: 0.998769987699877
Accuracy of train set: 0.9998461301738729
Confusion matrix 
 [[421   0]
 [  1 390]]


In [131]:
dt_mushroom_enthropy = DecisionTree(max_depth=5, criterion= "entropy")
dt_mushroom_enthropy.fit(X_train_m, Y_train_m)
Y_test_pred_m_enthropy = dt_mushroom_enthropy.predict(X_test_m)
Y_dev_pred_m_enthropy = dt_mushroom_enthropy.predict(X_dev_m)
Y_train_pred_m_enthropy = dt_mushroom_enthropy.predict(X_train_m)

print("Accuracy of test set:", accuracy_score(Y_test_m, Y_test_pred_m_enthropy))
print("Accuracy of dev set:", accuracy_score(Y_dev_m, Y_dev_pred_m_enthropy))
print("Accuracy of train set:", accuracy_score(Y_train_m, Y_train_pred_m_enthropy))
print("Confusion matrix \n", confusion_matrix(Y_test_m, Y_test_pred_m_enthropy))


Accuracy of test set: 0.9987684729064039
Accuracy of dev set: 0.998769987699877
Accuracy of train set: 0.9998461301738729
Confusion matrix 
 [[421   0]
 [  1 390]]


In [132]:
sk_dt = DecisionTreeClassifier(max_depth = 5)
sk_dt.fit(X_train_m, Y_train_m)

print("Accuracy of test set:", sk_dt.score(X_test_m, Y_test_m))
print("Accuracy of dev set:", sk_dt.score(X_dev_m, Y_dev_m))
print("Accuracy of train set:", sk_dt.score(X_train_m, Y_train_m))
print("Confusion matrix \n", confusion_matrix(Y_test_m, sk_dt.predict(X_test_m)))

Accuracy of test set: 0.9987684729064039
Accuracy of dev set: 0.998769987699877
Accuracy of train set: 0.9998461301738729
Confusion matrix 
 [[421   0]
 [  1 390]]


## Decision Tree for house dataset

In [133]:
# gini used for information gain
dt_house = DecisionTree(max_depth=5)
dt_house.fit(X_train_h, Y_train_h)

Y_test_pred_h = dt_house.predict(X_test_h)
Y_dev_pred_h = dt_house.predict(X_dev_h)
Y_train_pred_h = dt_house.predict(X_train_h)

print("Accuracy of test set:", accuracy_score(Y_test_h, Y_test_pred_h))
print("Accuracy of dev set:", accuracy_score(Y_dev_h, Y_dev_pred_h))
print("Accuracy of train set:", accuracy_score(Y_train_h, Y_train_pred_h))
print("Confusion matrix \n", confusion_matrix(Y_test_h, Y_test_pred_h))



Accuracy of test set: 0.8837209302325582
Accuracy of dev set: 1.0
Accuracy of train set: 0.9971264367816092
Confusion matrix 
 [[15  2]
 [ 3 23]]


In [134]:
# enthropy used for information gain
dt_house_entropy = DecisionTree(max_depth=5, criterion="entropy")
dt_house_entropy.fit(X_train_h, Y_train_h)

Y_test_pred_h_entropy = dt_house_entropy.predict(X_test_h)
Y_dev_pred_h_entropy = dt_house_entropy.predict(X_dev_h)
Y_train_pred_h_entropy = dt_house_entropy.predict(X_train_h)

print("Accuracy of test set:", accuracy_score(Y_test_h, Y_test_pred_h_entropy))
print("Accuracy of dev set:", accuracy_score(Y_dev_h, Y_dev_pred_h_entropy))
print("Accuracy of train set:", accuracy_score(Y_train_h, Y_train_pred_h_entropy))
print("Confusion matrix \n", confusion_matrix(Y_test_h, Y_test_pred_h_entropy))

Accuracy of test set: 0.8604651162790697
Accuracy of dev set: 1.0
Accuracy of train set: 0.9971264367816092
Confusion matrix 
 [[14  3]
 [ 3 23]]


In [135]:
sk_dt_h = DecisionTreeClassifier(max_depth = 5)
sk_dt_h.fit(X_train_h, Y_train_h)

print("Accuracy of test set:", sk_dt_h.score(X_test_h, Y_test_h))
print("Accuracy of dev set:", sk_dt_h.score(X_dev_h, Y_dev_h))
print("Accuracy of train set:", sk_dt_h.score(X_train_h, Y_train_h))
print("Confusion matrix \n", confusion_matrix(Y_test_h, sk_dt_h.predict(X_test_h)))

Accuracy of test set: 0.8837209302325582
Accuracy of dev set: 1.0
Accuracy of train set: 0.9913793103448276
Confusion matrix 
 [[15  2]
 [ 3 23]]
