## Naive Bayes Classifiers

In [1]:
import numpy as np
from pandas import read_csv
from collections import defaultdict
import re
import matplotlib.pyplot as plt
import pandas as pd
#import csv
%matplotlib inline

In [2]:
# Datasets and their respective headers
data_head_dict = {
    'breast-cancer-wisconsin.data' : ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'class'], 
    'mushroom.data' : ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat'],
    'lymphography.data' : ['class', 'lymphatics', 'block of affere', 'bl. of lymph. c', 'bl. of lymph. s', 'by pass', 'extravasates', 'regeneration of', 'early uptake in', 'lym.nodes dimin', 'lym.nodes enlar', 'changes in lym.', 'defect in node', 'changes in node', 'changes in stru', 'special forms', 'dislocation of', 'exclusion of no', 'no. of nodes in'],
    'wine.data' : ['class', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline'],
    'car.data' : ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class'],
    'somerville.data' : ['class', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
    'adult.data' : ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class'],
    'bank.data' : ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'class']
}

In [21]:
#just for checking data - don't run
def count_inst(filestream, class_dict):
    n_instances = 0
    for line in filestream.readlines()[1:]:
        n_instances += 1
        class_dict[line.strip().split(",")[-1]] += 1
    return n_instances

In [22]:
#just for checking data - don't run
class_dict = defaultdict(int)
f=open("wine.data", 'r')
print('Our total number of instances is:',count_inst(f,class_dict))
for lbl in class_dict.keys():
    print('For class', lbl, 'we have', class_dict[lbl], 'instances.')

f.close()

Our total number of instances is: 698
For class 2 we have 457 instances.
For class 4 we have 241 instances.


In [23]:
#just for checking data - don't run
def check_data(filestream,n_fields):
    ret_val = True
    for line in filestream:
        if (len(line.strip().split(","))!=n_fields): ret_val = False
    return ret_val

In [28]:
#just for checking data - don't run
f = open("mushroom.data",'r')

header = data_head_dict["mushroom.data"]
print("Header: ",header)
n_fields = len(header)

print("CSV passes? ",check_data(f,n_fields))
f.close()

Header:  ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']
CSV passes?  True


In [3]:
# This function should prepare the data by reading it from a file and converting it into a useful format for training and testing
def preprocess_rest(filename):
    df = pd.read_csv(filename, names=data_head_dict[filename])
    names=data_head_dict[filename]
    if filename == "wine.data" or filename == "somerville.data":
        for header in names:
            #removing rows with '?'
            df.drop(df.loc[df[header]=='?'].index, inplace=True)
    return df

#This is for discretising the datasets
def preprocess_wine(filename):
        
    # read data and add column headers
    df = pd.read_csv(filename, names=data_head_dict[filename])
    names=data_head_dict[filename]
    #discretising and splitting continuous data into 3 bins
    for header in names:
        if header != 'class':
            minimum = df[header].min()
            maximum = df[header].max()
            step = (maximum-minimum)/3
            df[header]=pd.cut(x = df[header], 
                              bins = [minimum,minimum+step,maximum-step, 
                                      maximum], labels = [0, 1, 2])
    return df

def preprocess_adult(filename):
        
    # read data and add column headers
    df = pd.read_csv(filename, names=data_head_dict[filename])
    continuous = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
    for header in continuous:
        minimum = df[header].min()
        maximum = df[header].max()
        step = (maximum-minimum)/3
        df[header]=pd.cut(x = df[header], bins = [minimum,minimum+step,maximum-step, maximum], labels = [0, 1, 2])
    #removing rows with '?'
    for header in data_head_dict[filename]:
        df.drop(df.loc[df[header]=='?'].index, inplace=True)
    return df

def preprocess_bank(filename):
        
    # read data and add column headers
    df = pd.read_csv(filename, names=data_head_dict[filename])
    continuous = ["age", "balance", "day", "duration", "campaign"]
    for header in continuous:
            minimum = df[header].min()
            maximum = df[header].max()
            step = (maximum-minimum)/3
            df[header]=pd.cut(x = df[header], 
                              bins = [minimum,minimum+step,maximum-step, 
                                      maximum], labels = [0, 1, 2])
            
    return df


In [4]:
# This function should calculate prior probabilities and likelihoods from the training data and using
# them to build a naive Bayes model

def train(training_set, filename):
    prior = {}
    distinct_class = []
    length_training_set = len(training_set)
    class_column = training_set['class']
    for class_header in class_column:
        if class_header not in distinct_class:
            distinct_class.append(class_header)
    total = training_set['class'].value_counts().to_dict()
    for header in distinct_class:
        #storing prior probability of each class
        prior[header] = total[header]/length_training_set

    if filename == "wine.data" or filename == "somerville.data":
        #continuous data
        #calculating variance and mean dataset
        data_variance=training_set.groupby('class', as_index=False).var()
        data_variance

        data_mean=training_set.groupby('class', as_index=False).mean()
        data_mean

        #calculate mean of each attribute per class
        d_means = {}
        headers = training_set.columns
        for class_header in distinct_class:
            temp = data_mean.loc[data_mean['class'] == class_header].to_numpy()
            d_means[class_header] = temp[0][1:]

        #calculate variance of each attribute per class
        d_variances = {}
        for class_header in distinct_class:
            temp = data_variance.loc[data_variance['class'] == class_header].to_numpy()
            d_variances[class_header] = temp[0][1:]
        return d_means, d_variances, prior   
    #else:
        #categorical data
        #did'nt have time to do this part :(
        

In [5]:
from math import sqrt
from math import pi
from math import exp
#calculates p(x|y), the probability density of one term of the likelihood
def calculate_probability(x, mean, var):
    exponent = exp(-((x-mean)**2 / (2 * var )))
    return (1 / (sqrt(2 * pi) * sqrt(var))) * exponent

In [6]:
#predicts classes of instances from the testing_set
def predict(testing_set, d_means, d_variances, prior):
    class_headers = testing_set['class']
    testing_set = testing_set.drop('class', 1)
    predictions = {}
    predicted_class = {}
    total_probability = 1
    count = 0
    for i in range(len(class_headers)): 
        feature = testing_set.iloc[i]
        for class_header in prior.keys(): 
            for attribute in testing_set.columns:
                total_probability*=calculate_probability(feature[attribute], d_means[class_header][count], d_variances[class_header][count])
                count+=1
            #calculating numerator of posterior for each class
            total_probability*=prior[class_header]
            predictions[class_header] = total_probability
            total_probability = 1
            count = 0
            total_probability
        #storing the class with the highest probability since that is the prediction
        predicted_class[i] = max(predictions, key=predictions.get)

    return predicted_class

In [7]:
#checks if predicted_class is the same as class_headers and returns accuracy
def evaluate(predicted_class, testing_set):
    class_headers = testing_set['class']
    count=0
    for i in range(len(class_headers)):
        if class_headers.iloc[i] == predicted_class[i]:
            count+=1
    accuracy = count/len(class_headers)
    return accuracy

In [8]:
#running and printing relvant info on datasets
wine = preprocess_rest("wine.data")
mean,var,prior = train(wine, "wine.data")
pred = predict(wine, mean, var, prior)
acc_wine = evaluate(pred, wine)

somerville = preprocess_rest("somerville.data")
mean,var,prior = train(somerville, "somerville.data")
pred = predict(somerville, mean, var, prior)
acc_somerville = evaluate(pred, somerville)

print("For wine.data we have:")
print("Number of features (columns): %d" % len(wine.columns))
print("Number of instances (rows): %d" % len(wine))
print("Accuracy: %f" % acc_wine)

print("For somerville.data we have:")
print("Number of features (columns): %d" % len(somerville.columns))
print("Number of instances (rows): %d" % len(somerville))
print("Accuracy: %f" % acc_somerville)


For wine.data we have:
Number of features (columns): 14
Number of instances (rows): 178
Accuracy: 0.988764
For somerville.data we have:
Number of features (columns): 7
Number of instances (rows): 143
Accuracy: 0.608392


  res_values = method(rvalues)


Discretising continuous variables, then passing it through the classifier on average yielded the same performance. The wine dataset yielded a 98.89% without discretising and 100% when discretising in the preprocess stage. However, this is pretty trivial as both the continuous data and discrete data produced very high results. On the other hand the adult and bank datasets yielded around 25% originally and 27% when the continuous variables were discretised. All three datasets were discretised into 3 equal width bins. Using more bins yielded very similar results. We can see that discretising continuous variables yields a very small but greater accuracy. This means that discretisation preserves the conditional probability of each class for each instance. While we do see a small increase in accuracy, we can conclude that discretisation has little to no effect on the datasets and yields similar performance to continuous data in the naive bayes classifier.


The ordinal dataset “somerville.data” was treated as a numeric variable. Simply by comparing accuracies between “somerville.data” and “wine.data” which is originally a numeric dataset, we can see that the ordinal data has an accuracy of 60.84% where the numeric data has an accuracy of 98.89%. If we look at the number of features and instances of these datasets, the wine dataset has more features and instances with 178 instances and 14 features whereas the somerville dataset has 143 instances and 7 features. Despite having more features and instances the wine dataset outperformed the somerville dataset by nearly 40%. This implies that the classification prefers numeric data and finds it difficult to predict ordinal data that are treated as numeric, especially since the wine dataset yielded nearly a 100% accuracy. However, testing was done on the same data as the training, so results would naturally be higher than if we tested the data on unknown instances.
