### Exp 6 Naive Bayes Classifier

In [1]:
from csv import reader
from math import sqrt
from math import exp
from math import pi
import random

In [2]:
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
                if not row:
                    continue
                dataset.append(row)
    return dataset

In [3]:
def str_column_to_float(dataset, column):
        for row in dataset:
            row[column] = float(row[column].strip())

In [4]:
def str_column_to_int(dataset, column):
        class_values = [row[column] for row in dataset]
        unique = set(class_values)
        lookup = dict()
        for i, value in enumerate(unique):
            lookup[value] = i
            print('[%s] => %d' % (value, i))
        for row in dataset:
            row[column] = lookup[row[column]]
        return lookup

In [5]:
def separate_by_class(dataset):
        separated = dict()
        for i in range(len(dataset)):
            vector = dataset[i]
            class_value = vector[-1]
            if (class_value not in separated):
                separated[class_value] = list()
            separated[class_value].append(vector)
        return separated

In [6]:
def mean(numbers):
        return sum(numbers)/float(len(numbers))

def stdev(numbers):
        avg = mean(numbers)
        variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
        return sqrt(variance)


In [7]:
def summarize_dataset(dataset):
        summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
        del(summaries[-1])
        return summaries

In [8]:
def summarize_by_class(dataset):
        separated = separate_by_class(dataset)
        summaries = dict()
        for class_value, rows in separated.items():
            summaries[class_value] = summarize_dataset(rows)
        return summaries

In [9]:
def calculate_probability(x, mean, stdev):
        exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
        return (1 / (sqrt(2 * pi) * stdev)) * exponent

In [10]:
def calculate_class_probabilities(summaries, row):
        total_rows = sum([summaries[label][0][2] for label in summaries])
        probabilities = dict()
        for class_value, class_summaries in summaries.items():
            probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
            for i in range(len(class_summaries)):
                mean, stdev, _ = class_summaries[i]
                probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
        return probabilities

In [11]:
def predict(summaries, row):
        probabilities = calculate_class_probabilities(summaries, row)
        best_label, best_prob = None, -1
        for class_value, probability in probabilities.items():
            if best_label is None or probability > best_prob:
                best_prob = probability
                best_label = class_value
        return best_label

### IRIS dataset 

In [12]:
filename = 'iris.csv'
dataset = load_csv(filename)
import random
random.shuffle(dataset) #shuffle the data
training = dataset[:120] #train data
testing=dataset[120:] #test data
for i in range(len(dataset[0])-1): #encode class labels
     str_column_to_float(dataset, i)
'''
Metadata
[Iris-setosa] => 0
[Iris-virginica] => 1
[Iris-versicolor] => 2
'''
str_column_to_int(dataset, len(dataset[0])-1)
# fit model / get summaries @(mean,std,ntotal)
model = summarize_by_class(dataset)

'''
@done
'''
"""
# define a new record
row = [5.7,2.9,4.2,1.3]
# predict the label
label = predict(model, row)
print('Data=%s, Predicted: %s' % (row, label))
"""

[Iris-virginica] => 0
[Iris-versicolor] => 1
[Iris-setosa] => 2
Data=[5.7, 2.9, 4.2, 1.3], Predicted: 1


 ## testing
 ## Metadata
 <ul>
 <li>150 total dataset samples
 <li>120 used for training
 <li>30 examples are used for testing


In [13]:
testing[:5] #samples

[[6.2, 3.4, 5.4, 2.3, 0],
 [7.2, 3.6, 6.1, 2.5, 0],
 [5.2, 3.4, 1.4, 0.2, 2],
 [7.0, 3.2, 4.7, 1.4, 1],
 [6.3, 2.5, 5.0, 1.9, 0]]

In [14]:
tfeat=[]
tlabel=[]
rowlen = len(testing[0])
for i in range(len(testing)):
    tfeat.append(testing[i][:rowlen-1]) #features
    tlabel.append(testing[i][-1]) #labels last column
#got f and l

In [15]:
def evaluate(model,features , labels ):
    count=0
    assert len(features)==len(labels) , "Testing data error!\n"
    dlen=len(features)
    for i in range(dlen):
        yaht = predict(model, features[i])
        if yaht==labels[i]:
            count+=1
    return(count/dlen)

In [16]:
print("Accuracy={}%".format(round(evaluate(model,tfeat,tlabel),4)*100))

Accuracy=96.67%
