In [None]:
# Building Naive Bayes classifier algorithm for iris flower species case study

In [48]:
# load csv file
from csv import reader
def load_csv(filename):
    dataset=list()
    open_file=open(filename)
    read_file=reader(open_file)
    for row in read_file:
        if not row:
            continue
        dataset.append(row)
    return dataset

In [49]:
# Converting String column values to float values
def convert_str_to_float(dataset,column):
    for row in dataset:
        row[column]=float(row[column])

In [50]:
# Convert categorical value into string integer value
def convert_str_to_int(dataset,column):
    class_value=[row[column] for row in dataset]
    unique=set(class_value)
    unique_value=dict()
    for i,value in enumerate(unique):
        unique_value[value]=i
    for row in dataset:
        row[column]=unique_value[row[column]]
    return unique_value

In [27]:
# Finding min and max values for each attribute or column
def column_minmax(dataset):
    minmax=list()
    for i in range(len(dataset[0])):
        column_values=[row[i] for row in dataset]
        min_value=min(column_values)
        max_value=max(column_values)
        minmax.append([min_value,max_value])
    return minmax

In [28]:
# Nomalizing our dataset using normalize scale technique
def normalize_scale(dataset,minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i]=(row[i]-minmax[i][0])/(minmax[i][1]-minmax[i][0])

In [59]:
# Calculating accuracy of an algorithm
def classification_accuracy(actual,predicted):
    correct=0
    for i in range(len(actual)):
        if actual[i]==predicted[i]:
            correct+=1
    return correct/float(len(actual))*100.0

In [54]:
# Evaluate model accuracy by KFold crossvalidation data split technique
from random import seed
from random import randrange
def KFold(dataset,folds):
    fold_values=list()
    dataset_copy=list(dataset)
    fold_size=int(len(dataset)/folds)
    for _ in range(folds):
        fold=list()
        while len(fold)<fold_size:
            index=randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        fold_values.append(fold)
    return fold_values

In [31]:
# Evaluate accuracy of model
def evaluate_model(dataset,algorithm,folds,*args):
    folds=KFold(dataset,folds)
    predictions=list()
    for fold in folds:
        train_set=list(folds)
        train_set.remove(fold)
        train_set=sum(train_set,[])
        test_set=list()
        for row in fold:
            row_copy=list(row)
            test_set.append(row_copy)
            row_copy[-1]=None
        predicted=algorithm(train_set,test_set,*args)
        actual=[row[-1] for row in fold]
        accuracy=classification_accuracy(actual,predicted)
        predictions.append(accuracy)
    return predictions

In [32]:
# Building Naive Bayes classification algorithm for Iris Flower Species case study
def naive_bayes_classification(train,test):
    predictions=list()
    # split dataset by class values and calculate statistics for each row
    summarize=summarize_by_class(train)
    for row in test:
        # Getting predictions for test data based on trained model
        predict=predicted(summarize,row)
        predictions.append(predict)
    return predictions

In [33]:
# Split dataset by class values then calculate statistics for each row
def summarize_by_class(train):
    # Separate dataset by class values and store all values in dictionary
    summarize=dict()
    separate=separate_by_class(train)
    # Calculate Statistics for each row
    for class_value,value in separate.items():
        summarize[class_value]=summarize_dataset(value)
    return summarize

In [34]:
# Separate dataset by class values
def separate_by_class(train):
    # Storing all separated class values in dictionary
    separate=dict()
    # split train dataset one by one row and store class values separatly 
    # Make one list for each class value
    # Store all those values in dictionary
    for i in range(len(train)):
        vector=train[i]
        class_value=vector[-1]
        if (class_value not in separate):
            separate[class_value]=list()
        separate[class_value].append(vector)
    return separate

In [35]:
# Calculate mean, standard deviation and count for each column
def summarize_dataset(value):
    summarize=[(mean(column),std_dev(column),len(column) )for column in zip(*value)]
    del(summarize[-1])
    return summarize

In [36]:
# Mean for each column
def mean(column):
    return sum(column)/float(len(column))

In [56]:
# Standard deviation for each column
from math import sqrt
def std_dev(column):
    variance=sum([(x-mean(column))**2 for x in column])/float(len(column)-1)
    return sqrt(variance)

In [38]:
# predict the class for a given row
def predicted(summarize,row):
    probabilities=calculate_class_probabilities(summarize,row)
    best_label,best_prob=None,-1
    for class_value, probability in probabilities.items():
        if best_label is None or probabilities.items():
            best_prob=probability
            best_label=class_value
    return best_label

In [39]:
# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries,row):
    total_rows=sum([summaries[label][0][2] for label in summaries])
    probabilities=dict()
    for class_value,class_summaries in summaries.items():
        probabilities[class_value]=summaries[class_value][0][2]/float(total_rows)
        for i in range(len(class_summaries)):
            mean,stdev,_=class_summaries[i]
            probabilities[class_value]*=calculate_probability(row[i],mean,stdev)
    return probabilities

In [40]:
# calculate the gaussian probability distribution funtion for x
from math import exp
from math import pi
def calculate_probability(x,mean,stdev):
    exponent=exp(-((x-mean)**2/(2*stdev**2)))
    return (1/(sqrt(2*pi)*stdev))*exponent

In [62]:
# test Navie bayes on iris dataset
from random import seed
seed(1)
filename='iris.csv'
dataset=load_csv(filename)
convert_str_to_int(dataset,-1)
for i in range(len(dataset[0])):
    convert_str_to_float(dataset,i)
minmax=column_minmax(dataset)
normalize_scale(dataset,minmax)
folds=5
accuracy=evaluate_model(dataset,naive_bayes_classification,folds)
print(accuracy)

[30.0, 30.0, 50.0, 33.33333333333333, 20.0]


In [63]:
# Mean accuracy of a model
print("Means Accuracy of an algorithm",sum(accuracy)/float(len(accuracy)))

('Means Accuracy of an algorithm', 32.666666666666664)
