## Implementing Naive Bayes From Scratch

## Table of Contents

* [Separate Class into Dataset](#separate)
* [Summarize Dataset](#data_by_class)
* [Summarize Data By Class](#summary)
* [ Gaussian Probability Density Function](#gaussian_pdf)
* [Class Probabilities](#probabilities)

<a id='separate'></a>

### Separate Class into Dataset

In [20]:
def seperate_by_class(dataset):
    """Group the rows of ``dataset`` by class label.

    The label of a row is its last element. Returns a dict that maps each
    label to the list of rows carrying it; rows keep their original order
    and are stored by reference, not copied.
    """
    groups = {}
    for row in dataset:
        # setdefault creates the bucket the first time a label is seen.
        groups.setdefault(row[-1], []).append(row)
    return groups

<a id ='data_by_class'></a>

### Summarize Dataset

In [21]:
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    An empty sequence raises ZeroDivisionError.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

In [22]:
def standard_deviation(numbers):
    """Return the sample standard deviation of ``numbers``.

    The sum of squared deviations from the mean is divided by ``n - 1``,
    so fewer than two values raise ZeroDivisionError.
    """
    import math
    center = mean(numbers)
    squared_deviations = [(value - center) ** 2 for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return math.sqrt(sum(squared_deviations) / degrees_of_freedom)

In [23]:
def summarize_data(dataset):
    """Compute per-feature statistics for a dataset.

    Each row is a sequence of feature values followed by a class label in
    the last position. The label column is dropped *before* any statistics
    are computed, so labels do not have to be numeric and no work is wasted
    on a column whose summary would be discarded anyway.

    Returns a list with one ``(mean, standard_deviation, count)`` tuple per
    feature column, in column order. A dataset with no rows, or with only a
    label column, yields an empty list.
    """
    # zip(*dataset) transposes rows into columns; the slice removes the
    # trailing label column so mean()/standard_deviation() never see it.
    feature_columns = list(zip(*dataset))[:-1]
    return [
        (mean(column), standard_deviation(column), len(column))
        for column in feature_columns
    ]

In [24]:

# Toy dataset: each row holds two numeric features followed by a class label.
dataset = [
    [3.393533211, 2.331273381, 0],
    [3.110073483, 1.781539638, 0],
    [1.343808831, 3.368360954, 0],
    [3.582294042, 4.67917911, 0],
    [2.280362439, 2.866990263, 0],
    [7.423436942, 4.696522875, 1],
    [5.745051997, 3.533989803, 1],
    [9.172168622, 2.511101045, 1],
    [7.792783481, 3.424088941, 1],
    [7.939820817, 0.791637231, 1],
]
# Statistics over the whole dataset, ignoring class membership.
summary = summarize_data(dataset)
print(summary)

[(5.178333386499999, 2.7665845055177263, 10), (2.9984683241, 1.218556343617447, 10)]


<a id ='summary'></a>

### Summarize Data By Class

In [25]:

# Split the data into classes and calculate statistics for each feature column
def summarize_by_class(dataset):
    """Return per-class feature statistics for ``dataset``.

    Rows are grouped by their label with ``seperate_by_class`` and each
    group is summarized with ``summarize_data``. The result maps every
    class label to that group's list of column summaries.
    """
    return {
        label: summarize_data(class_rows)
        for label, class_rows in seperate_by_class(dataset).items()
    }

In [26]:
# Display the per-class column summaries for the example dataset; the bare
# expression is left as the last statement so the notebook renders its value.
summarize_by_class(dataset)

{0: [(2.7420144012, 0.9265683289298018, 5),
  (3.0054686692, 1.1073295894898725, 5)],
 1: [(7.6146523718, 1.2344321550313704, 5),
  (2.9914679790000003, 1.4541931384601618, 5)]}

<a id='gaussian_pdf'></a>

### Gaussian Probability Density Function

In [27]:
def calculate_gaussian_probability(x, mean, std):
    """Evaluate the Gaussian probability density function at ``x``.

    ``mean`` and ``std`` are the mean and standard deviation of the
    distribution. A ``std`` of zero raises ZeroDivisionError.
    """
    import math
    squared_distance = (x - mean) ** 2
    twice_variance = 2 * std ** 2
    exponent = math.exp(-(squared_distance / twice_variance))
    # Normalizing constant 1 / (sqrt(2 * pi) * std).
    normalizer = 1 / (math.sqrt(2 * math.pi) * std)
    return normalizer * exponent

In [28]:
# Sanity check: at x == mean with std == 1 the exponent term is exp(0) == 1,
# so the density reduces to 1 / sqrt(2 * pi).
print(calculate_gaussian_probability(1.0, 1.0, 1.0))


0.3989422804014327


<a id='probabilities'></a>

### Class Probabilities

In [29]:
# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    """Score ``row`` against every class described by ``summaries``.

    ``summaries`` maps each class label to a list of
    ``(mean, std, count)`` tuples, one per feature column. For each class
    the score is the class's share of all rows multiplied by the Gaussian
    density of every feature value in ``row``. The scores are returned as
    a dict keyed by class label and are not normalized to sum to one.
    """
    # The row count of a class is read from its first column summary.
    class_sizes = {label: stats[0][2] for label, stats in summaries.items()}
    total_rows = sum(class_sizes.values())

    scores = {}
    for class_value, class_summaries in summaries.items():
        # Start from the class's share of all rows (the prior).
        score = class_sizes[class_value] / float(total_rows)
        for position, (mu, sigma, _) in enumerate(class_summaries):
            score *= calculate_gaussian_probability(row[position], mu, sigma)
        scores[class_value] = score
    return scores

In [30]:

# Build the per-class statistics, then score the first dataset row against
# each class and print the resulting label -> score mapping.
summaries = summarize_by_class(dataset)
probabilities = calculate_class_probabilities(summaries, dataset[0])
print(probabilities)

{0: 0.05032427673372076, 1: 0.00011557718379945765}
