In [1]:
import math

def gaussian_pdf(x, mean, std):
    """Evaluate the Gaussian (normal) probability density at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        std: Standard deviation of the distribution. Must be non-zero,
            because the density divides by ``2 * std ** 2``.

    Returns:
        The value of the density function at ``x``.
    """
    squared_deviation = (x - mean) ** 2
    # Unnormalised bell curve: exp(-(x - mean)^2 / (2 * std^2)).
    kernel = math.exp(-(squared_deviation / (2 * std ** 2)))
    # Scaling factor 1 / (sqrt(2 * pi) * std) that makes the curve integrate to 1.
    normaliser = 1 / (math.sqrt(2 * math.pi) * std)
    return normaliser * kernel

In [2]:
def separate_by_class(dataset):
    """Group the rows of ``dataset`` by class label.

    The label of a row is its last element.

    Args:
        dataset: Sequence of rows, each ending with its class label.

    Returns:
        A dict mapping every label to the list of complete rows (label
        included) that carry it, in their original order.
    """
    grouped = {}
    for record in dataset:
        # setdefault creates the bucket the first time a label is seen.
        grouped.setdefault(record[-1], []).append(record)
    return grouped

In [3]:
def summarize_dataset(dataset):
    """Compute per-feature statistics for a collection of rows.

    Every row is expected to end with its class label; that trailing column
    is excluded from the result.

    Args:
        dataset: Sequence of rows of the form ``[feature_1, ..., label]``.

    Returns:
        A list with one ``(mean, stdev, count)`` tuple per feature column, in
        column order. An empty dataset yields an empty list.
    """
    # Transpose rows into columns and drop the label column *before* computing
    # statistics, so non-numeric labels (e.g. strings) never reach mean()/stdev().
    feature_columns = list(zip(*dataset))[:-1]
    return [(mean(column), stdev(column), len(column)) for column in feature_columns]

def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Args:
        numbers: Sized collection of numeric values.

    Returns:
        The sum of the values divided by how many there are.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The sum of squared deviations is divided by ``len(numbers) - 1``, so at
    least two values are needed to avoid dividing by zero.

    Args:
        numbers: Sized collection of numeric values.

    Returns:
        The square root of the sample variance.
    """
    centre = mean(numbers)
    squared_deviations = [(value - centre) ** 2 for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return math.sqrt(sum(squared_deviations) / degrees_of_freedom)

In [4]:
def summarize_by_class(dataset):
    """Build the per-class feature statistics for ``dataset``.

    Args:
        dataset: Sequence of rows, each ending with its class label.

    Returns:
        A dict mapping each class label to the result of
        ``summarize_dataset`` applied to the rows carrying that label.
    """
    return {
        label: summarize_dataset(members)
        for label, members in separate_by_class(dataset).items()
    }

In [5]:
def _gaussian_log_pdf(x, mu, sigma):
    """Return the natural log of the Gaussian density at ``x``.

    The logarithm is computed analytically instead of as ``log(pdf(x))``: for
    values far from ``mu`` the density underflows to 0.0 and ``math.log(0.0)``
    raises ``ValueError: math domain error``.
    """
    quadratic = -((x - mu) ** 2) / (2.0 * sigma ** 2)
    return quadratic - math.log(math.sqrt(2.0 * math.pi) * sigma)


def calculate_class_probabilities(summaries, row):
    """Score ``row`` against every class with Gaussian naive Bayes.

    Despite the name, the returned values are unnormalised *log* scores: the
    log of the class prior plus the sum of the per-feature log densities.
    Only their relative order is meaningful.

    Args:
        summaries: Dict mapping each class label to a list of
            ``(mean, stdev, count)`` tuples, one per feature. The prior of a
            class is derived from the count stored with its first feature.
        row: Sequence of feature values; ``row[i]`` is scored against the
            i-th summary. Elements beyond the number of summaries are ignored.

    Returns:
        A dict mapping each class label to its log score.
    """
    total_rows = sum(class_summaries[0][2] for class_summaries in summaries.values())
    probabilities = {}
    for class_value, class_summaries in summaries.items():
        # Start from the log prior: share of training rows with this label.
        score = math.log(class_summaries[0][2] / float(total_rows))
        # Index into row explicitly so a row that is too short still raises
        # IndexError rather than being silently truncated.
        for i, (feature_mean, feature_std, _) in enumerate(class_summaries):
            score += _gaussian_log_pdf(row[i], feature_mean, feature_std)
        probabilities[class_value] = score
    return probabilities


def predict(summaries, row):
    """Return the class label with the highest score for ``row``.

    Args:
        summaries: Per-class feature statistics, passed through to
            ``calculate_class_probabilities``.
        row: Sequence of feature values to classify.

    Returns:
        The label whose score is largest; on ties the class encountered first
        wins. ``None`` is returned when there are no classes to choose from.
    """
    probabilities = calculate_class_probabilities(summaries, row)
    if not probabilities:
        return None
    # max() keeps the first maximal key. Using it avoids a None "not chosen
    # yet" sentinel, which would collide with a legitimate None class label.
    return max(probabilities, key=probabilities.get)

def gaussian_naive_bayes(train, test):
    """Fit on ``train`` and classify every row of ``test``.

    Args:
        train: Sequence of training rows, each ending with its class label.
        test: Sequence of rows to classify.

    Returns:
        A list with the predicted label for each row of ``test``, in order.
    """
    class_summaries = summarize_by_class(train)
    return [predict(class_summaries, sample) for sample in test]

In [6]:
# Toy training data: each row is [feature1, feature2, label], label 0 or 1.
dataset = [
    [3.393533211, 2.331273381, 0],
    [3.110073483, 1.781539638, 0],
    [1.343808831, 3.368360954, 0],
    [3.582294042, 4.67917911, 0],
    [2.280362439, 2.866990263, 0],
    [7.423436942, 4.696522875, 1],
    [5.745051997, 3.533989803, 1],
    [9.172168622, 2.511101045, 1],
    [7.792783481, 3.424088941, 1],
    [7.939820817, 0.791637231, 1]
]

# Fit: per-class statistics for every feature.
model = summarize_by_class(dataset)

# Sanity check on three training rows (two from class 0, one from class 1)
# with their labels stripped off.
test_set = [dataset[index][:-1] for index in (0, 4, 8)]
predictions = [predict(model, sample) for sample in test_set]
print(predictions)


[0, 0, 1]
