Step 1: Separate By Class

In [3]:
# Group the rows of a dataset by class label, returns a dictionary
def separate_by_class(dataset):
	"""Group rows by the value in their last column.

	Args:
		dataset: sequence of rows; the last element of each row is used
			as the class label.

	Returns:
		dict mapping each class label to the list of rows (the same row
		objects, not copies) carrying that label, in first-seen order.
	"""
	groups = {}
	for record in dataset:
		# setdefault creates the bucket the first time a label is seen.
		groups.setdefault(record[-1], []).append(record)
	return groups

# Test separating data by class
# merah=2, hitam=3, sport=4, SUV=5, domestik=6, import=7, laris=1, tidak=0
dataset = [[2,4,6,1],
	[2,4,6,1],
	[3,4,6,0],
	[3,4,7,1],
	[3,5,7,0],
	[3,5,7,1],
	[3,5,6,0],
	[2,5,7,0],
	[2,4,7,1]]
separated = separate_by_class(dataset)
for label in separated:
	print(label)
	for row in separated[label]:
		print(row)

1
[2, 4, 6, 1]
[2, 4, 6, 1]
[3, 4, 7, 1]
[3, 5, 7, 1]
[2, 4, 7, 1]
0
[3, 4, 6, 0]
[3, 5, 7, 0]
[3, 5, 6, 0]
[2, 5, 7, 0]


Step 2: Summarize Dataset

Step 3: Summarize Data By Class

In [8]:
# Example of summarizing a dataset
from math import sqrt
 
# Arithmetic mean of a list of numbers
def mean(numbers):
	"""Return the arithmetic mean of ``numbers`` as a float.

	Raises:
		ZeroDivisionError: if ``numbers`` is empty.
	"""
	count = float(len(numbers))
	return sum(numbers) / count
 
# Sample standard deviation (n - 1 denominator) of a list of numbers
def stdev(numbers):
	"""Return the sample standard deviation of ``numbers``.

	The sum of squared deviations from the mean is divided by
	``len(numbers) - 1`` before taking the square root.

	Raises:
		ZeroDivisionError: if ``numbers`` has fewer than two values.
	"""
	center = mean(numbers)
	squared_deviations = sum((value - center) ** 2 for value in numbers)
	degrees_of_freedom = float(len(numbers) - 1)
	return sqrt(squared_deviations / degrees_of_freedom)
 
# Calculate the mean, stdev and count for each feature column in a dataset
def summarize_dataset(dataset):
	"""Summarize every feature column of ``dataset``.

	Args:
		dataset: sequence of equally long rows whose last element is the
			class label.

	Returns:
		list of ``(mean, stdev, count)`` tuples, one per column, excluding
		the last (label) column. An empty dataset yields an empty list.
	"""
	# Drop the label column *before* computing statistics: it is not a
	# feature, and running mean()/stdev() on it would fail for labels that
	# are not numbers.
	feature_columns = list(zip(*dataset))[:-1]
	return [(mean(column), stdev(column), len(column)) for column in feature_columns]

# Split dataset by class then calculate column statistics for each class
def summarize_by_class(dataset):
	"""Compute per-class column summaries.

	Args:
		dataset: sequence of rows whose last element is the class label.

	Returns:
		dict mapping each class label to the result of
		``summarize_dataset`` applied to the rows of that class.
	"""
	return {
		class_value: summarize_dataset(rows)
		for class_value, rows in separate_by_class(dataset).items()
	}

# Test summarizing a dataset by class (last column is the class label)
dataset = [
	[2, 4, 6, 1],
	[2, 4, 6, 1],
	[3, 4, 6, 0],
	[3, 4, 7, 1],
	[3, 5, 7, 0],
	[3, 5, 7, 1],
	[3, 5, 6, 0],
	[2, 5, 7, 0],
	[2, 4, 7, 1],
]
summary = summarize_by_class(dataset)
# Print each class label followed by one (mean, stdev, count) tuple per column
for label, column_stats in summary.items():
	print(label)
	for row in column_stats:
		print(row)

1
(2.4, 0.5477225575051661, 5)
(4.2, 0.4472135954999579, 5)
(6.6, 0.5477225575051662, 5)
0
(2.75, 0.5, 4)
(4.75, 0.5, 4)
(6.5, 0.5773502691896257, 4)


Step 4: Gaussian Probability Density Function


In [10]:
# Example of Gaussian PDF
from math import sqrt
from math import pi
from math import exp
 
# Evaluate the Gaussian probability density function at x
def calculate_probability(x, mean, stdev):
	"""Return the Gaussian density at ``x`` for the given ``mean`` and ``stdev``.

	Computes ``1 / (sqrt(2*pi) * stdev) * exp(-(x - mean)**2 / (2 * stdev**2))``.

	Raises:
		ZeroDivisionError: if ``stdev`` is zero.
	"""
	# Exponent term first, then the normalizing factor.
	scaled_sq_distance = (x - mean) ** 2 / (2 * stdev ** 2)
	bell = exp(-scaled_sq_distance)
	normalizer = sqrt(2 * pi) * stdev
	return (1 / normalizer) * bell
 
# Test Gaussian PDF: density at x=2 for a distribution with mean 5 and stdev 6
print(calculate_probability(x=2, mean=5, stdev=6))

0.05867755446071659


Step 5: Class Probabilities

In [11]:
# Example of calculating class probabilities
from math import sqrt
from math import pi
from math import exp
 
# Group the rows of a dataset by class label, returns a dictionary
def separate_by_class(dataset):
	"""Group rows by the value in their last column.

	Args:
		dataset: sequence of rows; the last element of each row is used
			as the class label.

	Returns:
		dict mapping each class label to the list of rows (the same row
		objects, not copies) carrying that label, in first-seen order.
	"""
	groups = {}
	for record in dataset:
		# setdefault creates the bucket the first time a label is seen.
		groups.setdefault(record[-1], []).append(record)
	return groups
 
# Arithmetic mean of a list of numbers
def mean(numbers):
	"""Return the arithmetic mean of ``numbers`` as a float.

	Raises:
		ZeroDivisionError: if ``numbers`` is empty.
	"""
	count = float(len(numbers))
	return sum(numbers) / count
 
# Sample standard deviation (n - 1 denominator) of a list of numbers
def stdev(numbers):
	"""Return the sample standard deviation of ``numbers``.

	The sum of squared deviations from the mean is divided by
	``len(numbers) - 1`` before taking the square root.

	Raises:
		ZeroDivisionError: if ``numbers`` has fewer than two values.
	"""
	center = mean(numbers)
	squared_deviations = sum((value - center) ** 2 for value in numbers)
	degrees_of_freedom = float(len(numbers) - 1)
	return sqrt(squared_deviations / degrees_of_freedom)
 
# Calculate the mean, stdev and count for each feature column in a dataset
def summarize_dataset(dataset):
	"""Summarize every feature column of ``dataset``.

	Args:
		dataset: sequence of equally long rows whose last element is the
			class label.

	Returns:
		list of ``(mean, stdev, count)`` tuples, one per column, excluding
		the last (label) column. An empty dataset yields an empty list.
	"""
	# Drop the label column *before* computing statistics: it is not a
	# feature, and running mean()/stdev() on it would fail for labels that
	# are not numbers.
	feature_columns = list(zip(*dataset))[:-1]
	return [(mean(column), stdev(column), len(column)) for column in feature_columns]
 
# Split dataset by class then calculate column statistics for each class
def summarize_by_class(dataset):
	"""Compute per-class column summaries.

	Args:
		dataset: sequence of rows whose last element is the class label.

	Returns:
		dict mapping each class label to the result of
		``summarize_dataset`` applied to the rows of that class.
	"""
	return {
		class_value: summarize_dataset(rows)
		for class_value, rows in separate_by_class(dataset).items()
	}
 
# Evaluate the Gaussian probability density function at x
def calculate_probability(x, mean, stdev):
	"""Return the Gaussian density at ``x`` for the given ``mean`` and ``stdev``.

	Computes ``1 / (sqrt(2*pi) * stdev) * exp(-(x - mean)**2 / (2 * stdev**2))``.

	Raises:
		ZeroDivisionError: if ``stdev`` is zero.
	"""
	# Exponent term first, then the normalizing factor.
	scaled_sq_distance = (x - mean) ** 2 / (2 * stdev ** 2)
	bell = exp(-scaled_sq_distance)
	normalizer = sqrt(2 * pi) * stdev
	return (1 / normalizer) * bell
 
# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
	"""Score ``row`` against every class in ``summaries``.

	Args:
		summaries: dict mapping class label to a list of
			``(mean, stdev, count)`` tuples, one per feature column.
		row: sequence of values, indexed by feature position. Elements
			beyond the number of summarized columns are not read.

	Returns:
		dict mapping each class label to the class prior multiplied by
		the Gaussian density of every feature value. The values are not
		normalized to sum to one.
	"""
	# Every column of a class carries the same row count, so the first
	# column's count is used as the class size.
	total_rows = sum(class_stats[0][2] for class_stats in summaries.values())
	probabilities = {}
	for class_value, class_stats in summaries.items():
		# Prior: share of all rows that belong to this class.
		score = class_stats[0][2] / float(total_rows)
		# Multiply in the density of each feature value.
		for position, (mu, sigma, _) in enumerate(class_stats):
			score *= calculate_probability(row[position], mu, sigma)
		probabilities[class_value] = score
	return probabilities
 
# Test calculating class probabilities for the first row of the sample data
# (last column of every row is the class label)
dataset = [
	[2, 4, 6, 1],
	[2, 4, 6, 1],
	[3, 4, 6, 0],
	[3, 4, 7, 1],
	[3, 5, 7, 0],
	[3, 5, 7, 1],
	[3, 5, 6, 0],
	[2, 5, 7, 0],
	[2, 4, 7, 1],
]
summaries = summarize_by_class(dataset)
probabilities = calculate_class_probabilities(summaries, dataset[0])
print(probabilities)

{1: 0.10000079223856109, 0: 0.014162674798217145}
