In [58]:
import csv
import random
import math

In [59]:
def loadCsv(filename) :
    """Read a headerless, all-numeric CSV file into a list of float rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty CSV record; each entry is a list
        holding that record's fields converted with ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If any field cannot be converted to ``float``.
    """
    # The context manager closes the handle as soon as parsing is done (the
    # handle used to stay open until garbage collection). newline='' lets the
    # csv module do its own newline handling, as its documentation asks.
    with open(filename, 'rt', newline='') as handle:
        # csv.reader yields [] for a blank line; skip those so stray empty
        # lines do not turn into empty rows that break `row[-1]` lookups.
        return [[float(x) for x in row] for row in csv.reader(handle) if row]

In [60]:
# Load the CSV at the relative path below with loadCsv and report how many
# rows came back.
filename = 'data/pima-indians_data.csv'
dataset = loadCsv(filename) 
print('Loaded data file {0} with {1} rows'.format(filename, len(dataset)))

Loaded data file data/pima-indians_data.csv with 768 rows


In [61]:
def splitDataset(dataset, splitRatio, seed=None) :
    """Randomly partition ``dataset`` into a training list and a test list.

    Rows are drawn one at a time, without replacement, until the training
    list holds ``int(len(dataset) * splitRatio)`` rows; whatever is left
    becomes the test list (in its original relative order).

    Args:
        dataset: Sequence of rows. It is copied, never modified.
        splitRatio: Fraction of rows to put in the training list; must lie
            in the closed interval [0, 1].
        seed: Optional seed. When given, a private ``random.Random(seed)``
            drives the draw so the split is reproducible; when ``None`` the
            module-level ``random`` generator is used, as before.

    Returns:
        A ``(trainSet, testSet)`` tuple of lists.

    Raises:
        ValueError: If ``splitRatio`` is outside [0, 1].
    """
    # A ratio above 1 would ask for more rows than exist and used to fail
    # deep inside randrange() with an unrelated-looking message.
    if not 0 <= splitRatio <= 1:
        raise ValueError('splitRatio must be between 0 and 1, got {0!r}'.format(splitRatio))

    rng = random if seed is None else random.Random(seed)
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    remaining = list(dataset)  # shallow copy so the caller's list is untouched
    while len(trainSet) < trainSize :
        index = rng.randrange(len(remaining))
        trainSet.append(remaining.pop(index))
    return trainSet, remaining

In [62]:
# Try splitDataset on a five-row toy dataset. int(5 * 0.67) == 3 rows go to
# the training list; which rows are picked is random.
# Note: this rebinds the notebook-level name `dataset`.
dataset = [[1], [2], [3], [4], [5]]
splitRatio = 0.67
train, test = splitDataset(dataset, splitRatio)
print("train : ", train)
print('test : ' , test)

train :  [[3], [1], [2]]
test :  [[4], [5]]


In [63]:
def separateByClass(dataset) :
    """Group rows by the value of their last element.

    Args:
        dataset: Sequence of non-empty rows; the last element of each row is
            used as the grouping key.

    Returns:
        A dict mapping each distinct last-element value to the list of rows
        carrying it. Keys appear in first-seen order and rows keep their
        original relative order; the row objects themselves are not copied.
    """
    groups = {}
    for row in dataset :
        label = row[-1]
        # setdefault creates the bucket the first time a label is seen.
        groups.setdefault(label, []).append(row)
    return groups

In [64]:
# Exercise separateByClass on three toy rows; the last element of each row
# (1, 0, 1) is the grouping key.
dataset =[[1, 20, 1], [2, 21, 0], [3, 22, 1]]
separated = separateByClass(dataset)
print('separated : ', separated)

separated :  {1: [[1, 20, 1], [3, 22, 1]], 0: [[2, 21, 0]]}


In [65]:
def mean(numbers) :
    """Return the arithmetic mean of ``numbers``.

    Args:
        numbers: Sized collection of numeric values.

    Returns:
        The sum of the values divided by their count (as a float).

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

In [66]:
def stdev(numbers) :
    """Return the sample standard deviation of ``numbers``.

    The squared deviations from the mean are summed and divided by
    ``len(numbers) - 1`` before taking the square root.

    Args:
        numbers: Sized collection of numeric values.

    Returns:
        The sample standard deviation as a float.

    Raises:
        ZeroDivisionError: If ``numbers`` has fewer than two values.
    """
    centre = mean(numbers)
    squaredDeviations = sum((value - centre) ** 2 for value in numbers)
    degreesOfFreedom = float(len(numbers) - 1)
    return math.sqrt(squaredDeviations / degreesOfFreedom)

In [67]:
# Check mean and stdev on a small list of integers.
numbers = [1,2,3,4,5]
print("mean : ", mean(numbers), ", stdev : " , stdev(numbers))

mean :  3.0 , stdev :  1.5811388300841898


In [70]:
def summarize(dataset) :
    """Compute (mean, stdev) for every attribute column of ``dataset``.

    The last column of each row is treated as the class label and is left
    out of the result.

    Args:
        dataset: Sequence of equally long rows whose last element is the
            class label and whose other elements are numeric attributes.

    Returns:
        A list with one ``(mean, stdev)`` tuple per attribute column, in
        column order. An empty dataset yields an empty list.
    """
    columns = list(zip(*dataset))
    # Drop the label column *before* computing statistics. Previously the
    # label column was summarised and then deleted, which failed for
    # non-numeric labels and raised IndexError on an empty dataset.
    return [(mean(attribute), stdev(attribute)) for attribute in columns[:-1]]

In [69]:
# Summarise a three-row toy dataset: two attribute columns plus a final
# label column that summarize leaves out of its result.
dataset = [[1,20,0], [2,21,1], [3,22,0]]
summary = summarize(dataset)
print('Attribute summaries: {0}'.format(summary))

Attribute summaries: [(2.0, 1.0), (21.0, 1.0)]


In [43]:
def summarizeByClass(dataset) :
    """Build per-class attribute summaries.

    The rows are grouped with ``separateByClass`` and each group is passed
    to ``summarize``.

    Args:
        dataset: Sequence of rows whose last element is the class label.

    Returns:
        A dict mapping each class label to the value ``summarize`` returns
        for that class's rows.
    """
    return {
        label: summarize(rows)
        for label, rows in separateByClass(dataset).items()
    }

In [44]:
# Per-class summaries for four toy rows, two with label 1 and two with label 0.
dataset = [[1,20,1], [2,21,0], [3,22,1], [4, 22, 0]]
summary = summarizeByClass(dataset)
print('Attribute summaries: {0}'.format(summary))

Attribute summaries: {1: [(2.0, 1.4142135623730951), (21.0, 1.4142135623730951)], 0: [(3.0, 1.4142135623730951), (21.5, 0.7071067811865476)]}


In [45]:
def calculateProbability(x, mean, stdev) :
    """Evaluate the Gaussian probability density at ``x``.

    Computes ``exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2 * pi) * stdev)``.

    Args:
        x: Point at which to evaluate the density.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density value as a float. When ``stdev`` is 0 the distribution
        is treated as a point mass at ``mean``: 1.0 is returned if ``x``
        equals ``mean`` and 0.0 otherwise.
    """
    # An attribute that is constant within a class has stdev == 0, which
    # would otherwise divide by zero below. Fall back to a point-mass
    # convention so a single constant column cannot abort prediction.
    if stdev == 0:
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-(math.pow(x-mean, 2)/(2*math.pow(stdev, 2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

In [46]:
# Evaluate the Gaussian density at x for a sample mean and standard deviation.
# The sample values deliberately get their own names: binding them to `mean`
# and `stdev` would replace the notebook-level functions of those names, and
# any later call to summarize() on a fresh top-to-bottom run would fail.
x = 71.5
sampleMean = 73
sampleStdev = 6.2
probability = calculateProbability(x, sampleMean, sampleStdev)
print('Probability of belonging to this class: {0}'.format(probability))

Probability of belonging to this class: 0.06248965759370005


In [47]:
def calculateClassProbabilities(summaries, inputVector) :
    """Score ``inputVector`` against every class summary.

    For each class, the Gaussian densities of the individual attributes are
    multiplied together, starting from 1.

    Args:
        summaries: Dict mapping class label to a list of ``(mean, stdev)``
            pairs, one per attribute.
        inputVector: Indexable sequence of attribute values; element ``i``
            is scored against the ``i``-th pair of each class. Elements
            beyond the number of pairs are not read.

    Returns:
        A dict mapping each class label to the product of its per-attribute
        densities.
    """
    scores = {}
    for label, attributeStats in summaries.items() :
        product = 1
        for position, (mu, sigma) in enumerate(attributeStats) :
            product *= calculateProbability(inputVector[position], mu, sigma)
        scores[label] = product
    return scores

In [48]:
# Toy model with one (mean, stdev) pair per class. Only inputVector[0] is
# read, because each class summary has a single entry.
summaries = {0:[(1, 0.5)], 1:[(20, 5.0)]}
inputVector = [1, 0.0]
probabilities = calculateClassProbabilities(summaries, inputVector)
print('Probabilities for each class: {0}'.format(probabilities))

Probabilities for each class: {0: 0.7978845608028654, 1: 5.838938515829206e-05}


In [73]:
def predict(summaries, inputVector) :
    """Return the class label with the highest score for ``inputVector``.

    Scores come from ``calculateClassProbabilities``. On a tie the label
    encountered first wins, because a later label must score strictly
    higher to replace the current best.

    Args:
        summaries: Dict mapping class label to per-attribute
            ``(mean, stdev)`` pairs.
        inputVector: Sequence of attribute values to classify.

    Returns:
        The winning class label, or ``None`` when ``summaries`` is empty.
    """
    scores = calculateClassProbabilities(summaries, inputVector)
    winner = None
    topScore = -1
    for label, score in scores.items():
        # Skip candidates that do not beat the current best.
        if winner is not None and not score > topScore :
            continue
        topScore = score
        winner = label
    return winner

In [50]:
# Classify one input against a two-class toy model. The trailing '?' stands
# in for the unknown label and is never read: each class summary has one
# entry, so only inputVector[0] is scored.
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]}
inputVector = [21.1, '?']
result = predict(summaries, inputVector)
print('Prediction: {0}'.format(result))

[21.1, '?']
Prediction: B


In [75]:
def getPredictions(summaries, testSet):
    """Predict a class label for every row of ``testSet``.

    Args:
        summaries: Dict mapping class label to per-attribute
            ``(mean, stdev)`` pairs, as accepted by ``predict``.
        testSet: Sequence of rows to classify.

    Returns:
        A list of the labels returned by ``predict``, in row order.
    """
    return [predict(summaries, row) for row in testSet]

In [52]:
# Classify two toy rows against the two-class toy model; the trailing '?' in
# each row is a placeholder label that the single-attribute model never reads.
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]}
testSet = [[1.1, '?'], [21.1, '?']]
predictions = getPredictions(summaries, testSet)
print('Predictions: {0}'.format(predictions))

[1.1, '?']
[1.1, '?']
[21.1, '?']
[21.1, '?']
Predictions: ['A', 'B']


In [53]:
def getAccuracy(testSet, prediction) :
    """Return the percentage of rows whose label matches the prediction.

    Args:
        testSet: Sequence of rows; the last element of each row is the true
            class label.
        prediction: Sequence of predicted labels; ``prediction[i]`` is
            compared with the label of ``testSet[i]``.

    Returns:
        The share of matching rows as a float between 0 and 100. An empty
        ``testSet`` yields 0.0 instead of dividing by zero.

    Raises:
        IndexError: If ``prediction`` has fewer entries than ``testSet``.
    """
    # Nothing to score: report 0% rather than raising ZeroDivisionError.
    if len(testSet) == 0:
        return 0.0
    correct = sum(
        1 for position, row in enumerate(testSet)
        if row[-1] == prediction[position]
    )
    return (correct/float(len(testSet))) * 100

In [54]:
# Two of the three rows end in 'a', so predicting 'a' for every row matches
# two out of three labels.
testSet = [[1,1,1,'a'], [2,2,2,'a'], [3,3,3,'b']]
predictions = ['a', 'a', 'a']
accuracy = getAccuracy(testSet, predictions)
print('Accuracy: {0}'.format(accuracy))

Accuracy: 66.66666666666666


In [55]:
def main(filename='data/pima-indians_data.csv', splitRatio=0.67):
    """Run the whole pipeline: load, split, fit, predict and report accuracy.

    Args:
        filename: Path of the CSV file to load with ``loadCsv``. Defaults to
            the file this notebook was written for.
        splitRatio: Fraction of rows passed to ``splitDataset`` for the
            training set. Defaults to 0.67.

    Returns:
        None. The split sizes and the accuracy percentage are printed.
    """
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split {0} rows into train={1} and test={2} rows'.format(len(dataset), len(trainingSet), len(testSet)))
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%'.format(accuracy))

In [76]:
# Run the full pipeline defined in main(); the split is random, so the
# reported accuracy varies between runs.
main()

Split 768 rows into train=514 and test=254 rows
Accuracy: 74.01574803149606%
