From 8f914f9e74469962d79d2a7a86dc5b82017c5acd Mon Sep 17 00:00:00 2001 From: Antonis Maronikolakis Date: Wed, 29 Mar 2017 19:07:15 +0300 Subject: [PATCH 1/8] Add Gaussian Function --- utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils.py b/utils.py index ed44f1e9e..f5d960442 100644 --- a/utils.py +++ b/utils.py @@ -258,6 +258,10 @@ def step(x): """Return activation value of x with sign function""" return 1 if x >= 0 else 0 +def gaussian(mean, st_dev, x): + """Given the mean and standard deviation of a distribution, it returns the probability of x.""" + return 1/(math.sqrt(2*math.pi)*st_dev)*math.e**(-0.5*(float(x-mean)/st_dev)**2) + try: # math.isclose was added in Python 3.5; but we might be in 3.4 from math import isclose From 293a414087eb3ef5bfcdbc4f0bbc81491dda930e Mon Sep 17 00:00:00 2001 From: Antonis Maronikolakis Date: Wed, 29 Mar 2017 19:08:39 +0300 Subject: [PATCH 2/8] Added Tests Add tests for Continuous Naive Bayes + Means/Standard Deviation --- tests/test_learning.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/tests/test_learning.py b/tests/test_learning.py index 4f618f7c1..8b29437b6 100644 --- a/tests/test_learning.py +++ b/tests/test_learning.py @@ -22,6 +22,20 @@ def test_weighted_replicate(): assert weighted_replicate('ABC', [1, 2, 1], 4) == ['A', 'B', 'B', 'C'] +def test_means_and_deviation(): + iris = DataSet(name="iris") + + means, deviations = iris.find_means_and_deviations() + + assert means["setosa"] == [5.006, 3.418, 1.464, 0.244] + assert means["versicolor"] == [5.936, 2.77, 4.26, 1.326] + assert means["virginica"] == [6.588, 2.974, 5.552, 2.026] + + assert round(deviations["setosa"][0],3) == 0.352 + assert round(deviations["versicolor"][0],3) == 0.516 + assert round(deviations["virginica"][0],3) == 0.636 + + def test_plurality_learner(): zoo = DataSet(name="zoo") @@ -32,8 +46,14 @@ def test_plurality_learner(): def test_naive_bayes(): iris = DataSet(name="iris") - nB = 
NaiveBayesLearner(iris) - assert nB([5,3,1,0.1]) == "setosa" + # Discrete + nBD = NaiveBayesLearner(iris) + assert nBD([5,3,1,0.1]) == "setosa" + + # Continuous + nBC = NaiveBayesLearner(iris, continuous=True) + assert nBC([5,3,1,0.1]) == "setosa" + assert nBC([7,3,6.5,2]) == "virginica" def test_k_nearest_neighbors(): From 1abefb856858cc0c5443eb17727d454e5234f02c Mon Sep 17 00:00:00 2001 From: Antonis Maronikolakis Date: Wed, 29 Mar 2017 19:13:58 +0300 Subject: [PATCH 3/8] Update learning.py --- learning.py | 80 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 7 deletions(-) diff --git a/learning.py b/learning.py index ec685131d..1768782e6 100644 --- a/learning.py +++ b/learning.py @@ -1,7 +1,7 @@ """Learn to estimate functions from examples. (Chapters 18-20)""" from utils import ( - removeall, unique, product, mode, argmax, argmax_random_tie, isclose, + removeall, unique, product, mode, argmax, argmax_random_tie, isclose, gaussian, dotproduct, vector_add, scalar_vector_product, weighted_sample_with_replacement, weighted_sampler, num_or_str, normalize, clip, sigmoid, print_table, DataFile ) @@ -11,7 +11,7 @@ import math import random -from statistics import mean +from statistics import mean, stdev from collections import defaultdict # ______________________________________________________________________________ @@ -174,6 +174,45 @@ def remove_examples(self, value=""): self.examples = [x for x in self.examples if value not in x] self.update_values() + def split_values_by_classes(self): + """Split values into buckets according to their class.""" + buckets = defaultdict(lambda: []) + target_names = self.values[self.target] + + for v in self.examples: + item = [a for a in v if a not in target_names] # Remove target from item + buckets[v[self.target]].append(item) # Add item to bucket of its class + + return buckets + + def find_means_and_deviations(self): + """Finds the means and standard deviations of self.dataset. 
+ means : A dictionary for each class/target. Holds a list of the means + of the features for the class. + deviations: A dictionary for each class/target. Holds a list of the sample + standard deviations of the features for the class.""" + target_names = self.values[self.target] + feature_numbers = len(self.inputs) + + item_buckets = self.split_values_by_classes() + + means = defaultdict(lambda: [0 for i in range(feature_numbers)]) + deviations = defaultdict(lambda: [0 for i in range(feature_numbers)]) + + for t in target_names: + # Find all the item feature values for item in class t + features = [[] for i in range(feature_numbers)] + for item in item_buckets[t]: + features = [features[i] + [item[i]] for i in range(feature_numbers)] + + # Calculate means and deviations of the class + for i in range(feature_numbers): + means[t][i] = mean(features[i]) + deviations[t][i] = stdev(features[i]) + + return means, deviations + + def __repr__(self): return ''.format( self.name, len(self.examples), len(self.attrs)) @@ -263,15 +302,22 @@ def predict(example): # ______________________________________________________________________________ -def NaiveBayesLearner(dataset): +def NaiveBayesLearner(dataset, continuous=True): + if(continuous): + return NaiveBayesContinuous(dataset) + else: + return NaiveBayesDiscrete(dataset) + + +def NaiveBayesDiscrete(dataset): """Just count how many times each value of each input attribute occurs, conditional on the target value. 
Count the different target values too.""" - targetvals = dataset.values[dataset.target] - target_dist = CountingProbDist(targetvals) + target_vals = dataset.values[dataset.target] + target_dist = CountingProbDist(target_vals) attr_dists = {(gv, attr): CountingProbDist(dataset.values[attr]) - for gv in targetvals + for gv in target_vals for attr in dataset.inputs} for example in dataset.examples: targetval = example[dataset.target] @@ -286,7 +332,27 @@ def class_probability(targetval): return (target_dist[targetval] * product(attr_dists[targetval, attr][example[attr]] for attr in dataset.inputs)) - return argmax(targetvals, key=class_probability) + return argmax(target_vals, key=class_probability) + + return predict + + +def NaiveBayesContinuous(dataset): + means, deviations = dataset.find_means_and_deviations() + + target_vals = dataset.values[dataset.target] + target_dist = CountingProbDist(target_vals) + + def predict(example): + """Predict the target value for example. Consider each possible value, + and pick the most likely by looking at each attribute independently.""" + def class_probability(targetval): + prob = target_dist[targetval] + for attr in dataset.inputs: + prob *= gaussian(means[targetval][attr], deviations[targetval][attr], example[attr]) + return prob + + return argmax(target_vals, key=class_probability) return predict From 5b627d4ca1e4e3f372c24bf10cc1c21ce5cc4a93 Mon Sep 17 00:00:00 2001 From: Antonis Maronikolakis Date: Wed, 29 Mar 2017 19:21:22 +0300 Subject: [PATCH 4/8] Commenting Fix --- learning.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/learning.py b/learning.py index 1768782e6..caf1af60a 100644 --- a/learning.py +++ b/learning.py @@ -338,6 +338,8 @@ def class_probability(targetval): def NaiveBayesContinuous(dataset): + """Count how many times each target value occurs. 
+ Also, find the means and deviations of input attribute values for each target value.""" means, deviations = dataset.find_means_and_deviations() target_vals = dataset.values[dataset.target] From cf5d7727187e4c0d2c71a37ba1d566637bcec9ad Mon Sep 17 00:00:00 2001 From: Antonis Maronikolakis Date: Thu, 30 Mar 2017 20:47:46 +0300 Subject: [PATCH 5/8] Add test for gaussian --- tests/test_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_utils.py b/tests/test_utils.py index 76e0421b3..6e0ca160d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -136,6 +136,12 @@ def test_sigmoid(): assert isclose(0.2689414213699951, sigmoid(-1)) +def test_gaussian(): + assert gaussian(1,0.5,0.7) == 0.6664492057835993 + assert gaussian(5,2,4.5) == 0.19333405840142462 + assert gaussian(3,1,3) == 0.3989422804014327 + + def test_step(): assert step(1) == step(0.5) == 1 assert step(0) == 1 From a86c1027112c541410038c5df7200cdbc53bcb51 Mon Sep 17 00:00:00 2001 From: Antonis Maronikolakis Date: Fri, 7 Apr 2017 12:27:44 +0300 Subject: [PATCH 6/8] test for every class --- tests/test_learning.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_learning.py b/tests/test_learning.py index 8b29437b6..4b1d2ecb3 100644 --- a/tests/test_learning.py +++ b/tests/test_learning.py @@ -49,10 +49,13 @@ def test_naive_bayes(): # Discrete nBD = NaiveBayesLearner(iris) assert nBD([5,3,1,0.1]) == "setosa" + assert nBD([6,5,3,1.5]) == "versicolor" + assert nBD([7,3,6.5,2]) == "virginica" # Continuous nBC = NaiveBayesLearner(iris, continuous=True) assert nBC([5,3,1,0.1]) == "setosa" + assert nBC([6,5,3,1.5]) == "versicolor" assert nBC([7,3,6.5,2]) == "virginica" From c8ae88a9f702872c13bd3194886edddc266901d3 Mon Sep 17 00:00:00 2001 From: Antonis Maronikolakis Date: Fri, 7 Apr 2017 22:33:06 +0300 Subject: [PATCH 7/8] Update test_learning.py --- tests/test_learning.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_learning.py 
b/tests/test_learning.py index 109c7a7d4..5a3302cbf 100644 --- a/tests/test_learning.py +++ b/tests/test_learning.py @@ -35,6 +35,20 @@ def test_weighted_replicate(): assert weighted_replicate('ABC', [1, 2, 1], 4) == ['A', 'B', 'B', 'C'] +def test_means_and_deviation(): + iris = DataSet(name="iris") + + means, deviations = iris.find_means_and_deviations() + + assert means["setosa"] == [5.006, 3.418, 1.464, 0.244] + assert means["versicolor"] == [5.936, 2.77, 4.26, 1.326] + assert means["virginica"] == [6.588, 2.974, 5.552, 2.026] + + assert round(deviations["setosa"][0],3) == 0.352 + assert round(deviations["versicolor"][0],3) == 0.516 + assert round(deviations["virginica"][0],3) == 0.636 + + def test_plurality_learner(): zoo = DataSet(name="zoo") From 1442d42c2facfae84e939322d91c7406608bf2cf Mon Sep 17 00:00:00 2001 From: Antonis Maronikolakis Date: Thu, 13 Apr 2017 15:29:10 +0300 Subject: [PATCH 8/8] Round float results to make sure test passes --- tests/test_learning.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_learning.py b/tests/test_learning.py index 5a3302cbf..6f3a2ca03 100644 --- a/tests/test_learning.py +++ b/tests/test_learning.py @@ -40,13 +40,13 @@ def test_means_and_deviation(): means, deviations = iris.find_means_and_deviations() - assert means["setosa"] == [5.006, 3.418, 1.464, 0.244] - assert means["versicolor"] == [5.936, 2.77, 4.26, 1.326] - assert means["virginica"] == [6.588, 2.974, 5.552, 2.026] + assert round(means["setosa"][0], 3) == 5.006 + assert round(means["versicolor"][0], 3) == 5.936 + assert round(means["virginica"][0], 3) == 6.588 - assert round(deviations["setosa"][0],3) == 0.352 - assert round(deviations["versicolor"][0],3) == 0.516 - assert round(deviations["virginica"][0],3) == 0.636 + assert round(deviations["setosa"][0], 3) == 0.352 + assert round(deviations["versicolor"][0], 3) == 0.516 + assert round(deviations["virginica"][0], 3) == 0.636 def test_plurality_learner():