diff --git a/learning.py b/learning.py
index fffbccf83..cdce29143 100644
--- a/learning.py
+++ b/learning.py
@@ -1,7 +1,7 @@
 """Learn to estimate functions from examples. (Chapters 18-20)"""
 
 from utils import (
-    removeall, unique, product, mode, argmax, argmax_random_tie, isclose,
+    removeall, unique, product, mode, argmax, argmax_random_tie, isclose, gaussian,
     dotproduct, vector_add, scalar_vector_product, weighted_sample_with_replacement,
     weighted_sampler, num_or_str, normalize, clip, sigmoid, print_table, DataFile
 )
@@ -11,7 +11,7 @@
 import math
 import random
 
-from statistics import mean
+from statistics import mean, stdev
 from collections import defaultdict
 
 # ______________________________________________________________________________
@@ -178,6 +178,46 @@ def remove_examples(self, value=""):
         self.examples = [x for x in self.examples if value not in x]
         self.update_values()
 
+    def split_values_by_classes(self):
+        """Split values into buckets according to their class."""
+        buckets = defaultdict(list)
+
+        for v in self.examples:
+            item = list(v)
+            del item[self.target]  # Remove the target by position, not by value
+            buckets[v[self.target]].append(item)  # Add item to bucket of its class
+
+        return buckets
+
+    def find_means_and_deviations(self):
+        """Finds the means and standard deviations of self.examples.
+        means     : A dictionary for each class/target. Holds a list of the means
+                    of the features for the class.
+        deviations: A dictionary for each class/target. Holds a list of the sample
+                    standard deviations of the features for the class."""
+        target_names = self.values[self.target]
+        feature_numbers = len(self.inputs)
+
+        item_buckets = self.split_values_by_classes()
+
+        means = defaultdict(lambda: [0 for i in range(feature_numbers)])
+        deviations = defaultdict(lambda: [0 for i in range(feature_numbers)])
+
+        for t in target_names:
+            # Find all the item feature values for item in class t
+            features = [[] for i in range(feature_numbers)]
+            for item in item_buckets[t]:
+                for i in range(feature_numbers):
+                    features[i].append(item[i])
+
+            # Calculate means and deviations for the class
+            for i in range(feature_numbers):
+                means[t][i] = mean(features[i])
+                deviations[t][i] = stdev(features[i])
+
+        return means, deviations
+
+
     def __repr__(self):
         return '<DataSet({}): {:d} examples, {:d} attributes>'.format(
             self.name, len(self.examples), len(self.attrs))
@@ -267,15 +307,23 @@ def predict(example):
 # ______________________________________________________________________________
 
 
-def NaiveBayesLearner(dataset):
+def NaiveBayesLearner(dataset, continuous=True):
+    """Return a naive Bayes predictor for dataset: Gaussian likelihoods when
+    continuous is true, counted (discrete) likelihoods otherwise."""
+    if continuous:
+        return NaiveBayesContinuous(dataset)
+    return NaiveBayesDiscrete(dataset)
+
+
+def NaiveBayesDiscrete(dataset):
     """Just count how many times each value of each input attribute
     occurs, conditional on the target value. Count the different
     target values too."""
 
-    targetvals = dataset.values[dataset.target]
-    target_dist = CountingProbDist(targetvals)
+    target_vals = dataset.values[dataset.target]
+    target_dist = CountingProbDist(target_vals)
     attr_dists = {(gv, attr): CountingProbDist(dataset.values[attr])
-                  for gv in targetvals
+                  for gv in target_vals
                   for attr in dataset.inputs}
     for example in dataset.examples:
         targetval = example[dataset.target]
@@ -290,7 +338,32 @@ def class_probability(targetval):
             return (target_dist[targetval] *
                     product(attr_dists[targetval, attr][example[attr]]
                             for attr in dataset.inputs))
-        return argmax(targetvals, key=class_probability)
+        return argmax(target_vals, key=class_probability)
+
+    return predict
+
+
+def NaiveBayesContinuous(dataset):
+    """Count how many times each target value occurs.
+    Also, find the means and deviations of input attribute values for each target value."""
+    means, deviations = dataset.find_means_and_deviations()
+
+    target_vals = dataset.values[dataset.target]
+    target_dist = CountingProbDist(target_vals)
+    for example in dataset.examples:
+        # Without this the class prior keeps its initial, uniform counts
+        target_dist.add(example[dataset.target])
+
+    def predict(example):
+        """Predict the target value for example. Consider each possible value,
+        and pick the most likely by looking at each attribute independently."""
+        def class_probability(targetval):
+            prob = target_dist[targetval]
+            for attr in dataset.inputs:
+                prob *= gaussian(means[targetval][attr], deviations[targetval][attr], example[attr])
+            return prob
+
+        return argmax(target_vals, key=class_probability)
 
     return predict
 
diff --git a/tests/test_learning.py b/tests/test_learning.py
index 1bac9a4cc..8ab7b3d1b 100644
--- a/tests/test_learning.py
+++ b/tests/test_learning.py
@@ -35,6 +35,20 @@ def test_weighted_replicate():
     assert weighted_replicate('ABC', [1, 2, 1], 4) == ['A', 'B', 'B', 'C']
 
 
+def test_means_and_deviation():
+    iris = DataSet(name="iris")
+
+    means, deviations = iris.find_means_and_deviations()
+
+    assert round(means["setosa"][0], 3) == 5.006
+    assert round(means["versicolor"][0], 3) == 5.936
+    assert round(means["virginica"][0], 3) == 6.588
+
+    assert round(deviations["setosa"][0], 3) == 0.352
+    assert round(deviations["versicolor"][0], 3) == 0.516
+    assert round(deviations["virginica"][0], 3) == 0.636
+
+
 def test_plurality_learner():
     zoo = DataSet(name="zoo")
 
@@ -48,6 +62,14 @@ def test_naive_bayes():
     # Discrete
-    nBD = NaiveBayesLearner(iris)
+    nBD = NaiveBayesLearner(iris, continuous=False)
     assert nBD([5, 3, 1, 0.1]) == "setosa"
+    assert nBD([6, 3, 4, 1.1]) == "versicolor"
+    assert nBD([7.7, 3, 6, 2]) == "virginica"
+
+    # Continuous
+    nBC = NaiveBayesLearner(iris, continuous=True)
+    assert nBC([5, 3, 1, 0.1]) == "setosa"
+    assert nBC([6, 5, 3, 1.5]) == "versicolor"
+    assert nBC([7, 3, 6.5, 2]) == "virginica"
 
 
 def test_k_nearest_neighbors():
diff --git a/tests/test_utils.py b/tests/test_utils.py
index ae39cf50e..f444dccca 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -148,6 +148,12 @@ def test_sigmoid():
     assert isclose(0.2689414213699951, sigmoid(-1))
 
 
+def test_gaussian():
+    assert isclose(gaussian(1, 0.5, 0.7), 0.6664492057835993)
+    assert isclose(gaussian(5, 2, 4.5), 0.19333405840142462)
+    assert isclose(gaussian(3, 1, 3), 0.3989422804014327)
+
+
 def test_step():
     assert step(1) == step(0.5) == 1
     assert step(0) == 1
diff --git a/utils.py b/utils.py
index d738f62e6..592258c41 100644
--- a/utils.py
+++ b/utils.py
@@ -258,6 +258,11 @@ def step(x):
     """Return activation value of x with sign function"""
     return 1 if x >= 0 else 0
 
+
+def gaussian(mean, st_dev, x):
+    """Return the Gaussian probability density at x, given the mean and standard deviation."""
+    return 1/(math.sqrt(2*math.pi)*st_dev)*math.e**(-0.5*(float(x-mean)/st_dev)**2)
+
 
 try:  # math.isclose was added in Python 3.5; but we might be in 3.4
     from math import isclose