Skip to content

Commit

Permalink
Merge pull request #6573 from noahnovsak/naive-bayes-tests
Browse files Browse the repository at this point in the history
Refactor tests for NaiveBayesLearner
  • Loading branch information
markotoplak committed Sep 14, 2023
2 parents 484becf + 67fc41c commit 480930e
Showing 1 changed file with 75 additions and 161 deletions.
236 changes: 75 additions & 161 deletions Orange/tests/test_naive_bayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,42 @@
from Orange.classification import NaiveBayesLearner
from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable
from Orange.evaluation import CrossValidation, CA
from Orange.tests import test_filename


# This class is used to force predict_storage to fall back to the slower
# procedure instead of calling `predict`
from Orange.tests import test_filename


class NotATable(Table): # pylint: disable=too-many-ancestors,abstract-method
@classmethod
def from_file(cls, *args, **kwargs):
table = super().from_file(*args, **kwargs)
return cls(table)


def assert_predictions_equal(data, model, exp_probs):
exp_vals = np.argmax(np.atleast_2d(exp_probs), axis=1)
np.testing.assert_almost_equal(model(data, ret=model.Probs), exp_probs)
np.testing.assert_equal(model(data), exp_vals)
values, probs = model(data, ret=model.ValueProbs)
np.testing.assert_almost_equal(probs, exp_probs)
np.testing.assert_equal(values, exp_vals)


def assert_model_equal(model, results):
np.testing.assert_almost_equal(
model.class_prob,
results[0])
np.testing.assert_almost_equal(
np.exp(model.log_cont_prob[0]) * model.class_prob[:, None],
results[1])
np.testing.assert_almost_equal(
np.exp(model.log_cont_prob[1]) * model.class_prob[:, None],
results[2])
np.testing.assert_almost_equal(
np.exp(model.log_cont_prob[2]) * model.class_prob[:, None],
results[3])


class TestNaiveBayesLearner(unittest.TestCase):
@classmethod
def setUpClass(cls):
Expand Down Expand Up @@ -100,20 +122,22 @@ def test_compare_results_of_predict_and_predict_storage(self):

def test_predictions(self):
self._test_predictions(sparse=None)
self._test_predictions_with_absent_class(sparse=None)
self._test_predictions(sparse=None, absent_class=True)
self._test_predict_missing_attributes(sparse=None)

def test_predictions_csr_matrix(self):
self._test_predictions(sparse=sp.csr_matrix)
self._test_predictions_with_absent_class(sparse=sp.csr_matrix)
self._test_predictions(sparse=sp.csr_matrix, absent_class=True)
self._test_predict_missing_attributes(sparse=sp.csr_matrix)

def test_predictions_csc_matrix(self):
self._test_predictions(sparse=sp.csc_matrix)
self._test_predictions_with_absent_class(sparse=sp.csc_matrix)
self._test_predictions(sparse=sp.csc_matrix, absent_class=True)
self._test_predict_missing_attributes(sparse=sp.csc_matrix)

def _test_predictions(self, sparse):
@staticmethod
def _create_prediction_data(sparse, absent_class=False):
""" The following was computed manually """
x = np.array([
[1, 0, 0],
[0, np.nan, 0],
Expand All @@ -127,27 +151,13 @@ def _test_predictions(self, sparse):
x = sparse(x)

y = np.array([0, 0, 0, 1, 1, 1, 2, 2])
domain = Domain(
[DiscreteVariable("a", values="ab"),
DiscreteVariable("b", values="abc"),
DiscreteVariable("c", values="a")],
DiscreteVariable("y", values="abc"))
data = Table.from_numpy(domain, x, y)

model = self.learner(data)
np.testing.assert_almost_equal(
model.class_prob,
[4/11, 4/11, 3/11]
)
np.testing.assert_almost_equal(
np.exp(model.log_cont_prob[0]) * model.class_prob[:, None],
[[3/7, 2/7], [2/7, 3/7], [2/7, 2/7]])
np.testing.assert_almost_equal(
np.exp(model.log_cont_prob[1]) * model.class_prob[:, None],
[[2/5, 1/3, 1/5], [2/5, 1/3, 2/5], [1/5, 1/3, 2/5]])
np.testing.assert_almost_equal(
np.exp(model.log_cont_prob[2]) * model.class_prob[:, None],
[[4/11], [4/11], [3/11]])
class_var = DiscreteVariable("y", values="abc")
results = [
[4/11, 4/11, 3/11],
[[3/7, 2/7], [2/7, 3/7], [2/7, 2/7]],
[[2/5, 1/3, 1/5], [2/5, 1/3, 2/5], [1/5, 1/3, 2/5]],
[[4/11], [4/11], [3/11]]
]

test_x = np.array([[a, b, 0] for a in [0, 1] for b in [0, 1, 2]])
# Classifiers reject csc matrices in the base class
Expand All @@ -156,7 +166,7 @@ def _test_predictions(self, sparse):
if sparse is not None and sparse is not sp.csc_matrix:
test_x = sparse(test_x)
test_y = np.full((6, ), np.nan)
# The following was computed manually, too

exp_probs = np.array([
[0.47368421052632, 0.31578947368421, 0.21052631578947],
[0.39130434782609, 0.26086956521739, 0.34782608695652],
Expand All @@ -166,155 +176,54 @@ def _test_predictions(self, sparse):
[0.15000000000000, 0.45000000000000, 0.40000000000000]
])

# Test the faster algorithm for Table (numpy matrices)
test_data = Table.from_numpy(domain, test_x, test_y)
probs = model(test_data, ret=model.Probs)
np.testing.assert_almost_equal(exp_probs, probs)
values = model(test_data)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
values, probs = model(test_data, ret=model.ValueProbs)
np.testing.assert_almost_equal(exp_probs, probs)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))

# Test the slower algorithm for non-Table data (iteration in Python)
test_data = NotATable.from_numpy(domain, test_x, test_y)
probs = model(test_data, ret=model.Probs)
np.testing.assert_almost_equal(exp_probs, probs)
values = model(test_data)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
values, probs = model(test_data, ret=model.ValueProbs)
np.testing.assert_almost_equal(exp_probs, probs)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))

# Test prediction directly on numpy
probs = model(test_x, ret=model.Probs)
np.testing.assert_almost_equal(exp_probs, probs)
values = model(test_x)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
values, probs = model(test_x, ret=model.ValueProbs)
np.testing.assert_almost_equal(exp_probs, probs)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))

# Test prediction on instances
for inst, exp_prob in zip(test_data, exp_probs):
np.testing.assert_almost_equal(
model(inst, ret=model.Probs),
exp_prob)
self.assertEqual(model(inst), np.argmax(exp_prob))
value, prob = model(inst, ret=model.ValueProbs)
np.testing.assert_almost_equal(prob, exp_prob)
self.assertEqual(value, np.argmax(exp_prob))

# Test prediction by directly calling predict. This is needed to test
# csc_matrix, but doesn't hurt others
if sparse is sp.csc_matrix:
test_x = sparse(test_x)
values, probs = model.predict(test_x)
np.testing.assert_almost_equal(exp_probs, probs)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))

def _test_predictions_with_absent_class(self, sparse):
"""Empty classes should not affect predictions"""
x = np.array([
[1, 0, 0],
[0, np.nan, 0],
[0, 1, 0],
[0, 0, 0],
[1, 2, 0],
[1, 1, 0],
[1, 2, 0],
[0, 1, 0]])
if sparse is not None:
x = sparse(x)
if absent_class:
y = np.array([0, 0, 0, 2, 2, 2, 3, 3])
class_var = DiscreteVariable("y", values="abcd")
for i, row in enumerate(results):
row.insert(1, i and [0]*len(row[0]))
exp_probs = np.insert(exp_probs, 1, 0, axis=1)

y = np.array([0, 0, 0, 2, 2, 2, 3, 3])
domain = Domain(
[DiscreteVariable("a", values="ab"),
DiscreteVariable("b", values="abc"),
DiscreteVariable("c", values="a")],
DiscreteVariable("y", values="abcd"))
class_var)
data = Table.from_numpy(domain, x, y)

model = self.learner(data)
np.testing.assert_almost_equal(
model.class_prob,
[4/11, 0, 4/11, 3/11]
)
np.testing.assert_almost_equal(
np.exp(model.log_cont_prob[0]) * model.class_prob[:, None],
[[3/7, 2/7], [0, 0], [2/7, 3/7], [2/7, 2/7]])
np.testing.assert_almost_equal(
np.exp(model.log_cont_prob[1]) * model.class_prob[:, None],
[[2/5, 1/3, 1/5], [0, 0, 0], [2/5, 1/3, 2/5], [1/5, 1/3, 2/5]])
np.testing.assert_almost_equal(
np.exp(model.log_cont_prob[2]) * model.class_prob[:, None],
[[4/11], [0], [4/11], [3/11]])
return data, domain, results, test_x, test_y, exp_probs

test_x = np.array([[a, b, 0] for a in [0, 1] for b in [0, 1, 2]])
# Classifiers reject csc matrices in the base class
# Naive bayesian classifier supports them if predict_storage is
# called directly, which we do below
if sparse is not None and sparse is not sp.csc_matrix:
test_x = sparse(test_x)
test_y = np.full((6, ), np.nan)
# The following was computed manually, too
exp_probs = np.array([
[0.47368421052632, 0, 0.31578947368421, 0.21052631578947],
[0.39130434782609, 0, 0.26086956521739, 0.34782608695652],
[0.24324324324324, 0, 0.32432432432432, 0.43243243243243],
[0.31578947368421, 0, 0.47368421052632, 0.21052631578947],
[0.26086956521739, 0, 0.39130434782609, 0.34782608695652],
[0.15000000000000, 0, 0.45000000000000, 0.40000000000000]
])
def _test_predictions(self, sparse, absent_class=False):
(data, domain, results,
test_x, test_y, exp_probs) = self._create_prediction_data(sparse, absent_class)

model = self.learner(data)
assert_model_equal(model, results)

# Test the faster algorithm for Table (numpy matrices)
test_data = Table.from_numpy(domain, test_x, test_y)
probs = model(test_data, ret=model.Probs)
np.testing.assert_almost_equal(exp_probs, probs)
values = model(test_data)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
values, probs = model(test_data, ret=model.ValueProbs)
np.testing.assert_almost_equal(exp_probs, probs)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
assert_predictions_equal(test_data, model, exp_probs)

# Test the slower algorithm for non-Table data (iteration in Python)
test_data = NotATable.from_numpy(domain, test_x, test_y)
probs = model(test_data, ret=model.Probs)
np.testing.assert_almost_equal(exp_probs, probs)
values = model(test_data)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
values, probs = model(test_data, ret=model.ValueProbs)
np.testing.assert_almost_equal(exp_probs, probs)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
assert_predictions_equal(test_data, model, exp_probs)

# Test prediction directly on numpy
probs = model(test_x, ret=model.Probs)
np.testing.assert_almost_equal(exp_probs, probs)
values = model(test_x)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
values, probs = model(test_x, ret=model.ValueProbs)
np.testing.assert_almost_equal(exp_probs, probs)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
assert_predictions_equal(test_x, model, exp_probs)

# Test prediction on instances
for inst, exp_prob in zip(test_data, exp_probs):
np.testing.assert_almost_equal(
model(inst, ret=model.Probs),
exp_prob)
self.assertEqual(model(inst), np.argmax(exp_prob))
value, prob = model(inst, ret=model.ValueProbs)
np.testing.assert_almost_equal(prob, exp_prob)
self.assertEqual(value, np.argmax(exp_prob))
assert_predictions_equal(inst, model, exp_prob)

# Test prediction by directly calling predict. This is needed to test
# csc_matrix, but doesn't hurt others
if sparse is sp.csc_matrix:
test_x = sparse(test_x)
values, probs = model.predict(test_x)
np.testing.assert_almost_equal(exp_probs, probs)
np.testing.assert_almost_equal(probs, exp_probs)
np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))

def _test_predict_missing_attributes(self, sparse):
@staticmethod
def _create_missing_attributes(sparse):
x = np.array([
[1, 0, 0],
[0, 1, 0],
Expand All @@ -325,24 +234,29 @@ def _test_predict_missing_attributes(self, sparse):
[1, 2, np.nan]])
if sparse is not None:
x = sparse(x)
y = np.array([1,0,0,0,1,1,1])
y = np.array([1, 0, 0, 0, 1, 1, 1])

test_x = np.array([[np.nan, np.nan, np.nan],
[np.nan, 0, np.nan],
[0, np.nan, np.nan]])
if sparse is not None and sparse is not sp.csc_matrix:
test_x = sparse(test_x)
exp_probs = np.array([[(3 + 1) / (7 + 2), (4 + 1) / (7 + 2)],
[(1 + 1) / (2 + 2), (1 + 1) / (2 + 2)],
[(3 + 1) / (3 + 2), (0 + 1) / (3 + 2)]])

domain = Domain(
[DiscreteVariable("a", values="ab"),
DiscreteVariable("b", values="abc"),
DiscreteVariable("c", values="a")],
DiscreteVariable("y", values="AB"))
data = Table.from_numpy(domain, x, y)
return Table.from_numpy(domain, x, y), test_x, exp_probs

def _test_predict_missing_attributes(self, sparse):
data, test_x, exp_probs = self._create_missing_attributes(sparse)
model = self.learner(data)
test_x = np.array([[np.nan, np.nan, np.nan],
[np.nan, 0, np.nan],
[0, np.nan, np.nan]])
if sparse is not None and sparse is not sp.csc_matrix:
test_x = sparse(test_x)
probs = model(test_x, ret=model.Probs)
np.testing.assert_almost_equal(probs, [[(3+1)/(7+2), (4+1)/(7+2)],
[(1+1)/(2+2), (1+1)/(2+2)],
[(3+1)/(3+2), (0+1)/(3+2)]])
np.testing.assert_almost_equal(probs, exp_probs)

def test_no_attributes(self):
y = np.array([0, 0, 0, 1, 1, 1, 2, 2])
Expand Down

0 comments on commit 480930e

Please sign in to comment.