
Merge pull request #255 from noahnovsak/owinteractions-fix-calculation
OWInteractions: calculation avoids long operation
markotoplak committed Nov 22, 2023
2 parents 7469083 + 3d8faa0 commit 0d12c42
Showing 3 changed files with 103 additions and 75 deletions.
79 changes: 79 additions & 0 deletions orangecontrib/prototypes/interactions.py
@@ -0,0 +1,79 @@
import numpy as np


def get_row_ids(ar):
    row_ids = ar[:, 0].copy()
    # Assuming the data has been discretized into fewer
    # than 10000 bins and that `ar` has up to 3 columns,
    # this should work.
    # Alternatively, generating the steps like so might be safer:
    # steps = ar[:, :-1].max(axis=0) + 1
    # step_i = np.prod(steps[:i])
    for i in range(1, ar.shape[1]):
        row_ids += ar[:, i] * 10000 ** i
    return row_ids


def distribution(ar):
    nans = np.isnan(ar)

    if ar.ndim == 1:
        if nans.any():
            ar = ar[~nans]
    else:
        if nans.any():
            ar = ar[~nans.any(axis=1)]

        # Using `np.unique` with `axis=0` to get row frequencies
        # slows down the main thread!
        # I'm not sure why, but my guess is that the underlying
        # implementation doesn't release the GIL. The simplest
        # solution seems to be generating unique numbers/ids
        # based on the contents of each row.
        ar = get_row_ids(ar)

    _, counts = np.unique(ar, return_counts=True)
    return counts / ar.shape[0]


def entropy(ar):
    p = distribution(ar)
    return -np.sum(p * np.log2(p))


class InteractionScorer:
    def __init__(self, data):
        self.data = data
        self.class_entropy = 0
        self.information_gain = np.zeros(data.X.shape[1])

        self.precompute()

    def precompute(self):
        """
        Precompute the information gain of each attribute to speed up
        the computation and to drive the heuristic.
        Only the necessary NaNs are removed, to keep as much data as
        possible; this keeps entropies and information gains invariant
        of the third attribute. It also has the unintended side effect
        of sometimes producing negative information gains, as well as
        negative interactions with greater magnitude than the combined
        information gain.
        """
        self.class_entropy = entropy(self.data.Y)
        for attr in range(self.information_gain.size):
            self.information_gain[attr] = self.class_entropy \
                + entropy(self.data.X[:, attr]) \
                - entropy(np.column_stack((self.data.X[:, attr], self.data.Y)))

    def __call__(self, attr1, attr2):
        attrs = np.column_stack((self.data.X[:, attr1], self.data.X[:, attr2]))
        return self.class_entropy \
            - self.information_gain[attr1] \
            - self.information_gain[attr2] \
            + entropy(attrs) \
            - entropy(np.column_stack((attrs, self.data.Y)))

    def normalize(self, score):
        return score / self.class_entropy
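For reference, the hard-coded 10000 step above assumes fewer than 10000 bins per column; the safer alternative hinted at in the comment inside get_row_ids, deriving each step from the data itself, could look roughly like the sketch below (not part of the commit; `get_row_ids_safe` is a hypothetical name).

import numpy as np


def get_row_ids_safe(ar):
    # Mixed-radix row ids: each column's multiplier is the product of the
    # ranges (number of bins) of all previous columns, so the ids stay unique
    # no matter how many bins a column has. As in distribution(), `ar` is
    # assumed to have had its NaN rows removed already.
    row_ids = ar[:, 0].copy()
    steps = ar[:, :-1].max(axis=0) + 1
    for i in range(1, ar.shape[1]):
        row_ids += ar[:, i] * np.prod(steps[:i])
    return row_ids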
66 changes: 10 additions & 56 deletions orangecontrib/prototypes/widgets/owinteractions.py
@@ -25,6 +25,8 @@
from Orange.preprocess import Discretize, Remove
import Orange.widgets.data.owcorrelations

from orangecontrib.prototypes.interactions import InteractionScorer


SIZE_LIMIT = 1000000

@@ -43,52 +45,6 @@ def items():
return ["InfoGain Heuristic", "Random Search"]


class Interaction:
    def __init__(self, disc_data):
        self.data = disc_data
        self.n_attrs = len(self.data.domain.attributes)
        self.class_h = self.entropy(self.data.Y)
        self.attr_h = np.zeros(self.n_attrs)
        self.gains = np.zeros(self.n_attrs)
        self.removed_h = np.zeros((self.n_attrs, self.n_attrs))

        # Precompute information gain of each attribute for faster overall
        # computation and to create heuristic. Only removes necessary NaN values
        # to keep as much data as possible and keep entropies and information gains
        # invariant of third attribute.
        # In certain situations this can cause unexpected results i.e. negative
        # information gains or negative interactions lower than individual
        # attribute information.
        self.compute_gains()

    @staticmethod
    def distribution(ar):
        nans = np.isnan(ar)
        if nans.any():
            if len(ar.shape) == 1:
                ar = ar[~nans]
            else:
                ar = ar[~nans.any(axis=1)]
        _, counts = np.unique(ar, return_counts=True, axis=0)
        return counts / len(ar)

    def entropy(self, ar):
        p = self.distribution(ar)
        return -np.sum(p * np.log2(p))

    def compute_gains(self):
        for attr in range(self.n_attrs):
            self.attr_h[attr] = self.entropy(self.data.X[:, attr])
            self.gains[attr] = self.attr_h[attr] + self.class_h \
                - self.entropy(np.c_[self.data.X[:, attr], self.data.Y])

    def __call__(self, attr1, attr2):
        attrs = np.c_[self.data.X[:, attr1], self.data.X[:, attr2]]
        self.removed_h[attr1, attr2] = self.entropy(attrs) + self.class_h - self.entropy(np.c_[attrs, self.data.Y])
        score = self.removed_h[attr1, attr2] - self.gains[attr1] - self.gains[attr2]
        return score


class Heuristic:
    def __init__(self, weights, heuristic_type=None):
        self.n_attributes = len(weights)
@@ -193,7 +149,7 @@ class InteractionRank(Orange.widgets.data.owcorrelations.CorrelationRank):

    def __init__(self, *args):
        VizRankDialogAttrPair.__init__(self, *args)
        self.interaction = None
        self.scorer = None
        self.heuristic = None
        self.use_heuristic = False
        self.sel_feature_index = None
@@ -219,19 +175,17 @@ def initialize(self):
        self.use_heuristic = False
        self.sel_feature_index = self.master.feature and data.domain.index(self.master.feature)
        if data:
            if self.interaction is None or self.interaction.data != data:
                self.interaction = Interaction(data)
            if self.scorer is None or self.scorer.data != data:
                self.scorer = InteractionScorer(data)
            self.use_heuristic = len(data) * len(self.attrs) ** 2 > SIZE_LIMIT
            if self.use_heuristic and not self.sel_feature_index:
                self.heuristic = Heuristic(self.interaction.gains, self.master.heuristic_type)
                self.heuristic = Heuristic(self.scorer.information_gain, self.master.heuristic_type)

    def compute_score(self, state):
        attr1, attr2 = state
        h = self.interaction.class_h
        score = self.interaction(attr1, attr2) / h
        gain1 = self.interaction.gains[attr1] / h
        gain2 = self.interaction.gains[attr2] / h
        return score, gain1, gain2
        scores = (self.scorer(*state),
                  self.scorer.information_gain[state[0]],
                  self.scorer.information_gain[state[1]])
        return tuple(self.scorer.normalize(score) for score in scores)

    def row_for_state(self, score, state):
        attrs = sorted((self.attrs[x] for x in state), key=attrgetter("name"))
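For context, a rough usage sketch of the new scoring path (not from the patch; the iris data and attribute indices 0 and 1 are only an illustration) showing how compute_score now builds its normalized triple from InteractionScorer:

from Orange.data import Table
from Orange.preprocess import Discretize

from orangecontrib.prototypes.interactions import InteractionScorer

# Discretize a toy dataset, roughly as the widget does before scoring.
data = Discretize()(Table("iris"))
scorer = InteractionScorer(data)

# The same triple compute_score() returns: the interaction plus both
# information gains, each normalized by the class entropy.
state = (0, 1)
scores = (scorer(*state),
          scorer.information_gain[state[0]],
          scorer.information_gain[state[1]])
print(tuple(scorer.normalize(s) for s in scores))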
33 changes: 14 additions & 19 deletions orangecontrib/prototypes/widgets/tests/test_owinteractions.py
@@ -12,7 +12,8 @@
from Orange.widgets.visualize.owscatterplot import OWScatterPlot
from Orange.widgets.widget import AttributeList
from orangecontrib.prototypes.widgets.owinteractions import \
    OWInteractions, Heuristic, HeuristicType, Interaction, InteractionRank
    OWInteractions, Heuristic, HeuristicType, InteractionRank
from orangecontrib.prototypes.interactions import InteractionScorer


class TestOWInteractions(WidgetTest):
@@ -275,29 +276,23 @@ def test_compute_score(self):
        y = np.array([0, 1, 1, 1])
        domain = Domain([DiscreteVariable(str(i)) for i in range(2)], DiscreteVariable("3"))
        data = Table(domain, x, y)
        self.interaction = Interaction(data)
        npt.assert_almost_equal(self.interaction(0, 1), -0.1226, 4)
        npt.assert_almost_equal(self.interaction.class_h, 0.8113, 4)
        npt.assert_almost_equal(self.interaction.attr_h[0], 1., 4)
        npt.assert_almost_equal(self.interaction.attr_h[1], 0.8113, 4)
        npt.assert_almost_equal(self.interaction.gains[0], 0.3113, 4)
        npt.assert_almost_equal(self.interaction.gains[1], 0.1226, 4)
        npt.assert_almost_equal(self.interaction.removed_h[0, 1], 0.3113, 4)
        self.scorer = InteractionScorer(data)
        npt.assert_almost_equal(self.scorer(0, 1), -0.1226, 4)
        npt.assert_almost_equal(self.scorer.class_entropy, 0.8113, 4)
        npt.assert_almost_equal(self.scorer.information_gain[0], 0.3113, 4)
        npt.assert_almost_equal(self.scorer.information_gain[1], 0.1226, 4)

    def test_nans(self):
        """Check score calculation with sparse data"""
        x = np.array([[1, 1], [0, 1], [1, 1], [0, 0], [1, np.nan], [np.nan, 0], [np.nan, np.nan]])
        y = np.array([0, 1, 1, 1, 0, 0, 1])
        domain = Domain([DiscreteVariable(str(i)) for i in range(2)], DiscreteVariable("3"))
        data = Table(domain, x, y)
        self.interaction = Interaction(data)
        npt.assert_almost_equal(self.interaction(0, 1), 0.0167, 4)
        npt.assert_almost_equal(self.interaction.class_h, 0.9852, 4)
        npt.assert_almost_equal(self.interaction.attr_h[0], 0.9710, 4)
        npt.assert_almost_equal(self.interaction.attr_h[1], 0.9710, 4)
        npt.assert_almost_equal(self.interaction.gains[0], 0.4343, 4)
        npt.assert_almost_equal(self.interaction.gains[1], 0.0343, 4)
        npt.assert_almost_equal(self.interaction.removed_h[0, 1], 0.4852, 4)
        self.scorer = InteractionScorer(data)
        npt.assert_almost_equal(self.scorer(0, 1), 0.0167, 4)
        npt.assert_almost_equal(self.scorer.class_entropy, 0.9852, 4)
        npt.assert_almost_equal(self.scorer.information_gain[0], 0.4343, 4)
        npt.assert_almost_equal(self.scorer.information_gain[1], 0.0343, 4)


class TestHeuristic(unittest.TestCase):
@@ -307,8 +302,8 @@ def setUpClass(cls):

    def test_heuristic(self):
        """Check attribute pairs returned by heuristic"""
        score = Interaction(self.zoo)
        heuristic = Heuristic(score.gains, heuristic_type=HeuristicType.INFOGAIN)
        scorer = InteractionScorer(self.zoo)
        heuristic = Heuristic(scorer.information_gain, heuristic_type=HeuristicType.INFOGAIN)
        self.assertListEqual(
            list(heuristic.get_states(None))[:9],
            [(14, 6), (14, 10), (14, 15), (6, 10), (14, 5), (6, 15), (14, 11), (6, 5), (10, 15)]
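For reference, the expected values asserted in test_nans follow directly from the entropy definitions in interactions.py; a minimal check on the same toy data (not part of the patch):

import numpy as np

from orangecontrib.prototypes.interactions import entropy

x = np.array([[1, 1], [0, 1], [1, 1], [0, 0],
              [1, np.nan], [np.nan, 0], [np.nan, np.nan]])
y = np.array([0, 1, 1, 1, 0, 0, 1], dtype=float)

h_y = entropy(y)                                       # class entropy, ~0.9852
gain0 = (h_y + entropy(x[:, 0])
         - entropy(np.column_stack((x[:, 0], y))))     # information gain, ~0.4343
gain1 = (h_y + entropy(x[:, 1])
         - entropy(np.column_stack((x[:, 1], y))))     # information gain, ~0.0343
interaction = (h_y - gain0 - gain1
               + entropy(x) - entropy(np.column_stack((x, y))))      # ~0.0167
print(round(h_y, 4), round(gain0, 4), round(gain1, 4), round(interaction, 4))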
