
Merge pull request #255 from noahnovsak/owinteractions-fix-calculation
OWInteractions: calculation avoids long operation
markotoplak committed Nov 22, 2023
2 parents 7469083 + 3d8faa0 commit 0d12c42
Showing 3 changed files with 103 additions and 75 deletions.
79 changes: 79 additions & 0 deletions orangecontrib/prototypes/interactions.py
@@ -0,0 +1,79 @@
import numpy as np


def get_row_ids(ar):
    row_ids = ar[:, 0].copy()
    # Assuming the data has been discretized into fewer
    # than 10000 bins and that `ar` has up to 3 columns,
    # this should work.
    # Alternatively, generating the steps like so might be safer:
    # steps = ar[:, :-1].max(axis=0) + 1
    # step_i = np.prod(steps[:i])
    for i in range(1, ar.shape[1]):
        row_ids += ar[:, i] * 10000 ** i
    return row_ids


def distribution(ar):
    nans = np.isnan(ar)

    if ar.ndim == 1:
        if nans.any():
            ar = ar[~nans]
    else:
        if nans.any():
            ar = ar[~nans.any(axis=1)]

        # Using `np.unique` with `axis=0` to get row frequencies
        # slows down the main thread!
        # I'm not sure why, but my guess is that the underlying
        # implementation doesn't release the GIL. The simplest
        # solution seems to be generating unique numbers/ids
        # based on the contents of each row.
        ar = get_row_ids(ar)

    _, counts = np.unique(ar, return_counts=True)
    return counts / ar.shape[0]


def entropy(ar):
    p = distribution(ar)
    return -np.sum(p * np.log2(p))


class InteractionScorer:
    def __init__(self, data):
        self.data = data
        self.class_entropy = 0
        self.information_gain = np.zeros(data.X.shape[1])

        self.precompute()

    def precompute(self):
        """
        Precompute the information gain of each attribute to speed up
        the computation and to drive the heuristic.
        Only the necessary NaNs are removed, to keep as much data as
        possible; this keeps entropies and information gains invariant
        of the third attribute. It also has the unintended side effect
        of sometimes producing negative information gains, as well as
        negative interactions with greater magnitude than the combined
        information gain.
        """
        self.class_entropy = entropy(self.data.Y)
        for attr in range(self.information_gain.size):
            self.information_gain[attr] = self.class_entropy \
                + entropy(self.data.X[:, attr]) \
                - entropy(np.column_stack((self.data.X[:, attr], self.data.Y)))

    def __call__(self, attr1, attr2):
        attrs = np.column_stack((self.data.X[:, attr1], self.data.X[:, attr2]))
        return self.class_entropy \
            - self.information_gain[attr1] \
            - self.information_gain[attr2] \
            + entropy(attrs) \
            - entropy(np.column_stack((attrs, self.data.Y)))

    def normalize(self, score):
        return score / self.class_entropy
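For reference, the hard-coded 10000 step above assumes fewer than 10000 bins per column; the safer alternative hinted at in the comment inside get_row_ids, deriving each step from the data itself, could look roughly like the sketch below (not part of the commit; `get_row_ids_safe` is a hypothetical name).

import numpy as np


def get_row_ids_safe(ar):
    # Mixed-radix row ids: each column's multiplier is the product of the
    # ranges (number of bins) of all previous columns, so the ids stay unique
    # no matter how many bins a column has. As in distribution(), `ar` is
    # assumed to have had its NaN rows removed already.
    row_ids = ar[:, 0].copy()
    steps = ar[:, :-1].max(axis=0) + 1
    for i in range(1, ar.shape[1]):
        row_ids += ar[:, i] * np.prod(steps[:i])
    return row_ids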
66 changes: 10 additions & 56 deletions orangecontrib/prototypes/widgets/owinteractions.py
@@ -25,6 +25,8 @@
from Orange.preprocess import Discretize, Remove
import Orange.widgets.data.owcorrelations

from orangecontrib.prototypes.interactions import InteractionScorer


SIZE_LIMIT = 1000000

@@ -43,52 +45,6 @@ def items():
return ["InfoGain Heuristic", "Random Search"]


class Interaction:
    def __init__(self, disc_data):
        self.data = disc_data
        self.n_attrs = len(self.data.domain.attributes)
        self.class_h = self.entropy(self.data.Y)
        self.attr_h = np.zeros(self.n_attrs)
        self.gains = np.zeros(self.n_attrs)
        self.removed_h = np.zeros((self.n_attrs, self.n_attrs))

        # Precompute information gain of each attribute for faster overall
        # computation and to create heuristic. Only removes necessary NaN values
        # to keep as much data as possible and keep entropies and information gains
        # invariant of third attribute.
        # In certain situations this can cause unexpected results i.e. negative
        # information gains or negative interactions lower than individual
        # attribute information.
        self.compute_gains()

    @staticmethod
    def distribution(ar):
        nans = np.isnan(ar)
        if nans.any():
            if len(ar.shape) == 1:
                ar = ar[~nans]
            else:
                ar = ar[~nans.any(axis=1)]
        _, counts = np.unique(ar, return_counts=True, axis=0)
        return counts / len(ar)

    def entropy(self, ar):
        p = self.distribution(ar)
        return -np.sum(p * np.log2(p))

    def compute_gains(self):
        for attr in range(self.n_attrs):
            self.attr_h[attr] = self.entropy(self.data.X[:, attr])
            self.gains[attr] = self.attr_h[attr] + self.class_h \
                - self.entropy(np.c_[self.data.X[:, attr], self.data.Y])

    def __call__(self, attr1, attr2):
        attrs = np.c_[self.data.X[:, attr1], self.data.X[:, attr2]]
        self.removed_h[attr1, attr2] = self.entropy(attrs) + self.class_h - self.entropy(np.c_[attrs, self.data.Y])
        score = self.removed_h[attr1, attr2] - self.gains[attr1] - self.gains[attr2]
        return score


class Heuristic:
    def __init__(self, weights, heuristic_type=None):
        self.n_attributes = len(weights)
@@ -193,7 +149,7 @@ class InteractionRank(Orange.widgets.data.owcorrelations.CorrelationRank):

    def __init__(self, *args):
        VizRankDialogAttrPair.__init__(self, *args)
        self.interaction = None
        self.scorer = None
        self.heuristic = None
        self.use_heuristic = False
        self.sel_feature_index = None
@@ -219,19 +175,17 @@ def initialize(self):
        self.use_heuristic = False
        self.sel_feature_index = self.master.feature and data.domain.index(self.master.feature)
        if data:
            if self.interaction is None or self.interaction.data != data:
                self.interaction = Interaction(data)
            if self.scorer is None or self.scorer.data != data:
                self.scorer = InteractionScorer(data)
            self.use_heuristic = len(data) * len(self.attrs) ** 2 > SIZE_LIMIT
            if self.use_heuristic and not self.sel_feature_index:
                self.heuristic = Heuristic(self.interaction.gains, self.master.heuristic_type)
                self.heuristic = Heuristic(self.scorer.information_gain, self.master.heuristic_type)

    def compute_score(self, state):
        attr1, attr2 = state
        h = self.interaction.class_h
        score = self.interaction(attr1, attr2) / h
        gain1 = self.interaction.gains[attr1] / h
        gain2 = self.interaction.gains[attr2] / h
        return score, gain1, gain2
        scores = (self.scorer(*state),
                  self.scorer.information_gain[state[0]],
                  self.scorer.information_gain[state[1]])
        return tuple(self.scorer.normalize(score) for score in scores)

    def row_for_state(self, score, state):
        attrs = sorted((self.attrs[x] for x in state), key=attrgetter("name"))
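For context, a rough usage sketch of the new scoring path (not from the patch; the iris data and attribute indices 0 and 1 are only an illustration) showing how compute_score now builds its normalized triple from InteractionScorer:

from Orange.data import Table
from Orange.preprocess import Discretize

from orangecontrib.prototypes.interactions import InteractionScorer

# Discretize a toy dataset, roughly as the widget does before scoring.
data = Discretize()(Table("iris"))
scorer = InteractionScorer(data)

# The same triple compute_score() returns: the interaction plus both
# information gains, each normalized by the class entropy.
state = (0, 1)
scores = (scorer(*state),
          scorer.information_gain[state[0]],
          scorer.information_gain[state[1]])
print(tuple(scorer.normalize(s) for s in scores))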
33 changes: 14 additions & 19 deletions orangecontrib/prototypes/widgets/tests/test_owinteractions.py
@@ -12,7 +12,8 @@
from Orange.widgets.visualize.owscatterplot import OWScatterPlot
from Orange.widgets.widget import AttributeList
from orangecontrib.prototypes.widgets.owinteractions import \
    OWInteractions, Heuristic, HeuristicType, Interaction, InteractionRank
    OWInteractions, Heuristic, HeuristicType, InteractionRank
from orangecontrib.prototypes.interactions import InteractionScorer


class TestOWInteractions(WidgetTest):
@@ -275,29 +276,23 @@ def test_compute_score(self):
        y = np.array([0, 1, 1, 1])
        domain = Domain([DiscreteVariable(str(i)) for i in range(2)], DiscreteVariable("3"))
        data = Table(domain, x, y)
        self.interaction = Interaction(data)
        npt.assert_almost_equal(self.interaction(0, 1), -0.1226, 4)
        npt.assert_almost_equal(self.interaction.class_h, 0.8113, 4)
        npt.assert_almost_equal(self.interaction.attr_h[0], 1., 4)
        npt.assert_almost_equal(self.interaction.attr_h[1], 0.8113, 4)
        npt.assert_almost_equal(self.interaction.gains[0], 0.3113, 4)
        npt.assert_almost_equal(self.interaction.gains[1], 0.1226, 4)
        npt.assert_almost_equal(self.interaction.removed_h[0, 1], 0.3113, 4)
        self.scorer = InteractionScorer(data)
        npt.assert_almost_equal(self.scorer(0, 1), -0.1226, 4)
        npt.assert_almost_equal(self.scorer.class_entropy, 0.8113, 4)
        npt.assert_almost_equal(self.scorer.information_gain[0], 0.3113, 4)
        npt.assert_almost_equal(self.scorer.information_gain[1], 0.1226, 4)

    def test_nans(self):
        """Check score calculation with sparse data"""
        x = np.array([[1, 1], [0, 1], [1, 1], [0, 0], [1, np.nan], [np.nan, 0], [np.nan, np.nan]])
        y = np.array([0, 1, 1, 1, 0, 0, 1])
        domain = Domain([DiscreteVariable(str(i)) for i in range(2)], DiscreteVariable("3"))
        data = Table(domain, x, y)
        self.interaction = Interaction(data)
        npt.assert_almost_equal(self.interaction(0, 1), 0.0167, 4)
        npt.assert_almost_equal(self.interaction.class_h, 0.9852, 4)
        npt.assert_almost_equal(self.interaction.attr_h[0], 0.9710, 4)
        npt.assert_almost_equal(self.interaction.attr_h[1], 0.9710, 4)
        npt.assert_almost_equal(self.interaction.gains[0], 0.4343, 4)
        npt.assert_almost_equal(self.interaction.gains[1], 0.0343, 4)
        npt.assert_almost_equal(self.interaction.removed_h[0, 1], 0.4852, 4)
        self.scorer = InteractionScorer(data)
        npt.assert_almost_equal(self.scorer(0, 1), 0.0167, 4)
        npt.assert_almost_equal(self.scorer.class_entropy, 0.9852, 4)
        npt.assert_almost_equal(self.scorer.information_gain[0], 0.4343, 4)
        npt.assert_almost_equal(self.scorer.information_gain[1], 0.0343, 4)


class TestHeuristic(unittest.TestCase):
@@ -307,8 +302,8 @@ def setUpClass(cls):

    def test_heuristic(self):
        """Check attribute pairs returned by heuristic"""
        score = Interaction(self.zoo)
        heuristic = Heuristic(score.gains, heuristic_type=HeuristicType.INFOGAIN)
        scorer = InteractionScorer(self.zoo)
        heuristic = Heuristic(scorer.information_gain, heuristic_type=HeuristicType.INFOGAIN)
        self.assertListEqual(
            list(heuristic.get_states(None))[:9],
            [(14, 6), (14, 10), (14, 15), (6, 10), (14, 5), (6, 15), (14, 11), (6, 5), (10, 15)]
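For reference, the expected values asserted in test_nans follow directly from the entropy definitions in interactions.py; a minimal check on the same toy data (not part of the patch):

import numpy as np

from orangecontrib.prototypes.interactions import entropy

x = np.array([[1, 1], [0, 1], [1, 1], [0, 0],
              [1, np.nan], [np.nan, 0], [np.nan, np.nan]])
y = np.array([0, 1, 1, 1, 0, 0, 1], dtype=float)

h_y = entropy(y)                                       # class entropy, ~0.9852
gain0 = (h_y + entropy(x[:, 0])
         - entropy(np.column_stack((x[:, 0], y))))     # information gain, ~0.4343
gain1 = (h_y + entropy(x[:, 1])
         - entropy(np.column_stack((x[:, 1], y))))     # information gain, ~0.0343
interaction = (h_y - gain0 - gain1
               + entropy(x) - entropy(np.column_stack((x, y))))      # ~0.0167
print(round(h_y, 4), round(gain0, 4), round(gain1, 4), round(interaction, 4))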
