Feature scoring (info gain, gain ratio, gini).

biolab · Jun 3, 2013 · dba0297 · dba0297
1 parent 3bf63c8
commit dba0297
Show file tree

Hide file tree

Showing 3 changed files with 98 additions and 0 deletions.
diff --git a/Orange/feature/__init__.py b/Orange/feature/__init__.py
diff --git a/Orange/feature/scoring.py b/Orange/feature/scoring.py
@@ -0,0 +1,73 @@
+import numpy as np
+from Orange.statistics import contingency
+
+
+class Score:
+    """
+    Base class for feature scorers that are computed from a contingency
+    matrix.
+
+    Instantiating without arguments returns a reusable scorer object.
+    Instantiating with arguments, e.g. ``InfoGain(feature, data)``, creates
+    a scorer, immediately calls it with those arguments and returns the
+    resulting score instead of the scorer instance.
+
+    Subclasses must override :meth:`from_contingency`.
+    """
+    def __new__(cls, *args):
+        self = super().__new__(cls)
+        if args:
+            # Shortcut form: the "constructor" returns the score itself.
+            return self(*args)
+        return self
+
+    def __call__(self, feature, data):
+        """
+        Score a feature on a data set.
+
+        :param feature: feature id
+        :param data: data set
+        :return: whatever :meth:`from_contingency` returns for the
+            contingency of `feature` in `data`
+        """
+        cont = contingency.Discrete(data, feature)
+        return self.from_contingency(cont)
+
+    def from_contingency(self, cont):
+        """
+        Compute the score from a contingency matrix.
+
+        :param cont: contingency matrix built by :meth:`__call__`
+        :raises NotImplementedError: always; concrete scorers override this
+        """
+        raise NotImplementedError(
+            "%s must implement from_contingency()" % type(self).__name__)
+
+
+def _entropy(D):
+    """
+    Entropy of class-distribution matrix.
+
+    `D` is either a 1-D vector of counts (a single distribution) or a 2-D
+    matrix in which every column is a separate distribution. For a matrix
+    the result is the average of the column entropies, each weighted by the
+    share of all counts that fall into that column.
+
+    Columns without any counts contribute nothing to the result (instead of
+    turning it into NaN through a 0/0 division), and an input with no counts
+    at all has entropy 0.
+
+    :param D: array of counts
+    :return: entropy in bits
+    """
+    D = np.asarray(D, dtype=float)
+    total = np.sum(D)
+    if total == 0:
+        return 0.0
+    col_totals = np.sum(D, axis=0)
+    # Empty columns would divide by zero; give them an all-zero
+    # distribution, which is harmless because their weight is zero as well.
+    with np.errstate(divide="ignore", invalid="ignore"):
+        P = np.where(col_totals > 0, D / col_totals, 0.0)
+    # Clip only inside the logarithm so that 0 * log2(0) evaluates to 0.
+    PC = np.clip(P, 1e-15, 1)
+    return np.sum(np.sum(- P * np.log2(PC), axis=0) * col_totals / total)
+
+
+def _gini(D):
+    """
+    Gini index of class-distribution matrix.
+
+    `D` is either a 1-D vector of counts (a single distribution) or a 2-D
+    matrix in which every column is a separate distribution. Each
+    distribution scores ``0.5 * (1 - sum(p ** 2))``; for a matrix the column
+    scores are averaged, weighted by the share of all counts that fall into
+    each column.
+
+    Columns without any counts contribute nothing to the result (instead of
+    turning it into NaN through a 0/0 division), and an input with no counts
+    at all scores 0.
+
+    :param D: array of counts
+    :return: Gini index
+    """
+    D = np.asarray(D, dtype=float)
+    total = np.sum(D)
+    if total == 0:
+        return 0.0
+    col_totals = np.sum(D, axis=0)
+    # Empty columns would divide by zero; their weight is zero anyway, so
+    # any finite placeholder distribution is fine.
+    with np.errstate(divide="ignore", invalid="ignore"):
+        P = np.where(col_totals > 0, D / col_totals, 0.0)
+    impurity = 1.0 - np.sum(np.square(P), axis=0)
+    return np.sum(impurity * 0.5 * col_totals / total)
+
+
+class InfoGain(Score):
+    """
+    Information gain of a feature in class-labeled data set: the entropy of
+    the class distribution minus the entropy that remains once the data is
+    split by the feature.
+
+    :param feature: feature id
+    :param data: data set
+    :type data: Orange.data.Table
+    :return: float
+    """
+    def from_contingency(self, cont):
+        # Summing the contingency along axis 1 yields the overall class
+        # distribution, i.e. the distribution before the split.
+        class_distribution = np.sum(cont, axis=1)
+        return _entropy(class_distribution) - _entropy(cont)
+
+
+class GainRatio(Score):
+    """
+    Gain ratio score of a feature in class-labeled data set: information
+    gain divided by the entropy of the feature's own value distribution.
+
+    A feature whose value distribution has zero entropy (all instances share
+    one value) gets its plain information gain, which is zero, instead of
+    the NaN that a 0/0 division would produce.
+
+    :param feature: feature id
+    :param data: data set
+    :type data: Orange.data.Table
+    :return: float
+    """
+    def from_contingency(self, cont):
+        h_class = _entropy(np.sum(cont, axis=1))
+        h_residual = _entropy(cont)
+        h_attribute = _entropy(np.sum(cont, axis=0))
+        if h_attribute == 0:
+            # Nothing to normalise by; avoid dividing zero by zero.
+            h_attribute = 1
+        return (h_class - h_residual) / h_attribute
+
+
+class Gini(Score):
+    """
+    Gini score of a feature in class-labeled data set: the Gini index of
+    the class distribution minus the Gini index that remains once the data
+    is split by the feature.
+
+    :param feature: feature id
+    :param data: data set
+    :type data: Orange.data.Table
+    :return: float
+    """
+    def from_contingency(self, cont):
+        # Axis-1 sums give the class distribution before the split.
+        gini_before = _gini(np.sum(cont, axis=1))
+        gini_after = _gini(cont)
+        return gini_before - gini_after
diff --git a/Orange/tests/test_feature_scoring.py b/Orange/tests/test_feature_scoring.py
@@ -0,0 +1,25 @@
+import unittest
+import numpy as np
+from Orange.data import Table
+from Orange.feature import scoring
+
+
+class FeatureScoringTest(unittest.TestCase):
+    """Compare scorers with stored values for the first five zoo features."""
+
+    def setUp(self):
+        self.zoo = Table("zoo")
+
+    def _check_scores(self, scorer, expected):
+        # Score features 0 .. len(expected) - 1 and compare to 5 decimals.
+        scores = [scorer(index, self.zoo) for index in range(len(expected))]
+        np.testing.assert_almost_equal(scores, expected, decimal=5)
+
+    def test_info_gain(self):
+        self._check_scores(
+            scoring.InfoGain(),
+            [0.79067, 0.71795, 0.83014, 0.97432, 0.46970])
+
+    def test_gain_ratio(self):
+        self._check_scores(
+            scoring.GainRatio(),
+            [0.80351, 1.00000, 0.84754, 1.00000, 0.59376])
+
+    def test_gini(self):
+        self._check_scores(
+            scoring.Gini(),
+            [0.11893, 0.10427, 0.13117, 0.14650, 0.05973])