Skip to content

Commit

Permalink
Feature scoring (info gain, gain ratio, gini).
Browse files Browse the repository at this point in the history
  • Loading branch information
BlazZupan committed Jun 3, 2013
1 parent 3bf63c8 commit dba0297
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 0 deletions.
Empty file added Orange/feature/__init__.py
Empty file.
73 changes: 73 additions & 0 deletions Orange/feature/scoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import numpy as np
from Orange.statistics import contingency


class Score:
    """Base class for feature scorers that work on a contingency matrix.

    Creating a scorer without arguments gives a reusable, callable object.
    Creating it *with* arguments is a shortcut: the arguments are forwarded
    to a fresh scorer and the resulting score is returned instead of the
    instance.

    Calling a scorer builds ``contingency.Discrete(data, feature)`` and
    passes it to ``self.from_contingency``; this class does not define
    ``from_contingency`` itself.
    """

    def __new__(cls, *args):
        scorer = super().__new__(cls)
        if not args:
            return scorer
        # Shortcut form: act like a plain function and hand back the score.
        return scorer(*args)

    def __call__(self, feature, data):
        return self.from_contingency(contingency.Discrete(data, feature))


def _entropy(D):
    """Entropy of class-distribution matrix.

    For a 1-D array of counts this is the entropy (base 2) of the
    distribution obtained by normalising the counts. For a 2-D array each
    column is normalised separately (``axis=0``) and the column entropies
    are averaged, weighted by each column's share of the total count.

    Columns whose total count is zero contribute nothing to the result
    instead of turning it into NaN, and an input whose total count is zero
    yields ``0.0``.

    :param D: array of counts
    :return: float
    """
    counts = np.asarray(D, dtype=float)
    total = np.sum(counts)
    if total == 0:
        return 0.0
    col_totals = np.sum(counts, axis=0)
    # Divide empty columns by 1 rather than 0: their probabilities stay 0 and
    # their weight (col_totals / total) is 0, so they drop out of the sum.
    safe_totals = np.where(col_totals != 0, col_totals, 1.0)
    P = counts / safe_totals
    # Clipping keeps log2 away from 0; the factor P is 0 there anyway.
    PC = np.clip(P, 1e-15, 1)
    col_entropy = np.sum(-P * np.log2(PC), axis=0)
    return np.sum(col_entropy * col_totals / total)


def _gini(D):
    """Gini index of class-distribution matrix.

    For a 1-D array of counts this is ``0.5 * (1 - sum(p ** 2))`` of the
    normalised distribution. For a 2-D array each column is normalised
    separately (``axis=0``) and the per-column values are averaged,
    weighted by each column's share of the total count.

    Columns whose total count is zero contribute nothing to the result
    instead of turning it into NaN, and an input whose total count is zero
    yields ``0.0``.

    :param D: array of counts
    :return: float
    """
    counts = np.asarray(D, dtype=float)
    total = np.sum(counts)
    if total == 0:
        return 0.0
    col_totals = np.sum(counts, axis=0)
    # Divide empty columns by 1 rather than 0; their weight below is 0.
    safe_totals = np.where(col_totals != 0, col_totals, 1.0)
    P = counts / safe_totals
    impurity = 1.0 - np.sum(np.square(P), axis=0)
    return np.sum(impurity * 0.5 * col_totals / total)


class InfoGain(Score):
    """
    Information gain of a feature in class-labeled data set.

    Computed from the contingency matrix as the entropy of its sums over
    ``axis=1`` (used as the class distribution) minus the entropy of the
    matrix itself.

    :param feature: feature id
    :param data: data set
    :type data: Orange.data.Table
    :return: float
    """

    def from_contingency(self, cont):
        class_distribution = np.sum(cont, axis=1)
        return _entropy(class_distribution) - _entropy(cont)


class GainRatio(Score):
    """
    Gain ratio score of a feature in class-labeled data set.

    The information gain (entropy of the sums over ``axis=1`` minus the
    entropy of the contingency matrix) is divided by the entropy of the
    sums over ``axis=0`` (used as the feature-value distribution). When
    that entropy is zero the gain is returned undivided rather than
    dividing by zero.

    :param feature: feature id
    :param data: data set
    :type data: Orange.data.Table
    :return: float
    """

    def from_contingency(self, cont):
        h_class = _entropy(np.sum(cont, axis=1))
        h_residual = _entropy(cont)
        h_attribute = _entropy(np.sum(cont, axis=0))
        if h_attribute == 0:
            # All instances share one feature value: avoid 0/0 -> NaN.
            h_attribute = 1
        return (h_class - h_residual) / h_attribute


class Gini(Score):
    """
    Gini score of a feature in class-labeled data set.

    Computed from the contingency matrix as the Gini index of its sums
    over ``axis=1`` (used as the class distribution) minus the Gini index
    of the matrix itself.

    :param feature: feature id
    :param data: data set
    :type data: Orange.data.Table
    :return: float
    """

    def from_contingency(self, cont):
        gini_class = _gini(np.sum(cont, axis=1))
        gini_residual = _gini(cont)
        return gini_class - gini_residual
25 changes: 25 additions & 0 deletions Orange/tests/test_feature_scoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import unittest
import numpy as np
from Orange.data import Table
from Orange.feature import scoring


class FeatureScoringTest(unittest.TestCase):
    """Checks feature scores of features 0-4 of the "zoo" table."""

    def setUp(self):
        self.zoo = Table("zoo")

    def _check_scores(self, scorer, expected):
        # Score features 0..4 and compare with reference values to 5 decimals.
        observed = [scorer(index, self.zoo) for index in range(5)]
        np.testing.assert_almost_equal(observed, expected, decimal=5)

    def test_info_gain(self):
        self._check_scores(
            scoring.InfoGain(),
            [0.79067, 0.71795, 0.83014, 0.97432, 0.46970],
        )

    def test_gain_ratio(self):
        self._check_scores(
            scoring.GainRatio(),
            [0.80351, 1.00000, 0.84754, 1.00000, 0.59376],
        )

    def test_gini(self):
        self._check_scores(
            scoring.Gini(),
            [0.11893, 0.10427, 0.13117, 0.14650, 0.05973],
        )

0 comments on commit dba0297

Please sign in to comment.