-
-
Notifications
You must be signed in to change notification settings - Fork 989
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Feature scoring (info gain, gain ratio, gini).
- Loading branch information
Showing
3 changed files
with
98 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import numpy as np | ||
from Orange.statistics import contingency | ||
|
||
|
||
class Score:
    """
    Base class for feature scorers.

    Instantiating without arguments returns a reusable scorer object.
    Instantiating with arguments is a shortcut that builds the scorer,
    calls it with those arguments and returns the resulting score instead
    of the scorer instance.

    Subclasses implement :meth:`from_contingency`.
    """
    def __new__(cls, *args):
        self = super().__new__(cls)
        if args:
            # Shortcut form: Scorer(feature, data) yields the score directly.
            return self(*args)
        return self

    def __call__(self, feature, data):
        """
        Score `feature` on `data`.

        :param feature: feature id
        :param data: data set
        :return: whatever :meth:`from_contingency` returns for the
            contingency built from `data` and `feature`
        """
        cont = contingency.Discrete(data, feature)
        return self.from_contingency(cont)

    def from_contingency(self, cont):
        """
        Compute the score from a contingency table.

        :param cont: contingency table of the feature
        :raises NotImplementedError: always; subclasses must override
        """
        raise NotImplementedError(
            "{} must implement from_contingency".format(type(self).__name__))
|
||
|
||
def _entropy(D): | ||
"""Entropy of class-distribution matrix""" | ||
P = D / np.sum(D, axis=0) | ||
PC = np.clip(P, 1e-15, 1) | ||
return np.sum(np.sum(- P * np.log2(PC), axis=0) * np.sum(D, axis=0) / np.sum(D)) | ||
|
||
|
||
def _gini(D): | ||
"""Gini index of class-distribution matrix""" | ||
P = D / np.sum(D, axis=0) | ||
return sum((np.ones(1 if len(D.shape) == 1 else D.shape[1]) - np.sum(np.square(P), axis=0)) \ | ||
* 0.5 * np.sum(D, axis=0) / np.sum(D)) | ||
|
||
|
||
class InfoGain(Score):
    """
    Information gain of a feature in a class-labeled data set.

    The score is the entropy of the class distribution minus the entropy
    of the feature's contingency table.

    :param feature: feature id
    :param data: data set
    :type data: Orange.data.Table
    :return: float
    """
    def from_contingency(self, cont):
        """Return the information gain computed from contingency `cont`."""
        # Summing along axis 1 collapses the table into one distribution.
        prior_entropy = _entropy(np.sum(cont, axis=1))
        conditional_entropy = _entropy(cont)
        return prior_entropy - conditional_entropy
|
||
|
||
class GainRatio(Score):
    """
    Gain ratio score of a feature in a class-labeled data set.

    The score is the information gain divided by the entropy of the
    table's axis-0 marginal. When that entropy is zero the score is
    defined as 0.0 instead of dividing by zero.

    :param feature: feature id
    :param data: data set
    :type data: Orange.data.Table
    :return: float
    """
    def from_contingency(self, cont):
        """Return the gain ratio computed from contingency `cont`."""
        h_class = _entropy(np.sum(cont, axis=1))
        h_residual = _entropy(cont)
        h_attribute = _entropy(np.sum(cont, axis=0))
        if h_attribute == 0:
            # Avoid 0/0 = NaN when the normalizing entropy vanishes.
            return 0.0
        return (h_class - h_residual) / h_attribute
|
||
|
||
class Gini(Score):
    """
    Gini score of a feature in a class-labeled data set.

    The score is the Gini index of the class distribution minus the Gini
    index of the feature's contingency table.

    :param feature: feature id
    :param data: data set
    :type data: Orange.data.Table
    :return: float
    """
    def from_contingency(self, cont):
        """Return the Gini score computed from contingency `cont`."""
        impurity_before = _gini(np.sum(cont, axis=1))
        impurity_after = _gini(cont)
        return impurity_before - impurity_after
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import unittest | ||
import numpy as np | ||
from Orange.data import Table | ||
from Orange.feature import scoring | ||
|
||
|
||
class FeatureScoringTest(unittest.TestCase):
    """Checks the feature scorers against reference values on the "zoo" table."""

    def setUp(self):
        self.zoo = Table("zoo")

    def _check_first_five(self, scorer, expected):
        """Score features 0..4 with `scorer` and compare to `expected`."""
        computed = [scorer(index, self.zoo) for index in range(5)]
        np.testing.assert_almost_equal(computed, expected, decimal=5)

    def test_info_gain(self):
        self._check_first_five(
            scoring.InfoGain(),
            [0.79067, 0.71795, 0.83014, 0.97432, 0.46970],
        )

    def test_gain_ratio(self):
        self._check_first_five(
            scoring.GainRatio(),
            [0.80351, 1.00000, 0.84754, 1.00000, 0.59376],
        )

    def test_gini(self):
        self._check_first_five(
            scoring.Gini(),
            [0.11893, 0.10427, 0.13117, 0.14650, 0.05973],
        )