Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

v0.0.4: Add ddof (Delta degrees of freedom) parameter for standard deviation computation #7

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 35 additions & 32 deletions outliers/smirnov_grubbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@


DEFAULT_ALPHA = 0.95
DEFAULT_DDOF = 0


# Test output types
Expand Down Expand Up @@ -93,23 +94,25 @@ def _get_g_test(self, data, alpha):
t = stats.t.isf(significance_level, n-2)
return ((n-1) / sqrt(n)) * (sqrt(t**2 / (n-2 + t**2)))

def _test_once(self, data, alpha):
def _test_once(self, data, alpha, ddof):
"""Perform one iteration of the Smirnov-Grubbs test.

:param numpy.array data: data set
:param float alpha: significance level
:param int ddof: delta degrees of freedom passed to numpy.std (the divisor is N - ddof)
:return: the index of the outlier if one is found; None otherwise
"""
target_index, value = self._target(data)

g = value / data.std()
g = value / data.std(ddof=ddof)
g_test = self._get_g_test(data, alpha)
return target_index if g > g_test else None

def run(self, alpha=DEFAULT_ALPHA, output_type=OutputType.DATA):
def run(self, alpha=DEFAULT_ALPHA, ddof=DEFAULT_DDOF, output_type=OutputType.DATA):
"""Run the Smirnov-Grubbs test to remove outliers in the given data set.

:param float alpha: significance level
:param int ddof: delta degrees of freedom passed to numpy.std (the divisor is N - ddof)
:param int output_type: test output type (from OutputType class values)
:return: depending on the value of output_type, the data set without
outliers (DATA), the outliers themselves (OUTLIERS) or the indices of
Expand All @@ -119,7 +122,7 @@ def run(self, alpha=DEFAULT_ALPHA, output_type=OutputType.DATA):
outliers = list()

while True:
outlier_index = self._test_once(data, alpha)
outlier_index = self._test_once(data, alpha, ddof)
if outlier_index is None:
break
outlier = data[outlier_index]
Expand Down Expand Up @@ -187,57 +190,57 @@ def _get_index(self, data):

# Convenience functions to run single Grubbs tests

def _test(test_class, data, alpha, output_type):
return test_class(data).run(alpha, output_type=output_type)
def _test(test_class, data, alpha, ddof, output_type):
return test_class(data).run(alpha, ddof, output_type=output_type)


def _two_sided_test(data, alpha, output_type):
return _test(TwoSidedGrubbsTest, data, alpha, output_type)
def _two_sided_test(data, alpha, ddof, output_type):
return _test(TwoSidedGrubbsTest, data, alpha, ddof, output_type)


def _min_test(data, alpha, output_type):
return _test(MinValueGrubbsTest, data, alpha, output_type)
def _min_test(data, alpha, ddof, output_type):
return _test(MinValueGrubbsTest, data, alpha, ddof, output_type)


def _max_test(data, alpha, output_type):
return _test(MaxValueGrubbsTest, data, alpha, output_type)
def _max_test(data, alpha, ddof, output_type):
return _test(MaxValueGrubbsTest, data, alpha, ddof, output_type)


def two_sided_test(data, alpha=DEFAULT_ALPHA):
return _two_sided_test(data, alpha, OutputType.DATA)
def two_sided_test(data, alpha=DEFAULT_ALPHA, ddof=DEFAULT_DDOF):
return _two_sided_test(data, alpha, ddof, OutputType.DATA)


def two_sided_test_indices(data, alpha=DEFAULT_ALPHA):
return _two_sided_test(data, alpha, OutputType.INDICES)
def two_sided_test_indices(data, alpha=DEFAULT_ALPHA, ddof=DEFAULT_DDOF):
return _two_sided_test(data, alpha, ddof, OutputType.INDICES)


def two_sided_test_outliers(data, alpha=DEFAULT_ALPHA):
return _two_sided_test(data, alpha, OutputType.OUTLIERS)
def two_sided_test_outliers(data, alpha=DEFAULT_ALPHA, ddof=DEFAULT_DDOF):
return _two_sided_test(data, alpha, ddof, OutputType.OUTLIERS)


def min_test(data, alpha=DEFAULT_ALPHA):
return _min_test(data, alpha, OutputType.DATA)
def min_test(data, alpha=DEFAULT_ALPHA, ddof=DEFAULT_DDOF):
return _min_test(data, alpha, ddof, OutputType.DATA)


def min_test_indices(data, alpha=DEFAULT_ALPHA):
return _min_test(data, alpha, OutputType.INDICES)
def min_test_indices(data, alpha=DEFAULT_ALPHA, ddof=DEFAULT_DDOF):
return _min_test(data, alpha, ddof, OutputType.INDICES)


def min_test_outliers(data, alpha=DEFAULT_ALPHA):
return _min_test(data, alpha, OutputType.OUTLIERS)
def min_test_outliers(data, alpha=DEFAULT_ALPHA, ddof=DEFAULT_DDOF):
return _min_test(data, alpha, ddof, OutputType.OUTLIERS)


def max_test(data, alpha=DEFAULT_ALPHA):
return _max_test(data, alpha, OutputType.DATA)
def max_test(data, alpha=DEFAULT_ALPHA, ddof=DEFAULT_DDOF):
return _max_test(data, alpha, ddof, OutputType.DATA)


def max_test_indices(data, alpha=DEFAULT_ALPHA):
return _max_test(data, alpha, OutputType.INDICES)
def max_test_indices(data, alpha=DEFAULT_ALPHA, ddof=DEFAULT_DDOF):
return _max_test(data, alpha, ddof, OutputType.INDICES)


def max_test_outliers(data, alpha=DEFAULT_ALPHA):
return _max_test(data, alpha, OutputType.OUTLIERS)
def max_test_outliers(data, alpha=DEFAULT_ALPHA, ddof=DEFAULT_DDOF):
return _max_test(data, alpha, ddof, OutputType.OUTLIERS)


def test(data, alpha=DEFAULT_ALPHA):
return two_sided_test(data, alpha)
def test(data, alpha=DEFAULT_ALPHA, ddof=DEFAULT_DDOF):
    """Run the two-sided Grubbs test; shorthand for ``two_sided_test``.

    :param data: data set
    :param float alpha: significance level
    :param int ddof: delta degrees of freedom passed to numpy.std
    :return: the result of ``two_sided_test`` for the same arguments
    """
    # Pass by keyword: two_sided_test takes (data, alpha, ddof), and a
    # positional (data, ddof, alpha) call silently swaps the two values.
    return two_sided_test(data, alpha=alpha, ddof=ddof)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
README = open(os.path.join(BASE_PATH, 'README.rst')).read()
CHANGES = open(os.path.join(BASE_PATH, 'CHANGES.rst')).read()

__version__ = '0.0.3'
__version__ = '0.0.4'
__author__ = 'Masashi Shibata <contact@c-bata.link>'
__author_email__ = 'contact@c-bata.link'
__license__ = 'MIT License'
Expand Down
4 changes: 2 additions & 2 deletions tests/test_smirnov_grubbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,15 @@ def test_test_once_when_given_pandas_series(self):
grubbs_test = grubbs.TwoSidedGrubbsTest(data)

expected_index = 0
actual_index = grubbs_test._test_once(data, 0.05)
actual_index = grubbs_test._test_once(data, 0.05, 0)
self.assertEqual(actual_index, expected_index)

def test_test_once_when_given_numpy_ndarray(self):
data = np.array(self.data2)
grubbs_test = grubbs.TwoSidedGrubbsTest(data)

expected_index = 0
actual_index = grubbs_test._test_once(data, 0.05)
actual_index = grubbs_test._test_once(data, 0.05, 0)
self.assertEqual(actual_index, expected_index)

def test_delete_item_when_given_pandas_series(self):
Expand Down