Skip to content

Commit

Permalink
Merge pull request #6623 from janezd/datainfo-count-missing
Browse files Browse the repository at this point in the history
[ENH] Data Info: Show statistics about missing values
  • Loading branch information
markotoplak committed Nov 3, 2023
2 parents 5555314 + d23a9b5 commit d120a72
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 35 deletions.
33 changes: 31 additions & 2 deletions Orange/data/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1551,18 +1551,47 @@ def has_missing_class(self):
return bn.anynan(self._Y)

@staticmethod
def __get_nan_frequency(data):
def __get_nan_count(data):
if data.size == 0:
return 0
dense = data if not sp.issparse(data) else data.data
return np.isnan(dense).sum() / np.prod(data.shape)
return np.isnan(dense).sum()

@classmethod
def __get_nan_frequency(cls, data):
    """Return the proportion of cells in `data` that are counted as missing.

    The number of cells is replaced by 1 when `data` has none, so the
    division is always defined.
    """
    n_missing = cls.__get_nan_count(data)
    n_cells = np.prod(data.shape) or 1
    return n_missing / n_cells

def get_nan_count_attribute(self):
    """Return the number of missing values found in `self.X`."""
    attribute_values = self.X
    return self.__get_nan_count(attribute_values)

def get_nan_count_class(self):
    """Return the number of missing values found in `self.Y`."""
    class_values = self.Y
    return self.__get_nan_count(class_values)

def get_nan_count_metas(self):
    """Return the number of missing values in `self.metas`.

    If the meta array does not have dtype `object`, missing values are
    counted the same way as for the other parts of the table. Otherwise
    columns are inspected one by one: in columns belonging to a
    `StringVariable` an empty string counts as missing; in all other
    columns the values are converted to float and NaNs are counted.
    """
    if self.metas.dtype != object:
        return self.__get_nan_count(self.metas)

    data = self.metas
    if sp.issparse(data):
        data = data.tocsc()

    count = 0
    for i, attr in enumerate(self.domain.metas):
        col = data[:, i]
        if isinstance(attr, StringVariable):
            # Compare only this column. Comparing the whole array would
            # count every empty string once per string variable.
            missing = col == ""
        else:
            missing = np.isnan(col.astype(float))
        count += np.sum(missing)
    return count

def get_nan_frequency_attribute(self):
    """Return the proportion of missing values in `self.X`."""
    attribute_values = self.X
    return self.__get_nan_frequency(attribute_values)

def get_nan_frequency_class(self):
    """Return the proportion of missing values in `self.Y`."""
    class_values = self.Y
    return self.__get_nan_frequency(class_values)

def get_nan_frequency_metas(self):
    """Return the proportion of missing values in `self.metas`.

    The count comes from `get_nan_count_metas`. The number of cells is
    replaced by 1 when the meta array has none, so the division is always
    defined.
    """
    n_missing = self.get_nan_count_metas()
    n_cells = np.prod(self.metas.shape) or 1
    return n_missing / n_cells

def checksum(self, include_metas=True):
# TODO: zlib.adler32 does not work for numpy arrays with dtype object
# (after pickling and unpickling such arrays, checksum changes)
Expand Down
45 changes: 36 additions & 9 deletions Orange/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2366,31 +2366,58 @@ def test_value_assignment(self):

class TestTableStats(TableTests):
def test_get_nan_frequency(self):
metas = [DiscreteVariable("x", values=tuple("abc")), StringVariable("s")]
meta_data = np.array([list(range(self.nrows)), ["x"] * self.nrows]).T
domain = self.create_domain(self.attributes, self.class_vars)
table = data.Table(domain, self.data, self.class_data)
self.assertEqual(table.get_nan_frequency_attribute(), 0)
self.assertEqual(table.get_nan_frequency_class(), 0)
domain = Domain(domain.attributes, domain.class_vars, metas)
table = data.Table(domain, self.data, self.class_data, meta_data)

def test_counts(at, cl, me):
x, y, metas = table.X, table.Y, table.metas
for _ in range(2):
self.assertEqual(table.get_nan_count_attribute(), at)
self.assertEqual(table.get_nan_count_class(), cl)
self.assertEqual(table.get_nan_count_metas(), me)
self.assertEqual(table.get_nan_frequency_attribute(), at / np.prod(x.shape))
self.assertEqual(table.get_nan_frequency_class(), cl / np.prod(y.shape))
self.assertEqual(table.get_nan_frequency_metas(), me / np.prod(metas.shape))
with table.unlocked():
table.X = sp.csr_matrix(x)
table.Y = sp.csr_matrix(y)
with table.unlocked():
table.X, table.Y = x, y

test_counts(0, 0, 0)

with table.unlocked():
table.X[1, 2] = table.X[4, 5] = np.nan
self.assertEqual(table.get_nan_frequency_attribute(), 2 / table.X.size)
self.assertEqual(table.get_nan_frequency_class(), 0)
test_counts(2, 0, 0)

with table.unlocked():
table.Y[3:6] = np.nan
self.assertEqual(table.get_nan_frequency_attribute(), 2 / table.X.size)
self.assertEqual(table.get_nan_frequency_class(), 3 / table.Y.size)
test_counts(2, 3, 0)

with table.unlocked():
table.X[1, 2] = table.X[4, 5] = 0
self.assertEqual(table.get_nan_frequency_attribute(), 0)
self.assertEqual(table.get_nan_frequency_class(), 3 / table.Y.size)
test_counts(0, 3, 0)

with table.unlocked():
table.metas[1, 0] = table.metas[3, 0] = np.nan
test_counts(0, 3, 2)

with table.unlocked():
table.metas[5, 1] = ""
test_counts(0, 3, 3)

def test_get_nan_frequency_empty_table(self):
    """A table created from a domain alone reports no missing values."""
    empty = data.Table.from_domain(
        self.create_domain(self.attributes, self.class_vars))
    # Same order as before: all counts first, then all frequencies.
    for kind in ("count", "frequency"):
        for part in ("attribute", "class", "metas"):
            getter = getattr(empty, f"get_nan_{kind}_{part}")
            self.assertEqual(getter(), 0)


class TestRowInstance(unittest.TestCase):
Expand Down
59 changes: 44 additions & 15 deletions Orange/widgets/data/owdatainfo.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,25 @@
import threading
import textwrap

from Orange.data import \
Table, StringVariable, DiscreteVariable, ContinuousVariable
try:
from Orange.data.sql.table import SqlTable
except ImportError:
SqlTable = None
import numpy as np

from Orange.widgets import widget, gui
from Orange.widgets.utils.localization import pl
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.widget import Input

from Orange.data import \
Table, StringVariable, DiscreteVariable, ContinuousVariable

# Define `is_sql` once, depending on whether SqlTable can be imported, so
# callers do not have to check for the import themselves.
try:
    from Orange.data.sql.table import SqlTable
except ImportError:
    # SqlTable could not be imported, so no data can be an instance of it.
    def is_sql(_):
        """Return False; SqlTable is not available."""
        return False
else:
    def is_sql(data):
        """Return True if `data` is an instance of SqlTable."""
        return isinstance(data, SqlTable)


class OWDataInfo(widget.OWWidget):
name = "Data Info"
Expand Down Expand Up @@ -53,12 +60,13 @@ def data(self, data):
("Size", self._p_size),
("Features", self._p_features),
("Targets", self._p_targets),
("Metas", self._p_metas))
("Metas", self._p_metas),
("Missing data", self._p_missing))
if bool(value := func(data))}
self.data_attrs = data.attributes
self.update_info()

if SqlTable is not None and isinstance(data, SqlTable):
if is_sql(data):
def set_exact_length():
self.data_desc["Size"] = self._p_size(data, exact=True)
self.update_info()
Expand Down Expand Up @@ -101,16 +109,18 @@ def _p_name(data):

@staticmethod
def _p_location(data):
if SqlTable is not None and isinstance(data, SqlTable):
connection_string = ' '.join(
f'{key}={value}'
for key, value in data.connection_params.items()
if value is not None and key != 'password')
return f"SQL Table using connection:<br/>{connection_string}"
if not is_sql(data):
return None

connection_string = ' '.join(
f'{key}={value}'
for key, value in data.connection_params.items()
if value is not None and key != 'password')
return f"SQL Table using connection:<br/>{connection_string}"

@staticmethod
def _p_size(data, exact=False):
exact = exact or SqlTable is None or not isinstance(data, SqlTable)
exact = exact or is_sql(data)
if exact:
n = len(data)
desc = f"{n} {pl(n, 'row')}"
Expand Down Expand Up @@ -152,6 +162,25 @@ def _p_targets(self, data):
def _p_metas(cls, data):
return cls._pack_var_counts(data.domain.metas)

@staticmethod
def _p_missing(data: Table):
    """Describe the missing values in features, targets and metas.

    Returns a fixed note for SQL data (which is not checked), "none" when
    no part of the table has missing values, and otherwise a
    comma-separated list giving the count and percentage for each part
    that has some.
    """
    if is_sql(data):
        return "(not checked for SQL data)"

    domain = data.domain
    # NOTE(review): the other visible calls pass the singular form to `pl`
    # ('row', "feature", "meta variable"), so "target" is passed here too;
    # "targets" would presumably be pluralized a second time.
    parts = (
        (pl(len(domain.attributes), "feature"),
         data.X, data.get_nan_count_attribute()),
        (pl(len(domain.class_vars), "target"),
         data.Y, data.get_nan_count_class()),
        (pl(len(domain.metas), "meta variable"),
         data.metas, data.get_nan_count_metas()),
    )
    # Parts with a zero count are skipped, so the divisor is never zero.
    counts = [
        f"{n_miss} ({n_miss / np.prod(part.shape):.1%}) in {name}"
        for name, part, n_miss in parts
        if n_miss
    ]
    if not counts:
        return "none"
    return ", ".join(counts)

@staticmethod
def _count(s, tpe):
return sum(isinstance(x, tpe) for x in s)
Expand Down
22 changes: 13 additions & 9 deletions Orange/widgets/data/tests/test_owdatainfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,24 @@ def test_data(self):
# combinations that must not crash
a, b, c = (DiscreteVariable(n) for n in "abc")
x, y, z = (ContinuousVariable(n) for n in "xyz")
m, n = (StringVariable(n) for n in "nm")
m, n = (StringVariable(n) for n in "mn")
meta_s = np.array([["foo", "bar", ""]]).T
meta_c = np.array([[3.14, np.nan, np.nan]]).T
metadata = np.hstack((meta_s, meta_c))
self.widget.send_report()
for attrs, classes, metas in (((a, b, c), (), ()),
((a, b, c, x), (y,), ()),
((a, b, c), (y, x), (m, )),
((a, b), (y, x, c), (m, )),
((a, ), (b, c), (m, )),
((a, b, x), (c, ), (m, y)),
((), (c, ), (m, y))):
for attrs, classes, metas, metad in (((a, b, c), (), (), None),
((a, b, c, x), (y,), (), None),
((a, b, c), (y, x), (m, ), meta_s),
((a, b, c), (y, ), (x, ), meta_c),
((a, b), (y, x, c), (m, ), meta_s),
((a, ), (b, c), (m, ), meta_s),
((a, b, x), (c, ), (m, y), metadata),
((), (c, ), (m, y), metadata)):
data = Table.from_numpy(
Domain(attrs, classes, metas),
np.zeros((3, len(attrs))),
np.zeros((3, len(classes))),
np.full((3, len(metas)), object()))
metad)
data.attributes = {"att 1": 1, "att 2": True, "att 3": 3}
if metas:
data.name = "name"
Expand Down

0 comments on commit d120a72

Please sign in to comment.