Skip to content
Permalink
Browse files

Merge pull request #178 from ales-erjavec/fixes/owcsvimport-na-parse

[FIX] CSV Import: Fix N/A parsing for text columns
  • Loading branch information...
ajdapretnar committed Apr 11, 2019
2 parents d82d05a + 24ea3e5 commit 16f877cca393aecc408e6c4e91190bcc32d13875
@@ -1140,6 +1140,17 @@ def _mime_type_for_path(path):
return mtype


# Strings treated as missing values by default. Also used as the fallback
# when a column type has no entry in NA_VALUES (see the
# `NA_VALUES.get(c, NA_DEFAULT)` lookup in this file).
NA_DEFAULT = ["", "?", ".", "~", "nan", "NAN", "NaN", "N/A", "n/a", "NA"]

# Missing-value markers keyed by ColumnType. The CSV loading code in this
# file looks these up per column to build the `na_values` mapping it hands
# to the reader.
NA_VALUES = {
ColumnType.Numeric: NA_DEFAULT,
ColumnType.Categorical: NA_DEFAULT,
# Time columns use the default markers plus the "NaT"/"NAT" spellings.
ColumnType.Time: NA_DEFAULT + ["NaT", "NAT"],
# Text columns have no missing-value markers at all, so no cell content
# is listed here as N/A for them.
ColumnType.Text: [],
ColumnType.Auto: NA_DEFAULT,
}


def load_csv(path, opts, progres_callback=None):
# type: (Union[AnyStr, BinaryIO], Options, ...) -> pd.DataFrame
def dtype(coltype):
@@ -1172,7 +1183,8 @@ def expand(ranges):
dtcols = {i for i, c in expand(opts.columntypes)
if c == ColumnType.Time}
parse_dates = sorted(dtcols)

na_values = {i: NA_VALUES.get(c, NA_DEFAULT)
for i, c in expand(opts.columntypes)}
if not parse_dates:
parse_dates = False

@@ -1231,7 +1243,8 @@ def expand(ranges):
skipinitialspace=opts.dialect.skipinitialspace,
header=header, skiprows=skiprows,
dtype=dtypes, parse_dates=parse_dates, prefix=prefix,
na_values=["?", "."], **numbers_format_kwds
na_values=na_values, keep_default_na=False,
**numbers_format_kwds
)
if columns_ignored:
# TODO: use 'usecols' parameter in `read_csv` call to
@@ -1449,7 +1462,7 @@ def pandas_to_table(df):
var = Orange.data.TimeVariable.make(str(header))
var.have_date = var.have_time = 1
elif pdtypes.is_object_dtype(series):
coldata = series.values
coldata = series.fillna('').values
assert isinstance(coldata, np.ndarray)
orangecol = coldata
var = Orange.data.StringVariable.make(str(header))
@@ -1,3 +1,4 @@
from numpy.testing import assert_array_equal
import unittest

import os
@@ -8,8 +9,10 @@

from Orange.widgets.tests.base import WidgetTest, GuiTest

from orangecontrib.prototypes.widgets.utils import textimport
from orangecontrib.prototypes.widgets import owcsvimport
from orangecontrib.prototypes.widgets.owcsvimport import (
pandas_to_table, ColumnType, RowSpec
)


class TestOWCSVFileImport(WidgetTest):
@@ -33,16 +36,14 @@ def test_dialog(self):
path = os.path.join(dirname, "test_owgrep_file.txt")
d = owcsvimport.CSVImportDialog()
d.setPath(path)
ColumnTypes = owcsvimport.Options.ColumnType
RowSpec = owcsvimport.Options.RowSpec
opts = owcsvimport.Options(
encoding="utf-8",
dialect=owcsvimport.textimport.Dialect(
" ", "\"", "\\", True, True
),
columntypes=[
(range(0, 2), ColumnTypes.Numeric),
(range(2, 3), ColumnTypes.Categorical)
(range(0, 2), ColumnType.Numeric),
(range(2, 3), ColumnType.Categorical)
],
rowspec=[
(range(0, 4), RowSpec.Skipped),
@@ -64,8 +65,6 @@ def test_load_csv(self):
b'1/1/1990,2.0,],two,\n'
b'1/1/1990,3.0,{,three,'
)
ColumnType = owcsvimport.Options.ColumnType
RowSpec = owcsvimport.Options.RowSpec
opts = owcsvimport.Options(
encoding="ascii",
dialect=csv.excel(),
@@ -74,6 +73,7 @@ def test_load_csv(self):
(range(1, 2), ColumnType.Numeric),
(range(2, 3), ColumnType.Text),
(range(3, 4), ColumnType.Categorical),
(range(4, 5), ColumnType.Auto),
],
rowspec=[]
)
@@ -109,3 +109,49 @@ def test_load_csv(self):
self.assertSequenceEqual(
list(df.iloc[:, 1]), ["one", "three"]
)

def test_convert(self):
    """Load one small CSV payload under two column-type setups and
    compare the converted table with the expected columns.

    With Text columns the table's ``metas`` are expected to keep "?" and
    "NA" as literal strings and to hold "" for empty cells; for the
    Categorical/Numeric columns the same cells are expected to be NaN
    in ``X``.
    """
    class dialect(csv.excel):
        skipinitialspace = True

    payload = (
        b'I, J, K\n'
        b' , A, \n'
        b'B, , 1\n'
        b'?, ., NA'
    )

    def as_table(*kinds):
        # One single-column range per requested type, in column order;
        # the first row of the payload is declared as the header.
        options = owcsvimport.Options(
            encoding="ascii",
            dialect=dialect(),
            columntypes=[
                (range(pos, pos + 1), kind)
                for pos, kind in enumerate(kinds)
            ],
            rowspec=[(range(0, 1), RowSpec.Header)]
        )
        frame = owcsvimport.load_csv(io.BytesIO(payload), options)
        return pandas_to_table(frame)

    # Columns I and K as text, J as categorical.
    table = as_table(
        ColumnType.Text, ColumnType.Categorical, ColumnType.Text
    )
    assert_array_equal(table.metas[:, 0], ["", "B", "?"])
    assert_array_equal(table.metas[:, 1], ["", "1", "NA"])
    assert_array_equal(table.X[:, 0], [0.0, np.nan, np.nan])

    # Columns I and J as categorical, K as numeric.
    table = as_table(
        ColumnType.Categorical, ColumnType.Categorical, ColumnType.Numeric
    )
    assert_array_equal(table.X[:, 0], [np.nan, 0, np.nan])
    assert_array_equal(table.X[:, 1], [0, np.nan, np.nan])
    assert_array_equal(table.X[:, 2], [np.nan, 1, np.nan])

0 comments on commit 16f877c

Please sign in to comment.
You can’t perform that action at this time.