Merge pull request #228 from cpcloud/missing-json-to-csv

Add a default of None when going from records to tuples
blaze · Jun 20, 2015 · 2b56485
2 parents b3dec1a + bcbbb1a
commit 2b56485
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 12 deletions.
diff --git a/odo/backends/tests/test_json.py b/odo/backends/tests/test_json.py
@@ -1,17 +1,22 @@
 from __future__ import absolute_import, division, print_function
 
-from odo.backends.json import *
-from odo.utils import tmpfile, ignoring
-from odo import into
-from odo.temp import Temp, _Temp
-from contextlib import contextmanager
-from datashape import dshape
 import datetime
 import os
 import gzip
 import os
 import json
 
+from contextlib import contextmanager
+
+import numpy as np
+from odo.backends.json import json_dumps
+from odo.utils import tmpfile, ignoring
+from odo import odo, discover, JSONLines, resource, JSON, convert, append, drop
+from odo.temp import Temp, _Temp
+
+from datashape import dshape
+
+
 @contextmanager
 def json_file(data):
     with tmpfile('.json') as fn:
@@ -20,6 +25,7 @@ def json_file(data):
 
         yield fn
 
+
 @contextmanager
 def jsonlines_file(data):
     with tmpfile('.json') as fn:
@@ -34,11 +40,13 @@ def jsonlines_file(data):
 dat = [{'name': 'Alice', 'amount': 100},
        {'name': 'Bob', 'amount': 200}]
 
+
 def test_discover_json():
     with json_file(dat) as fn:
         j = JSON(fn)
         assert discover(j) == discover(dat)
 
+
 def test_discover_jsonlines():
     with jsonlines_file(dat) as fn:
         j = JSONLines(fn)
@@ -239,3 +247,18 @@ def test_drop():
         assert os.path.exists(fn)
         drop(js)
         assert not os.path.exists(fn)
+
+
+def test_missing_to_csv():
+    data = [dict(a=1, b=2), dict(a=2, c=4)]
+    with tmpfile('.json') as fn:
+        js = JSON(fn)
+        js = odo(data, js)
+
+        with tmpfile('.csv') as csvf:
+            csv = odo(js, csvf)
+            with open(csv.path, 'rt') as f:
+                result = f.read()
+
+    expected = 'a,b,c\n1,2.0,\n2,,4.0\n'
+    assert result == expected
diff --git a/odo/convert.py b/odo/convert.py
@@ -159,16 +159,15 @@ def element_of(seq):
         seq = seq[0]
     return seq
 
+
 @convert.register(np.ndarray, list, cost=10.0)
 def list_to_numpy(seq, dshape=None, **kwargs):
     if isinstance(element_of(seq), dict):
         seq = list(records_to_tuples(dshape, seq))
-    if (seq and isinstance(seq[0], Iterable)
-            and not ishashable(seq[0])
-            and not isscalar(dshape)):
+    if (seq and isinstance(seq[0], Iterable) and not ishashable(seq[0]) and
+            not isscalar(dshape)):
         seq = list(map(tuple, seq))
-    dtype = dshape_to_numpy(dshape)
-    return np.array(seq, dtype=dtype)
+    return np.array(seq, dtype=dshape_to_numpy(dshape))
 
 
 @convert.register(Iterator, list, cost=0.001)

diff --git a/odo/tests/test_convert.py b/odo/tests/test_convert.py
@@ -127,6 +127,20 @@ def test_list_to_numpy_on_dicts():
     assert convert(list, x) == [('Alice', 100), ('Bob', 200)]
 
 
+def test_list_of_dicts_with_missing_to_numpy():
+    data = [{'name': 'Alice', 'amount': 100},
+            {'name': 'Bob'},
+            {'amount': 200}]
+    result = convert(np.ndarray, data)
+    assert result.dtype.names == ('amount', 'name')
+    expected = np.array([(100.0, 'Alice'),
+                         (np.nan, 'Bob'),
+                         (200.0, None)],
+                        dtype=[('amount', 'float64'), ('name', 'O')])
+    assert np.all((result == expected) |
+                  ((result != result) & (expected != expected)))
+
+
 def test_chunks_numpy_pandas():
     x = np.array([('Alice', 100), ('Bob', 200)],
                  dtype=[('name', 'S7'), ('amount', 'i4')])

diff --git a/odo/utils.py b/odo/utils.py
@@ -207,7 +207,7 @@ def records_to_tuples(ds, data):
     if isinstance(ds, (str, unicode)):
         ds = dshape(ds)
     if isinstance(ds.measure, Record) and len(ds.shape) == 1:
-        return pluck(ds.measure.names, data)
+        return pluck(ds.measure.names, data, default=None)
     if isinstance(ds.measure, Record) and len(ds.shape) == 0:
         return get(ds.measure.names, data)
     if not isinstance(ds.measure, Record):