Skip to content


Subversion checkout URL

You can clone with
Download ZIP
tree: 597e0e748e
Fetching contributors…

Cannot retrieve contributors at this time

executable file 106 lines (91 sloc) 2.733 kB
#!/usr/bin/env python
"""
Convert a stream of JSON objects to TSV, extracting the keys
you specify into columns.
If you don't specify any keys, it tries to figure out the set of all keys.
"""
# Bind the first importable JSON library to the name ``simplejson`` so the
# rest of the script can call ``simplejson.loads`` regardless of which
# implementation is installed. Candidates are tried in the listed order.
for m in ['json', 'simplejson', 'yajl']:
    try:
        # __import__ with a plain top-level module name returns that module.
        simplejson = __import__(m)
    except ImportError:
        continue
    break
else:
    # The loop ran to completion without a successful import.
    raise Exception("Didn't find a JSON library")
import sys,re,itertools
import tsvutil
#all_json = simplejson.load(sys.stdin)
#assert isinstance(all_json, list) and len(all_json)>0
#item1 = json[0]
#keys = items1.keys()
#json_iter = all_json
def safe_json_iter(raw_json_iter):
    """Yield the decoded value of each JSON string in ``raw_json_iter``.

    Entries that fail to decode are not fatal: the exception type and
    message, followed by the ``repr`` of the offending raw entry, are
    written to stderr and the entry is skipped.

    :param raw_json_iter: iterable of strings, each holding one JSON document
    :returns: generator of decoded JSON values, in input order
    """
    for raw in raw_json_iter:
        # Keep the try body to the parse call only, so errors raised by the
        # consumer of this generator are never mistaken for parse errors.
        try:
            data = simplejson.loads(raw)
        except Exception as e:
            sys.stderr.write("%s %s\n" % (type(e), e))
            sys.stderr.write("%s\n" % repr(raw))
            continue
        yield data
def take(n, iterable):
    """Return the first ``n`` items of ``iterable`` as a list.

    Fewer than ``n`` items are returned if the iterable runs out first.
    When ``iterable`` is an iterator, only the returned items are consumed,
    so the caller can keep reading the remainder from it.
    """
    return list(itertools.islice(iterable, n))
def order_keys(keys, item_sample):
    """Return ``keys`` sorted into a heuristic column order.

    Keys are ranked by the first rule they match:

    0. exactly ``'id'``
    1. ends with ``'_id'``
    2. exactly ``'docid'``
    3. anything else

    Within a rank, keys whose string values in ``item_sample`` have a
    smaller median length come first; keys with no string values in the
    sample count as length 0. Ties keep the iteration order of ``keys``.

    :param keys: iterable of key names (strings)
    :param item_sample: list of dicts used to estimate value lengths
    :returns: list of the keys in display order
    """
    # Some heuristics that may or may not help
    from collections import defaultdict

    # Python 2 JSON decoders may hand back either unicode or byte strings;
    # on Python 3 the name ``unicode`` does not exist and str covers both.
    try:
        text_types = (unicode, str)
    except NameError:
        text_types = (str,)

    # NOTE: a per-key tally of value types was dropped here because its
    # result was never read by the scoring below.
    lengths = defaultdict(list)
    for item in item_sample:
        for key in keys:
            # Records need not contain every key; absent keys are skipped.
            value = item.get(key)
            if isinstance(value, text_types):
                lengths[key].append(len(value))

    def median(values):
        # Upper median of the observed lengths for one key.
        ordered = sorted(values)
        return ordered[len(ordered) // 2]

    typical_length = dict((k, median(v)) for k, v in lengths.items())

    prios = [
        lambda k: k == 'id',
        lambda k: k.endswith('_id'),
        lambda k: k == 'docid',
        lambda k: True]

    def score(key):
        # The final catch-all rule guarantees that a match always exists.
        prio = next(i for i, rule in enumerate(prios) if rule(key))
        return (prio, typical_length.get(key, 0))

    return sorted(keys, key=score)
# Decode stdin lazily, one JSON document per line.
json_iter = safe_json_iter(sys.stdin)

# Column names come from the command line when given.
keys = sys.argv[1:]
if not keys:
    # No columns requested: look at the first 1000 records, take the union
    # of their keys, and put those keys into a heuristic display order.
    top = take(1000, json_iter)
    keys = set()
    for item in top:
        keys |= set(item.keys())
    keys = order_keys(keys, top)
    # The sampled records were consumed from the stream; put them back in
    # front of the remaining records so they are still emitted.
    json_iter = itertools.chain(top, json_iter)
# Characters that would break the one-record-per-line, tab-separated layout.
BAD = re.compile("[\r\n\t]")

# Text constructor: ``unicode`` on Python 2, ``str`` where that name is absent.
try:
    _text_type = unicode
except NameError:
    _text_type = str


def clean_cell(x):
    """Return ``x`` as text that is safe to place in a single TSV cell.

    ``None`` becomes the empty string. Any other value is converted to text
    and every carriage return, newline and tab is replaced by a space.
    """
    if x is None:
        return ""
    return BAD.sub(" ", _text_type(x))
# One path component of the form ``name[index]``.
_INDEXED_PART = re.compile(r'^([^\]]+)\[([^\]]+)\]$')


def lookup(json, k):
    """Fetch the value at path ``k`` inside the decoded JSON object ``json``.

    Supports a limited, hacky subset of jpath: components are separated by
    ``.``, and a component may carry one subscript, e.g. ``items[0].name``
    or ``a['b']``. The subscript must be a Python literal (an integer or a
    quoted string).

    :param json: decoded JSON object (a dict) to search
    :param k: path expression
    :returns: the value found; ``""`` when the path is absent or the value
        is ``None`` or an empty string/container. Numeric and boolean
        values, including ``0`` and ``False``, are returned unchanged.
    :raises Exception: when a component ends in ``]`` but is not a
        well-formed ``name[index]``
    """
    # Local import so this function brings its own dependency with it.
    import ast

    v = json
    for part in k.split('.'):
        if part.endswith(']'):
            m = _INDEXED_PART.search(part)
            if not m:
                raise Exception("wtf: %s" % repr(part))
            name, ind = m.groups()
            # The subscript comes from the command line; parse it as a
            # literal rather than evaluating it as arbitrary code.
            try:
                ind = ast.literal_eval(ind)
            except (ValueError, SyntaxError):
                raise Exception("wtf: %s" % repr(part))
            try:
                v = v[name][ind]
            except (KeyError, IndexError, TypeError):
                # Absent key, index out of range, or a non-container on the
                # way: treat it like any other missing value.
                return ""
        else:
            if not isinstance(v, dict):
                # Cannot descend into a scalar, list or null.
                return ""
            v = v.get(part, {})
    # Blank out None and empty strings/containers, but keep falsy numbers
    # and booleans so that 0 and False still show up in the output.
    if v is None or (not v and not isinstance(v, (int, float))):
        return ""
    return v
# Header row, then one tab-separated row per record with the columns in the
# same order as ``keys``. print(...) with a single parenthesised argument
# behaves the same as a statement (Python 2) and as a function (Python 3).
print("\t".join(keys))
for json in json_iter:
    print("\t".join([clean_cell(lookup(json, k)) for k in keys]))
#print "\t".join([clean_cell(json[k]) for k in keys])
#print(*[clean_cell(json[k]) for k in keys], sep="\t")
Jump to Line
Something went wrong with that request. Please try again.