Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
tree: 597e0e748e
Fetching contributors…

Cannot retrieve contributors at this time

executable file 106 lines (91 sloc) 2.733 kB
#!/usr/bin/env python
"""
Convert a stream of JSON objects to TSV, extracting the keys
you specify into columns.
If you don't specify any keys, it tries to figure out the set of all keys.
"""
# Bind the first importable JSON implementation to the name ``simplejson``.
# Candidates are tried in order; the rest of the script only relies on the
# chosen module providing ``loads``.
for m in ('json', 'simplejson', 'yajl'):
    try:
        # __import__ takes the module name as data, so no source text has to
        # be assembled and exec'd.
        simplejson = __import__(m)
    except ImportError:
        continue
    break
else:
    # Reached only when no candidate could be imported.
    raise Exception("Didn't find a JSON library")
import sys,re,itertools
import tsvutil
#all_json = simplejson.load(sys.stdin)
#assert isinstance(all_json, list) and len(all_json)>0
#item1 = json[0]
#keys = items1.keys()
#keys.sort()
#json_iter = all_json
def safe_json_iter(raw_json_iter):
    """Yield the parsed value of every JSON document in ``raw_json_iter``.

    ``raw_json_iter`` is any iterable of strings, each holding one JSON
    document.  A document that fails to parse is not fatal: the exception
    type and message, followed by the repr of the offending input, are
    written to stderr and iteration continues with the next document.
    """
    for raw in raw_json_iter:
        try:
            data = simplejson.loads(raw)
        except Exception as e:
            # Deliberately broad: the exception type raised for bad input
            # depends on which JSON library was bound to ``simplejson``.
            sys.stderr.write("%s %s\n" % (type(e), e))
            sys.stderr.write("%s\n" % repr(raw))
        else:
            # Yield outside the try body so that an exception thrown into
            # the generator by its consumer is not reported as a parse error.
            yield data
def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    head = itertools.islice(iterable, n)
    return list(head)
def order_keys(keys, item_sample):
    """Return ``keys`` as a list sorted by a column-ordering heuristic.

    Keys are ranked first by name: ``id``, then names ending in ``_id``,
    then ``docid``, then everything else.  Within a rank, keys are sorted
    by the median length of their text values across ``item_sample``, so
    keys with shorter text come first; a key with no text values in the
    sample counts as length 0.

    ``keys`` is an iterable of key names and ``item_sample`` a sequence of
    dicts.  A dict that lacks some of the keys is fine: a missing key simply
    contributes no length for that record.
    """
    # Some heuristics that may or may not help
    from collections import defaultdict

    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3

    lengths = defaultdict(list)
    for item in item_sample:
        for key in keys:
            # .get: ``keys`` may be the union over all records, so any one
            # record can lack some of them.
            value = item.get(key)
            if isinstance(value, text_type):
                lengths[key].append(len(value))

    def median(L):
        # Index by this list's own length; a key that is absent or non-text
        # in some records has fewer entries than the sample has items.
        ordered = sorted(L)
        return ordered[len(ordered) // 2]

    lengths = dict((k, median(L)) for k, L in lengths.items())

    prios = [
        lambda k: k == 'id',
        lambda k: k.endswith('_id'),
        lambda k: k == 'docid',
        lambda k: True]

    def score(key):
        # Index of the first matching rule; the last rule always matches.
        prio = next(i for i, f in enumerate(prios) if f(key))
        return (prio, lengths.get(key, 0))

    return sorted(keys, key=score)
# Records are parsed from stdin; column keys come from the command line.
json_iter = safe_json_iter(sys.stdin)
keys = sys.argv[1:]
if not keys:
    # No keys given: collect every key seen in the first 1000 records,
    # order them heuristically, then put those records back in front of
    # the remaining stream so they are still emitted.
    top = take(1000, json_iter)
    keys = set()
    for item in top:
        keys.update(item.keys())
    keys = order_keys(keys, top)
    json_iter = itertools.chain(top, json_iter)
# Characters that would break the one-record-per-line, tab-separated layout.
BAD = re.compile("[\r\n\t]")

# Text type used to stringify cell values: ``unicode`` where that builtin
# exists (Python 2), ``str`` otherwise.
try:
    _text_type = unicode
except NameError:
    _text_type = str


def clean_cell(x):
    """Return ``x`` as text that is safe to place in a single TSV cell.

    ``None`` becomes the empty string.  Any other value is converted to
    text, and every carriage return, newline or tab in the result is
    replaced by a single space.
    """
    if x is None:
        return ""
    return BAD.sub(" ", _text_type(x))
# One path component of the form ``name[index]`` with an integer index.
_INDEXED_PART = re.compile(r'^([^\]]+)\[(-?\d+)\]$')


def lookup(json, k):
    """Resolve the dotted path ``k`` inside the parsed JSON value ``json``.

    limited hacky subset of jpath: ``k`` is split on ``.``; each component
    is either a plain key (``user``) or a key followed by one integer index
    (``tags[0]``).

    Returns the value found.  Returns ``""`` when the path cannot be
    followed (missing key, index out of range, or a non-container in the
    middle of the path) and when the value found is ``None`` or an empty
    string, list or dict.  Other falsy values such as ``0`` and ``False``
    are returned unchanged.

    Raises ``Exception`` when a component ends in ``]`` but is not of the
    form ``name[integer]``.
    """
    v = json
    for part in k.split('.'):
        if part.endswith(']'):
            m = _INDEXED_PART.search(part)
            if not m:
                raise Exception("wtf: %s" % repr(part))
            name, ind = m.groups()
            try:
                # int() rather than eval(): the index text comes from the
                # key spec supplied by the user and must not be executed.
                v = v[name][int(ind)]
            except (KeyError, IndexError, TypeError):
                return ""
        elif isinstance(v, dict):
            # A missing key yields {} so that deeper components keep
            # resolving to "missing" instead of failing.
            v = v.get(part, {})
        else:
            return ""
    # Blank out only "nothing here" values; a plain truthiness test would
    # also erase legitimate 0 and False.
    if v is None or v in ("", [], {}):
        return ""
    return v
tsvutil.fix_stdio()
print "\t".join(keys)
for json in json_iter:
print "\t".join([clean_cell(lookup(json,k)) for k in keys])
#print "\t".join([clean_cell(json[k]) for k in keys])
#print(*[clean_cell(json[k]) for k in keys], sep="\t")
Jump to Line
Something went wrong with that request. Please try again.