Can we identify JSON blobs stored as free text in XML elements or attributes?

Report the results by host and by qualified XPath, along with the number of XML documents.

In [1]:
%reload_ext autoreload
%autoreload 2

import os
import json
import glob
from lxml import etree
from semproc.rawresponse import RawResponse
from semproc.xml_utils import extract_elems
import urlparse

In [2]:
def convert_header_list(headers):
    """Turn a list of raw ``Name: value`` header lines into a dict.

    Header names are stripped and lower-cased; values are stripped.
    Only the first colon separates the name from the value, so values
    may themselves contain colons. When a name occurs more than once,
    the last occurrence wins.

    Lines without a colon cannot be split into a name/value pair and
    are skipped instead of raising ``ValueError``. ``None`` is treated
    as an empty header list.

    :param headers: iterable of header strings, or None
    :return: dict mapping lower-cased header name to stripped value
    """
    converted = {}
    for header in headers or ():
        name, sep, value = header.partition(':')
        if not sep:
            # no colon at all: not a name/value pair, ignore the line
            continue
        converted[name.strip().lower()] = value.strip()
    return converted

def get_xml(response, content_type):
    """Parse a raw response body into an XML element, if it is XML.

    The body is wrapped in a ``RawResponse`` and cleaned with
    ``clean_raw_content``; the cleaned content is parsed with
    ``etree.fromstring`` only when the wrapper reports its datatype
    as ``'xml'``.

    :param response: the raw response content
    :param content_type: value of the response's content-type header
    :return: the parsed root element, or None when cleaning fails, the
        datatype is not ``'xml'``, or the content cannot be parsed
    """
    rr = RawResponse(response, content_type)
    try:
        content = rr.clean_raw_content()
    except Exception:
        # best effort: an uncleanable response is simply skipped, but
        # KeyboardInterrupt/SystemExit are allowed to propagate
        return None

    if rr.datatype != 'xml':
        return None

    try:
        return etree.fromstring(content)
    except Exception:
        # malformed XML (or unsupported input) counts as "not XML"
        return None

# A modified version of the bag parser:
# yield the text and the xpath of every element text or
# attribute value that parses as a JSON object.
class XmlParser(object):
    """Find JSON objects embedded as text in an XML tree.

    ``parse`` walks every node returned by ``xml.iter()`` and yields a
    ``(text, xpath)`` tuple for each element text and each attribute
    value that parses as a JSON object. The xpath is the chain of
    namespace-stripped tag names from the root down to the element,
    joined with ``/``; attribute hits get a trailing ``@name`` step.
    """

    def __init__(self, xml):
        # xml: root element of the tree to scan (lxml element interface)
        self.xml = xml

    def _is_json(self, text):
        """Return True only if ``text`` parses as a JSON object (dict).

        Valid JSON of any other type (list, number, string, ...) and
        anything that fails to parse yields False.
        """
        try:
            parsed = json.loads(text)
        except Exception:
            # any parse failure means "not JSON"; unlike a bare except
            # this does not swallow KeyboardInterrupt/SystemExit
            return False
        return isinstance(parsed, dict)

    @staticmethod
    def _extract_tag(tag):
        """Strip a ``{namespace}`` prefix from a tag or attribute name.

        Returns None for an empty/None name.
        """
        if not tag:
            return
        return tag.split('}')[-1]

    def _taggify(self, elem):
        """Return the local tag names from the root down to ``elem``.

        Returns an empty list when any tag in the chain cannot be
        split as a string.
        """
        tags = [elem.tag] + [anc.tag for anc in elem.iterancestors()]
        tags.reverse()

        try:
            return [self._extract_tag(t) for t in tags]
        except Exception:
            return []

    def parse(self):
        """Yield ``(text, xpath)`` for every JSON object found.

        For each node the stripped element text is checked first, then
        every stripped attribute value. Empty strings are skipped.
        """
        for elem in self.xml.iter():
            # walking the ancestors is the expensive part, so the path
            # is only built once a hit on this element needs it
            path = None

            text = elem.text.strip() if elem.text else ''
            if text and self._is_json(text):
                path = self._taggify(elem)
                yield (text, '/'.join(path))

            # items() is available on both Python 2 and 3 attribute maps
            for name, value in elem.attrib.items():
                value = value.strip()
                if value and self._is_json(value):
                    if path is None:
                        path = self._taggify(elem)
                    attr_step = '@' + self._extract_tag(name)
                    yield (value, '/'.join(path + [attr_step]))

In [3]:
# Paths of all *.json files in the local solr_20150922_docs directory,
# in whatever order glob returns them.
files = glob.glob(
    '/Users/sparky/Documents/solr_responses/solr_20150922_docs/*.json'
)

In [4]:
PATH = 'outputs/responses_with_json.csv'
with open(PATH, 'w') as f:
    f.write('file|json|xpath\n')

for i, f in enumerate(files):
    if i % 10000 == 0:
        print 'finished: ', i
    with open(f, 'r') as g:
        data = json.loads(g.read())
    
    response = data.get('raw_content')
    headers = convert_header_list(data.get('response_headers', []))
    content_type = headers.get('content-type', '')
    
    xml = get_xml(response, content_type)
    if xml is None:
        continue
    
    parser = XmlParser(xml)
    jsons = [j for j in parser.parse()]
    
    if jsons:
        with open(PATH, 'a') as g:
            for j, x in jsons:
                g.write('{0}|{1}|{2}\n'.format(f, j.replace('|', ';').encode('UTF-8'), x))

finished:  0
finished:  10000
finished:  20000
finished:  30000
finished:  40000
finished:  50000
finished:  60000
finished:  70000
finished:  80000
finished:  90000
finished:  100000
finished:  110000
finished:  120000
finished:  130000
finished:  140000
finished:  150000
finished:  160000
finished:  170000
finished:  180000
finished:  190000
finished:  200000
finished:  210000
finished:  220000
finished:  230000
finished:  240000
finished:  250000
finished:  260000
finished:  270000
finished:  280000
finished:  290000
finished:  300000
finished:  310000
finished:  320000
finished:  330000
finished:  340000
finished:  350000
finished:  360000
finished:  370000
finished:  380000
finished:  390000
finished:  400000
finished:  410000
finished:  420000
finished:  430000
finished:  440000
finished:  450000
finished:  460000
finished:  470000
finished:  480000
finished:  490000
finished:  500000
finished:  510000
finished:  520000
finished:  530000
finished:  540000
finished:  550000
finish