Can we identify JSON blobs embedded as free text in XML elements or attributes?

Report the results by host and by qualified XPath, along with the number of XML documents.

In [1]:
%reload_ext autoreload
%autoreload 2

import os
import json
import glob
from lxml import etree
from semproc.rawresponse import RawResponse
from semproc.xml_utils import extract_elems
import urlparse

In [21]:
def convert_header_list(headers):
    """Turn a list of raw ``'Name: value'`` header lines into a dict.

    Header names are stripped and lower-cased; values are stripped. Each
    line is split on its first colon only, so values that contain colons
    (dates, URLs) are kept whole. If a name occurs more than once, the
    last occurrence wins.

    Entries without a colon (for example an HTTP status line) are skipped
    instead of raising ``ValueError``, and ``None`` is treated as an empty
    list.

    :param headers: iterable of header strings, or None
    :returns: dict mapping lower-cased header name to header value
    """
    converted = {}
    for header in headers or []:
        name, sep, value = header.partition(':')
        if not sep:
            # No colon at all: not a "name: value" pair, nothing to record.
            continue
        converted[name.strip().lower()] = value.strip()
    return converted

def get_xml(response, content_type):
    """Return the parsed XML root of a response, or None if unusable.

    None is returned when cleaning the raw content raises, when the
    RawResponse does not report its datatype as ``'xml'``, or when lxml
    cannot parse the cleaned content.

    :param response: raw response body handed to RawResponse
    :param content_type: content-type header value handed to RawResponse
    :returns: the result of ``etree.fromstring`` on the cleaned content,
        or None
    """
    rr = RawResponse(response, content_type)
    try:
        content = rr.clean_raw_content()
    except Exception:
        # Best effort: a response that cannot be cleaned is simply skipped.
        # `Exception` (not a bare except) so Ctrl-C still interrupts a run.
        return None

    # NOTE(review): datatype is read only after clean_raw_content() has run;
    # presumably that call is what determines it -- confirm in RawResponse.
    if rr.datatype != 'xml':
        return None

    try:
        return etree.fromstring(content)
    except Exception:
        # Malformed or otherwise unparseable XML: treat as "no XML".
        return None

# a modified version of the bag parser
# we want to return the text and the xpath
# of anything that parses as json
class XmlParser(object):
    """Find JSON objects stored as free text inside an XML tree.

    ``xml`` must provide ``iter()`` over its nodes; each node must provide
    ``tag``, ``text``, ``attrib`` and ``iterancestors()`` (the lxml element
    interface).
    """

    def __init__(self, xml):
        self.xml = xml

    def _is_json(self, text):
        """Return True only when ``text`` parses as a JSON object (dict).

        Valid JSON that is not an object (arrays, numbers, strings, ...)
        and anything that fails to parse both give False.
        """
        try:
            parsed = json.loads(text)
        except Exception:
            # Any parse failure means "not JSON"; `Exception` rather than a
            # bare except so KeyboardInterrupt/SystemExit are not swallowed.
            return False
        return isinstance(parsed, dict)

    def parse(self):
        """Yield ``(json_text, path)`` for every JSON object in the tree.

        ``json_text`` is the stripped element text or attribute value.
        ``path`` is the slash-joined chain of tag names from the root down
        to the element, with any ``{namespace}`` prefix removed; for an
        attribute, ``/@name`` is appended.
        """
        def _extract_tag(t):
            # Drop a leading '{namespace}' part, keeping the local name.
            if not t:
                return
            return t.split('}')[-1]

        def _taggify(e):
            # iterancestors() runs from the parent up to the root, so the
            # collected list is reversed to get root-first order.
            tags = [e.tag] + [m.tag for m in e.iterancestors()]
            tags.reverse()

            try:
                return [_extract_tag(t) for t in tags]
            except AttributeError:
                # A tag without .split (NOTE(review): presumably lxml
                # comment / processing-instruction nodes, whose tag is not
                # a string -- confirm) gets an empty path.
                return []

        for elem in self.xml.iter():
            text = elem.text.strip() if elem.text else ''
            tags = _taggify(elem)

            if text and self._is_json(text):
                yield (text, '/'.join(tags))

            # items() rather than the Python-2-style iteritems(): it is
            # available on lxml's attribute proxy and on plain dicts alike.
            for name, value in elem.attrib.items():
                candidate = value.strip()
                if candidate and self._is_json(candidate):
                    yield (candidate,
                           '/'.join(tags + ['@' + _extract_tag(name)]))

In [4]:
# Paths of all *.json files in the Solr response directory. The path is
# absolute and machine-specific; adjust it before running elsewhere.
files = glob.glob('/Users/sparky/Documents/solr_responses/solr_20150922_docs/*.json')

In [None]:
PATH = 'outputs/responses_with_json.csv'

# Scan every response file and record each JSON object found inside an XML
# response as one pipe-delimited row: source file | JSON text | xpath.
# The output file is opened once for the whole run rather than reopened in
# append mode for every input file; the rows written are the same.
with open(PATH, 'w') as out:
    out.write('file|json|xpath\n')

    for filename in files:
        with open(filename, 'r') as src:
            data = json.loads(src.read())

        response = data.get('raw_content')
        headers = convert_header_list(data.get('response_headers', []))
        content_type = headers.get('content-type', '')

        xml = get_xml(response, content_type)
        if xml is None:
            continue

        parser = XmlParser(xml)
        jsons = list(parser.parse())

        for j, x in jsons:
            # '|' is the column delimiter, so it is replaced inside the JSON
            # text. Raw line breaks are flattened to spaces as well, since a
            # multi-line (pretty-printed) blob would otherwise split one
            # record across several output lines. json.loads rejects raw
            # control characters inside strings by default, so any line
            # break in text that passed the JSON check is whitespace between
            # tokens and flattening it does not change the JSON value.
            row_json = j.replace('|', ';').replace('\r', ' ').replace('\n', ' ')
            out.write('{0}|{1}|{2}\n'.format(filename, row_json.encode('UTF-8'), x))