In [None]:
Basic word (token) counts for XML responses.

I will note, wherever necessary, assumptions related to the stopwords corpora. For this count, mimetypes, namespaces and URNs (specifically those related to EPSG codes and the like) are excluded. Numbers and timestamps are dropped as well.

TODOs:

- group by identification
- deal with outstanding stopwords issues
- configure the parallel processing support?
- at least use the cleaned versions instead of doing that again.

In [8]:
%reload_ext autoreload
%autoreload 2


import glob
import json
import re
import dateutil.parser as dateparser
from itertools import chain

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import WordListCorpusReader

from semproc.rawresponse import RawResponse
from semproc.bag_parser import BagParser

In [21]:
def remove_stopwords(text):
    '''
    remove any known english stopwords from a
    piece of text (bag of words or otherwise)

    The text is split with nltk's word_tokenize and every
    token found in the nltk english stopword corpus is
    dropped. The corpus entries are lowercase, so tokens are
    lowercased for the comparison only ("The" is removed just
    like "the"); surviving tokens keep their original casing.

    :param text: the string to filter
    :return: the remaining tokens joined by single spaces
    '''
    _stopwords = set(stopwords.words('english'))
    # word_tokenize always returns a list of tokens, no re-split needed
    words = word_tokenize(text)
    return ' '.join(
        [w for w in words if w and w.lower() not in _stopwords])


def load_token_list(term_file):
    '''
    load some stopword list from the corpus

    The entries are returned escaped for use as literal
    regular-expression alternatives (remove_tokens joins them
    with '|' and compiles the result). Every metacharacter is
    escaped, not just '+', so that e.g. the dots in a
    namespace or mimetype only match a literal dot.

    :param term_file: file name of the word list inside ../corpora/
    :return: list of regex-escaped tokens
    '''
    __location__ = '../corpora/'
    tokens = WordListCorpusReader(__location__, term_file)
    return [re.escape(w) for w in tokens.words()]


def remove_tokens(term_file, text):
    '''
    Delete every occurrence of the tokens listed in term_file
    from text.

    do this before something like tokenize or the
    resplit option will split the mimetypes to not
    be recognizable as such anymore

    :param term_file: word-list file name handed to load_token_list
    :param text: the string to scrub
    :return: text with all listed tokens removed
    '''
    words = [w for w in load_token_list(term_file) if w]
    if not words:
        # nothing to remove; avoid compiling an empty pattern
        return text

    # regex alternation accepts the first branch that matches, so a
    # token that is a prefix of a longer one would leave the longer
    # one's tail behind; trying the longest tokens first prevents that
    words.sort(key=len, reverse=True)

    pttn = re.compile('|'.join(words))
    return pttn.sub('', text)

def remove_numeric(text):
    '''
    Replace number-like runs in text with a single space.

    A run is: optional word characters, a word boundary, an
    optional '-', one digit, optional whitespace and any word
    characters that follow (so '2008', '017A' and '17C03089'
    are each replaced as a whole).

    The pattern is substituted directly on text. Matching
    against a space-padded copy and then re-substituting the
    captured strings (the previous approach) captured the
    padding after a trailing digit, so tokens such as '3' or
    'abc-1' were never actually removed.

    :param text: the string to scrub
    :return: text with every numeric run replaced by ' ';
        callers are expected to strip the result
    '''
    match_pttn = r'\w*\b-?\d\s*\w*'
    return re.sub(match_pttn, ' ', text)

def strip_dates(text):
    '''
    Blank out text that dateutil manages to parse as a date.

    :param text: the candidate string
    :return: '' when dateparser.parse accepts the text, otherwise
        the text unchanged
    '''
    # no special handling for a leading 'NaN': per the original note
    # it should still come out as an invalid date
    try:
        dateparser.parse(text)
    except (ValueError, OverflowError):
        # the parser rejected it, so keep the text
        return text
    return ''
        
def strip_filenames(text):
    '''
    Blank out text that ends in one of the known file extensions.

    The extension has to be the whole text or be preceded by a
    '.' or a space (clean() has already turned dots into spaces
    by the time it calls this). A bare endswith() on the
    extension also threw away ordinary words such as 'sync',
    'Inc' or 'ajar'.

    :param text: the candidate string
    :return: '' when the text looks like a file name, otherwise
        the text unchanged
    '''
    # we'll see
    exts = ('png', 'jpg', 'hdf', 'xml', 'doc', 'pdf', 'txt', 'jar', 'nc', 'XSL', 'kml', 'xsd')
    suffixes = tuple(sep + ext for ext in exts for sep in ('.', ' '))
    if text in exts or text.endswith(suffixes):
        return ''
    return text
    
def strip_identifiers(texts):
    '''
    Remove identifier-like strings from a piece of text and
    return what is left as a list of whitespace-separated tokens.

    Each pattern below (url, urn, uuid, doi, md5) is run over the
    text with re.findall and every occurrence of each matched
    string is deleted with str.replace. Afterwards the tokens
    listed in the urn, mimetype and namespace word lists are
    removed through remove_tokens.

    :param texts: a single string (it is searched, replaced on
        and finally split)
    :return: list of the remaining tokens (texts.split())
    '''
    # chuck any urns, urls, uuids
    _pattern_set = [
        # NOTE(review): this pattern literal spans two source lines, so it
        # contains a newline character right before the final `\b/?(?!@)`
        # -- confirm that is intended and not an export artifact
        ('url', ur"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)
\b/?(?!@)))"""),
        # a urn that isn't a url
        # NOTE(review): `[http://]`, `[https://]` and `[ftp://]` are character
        # classes, so each lookahead rejects a match that starts with any one
        # of those characters, not only with the literal scheme prefix --
        # confirm whether `(?!https?://)(?!ftp://)` was intended
        ('urn', ur"(?![http://])(?![https://])(?![ftp://])(([a-z0-9.\S][a-z0-9-.\S]{0,}\S:{1,2}\S)+[a-z0-9()+,\-.=@;$_!*'%/?#]+)"),
        ('uuid', ur'([a-f\d]{8}(-[a-f\d]{4}){3}-[a-f\d]{12}?)'),
        ('doi', ur"(10[.][0-9]{4,}(?:[/][0-9]+)*/(?:(?![\"&\\'])\S)+)"),
        ('md5', ur"([a-f0-9]{32})")
    ]
    for pattern_type, pattern in _pattern_set:
        for m in re.findall(re.compile(pattern), texts):
            # findall yields a tuple when the pattern has several groups;
            # max() then picks the lexicographically greatest group.
            # NOTE(review): presumably meant to select the full match
            # (group 1 in these patterns) -- verify
            m = max(m) if isinstance(m, tuple) else m
            try:
                # drops every occurrence of the matched string
                texts = texts.replace(m, '')
            except Exception as ex:
                # best effort: report the failure and keep going
                print ex
                print m
                
    # word lists of known urns, mimetypes and namespaces to delete as well
    files = ['cat_interop_urns.txt', 'mimetypes.txt', 'namespaces.txt']
    for f in files:
        texts = remove_tokens(f, texts)
    return texts.split()

def remove_punctuation(text):
    '''
    Swap common punctuation characters for spaces.

    Free-standing ' - ' separators collapse to a single space
    and the result is trimmed; a result that is nothing but a
    lone '-' becomes the empty string. Hyphens inside words
    are left alone.

    :param text: the string to scrub
    :return: the scrubbed, trimmed string
    '''
    spaced = re.compile(r'[;|>+:=.,()/?!\[\]{}]').sub(' ', text)
    trimmed = spaced.replace(' - ', ' ').strip()
    if trimmed == '-':
        return ''
    return trimmed

def strip_punctuation(text):
    '''
    Trim punctuation from both ends of the text, then trim
    any whitespace that leaves exposed.

    :param text: the string to trim
    :return: the trimmed string
    '''
    edge_chars = '(){}[].,~|":'
    without_edges = text.lstrip(edge_chars).rstrip(edge_chars)
    return without_edges.strip()
    
def clean(text):
    '''
    Run one piece of text through the cleanup steps, in order:
    dates, numeric runs, punctuation (inner, then terminal)
    and finally file names.

    :param text: the string to clean
    :return: the cleaned string
    '''
    undated = strip_dates(text)
    unnumbered = remove_numeric(undated)

    depunctuated = remove_punctuation(unnumbered.strip()).strip()
    trimmed = strip_punctuation(depunctuated)

    return strip_filenames(trimmed)
        
# Tag names handed to BagParser.strip_text() in the processing cell
# (presumably so schema-location values stay out of the extracted
# text -- confirm against semproc.bag_parser).
exclude_tags = ['schemaLocation', 'noNamespaceSchemaLocation']

In [25]:
# Collect the stored solr response documents to sample from.
# NOTE(review): absolute, machine-specific path -- this cell only runs
# where that directory exists.
files = glob.glob('/Users/sparky/Documents/solr_responses/solr_20150922_docs/f4*.json')

# only a ten-file slice of the matches is processed
for f in files[10:20]:
    with open(f, 'r') as g:
        data = json.loads(g.read())
        
    print data.get('url')
    
    # RawResponse comes from semproc; presumably clean_raw_content
    # normalizes the stored payload -- TODO confirm
    rr = RawResponse('', data.get('raw_content'), '')
    content = rr.clean_raw_content().decode('string_escape')
    
    # strip the html cruft but ignore the a tags
    bp = BagParser(content, True, False)
    if not bp.parser.xml:
        # skip anything the parser does not flag as xml
        print 'NOT XML: ', content[:100]
        continue
    # we don't care about the fully qualified namespace here
    # (only the second element of each strip_text item is used, split on whitespace)
    stripped_text = [b[1].split() for b in bp.strip_text(exclude_tags) if b[1]]
    stripped_text = list(chain.from_iterable(stripped_text))
    # clean() acts as a filter only: tokens whose cleaned form is empty are
    # dropped, the surviving tokens keep their original (uncleaned) text
    cleaned_text = [s for s in stripped_text if clean(s)]
    
    # identifiers are removed from the re-joined text; bow is a list of tokens
    bow = strip_identifiers(' '.join(cleaned_text))
    
    print
    print stripped_text
    print cleaned_text
    print bow
    

http://www.interieur.gouv.fr/avotreservice/elections/telechargements/CN2008/resultatsT1/017/01703121.xml

['Cantonales', '2008', '17', '17', '017', 'CHARENTE', 'MARITIME', 'A', '017A', 'B', '017B', 'C', '017C', 'D', '017D', 'E', '017E', 'F', '017F', 'G', '017G', 'H', '017H', 'J', '017J', 'L', '017L', 'M', '017M', 'N', '017N', 'O', '017O', 'P', '017P', 'R', '017R', 'S', '017S', 'T', '017T', 'V', '017V', '121', 'La', 'Couarde-sur-Mer', '03', 'ARS-EN-RE', 'La', 'Couarde-sur-Mer', 'Canton', 'de', 'ARS-EN-RE', '1', '09', '03', '2008', '1262', '415', '32,88', '847', '67,12', '24', '1,90', '2,83', '823', '65,21', '97,17', '17C03089', '1', 'OLIVIER', 'Jean-Louis', 'M.', 'M-NC', 'Majorit', 'dont', 'le', 'Nouveau', 'Centre', '223', '27,10', '17C03023', '2', 'QUILLET', 'Lionel', 'M.', 'DVD', 'Divers', 'droite', '426', '51,76', '17C03121', '3', 'DENIEL', 'Annie', 'Mme', 'SOC', 'Socialiste', '174', '21,14', '2', '16', '03', '2008']
['Cantonales', 'CHARENTE', 'MARITIME', 'A', 'B', 'C', 'D', 'E', 'F'

  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)


['11156', 'PRJ', 'Published', '/', 'External', 'N', 'Research', 'Set-Aside', '(RSA)', 'Program', 'Research', 'Set-Aside', '(RSA)', 'Program', 'Research', 'Set-Aside', 'programs', '(RSAs)', 'were', 'developed', 'by', 'the', 'New', 'England', 'Fishery', 'Management', 'Council', '(NEFMC)', 'and', 'the', 'Mid-Atlantic', 'Fishery', 'Management', 'Council', '(MAFMC)', 'as', 'part', 'of', 'the', 'fishery', 'management', 'plan', 'process,', 'and', 'are', 'administered', 'by', 'the', 'National', 'Marine', 'Fisheries', 'Service.', 'To', 'collect', 'trip-level', 'data', 'from', 'all', 'vessels', 'participating', 'in', 'Research', 'Set-Aside', '(RSA)', 'programs', '(both', 'state-only', 'and', 'federally', 'permitted', 'vessels', 'participate).', 'There', 'are', '3', 'types', 'of', 'RSA', 'projects:', 'Mid-Atlantic', 'projects', '(which', 'land', 'black', 'sea', 'bass,', 'bluefish,', 'loligo', 'squid,', 'scup', 'and', 'summer', 'flounder),', 'Monkfish', 'projects,', 'and', 'Scallop', 'projects.', 

  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
