Permalink
Browse files

Parse internal citations for U.S. Code

  • Loading branch information...
1 parent ab79fe3 commit e43d8c28e30ff4dce73538d240ad056987668718 @aih committed May 7, 2012
Showing with 1,528 additions and 0 deletions.
  1. +115 −0 autoparser.py
  2. +664 −0 namedacts.pkl
  3. +664 −0 namedactsabbr.pkl
  4. +20 −0 splithtml.py
  5. +65 −0 usc-regex.txt
View
@@ -0,0 +1,115 @@
+#!/usr/bin/python<2.7>
+# -*- coding: utf-8 -*-
+
+import string, os, sys
+from splithtml import *
+import codecs
+try:
+ import re2 as re
+except:
+ import re
+
+regfile = os.path.abspath(os.path.dirname(__file__))+'/usc-regex.txt'
+divider="<!-- documentid:"
+_DEBUG = False
+
+# To parse the html in data/, documentid:26_12 = 26usc12
+
+
+def makergxlist(regfile):
+ rfile = codecs.open(regfile, 'r', 'utf-8')
+ findreplace = []
+ for eachline in rfile.readlines():
+ if (not eachline.isspace()) & (eachline[0] <> '#'):
+ pattern_replace = unicode(eachline).strip().split('#')
+ findreplace.append(pattern_replace)
+ rfile.close()
+ findreplace = [(re.compile(pattern, re.U|re.M), replacement) for pattern, replacement in findreplace]
+ return findreplace
+
+def subfile(inputfile, findreplace):
+ outtext = inputfile
+ if inputfile is None:
+ return outtext
+
+ for counter, pattern_replace in enumerate(findreplace):
+ try:
+ outtext, subs = re.subn(pattern_replace[0], pattern_replace[1], outtext)
+ if _DEBUG and subs > 0:
+ print pattern_replace[0].pattern
+ print pattern_replace[1]
+ print "=" * 30
+ except KeyboardInterrupt:
+ print counter, ' substituted: ', subs
+ print pattern_replace[0], "||||", pattern_replace[1], outtext
+ return outtext
+
+# Dynamically replace references with links
+def parsesections(pattern, pattern_replace, section):
+ sectionsref = re.search(pattern, section)
+ while sectionsref:
+ i1 = sectionsref.start(1)
+ i2 = sectionsref.end(2)
+ #print "found multiple secs at", i1, "-", i2
+ section = section[:i1]+re.sub(pattern_replace[0], pattern_replace[1] % sectionsref.group(2), section[i1:i2]) + section[1+i2:]
+ sectionsref = re.search(pattern, section)
+ return section
+
+def parsenamedacts(pattern, intext):
+ namedacts = re.findall(pattern, intext)
+ namedacts = list(set(namedacts))
+ outtext = intext
+ for namedact in namedacts:
+ #outtext = outtext.replace(namedact+r'@/', encode_act(namedact)+r'@/')
+ outtext = outtext.replace(r'ref-namedact-'+namedact,r'ref-namedact-'+encode_act(namedact))
+ return outtext
+
+def encode_act(namedact):
+ #encoding = namedact.translate(None, string.punctuation) # removed because it is fails with unicode objects
+ encoding = ''.join([char if char not in string.punctuation else '' for char in namedact])
+ encodelen = str(len(encoding))
+ encoding = [char if char not in string.lowercase+" " else '' for char in encoding] #[encoding.translate(None, string.lowercase + r' ')]
+ encoding.append('-' + encodelen)
+ out = "".join(encoding)
+ return out
+
+# Loop through sections with various parsing passes
+def parsers(uscfiles, findreplace):
+ parsedfiles = []
+ for counter, section in enumerate(uscfiles):
+
+ #print ""
+ #print "File", counter
+ parsedfile = subfile(section, findreplace)
+ #print parsedfile
+ # Replace Multiple Section references with links
+ # Include [^<] to make sure no group is transformed twice
+ pattern = r'@@@\s[Ss]ections?\s([^<]*?)@@@@@(.*?)@@'
+ pattern_replace = [r'%s' % u'(\d+\w*(?:\(\w+\))*[-|–]?\d*)([, @])', r'<a href="/laws/target/%s/\1" class="sec">\1</a>\2']
+ parsedfile = parsesections(pattern, pattern_replace, parsedfile)
+ #parsedfile = re.sub(r'@of-ref@', r'ref-Title-'+title, parsedfile)
+ parsedfile = re.sub(r'@@ref-.*?@', r'', parsedfile)
+ #parsedfile = re.sub(r'ref-title-this', r'ref-title-'+title, parsedfile)
+
+
+
+ # Encode Named Acts by removing lowercase and non-word characters, and appending the length of the name w/o non-word characters
+ #pattern = r'@@ref-namedact-(.*?)@@'
+ pattern = r'/ref-namedact-(.*?)/'
+ parsedfile = parsenamedacts(pattern, parsedfile)
+
+ parsedfile = re.sub(r'@of-ref@', r'ref-title-this', parsedfile)
+ parsedfile = re.sub(r'@@ref-.*?@', r'', parsedfile)
+
+ # Remove remaining @
+ parsedfile = parsedfile.replace('@','')#.translate(None, '@')
+ parsedfiles.append(parsedfile)
+ return parsedfiles
+
+def parse(data=False):
+ uscfiles = [data, ]#.split(divider)#splithtml()
+ findreplace = makergxlist(regfile)
+ parsedfiles = parsers(uscfiles, findreplace)
+
+ return parsedfiles
+
Oops, something went wrong.

0 comments on commit e43d8c2

Please sign in to comment.