Skip to content

Commit

Permalink
minor cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
cagerton committed Feb 11, 2012
1 parent ea8fc16 commit 44163d4
Showing 1 changed file with 6 additions and 9 deletions.
15 changes: 6 additions & 9 deletions xray.py
Expand Up @@ -2,28 +2,25 @@
from BeautifulSoup import BeautifulSoup
import re


class XRayHasher():
    """Hacky groundwork of a project that extracts some concept of structure from documents without looking at the content."""

    # These tokens are going to be hella fat: every token repeats the tag's
    # own signature and appends one child token, recursively.
    def gen_token(self, tag, height=1, attrs=True, prefix=""):
        """Generate the structural tokens for a given tag.

        Each token has the form ``"T:<name>;<attrs>-<child token>"``, where
        ``<attrs>`` is the sorted concatenation of ``"K:<key>;V:<value>;"``
        for every attribute (keys and values lower-cased).

        Args:
            tag: Object exposing ``name``, ``attrs`` (an iterable of
                ``(key, value)`` string pairs) and
                ``findAll(recursive=False)`` returning its direct children.
            height: Number of tree levels folded into each token. With
                ``height <= 1`` the children are not inspected at all.
            attrs: When false, attributes are left out of the token.
            prefix: Accepted for interface compatibility; not used here.

        Returns:
            A list of token strings: one per child token when
            ``height > 1``, a single token ending in ``"-LEAF"`` when
            ``height > 1`` and the tag has no children, and a single token
            ending in ``"-"`` when ``height <= 1``.
        """
        if attrs:
            # Sort the formatted pairs so attribute order in the markup
            # does not influence the token.
            at_string = "".join(sorted(
                "K:%s;V:%s;" % (key.lower(), value.lower())
                for key, value in tag.attrs
            ))
        else:
            at_string = ""

        tag_string = "T:%s;%s" % (tag.name, at_string)

        if height > 1:
            # Flatten the per-child token lists into a single list.
            child_tokens = [
                token
                for child in tag.findAll(recursive=False)
                for token in self.gen_token(child, height=height - 1, attrs=attrs)
            ]
        else:
            child_tokens = [""]

        # Only reachable with height > 1: the tag has no child tags.
        if not child_tokens:
            child_tokens = ["LEAF"]

        return ["%s-%s" % (tag_string, child_token) for child_token in child_tokens]

Expand Down

0 comments on commit 44163d4

Please sign in to comment.