Skip to content

Commit

Permalink
Added changes from Bill Bushey, Brian Young. body_txt() method goes f…
Browse files Browse the repository at this point in the history
…rom being O(n^3) to O(n^2)
  • Loading branch information
aidanf committed Apr 2, 2010
1 parent 3f97727 commit eac90e6
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 15 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
*.pyc
116 changes: 101 additions & 15 deletions BodyTextExtractor.py 100755 → 100644
@@ -1,18 +1,23 @@
#!/usr/bin/python

'''
Modified by Bill Bushey <wbushey@acm.org> and Brian Young <byoung061@gmail.com> on August 10th, 2009
'''

import htmllib
import formatter
import string
import sys,urllib
import time

class HtmlTokenParser(htmllib.HTMLParser):
# return a dictionary mapping anchor texts to lists
# of associated hyperlinks
# return a dictionary mapping anchor texts to lists
# of associated hyperlinks
def __init__(self, verbose=0):
# Create the empty token list and binary token list, then initialise the
# underlying htmllib.HTMLParser with a NullFormatter.
# verbose is passed straight through to htmllib.HTMLParser.__init__.
# NOTE(review): several statements below appear twice and the block has no
# indentation -- this looks like removed and added diff lines shown together.
# Confirm against the real file before relying on the exact statement list.
self.tokens = []
self.tokens = []
self.binary_tokens = []
f = formatter.NullFormatter()
htmllib.HTMLParser.__init__(self, f, verbose)
f = formatter.NullFormatter()
htmllib.HTMLParser.__init__(self, f, verbose)
def unknown_tag(self):
self.tokens.append("TAG")
self.binary_tokens.append(1)
Expand All @@ -34,25 +39,32 @@ def handle_endtag(self,tag, method):
self.binary_tokens.append(1)

class HtmlBodyTextExtractor(HtmlTokenParser):
''' Modified to include the initialization of total_tokens_before'''
def __init__(self):
# Initialise the parent token parser, then the per-document state that
# close() and body_text() fill in and read.
# NOTE(review): self.lookupN0 is assigned twice below; this looks like an
# old and a new diff line shown together (harmless, but confirm).
HtmlTokenParser.__init__(self)
# One entry per run of tokens; _encode_binary_tokens() accumulates into it.
self.encoded = [0]
# Running token counts, maintained alongside self.encoded.
self.total_tokens_before = [0]
# Lookup tables read by _objective_fcn(); filled by _initialise_lookups().
self.lookup0N = [0]
self.lookupN0 = [0]
self.lookupN0 = [0]
# Cached result of body_text(); summary() recomputes it when it is empty.
self.body_txt = ""

def close(self):
    """Finish parsing and prepare the data used by body_text().

    Closes the underlying token parser first, then builds the run
    encoding of the binary tokens and the lookup tables derived from it.
    The order matters: the lookups are initialised after the encoding.
    """
    HtmlTokenParser.close(self)
    self._encode_binary_tokens()
    self._initialise_lookups()

''' Modified to set values in total_tokens_before'''
def _encode_binary_tokens(self):
# Fold self.binary_tokens into self.encoded (one accumulated value per run)
# and keep self.total_tokens_before in step with it.
# NOTE(review): this block has no indentation and contains two copies of
# "i = i + 1" and of "self.encoded[i] = self.encoded[i] + x". It looks like
# the pre-change and post-change diff lines are interleaved; confirm the real
# statement list against the repository before editing the logic.
i = 0
for x in self.binary_tokens:
# True when adding x moves the current run's total towards zero, i.e. x has
# the opposite sign to the run accumulated so far.
if(abs(x + self.encoded[i]) < abs(self.encoded[i])):
i = i + 1
self.encoded.append(0)
self.encoded[i] = self.encoded[i] + x
# A new entry starts from the running count reached so far.
self.total_tokens_before.append(self.total_tokens_before[-1])
i = i + 1
self.encoded[i] = self.encoded[i] + x
self.total_tokens_before[i] = self.total_tokens_before[i] + 1
# total_tokens_before works better in the rest of the class if we shift all values up one index
self.total_tokens_before.insert(0,0)

def _initialise_lookups(self):
t = 0
Expand All @@ -71,30 +83,90 @@ def _initialise_lookups(self):
del(self.lookupN0[0]) #will never need these values
del(self.lookup0N[-1])

'''
This method has been modified to run in O(1).
This version of the method works on the assumption that all nodes are
either text or tags. Since we can quickly find out the number of tags
that have occurred up to a given region, and the total number of tokens up
to that region, we can quickly calculate the number of text nodes that
have occurred up to that region.
The original method is available as _objective_fcn_old.
'''
def _objective_fcn(self,i,j):
tags_to_i = self.lookup0N[i]
tags_after_j = self.lookupN0[j]

text_to_i = self.total_tokens_before[i] - tags_to_i
text_to_j = self.total_tokens_before[j] - self.lookup0N[j]

text_between_i_j = text_to_j - text_to_i
return_val = tags_to_i + tags_after_j + text_between_i_j
return return_val

'''
The original method, which is in O(n)
'''
def _objective_fcn_old(self,i,j):
return_val = self.lookup0N[i] + self.lookupN0[j]
for x in self.encoded[i:j]:
if(x<0):
return_val = return_val - x
return return_val


def _is_tag(self,s):
if(s[0]=='<' and s[-1]=='>'):
return(1)
else:
return(0)

'''
Method which uses the modified version of _objective_fcn, this function is in O(n^2)
This method has also been modified to improve the finding of the 'start' and 'end' variables
Finally, body_text now uses the join method for building the output string
'''
def body_text(self):
    """Find the best-scoring region and return its text tokens as a string.

    Every pair (i, j) of encoded indices with i <= j is scored with
    _objective_fcn, skipping indices whose encoded value is positive. The
    first pair with the strictly highest score wins. The tokens between
    the corresponding total_tokens_before offsets are filtered with
    _is_tag and joined with single spaces. The result is stored in
    self.body_txt and returned.
    """
    self.body_txt = ""
    encoded = self.encoded
    count = len(encoded)
    best_score = 0
    best_i = 0
    best_j = count - 1
    for lo in range(count - 1):
        if encoded[lo] > 0:
            continue
        for hi in range(lo, count):
            if encoded[hi] > 0:
                continue
            score = self._objective_fcn(lo, hi)
            if score > best_score:
                best_score, best_i, best_j = score, lo, hi
    first = self.total_tokens_before[best_i]
    last = self.total_tokens_before[best_j]

    words = [tok for tok in self.tokens[first:last] if not self._is_tag(tok)]
    # The trailing space is kept so the output matches body_text_old().
    self.body_txt = " ".join(words) + " "
    return self.body_txt

'''
Method which uses _objective_fcn_old, this function is in O(n^3)
'''
def body_text_old(self):
self.body_txt = ""
obj_max = 0
i_max = 0
j_max = len(self.encoded)-1
for i in range(len(self.encoded)-1):
for j in range(i,len(self.encoded)):
obj = self._objective_fcn_old(i,j)
if(obj > obj_max):
obj_max = obj
i_max = i
j_max = j
start = 0
end = 0
for x in self.encoded[:i_max]:
Expand All @@ -104,31 +176,45 @@ def body_text(self):
for x in self.tokens[start:-end]:
if(not(self._is_tag(x))):
self.body_txt = self.body_txt + x + " "
return(self.body_txt)
return(self.body_txt)

def summary(self, start=0, bytes=255):
    """Return a slice of the body text: `bytes` characters from `start`.

    If self.body_txt is empty, body_text() is called first to compute it.
    """
    if not self.body_txt:
        self.body_text()
    stop = start + bytes
    return self.body_txt[start:stop]

'''
Modified to use the more efficient join method for building the string
'''
def full_text(self):
    """Return every non-tag token in self.tokens joined by single spaces.

    Tokens are filtered with _is_tag. The string is built with a single
    join; the earlier token-by-token concatenation loop was removed because
    its result was overwritten by the join before being returned.
    """
    return " ".join(tok for tok in self.tokens if not self._is_tag(tok))

# Command-line driver: parse the HTML file named by sys.argv[1], time the new
# and old body-text methods, and print the results.
# NOTE(review): this block has no indentation and seems to mix pre-change and
# post-change diff lines (e.g. x and z are assigned both inside the timing
# loops and again afterwards). Confirm against the real file.
# The print statements are Python 2 syntax.
if __name__ == '__main__':
html = open(sys.argv[1]).read()
t0 = time.clock()
p = HtmlBodyTextExtractor()
p.feed(html)
p.close()
r10 = range(10)
# t1 - t0: time spent feeding and closing the parser.
t1 = time.clock()
for r in r10:
x = p.body_text()
# t2 - t1: ten runs of body_text().
t2 = time.clock()
for r in r10:
z = p.body_text_old()
# t3 - t2: ten runs of body_text_old().
t3 = time.clock()
x = p.body_text()
z = p.body_text_old()
s = p.summary()
t = p.full_text()
print "\n\nSummary:\n",s
print "\nBodytext:\n",x
print "\nFulltext:\n",t
# print "\nNew Bodytext:\n",x
# print "\nOld Bodytext:\n",z
# print "\nFull Text:\n",t
# Report whether the old and new methods produced the same string.
if (x == z):
print "The SAME!!!!!\n"
print "Time to initialize: %f\nTime for new method: %f\nTime for old method: %f\n" % (t1-t0, t2-t1, t3-t2)

# (c) 2001 Aidan Finn
# Released under the terms of the GNU GPL
Expand Down
5 changes: 5 additions & 0 deletions CONTRIB
@@ -0,0 +1,5 @@
Original version: Aidan Finn (http://www.aidanf.net)

Aug. 09, Updates that change the body_text() method from O(n^3) to O(n^2): Bill Bushey, Brian Young.

To contribute, fork the github repository, make your changes and push the repository back and send me a pull request. See http://help.github.com/forking/ for more details.
1 change: 1 addition & 0 deletions VERSION
@@ -0,0 +1 @@
0.2 - April 02, 2010
14 changes: 14 additions & 0 deletions example.py
@@ -0,0 +1,14 @@
import sys,BodyTextExtractor

# Usage: python example.py html_file

html = open(sys.argv[1]).read()
p = BodyTextExtractor.HtmlBodyTextExtractor()
p.feed(html)
p.close()
x = p.body_text()
s = p.summary()
t = p.full_text()
print "\n\nSummary:\n",s
print "\nBodytext:\n",x
print "\nFulltext:\n",t

0 comments on commit eac90e6

Please sign in to comment.