Added changes from Bill Bushey, Brian Young. body_text() method goes from being O(n^3) to O(n^2)
aidanf · Apr 2, 2010 · eac90e6 · eac90e6
1 parent 3f97727
commit eac90e6
Show file tree

Hide file tree

Showing 5 changed files with 122 additions and 15 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/BodyTextExtractor.py b/BodyTextExtractor.py
@@ -1,18 +1,23 @@
 #!/usr/bin/python
 
+'''
+Modified by Bill Bushey <wbushey@acm.org> and Brian Young <byoung061@gmail.com> on August 10th, 2009
+'''
+
 import htmllib
 import formatter
 import string 
 import sys,urllib
+import time
 
 class HtmlTokenParser(htmllib.HTMLParser):
-    	# return a dictionary mapping anchor texts to lists
-   	# of associated hyperlinks     
+	# return a dictionary mapping anchor texts to lists
+	# of associated hyperlinks     
 	def __init__(self, verbose=0):
-        	self.tokens = []
+		self.tokens = []
 		self.binary_tokens = []
-        	f = formatter.NullFormatter()
-        	htmllib.HTMLParser.__init__(self, f, verbose)     
+		f = formatter.NullFormatter()
+		htmllib.HTMLParser.__init__(self, f, verbose)     
 	def unknown_tag(self):
 		self.tokens.append("TAG")
 		self.binary_tokens.append(1)
@@ -34,25 +39,32 @@ def handle_endtag(self,tag, method):
 		self.binary_tokens.append(1)
 
 class HtmlBodyTextExtractor(HtmlTokenParser):
+	''' Modified to include the initialization of total_tokens_before'''
 	def __init__(self):
 		HtmlTokenParser.__init__(self)
 		self.encoded = [0]
+		self.total_tokens_before = [0]
 		self.lookup0N = [0]
-		self.lookupN0 = [0]		
+		self.lookupN0 = [0]
 		self.body_txt = ""
 
 	def close(self):
 		HtmlTokenParser.close(self)
 		self._encode_binary_tokens()
 		self._initialise_lookups()
 
+	''' Modified to set values in total_tokens_before'''
 	def _encode_binary_tokens(self):
 		i = 0
 		for x in self.binary_tokens:
 			if(abs(x + self.encoded[i]) < abs(self.encoded[i])):
-				i = i + 1
 				self.encoded.append(0)
-			self.encoded[i] = self.encoded[i] + x			
+				self.total_tokens_before.append(self.total_tokens_before[-1])
+				i = i + 1
+			self.encoded[i] = self.encoded[i] + x
+			self.total_tokens_before[i] = self.total_tokens_before[i] + 1
+		# total_tokens_before works better in the rest of the class if we shift all values up one index
+		self.total_tokens_before.insert(0,0) 
 
 	def _initialise_lookups(self):
 		t = 0
@@ -71,30 +83,90 @@ def _initialise_lookups(self):
 		del(self.lookupN0[0]) #will never need these values
 		del(self.lookup0N[-1])
 
+	'''
+	This method has been modified to run in O(1).
+	This version of the method works with the assumption that all nodes are
+	either text or tags. Since we can quickly find out the number of tags
+	that have occurred up to a given region, and the total number of tokens
+	up to that region, we can quickly calculate the number of text nodes
+	that have occurred up to that region.
+
+	The original method is available as _objective_fcn_old.
+	'''
 	def _objective_fcn(self,i,j):
+		tags_to_i = self.lookup0N[i]
+		tags_after_j = self.lookupN0[j]
+
+		text_to_i = self.total_tokens_before[i] - tags_to_i
+		text_to_j = self.total_tokens_before[j] - self.lookup0N[j]
+
+		text_between_i_j = text_to_j - text_to_i
+		return_val = tags_to_i + tags_after_j + text_between_i_j
+		return return_val
+
+	'''
+	The original method, which is in O(n)
+	'''
+	def _objective_fcn_old(self,i,j):
 		return_val = self.lookup0N[i] + self.lookupN0[j]
 		for x in self.encoded[i:j]:
 			if(x<0):
 				return_val = return_val - x
 		return return_val
 
+
 	def _is_tag(self,s):
 		if(s[0]=='<' and s[-1]=='>'):
 			return(1)
 		else:
 			return(0)
 
+	'''
+	Method which uses the modified version of _objective_fcn; this method runs in O(n^2).
+	It has also been modified to improve the finding of the 'start' and 'end' variables.
+	Finally, body_text now uses the join method for building the output string.
+	'''
 	def body_text(self):
+		self.body_txt = ""
 		obj_max = 0
 		i_max = 0
 		j_max = len(self.encoded)-1
 		for i in range(len(self.encoded)-1):
+			if self.encoded[i] > 0:	
+				continue
 			for j in range(i,len(self.encoded)):
+				if self.encoded[j] > 0:
+					continue
 				obj = self._objective_fcn(i,j)
 				if(obj > obj_max):
 					obj_max = obj
 					i_max = i
 					j_max = j
+		start = self.total_tokens_before[i_max]
+		end = self.total_tokens_before[j_max]
+
+		self.body_txt = " ".join(x for x in self.tokens[start:end] if not self._is_tag(x))
+
+		# This is added for testing purposes, so that the old and new versions produce the same string.
+		self.body_txt = self.body_txt + " "
+
+		return(self.body_txt)	
+
+	'''
+	Method which uses _objective_fcn_old, this function is in O(n^3)
+	'''
+	def body_text_old(self):
+		self.body_txt = ""
+		obj_max = 0
+		i_max = 0
+		j_max = len(self.encoded)-1
+		for i in range(len(self.encoded)-1):
+			for j in range(i,len(self.encoded)):
+				obj = self._objective_fcn_old(i,j)
+				if(obj > obj_max):
+					obj_max = obj
+					i_max = i
+					j_max = j
 		start = 0
 		end = 0
 		for x in self.encoded[:i_max]:
@@ -104,31 +176,45 @@ def body_text(self):
 		for x in self.tokens[start:-end]:
 			if(not(self._is_tag(x))):
 				self.body_txt = self.body_txt + x + " "
-		return(self.body_txt)	
+		return(self.body_txt)
 
 	def summary(self, start=0, bytes=255):
 		if(not(self.body_txt)):
 			self.body_text()
 		return(self.body_txt[start:(start+bytes)])
 
+	'''
+	Modified to use the more efficient join method for building the string
+	'''
 	def full_text(self):
 		ft = ""
-		for x in self.tokens[0:-1]:
-			if(not(self._is_tag(x))):
-				ft = ft + x + " "
+		ft = " ".join(x for x in self.tokens if not self._is_tag(x))
 		return ft
 
 if __name__ == '__main__':
 	html = open(sys.argv[1]).read()
+	t0 = time.clock()
 	p = HtmlBodyTextExtractor()
 	p.feed(html)
 	p.close()
+	r10 = range(10)
+	t1 = time.clock()
+	for r in r10:
+		x = p.body_text()
+	t2 = time.clock()
+	for r in r10:
+		z = p.body_text_old()
+	t3 = time.clock()
 	x = p.body_text()
+	z = p.body_text_old()
 	s = p.summary()
 	t = p.full_text()
-	print "\n\nSummary:\n",s
-	print "\nBodytext:\n",x
-	print "\nFulltext:\n",t
+#	print "\nNew Bodytext:\n",x
+#	print "\nOld Bodytext:\n",z
+#	print "\nFull Text:\n",t
+	if (x == z):
+		print "The SAME!!!!!\n"
+	print "Time to initialize: %f\nTime for new method: %f\nTime for old method: %f\n" % (t1-t0, t2-t1, t3-t2)
 
 # (c) 2001 Aidan Finn
 # Released under the terms of the GNU GPL

diff --git a/CONTRIB b/CONTRIB
@@ -0,0 +1,5 @@
+Original version: Aidan Finn (http://www.aidanf.net)
+
+Aug. 2009: Updates that change the body_text() method from O(n^3) to O(n^2): Bill Bushey, Brian Young.
+
+To contribute, fork the GitHub repository, make your changes, push them to your fork, and send me a pull request. See http://help.github.com/forking/ for more details.
diff --git a/VERSION b/VERSION
@@ -0,0 +1 @@
+0.2 - April 02, 2010
diff --git a/example.py b/example.py
@@ -0,0 +1,14 @@
+import sys,BodyTextExtractor
+
+# Usage: python example.py html_file
+
+html = open(sys.argv[1]).read()
+p = BodyTextExtractor.HtmlBodyTextExtractor()
+p.feed(html)
+p.close()
+x = p.body_text()
+s = p.summary()
+t = p.full_text()
+print "\n\nSummary:\n",s
+print "\nBodytext:\n",x
+print "\nFulltext:\n",t