VERSION 1.0

abhinavgupta · Sep 7, 2013 · 7abc9b4 · 7abc9b4
1 parent fc00be5
commit 7abc9b4
Show file tree

Hide file tree

Showing 25 changed files with 333 additions and 209 deletions.
diff --git a/ENS/GoogleNews.py → ENS/GoogleRSSReader/GoogleNews.py b/ENS/GoogleNews.py → ENS/GoogleRSSReader/GoogleNews.py
diff --git a/ENS/GoogleRSSReader/__init__.py b/ENS/GoogleRSSReader/__init__.py
diff --git a/ENS/Summarize.py b/ENS/Summarize.py
diff --git a/ENS/Summary.py → ENS/TextRankSummarize/Summary.py b/ENS/Summary.py → ENS/TextRankSummarize/Summary.py
@@ -15,7 +15,7 @@ def textRank(document):
     nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
     scores = nx.pagerank(nx_graph)
     text_rank_graph = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
-    number_of_nodes = int(0.25*len(text_rank_graph))
+    number_of_nodes = int(0.3*len(text_rank_graph))
 
     if number_of_nodes < 3:
 		number_of_nodes = 3

diff --git a/ENS/TextRankSummarize/__init__.py b/ENS/TextRankSummarize/__init__.py
diff --git a/ENS/__init__.py b/ENS/__init__.py
@@ -1,9 +1,5 @@
-from ENS.Readability import Document
-from ENS.GoogleNews import newsSearch
-from ENS.Summarize import SimpleSummarizer
-from ENS.Summary import textRank
-from ENS.cleaners import *
-from ENS.debug import *
-from ENS.encoding import *
-from ENS.htmls import *
-from ENS.fetch_url import *
+from ENS.lxmlReadability.Readability import Document
+from ENS.bsReadability.bsReadability import readable
+from ENS.fetch_url import *
+from ENS.GoogleRSSReader.GoogleNews import newsSearch
+from ENS.TextRankSummarize.Summary import textRank
diff --git a/ENS/bsReadability/__init__.py b/ENS/bsReadability/__init__.py
diff --git a/ENS/bsReadability/bsReadability.py b/ENS/bsReadability/bsReadability.py
@@ -0,0 +1,204 @@
+# Adapted from http://github.com/scyclops/Readable-Feeds/blob/master/readability/hn.py
+# License: GPL3
+
+from __future__ import unicode_literals
+
+import os
+import sys
+import urllib
+import urlparse
+import re
+import HTMLParser
+
+from BeautifulSoup import BeautifulSoup
+
+
+# Keywords tested against a container's class/id value in grabContent();
+# a hit lowers that container's score.
+NEGATIVE = re.compile("comment|meta|footer|footnote|foot")
+# Keywords tested against a container's class/id value in grabContent();
+# a hit raises that container's score.
+POSITIVE = re.compile("post|hentry|entry|content|text|body|article")
+# Character class of ASCII punctuation; not referenced elsewhere in this module.
+PUNCTUATION = re.compile("""[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]""")
+# Any "<...>" span (non-greedy); readable() uses it to strip markup.
+CLEANUP = re.compile("<(.*?)>")
+
+
+# TODO: have sub-classes for specific exceptions
+class ReadabilityError(Exception):
+    """Base class for all readability related exceptions.
+
+    Raised by grabContent() when the HTML cannot be parsed or when no
+    candidate container is found.
+    """
+
+
+# XXX: we should auto-detect the encoding
+# Fallback codec: the default for grabContent()'s ``encoding`` argument, and
+# what main() uses to decode fetched pages and to write its temporary file.
+DEFAULT_ENCODING = 'latin-1'
+
+def _attrScore(tag):
+    """Return the score adjustment earned by ``tag``'s class and id values.
+
+    Each of the two attributes that is present costs 50 points when its
+    value contains a NEGATIVE keyword and earns 25 points when it contains
+    a POSITIVE keyword.
+    """
+    score = 0
+    for name in ("class", "id"):
+        # Tag.get() reads the attribute; ``name in tag`` would test the
+        # tag's children instead and never see the attribute.
+        value = tag.get(name)
+        if not value:
+            continue
+        # search(), not match(): the keyword may sit anywhere in the value
+        # (e.g. "main-content"), not only at its start.
+        if NEGATIVE.search(value):
+            score -= 50
+        if POSITIVE.search(value):
+            score += 25
+    return score
+
+
+def grabContent(link, html, encoding=DEFAULT_ENCODING):
+    """Return (TITLE, CONTENT) where CONTENT is the readable part of ``html``.
+
+    Every element that directly contains a <p> is scored: class/id keywords
+    (see _attrScore), one point per paragraph longer than 10 characters and
+    one point per comma. The best-scoring element is then stripped of
+    style/class attributes, clutter <div>s, forms, objects and iframes, and
+    its links are made absolute.
+
+    Args:
+        link: URL of the page; base for resolving relative href/src values.
+        html: markup of the page.
+        encoding: codec used to decode the rendered markup.
+
+    Returns:
+        A (title, content) tuple. ``title`` is the text of the <title>
+        element, or an empty string when the page has none.
+
+    Raises:
+        ReadabilityError: if BeautifulSoup cannot parse ``html`` or the page
+            contains no <p> element.
+    """
+    # Replace all doubled-up <BR> tags with <P> tags, and (TODO) remove fonts.
+    replaceBrs = re.compile("<br */? *>[ \r\n]*<br */? *>")
+    html = replaceBrs.sub("</p><p>", html)
+
+    try:
+        soup = BeautifulSoup(html)
+    except HTMLParser.HTMLParseError as e:
+        raise ReadabilityError('BeautifulSoup parse error: %s' % e)
+
+    # REMOVE SCRIPTS
+    for s in soup.findAll("script"):
+        s.extract()
+
+    # Score the direct parent of every paragraph. Parents are tracked by
+    # identity: comparing tags with ``in`` uses equality, which would merge
+    # two distinct containers that merely look alike.
+    parents = []
+    seen = set()
+    for paragraph in soup.findAll("p"):
+        parent = paragraph.parent
+        if id(parent) not in seen:
+            seen.add(id(parent))
+            parents.append(parent)
+            parent.score = _attrScore(parent)
+
+        # NOTE(review): renderContents() is called with its default output
+        # encoding and the bytes are then decoded with ``encoding``; confirm
+        # the two agree for pages containing non-ASCII text.
+        innerText = paragraph.renderContents().decode(encoding)
+
+        # Add a point for the paragraph found
+        if len(innerText) > 10:
+            parent.score += 1
+
+        # Add points for any commas within this paragraph
+        parent.score += innerText.count(",")
+
+    if not parents:
+        raise ReadabilityError("no topParent")
+
+    # Highest score wins; on a tie the container seen first is kept.
+    topParent = parents[0]
+    for parent in parents[1:]:
+        if parent.score > topParent.score:
+            topParent = parent
+
+    # REMOVES ALL STYLESHEETS ...
+    for s in soup.findAll("link", attrs={"type": "text/css"}):
+        s.extract()
+
+    # Remove all style tags in head
+    for s in soup.findAll("style"):
+        s.extract()
+
+    # CLEAN STYLES FROM ELEMENTS IN TOP PARENT
+    for ele in topParent.findAll(True):
+        del ele['style']
+        del ele['class']
+
+    _killDivs(topParent, encoding)
+    _clean(topParent, "form")
+    _clean(topParent, "object")
+    _clean(topParent, "iframe")
+
+    _fixLinks(topParent, link)
+
+    # A page without a <title> must not abort the extraction.
+    titleTag = soup.find('title')
+    title = titleTag.text if titleTag is not None else ''
+    content = topParent.renderContents().decode(encoding)
+
+    return title, content
+
+
+def _fixLinks(parent, link):
+    """Rewrite every href and src under ``parent`` as an absolute URL.
+
+    Args:
+        parent: element whose descendants are updated in place.
+        link: base URL that relative values are resolved against.
+    """
+    for tag in parent.findAll(True):
+        for attr in ("href", "src"):
+            # Use the documented accessor rather than the tag's internal
+            # attrMap, so the lookup does not depend on that map having
+            # been built already. get() returns None for a missing
+            # attribute.
+            value = tag.get(attr)
+            if value is not None:
+                tag[attr] = urlparse.urljoin(link, value)
+
+
+def _clean(top, tag, minWords=10000):
+    """Detach ``tag`` elements under ``top`` that carry little text.
+
+    An element is removed when its rendered contents hold fewer than
+    ``minWords`` space characters.
+    """
+    for element in top.findAll(tag):
+        spaceCount = element.renderContents().count(" ")
+        if spaceCount >= minWords:
+            # Wordy enough to keep.
+            continue
+        element.extract()
+
+
+def _killDivs(parent, encoding):
+    """Detach <div> elements under ``parent`` that look like clutter.
+
+    A div is removed when all of the following hold:
+      * its rendered contents contain fewer than 10 commas,
+      * it contains no <pre> and no <code> element,
+      * it has no <p>, or has an <embed>, or holds more <img>, <li> or <a>
+        elements than <p> elements.
+
+    Args:
+        parent: element whose descendant divs are examined.
+        encoding: codec used to decode each div's rendered contents.
+    """
+    def count(node, name):
+        return len(node.findAll(name))
+
+    for div in parent.findAll("div"):
+        commas = div.renderContents().decode(encoding).count(",")
+        if commas >= 10:
+            # Plenty of commas: treated as real prose.
+            continue
+
+        # DEVIATION: XXX: why do this?
+        if count(div, "pre") != 0 or count(div, "code") != 0:
+            continue
+
+        paragraphs = count(div, "p")
+        ominous = (
+            paragraphs == 0
+            or count(div, "embed") > 0
+            or any(count(div, name) > paragraphs
+                   for name in ("img", "li", "a"))
+        )
+        if ominous:
+            div.extract()
+
+
+def readable(url, html, DEFAULT_ENCODING=DEFAULT_ENCODING):
+    """Return the readable version of this URL/HTML as plain text.
+
+    Args:
+        url: address of the page; passed to grabContent() as the base for
+            relative links.
+        html: markup of the page.
+        DEFAULT_ENCODING: codec handed to grabContent(). Defaults to the
+            module-level DEFAULT_ENCODING so two-argument calls, such as
+            the one in main(), work.
+
+    Returns:
+        The page title, a newline, then the extracted content with every
+        "<...>" span removed.
+
+    Raises:
+        ReadabilityError: propagated from grabContent().
+    """
+    title, content = grabContent(url, html, DEFAULT_ENCODING)
+    # Strip the remaining markup so only text is left.
+    content = CLEANUP.sub("", content)
+    return "{title}\n{content}".format(title=title, content=content)
+
+
+def main():
+    """Command-line entry point: show the readable version of each URL.
+
+    Each URL given on the command line is fetched, decoded with
+    DEFAULT_ENCODING and passed through readable(). The result is printed,
+    or, with -b/--open-browser, written to a temporary file that is opened
+    in a web browser. Exits with status 2 when no URL is given.
+    """
+    import webbrowser
+    from contextlib import closing
+    from tempfile import mkstemp
+    from optparse import OptionParser
+    import codecs
+
+    usage = "usage: %prog [options] URL1 URL2 ..."
+    parser = OptionParser(usage=usage)
+    parser.add_option(b"-b", b"--open-browser",
+                  action="store_true", dest="open_browser", default=False,
+                  help=b"show the readable version in a web browser")
+    (options, args) = parser.parse_args()
+
+    if not args:
+        print(parser.format_help())
+        sys.exit(2)
+
+    for url in args:
+        # closing() releases the connection even when read() fails.
+        with closing(urllib.urlopen(url)) as response:
+            html = response.read().decode(DEFAULT_ENCODING)
+        # Pass the encoding explicitly: readable() takes it as its third
+        # argument.
+        readable_html = readable(url, html, DEFAULT_ENCODING)
+        if options.open_browser:
+            # mkstemp's first positional argument is the file name suffix.
+            fd, fn = mkstemp('readability.html')
+            os.close(fd)
+            with codecs.open(fn, 'w', encoding=DEFAULT_ENCODING) as f:
+                f.write(readable_html)
+            webbrowser.open('file://' + os.path.abspath(fn))
+        else:
+            print(readable_html)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/ENS/fetch_url.py b/ENS/fetch_url.py
@@ -1,9 +1,11 @@
 import threading, urllib2
 import Queue
 
+DEFAULT_ENCODING = 'latin-1'
+
 def read_url(url, queue):
 	try:
-    		data = urllib2.urlopen(url).read()
+    		data = urllib2.urlopen(url).read().decode(DEFAULT_ENCODING)
 	except urllib2.HTTPError, e:
 	    checksLogger.error('HTTPError = ' + str(e.code))
 	except urllib2.URLError, e:
@@ -15,7 +17,7 @@ def read_url(url, queue):
 	    checksLogger.error('generic exception: ' + traceback.format_exc())
 
     	print('Fetched %s from %s' % (len(data), url))
-    	queue.put(data)
+    	queue.put([url,data])
 
 def fetch_parallel(list_of_urls):
     result = Queue.Queue()

diff --git a/ENS/Readability.py → ENS/lxmlReadability/Readability.py b/ENS/Readability.py → ENS/lxmlReadability/Readability.py
diff --git a/ENS/lxmlReadability/__init__.py b/ENS/lxmlReadability/__init__.py
diff --git a/ENS/cleaners.py → ENS/lxmlReadability/cleaners.py b/ENS/cleaners.py → ENS/lxmlReadability/cleaners.py
diff --git a/ENS/debug.py → ENS/lxmlReadability/debug.py b/ENS/debug.py → ENS/lxmlReadability/debug.py
diff --git a/ENS/encoding.py → ENS/lxmlReadability/encoding.py b/ENS/encoding.py → ENS/lxmlReadability/encoding.py
diff --git a/ENS/htmls.py → ENS/lxmlReadability/htmls.py b/ENS/htmls.py → ENS/lxmlReadability/htmls.py
diff --git a/Examples/Extract_News_Summary.py → Examples/ENS_Soup.py b/Examples/Extract_News_Summary.py → Examples/ENS_Soup.py
@@ -1,29 +1,26 @@
-from ENS import Document, fetch_url, textRank, newsSearch, SimpleSummarizer
+from ENS import readable, fetch_url, textRank, newsSearch
 import sys
 import re
 
 number_of_links = int(sys.argv[1])
 query = '+'.join(sys.argv[2:])
-regex = re.compile("<(.*?)>|\&#13;")
+DEFAULT_ENCODING = 'latin-1'
 
 article_list = []
 summary_list = []
 
 links = newsSearch(query,number_of_links)
 
-
 if not links:
 	print "No links found"
 
 else:
 	result = fetch_url.fetch_parallel(links)
 
 	while not result.empty():
-		article = Document(result.get()).summary() 
-		article = re.sub(regex, "", article)
-		article = article.encode('ascii','ignore')
-		ss = SimpleSummarizer()
-		summary = ss.summarize(article,5)
+		url_entry = result.get()
+		article = readable(url_entry[0],url_entry[1],DEFAULT_ENCODING) 
+		summary = textRank(article)
 		summary = summary.encode('ascii','ignore')
 		article_list.append(article)
 		summary_list.append(summary)

diff --git a/Examples/Extract_News_Summary_TextRank.py → Examples/ENS_lxml.py b/Examples/Extract_News_Summary_TextRank.py → Examples/ENS_lxml.py
@@ -18,7 +18,8 @@
 	result = fetch_url.fetch_parallel(links)
 
 	while not result.empty():
-		article = Document(result.get()).summary() 
+		url_entry = result.get()
+		article = Document(url_entry[1]).summary() 
 		article = re.sub(regex, "", article)
 		article = article.encode('ascii','ignore')
 		summary = textRank(article)
@@ -30,7 +31,7 @@
 """ All the outputs are written to appropriate files in this part of the code """
 
 for i in range(0,number_of_links):
-	f2 = open(query + str(i),'w')
-	f2.write(article_list[i-1] + '\n SUMMARY OF THE ABOVE ARTICLE: \n' + summary_list[i-1])
-	f2.close()
-	
+    print str(i)
+    print article_list[i-1]
+    print "*** SUMMARY ***"
+    print summary_list[i-1]
diff --git a/Narendra+Modi b/Narendra+Modi
@@ -0,0 +1,12 @@
+0
+BJP ropes in Ajmer clerics to back Narendra Modi - The Times of India
+JAIPUR: As part of the BJP's grand Muslim outreach strategy, clerics from  Ajmer Sharif, in their traditional skullcap and sherwani, will be seen honouring  Narendra Modi and senior party leaders during the September 10 rally in the Pink City. The ceremony has been carefully crafted by the  BJP top brass which has asked its minority wing to ensure a visible and sizeable gathering of the community during the rally. Conscious of the images which would be beamed through the media, the party has issued specific directive that the Muslim representatives ought to be in traditional attire like men in sherwani and women in burqa. BJP leader and Dargah khadim  Syed Afshan Chishty, who performs Ziarat for state BJP chief Vasundhara Raje, is busy preparing for the rally. He will felicitate the guests including BJP president Rajnath Singh and the Gujarat CM with 'dastarbandi' (honouring in a traditional manner by giving 'pagdi' or 'chunni' along with the Dargah's picture). "I will invite all of them to the Dargarh for Ziarat," said Chishty. "We have sent text messages to over 1 lakh primary members from the minority community across the state and expect around 10000 of them including 2000 burqa-clad women. Men will compulsorily wear skullcaps to indicate that Muslims are with the BJP," Amin Khan Pathan, state president of BJP minorities' cell, told TOI on Friday. National executive member of BJP's minority morcha and Dargah khadim, Syed Ibrahim Fakhar, was directed by the BJP leaders to gather Muslims for the rally, mostly in their traditional attire. "I have asked our people to come in our traditional dresses kurta pajama or sherwanis to give a clear message that Muslims are embracing the BJP," said Fakhar. The party has booked four buses to ferry people. 
Sources said the BJP's top leadership had directed the Muslim leaders in the state recently to ensure maximum participation from the community at the rally which will virtually kickstart the party's campaign for the coming assembly polls. The BJP leadership is aware that the Muslim community is decisive in at least 40 assembly constituencies. Party managers want to invade the Congress' traditional votebank with an eye on wresting power. Muslim leaders, especially from Tonk, Sawai Madhopur and Nagaur, are also keen on ensuring the rally's success as they eye party ticket for the polls. Rajnath Singh is also not leaving any platform to bring the minority community into the party fold. At a BJP minorities morcha meet recently, he stated,"Forget communal violence of 2002 and join hands for better future." During another conference in Jaipur, he asked the minorities, especially in BJP-ruled states, to address their grievances to him directly.  
+ *** SUMMARY *** 
+Sources said the BJP's top leadership had directed the Muslim leaders in the state recently to ensure maximum participation from the community at the rally which will virtually kickstart the party's campaign for the coming assembly polls. National executive member of BJP's minority morcha and Dargah khadim, Syed Ibrahim Fakhar, was directed by the BJP leaders to gather Muslims for the rally, mostly in their traditional attire. Conscious of the images which would be beamed through the media, the party has issued specific directive that the Muslim representatives ought to be in traditional attire like men in sherwani and women in burqa. BJP ropes in Ajmer clerics to back Narendra Modi - The Times of India
+JAIPUR: As part of the BJP's grand Muslim outreach strategy, clerics from  Ajmer Sharif, in their traditional skullcap and sherwani, will be seen honouring  Narendra Modi and senior party leaders during the September 10 rally in the Pink City.
+1
+Not just coal files, the entire government is missing: Narendra Modi - The Economic Times
+ NEW DELHI: Directly attacking the Prime Minister,  Gujarat Chief Minister Narendra Modi on Saturday blamed  Manmohan Singh for the drastic fall in rupee and the current economic downturn.  Addressing a rally in Ambikapur, Modi drew a parallel between Manmohan Singh and Chhattisgarh Chief Minister Raman Singh. "Both Manmohan Singh and  Raman Singh are doctors. While Raman Singh has worked for Chhattisgarh's progress, Manmohan Singh has sent the rupee to the hospital," he mocked. "Rupee is fighting for its life because of the doctor in Delhi," Modi hit out.  Raising the issue of Coalgate, Modi said that not just coal files, but the entire government is missing. "There was a debate on coal files in Parliament, SC is also asking. Parliament is worried but people asking where is the Government," he said. "Government is lost, money is lost from the coffers, their honour is lost," Modi said.  Modi also took a dig at Congress Vice President  Rahul Gandhi for his comment on poverty being a state of mind. "The person on whom the Congress is banking on said poverty is a state of mind," he said. "To say that poverty is a state of mind is making a mockery of the poor in the country," Modi added.  "Our dream is to create employment for the youth in the country," Modi said. "Congress is arrogant, it feels no need to be accountable and answerable to this nation," he said. "Uttarakhand, Jharkhand, Chhattisgarh are prime examples of how BJP can create great states, as opposed to Congress which has created Telangana amidst the entire dispute," Modi said.  Modi lauded Raman Singh for his developmental work in Chhattisgarh. "I applaud Dr Raman Singh for not depending on the central government for Chhattisgarh's development," he said.  The rally was a part of Raman Singh's state-wide "Vikas Yatra". Ambikapur is located in the state's Sarguja District, and is about 250 kilometres from state capital Raipur.  
Earlier this week, Modi said he does not dream of occupying the top post (of Prime Minister) and would respect the people's mandate for serving the state till 2017.  "I never see such dreams (of becoming PM), nor am I going to see such dreams. People of Gujarat have given me the mandate to serve them till 2017 and I have to do this with full strength," Modi said earlier this week, in remarks that could be seen as subtle expression of his displeasure over the continued delay in formally annointing him BJP's prime ministerial nominee. 
+ *** SUMMARY *** 
+"The person on whom the Congress is banking on said poverty is a state of mind," he said. "To say that poverty is a state of mind is making a mockery of the poor in the country," Modi added. Not just coal files, the entire government is missing: Narendra Modi - The Economic Times
+ NEW DELHI: Directly attacking the Prime Minister,  Gujarat Chief Minister Narendra Modi on Saturday blamed  Manmohan Singh for the drastic fall in rupee and the current economic downturn. Earlier this week, Modi said he does not dream of occupying the top post (of Prime Minister) and would respect the people's mandate for serving the state till 2017. "Our dream is to create employment for the youth in the country," Modi said.