Skip to content

Commit

Permalink
VERSION 1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
abhinavgupta committed Sep 7, 2013
1 parent fc00be5 commit 7abc9b4
Show file tree
Hide file tree
Showing 25 changed files with 333 additions and 209 deletions.
File renamed without changes.
Empty file added ENS/GoogleRSSReader/__init__.py
Empty file.
69 changes: 0 additions & 69 deletions ENS/Summarize.py

This file was deleted.

2 changes: 1 addition & 1 deletion ENS/Summary.py → ENS/TextRankSummarize/Summary.py
Expand Up @@ -15,7 +15,7 @@ def textRank(document):
nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
scores = nx.pagerank(nx_graph)
text_rank_graph = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
number_of_nodes = int(0.25*len(text_rank_graph))
number_of_nodes = int(0.3*len(text_rank_graph))

if number_of_nodes < 3:
number_of_nodes = 3
Expand Down
Empty file.
14 changes: 5 additions & 9 deletions ENS/__init__.py
@@ -1,9 +1,5 @@
from ENS.Readability import Document
from ENS.GoogleNews import newsSearch
from ENS.Summarize import SimpleSummarizer
from ENS.Summary import textRank
from ENS.cleaners import *
from ENS.debug import *
from ENS.encoding import *
from ENS.htmls import *
from ENS.fetch_url import *
from ENS.lxmlReadability.Readability import Document
from ENS.bsReadability.bsReadability import readable
from ENS.fetch_url import *
from ENS.GoogleRSSReader.GoogleNews import newsSearch
from ENS.TextRankSummarize.Summary import textRank
Empty file added ENS/bsReadability/__init__.py
Empty file.
204 changes: 204 additions & 0 deletions ENS/bsReadability/bsReadability.py
@@ -0,0 +1,204 @@
# Adapted from http://github.com/scyclops/Readable-Feeds/blob/master/readability/hn.py
# License: GPL3

from __future__ import unicode_literals

import os
import sys
import urllib
import urlparse
import re
import HTMLParser

from BeautifulSoup import BeautifulSoup


# Pattern tested against a candidate parent's "class"/"id" value in
# grabContent(); a hit lowers that parent's score by 50.
NEGATIVE = re.compile("comment|meta|footer|footnote|foot")
# Pattern tested against a candidate parent's "class"/"id" value in
# grabContent(); a hit raises that parent's score by 25.
POSITIVE = re.compile("post|hentry|entry|content|text|body|article")
# Character class of ASCII punctuation marks.
# NOTE(review): not referenced by any code visible in this module.
PUNCTUATION = re.compile("""[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]""")
# Shortest run of text enclosed in angle brackets; readable() substitutes
# every match with "" to drop markup from the extracted content.
CLEANUP = re.compile("<(.*?)>")


# TODO: have sub-classes for specific exceptions
class ReadabilityError(Exception):
    """Common ancestor of every exception raised by the readability code."""
    pass


# XXX: we should auto-detect the encoding
DEFAULT_ENCODING = 'latin-1'

def grabContent(link, html, encoding=DEFAULT_ENCODING):
    """Return ``(title, content)`` where ``content`` is the readable part of ``html``.

    Every element that directly contains a ``<p>`` is scored (class/id hints,
    number of paragraphs longer than 10 characters, number of commas).  The
    best-scoring element is stripped of styling, low-value ``<div>``s, forms,
    objects and iframes, has its links made absolute, and is rendered.

    Args:
        link: URL of the page; ``href``/``src`` values are resolved against it.
        html: markup of the page.
        encoding: forwarded to ``_killDivs``.  Kept for backward
            compatibility: the text returned here is rendered directly to
            unicode and therefore no longer depends on it.

    Returns:
        A ``(title, content)`` tuple.  ``title`` is the text of the
        ``<title>`` element, or an empty string when the page has none;
        ``content`` is the markup inside the winning element.

    Raises:
        ReadabilityError: if BeautifulSoup cannot parse ``html`` or the
            document contains no ``<p>`` element.
    """
    # Replace all doubled-up <br> tags with paragraph breaks.
    replaceBrs = re.compile("<br */? *>[ \r\n]*<br */? *>")
    html = re.sub(replaceBrs, "</p><p>", html)

    try:
        soup = BeautifulSoup(html)
    except HTMLParser.HTMLParseError as e:
        raise ReadabilityError('BeautifulSoup parse error: %s' % e)

    # Scripts never contribute readable text.
    for s in soup.findAll("script"):
        s.extract()

    # Score the parent of every paragraph; the best one becomes the content.
    parents = []
    for paragraph in soup.findAll("p"):
        parent = paragraph.parent

        if parent not in parents:
            parents.append(parent)
            parent.score = 0

            # Look for a telling class name or id.  The attribute has to be
            # fetched with get(): ``"class" in parent`` on a BeautifulSoup 3
            # Tag tests the tag's *children*, so the hints were never applied.
            # search() is used instead of match() so that a hint is found
            # anywhere in the value (e.g. "main-content"), not only at its
            # very start.
            for attr in ("class", "id"):
                value = parent.get(attr)
                if not value:
                    continue
                if NEGATIVE.search(value):
                    parent.score -= 50
                if POSITIVE.search(value):
                    parent.score += 25

        # NOTE(review): presumably guards against a parent that compares
        # equal to an earlier, distinct tag and so skipped the branch above
        # without ever receiving a score -- confirm before removing.
        if parent.score is None:
            parent.score = 0

        # Render straight to unicode.  renderContents() defaults to UTF-8
        # bytes, which must not be decoded with an unrelated ``encoding``.
        innerText = paragraph.renderContents(encoding=None)

        # Add a point for every paragraph that holds some actual text ...
        if len(innerText) > 10:
            parent.score += 1

        # ... and one for each comma inside it.
        parent.score += innerText.count(",")

    if not parents:
        raise ReadabilityError("no topParent")

    # max() keeps the first of several equally scored parents, exactly like
    # a strict ``>`` comparison while scanning the list in order.
    topParent = max(parents, key=lambda candidate: candidate.score)

    # Remove linked stylesheets ...
    for s in soup.findAll("link", attrs={"type": "text/css"}):
        s.extract()

    # ... and inline <style> elements.
    for s in soup.findAll("style"):
        s.extract()

    # Drop presentational attributes from everything inside the top parent.
    for ele in topParent.findAll(True):
        del ele['style']
        del ele['class']

    _killDivs(topParent, encoding)
    _clean(topParent, "form")
    _clean(topParent, "object")
    _clean(topParent, "iframe")

    _fixLinks(topParent, link)

    # A page without <title> must not abort the extraction.
    titleTag = soup.find('title')
    title = titleTag.text if titleTag is not None else ''
    content = topParent.renderContents(encoding=None)

    return title, content


def _fixLinks(parent, link):
    """Make the ``href`` and ``src`` of every tag under ``parent`` absolute.

    Each value is joined onto ``link`` with ``urlparse.urljoin``.  A tag is
    only touched when its ``attrMap`` is truthy and lists the attribute.
    """
    for node in parent.findAll(True):
        for attribute in ("href", "src"):
            if node.attrMap and attribute in node.attrMap:
                node[attribute] = urlparse.urljoin(link, node[attribute])


def _clean(top, tag, minWords=10000):
    """Detach sparse ``tag`` elements found under ``top``.

    An element is extracted when its rendered contents contain fewer than
    ``minWords`` space characters.
    """
    for node in top.findAll(tag):
        spaces = node.renderContents().count(" ")
        if spaces >= minWords:
            # Enough text inside: leave the element where it is.
            continue
        node.extract()


def _killDivs(parent, encoding):
    """Detach ``<div>`` elements under ``parent`` that look like clutter.

    A div is extracted when all of the following hold:

    * its rendered contents, decoded with ``encoding``, hold fewer than
      10 commas;
    * it contains neither ``<pre>`` nor ``<code>``;
    * it has more images, list items or links than paragraphs, or no
      paragraph at all, or at least one ``<embed>``.

    The divs are collected once up front and then visited in that order.
    """
    for div in parent.findAll("div"):

        def tally(name, _div=div):
            # Number of ``name`` elements anywhere below the current div.
            return len(_div.findAll(name))

        paragraphs = tally("p")
        images = tally("img")
        items = tally("li")
        anchors = tally("a")
        embeds = tally("embed")
        preformatted = tally("pre")
        snippets = tally("code")

        # Plenty of commas suggests running prose: keep the div.
        if div.renderContents().decode(encoding).count(",") >= 10:
            continue

        # Divs carrying preformatted text or code are kept as well.
        if preformatted != 0 or snippets != 0:
            continue

        outnumbered = max(images, items, anchors) > paragraphs
        if outnumbered or paragraphs == 0 or embeds > 0:
            div.extract()


def readable(url, html, DEFAULT_ENCODING):
    """Return the title and tag-free readable text of ``html``.

    ``grabContent`` extracts the title and main content, every ``CLEANUP``
    match is removed from the content, and the two parts are returned as
    one string with the title on the first line.
    """
    title, markup = grabContent(url, html, DEFAULT_ENCODING)
    stripped = CLEANUP.sub("", markup)
    return "{title}\n{content}".format(title=title, content=stripped)


def main():
    """Command-line entry point: show the readable version of each URL.

    Every positional argument is fetched, decoded with ``DEFAULT_ENCODING``
    and passed through ``readable``.  The result is printed, or -- with
    ``-b``/``--open-browser`` -- written to a temporary file that is then
    opened in a web browser.  Exits with status 2 when no URL is given.
    """
    import webbrowser
    from contextlib import closing
    from tempfile import mkstemp
    from optparse import OptionParser
    import codecs

    usage = "usage: %prog [options] URL1 URL2 ..."
    parser = OptionParser(usage=usage)
    parser.add_option(b"-b", b"--open-browser",
                      action="store_true", dest="open_browser", default=False,
                      help=b"show the readable version in a web browser")
    (options, args) = parser.parse_args()

    if not args:
        print(parser.format_help())
        sys.exit(2)

    for url in args:
        # Close the connection as soon as the body has been read.
        with closing(urllib.urlopen(url)) as response:
            html = response.read().decode(DEFAULT_ENCODING)
        # readable() requires the encoding as its third argument; omitting
        # it made every invocation fail with a TypeError.
        readable_html = readable(url, html, DEFAULT_ENCODING)
        if options.open_browser:
            fd, fn = mkstemp(suffix='readability.html')
            # Only the path is needed; the file is reopened through codecs.
            os.close(fd)
            with codecs.open(fn, 'w', encoding=DEFAULT_ENCODING) as f:
                f.write(readable_html)
            webbrowser.open('file://' + os.path.abspath(fn))
        else:
            print(readable_html)


# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()
6 changes: 4 additions & 2 deletions ENS/fetch_url.py
@@ -1,9 +1,11 @@
import threading, urllib2
import Queue

DEFAULT_ENCODING = 'latin-1'

def read_url(url, queue):
try:
data = urllib2.urlopen(url).read()
data = urllib2.urlopen(url).read().decode(DEFAULT_ENCODING)
except urllib2.HTTPError, e:
checksLogger.error('HTTPError = ' + str(e.code))
except urllib2.URLError, e:
Expand All @@ -15,7 +17,7 @@ def read_url(url, queue):
checksLogger.error('generic exception: ' + traceback.format_exc())

print('Fetched %s from %s' % (len(data), url))
queue.put(data)
queue.put([url,data])

def fetch_parallel(list_of_urls):
result = Queue.Queue()
Expand Down
File renamed without changes.
Empty file added ENS/lxmlReadability/__init__.py
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
13 changes: 5 additions & 8 deletions Examples/Extract_News_Summary.py → Examples/ENS_Soup.py
@@ -1,29 +1,26 @@
from ENS import Document, fetch_url, textRank, newsSearch, SimpleSummarizer
from ENS import readable, fetch_url, textRank, newsSearch
import sys
import re

number_of_links = int(sys.argv[1])
query = '+'.join(sys.argv[2:])
regex = re.compile("<(.*?)>|\&#13;")
DEFAULT_ENCODING = 'latin-1'

article_list = []
summary_list = []

links = newsSearch(query,number_of_links)


if not links:
print "No links found"

else:
result = fetch_url.fetch_parallel(links)

while not result.empty():
article = Document(result.get()).summary()
article = re.sub(regex, "", article)
article = article.encode('ascii','ignore')
ss = SimpleSummarizer()
summary = ss.summarize(article,5)
url_entry = result.get()
article = readable(url_entry[0],url_entry[1],DEFAULT_ENCODING)
summary = textRank(article)
summary = summary.encode('ascii','ignore')
article_list.append(article)
summary_list.append(summary)
Expand Down
Expand Up @@ -18,7 +18,8 @@
result = fetch_url.fetch_parallel(links)

while not result.empty():
article = Document(result.get()).summary()
url_entry = result.get()
article = Document(url_entry[1]).summary()
article = re.sub(regex, "", article)
article = article.encode('ascii','ignore')
summary = textRank(article)
Expand All @@ -30,7 +31,7 @@
""" All the outputs are written to appropriate files in this part of the code """

for i in range(0,number_of_links):
f2 = open(query + str(i),'w')
f2.write(article_list[i-1] + '\n SUMMARY OF THE ABOVE ARTICLE: \n' + summary_list[i-1])
f2.close()
print str(i)
print article_list[i-1]
print "*** SUMMARY ***"
print summary_list[i-1]
12 changes: 12 additions & 0 deletions Narendra+Modi
@@ -0,0 +1,12 @@
0
BJP ropes in Ajmer clerics to back Narendra Modi - The Times of India
JAIPUR: As part of the BJP's grand Muslim outreach strategy, clerics from Ajmer Sharif, in their traditional skullcap and sherwani, will be seen honouring Narendra Modi and senior party leaders during the September 10 rally in the Pink City. The ceremony has been carefully crafted by the BJP top brass which has asked its minority wing to ensure a visible and sizeable gathering of the community during the rally. Conscious of the images which would be beamed through the media, the party has issued specific directive that the Muslim representatives ought to be in traditional attire like men in sherwani and women in burqa. BJP leader and Dargah khadim Syed Afshan Chishty, who performs Ziarat for state BJP chief Vasundhara Raje, is busy preparing for the rally. He will felicitate the guests including BJP president Rajnath Singh and the Gujarat CM with 'dastarbandi' (honouring in a traditional manner by giving 'pagdi' or 'chunni' along with the Dargah's picture). "I will invite all of them to the Dargarh for Ziarat," said Chishty. "We have sent text messages to over 1 lakh primary members from the minority community across the state and expect around 10000 of them including 2000 burqa-clad women. Men will compulsorily wear skullcaps to indicate that Muslims are with the BJP," Amin Khan Pathan, state president of BJP minorities' cell, told TOI on Friday. National executive member of BJP's minority morcha and Dargah khadim, Syed Ibrahim Fakhar, was directed by the BJP leaders to gather Muslims for the rally, mostly in their traditional attire. "I have asked our people to come in our traditional dresses kurta pajama or sherwanis to give a clear message that Muslims are embracing the BJP," said Fakhar. The party has booked four buses to ferry people. 
Sources said the BJP's top leadership had directed the Muslim leaders in the state recently to ensure maximum participation from the community at the rally which will virtually kickstart the party's campaign for the coming assembly polls. The BJP leadership is aware that the Muslim community is decisive in at least 40 assembly constituencies. Party managers want to invade the Congress' traditional votebank with an eye on wresting power. Muslim leaders, especially from Tonk, Sawai Madhopur and Nagaur, are also keen on ensuring the rally's success as they eye party ticket for the polls. Rajnath Singh is also not leaving any platform to bring the minority community into the party fold. At a BJP minorities morcha meet recently, he stated,"Forget communal violence of 2002 and join hands for better future." During another conference in Jaipur, he asked the minorities, especially in BJP-ruled states, to address their grievances to him directly.
*** SUMMARY ***
Sources said the BJP's top leadership had directed the Muslim leaders in the state recently to ensure maximum participation from the community at the rally which will virtually kickstart the party's campaign for the coming assembly polls. National executive member of BJP's minority morcha and Dargah khadim, Syed Ibrahim Fakhar, was directed by the BJP leaders to gather Muslims for the rally, mostly in their traditional attire. Conscious of the images which would be beamed through the media, the party has issued specific directive that the Muslim representatives ought to be in traditional attire like men in sherwani and women in burqa. BJP ropes in Ajmer clerics to back Narendra Modi - The Times of India
JAIPUR: As part of the BJP's grand Muslim outreach strategy, clerics from Ajmer Sharif, in their traditional skullcap and sherwani, will be seen honouring Narendra Modi and senior party leaders during the September 10 rally in the Pink City.
1
Not just coal files, the entire government is missing: Narendra Modi - The Economic Times
NEW DELHI: Directly attacking the Prime Minister, Gujarat Chief Minister Narendra Modi on Saturday blamed Manmohan Singh for the drastic fall in rupee and the current economic downturn. Addressing a rally in Ambikapur, Modi drew a parallel between Manmohan Singh and Chhattisgarh Chief Minister Raman Singh. "Both Manmohan Singh and Raman Singh are doctors. While Raman Singh has worked for Chhattisgarh's progress, Manmohan Singh has sent the rupee to the hospital," he mocked. "Rupee is fighting for its life because of the doctor in Delhi," Modi hit out. Raising the issue of Coalgate, Modi said that not just coal files, but the entire government is missing. "There was a debate on coal files in Parliament, SC is also asking. Parliament is worried but people asking where is the Government," he said. "Government is lost, money is lost from the coffers, their honour is lost," Modi said. Modi also took a dig at Congress Vice President Rahul Gandhi for his comment on poverty being a state of mind. "The person on whom the Congress is banking on said poverty is a state of mind," he said. "To say that poverty is a state of mind is making a mockery of the poor in the country," Modi added. "Our dream is to create employment for the youth in the country," Modi said. "Congress is arrogant, it feels no need to be accountable and answerable to this nation," he said. "Uttarakhand, Jharkhand, Chhattisgarh are prime examples of how BJP can create great states, as opposed to Congress which has created Telangana amidst the entire dispute," Modi said. Modi lauded Raman Singh for his developmental work in Chhattisgarh. "I applaud Dr Raman Singh for not depending on the central government for Chhattisgarh's development," he said. The rally was a part of Raman Singh's state-wide "Vikas Yatra". Ambikapur is located in the state's Sarguja District, and is about 250 kilometres from state capital Raipur. 
Earlier this week, Modi said he does not dream of occupying the top post (of Prime Minister) and would respect the people's mandate for serving the state till 2017. "I never see such dreams (of becoming PM), nor am I going to see such dreams. People of Gujarat have given me the mandate to serve them till 2017 and I have to do this with full strength," Modi said earlier this week, in remarks that could be seen as subtle expression of his displeasure over the continued delay in formally annointing him BJP's prime ministerial nominee.
*** SUMMARY ***
"The person on whom the Congress is banking on said poverty is a state of mind," he said. "To say that poverty is a state of mind is making a mockery of the poor in the country," Modi added. Not just coal files, the entire government is missing: Narendra Modi - The Economic Times
NEW DELHI: Directly attacking the Prime Minister, Gujarat Chief Minister Narendra Modi on Saturday blamed Manmohan Singh for the drastic fall in rupee and the current economic downturn. Earlier this week, Modi said he does not dream of occupying the top post (of Prime Minister) and would respect the people's mandate for serving the state till 2017. "Our dream is to create employment for the youth in the country," Modi said.

0 comments on commit 7abc9b4

Please sign in to comment.