Added exception handling and URL fetching using threads

commit 5bfdd1593a000c8bf8da22e8acf7e659e0c94a2f 1 parent 339ca1e
@abhinavgupta authored
48 Extract_News_Summary.py
@@ -1,5 +1,7 @@
-import urllib2
-from readability import Document,summarize, GoogleNews
+from GoogleNews import GoogleNews
+from readability import Document
+from Summarize import summarize
+from fetch_url import fetch_url
import sys
import re
@@ -10,26 +12,30 @@
article_list = []
summary_list = []
-links = GoogleNews.search(query,number_of_links) #Perform Google News search
+links = GoogleNews.search(query,number_of_links)
+
if not links:
-    print "No links found" #If no links for a query then..
+    print "No links found"
else:
-    for l in links:
-        html = urllib2.urlopen(l).read()
-        article = Document(html).summary()
-        article = re.sub(regex, "", article)
-        article = article.encode('ascii','ignore')
-        ss = summarize.SimpleSummarizer()
-        summary = ss.summarize(article,5)
-        summary = summary.encode('ascii','ignore')
-        article_list.append(article)
-        summary_list.append(summary)
-
-
-    """ All the outputs are written to appropriate files in this part of the code """
-for i in range(1,number_of_links):
-    f2 = open(query + str(i),'w')
-    f2.write(article_list[i-1] + '\n' + summary_list[i-1])
-    f2.close()
+    result = fetch_url.fetch_parallel(links)
+
+    while not result.empty():
+        article = Document(result.get()).summary()
+        article = re.sub(regex, "", article)
+        article = article.encode('ascii','ignore')
+        ss = summarize.SimpleSummarizer()
+        summary = ss.summarize(article,5)
+        summary = summary.encode('ascii','ignore')
+        article_list.append(article)
+        summary_list.append(summary)
+
+
+""" All the outputs are written to appropriate files in this part of the code """
+
+for i in range(len(article_list)):
+    f2 = open(query + str(i),'w')
+    f2.write(article_list[i] + '\n SUMMARY OF THE ABOVE ARTICLE: \n' + summary_list[i])
+    f2.close()
+
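A caveat worth noting with the threaded flow: fetch_parallel returns pages in thread-completion order, so the file query + str(i) is not guaranteed to hold the i-th Google News result. Below is a minimal sketch of an order-preserving variant, assuming a hypothetical read_url_indexed helper that tags each page with its request index (illustration only, not part of this commit):

import threading, urllib2
import Queue

def read_url_indexed(index, url, queue):
    # Hypothetical helper: pair each page with the position of its URL.
    try:
        data = urllib2.urlopen(url).read()
    except Exception:
        data = ''  # keep an empty slot so indices stay aligned
    queue.put((index, data))

def fetch_parallel_ordered(list_of_urls):
    result = Queue.Queue()
    threads = [threading.Thread(target=read_url_indexed, args=(i, url, result))
               for i, url in enumerate(list_of_urls)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # Drain the queue, then sort by index to restore request order.
    pages = []
    while not result.empty():
        pages.append(result.get())
    pages.sort()
    return [data for _, data in pages]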
0  readability/GoogleNews.py → GoogleNews/GoogleNews.py
File renamed without changes
BIN  readability/GoogleNews.pyc → GoogleNews/GoogleNews.pyc
Binary file not shown
0  GoogleNews/__init__.py
No changes.
BIN  GoogleNews/__init__.pyc
Binary file not shown
2  README
@@ -7,7 +7,7 @@ REQUIREMENTS: a. Python and its standard libraries
c. Python lxml-library
-The folder contains the following MAIN scripts:
+Each sub-folder contains a MAIN script of the same name:
*************
* SCRIPT 1 *
0  Summarize/__init__.py
No changes.
BIN  Summarize/__init__.pyc
Binary file not shown
0  readability/summarize.py → Summarize/summarize.py
File renamed without changes
BIN  readability/summarize.pyc → Summarize/summarize.pyc
Binary file not shown
0  fetch_url/__init__.py
No changes.
BIN  fetch_url/__init__.pyc
Binary file not shown
27 fetch_url/fetch_url.py
@@ -0,0 +1,27 @@
+import threading, urllib2, httplib
+import Queue
+import logging
+
+checksLogger = logging.getLogger(__name__)
+
+def read_url(url, queue):
+    try:
+        data = urllib2.urlopen(url).read()
+    except urllib2.HTTPError, e:
+        checksLogger.error('HTTPError = ' + str(e.code))
+        return
+    except urllib2.URLError, e:
+        checksLogger.error('URLError = ' + str(e.reason))
+        return
+    except httplib.HTTPException:
+        checksLogger.error('HTTPException')
+        return
+    except Exception:
+        import traceback
+        checksLogger.error('generic exception: ' + traceback.format_exc())
+        return
+
+    print('Fetched %d bytes from %s' % (len(data), url))
+    queue.put(data)
+
+def fetch_parallel(list_of_urls):
+    # Fetch every URL in its own thread; pages land in the result queue
+    # in completion order, not request order.
+    result = Queue.Queue()
+    threads = [threading.Thread(target=read_url, args=(url, result)) for url in list_of_urls]
+    for t in threads:
+        t.start()
+    for t in threads:
+        t.join()
+    return result
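For reference, a minimal usage sketch of fetch_parallel as it is imported from Extract_News_Summary.py; the URLs below are hypothetical placeholders:

from fetch_url import fetch_url

urls = ['http://example.com/a', 'http://example.com/b']  # placeholders

pages = fetch_url.fetch_parallel(urls)
while not pages.empty():
    html = pages.get()
    print('Fetched %d bytes' % len(html))

Because fetch_parallel joins every thread before returning, the queue is complete by the time it is drained; no further synchronization is needed on the consumer side.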
BIN  fetch_url/fetch_url.pyc
Binary file not shown
BIN  readability/__init__.pyc
Binary file not shown
BIN  readability/cleaners.pyc
Binary file not shown
BIN  readability/encoding.pyc
Binary file not shown
BIN  readability/htmls.pyc
Binary file not shown
40 readability/readability.py
@@ -26,14 +26,7 @@
    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow', re.I),
    'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I),
    'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I),
-    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
-    #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
-    #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
-    #'trimRe': re.compile('^\s+|\s+$/'),
-    #'normalizeRe': re.compile('\s{2,}/'),
-    #'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
-    #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
-    #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
+    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
}
@@ -438,17 +431,10 @@ def sanitize(self, node, candidates):
                content_score = candidates[parent_node]['content_score']
            else:
                content_score = 0
-            #if parent_node is not None:
-                #pweight = self.class_weight(parent_node) + content_score
-                #pname = describe(parent_node)
-            #else:
-                #pweight = 0
-                #pname = "no parent"
+
            to_remove = False
            reason = ""
-            #if el.tag == 'div' and counts["img"] >= 1:
-            #    continue
            if counts["p"] and counts["img"] > counts["p"]:
                reason = "too many images (%s)" % counts["img"]
                to_remove = True
@@ -472,26 +458,6 @@ def sanitize(self, node, candidates):
            elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                reason = "<embed>s with too short content length, or too many <embed>s"
                to_remove = True
-#            if el.tag == 'div' and counts['img'] >= 1 and to_remove:
-#                imgs = el.findall('.//img')
-#                valid_img = False
-#                self.debug(tounicode(el))
-#                for img in imgs:
-#
-#                    height = img.get('height')
-#                    text_length = img.get('text_length')
-#                    self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
-#                    if to_int(height) >= 100 or to_int(text_length) >= 100:
-#                        valid_img = True
-#                        self.debug("valid image" + tounicode(img))
-#                        break
-#                if valid_img:
-#                    to_remove = False
-#                    self.debug("Allowing %s" %el.text_content())
-#                    for desnode in self.tags(el, "table", "ul", "div"):
-#                        allowed[desnode] = True
-
-            #find x non empty preceding and succeeding siblings
            i, j = 0, 0
            x = 1
            siblings = []
@@ -521,8 +487,6 @@ def sanitize(self, node, candidates):
            if to_remove:
                self.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
                    (content_score, describe(el), weight, reason))
-                #print tounicode(el)
-                #self.debug("pname %s pweight %.3f" %(pname, pweight))
                el.drop_tree()
        for el in ([node] + [n for n in node.iter()]):
BIN  readability/readability.pyc
Binary file not shown