Skip to content
Browse files

added TextRank script

  • Loading branch information...
1 parent 5bfdd15 commit 5ca82ffb70cf87eacef29f03f60fb8cb1cbe2a6a @abhinavgupta committed
View
40 Extract_News_Summary_TextRank.py
@@ -0,0 +1,40 @@
+from GoogleNews import GoogleNews
+from readability import Document
+from TextRank import Summary
+from fetch_url import fetch_url
+import sys
+import re
+
# Usage: python Extract_News_Summary_TextRank.py <number of links> <query words...>
# The query words are joined with '+' to form the search query, and the same
# joined string is reused as the prefix of the output file names.
number_of_links = int(sys.argv[1])
query = '+'.join(sys.argv[2:])

# Matches anything between angle brackets (markup tags) and the literal
# "&#13;" entity so both can be removed from the extracted article text.
regex = re.compile(r"<(.*?)>|\&#13;")

article_list = []
summary_list = []

links = GoogleNews.search(query, number_of_links)

if not links:
    print("No links found")
else:
    # NOTE(review): `result` is used through empty()/get() only, so it is
    # presumably a Queue of fetched page bodies -- confirm in fetch_url.
    result = fetch_url.fetch_parallel(links)

    while not result.empty():
        # Extract the main article content, then strip the leftover markup.
        article = Document(result.get()).summary()
        article = regex.sub("", article)
        # Non-ASCII characters are dropped rather than transliterated.
        article = article.encode('ascii', 'ignore')
        summary = Summary.textrank(article)
        summary = summary.encode('ascii', 'ignore')
        article_list.append(article)
        summary_list.append(summary)

# Write one output file per article that was actually fetched, named
# "<query><index>". Iterating over the collected pairs (instead of
# range(number_of_links) with an i-1 index) keeps file i matched with
# article i and avoids an IndexError when fewer articles than requested
# were retrieved, or none at all.
for i, (article, summary) in enumerate(zip(article_list, summary_list)):
    with open(query + str(i), 'w') as out_file:
        out_file.write(article + '\n SUMMARY OF THE ABOVE ARTICLE: \n' + summary)
+
View
BIN GoogleNews/GoogleNews.pyc
Binary file not shown.
View
BIN GoogleNews/__init__.pyc
Binary file not shown.
View
28 README
@@ -40,7 +40,7 @@ OUTPUT: Relevant Text. (Uses the algorithm by the arc90 readability project. Che
*************
* SCRIPT 3 *
*************
-summarize.py - Independent script that creates the summary using TextRank for sentences.
+summarize.py - (Summarize folder) Independent script that creates the summary using simple analysis for sentences.
USAGE: Following is the way to use it in a python terminal
>> import summarize
@@ -49,10 +49,21 @@ USAGE: Following is the way to use it in a python terminal
OUTPUT: Summarized text.
-
*************
* SCRIPT 4 *
*************
+Summary.py - (TextRank folder) Independent script that creates the summary using TextRank for sentences.
+
+USAGE: Following is the way to use it in a python terminal
+>> import Summary
+>> result = Summary.textrank(article)
+
+OUTPUT: Summarized text.
+
+
+*************
+* SCRIPT 5 *
+*************
Extract_News_Summary.py - A script that uses all the above scripts
INPUT: Query and number of news articles to be scraped
@@ -63,6 +74,19 @@ python Extract_News_Summary.py <Number of links> <Query>
example: python Extract_News_Summary.py 50 India Pakistan Cricket
+*************
+* SCRIPT 6 *
+*************
+Extract_News_Summary_TextRank.py - A script that generates summaries of articles using the TextRank method
+
+INPUT: Query and number of news articles to be scraped
+
+USAGE: On the terminal do the following
+
+python Extract_News_Summary_TextRank.py <Number of links> <Query>
+
+example: python Extract_News_Summary_TextRank.py 50 India Pakistan Cricket
+
TODO: 1. Get urllib exception handling
2. Unicode-ASCII conversion is weakly handled
View
BIN Summarize/__init__.pyc
Binary file not shown.
View
BIN Summarize/summarize.pyc
Binary file not shown.
View
27 TextRank/Summary.py
@@ -0,0 +1,27 @@
+import networkx as nx
+import numpy as np
+from nltk.tokenize.punkt import PunktSentenceTokenizer
+from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
+
def textrank(document):
    """Return an extractive summary of *document* using TextRank.

    The document is split into sentences, each sentence is turned into a
    TF-IDF vector, and PageRank is run on the graph whose edge weights are
    the pairwise products of those vectors. The highest-scoring quarter of
    the sentences (but never fewer than three, or all of them when the
    document is shorter than that) are joined with single spaces, best
    score first.

    Args:
        document: The text to summarise.

    Returns:
        The summary string, or an empty string when *document* is empty or
        contains only whitespace.
    """
    # Nothing to rank: bail out before the vectorizer is asked to build a
    # vocabulary from zero sentences.
    if not document or not document.strip():
        return ''

    sentences = PunktSentenceTokenizer().tokenize(document)

    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)

    # Entry (i, j) is the dot product of the TF-IDF rows of sentences i and j.
    similarity_graph = normalized * normalized.T

    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)

    # Sort (score, sentence) pairs so the best-ranked sentence comes first.
    ranked = sorted(
        ((scores[i], sentence) for i, sentence in enumerate(sentences)),
        reverse=True,
    )

    # Keep 25% of the sentences, with a floor of three.
    keep = max(3, int(0.25 * len(ranked)))

    return ' '.join(sentence for _, sentence in ranked[:keep])
View
0 TextRank/__init__.py
No changes.
View
BIN fetch_url/__init__.pyc
Binary file not shown.
View
BIN fetch_url/fetch_url.pyc
Binary file not shown.
View
BIN readability/__init__.pyc
Binary file not shown.
View
BIN readability/cleaners.pyc
Binary file not shown.
View
BIN readability/encoding.pyc
Binary file not shown.
View
BIN readability/htmls.pyc
Binary file not shown.
View
BIN readability/readability.pyc
Binary file not shown.

0 comments on commit 5ca82ff

Please sign in to comment.
Something went wrong with that request. Please try again.