<a href="https://colab.research.google.com/github/bundickm/Automated_RSS_Feed_Summarizer/blob/master/RSS_Feed_Summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pull From RSS Feed

In [0]:
!pip install feedparser



In [0]:
import feedparser # pip install feedparser

reader = feedparser.parse('http://rss.cnn.com/rss/cnn_topstories.rss')
# .. skipped handling http errors, cacheing ..

parsed_feed = []
for entry in reader.entries:
  parsed_feed.append({
      'title': entry.title,
      'url': entry.link,
      'description': entry.description})  

In [0]:
parsed_feed

[{'description': 'Tensions that have been mounting for months between some of the nation\'s most senior military officers and President Donald Trump are boiling over after his decision to intervene in the cases of three service members accused of war crimes.<div class="feedflare">\n<a href="http://rss.cnn.com/~ff/rss/cnn_topstories?a=0Z4wIIAT3QU:sL94ANKI4H0:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/rss/cnn_topstories?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://rss.cnn.com/~ff/rss/cnn_topstories?a=0Z4wIIAT3QU:sL94ANKI4H0:7Q72WNTAKBA"><img src="http://feeds.feedburner.com/~ff/rss/cnn_topstories?d=7Q72WNTAKBA" border="0"></img></a> <a href="http://rss.cnn.com/~ff/rss/cnn_topstories?a=0Z4wIIAT3QU:sL94ANKI4H0:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/rss/cnn_topstories?i=0Z4wIIAT3QU:sL94ANKI4H0:V_sGLiPBpWU" border="0"></img></a> <a href="http://rss.cnn.com/~ff/rss/cnn_topstories?a=0Z4wIIAT3QU:sL94ANKI4H0:qj6IDK7rITs"><img src="http://feeds.feedburner.com/~

# Parse Web Pages

In [0]:
from bs4 import BeautifulSoup
import requests

def get_text(url):
  html = requests.get(url)
  soup = BeautifulSoup(html.text)

  for script in soup(["script", "style"]):
      script.decompose()    # rip it out
  text = soup.find('section', {'id':'body-text'}).get_text()

  # get text
  # text = soup.get_text()
  # break into lines and remove leading and trailing space on each
  lines = (line.strip() for line in text.splitlines())
  # break multi-headlines into a line each
  chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
  # drop blank lines
  text = '\n'.join(chunk for chunk in chunks if chunk)
  start = text.find('(CNN)')
  end = text.find('\nSearch')
  return text[start+5:end]

In [0]:
test = get_text('http://rss.cnn.com/~r/rss/cnn_topstories/~3/CO9DFp_HIA4/index.html')
test



# Create Tags with LDA

In [0]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

In [0]:
test = test.replace('\n', ' ')
test = test.lower().split('.')
test

['on a recent november evening, bob blackman scrolled through his phone in a nondescript indian restaurant in west london, sparsely decorated except for stock pictures of east african sunsets and lions',
 " he proudly pointed to his phone's background photo, which shows him smiling next to indian prime minister narendra modi",
 '"i\'ve been to india seven times in the last three years," blackman said, sipping a cup of chai, his cheeks flushed from canvassing in the cold, ahead of britain\'s first december election in nearly a century',
 ' "i am well-known for being a very pro-indian supporter',
 '"across the table, kuldeep singh shekhawat, the head of a uk-based pressure group linked to modi\'s bharatiya janata party (bjp), chimed in: "a hundred percent',
 '" blackman is a three-term member of parliament for harrow east, a racially and religiously diverse constituency where 28',
 '2% of the residents are hindu',
 ' the conservative party politician has a track record of championing ind

In [0]:
nlp.vocab['cnn'].is_stop = True
nlp.Defaults.stop_words |= {'cnn', ''}
def tokenize(text):
  doc = nlp(text)
  tokens = [token.text 
            for token in doc 
            if not token.is_stop and 
               not token.is_punct and
               token.pos_ == 'NOUN']
  return tokens
for i, text in enumerate(test):
  test[i] = tokenize(text)

In [0]:
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora
import re

In [0]:
id2word = corpora.Dictionary(test)

id2word.filter_extremes(no_below=5, no_above=0.9)
corpus = [id2word.doc2bow(text) for text in test]
lda = LdaMulticore(corpus=corpus,
                   id2word=id2word,
                   num_topics = 5,
                   passes=10,
                   workers=4)

words = [re.findall(r'"([^"]*)"',t[1]) for t in lda.print_topics()]
topics = [' '.join(t[0:5]) for t in words]

In [0]:
topics[0]

'party britain labour indians modi'

# Sentiment Analysis

In [0]:
from textblob import TextBlob
def get_sentiment(text): 
    analysis = TextBlob(text) 

    if analysis.sentiment.polarity > 0: 
        return 'positive', analysis
    elif analysis.sentiment.polarity == 0: 
        return 'neutral', analysis.sentiment.polarity
    else: 
        return 'negative', analysis.sentiment.polarity

In [0]:
_, temp = get_sentiment(get_text('https://www.cnn.com/2019/11/22/us/trump-administration-seizing-border-wall-land/index.html'))

In [0]:
temp.sentiment

Sentiment(polarity=0.0968393639300719, subjectivity=0.42498643195545843)

In [0]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Summarize

In [0]:
article = get_text('http://rss.cnn.com/~r/rss/cnn_topstories/~3/CO9DFp_HIA4/index.html')
article = article.replace('\n', ' ')
article = article.lower().split('.')
article

['on a recent november evening, bob blackman scrolled through his phone in a nondescript indian restaurant in west london, sparsely decorated except for stock pictures of east african sunsets and lions',
 " he proudly pointed to his phone's background photo, which shows him smiling next to indian prime minister narendra modi",
 '"i\'ve been to india seven times in the last three years," blackman said, sipping a cup of chai, his cheeks flushed from canvassing in the cold, ahead of britain\'s first december election in nearly a century',
 ' "i am well-known for being a very pro-indian supporter',
 '"across the table, kuldeep singh shekhawat, the head of a uk-based pressure group linked to modi\'s bharatiya janata party (bjp), chimed in: "a hundred percent',
 '" blackman is a three-term member of parliament for harrow east, a racially and religiously diverse constituency where 28',
 '2% of the residents are hindu',
 ' the conservative party politician has a track record of championing ind

In [0]:
formatted_article = article[::]
for i, text in enumerate(formatted_article):
  formatted_article[i] = tokenize(text)

In [0]:
word_frequencies = {}
for sentence in formatted_article:
  for word in sentence:
    if word not in word_frequencies.keys():
      word_frequencies[word] = 1
    else:
      word_frequencies[word] += 1

In [0]:
maximum_frequency = max(word_frequencies.values())

for word in word_frequencies.keys():
    word_frequencies[word] = (word_frequencies[word]/maximum_frequency)

In [0]:
sentence_scores = {}
for i, sentence in enumerate(article):
  for word in formatted_article[i]:
    if word in word_frequencies.keys():
      if sentence not in sentence_scores.keys():
        sentence_scores[sentence] = word_frequencies[word]
      else:
        sentence_scores[sentence] += word_frequencies[word]

In [0]:
import heapq
summary_sentences = heapq.nlargest(5, sentence_scores, key=sentence_scores.get)
summary_sentences

[' india\'s modi made the election a referendum on his leadership -- and it paid off suresh grover, a human rights campaigner and director of the monitoring group, said ofbjp branches were established in the us and uk to "regularly discuss how to popularize bjp policies overseas, including changing the uk\'s perception of modi himself',
 'must watchwhat\'s at stake in britain\'s volatile election 03:00"they [the labour party] did it with the jewish community, and it is now doing it with the indian community: taking loyal, solid voters for granted," meghani told cnn, referring to criticism over labour\'s handling of a recent wave of anti-semitic incidents within the party',
 ' "your views about an issue like kashmir or an issue like palestine might feed your world view, but voters, on the whole, don\'t vote on foreign policy issues," katwala said, citing a systematic study on minority voting behavior, the 2010 ethnic minority british election survey',
 "modi's bharatiya janata party swe

# Completed Output

In [1]:
!pip install feedparser

Collecting feedparser
[?25l  Downloading https://files.pythonhosted.org/packages/91/d8/7d37fec71ff7c9dbcdd80d2b48bcdd86d6af502156fc93846fb0102cb2c4/feedparser-5.2.1.tar.bz2 (192kB)
[K     |█▊                              | 10kB 15.0MB/s eta 0:00:01[K     |███▍                            | 20kB 3.5MB/s eta 0:00:01[K     |█████▏                          | 30kB 5.0MB/s eta 0:00:01[K     |██████▉                         | 40kB 3.2MB/s eta 0:00:01[K     |████████▌                       | 51kB 3.8MB/s eta 0:00:01[K     |██████████▎                     | 61kB 4.6MB/s eta 0:00:01[K     |████████████                    | 71kB 5.2MB/s eta 0:00:01[K     |█████████████▋                  | 81kB 5.8MB/s eta 0:00:01[K     |███████████████▍                | 92kB 6.5MB/s eta 0:00:01[K     |█████████████████               | 102kB 5.1MB/s eta 0:00:01[K     |██████████████████▊             | 112kB 5.1MB/s eta 0:00:01[K     |████████████████████▌           | 122kB 5.1MB/s eta 0:00:

In [2]:
import feedparser
from bs4 import BeautifulSoup
import requests
import heapq
import spacy
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora
import re
from textblob import TextBlob
import nltk
import en_core_web_sm
nltk.download('punkt')
nlp = en_core_web_sm.load()
nlp.Defaults.stop_words |= {'cnn', ''}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
def read_feed(rss_url):
  try:
    rss_feed = feedparser.parse(rss_url)
    return rss_feed
  except:
    return None


def parse_rss_feed(rss_feed):
  parsed_feed = []
  
  for entry in rss_feed.entries:
    parsed_feed.append({
        'title': entry.title,
        'url': entry.link,
        'description': entry.description[:entry.description.find('.')],
        'summary': None,
        'sentiment': None,
        'topics': None,
        'video': False
    })

  return parsed_feed


def replacements(text):
  stop_phrases = ['Read More', '\n', ]
  
  for item in stop_phrases:
    text = text.replace(item, ' ')
  
  return text


def get_text(url):
  html = requests.get(url)
  soup = BeautifulSoup(html.text)

  # Strip out any excess html
  for script in soup(["script", "style"]):
      script.decompose()

  text = soup.find('section', {'id':'body-text'}).get_text() # CNN specific
  text = replacements(text)
  start = text.find('(CNN)') # CNN specific
  end = text.find('\nSearch') # CNN specific

  return text[start+5:end]


def get_sentiment(article): 
    analysis = TextBlob(article) 

    if analysis.sentiment.polarity > 0: 
        return 'positive'
    elif analysis.sentiment.polarity == 0: 
        return 'neutral'
    else: 
        return 'negative'


def sentence_split(article):
  article = article.split('.')
  return article


def tokenize_sentence(text):
  doc = nlp(text.lower())
  tokens = [token.text 
            for token in doc 
            if not token.is_stop and 
               not token.is_punct and
               token.pos_ == 'NOUN']
  return tokens


def tokenize_article(article):
  split_article = sentence_split(article)
  for i, text in enumerate(split_article):
    split_article[i] = tokenize_sentence(text)

  return split_article


def get_tags(tokenized_article):
  id2word = corpora.Dictionary(tokenized_article)

  id2word.filter_extremes(no_below=5, no_above=0.9)
  corpus = [id2word.doc2bow(text) for text in tokenized_article]
  lda = LdaMulticore(corpus=corpus,
                    id2word=id2word,
                    num_topics = 5,
                    passes=10,
                    workers=4)

  words = [re.findall(r'"([^"]*)"',t[1]) for t in lda.print_topics()]
  topics = [' '.join(t[0:5]) for t in words]

  return set(topics[0].split())


def weight_frequencies(word_frequencies):
  maximum_frequency = max(word_frequencies.values())
  
  for word in word_frequencies.keys():
    word_frequencies[word] = (word_frequencies[word]/maximum_frequency)
  
  return word_frequencies


def get_word_frequencies(tokenized_article):
  word_frequencies = {}

  for sentence in tokenized_article:
    for word in sentence:
      if word not in word_frequencies.keys():
        word_frequencies[word] = 1
      else:
        word_frequencies[word] += 1

  return weight_frequencies(word_frequencies)


def score_sentences(article, tokenized_article):
  sentence_scores = {}
  word_frequencies = get_word_frequencies(tokenized_article)
  word_frequencies = weight_frequencies(word_frequencies)

  for i, sentence in enumerate(sentence_split(article)):
    for word in tokenized_article[i]:
      if word in word_frequencies.keys():
        if sentence not in sentence_scores.keys():
          sentence_scores[sentence] = word_frequencies[word]
        else:
          sentence_scores[sentence] += word_frequencies[word]
  
  return sentence_scores


def get_top_x_summary_sentences(article, tokenized_article, x):
  scored_sentences = score_sentences(article, tokenized_article)

  return heapq.nlargest(x, scored_sentences, key=scored_sentences.get)


def print_synopsis(rss_feed_entry, tags, sentiment, summary_sentences):
  print(rss_feed_entry['title'])
  print('='*(len(rss_feed_entry['title'])+5))
  print('Slugline:', rss_feed_entry['description'])
  print('URL:', rss_feed_entry['url'])
  print('Tags:', tags)
  print('Sentiment:', sentiment)
  print('Summary:')
  print('.\n'.join(summary_sentences))
  print()

In [32]:
url = 'http://rss.cnn.com/rss/cnn_topstories.rss'
rss_feed = read_feed(url)
if rss_feed is None:
  print('Error: Unable to read RSS Feed')
else:
  rss_feed = parse_rss_feed(rss_feed)

for i, entry in enumerate(rss_feed):
  try:
    article = get_text(entry['url'])
    split_article = sentence_split(article)
    tokenized_article = tokenize_article(article)
    tags = get_tags(tokenized_article)
    sentiment = get_sentiment(article)
    summary_sentences = get_top_x_summary_sentences(article, 
                                                    tokenized_article, 5)
    print_synopsis(rss_feed[i], tags, sentiment, summary_sentences)
  except:
    rss_feed[i]['video'] = True
    continue
  

  print()

The person who killed two and wounded others in a stabbing at London Bridge had previously been 'released from jail for terror offenses,' sources say
Slugline: <div class="feedflare">
<a href="http://rss
URL: http://rss.cnn.com/~r/rss/cnn_topstories/~3/V5UHcKbPaS8/index.html
Tags: {'suspect', 'caption', 'bridge', 'attack', 'building'}
Sentiment: negative
Summary:
 Photos: Stabbing incident at London BridgePolice and emergency services work at the scene of a stabbing incident near London Bridge on Friday, November 29.
Hide Caption 11 of 18 Photos: Stabbing incident at London BridgeA police officer cordons off the London Bridge Station.
Hide Caption 4 of 18 Photos: Stabbing incident at London BridgeOffice workers watch from windows as the police carry out an operation following the incident.
Hide Caption 14 of 18 Photos: Stabbing incident at London BridgeMembers of the public are held behind a police cordon near London Bridge Station.
Hide Caption 16 of 18 Photos: Stabbing incident at Lo

In [0]:
# TO-DO
# -----------
# Tokenize - Lemmatize, Customize stopwords to remove low value words (different stopwords for AutoTag and Summary?)
# AutoTag - tokenization improvement improve tags?
# Sentiment - Why is almost everything positive? New algo or weight the sentiment?
# Summary - Strip excess spacing, ensure each sentence is unique, evaluate summary vs article.