In [1]:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

class FrequencySummarizer:
  """ Extractive summarizer that ranks sentences by normalized word frequency.

  A sentence's score is the sum of the normalized frequencies of the words
  it contains; the highest-scoring sentences make up the summary.
  """

  def __init__(self, min_cut=0.1, max_cut=0.9):
    """ Words that have a normalized frequency lower than or equal to min_cut,
    or higher than or equal to max_cut, will be ignored. """
    self._min_cut = min_cut
    self._max_cut = max_cut
    # English stopwords and single punctuation characters never count as words.
    self._stopwords = set(stopwords.words('english') + list(punctuation))

  def _compute_frequencies(self, word_sent):
    """
      Input: a list of sentences already tokenized.
      Output: a dictionary where freq[w] is the frequency of w, divided by
        the count of the most frequent non-stopword. Words whose normalized
        frequency is >= max_cut or <= min_cut are removed. The dictionary is
        empty when word_sent contains no non-stopword tokens.
    """
    freq = defaultdict(int)
    for s in word_sent:
      for word in s:
        if word not in self._stopwords:
          freq[word] += 1
    # Nothing to normalize: max() would raise ValueError on an empty dict.
    if not freq:
      return freq
    # frequencies normalization and filtering
    m = float(max(freq.values()))
    # Iterate over a snapshot of the keys because entries are deleted in the loop.
    for w in list(freq):
      freq[w] = freq[w]/m
      if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
        del freq[w]
    return freq

  def summarize(self, text, n):
    """
    Return a list of up to n sentences which represent the summary of text.

    Sentences are returned in descending score order, not document order.
    Only sentences containing at least one word kept by
    _compute_frequencies receive a score, so fewer than n sentences may be
    returned. Raises AssertionError when n exceeds the number of sentences.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents), "n must not exceed the number of sentences in text"
    # Tokens are lower-cased for counting; the original sentences are returned.
    word_sent = [word_tokenize(s.lower()) for s in sents]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i,sent in enumerate(word_sent):
      for w in sent:
        if w in self._freq:
          ranking[i] += self._freq[w]
    sents_idx = self._rank(ranking, n)
    return [sents[j] for j in sents_idx]

  def _rank(self, ranking, n):
    """ return the keys of the (at most) n entries of ranking with the highest score """
    return nlargest(n, ranking, key=ranking.get)
   

In [9]:
import urllib.request
from bs4 import BeautifulSoup

def get_only_text(url):
    """ return the title and the text of the article at the specified url

    The page is fetched over HTTP and decoded as UTF-8. The text is the
    content of every <p> element joined with single spaces. The title is
    an empty string when the page has no <title> element.
    """
    # Use a context manager so the HTTP response is closed even on error.
    with urllib.request.urlopen(url) as response:
        page = response.read().decode('utf8')
    soup = BeautifulSoup(page,"html5lib")
    text = ' '.join(p.text for p in soup.find_all('p'))
    # soup.title is None for pages without a <title>; avoid AttributeError.
    title = soup.title.text if soup.title is not None else ''
    return title, text
 
# Tunable settings for the demo run.
FEED_URL = 'http://feeds.bbci.co.uk/news/rss.xml'
MAX_ARTICLES = 5            # how many feed entries to summarize
SENTENCES_PER_SUMMARY = 2   # sentences printed per article

# Fetch the RSS feed; the context manager closes the HTTP response.
with urllib.request.urlopen(FEED_URL) as feed_response:
  feed_xml = feed_response.read()
feed = BeautifulSoup(feed_xml.decode('utf8'),"html5lib")
# The text of each <guid> element is used as the article URL below.
to_summarize = [guid.text for guid in feed.find_all('guid')]

fs = FrequencySummarizer()
for article_url in to_summarize[:MAX_ARTICLES]:
  title, text = get_only_text(article_url)
  print('----------------------------------')
  print(title)
  for s in fs.summarize(text, SENTENCES_PER_SUMMARY):
    print('*',s)

----------------------------------
North Korea: UN imposes fresh sanctions over missile tests - BBC News
* It has continued to test nuclear and ballistic missiles despite these recent examples of UN pressure: The tough measures imposed over North Korea's missile programme target petrol supplies.
* The Trump administration says it is seeking a diplomatic solution to the issue, and drafted this new set of sanctions: Last month, the US unveiled fresh sanctions against North Korea which it said were designed to limit the funding for its nuclear and ballistic missile programmes.
----------------------------------
Aldi stabbing: Jodie Willsher murder suspect charged - BBC News
*  Share this with Email Facebook Messenger Messenger Twitter Pinterest WhatsApp LinkedIn Copy this link These are external links and will open in a new window A 44-year-old man has been charged with murdering a woman who was stabbed to death in an Aldi supermarket.
* Matthew Barnes, chief executive officer of Aldi UK 