# Rule Based Text Summarization

This is a simple example of a rule-based approach to text summarization.

In [1]:
# Web Scraping library
import urllib.request
from bs4 import BeautifulSoup

In [2]:
# Article to summarize: a Washington Post "The Switch" piece dated 2016-10-18 (per the URL path).
articleURL = "https://www.washingtonpost.com/news/the-switch/wp/2016/10/18/the-pentagons-massive-new-telescope-is-designed-to-track-space-junk-and-watch-out-for-killer-asteroids/"

In [3]:
# Helper functions
def removeNonAscii(text):
    """Return `text` with every non-ASCII character replaced by a single space.

    A character counts as ASCII when its code point is below 128; all such
    characters are kept unchanged, so the result has the same length as the input.
    """
    cleaned = []
    for ch in text:
        # Code points 0-127 are plain ASCII; anything above becomes a blank.
        cleaned.append(' ' if ord(ch) >= 128 else ch)
    return ''.join(cleaned)

def getTextFromURL(url):
    """Download a web page and return the ASCII-only text of its <article> tags.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    str
        Text of every ``<article>`` element on the page, joined with single
        spaces, with non-ASCII characters replaced by spaces.

    Notes
    -----
    The body is decoded as UTF-8; a page in another encoding may raise
    ``UnicodeDecodeError``. Network failures propagate from ``urllib``.
    """
    # The context manager closes the HTTP response (and its socket) even if
    # reading or decoding fails.
    with urllib.request.urlopen(url) as response:
        page = response.read().decode('utf8')
    soup = BeautifulSoup(page, "lxml")
    text = ' '.join(article.text for article in soup.find_all('article'))
    return removeNonAscii(text)

In [4]:
# Fetch the article and keep only the ASCII text found inside its <article> tags.
text = getTextFromURL(articleURL)

In [5]:
# NLP Libraries
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from string import punctuation, whitespace
from collections import defaultdict
from nltk.probability import FreqDist
from heapq import nlargest

In [6]:
# Main function to summarize text
def summarize(text, n):
    """Pick the `n` highest-scoring sentences of `text`, kept in document order.

    Each sentence is scored by summing, over its tokens, how often that token
    occurs in the whole (lower-cased) text. English stopwords, punctuation and
    whitespace characters are excluded from the frequency table, so they add
    nothing to a score.

    Parameters
    ----------
    text : str
        The document to summarize.
    n : int
        Number of sentences wanted; checked with an ``assert`` to be no larger
        than the number of sentences found in `text`.

    Returns
    -------
    list of str
        The selected sentences, ordered as they appear in `text`. Sentences
        containing no counted word are never selected, so fewer than `n`
        sentences may be returned.
    """
    sentences = sent_tokenize(text)

    assert n <= len(sentences)
    all_tokens = word_tokenize(text.lower())
    ignored = set(stopwords.words('english') + list(punctuation) + list(whitespace))

    # Document-wide frequency of every token that is not ignored.
    word_freq = FreqDist(token for token in all_tokens if token not in ignored)

    # Sentence index -> score; only sentences with at least one counted token get an entry.
    scores = {}
    for idx, sentence in enumerate(sentences):
        weights = [
            word_freq[token]
            for token in word_tokenize(sentence.lower())
            if token in word_freq
        ]
        if weights:
            scores[idx] = sum(weights)

    # Take the best-scoring indices, then restore reading order.
    chosen = nlargest(n, scores, key=scores.get)
    chosen.sort()
    return [sentences[idx] for idx in chosen]

In [7]:
# Just a quick demo: extract the 3 highest-scoring sentences from the article.
summarize(text,3)

['On Tuesday, the Defense Department took another significant step toward monitoring all of the cosmic junk swirling around in space, by delivering a gigantic new telescope capable of seeing small objects from very far away.',
 'The telescope is  a big improvement over the legacy ground-based optical telescopes that are used by the U.S. Air Force, because it can search large areas of sky and also track very faint (small) objects in and around GEO,  Brian Weeden, a Technical Advisor at the Secure World Foundation, wrote in an email.',
 'Every military operation that takes place in the world today is critically dependent on space in one way or another,  Air Force Gen. John Hyten said in an interview earlier this year when he was the commander of the Air Force Space Command.']