### Tokenize text into sentences and words

In [36]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize


In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Amira\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Amira\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
text = "Mary had a little lamb. Her fleece was white as snow"
sents = sent_tokenize(text)
print (sents)

['Mary had a little lamb.', 'Her fleece was white as snow']


In [21]:
words = [word_tokenize(sent) for sent in sents]
print(words)

[['Mary', 'had', 'a', 'little', 'lamb', '.'], ['Her', 'fleece', 'was', 'white', 'as', 'snow']]


### Remove stop words 

In [8]:
# words that doesn't add a lot of information in terms of meaning of the sentence or the text
from nltk.corpus import stopwords
from string import punctuation

customStopWords = set(stopwords.words('english')+list(punctuation))

If we apply the removal on words it will generate error : unhashable type list and that's because we have nested lists

In [17]:
wordsWOStopwords = [wo for wo in word_tokenize(text) if wo not in customStopWords]
print(wordsWOStopwords)

['Mary', 'little', 'lamb', 'Her', 'fleece', 'white', 'snow']


### Identifying bigrams

In [22]:
from nltk.collocations import *

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(wordsWOStopwords) # Construct Bigrams from a list of words
sorted(finder.ngram_fd.items())

[(('Her', 'fleece'), 1),
 (('Mary', 'little'), 1),
 (('fleece', 'white'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1),
 (('white', 'snow'), 1)]

### Stemming

In [24]:
text2 = "Mary closed on closing night when she was in the mood to close"

In [25]:
from nltk.stem.lancaster import LancasterStemmer

st = LancasterStemmer()
stemmedWords = [st.stem(word) for word in word_tokenize(text2)]
print(stemmedWords)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'to', 'clos']


### Parts of Speech Tagging

In [27]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Amira\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [28]:
nltk.pos_tag(word_tokenize(text2))

[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB')]

### Disambiguation word meanings

In [30]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Amira\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [33]:
from nltk.corpus import wordnet as wn

for ss in wn.synsets('bass'):
    print(ss, ss.definition())

Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range


In [32]:
from nltk.wsd import lesk # an algorithm for word sense disamiguation

sensel = lesk(word_tokenize("Sing in a lower tone, along with the bass"),'bass')
print(sensel, sensel.definition())

Synset('bass.n.07') the member with the lowest range of a family of musical instruments


In [34]:
sense2 = lesk(word_tokenize("This sea bass was really hard to catch"), 'bass')
print(sense2, sense2.definition())

Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae


### Auto-summarizing text

In [3]:
import requests
from bs4 import BeautifulSoup

soup = requests.get("https://www.washingtonpost.com/news/the-switch/wp/2016/10/18/the-pentagons-massive-new-telescope-is-designed-to-track-space-junk-and-watch-out-for-killer-asteroids/")
page = BeautifulSoup(soup.text, "html.parser")


In [8]:
page.find("title").text

'The Pentagon’s massive new telescope is designed to track space junk and watch out for killer asteroids - The Washington Post'

In [12]:
page.find("article").text

"By Christian DavenportcloseChristian DavenportReporter covering the defense and space industriesEmailEmailBioBioFollowFollowReporterOct. 18, 2016 at 4:01 p.m. EDTThere are a lot of rocks flying around through space. Lots of debris, too. Old satellites, spent rocket boosters, even for a short while a spatula that got loose during a space shuttle mission in 2006. All of it swirling around in orbit, creating a bit of a traffic jam.Support our journalism. Subscribe today.arrow-rightFor years, the Pentagon has been worried about the collisions that might be caused by an\xa0estimated 500,000 pieces of debris, taking out enormously valuable satellites and, in turn, creating even more debris. On Tuesday, the Defense Department\xa0took another significant step toward monitoring all of the cosmic junk swirling around in space, by delivering\xa0a gigantic new telescope capable of seeing small objects from very far away.Developed by the Defense Advanced Research Project Agency, the Space Surveill

In [28]:
text = ' '.join(map(lambda p: p.text, page.find_all('article')))
text

"By Christian DavenportcloseChristian DavenportReporter covering the defense and space industriesEmailEmailBioBioFollowFollowReporterOct. 18, 2016 at 4:01 p.m. EDTThere are a lot of rocks flying around through space. Lots of debris, too. Old satellites, spent rocket boosters, even for a short while a spatula that got loose during a space shuttle mission in 2006. All of it swirling around in orbit, creating a bit of a traffic jam.Support our journalism. Subscribe today.arrow-rightFor years, the Pentagon has been worried about the collisions that might be caused by an\xa0estimated 500,000 pieces of debris, taking out enormously valuable satellites and, in turn, creating even more debris. On Tuesday, the Defense Department\xa0took another significant step toward monitoring all of the cosmic junk swirling around in space, by delivering\xa0a gigantic new telescope capable of seeing small objects from very far away.Developed by the Defense Advanced Research Project Agency, the Space Surveill

In [29]:
text = text.replace('\xa0', '')
text
#### OR 
#def rep(tx):
 #   tx.replace('\xa0', '')
  #  return tx
#text = rep(text)

"By Christian DavenportcloseChristian DavenportReporter covering the defense and space industriesEmailEmailBioBioFollowFollowReporterOct. 18, 2016 at 4:01 p.m. EDTThere are a lot of rocks flying around through space. Lots of debris, too. Old satellites, spent rocket boosters, even for a short while a spatula that got loose during a space shuttle mission in 2006. All of it swirling around in orbit, creating a bit of a traffic jam.Support our journalism. Subscribe today.arrow-rightFor years, the Pentagon has been worried about the collisions that might be caused by anestimated 500,000 pieces of debris, taking out enormously valuable satellites and, in turn, creating even more debris. On Tuesday, the Defense Departmenttook another significant step toward monitoring all of the cosmic junk swirling around in space, by deliveringa gigantic new telescope capable of seeing small objects from very far away.Developed by the Defense Advanced Research Project Agency, the Space Surveillance Telesco

In [None]:
# Encapsulate all the steps in one function 
url = "https://www.washingtonpost.com/news/the-switch/wp/2016/10/18/the-pentagons-massive-new-telescope-is-designed-to-track-space-junk-and-watch-out-for-killer-asteroids/"
def get_article(url):
    soup = requests.get(url)
    page = BeautifulSoup(soup.text, "html.parser")
    text = ' '.join(map(lambda p: p.text, page.find_all('article')))
    text = text.replace('\xa0', '')
    return text
text = get_article(url)
text

#### Preprocessing Article text

In [37]:
sents2 = sent_tokenize(text)
sents2

['By Christian DavenportcloseChristian DavenportReporter covering the defense and space industriesEmailEmailBioBioFollowFollowReporterOct.',
 '18, 2016 at 4:01 p.m. EDTThere are a lot of rocks flying around through space.',
 'Lots of debris, too.',
 'Old satellites, spent rocket boosters, even for a short while a spatula that got loose during a space shuttle mission in 2006.',
 'All of it swirling around in orbit, creating a bit of a traffic jam.Support our journalism.',
 'Subscribe today.arrow-rightFor years, the Pentagon has been worried about the collisions that might be caused by anestimated 500,000 pieces of debris, taking out enormously valuable satellites and, in turn, creating even more debris.',
 'On Tuesday, the Defense Departmenttook another significant step toward monitoring all of the cosmic junk swirling around in space, by deliveringa gigantic new telescope capable of seeing small objects from very far away.Developed by the Defense Advanced Research Project Agency, the S

In [38]:
word_sent2 = word_tokenize(text)
word_sent2

['By',
 'Christian',
 'DavenportcloseChristian',
 'DavenportReporter',
 'covering',
 'the',
 'defense',
 'and',
 'space',
 'industriesEmailEmailBioBioFollowFollowReporterOct',
 '.',
 '18',
 ',',
 '2016',
 'at',
 '4:01',
 'p.m.',
 'EDTThere',
 'are',
 'a',
 'lot',
 'of',
 'rocks',
 'flying',
 'around',
 'through',
 'space',
 '.',
 'Lots',
 'of',
 'debris',
 ',',
 'too',
 '.',
 'Old',
 'satellites',
 ',',
 'spent',
 'rocket',
 'boosters',
 ',',
 'even',
 'for',
 'a',
 'short',
 'while',
 'a',
 'spatula',
 'that',
 'got',
 'loose',
 'during',
 'a',
 'space',
 'shuttle',
 'mission',
 'in',
 '2006',
 '.',
 'All',
 'of',
 'it',
 'swirling',
 'around',
 'in',
 'orbit',
 ',',
 'creating',
 'a',
 'bit',
 'of',
 'a',
 'traffic',
 'jam.Support',
 'our',
 'journalism',
 '.',
 'Subscribe',
 'today.arrow-rightFor',
 'years',
 ',',
 'the',
 'Pentagon',
 'has',
 'been',
 'worried',
 'about',
 'the',
 'collisions',
 'that',
 'might',
 'be',
 'caused',
 'by',
 'anestimated',
 '500,000',
 'pieces',
 'of'

In [49]:
punc=("’","”","“")
_stopwords = set(stopwords.words('english') + list(punctuation)+list(punc))
_stopwords

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [50]:
word_sent2 = [word for word in word_sent2 if word not in _stopwords]
word_sent2

['By',
 'Christian',
 'DavenportcloseChristian',
 'DavenportReporter',
 'covering',
 'defense',
 'space',
 'industriesEmailEmailBioBioFollowFollowReporterOct',
 '18',
 '2016',
 '4:01',
 'p.m.',
 'EDTThere',
 'lot',
 'rocks',
 'flying',
 'around',
 'space',
 'Lots',
 'debris',
 'Old',
 'satellites',
 'spent',
 'rocket',
 'boosters',
 'even',
 'short',
 'spatula',
 'got',
 'loose',
 'space',
 'shuttle',
 'mission',
 '2006',
 'All',
 'swirling',
 'around',
 'orbit',
 'creating',
 'bit',
 'traffic',
 'jam.Support',
 'journalism',
 'Subscribe',
 'today.arrow-rightFor',
 'years',
 'Pentagon',
 'worried',
 'collisions',
 'might',
 'caused',
 'anestimated',
 '500,000',
 'pieces',
 'debris',
 'taking',
 'enormously',
 'valuable',
 'satellites',
 'turn',
 'creating',
 'even',
 'debris',
 'On',
 'Tuesday',
 'Defense',
 'Departmenttook',
 'another',
 'significant',
 'step',
 'toward',
 'monitoring',
 'cosmic',
 'junk',
 'swirling',
 'around',
 'space',
 'deliveringa',
 'gigantic',
 'new',
 'telesc

#### Extracting a summary

In [54]:
from nltk.probability import FreqDist
freq = FreqDist(word_sent2)
freq

FreqDist({'space': 11, 'debris': 7, 'telescope': 7, 'satellites': 6, 'orbit': 6, 'objects': 6, 'Air': 6, 'Force': 6, 'around': 4, 'small': 4, ...})

In [61]:
print(freq)

<FreqDist with 348 samples and 454 outcomes>


In [62]:
freq.most_common(454)

[('space', 11),
 ('debris', 7),
 ('telescope', 7),
 ('satellites', 6),
 ('orbit', 6),
 ('objects', 6),
 ('Air', 6),
 ('Force', 6),
 ('around', 4),
 ('small', 4),
 ('even', 3),
 ('Pentagon', 3),
 ('another', 3),
 ('far', 3),
 ('Space', 3),
 ('GEO', 3),
 ('important', 3),
 ('used', 3),
 ('lot', 2),
 ('swirling', 2),
 ('creating', 2),
 ('years', 2),
 ('worried', 2),
 ('pieces', 2),
 ('valuable', 2),
 ('Defense', 2),
 ('new', 2),
 ('seeing', 2),
 ('monitor', 2),
 ('away', 2),
 ('point', 2),
 ('That', 2),
 ('satellite', 2),
 ('communications', 2),
 ('thousands', 2),
 ('said', 2),
 ('DARPA', 2),
 ("'s", 2),
 ('area', 2),
 ('technology', 2),
 ('officials', 2),
 ('large', 2),
 ('It', 2),
 ('also', 2),
 ('could', 2),
 ('going', 2),
 ('would', 2),
 ('U.S.', 2),
 ('track', 2),
 ('military', 2),
 ('increasingly', 2),
 ('The', 2),
 ('radar', 2),
 ('become', 2),
 ('world', 2),
 ('By', 1),
 ('Christian', 1),
 ('DavenportcloseChristian', 1),
 ('DavenportReporter', 1),
 ('covering', 1),
 ('defense', 1)

In [63]:
# This is used to start any collection whether it's a dictionary or a list
from heapq import nlargest

In [64]:
nlargest(10, freq, key=freq.get)  # it finds the corresponding value for a given key 

['space',
 'debris',
 'telescope',
 'satellites',
 'orbit',
 'objects',
 'Air',
 'Force',
 'around',
 'small']

In [65]:
# Create a score fro sentences
from collections import defaultdict
ranking = defaultdict(int)

for i, sent in enumerate (sents2):
    for w in word_tokenize(sent.lower()):
        if w in freq:
            ranking[i] += freq[w]
ranking
        

defaultdict(int,
            {0: 13,
             1: 23,
             2: 7,
             3: 30,
             4: 17,
             5: 41,
             6: 125,
             7: 23,
             8: 22,
             9: 13,
             10: 153,
             11: 70,
             12: 133,
             13: 9,
             14: 65,
             15: 39,
             16: 3,
             17: 4})

In [66]:
# Pick the top 4 sentences based on their significance score

sents_idx = nlargest(4, ranking, key=ranking.get)
sents_idx


[10, 12, 6, 11]

In [68]:
[sents2[j] for j in sorted(sents_idx)]

['On Tuesday, the Defense Departmenttook another significant step toward monitoring all of the cosmic junk swirling around in space, by deliveringa gigantic new telescope capable of seeing small objects from very far away.Developed by the Defense Advanced Research Project Agency, the Space Surveillance Telescope was formally transferred to the Air Force during a ceremony at White Sands Missile Base in New Mexico Tuesday.ADADThe telescope is designed to monitor objects as small as softballs, in Geosynchonous orbit (GEO)—some of the most important real estate in space.',
 "But the telescope's ability to see “something very far away over a very wide area is really what it’s best at.”DARPA says the advanced technology in the massive, 90-ton telescope wouldallow officials to go from “seeing only a few large objects at a time through the equivalent of a drinking straw to a windshield view with 10,000 objects at a time.”It is also being used by NASA to monitor asteroids and other near-Earth o

#### Let's put everything together in one function

In [72]:
def summarize(text, n):
    sents = sent_tokenize(text)
    
    assert n <= len(sents) # Check whether the text has required number of sentences
    word_sent = word_tokenize(text.lower())
    _stopwords = set(stopwords.words('english') + list(punctuation))
    word_sent= [word for word in word_sent if word not in _stopwords]
    freq = FreqDist(word_sent)
    
    ranking = defaultdict(int)
    
    for i, sent in enumerate(sents):
        for w in word_tokenize(sent.lower()):
            if w in freq:
                ranking[i] += freq[w]
                
    sents_idx = nlargest(n, ranking, key=ranking.get)
    return [sents[j] for j in sorted (sents_idx)]


In [73]:
summarize(text,4)

['On Tuesday, the Defense Departmenttook another significant step toward monitoring all of the cosmic junk swirling around in space, by deliveringa gigantic new telescope capable of seeing small objects from very far away.Developed by the Defense Advanced Research Project Agency, the Space Surveillance Telescope was formally transferred to the Air Force during a ceremony at White Sands Missile Base in New Mexico Tuesday.ADADThe telescope is designed to monitor objects as small as softballs, in Geosynchonous orbit (GEO)—some of the most important real estate in space.',
 "But the telescope's ability to see “something very far away over a very wide area is really what it’s best at.”DARPA says the advanced technology in the massive, 90-ton telescope wouldallow officials to go from “seeing only a few large objects at a time through the equivalent of a drinking straw to a windshield view with 10,000 objects at a time.”It is also being used by NASA to monitor asteroids and other near-Earth o