In [36]:

import re
import string
import numpy as np
import matplotlib.pyplot as plt
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV
from nltk.corpus import brown, stopwords
from nltk.cluster.util import cosine_distance
from operator import itemgetter
%matplotlib

Using matplotlib backend: TkAgg


In [37]:
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'ignore', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [38]:
sentences = brown.sents('ca05')

In [39]:
sentlist=[' '.join(sent) for sent in sentences]

In [40]:
wholesent=""
for i in sentlist:
    wholesent+=i;
print(wholesent)

East Providence should organize its civil defense setup and begin by appointing a full-time director , Raymond H. Hawksley , the present city CD head , believes .Mr. Hawksley said yesterday he would be willing to go before the city council `` or anyone else locally '' to outline his proposal at the earliest possible time .East Providence now has no civil defense program .Mr. Hawksley , the state's general treasurer , has been a part-time CD director in the city for the last nine years .He is not interested in being named a full-time director .Noting that President Kennedy has handed the Defense Department the major responsibility for the nation's civil defense program , Mr. Hawksley said the federal government would pay half the salary of a full-time local director .He expressed the opinion the city could hire a CD director for about $3,500 a year and would only have to put up half that amount on a matching fund basis to defray the salary costs .Mr. Hawksley said he believed there are 

In [41]:
stop_words = set(stopwords.words("english"))
punctuations = set(string.punctuation)
pos_tags = {
            NOUN: ['NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$', 'WP', 'WP$'],
            VERB: ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
            ADJ: ['JJ', 'JJR', 'JJS'],
            ADV: ['RB', 'RBR', 'RBS', 'WRB']
}

In [42]:
def remove_stop_words(words):
        return [w for w in words if w not in stop_words]

In [43]:
def remove_regex(sent):
        sent = " ".join([w.lower() for w in sent])
        sent = re.sub(r"i'm", "i am", sent)
        sent = re.sub(r"he's", "he is", sent)
        sent = re.sub(r"can't", "cannot", sent)
        sent = re.sub(r"don't", "do not", sent)
        sent = re.sub(r"that's", "that is", sent)
        sent = re.sub(r"\'ve", " have", sent)
        sent = re.sub(r"\'ll", " will", sent)
        sent = re.sub(r"what's", "what is", sent)
        sent = re.sub(r"where's", "where is", sent)
        sent = re.sub(r"\'re", " are", sent)
        sent = re.sub(r"\'d", " would", sent)
        sent = re.sub(r"she's", "she is", sent)
        sent = re.sub(r"won't", "will not", sent)
        patterns = re.finditer("#[\w]*", sent)
        for pattern in patterns:
            sent = re.sub(pattern.group().strip(), "", sent)
        sent = "".join(ch for ch in sent if ch not in punctuations)
        return sent

In [44]:
def posTagging(words):
        tagged_words = pos_tag(words)
        pos_words = []
        for word in tagged_words:
            flag = False
            for key, value in pos_tags.items():
                if word[1] in value:
                    pos_words.append((word[0], key))
                    flag = True
                    break
            if not flag:
                pos_words.append((word[0], NOUN))
        return pos_words

In [45]:
def preprocessData(sentence):
    sentence= remove_regex(sentence)
    words = word_tokenize(sentence)
    cleanedWords = remove_stop_words(words)
    lem = WordNetLemmatizer()
    pos_words = posTagging(words)
    cleanedWords = [lem.lemmatize(w, pos=p) for w, p in pos_words]
    return cleanedWords

In [46]:
def findSentenceSimilarity(s1, s2):
    s1 = preprocessData(s1)
    s2 = preprocessData(s2)
    allWords = list(set(s1 + s2))
    vectorForS1 = [0] * len(allWords)
    vectorForS2 = [0] * len(allWords)
    for word in s1:
        vectorForS1[allWords.index(word)] += 1
    for word in s2:
        vectorForS2[allWords.index(word)] += 1
    return 1 - cosine_distance(vectorForS1, vectorForS2)

In [47]:
def createSimilarityMatrix(sentences):
    matrix = np.zeros((len(sentences), len(sentences)))
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i == j:
                continue
            else:
                matrix[i][j] = findSentenceSimilarity(sentences[i], sentences[j])
    for i in range(len(matrix)):
        matrix[i] /= matrix[i].sum()
    return matrix

In [25]:
SimilarityMatrix = createSimilarityMatrix(sentences)

In [26]:
def pagerank(matrix, eps=1.0e-8, d=0.85):
    N = matrix.shape[1]
    v = np.random.rand(N, 1)
    v = v / np.linalg.norm(v, 1)
    last_v = np.ones((N, 1), dtype=np.float32) * np.inf
    M_hat = (d * matrix) + (((1 - d) / N) * np.ones((N, N), dtype=np.float32))
    
    while np.linalg.norm(v - last_v, 2) > eps:
        last_v = v
        v = np.matmul(M_hat, v)
    return v

In [27]:

ranks = pagerank(SimilarityMatrix)

In [28]:
ranks

array([[0.00104778],
       [0.00138417],
       [0.01249441],
       [0.01941747],
       [0.01068676],
       [0.00717181],
       [0.01299622],
       [0.00398737],
       [0.00558846],
       [0.01996421],
       [0.00797427],
       [0.01896819],
       [0.0204762 ],
       [0.00194573],
       [0.01280512],
       [0.0212577 ],
       [0.00357478],
       [0.00241932],
       [0.01183738],
       [0.0087274 ],
       [0.00871349],
       [0.01223938],
       [0.01393707],
       [0.0219473 ],
       [0.01653646],
       [0.01341724],
       [0.02002164],
       [0.0117618 ],
       [0.00429421],
       [0.01029215],
       [0.00302789],
       [0.02137912],
       [0.01520069],
       [0.00596772],
       [0.01659362],
       [0.00798436],
       [0.00458146],
       [0.01995804],
       [0.00922298],
       [0.02073735],
       [0.01416501],
       [0.01224123],
       [0.00881516],
       [0.0082539 ],
       [0.01020895],
       [0.01401801],
       [0.01713903],
       [0.000

In [29]:
sortRankWithindexes = [item[0] for item in sorted(enumerate(ranks), key=lambda item: -item[1])]

In [30]:
plt.figure(figsize=(20, 10))
plt.bar([item[0] for item in sorted(enumerate(ranks))], ranks.T[0],color='olive',width=0.8)
plt.ylabel("Page Rank / Importance")
plt.xlabel("Sentence No.")
plt.show()

In [31]:
SummaryLines = 5

In [32]:
selectedSentences = sorted(sortRankWithindexes[:SummaryLines])

In [33]:
summary = itemgetter(*selectedSentences)(sentences)

In [34]:
finalSummary=""
for sentence in summary:
    finalSummary+=' '.join(sentence)

In [35]:
finalSummary

"Rhode Island is going to examine its Sunday sales law with possible revisions in mind .`` I would expect the proposed committee to hold public hearings '' , Mr. Notte said , `` to obtain the views of the general public and religious , labor and special-interest groups affected by these laws '' .`` That was before I studied law .Mr. Martinelli has , in recent weeks , been of the opinion that a special town meeting would be called for the vote , while Mr. Bourcier said that a special election might be called instead .Nothing has been done yet to take advantage of the enabling legislation ."