The code in this notebook is adapted from https://towardsdatascience.com/text-summarization-in-python-with-jaro-winkler-and-pagerank-72d693da94e8

In [1]:
!pip install jaro-winkler networkx nltk

Collecting jaro-winkler
  Downloading jaro_winkler-2.0.0-py3-none-any.whl (33 kB)
Installing collected packages: jaro-winkler
Successfully installed jaro-winkler-2.0.0


In [3]:
import nltk
# Download the NLTK 'stopwords' package; the stopword list is read from it further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
import networkx as nx
import numpy as np
import jaro

# Import nltk in this cell as well so it does not depend on an earlier cell having run.
import nltk
from nltk.corpus import stopwords

# constants
# NLTK corpora are loaded lazily: importing `stopwords` succeeds even when the data is
# missing, and the LookupError only surfaces on first access. The download fallback
# therefore has to wrap the access itself, not the import.
try:
    sw = list(set(stopwords.words('english')))
except LookupError:
    nltk.download('stopwords')
    sw = list(set(stopwords.words('english')))

# Punctuation stripped from the text. '.' is deliberately absent because sentences
# are later split on '. '.
punct = [
    '!','#','$','%','&','(',')','*',
    '+',',','-','/',':',';','<','=','>','@',
    '[','\\',']','^','_','`','{','|','}','~'
]

# Load the raw text once; download the corpus only when it is actually missing.
try:
    book = nltk.corpus.gutenberg.raw('shakespeare-caesar.txt')
except LookupError:
    nltk.download('gutenberg')
    book = nltk.corpus.gutenberg.raw('shakespeare-caesar.txt')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


In [5]:
# Show only the beginning of the text; displaying all of `book` floods the cell output.
book[:500]



In [6]:
# Length of the raw text held in `book` (presumably a string, so this counts characters)
len(book)

112310

In [7]:
def clean_text(text, sw = sw, punct = punct):
    '''
    This function will clean the input text by lowering it and removing the given
    punctuation strings, new lines and stopwords.
    
    params:
        text (String) : The body of text you want to clean
        sw (List) : The list of stopwords you wish to remove from the input text
        punct (List) : The list of punctuation strings you wish to remove from the input text
        
    returns:
        This function will return a tuple (original_text_mapping, article):
            original_text_mapping (Dict) : maps a sentence index to the sentence text, taken
                                           from the lowered, punctuation-free text split on
                                           '. ' (stopwords are still present in these values)
            article (String) : the cleaned text, stopwords removed and the remaining
                               tokens joined by single spaces
    '''
    article = text.lower()
    
    # clean punctuations
    for pun in punct:
        article = article.replace(pun, '')
    
    # Flatten new lines into spaces. The former .replace("[^a-zA-Z]", " ") call was
    # dropped: str.replace matches literally (it is not a regex) and the pattern holds
    # upper-case letters, so it could never match the lowered text.
    article = article.replace('\r\n', ' ').replace('\n', ' ')
    original_text_mapping = dict(enumerate(article.split('. ')))
    
    # clean stopwords
    # A set gives constant-time membership tests instead of scanning the list per token.
    stop_words = set(sw)
    # Membership is checked on the raw token and stripping happens afterwards, so the
    # selection of tokens is the same as before.
    tokens = [tok.strip() for tok in article.split(' ') if tok not in stop_words]
    article = ' '.join(tok for tok in tokens if tok)

    return original_text_mapping, article
  
original_text_mapping, cleaned_book = clean_text(book)

# get sentences
# Fragments that are only splitting residue and carry no sentence content.
empty_fragments = {'', ' ', '..', '.', '...'}
kept_fragments = [
    (idx, fragment)
    for idx, fragment in enumerate(cleaned_book.split('. '))
    if fragment not in empty_fragments
]
sentences = [fragment for _, fragment in kept_fragments]

# clean_text keys the mapping by position in the unfiltered '. ' split, while dropping
# the empty fragments above shifts the positions in `sentences`. Re-key the mapping to
# the filtered positions so that index i (the graph node id used downstream) refers to
# the same sentence in both. This assumes the cleaned split lines up with the mapping's
# split, which holds as long as no stopword ends in '.'; if an index is missing, the
# cleaned fragment itself is used.
original_text_mapping = {
    new_idx: original_text_mapping.get(old_idx, fragment)
    for new_idx, (old_idx, fragment) in enumerate(kept_fragments)
}
print(len(sentences))

1289


In [8]:
def create_similarity_matrix(sentences):
    '''
    Build the N x N matrix of pairwise Jaro-Winkler scores for N sentences.
    Entry (row, col) holds the score of sentence `row` compared against sentence
    `col`; the diagonal is left at zero.
    
    params:
        sentences (List -> String) : The strings to compare with one another.
     
    returns:
        This function will return a square numpy matrix
    '''
    
    count = len(sentences)
    scores = np.zeros((count, count))

    for row, left in enumerate(sentences):
        for col, right in enumerate(sentences):
            # a sentence is never scored against itself
            if row == col:
                continue
            scores[row, col] = jaro.jaro_winkler_metric(left, right)

    return scores
  
sim_mat = create_similarity_matrix(sentences)

# create network
# nx.from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array builds the same
# weighted graph from a square array.
G = nx.from_numpy_array(sim_mat)

# calculate page rank scores
pr_sentence_similarity = nx.pagerank(G)

# Pair each node's original sentence text with its PageRank score, highest score first.
ranked_sentences = [
    (original_text_mapping[sent], rank)
    for sent, rank in sorted(pr_sentence_similarity.items(), key=lambda item: item[1], reverse=True)
]

print(ranked_sentences[0][0])

the taper burneth in your closet sir searching the window for a flint i found this paper thus seal'd vp and i am sure it did not lye there when i went to bed


In [9]:
def generate_summary(ranked_sentences, N):
    '''
    Build a summary string from the first N entries of a ranked sentence list.
    
    params:
        ranked_sentences (List -> Tuples) : Ranked sentences, each element being a
                                            tuple whose first value is the sentence
                                            and whose second value is the rank
        N (Integer) : How many sentences to take from the front of the list
        
    returns:
        This function will return a string made of the selected sentences joined
        with '. '
    '''
    selected = []
    for entry in ranked_sentences[:N]:
        # only the sentence text is needed, the rank is dropped
        selected.append(entry[0])
    return '. '.join(selected)
  
# Number of top-ranked sentences to include in the summary
N = 25
summary = generate_summary(ranked_sentences, N)
print(summary)

the taper burneth in your closet sir searching the window for a flint i found this paper thus seal'd vp and i am sure it did not lye there when i went to bed. forget not in your speed antonio to touch calphurnia for our elders say the barren touched in this holy chace shake off their sterrile curse     ant. decius well vrg'd i thinke it is not meet marke antony so well belou'd of caesar should outliue caesar we shall finde of him a shrew'd contriuer. this day i breathed first time is come round and where i did begin there shall i end my life is run his compasse. post backe with speede and tell him what hath chanc'd heere is a mourning rome a dangerous rome no rome of safety for octauius yet hie hence and tell him so. publius good cheere there is no harme intended to your person nor to no roman else so tell them publius     cassi. these growing feathers pluckt from caesars wing will make him flye an ordinary pitch who else would soare aboue the view of men and keepe vs all in seruile fe