In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

In [2]:
def read_article(document):
  file = open(document,'r')
  filedata = file.readlines()
  #print(filedata)
  article = filedata[0].split(".")
  #print(article)
  sentences = []
  for sentence in article:
        #print(sentence)
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
  sentences.pop()

  return sentences

In [9]:
print(read_article("/content/trump.txt"))

[['WASHINGTON', '-', 'The', 'Trump', 'administration', 'has', 'ordered', 'the', 'military', 'to', 'start', 'withdrawing', 'roughly', '7,000', 'troops', 'from', 'Afghanistan', 'in', 'the', 'coming', 'months,', 'two', 'defense', 'officials', 'said', 'Thursday,', 'an', 'abrupt', 'shift', 'in', 'the', '17-year-old', 'war', 'there', 'and', 'a', 'decision', 'that', 'stunned', 'Afghan', 'officials,', 'who', 'said', 'they', 'had', 'not', 'been', 'briefed', 'on', 'the', 'plans'], ['President', 'Trump', 'made', 'the', 'decision', 'to', 'pull', 'the', 'troops', '-', 'about', 'half', 'the', 'number', 'the', 'United', 'States', 'has', 'in', 'Afghanistan', 'now', '-', 'at', 'the', 'same', 'time', 'he', 'decided', 'to', 'pull', 'American', 'forces', 'out', 'of', 'Syria,', 'one', 'official', 'said'], ['The', 'announcement', 'came', 'hours', 'after', 'Jim', 'Mattis,', 'the', 'secretary', 'of', 'defense,', 'said', 'that', 'he', 'would', 'resign', 'from', 'his', 'position', 'at', 'the', 'end', 'of', 'Feb

In [5]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    all_words = list(set(sent1 + sent2))

    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1

    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1

    return 1 - cosine_distance(vector1, vector2)

In [12]:
print(sentence_similarity(read_article("/content/trump.txt")[0], read_article("/content/trump.txt")[1], stopwords=None))

0.5484827557301444


In [15]:
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))

    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix

In [13]:
nltk.download("stopwords")
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [16]:
print(build_similarity_matrix(read_article("/content/trump.txt"), stop_words))

[[0.         0.2773501  0.08111071 0.         0.03952847 0.04055536
  0.09128709 0.20833333 0.         0.14173668 0.1677051  0.08838835
  0.05892557 0.09805807 0.22592403 0.04419417 0.09449112 0.04287465
  0.13693064]
 [0.2773501  0.         0.08998425 0.         0.0877058  0.08998425
  0.         0.18490007 0.         0.15724273 0.06201737 0.09805807
  0.13074409 0.05439283 0.25064021 0.09805807 0.15724273 0.14269545
  0.05063697]
 [0.08111071 0.08998425 0.         0.         0.         0.05263158
  0.         0.10814761 0.         0.06131393 0.         0.
  0.07647191 0.12725695 0.04188539 0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.25819889 0.         0.14142136 0.
  0.1490712  0.         0.         0.         0.         0.
  0.        ]
 [0.03952847 0.0877058  0.         0.         0.         0.10259784
  0.         0.05270463 0.12909944 0.11952286 0.         0.1118034
  0.         0.06201737 0.       

In [25]:
def generate_summary(file_name, top_n=5):

    summarize_text = []

    sentences =  read_article(file_name)

    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    #print(sentence_similarity_graph)
    scores = nx.pagerank(sentence_similarity_graph)
    #print(scores)

    #print(((scores[i],s) for i,s in enumerate(sentences)))

    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
    print("Indexes of top ranked_sentence order are ", ranked_sentence)

    for i in range(top_n):
      summarize_text.append(" ".join(ranked_sentence[i][1]))

    print("Summarize Text: \n", ". ".join(summarize_text))


In [26]:
generate_summary("/content/trump.txt", 2)

Indexes of top ranked_sentence order are  [(0.07539020741864103, ['', 'Mattis,', 'he', 'begrudgingly', 'pledged', 'an', 'additional', '4,000', 'troops', 'to', 'the', 'Afghan', 'campaign', 'to', 'try', 'to', 'hasten', 'an', 'end', 'to', 'the', 'conflict']), (0.07521361534475711, ['President', 'Trump', 'made', 'the', 'decision', 'to', 'pull', 'the', 'troops', '-', 'about', 'half', 'the', 'number', 'the', 'United', 'States', 'has', 'in', 'Afghanistan', 'now', '-', 'at', 'the', 'same', 'time', 'he', 'decided', 'to', 'pull', 'American', 'forces', 'out', 'of', 'Syria,', 'one', 'official', 'said']), (0.07445451764762709, ['', 'That', 'air', 'support,', 'officials', 'say,', 'consists', 'mostly', 'of', 'propping', 'up', 'Afghan', 'troops', 'while', 'they', 'try', 'to', 'hold', 'territory', 'from', 'a', 'resurgent', 'Taliban']), (0.07237621565584887, ['', 'The', 'fear', 'that', 'Mr']), (0.07199159190712157, ['WASHINGTON', '-', 'The', 'Trump', 'administration', 'has', 'ordered', 'the', 'military'