## Importing modules 

In [10]:
# Import necessary libraries for text clustering
# Import stopwords to remove common words that may not be useful for clustering
from nltk.corpus import stopwords 
# Import cosine_distance function to calculate cosine distance between vectors
from nltk.cluster.util import cosine_distance  
# Import numpy for numerical operations
import numpy as np  
# Import networkx for graph-based operations
import networkx as nx 

## Text 1

## Read the dataset

In [17]:
import re
from nltk.tokenize import word_tokenize

file_path = "C:\\Users\\pbhar\\Downloads\\Text1.txt"
with open(file_path, "r") as file:
    filedata = file.readline()  # Read only the first line

article = filedata.split(". ")  # Split into sentences

sentences = []
for sentence in article:
    print(sentence)
    cleaned_sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)  # Remove non-alphabetic characters
    words = word_tokenize(cleaned_sentence.lower())  # Tokenize the cleaned sentence into words
    sentences.append(words)

It was the best of times
It was the worst of times
It was the age of wisdom
It was the age of foolishness
What is the importance of age
This is the best example.


## list of sentences

In [10]:
print("Sentences are ", sentences)

Sentences are  [['It', 'was', 'the', 'best', 'of', 'times'], ['It', 'was', 'the', 'worst', 'of', 'times'], ['It', 'was', 'the', 'age', 'of', 'wisdom'], ['It', 'was', 'the', 'age', 'of', 'foolishness'], ['What', 'is', 'the', 'importance', 'of', 'age'], ['This', 'is', 'the', 'best', 'example.']]


## Function to caluculate similarity

In [11]:
def sentence_similarity(sent1, sent2 ):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in sent1:
        vector1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in sent2:
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)

## Create The similarity Matrix

In [12]:
similarity_matrix = np.zeros((len(sentences), len(sentences)))

for idx1 in range(len(sentences)):
    for idx2 in range(len(sentences)):
        if idx1 == idx2: #ignore if both are same sentences
            continue
        similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1],
sentences[idx2])

print("Smilarity matrix \n", similarity_matrix)

Smilarity matrix 
 [[0.         0.83333333 0.66666667 0.66666667 0.33333333 0.36514837]
 [0.83333333 0.         0.66666667 0.66666667 0.33333333 0.18257419]
 [0.66666667 0.66666667 0.         0.83333333 0.5        0.18257419]
 [0.66666667 0.66666667 0.83333333 0.         0.5        0.18257419]
 [0.33333333 0.33333333 0.5        0.5        0.         0.36514837]
 [0.36514837 0.18257419 0.18257419 0.18257419 0.36514837 0.        ]]


## Rank sentences in Similarity matrix

In [13]:
# Step 3 - Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)
print("scores", scores)

scores {0: 0.19306449754902083, 1: 0.18095893645850156, 2: 0.1911855225552033, 3: 0.1911855225552033, 4: 0.14434636291527997, 5: 0.09925915796679063}


## Sort Sentences by pagerank

In [14]:
# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in
enumerate(sentences)), reverse=True)
print("Indexes of top ranked_sentence order are \n\n",
ranked_sentence)

Indexes of top ranked_sentence order are 

 [(0.19306449754902083, ['It', 'was', 'the', 'best', 'of', 'times']), (0.1911855225552033, ['It', 'was', 'the', 'age', 'of', 'wisdom']), (0.1911855225552033, ['It', 'was', 'the', 'age', 'of', 'foolishness']), (0.18095893645850156, ['It', 'was', 'the', 'worst', 'of', 'times']), (0.14434636291527997, ['What', 'is', 'the', 'importance', 'of', 'age']), (0.09925915796679063, ['This', 'is', 'the', 'best', 'example.'])]


## Pick the top 'n' sentences

In [15]:
#Step 5 - How many sentences to pick
n = int(input("How many sentences do you want in the summary? "))
#n=2
summarize_text = []
for i in range(n):
    summarize_text.append(" ".join(ranked_sentence[i][1]))

How many sentences do you want in the summary?  4


## summary text

In [21]:
# Step 6 - Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
 It was the best of times. It was the age of wisdom. It was the age of foolishness. It was the worst of times


## Text 2

## Read the dataset

In [22]:
file = open("C:\\Users\\pbhar\\Downloads\\Text2.txt", "r")
#This file contains one paragraph of multiple sentences
filedata = file.readlines()
article = filedata[0].split(". ") #Just do the first paragraph

sentences = []
for sentence in article:
    print(sentence)
    sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))

I AM SAM
I AM SAM
SAM I AM
THAT SAM-I-AM! THAT SAM-I-AM!
I DO NOT LIKE THAT SAM-I-AM!
DO WOULD YOU LIKE GREEN EGGS AND HAM?
I DO NOT LIKE THEM, SAM-I-AM
I DO NOT LIKE GREEN EGGS AND HAM
WOULD YOU LIKE THEM HERE OR THERE?
I WOULD NOT LIKE THEM HERE OR THERE
I WOULD NOT LIKE THEM ANYWHERE
I DO NOT LIKE GREEN EGGS AND HAM
I DO NOT LIKE THEM, SAM-I-AM
WOULD YOU LIKE THEM IN A HOUSE?
WOULD YOU LIKE THEN WITH A MOUSE?
I DO NOT LIKE THEM IN A HOUSE
I DO NOT LIKE THEM WITH A MOUSE
I DO NOT LIKE THEM HERE OR THERE
I DO NOT LIKE THEM ANYWHERE
I DO NOT LIKE GREEN EGGS AND HAM
I DO NOT LIKE THEM, SAM-I-AM.



## list of sentences

In [23]:
print("Sentences are ", sentences)

Sentences are  [['I', 'AM', 'SAM'], ['I', 'AM', 'SAM'], ['SAM', 'I', 'AM'], ['THAT', 'SAM-I-AM!', 'THAT', 'SAM-I-AM!'], ['I', 'DO', 'NOT', 'LIKE', 'THAT', 'SAM-I-AM!'], ['DO', 'WOULD', 'YOU', 'LIKE', 'GREEN', 'EGGS', 'AND', 'HAM?'], ['I', 'DO', 'NOT', 'LIKE', 'THEM,', 'SAM-I-AM'], ['I', 'DO', 'NOT', 'LIKE', 'GREEN', 'EGGS', 'AND', 'HAM'], ['WOULD', 'YOU', 'LIKE', 'THEM', 'HERE', 'OR', 'THERE?'], ['I', 'WOULD', 'NOT', 'LIKE', 'THEM', 'HERE', 'OR', 'THERE'], ['I', 'WOULD', 'NOT', 'LIKE', 'THEM', 'ANYWHERE'], ['I', 'DO', 'NOT', 'LIKE', 'GREEN', 'EGGS', 'AND', 'HAM'], ['I', 'DO', 'NOT', 'LIKE', 'THEM,', 'SAM-I-AM'], ['WOULD', 'YOU', 'LIKE', 'THEM', 'IN', 'A', 'HOUSE?'], ['WOULD', 'YOU', 'LIKE', 'THEN', 'WITH', 'A', 'MOUSE?'], ['I', 'DO', 'NOT', 'LIKE', 'THEM', 'IN', 'A', 'HOUSE'], ['I', 'DO', 'NOT', 'LIKE', 'THEM', 'WITH', 'A', 'MOUSE'], ['I', 'DO', 'NOT', 'LIKE', 'THEM', 'HERE', 'OR', 'THERE'], ['I', 'DO', 'NOT', 'LIKE', 'THEM', 'ANYWHERE'], ['I', 'DO', 'NOT', 'LIKE', 'GREEN', 'EGGS', 'AN

In [24]:
def sentence_similarity(sent1, sent2 ):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in sent1:
        vector1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in sent2:
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)

In [25]:
similarity_matrix = np.zeros((len(sentences), len(sentences)))

for idx1 in range(len(sentences)):
    for idx2 in range(len(sentences)):
        if idx1 == idx2: #ignore if both are same sentences
            continue
        similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1],
sentences[idx2])

print("Smilarity matrix \n", similarity_matrix)

Smilarity matrix 
 [[0.         1.         1.         0.         0.23570226 0.
  0.23570226 0.20412415 0.         0.20412415 0.23570226 0.20412415
  0.23570226 0.         0.         0.20412415 0.20412415 0.20412415
  0.23570226 0.20412415 0.23570226]
 [1.         0.         1.         0.         0.23570226 0.
  0.23570226 0.20412415 0.         0.20412415 0.23570226 0.20412415
  0.23570226 0.         0.         0.20412415 0.20412415 0.20412415
  0.23570226 0.20412415 0.23570226]
 [1.         1.         0.         0.         0.23570226 0.
  0.23570226 0.20412415 0.         0.20412415 0.23570226 0.20412415
  0.23570226 0.         0.         0.20412415 0.20412415 0.20412415
  0.23570226 0.20412415 0.23570226]
 [0.         0.         0.         0.         0.57735027 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.23570226 0.23570226 0.23570226 0.57735027 0.         0.28867513
 

In [26]:
# Step 3 - Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)
print("scores", scores)

scores {0: 0.03492638917817515, 1: 0.03492638917817515, 2: 0.03492638917817515, 3: 0.010552425887368076, 4: 0.06321847556853293, 5: 0.03960609463088077, 6: 0.057289451630209653, 7: 0.056226314678287335, 8: 0.03486412173792891, 9: 0.053338934277549654, 10: 0.0566062415473087, 11: 0.056226314678287335, 12: 0.05728945163020967, 13: 0.0350182718613405, 14: 0.02940805086606966, 15: 0.056983599690836606, 16: 0.05702071446687991, 17: 0.057531865612924, 18: 0.06143601920162538, 19: 0.056226314678287335, 20: 0.05637816982094812}


In [27]:
# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in
enumerate(sentences)), reverse=True)
print("Indexes of top ranked_sentence order are \n\n",
ranked_sentence)

Indexes of top ranked_sentence order are 

 [(0.06321847556853293, ['I', 'DO', 'NOT', 'LIKE', 'THAT', 'SAM-I-AM!']), (0.06143601920162538, ['I', 'DO', 'NOT', 'LIKE', 'THEM', 'ANYWHERE']), (0.057531865612924, ['I', 'DO', 'NOT', 'LIKE', 'THEM', 'HERE', 'OR', 'THERE']), (0.05728945163020967, ['I', 'DO', 'NOT', 'LIKE', 'THEM,', 'SAM-I-AM']), (0.057289451630209653, ['I', 'DO', 'NOT', 'LIKE', 'THEM,', 'SAM-I-AM']), (0.05702071446687991, ['I', 'DO', 'NOT', 'LIKE', 'THEM', 'WITH', 'A', 'MOUSE']), (0.056983599690836606, ['I', 'DO', 'NOT', 'LIKE', 'THEM', 'IN', 'A', 'HOUSE']), (0.0566062415473087, ['I', 'WOULD', 'NOT', 'LIKE', 'THEM', 'ANYWHERE']), (0.05637816982094812, ['I', 'DO', 'NOT', 'LIKE', 'THEM,', 'SAM-I-AM.\n']), (0.056226314678287335, ['I', 'DO', 'NOT', 'LIKE', 'GREEN', 'EGGS', 'AND', 'HAM']), (0.056226314678287335, ['I', 'DO', 'NOT', 'LIKE', 'GREEN', 'EGGS', 'AND', 'HAM']), (0.056226314678287335, ['I', 'DO', 'NOT', 'LIKE', 'GREEN', 'EGGS', 'AND', 'HAM']), (0.053338934277549654, ['I', 

In [28]:
#Step 5 - How many sentences to pick
n = int(input("How many sentences do you want in the summary? "))
#n=2
summarize_text = []
for i in range(n):
    summarize_text.append(" ".join(ranked_sentence[i][1]))

How many sentences do you want in the summary?  5


In [29]:
# Step 6 - Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
 I DO NOT LIKE THAT SAM-I-AM!. I DO NOT LIKE THEM ANYWHERE. I DO NOT LIKE THEM HERE OR THERE. I DO NOT LIKE THEM, SAM-I-AM. I DO NOT LIKE THEM, SAM-I-AM


## Text 3

## Read the dataset

In [32]:
file = open("C:\\Users\\pbhar\\Downloads\\Text3.txt", "r")
#This file contains one paragraph of multiple sentences
filedata = file.readlines()
article = filedata[0].split(".") #Just do the first paragraph

sentences = []
for sentence in article:
    print(sentence)
    sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))

As an institution of higher learning, Sacred Heart University places special emphasis on academic integrity, which is a commitment to the fundamental values of honesty, trust, fairness, respect, and responsibility
 Only when these values are widely respected and practiced by all members of the University students, faculty, administrators, and staff can the University maintain a culture that promotes free exploration of knowledge, constructive debate, genuine learning, effective research, fair assessment of student progress, and development of members characters
 These aims of the University require that its members exercise mutual responsibilities
 At its core, academic integrity is secured by a principled commitment to carry out these responsibilities, not by rules and penalties
 Students and faculty should strive to create an academic environment that is honest, fair, and respectful of all
 They do this by evaluating others work fairly, by responding to others ideas critically yet co

## list of sentences

In [33]:
print("Sentences are ", sentences)

Sentences are  [['As', 'an', 'institution', 'of', 'higher', 'learning,', 'Sacred', 'Heart', 'University', 'places', 'special', 'emphasis', 'on', 'academic', 'integrity,', 'which', 'is', 'a', 'commitment', 'to', 'the', 'fundamental', 'values', 'of', 'honesty,', 'trust,', 'fairness,', 'respect,', 'and', 'responsibility'], ['', 'Only', 'when', 'these', 'values', 'are', 'widely', 'respected', 'and', 'practiced', 'by', 'all', 'members', 'of', 'the', 'University', 'students,', 'faculty,', 'administrators,', 'and', 'staff', 'can', 'the', 'University', 'maintain', 'a', 'culture', 'that', 'promotes', 'free', 'exploration', 'of', 'knowledge,', 'constructive', 'debate,', 'genuine', 'learning,', 'effective', 'research,', 'fair', 'assessment', 'of', 'student', 'progress,', 'and', 'development', 'of', 'members', 'characters'], ['', 'These', 'aims', 'of', 'the', 'University', 'require', 'that', 'its', 'members', 'exercise', 'mutual', 'responsibilities'], ['', 'At', 'its', 'core,', 'academic', 'integr

In [34]:
def sentence_similarity(sent1, sent2 ):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in sent1:
        vector1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in sent2:
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)
    

In [35]:
similarity_matrix = np.zeros((len(sentences), len(sentences)))

for idx1 in range(len(sentences)):
    for idx2 in range(len(sentences)):
        if idx1 == idx2: #ignore if both are same sentences
            continue
        similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1],
sentences[idx2])

print("Smilarity matrix \n", similarity_matrix)

Smilarity matrix 
 [[0.         0.37242265 0.19611614 0.21650635 0.3086067  0.25103951
  0.30175492 0.32868787 0.25802342 0.24748737 0.29543947 0.        ]
 [0.37242265 0.         0.42199786 0.19112739 0.33202614 0.37774868
  0.33810162 0.27696991 0.24404766 0.32771521 0.34774393 0.11704115]
 [0.19611614 0.42199786 0.         0.16984156 0.18156826 0.17902872
  0.25492496 0.18752289 0.23132597 0.33282012 0.2575131  0.2773501 ]
 [0.21650635 0.19112739 0.16984156 0.         0.26726124 0.36893239
  0.18761969 0.20701967 0.17025131 0.04082483 0.15161961 0.20412415]
 [0.3086067  0.33202614 0.18156826 0.26726124 0.         0.28171808
  0.28653413 0.29508445 0.1820063  0.13093073 0.28365431 0.21821789]
 [0.25103951 0.37774868 0.17902872 0.36893239 0.28171808 0.
  0.28817699 0.24003968 0.21535276 0.20655911 0.23973165 0.12909944]
 [0.30175492 0.33810162 0.25492496 0.18761969 0.28653413 0.28817699
  0.         0.33292257 0.27379284 0.34139673 0.39012788 0.13130643]
 [0.32868787 0.27696991 0.1875

In [36]:
# Step 3 - Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)
print("scores", scores)

scores {0: 0.08231531175473109, 1: 0.09715692609711293, 2: 0.08062288488440884, 3: 0.06759377895084334, 4: 0.08259976668074122, 5: 0.08281549444786611, 6: 0.091460912017107, 7: 0.09337733676577872, 8: 0.07967858671934032, 9: 0.08553542711220241, 10: 0.09782645061550886, 11: 0.05901712395435903}


In [37]:
# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in
enumerate(sentences)), reverse=True)
print("Indexes of top ranked_sentence order are \n\n",
ranked_sentence)

Indexes of top ranked_sentence order are 

 [(0.09782645061550886, ['', 'All', 'matriculated', 'students', 'will', 'be', 'provided', 'with', 'a', 'full', 'description', 'of', 'the', 'University', 'standards', 'for', 'academic', 'integrity,', 'consequences', 'for', 'violations,', 'and', 'the', 'appeals', 'procedure']), (0.09715692609711293, ['', 'Only', 'when', 'these', 'values', 'are', 'widely', 'respected', 'and', 'practiced', 'by', 'all', 'members', 'of', 'the', 'University', 'students,', 'faculty,', 'administrators,', 'and', 'staff', 'can', 'the', 'University', 'maintain', 'a', 'culture', 'that', 'promotes', 'free', 'exploration', 'of', 'knowledge,', 'constructive', 'debate,', 'genuine', 'learning,', 'effective', 'research,', 'fair', 'assessment', 'of', 'student', 'progress,', 'and', 'development', 'of', 'members', 'characters']), (0.09337733676577872, ['', 'Faculty', 'will', 'assign', 'failing', 'grades', 'for', 'violations', 'of', 'the', 'University', 'policy', 'on', 'academic', '

In [38]:
#Step 5 - How many sentences to pick
n = int(input("How many sentences do you want in the summary? "))
#n=2
summarize_text = []
for i in range(n):
    summarize_text.append(" ".join(ranked_sentence[i][1]))

How many sentences do you want in the summary?  3


In [39]:
# Step 6 - Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
  All matriculated students will be provided with a full description of the University standards for academic integrity, consequences for violations, and the appeals procedure.  Only when these values are widely respected and practiced by all members of the University students, faculty, administrators, and staff can the University maintain a culture that promotes free exploration of knowledge, constructive debate, genuine learning, effective research, fair assessment of student progress, and development of members characters.  Faculty will assign failing grades for violations of the University policy on academic integrity and students may immediately receive an F for a course in which they commit a violation


## Text-4 

## Read the dataset

In [40]:
file = open("C:\\Users\\pbhar\\Downloads\\Text4.txt", "r")
#This file contains one paragraph of multiple sentences
filedata = file.readlines()
article = filedata[0].split(".") #Just do the first paragraph

sentences = []
for sentence in article:
    print(sentence)
    sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))

Imagine there's no heaven
 It's easy if you try
 No hell below us
 Above us, only sky
 Imagine all the people livin' for today
 Imagine there's no countries
 It isn't hard to do
 Nothing to kill or die for and no religion, too
 Imagine all the people livin' life in peace
 You may say I'm a dreamer but I'm not the only one
 I hope someday you'll join us and the world will be as one
 Imagine no possessions
 I wonder if you can
 No need for greed or hunger
 A brotherhood of man
 Imagine all the people sharing all the world
 



## list of sentences

In [41]:
print("Sentences are ", sentences)

Sentences are  [['Imagine', "there's", 'no', 'heaven'], ['', "It's", 'easy', 'if', 'you', 'try'], ['', 'No', 'hell', 'below', 'us'], ['', 'Above', 'us,', 'only', 'sky'], ['', 'Imagine', 'all', 'the', 'people', "livin'", 'for', 'today'], ['', 'Imagine', "there's", 'no', 'countries'], ['', 'It', "isn't", 'hard', 'to', 'do'], ['', 'Nothing', 'to', 'kill', 'or', 'die', 'for', 'and', 'no', 'religion,', 'too'], ['', 'Imagine', 'all', 'the', 'people', "livin'", 'life', 'in', 'peace'], ['', 'You', 'may', 'say', "I'm", 'a', 'dreamer', 'but', "I'm", 'not', 'the', 'only', 'one'], ['', 'I', 'hope', 'someday', "you'll", 'join', 'us', 'and', 'the', 'world', 'will', 'be', 'as', 'one'], ['', 'Imagine', 'no', 'possessions'], ['', 'I', 'wonder', 'if', 'you', 'can'], ['', 'No', 'need', 'for', 'greed', 'or', 'hunger'], ['', 'A', 'brotherhood', 'of', 'man'], ['', 'Imagine', 'all', 'the', 'people', 'sharing', 'all', 'the', 'world'], ['', '\n']]


In [42]:
def sentence_similarity(sent1, sent2 ):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in sent1:
        vector1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in sent2:
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)
    

In [43]:
similarity_matrix = np.zeros((len(sentences), len(sentences)))

for idx1 in range(len(sentences)):
    for idx2 in range(len(sentences)):
        if idx1 == idx2: #ignore if both are same sentences
            continue
        similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1],
sentences[idx2])

print("Smilarity matrix \n", similarity_matrix)

Smilarity matrix 
 [[0.         0.         0.2236068  0.         0.1767767  0.67082039
  0.         0.15075567 0.16666667 0.         0.         0.5
  0.         0.18898224 0.         0.13867505 0.        ]
 [0.         0.         0.18257419 0.18257419 0.14433757 0.18257419
  0.16666667 0.12309149 0.13608276 0.21081851 0.10910895 0.20412415
  0.5        0.15430335 0.18257419 0.1132277  0.28867513]
 [0.2236068  0.18257419 0.         0.2        0.15811388 0.4
  0.18257419 0.26967994 0.1490712  0.11547005 0.23904572 0.4472136
  0.18257419 0.3380617  0.2        0.12403473 0.31622777]
 [0.         0.18257419 0.2        0.         0.15811388 0.2
  0.18257419 0.13483997 0.1490712  0.23094011 0.11952286 0.2236068
  0.18257419 0.16903085 0.2        0.12403473 0.31622777]
 [0.1767767  0.14433757 0.15811388 0.15811388 0.         0.31622777
  0.14433757 0.21320072 0.70710678 0.18257419 0.18898224 0.35355339
  0.14433757 0.26726124 0.15811388 0.68640647 0.25      ]
 [0.67082039 0.18257419 0.4       

In [44]:
# Step 3 - Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)
print("scores", scores)

scores {0: 0.04103754840165519, 1: 0.05216674823773841, 2: 0.06476461328390501, 3: 0.05042552394376271, 4: 0.07232546656110307, 5: 0.07957594803435018, 6: 0.04721661353225302, 7: 0.05462145172863904, 8: 0.0657679173825595, 9: 0.046194052816998114, 10: 0.04611265198711134, 11: 0.08291851317376232, 12: 0.05391673253998026, 13: 0.06064416531313606, 14: 0.05042552394376271, 15: 0.0626103810541617, 16: 0.06927614806512122}


In [45]:
# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in
enumerate(sentences)), reverse=True)
print("Indexes of top ranked_sentence order are \n\n",
ranked_sentence)

Indexes of top ranked_sentence order are 

 [(0.08291851317376232, ['', 'Imagine', 'no', 'possessions']), (0.07957594803435018, ['', 'Imagine', "there's", 'no', 'countries']), (0.07232546656110307, ['', 'Imagine', 'all', 'the', 'people', "livin'", 'for', 'today']), (0.06927614806512122, ['', '\n']), (0.0657679173825595, ['', 'Imagine', 'all', 'the', 'people', "livin'", 'life', 'in', 'peace']), (0.06476461328390501, ['', 'No', 'hell', 'below', 'us']), (0.0626103810541617, ['', 'Imagine', 'all', 'the', 'people', 'sharing', 'all', 'the', 'world']), (0.06064416531313606, ['', 'No', 'need', 'for', 'greed', 'or', 'hunger']), (0.05462145172863904, ['', 'Nothing', 'to', 'kill', 'or', 'die', 'for', 'and', 'no', 'religion,', 'too']), (0.05391673253998026, ['', 'I', 'wonder', 'if', 'you', 'can']), (0.05216674823773841, ['', "It's", 'easy', 'if', 'you', 'try']), (0.05042552394376271, ['', 'Above', 'us,', 'only', 'sky']), (0.05042552394376271, ['', 'A', 'brotherhood', 'of', 'man']), (0.047216613532

In [48]:
#Step 5 - How many sentences to pick
n = int(input("How many sentences do you want in the summary? "))
#n=2
summarize_text = []
for i in range(n):
    summarize_text.append(" ".join(ranked_sentence[i][1]))

How many sentences do you want in the summary?  3


In [49]:
# Step 6 - Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
  Imagine no possessions.  Imagine there's no countries.  Imagine all the people livin' for today


## Text 5

## Read the dataset

In [50]:
file = open("C:\\Users\\pbhar\\Downloads\\Text5.txt", "r")
#This file contains one paragraph of multiple sentences
filedata = file.readlines()
article = filedata[0].split(".") #Just do the first paragraph

sentences = []
for sentence in article:
    print(sentence)
    sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))

In an attempt to build an AI-ready workforce, Microsoft announced Intelligent Cloud Hub which has been launched to empower the next generation of students with AI-ready skills
 Envisioned as a three-year collaborative program, Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services
 As part of the program, the Redmond giant which wants to expand its reach and is planning to build a strong developer ecosystem in India with the program will set up the core AI infrastructure and IoT Hub for the selected campuses
 The company will provide AI development tools and Azure AI services such as Microsoft Cognitive Services, Bot Services and Azure Machine Learning
According to Manish Prakash, Country General Manager-PS, Health and Education, Microsoft India, said, "With AI being the defining technology of our time, it is transIn an attempt to build an AI

In [51]:
print("Sentences are ", sentences)

Sentences are  [['In', 'an', 'attempt', 'to', 'build', 'an', 'AI-ready', 'workforce,', 'Microsoft', 'announced', 'Intelligent', 'Cloud', 'Hub', 'which', 'has', 'been', 'launched', 'to', 'empower', 'the', 'next', 'generation', 'of', 'students', 'with', 'AI-ready', 'skills'], ['', 'Envisioned', 'as', 'a', 'three-year', 'collaborative', 'program,', 'Intelligent', 'Cloud', 'Hub', 'will', 'support', 'around', '100', 'institutions', 'with', 'AI', 'infrastructure,', 'course', 'content', 'and', 'curriculum,', 'developer', 'support,', 'development', 'tools', 'and', 'give', 'students', 'access', 'to', 'cloud', 'and', 'AI', 'services'], ['', 'As', 'part', 'of', 'the', 'program,', 'the', 'Redmond', 'giant', 'which', 'wants', 'to', 'expand', 'its', 'reach', 'and', 'is', 'planning', 'to', 'build', 'a', 'strong', 'developer', 'ecosystem', 'in', 'India', 'with', 'the', 'program', 'will', 'set', 'up', 'the', 'core', 'AI', 'infrastructure', 'and', 'IoT', 'Hub', 'for', 'the', 'selected', 'campuses'], [''

In [52]:
def sentence_similarity(sent1, sent2 ):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in sent1:
        vector1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in sent2:
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)
    

In [53]:
imilarity_matrix = np.zeros((len(sentences), len(sentences)))

for idx1 in range(len(sentences)):
    for idx2 in range(len(sentences)):
        if idx1 == idx2: #ignore if both are same sentences
            continue
        similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1],
sentences[idx2])

print("Smilarity matrix \n", similarity_matrix)

Smilarity matrix 
 [[0.         0.20759972 0.31900456 0.06253054 0.78687791 0.20759972
  0.31900456 0.06253054 0.17588162 0.04652421 0.15569979 0.36710862
  0.31333978 0.2731155  0.32663729 0.18993429 0.        ]
 [0.20759972 0.         0.32781502 0.45515762 0.25496724 1.
  0.32781502 0.45515762 0.30123204 0.43825049 0.33333333 0.14509525
  0.20869968 0.1754116  0.32274861 0.22771002 0.1490712 ]
 [0.31900456 0.32781502 0.         0.30719192 0.3880597  0.32781502
  1.         0.30719192 0.42320737 0.26120926 0.19122543 0.63419203
  0.46424389 0.45522859 0.42320737 0.23993603 0.12216944]
 [0.06253054 0.45515762 0.30719192 0.         0.17553824 0.45515762
  0.30719192 1.         0.31108551 0.38401229 0.20080483 0.20395079
  0.1796053  0.31701148 0.25923792 0.19596545 0.1796053 ]
 [0.78687791 0.25496724 0.3880597  0.17553824 0.         0.25496724
  0.3880597  0.17553824 0.61717741 0.13060463 0.21854335 0.43600702
  0.34207444 0.26355339 0.35267281 0.23993603 0.        ]
 [0.20759972 1.    

In [54]:
# Step 3 - Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)
print("scores", scores)

scores {0: 0.0496663425004684, 1: 0.06648612192426775, 2: 0.07552550251202063, 3: 0.06281058990603468, 4: 0.06298537632731369, 5: 0.06648612192426775, 6: 0.07552550251202063, 7: 0.06281058990603468, 8: 0.06084799196806396, 9: 0.05396466893734665, 10: 0.045966797906720466, 11: 0.06314250045762816, 12: 0.05405088114863601, 13: 0.05639425115489125, 14: 0.06290681216880517, 15: 0.048693837767049004, 16: 0.03173611097843138}


In [55]:
# Step 4 - Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in
enumerate(sentences)), reverse=True)
print("Indexes of top ranked_sentence order are \n\n",
ranked_sentence)

Indexes of top ranked_sentence order are 

 [(0.07552550251202063, ['', 'As', 'part', 'of', 'the', 'program,', 'the', 'Redmond', 'giant', 'which', 'wants', 'to', 'expand', 'its', 'reach', 'and', 'is', 'planning', 'to', 'build', 'a', 'strong', 'developer', 'ecosystem', 'in', 'India', 'with', 'the', 'program', 'will', 'set', 'up', 'the', 'core', 'AI', 'infrastructure', 'and', 'IoT', 'Hub', 'for', 'the', 'selected', 'campuses']), (0.07552550251202063, ['', 'As', 'part', 'of', 'the', 'program,', 'the', 'Redmond', 'giant', 'which', 'wants', 'to', 'expand', 'its', 'reach', 'and', 'is', 'planning', 'to', 'build', 'a', 'strong', 'developer', 'ecosystem', 'in', 'India', 'with', 'the', 'program', 'will', 'set', 'up', 'the', 'core', 'AI', 'infrastructure', 'and', 'IoT', 'Hub', 'for', 'the', 'selected', 'campuses']), (0.06648612192426775, ['', 'Envisioned', 'as', 'a', 'three-year', 'collaborative', 'program,', 'Intelligent', 'Cloud', 'Hub', 'will', 'support', 'around', '100', 'institutions', 'with

In [56]:
#Step 5 - How many sentences to pick
n = int(input("How many sentences do you want in the summary? "))
#n=2
summarize_text = []
for i in range(n):
    summarize_text.append(" ".join(ranked_sentence[i][1]))

How many sentences do you want in the summary?  4


In [57]:
# Step 6 - Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
  As part of the program, the Redmond giant which wants to expand its reach and is planning to build a strong developer ecosystem in India with the program will set up the core AI infrastructure and IoT Hub for the selected campuses.  As part of the program, the Redmond giant which wants to expand its reach and is planning to build a strong developer ecosystem in India with the program will set up the core AI infrastructure and IoT Hub for the selected campuses.  Envisioned as a three-year collaborative program, Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services.  Envisioned as a three-year collaborative program, Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services
