In [21]:
#This program summarizes small texts using the TextRank algorithm. We will be using the extractive form of summarization in this
#program. This program shows a more detailed form of the gensim text rank summarization algorithm by computing the cosine distances

In [22]:
import numpy as np
import pandas as pd
import nltk
from nltk.cluster.util import cosine_distance

In [23]:
#Get the input file
input_file = 'C:\\Users\\HemaRamachandran\\conversation1.txt'

In [24]:
#Open the input file. Remove newline characters
a = open(input_file,'r')
text = a.read().split('\n\n')
text[:4]

['Chairman Wormsley: Each of you has received the agenda. I will entertain a motion that the agenda be approved.',
 'Commissioner Brown: So moved.',
 'Commissioner Hobbs: Seconded',
 'Chairman Wormsley: It has been moved and seconded that the agenda be approved as received by the members. All those in favor signify by saying "Aye"?...Opposed by saying "No"?...The agenda is approved. You have received a copy of the minutes of the last meeting. Are there any corrections or additions to the meeting?']

In [25]:
from nltk.tokenize import sent_tokenize

In [26]:
#separate the sentences into tokens using nltk's sent_tokenize
sentences = []
for s in text:
    sentences.append(sent_tokenize(s))
#print(sentences[:5])


#flatten the sentences
sentences = [y for x in sentences for y in x]
print(sentences[:5])

['Chairman Wormsley: Each of you has received the agenda.', 'I will entertain a motion that the agenda be approved.', 'Commissioner Brown: So moved.', 'Commissioner Hobbs: Seconded', 'Chairman Wormsley: It has been moved and seconded that the agenda be approved as received by the members.']


In [27]:
#remove numerical and special characters
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z:]", " ")
print(clean_sentences[:5])

0    Chairman Wormsley: Each of you has received th...
1    I will entertain a motion that the agenda be a...
2                        Commissioner Brown: So moved 
3                         Commissioner Hobbs: Seconded
4    Chairman Wormsley: It has been moved and secon...
dtype: object


In [28]:
#remove stopwords from the sentences
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]
print(clean_sentences[:5])

['Chairman Wormsley: Each received agenda', 'I entertain motion agenda approved', 'Commissioner Brown: So moved', 'Commissioner Hobbs: Seconded', 'Chairman Wormsley: It moved seconded agenda approved received members']


In [29]:
from sklearn.metrics.pairwise import cosine_similarity
word_embeddings = {}
#open the glove vector dataset to obtain the vector representation of all the words. From these we willl frame the 
#sentence vectors as we read each sentence. We will use the vectors to compute the cosine similarity between sentences 
#the similarity scores will then be used by the Textrank algorithm to compute the rank/score of each sentence. 
f = open('C:\\Users\\HemaRamachandran\\glove.6B\\glove.6B.100d.txt', encoding='utf-8')

In [30]:
#store the vector representation of each word in an array
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:],dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [31]:
#for each sentence, sum up the vectors to arrive at the sentence vector for that sentence
sentence_vectors = []
for i in clean_sentences:
    if len(i) != 0:
        v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
        #print(i.split())
        #print(v)
        #print(i)
    else:
        v = np.zeros((100,))
    sentence_vectors.append(v)

In [32]:
sim_mat = np.zeros([len(clean_sentences),len(clean_sentences)])

In [33]:
#compute cosine similarity between sentence vectors
for i in range(len(clean_sentences)):
      for j in range(len(clean_sentences)):
        if i != j:
              sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

In [34]:
#import the pagerank score computation
import networkx as nx
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [35]:
#sort the ranked sentences on the descending order of scores
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

In [36]:
# Extract top 10 sentences as the summary
for i in range(10):
     print(ranked_sentences[i][1])

As you know, a motion for previous question, if passed by a two-thirds vote, will cut off further debate and require us to vote yes or no on the resolution before us.
(This is sort of a short cut way that is commonly used for approval of minutes and/or the agenda rather than requiring a motion and second.)
You should vote for this motion if you wish to cut off further debate of the wheel tax increase at this point.
This is the first time this resolution is under consideration.)
Chairman Wormsley: It has been properly moved and seconded that a resolution increasing the wheel tax by $10 to make up the state cut in education funding be passed.
Commissioner Hayes: I move previous question.
This resolution calls for the increases to go to the general fund.
Commissioner Adkins: Each of you has previously received a copy of a resolution to increase the wheel tax by $10 to make up the state cut in education funding.
Will all those in favor of previous question please raise your hand?
Chairman 

In [None]:
#end of program