In [41]:
!pip3 install python-docx
!pip3 install tika
!pip3 install stop_words
!pip3 install continuous_threading



In [42]:
import pandas as pd
import numpy as np
import spacy
import en_core_web_sm
import nltk
import docx
import re
from tika import parser
from stop_words import get_stop_words
import continuous_threading as thread
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
nltk.download('punkt')
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
def getText(filename):
    """Extract the plain text of a PDF or Word document.

    Parameters
    ----------
    filename : str
        Path to the input file. Names ending in ``.pdf`` (any letter case) are
        parsed with Tika; every other file is opened with python-docx.

    Returns
    -------
    str
        For PDFs, the ``'content'`` entry of the Tika result, or an empty
        string when that entry is missing or ``None``. For other files, the
        paragraph texts joined with newlines.
    """
    # Compare the real extension, case-insensitively, so "REPORT.PDF" is routed
    # to Tika and an extension-less name that merely ends in "pdf" is not.
    if filename.lower().endswith(".pdf"):
        raw = parser.from_file(filename)
        # Return "" rather than None/KeyError so callers that immediately run
        # string operations on the result do not fail.
        return raw.get('content') or ''

    doc = docx.Document(filename)
    return '\n'.join(para.text for para in doc.paragraphs)

In [46]:
# Mount Google Drive inside the Colab runtime so the project folder is readable.
from google.colab import drive

path = '/content/drive'
drive.mount(path)

# Project locations on the mounted drive: the working folder, the document to
# summarise, and the directory holding the GloVe embedding files.
root = ''.join((path, '/My Drive/Text Summarization/'))
path_to_file = ''.join((root, "Petition.docx"))
Glove_path = ''.join((root, 'Gloves/'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Load the document and flatten it onto a single line.
raw_text = getText(path_to_file)
raw_text = re.sub(r"\n", " ", raw_text)
# NOTE(review): every newline was replaced just above, so this split yields at
# most one element; in effect the line only converts tabs to spaces.
paras = [i.replace('\t',' ') for i in raw_text.split('\n') if i!='']

inp_to_spacy = " ".join(paras) # create string from paras list
doc = nlp(inp_to_spacy) # spaCy Doc used for sentence segmentation

# Take the sentences from the Doc as plain strings, dropping the ellipsis
# character and any run of two or more dots.
sentences = [
    re.sub("[.][.]+", "", re.sub("…", "", str(sentence)))
    for sentence in doc.sents
]

# Re-parse the cleaned sentences so the tokens reflect the cleaned text.
input_to_spacy = " ".join(sentences)
doc = nlp(input_to_spacy)

# First pass: drop tokens that spaCy's vocabulary marks as stop words.
tokens = [token for token in doc if not nlp.vocab[str(token)].is_stop]

# Second pass: also drop tokens listed by the stop_words package. This filters
# the result of the first pass; restarting from `doc` would discard it.
stop_words = get_stop_words('en')
tokens = [token for token in tokens if str(token) not in stop_words]

# Maps each vocabulary entry to its embedding vector (float32 array).
word_embeddings = {}

dimension_of_embedding = 50
# The embedding file is named after its dimensionality, e.g. "50.txt".
Glove = Glove_path + str(dimension_of_embedding) + '.txt'

# `with` closes the file even if a line fails to parse.
with open(Glove, encoding = 'utf-8') as f:
    for line in f:
        # Each line is parsed as: token followed by its coefficients.
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32')
        word_embeddings[word] = coefs

In [50]:
# Spot-check the loaded table by displaying the vector stored for "by".
word_embeddings.get("by")

array([ 0.35215 , -0.35603 ,  0.25708 , -0.10611 , -0.20718 ,  0.63596 ,
       -1.0129  , -0.45964 , -0.48749 , -0.080555,  0.43769 ,  0.46046 ,
       -0.80943 , -0.23336 ,  0.46623 , -0.10866 , -0.1221  , -0.63544 ,
       -0.73486 , -0.24848 ,  0.4317  ,  0.092264,  0.52033 , -0.46784 ,
        0.016798, -1.5124  , -0.19986 , -0.43351 , -0.59247 ,  0.18088 ,
        3.5194  , -0.7024  ,  0.23613 , -0.68514 , -0.37009 , -0.080451,
        0.10635 , -0.085495, -0.18451 ,  0.29771 ,  0.18123 ,  0.53627 ,
       -0.1001  , -0.55165 ,  0.098833, -0.12942 , -0.82628 , -0.4329  ,
       -0.10301 , -0.56079 ], dtype=float32)

In [0]:
# Collapse every run of whitespace inside each sentence to a single space
# (this also trims leading/trailing whitespace), updating the list in place.
sentences[:] = [" ".join(text.split()) for text in sentences]

In [0]:
# create sentence vectors
sentence_vectors = []
for i in sentences:
    a = 0
    if len(i) != 0:
        length = len(i.split())
        v = sum( [word_embeddings.get(i[a], np.zeros((dimension_of_embedding,))) for a in range(length)] ) / (length+0.001)
    else:
        v = np.zeros((dimension_of_embedding,))
    sentence_vectors.append(v)

In [56]:
# Display the computed sentence vectors for inspection.
sentence_vectors

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([-0.01461546,  0.43928691, -0.01237921,  0.28483173,  0.0406931 ,
        -0.25786072,  0.08630457, -0.15886704, -0.11227924,  0.16032655,
         0.1131123 ,  0.44318562,  0.07874042, -0.13343552, -0.02003732,
        -0.0302016 , -0.04502499, -0.08130956, -0.09877707,  0.08246585,
        -0.06313562, -0.10419526,  0.27303231,  0.14749417,  0.10557147,
        -0.10881039, -0.33682106,  0.00343952,  0.07885705, -0.29219259,
         0.65378208,  0.00985938, -0.13509497,  0.42059313, -0.08724092,
        -0.28474841,  0.26755415, -0.

In [0]:
# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])

from sklearn.metrics.pairwise import cosine_similarity

for i in range(len(sentences)):
    for j in range(len(sentences)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1, dimension_of_embedding), 
                                              sentence_vectors[j].reshape(1, dimension_of_embedding))[0, 0]

In [0]:
def Similarity(i, j):
    """Store the cosine similarity of sentences ``i`` and ``j`` in ``sim_mat[i][j]``.

    Reads the module-level ``sentence_vectors`` and ``dimension_of_embedding``,
    writes the result into the module-level ``sim_mat`` and returns ``None``.
    """
    row_shape = (1, dimension_of_embedding)
    left = sentence_vectors[i].reshape(row_shape)
    right = sentence_vectors[j].reshape(row_shape)
    sim_mat[i][j] = cosine_similarity(left, right)[0, 0]

In [62]:
# similarity matrix
simmat = np.zeros([len(sentences), len(sentences)])
import time
th1 = thread.OperationThread(target=Similarity)
th2 = thread.OperationThread(target=Similarity)
th3 = thread.OperationThread(target=Similarity)
th4 = thread.OperationThread(target=Similarity)
t1 = time.time()
loop = len(sentences)
i = j = 0

while i < loop:
#     print(i)
    while j < loop:
        if j < loop and i != j:
            th1.add_data(i, j)
        
        if j+1 < loop and i != j+1:
            th2.add_data(i, j+1)
        
        if j+2 < loop and i != j+2:
            th1.add_data(i, j+2)
        
        if j+3 < loop and i != j+3:
            th1.add_data(i, j+3)
        j = j + 4
    i = i + 1
    j = 0

th1.stop()
th2.stop()
th3.stop()
th4.stop()
print(time.time() - t1)

2.925926685333252


In [0]:
# Build a weighted graph from the similarity matrix (nodes are sentence
# indices) and score every sentence with PageRank.
# `from_numpy_matrix` no longer exists in NetworkX 3.x; prefer
# `from_numpy_array` and fall back only on versions that lack it.
_graph_from_array = getattr(nx, "from_numpy_array", None) or nx.from_numpy_matrix
nx_graph = _graph_from_array(sim_mat)
scores = nx.pagerank(nx_graph)
# (score, sentence, original index) tuples, highest score first.
ranked_sentences = sorted(((scores[i], s, i) for i, s in enumerate(sentences)), reverse=True)

In [66]:
# Display the PageRank score computed for each sentence index.
scores

{0: 0.0008412134516181313,
 1: 0.0008412134516181313,
 2: 0.005074545090497041,
 3: 0.005009492412448896,
 4: 0.005178187971808276,
 5: 0.0008412134516181313,
 6: 0.005660454458589055,
 7: 0.004984585202867297,
 8: 0.0008412134516181313,
 9: 0.0008412134516181313,
 10: 0.005256718215167398,
 11: 0.0060646302035133,
 12: 0.0008412134516181313,
 13: 0.005074545090497041,
 14: 0.005009492412448896,
 15: 0.005178187971808276,
 16: 0.0008412134516181313,
 17: 0.005660454458589055,
 18: 0.005901216863027769,
 19: 0.0008412134516181313,
 20: 0.0058958376844187935,
 21: 0.0008412134516181313,
 22: 0.005133852867885165,
 23: 0.005728008216576768,
 24: 0.005041037685754699,
 25: 0.005926727948939984,
 26: 0.004885886670442784,
 27: 0.0008412134516181313,
 28: 0.005123267066255282,
 29: 0.005784841108435119,
 30: 0.0008412134516181313,
 31: 0.00564180375877643,
 32: 0.005715153515005406,
 33: 0.0008412134516181313,
 34: 0.0008412134516181313,
 35: 0.005128848009826548,
 36: 0.0059189171986512,
 3

In [68]:
# Print every sentence next to its score, in ranked order.
for rank_score, rank_text, _rank_pos in ranked_sentences:
    print(rank_score," : ",rank_text)

0.006175956171307608  :  In case, the respondent No.3 is allotted a seat/post/office during the pendency of the petition, the petitioner shall suffer irreparable loss and injury, which will not be compensated in any term of money.
0.006160659385397777  :  That for the reasons mentioned above, the respondent No.3 has violated the directions/instructions circulated by the respondent No.1 openly for her ill-motive for which the respondent No.3 should be prosecuted and punished as per provisions of law, but the respondent Nos.
0.006150545084261402  :  That the respondent No.3 also opened her two publicity offices, whereas she should have to daily used one office for the said purpose, but she has not acted upon the directions.
0.006147643717320895  :  That the petitioner has filed the above titled petition stating therein the true facts, the contents of which are not repeated here for the sake of brevity and those be read as part and parcel of this application.
0.006143519615098823  :  That

In [0]:
summary = []
# Candidate budget: 10% of the ranked sentences, but never fewer than 10
# (see the max() below).
Length = int(len(ranked_sentences)/10)
# Relative cut-off: 0.1% of the best score. Guarded so that an empty ranking
# produces an empty summary instead of an IndexError.
Threshold = ranked_sentences[0][0]/1000 if ranked_sentences else 0.0

# Slicing caps the walk at the budget and at the end of the list.
for score, sentence_text, position in ranked_sentences[:max(Length, 10)]:
    # Stop at the first sentence that falls below the cut-off or has a
    # non-positive score; the list is sorted, so the rest rank no higher.
    if Threshold > score or score <= 0:
        break
    # Keep the original position first so the summary can later be sorted
    # back into document order.
    summary.append([position, sentence_text])

In [0]:
# Order the selected [position, sentence] pairs by position, i.e. restore the
# order in which the sentences appear in the document.
sorted_summary = list(summary)
sorted_summary.sort()
# Accumulator for the final summary sentences.
sorted_summary_final = list()

In [0]:
# Keep the first occurrence of each sentence, comparing case-insensitively.
# The lower-cased keys are tracked separately: the output list stores the
# original spelling, so testing `sent.lower()` against it would miss any
# duplicate that contains an uppercase letter.
seen_lowercase = {kept.lower() for kept in sorted_summary_final}
for ind, sent in sorted_summary:
    key = sent.lower()
    if key not in seen_lowercase:
        seen_lowercase.add(key)
        sorted_summary_final.append(sent)

In [0]:
# Write the summary to a Word document: a level-0 heading followed by one
# bullet paragraph per summary sentence.
d = docx.Document()
d.add_heading('Summary', 0)

for bullet_text in sorted_summary_final:
    d.add_paragraph(bullet_text, style = 'List Bullet')

# Save one copy in the working directory and one in the project folder.
for destination in ('Summary.docx', root + 'Summary.docx'):
    d.save(destination)

In [78]:
# Display the final summary sentences.
sorted_summary_final

['That Election for Municipal Councillors for the Municipal Corporation of were declared to be held on .',
 'to 12 are Performa respondents, and no relief is claimed against them.',
 'That on each plot, the structure is raised upto third floor, First Floor, Second Floor and Third Floor, which shows that there are 3324 properties/floors more have been shown in the total plots of 831, whereas the aforementioned plots/properties are not existing in the Pockets C-4, C-5 and E-4, Sector 16, Rohini, Delhi, and the same have falsely been mentioned in the list.',
 'That in furtherance malafide intention, in the aforementioned false properties numbering 3324, numerous false voters have also been shown, on whose behalf, base votes have been casted on the day of the election, i.e. 5.4.2007.',
 'It is submitted that the petitioner also submitted a letter dated 4.4.2008 to the respondent No.1 for the use of vehicles in the canvassing with which the goods are transported, but the concerned officials