In [4]:
import pandas as pd
import numpy as np
import spacy
import en_core_web_sm
import nltk
import re
from stop_words import get_stop_words

In [5]:
def getText(filename):
    """Extract the plain text of a PDF or Word document.

    Files whose name ends in ``pdf`` (compared case-insensitively) are parsed
    with Tika; every other file is opened with python-docx and its paragraphs
    are joined with newlines.

    Parameters
    ----------
    filename : str
        Path of the document to read.

    Returns
    -------
    str
        The extracted text. An empty string is returned when the PDF parser
        reports no content.
    """
    # Compare case-insensitively so "BRIEF.PDF" is not handed to python-docx.
    if filename.lower().endswith("pdf"):
        from tika import parser
        raw = parser.from_file(filename)
        # The parser may report no content (None); hand back '' so callers
        # can keep treating the result as a string.
        return raw.get('content') or ''

    import docx
    doc = docx.Document(filename)
    return '\n'.join(para.text for para in doc.paragraphs)

In [6]:
# Read the petition and turn it into a list of cleaned-up sentences.
raw_text = getText("n_thakur_petition.docx")

nlp = spacy.load("en_core_web_sm")

# Keep only the non-empty lines, with tabs flattened to single spaces.
paras = [line.replace('\t', ' ') for line in raw_text.split('\n') if line != '']

# spaCy takes one string, so glue the paragraphs back together.
inp_to_spacy = " ".join(paras)

# First spaCy pass: used here only for its sentence segmentation.
doc = nlp(inp_to_spacy)

# Strip the ellipsis character first, then any run of two or more dots.
sentences = [
    re.sub("[.][.]+", "", re.sub("…", "", str(span)))
    for span in doc.sents
]

# Second spaCy pass, this time over the cleaned text.
input_to_spacy = " ".join(sentences)
doc = nlp(input_to_spacy)

# Stage 1: drop every token that spaCy flags as a stop word.
tokens = [token for token in doc if not token.is_stop]

# Stage 2: apply the `stop_words` package list on top of stage 1.
# The filter runs over `tokens` (not `doc`) so the spaCy pass above is kept.
# Punctuation and whitespace tokens such as , . - are removed explicitly,
# because a stop-word lookup alone does not catch them.
stop_words = get_stop_words('en')
tokens = [
    token for token in tokens
    if str(token) not in stop_words
    and not token.is_punct
    and not token.is_space
]

# Result from the above cells:
# 1. `sentences`
# 2. `tokens`

# Fetch NLTK's 'punkt' resource.
# NOTE(review): no cell visible here calls an NLTK tokenizer — confirm this
# download is still needed before keeping it in the notebook.
nltk.download('punkt') # one time execution

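# ## The GloVe vectors must be downloaded before running the next lines
# Download http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip, unzip it, and
# place glove.6B.100d.txt in the working directory.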

# Load the pre-trained GloVe vectors into a {word: vector} lookup.
# Each line of the file holds a word followed by its coefficients.
word_embeddings = {}

# `with` closes the file even if a malformed line raises part-way through.
with open('glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

In [7]:
# len(word_embeddings.get("by"))

# Preview the English stop-word list used during token filtering.
# An empty language code is not a language the `stop_words` package provides,
# so the lookup has to name 'en' explicitly; only a short slice is shown to
# keep the cell output readable.
get_stop_words('en')[:10]

In [8]:


# Create one fixed-length vector per sentence by averaging its word embeddings.
EMBEDDING_DIM = 100  # width of the vectors in glove.6B.100d.txt


def _sentence_vector(sentence, embeddings, dim=EMBEDDING_DIM):
    """Return the (smoothed) mean embedding of the words in `sentence`.

    Words are lower-cased before lookup because the glove.6B vocabulary is
    uncased; words missing from `embeddings` contribute a zero vector. A
    sentence with no words (empty or whitespace only) yields a zero vector of
    length `dim` rather than a bare scalar, so every result can be reshaped
    to (1, dim) later on.
    """
    words = sentence.lower().split()
    if not words:
        return np.zeros((dim,))
    missing = np.zeros((dim,))
    total = sum(embeddings.get(word, missing) for word in words)
    # The +0.001 keeps the original smoothing of the average.
    return total / (len(words) + 0.001)


sentence_vectors = [_sentence_vector(text, word_embeddings) for text in sentences]



In [9]:
# Sanity check: one vector is appended per entry in `sentences`, so this
# should equal len(sentences).
len(sentence_vectors)

In [10]:


from sklearn.metrics.pairwise import cosine_similarity

# Similarity matrix: sim_mat[i][j] is the cosine similarity between the
# vectors of sentence i and sentence j.
sim_mat = np.zeros([len(sentences), len(sentences)])

if len(sentences) > 0:
    # One vectorised call over the stacked vectors replaces a Python double
    # loop that invoked cosine_similarity once per sentence pair.
    sim_mat[:] = cosine_similarity(np.vstack(sentence_vectors))
    # Keep the diagonal at zero: a sentence gets no similarity with itself.
    np.fill_diagonal(sim_mat, 0.0)



In [11]:
import networkx as nx

# Treat the similarity matrix as a weighted graph (one node per sentence) and
# rank the nodes with PageRank.
# `from_numpy_matrix` was removed in NetworkX 3.0; `from_numpy_array` is its
# replacement and takes the same 2-D ndarray.
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)


In [12]:
# Display the PageRank result; it is indexed by sentence position when the
# ranking is built.
scores

In [13]:

# Pair every sentence with its PageRank score, then order the pairs from the
# highest score down (equal scores fall back to comparing the sentence text).
ranked_sentences = [(scores[idx], text) for idx, text in enumerate(sentences)]
ranked_sentences.sort(reverse=True)
summary = []

In [14]:
# Display the (score, sentence) pairs, highest score first.
ranked_sentences

In [15]:
# i=1
# for i in range(1, len(ranked_sentences)):
#     if ranked_sentences[i][0]==ranked_sentences[i-1][0]:
#         ranked_sentences.pop(i)
# print(ranked_sentences)

In [None]:
summary = []
# Print every sentence, best-ranked first.
for _score, sentence in ranked_sentences:
    print(sentence)

In [16]:
# Pick the top-ranked sentences for the summary.
summary = []

# Budget: a tenth of the ranked sentences, but never fewer than five.
max_sentences = max(len(ranked_sentences) // 10, 5)

# Iterating over a slice copes with an empty ranking (nothing is selected),
# where indexing element 0 up front would raise IndexError.
for score, sentence in ranked_sentences[:max_sentences]:
    # Stop at the first sentence whose score is below 1/1000 of the best
    # score, or is not positive; the list is sorted, so nothing later in it
    # can score higher.
    if ranked_sentences[0][0] / 1000 > score or score <= 0:
        break
    summary.append(sentence)
    

# Re-emit the selected sentences in document order, keeping only the first
# occurrence of each one (compared case-insensitively).
selected = set(summary)  # set membership instead of scanning the list each time
ordered_summary = []
ordered_set = set()  # lower-cased sentences already emitted
for sentence in sentences:
    key = sentence.lower()
    if sentence in selected and key not in ordered_set:
        ordered_summary.append(sentence)
        ordered_set.add(key)
import docx

# Write the summary out as a Word document: a 'Summary' heading followed by
# one bullet per selected sentence.
d = docx.Document()
d.add_heading('Summary', 0)
for sentence in ordered_summary:
    d.add_paragraph(sentence, style='List Bullet')
d.save('_summarized_2.docx')

In [None]:
# Display the final summary: the selected sentences in document order.
ordered_summary