# In [None]:  (Jupyter notebook cell marker left over from export)
import nltk
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import string
import os
import string
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io
import csv
import numpy as np
import math

def get_weighted_sum(processed_text):
    """Return each unique word's total co-occurrence weight in ``processed_text``.

    A window of three consecutive tokens slides over ``processed_text``. Every
    ordered pair of distinct words found in a window contributes
    ``1 / distance`` to the edge between them, where ``distance`` is the gap
    between the absolute positions of each word's first occurrence inside that
    window. A given pair of positions is counted only once, even though
    overlapping windows see it repeatedly.

    Args:
        processed_text: List of hashable tokens (e.g. strings) in document
            order.

    Returns:
        A 1-D ``float32`` NumPy array with one entry per unique word, ordered
        by the word's first appearance in ``processed_text``. Entry ``i`` is
        the sum of the weights of all edges leaving word ``i``.
    """
    # dict.fromkeys keeps first-occurrence order, so the result is aligned to
    # a reproducible vocabulary order (set iteration order varies per run).
    unique_words = list(dict.fromkeys(processed_text))
    text_length = len(unique_words)
    word_index = {word: idx for idx, word in enumerate(unique_words)}
    weighted_edge = np.zeros((text_length, text_length), dtype=np.float32)

    window_size = 3
    covered_cooccurrences = set()

    token_count = len(processed_text)
    # "+ 1" so the final window (which holds the last token) is visited too;
    # a text shorter than the window is treated as one truncated window.
    window_count = max(token_count - window_size + 1, 1) if token_count else 0

    for window_start in range(window_count):
        window = processed_text[window_start:window_start + window_size]

        # Absolute position of the first occurrence of each word in the window.
        first_positions = {}
        for offset, word in enumerate(window):
            first_positions.setdefault(word, window_start + offset)

        for word_i, index_of_i in first_positions.items():
            for word_j, index_of_j in first_positions.items():
                if word_i == word_j:
                    continue
                position_pair = (index_of_i, index_of_j)
                # Overlapping windows revisit the same positions; count once.
                if position_pair in covered_cooccurrences:
                    continue
                covered_cooccurrences.add(position_pair)
                weighted_edge[word_index[word_i], word_index[word_j]] += (
                    1.0 / abs(index_of_i - index_of_j)
                )

    # Row sums: total weight of the edges leaving each word.
    connections_sum = weighted_edge.sum(axis=1)

    return connections_sum

def get_processed_text(lematized_text,stop_words):
    """Return the words of ``lematized_text`` that are not in ``stop_words``.

    The original order is kept and repeated words are not collapsed.
    """
    return [token for token in lematized_text if token not in stop_words]

def get_stopwords(POS_lematized_tag, stopword_file='stopword_file.csv'):
    """Build the list of stop words for a POS-tagged, lemmatized text.

    The result is the union of:
      * every word whose POS tag is not a noun, adjective, gerund or
        foreign-word tag (see ``wanted_POS``),
      * every character in ``string.punctuation``,
      * the first column of every non-empty row of ``stopword_file``.

    Args:
        POS_lematized_tag: Iterable of ``(word, pos_tag)`` pairs.
        stopword_file: Path of a UTF-8 CSV file whose first column holds
            additional stop words. Defaults to ``'stopword_file.csv'`` in the
            current working directory.

    Returns:
        A list of unique stop words, in no particular order.

    Raises:
        FileNotFoundError: If ``stopword_file`` does not exist.
    """
    wanted_POS = ('NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS', 'VBG', 'FW')

    stopwords = {word[0] for word in POS_lematized_tag if word[1] not in wanted_POS}
    stopwords.update(string.punctuation)

    # newline='' is what the csv module expects for files it parses.
    with open(stopword_file, 'r', encoding='utf-8', newline='') as f:
        # Skip blank rows so an empty line in the file cannot raise IndexError.
        english_stops = [row[0] for row in csv.reader(f) if row]

    # The words read from the file were previously loaded and then discarded.
    stopwords.update(english_stops)

    return list(stopwords)

def get_lematized_text(tags):
    """Lemmatize every word of a POS-tagged token sequence.

    ``tags`` is an iterable of ``(word, pos_tag)`` pairs. Words tagged as
    adjectives (``JJ``, ``JJR``, ``JJS``) are lemmatized with ``pos="a"``;
    all other words use the lemmatizer's default part of speech. Returns the
    lemmas as a list of ``str`` in input order.
    """
    lemmatizer = WordNetLemmatizer()
    adjective_pos = ('JJ', 'JJR', 'JJS')

    def lemma_of(tagged):
        # Only adjectives get an explicit POS; everything else uses the default.
        if tagged[1] in adjective_pos:
            return str(lemmatizer.lemmatize(tagged[0], pos="a"))
        return str(lemmatizer.lemmatize(tagged[0]))

    return [lemma_of(tagged) for tagged in tags]

def get_text_from_file(filename):
    """Extract the text of every page of the PDF file at ``filename``.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The extracted text of all pages as a single string; an empty string
        when the document has no pages.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
    """
    # Open the file first so a missing path fails before any converter is built.
    with open(filename, 'rb') as fp:
        rsrcmgr = PDFResourceManager()
        retstr = io.StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            # Create a PDF interpreter object.
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Process each page contained in the document.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
            # Read the buffer once, after the loop, so a PDF with zero pages
            # yields "" instead of an unbound variable.
            return retstr.getvalue()
        finally:
            # Release the converter and buffer even if a page fails to parse.
            device.close()
            retstr.close()

def get_text(directory):
    """Return the concatenated text of every file in ``directory``.

    Each directory entry is passed to ``get_text_from_file``; the results are
    joined with no separator. Entries are processed in sorted name order so
    the output is reproducible (``os.listdir`` order is arbitrary).

    Args:
        directory: Path of the directory holding the documents.

    Returns:
        One string with the text of all documents, or ``""`` when the
        directory is empty.
    """
    # Build paths from the argument instead of a hard-coded folder name, so
    # the function reads the directory it was actually asked to read.
    return "".join(
        get_text_from_file(os.path.join(directory, filename))
        for filename in sorted(os.listdir(directory))
    )


# Script entry point: run the keyword pipeline over the "KeywordDocs" folder.
# Steps: extract text -> tokenize -> POS-tag -> lemmatize -> re-tag ->
# derive stop words -> filter -> per-word co-occurrence weights.
if __name__ == '__main__':
    text = get_text("KeywordDocs")  # text of all documents, concatenated into one string
    tokens = nltk.word_tokenize(text) # split the string into word tokens
    POS_tag = nltk.pos_tag(tokens) # pair each token with its POS tag
    
    lematized_text = get_lematized_text(POS_tag) # lemmatize each token (adjectives with pos="a")
    POS_lematized_tag = nltk.pos_tag(lematized_text) # POS-tag the lemmatized tokens
    
    # Stop words: tokens with unwanted POS tags, plus punctuation characters.
    stop_words = get_stopwords(POS_lematized_tag)

    # Lemmatized tokens with stop words removed; order and repeats are kept.
    processed_text = get_processed_text(lematized_text,stop_words)
    
    # Per-unique-word sum of co-occurrence edge weights.
    # NOTE(review): the result is not printed or stored anywhere in this
    # block — confirm whether later code consumes it.
    weighted_sum = get_weighted_sum(processed_text)