# In [1]:
import nltk
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import string
import os
import string
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io
import csv
import numpy as np
import math
from textblob import TextBlob as tb

def get_processed_text(lematized_text, stop_words):
    """Return the words of ``lematized_text`` that are not stop words.

    Args:
        lematized_text: Iterable of word strings, in document order.
        stop_words: Iterable of (hashable) word strings to drop.

    Returns:
        A new list holding the surviving words in their original order.
        Repeated words are kept as many times as they occur.
    """
    # Build the lookup set once so each membership test is O(1) instead of
    # scanning the whole stop-word list for every word.
    excluded = set(stop_words)
    return [word for word in lematized_text if word not in excluded]

def get_stopwords(POS_lematized_tag, stopword_path='stopword_file.csv'):
    """Build the list of words that should be removed from the text.

    The result is the union of three sources:

    * every word whose POS tag is not a noun, adjective, gerund or
      foreign-word tag (see ``wanted_POS`` below),
    * every character of ``string.punctuation``,
    * the first column of every non-empty row of the stop-word CSV file.

    Args:
        POS_lematized_tag: Iterable of ``(word, tag)`` pairs.
        stopword_path: Path of the UTF-8 CSV file holding one stop word per
            row (first column). Defaults to ``'stopword_file.csv'``.

    Returns:
        A list of unique stop words, in no particular order.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    wanted_POS = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS', 'VBG', 'FW'}

    stopwords = {
        entry[0] for entry in POS_lematized_tag if entry[1] not in wanted_POS
    }
    stopwords.update(string.punctuation)

    # newline='' is what the csv module expects for files it reads.
    with open(stopword_path, 'r', encoding='utf-8', newline='') as f:
        # Blank lines come back from csv.reader as empty rows; skip them
        # instead of failing on row[0].
        stopwords.update(row[0] for row in csv.reader(f) if row)

    return list(stopwords)

def get_lematized_text(tags):
    """Lemmatize every token of a POS-tagged sequence with WordNet.

    Args:
        tags: Iterable of ``(word, tag)`` pairs.

    Returns:
        A list of lemma strings, one per input pair and in the same order.
        Words tagged JJ/JJR/JJS are lemmatized as adjectives; every other
        word goes through the lemmatizer's default (noun) handling.
    """
    adjective_tags = ('JJ', 'JJR', 'JJS')
    lemmatizer = WordNetLemmatizer()

    def lemma_of(entry):
        token, tag = entry[0], entry[1]
        if tag in adjective_tags:
            return str(lemmatizer.lemmatize(token, pos="a"))
        # No pos argument: the lemmatizer falls back to its noun default.
        return str(lemmatizer.lemmatize(token))

    return [lemma_of(entry) for entry in tags]

def get_text_from_file(filename):
    """Extract the text of a PDF file with pdfminer.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The text of all pages as one string; an empty string when the
        document yields no pages.

    Raises:
        OSError: If the file cannot be opened.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The context manager closes the file even if a page fails to parse.
        with open(filename, 'rb') as fp:
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        # Read the buffer once after the loop: this also covers a document
        # with zero pages and avoids re-copying the text on every page.
        return retstr.getvalue()
    finally:
        device.close()

def get_text(directory):
    """Concatenate the extracted text of every file in ``directory``.

    Each directory entry is passed to ``get_text_from_file``, so the
    directory is expected to contain only PDF files.

    Args:
        directory: Path of the directory holding the documents.

    Returns:
        The text of all documents joined into one string, or an empty
        string when the directory is empty.

    Raises:
        OSError: If the directory cannot be listed.
    """
    # os.listdir gives no ordering guarantee; sort so repeated runs read the
    # documents in the same order.
    filenames = sorted(os.listdir(directory))
    parts = [
        get_text_from_file(os.path.join(directory, filename))
        for filename in filenames
    ]
    return "".join(parts)

def tf(word, blob):
    """Return the term frequency of ``word`` within ``blob``.

    Args:
        word: The term to count.
        blob: An object exposing a ``words`` sequence (e.g. a TextBlob).

    Returns:
        Occurrences of ``word`` in ``blob.words`` divided by the number of
        words, as a float. ``0.0`` when ``blob`` has no words.
    """
    words = blob.words
    total = len(words)
    if total == 0:
        # An empty document contains no terms; avoid dividing by zero.
        return 0.0
    return float(words.count(word)) / total

def n_containing(word, bloblist):
    """Count the entries of ``bloblist`` for which ``word in entry`` holds.

    Args:
        word: The term to look for.
        bloblist: Iterable of containers supporting the ``in`` operator.

    Returns:
        The number of matching entries as an int.
    """
    hits = 0
    for blob in bloblist:
        if word in blob:
            hits += 1
    return hits

def idf(word, bloblist):
    """Return the inverse document frequency of ``word`` over ``bloblist``.

    Computed as ``log(N / (1 + n))`` where ``N`` is ``len(bloblist)`` and
    ``n`` is the number of entries containing ``word``.
    """
    document_count = len(bloblist)
    # The +1 keeps the denominator non-zero for words found in no entry.
    smoothed_hits = float(1 + n_containing(word, bloblist))
    return math.log(document_count / smoothed_hits)

def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of ``word``.

    The score is the term frequency of ``word`` in ``blob`` multiplied by
    its inverse document frequency over ``bloblist``.
    """
    term_frequency = tf(word, blob)
    inverse_document_frequency = idf(word, bloblist)
    return term_frequency * inverse_document_frequency


def create_tfIdfList(document):
    """Score the words of ``document`` with TF-IDF and write them to a CSV.

    ``document`` is cut into consecutive chunks of ``len(document) // 4000``
    items (at least one item per chunk). Each chunk plays the role of one
    document of the corpus: its words are scored against all chunks, and
    every word scoring at least 0.1 is written to ``Data/tf-idf.csv`` as a
    ``word,score`` row, highest score first within each chunk.

    The file gets a header row when it is created; when it already exists
    the new rows are appended.

    Args:
        document: Either a string or a sequence of word strings. Nothing is
            written for an empty ``document``.

    Raises:
        OSError: If the output directory or file cannot be written.
    """
    total_length = len(document)
    if total_length == 0:
        return

    # For inputs shorter than 4000 items the quotient is 0, which range()
    # rejects as a step; fall back to one item per chunk.
    split_length = max(1, total_length // 4000)
    bloblist = [
        document[start:start + split_length]
        for start in range(0, total_length, split_length)
    ]

    out_path = 'Data/tf-idf.csv'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    is_new_file = not os.path.isfile(out_path)

    # Open the output once rather than once per chunk.
    with open(out_path, 'w' if is_new_file else 'a', encoding='utf-8') as f:
        if is_new_file:
            f.write("Word, Score\n")
        for chunk in bloblist:
            # TextBlob only accepts a string, so chunks sliced from a word
            # list are joined with spaces first.
            if isinstance(chunk, str):
                chunk_text = chunk
            else:
                chunk_text = " ".join(str(token) for token in chunk)
            blob = tb(chunk_text)
            scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
            sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            for word, score in sorted_words:
                if score >= 0.1:
                    f.write("{},{}\n".format(word, round(score, 5)))
                    
if __name__ == '__main__':
    # Stage 1: extract the raw text from the KeywordDocs directory.
    text = get_text("KeywordDocs")

    # Stage 2: split the text into tokens and attach a POS tag to each one.
    tokens = word_tokenize(text)
    POS_tag = pos_tag(tokens)

    # Stage 3: reduce each token to its lemma, then tag the lemmas again so
    # the stop-word selection works on the lemmatized forms.
    lematized_text = get_lematized_text(POS_tag)
    POS_lematized_tag = pos_tag(lematized_text)

    # Stage 4: collect the stop words and drop them from the lemmatized text.
    stop_words = get_stopwords(POS_lematized_tag)
    processed_text = get_processed_text(lematized_text, stop_words)

    # Stage 5: de-duplicate the remaining words and write their TF-IDF scores.
    unique_text = list(set(processed_text))
    create_tfIdfList(unique_text)

# Notebook output: NameError: name 'WordNetLemmatizer' is not defined