# In [16]:
from bs4 import BeautifulSoup
import urllib.request as urllib2
import re
import csv
import pandas as pd
import io
import os
import numpy as np
import matplotlib.pyplot as plt
import nltk
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.pdfpage import PDFPage
import string
import math
from textblob import TextBlob as tb

# Stopwords consulted by clean_tokens(); starts out empty.
english_stops = []


def clean_tokens(tokens):
    """Normalise raw tokens for keyword analysis.

    Each token is lowercased and kept only if it is longer than two
    characters, is not made up purely of punctuation, and its lowercase
    form is not listed in the module-level ``english_stops``.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        List of the surviving tokens, lowercased, in input order.
    """
    # Snapshot the stopwords into a set once per call so each lookup is O(1).
    stops = set(english_stops)
    punctuation = set(string.punctuation)
    cleaned = []
    for token in tokens:
        if len(token) <= 2:
            continue
        # Compare character by character: a substring test against
        # string.punctuation would let tokens such as "..." through.
        if all(ch in punctuation for ch in token):
            continue
        lowered = token.lower()
        if lowered not in stops:
            cleaned.append(lowered)
    return cleaned

def get_stopwords():
    """Load stopwords from ``stopword_file.csv`` into ``english_stops``.

    The first column of every non-empty row is taken as one stopword.
    The module-level ``english_stops`` is rebound to the loaded list so
    that code reading that global (such as ``clean_tokens``) sees the words.

    Returns:
        The list of stopwords that was loaded.
    """
    # Without this declaration the assignment below would only create a
    # local variable and the module-level list would stay untouched.
    global english_stops
    with open('stopword_file.csv', 'r', encoding='utf-8', newline='') as f:
        # Blank lines come back from csv.reader as empty rows; skip them
        # instead of failing on row[0].
        english_stops = [row[0] for row in csv.reader(f) if row]
    return english_stops
    
def get_cleanTokens(directory):
    """Extract and clean the tokens of every file in ``directory``.

    Args:
        directory: Path of the folder holding the documents to read.

    Returns:
        One flat list with the cleaned tokens of all files, visited in
        sorted filename order.
    """
    cleanTokens = []
    # Sorted so the token order does not depend on the OS listing order.
    for filename in sorted(os.listdir(directory)):
        # Build the path from the argument rather than a fixed folder name.
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            continue  # sub-directories cannot be parsed as documents
        text = get_text(path)
        tokens = nltk.word_tokenize(text)
        cleanTokens.extend(clean_tokens(tokens))
    return cleanTokens

def get_text(filename):
    """Return the text that pdfminer extracts from the PDF at ``filename``.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The text of all pages as one string; empty when the document
        yields no pages.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # The context manager closes the file even if parsing fails.
        with open(filename, 'rb') as fp:
            # Process each page contained in the document.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        # Read the buffer once after the loop so a PDF without pages
        # returns '' instead of hitting an unassigned variable.
        return retstr.getvalue()
    finally:
        device.close()

def tf(word, blob):
    """Return the term frequency of ``word`` within ``blob``.

    Args:
        word: Term to count.
        blob: Object whose ``words`` attribute supports ``count`` and ``len``.

    Returns:
        Occurrences of ``word`` divided by the number of words, as a float;
        0.0 when the blob has no words.
    """
    words = blob.words
    total = len(words)
    if total == 0:
        # No words means no occurrences; avoid dividing by zero.
        return 0.0
    return float(words.count(word)) / float(total)

def n_containing(word, bloblist):
    """Count the entries of ``bloblist`` for which ``word in entry`` holds.

    Args:
        word: Value looked up with the ``in`` operator.
        bloblist: Iterable of containers to test.

    Returns:
        Number of entries that contain ``word``.
    """
    hits = 0
    for entry in bloblist:
        if word in entry:
            hits += 1
    return hits

def idf(word, bloblist):
    """Return the inverse document frequency of ``word`` over ``bloblist``.

    Computed as ``log(N / (1 + n))`` where ``N`` is the number of entries in
    ``bloblist`` and ``n`` is the value reported by ``n_containing``.
    """
    total_docs = len(bloblist)
    matching_docs = n_containing(word, bloblist)
    return math.log(total_docs / float(1 + matching_docs))

def tfidf(word, blob, bloblist):
    """Return the tf-idf score of ``word``.

    The score is the product of ``tf(word, blob)`` and
    ``idf(word, bloblist)``.
    """
    term_frequency = tf(word, blob)
    inverse_doc_frequency = idf(word, bloblist)
    return term_frequency * inverse_doc_frequency


def create_tfIdfList(document, num_chunks=4000, output_path='Data/tf-idf.csv'):
    """Score the words of ``document`` with tf-idf and append them to a CSV.

    The document is cut into consecutive character slices that act as the
    corpus; every slice is scored against all slices and its words are
    written, highest score first, as ``word,score`` lines.

    Args:
        document: Text to analyse.
        num_chunks: Target number of slices; the slice length is
            ``len(document) // num_chunks`` but never less than one character.
        output_path: CSV file to append to.  A header row is written first
            when the file does not exist yet.

    Returns:
        None.  An empty document writes nothing.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1.
    """
    if num_chunks < 1:
        raise ValueError("num_chunks must be at least 1")
    if not document:
        return

    # A document shorter than num_chunks would give a slice length of 0,
    # which range() rejects as a step; fall back to one character.
    splitLength = max(1, len(document) // num_chunks)
    bloblist = [document[i:i + splitLength]
                for i in range(0, len(document), splitLength)]

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Decide on the header before opening: append mode creates the file.
    write_header = not os.path.isfile(output_path)
    with open(output_path, 'a', encoding='utf-8') as f:
        if write_header:
            f.write("Word, Score\n")
        for chunk in bloblist:
            blob = tb(chunk)
            # dict.fromkeys keeps first-occurrence order while scoring each
            # distinct word once; repeats would give the same value anyway.
            scores = {word: tfidf(word, blob, bloblist)
                      for word in dict.fromkeys(blob.words)}
            sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            for word, score in sorted_words:
                f.write("{},{}\n".format(word, round(score, 5)))
                    
if __name__ == '__main__':
    # Load the stopword file before any token cleaning happens.
    get_stopwords()
    # Named `tokens` so the clean_tokens() function is not overwritten
    # by the list of results.
    tokens = get_cleanTokens("KeywordDocs")
    clean_doc = ' '.join(tokens)
    create_tfIdfList(clean_doc)