In [17]:
import re
import os
import pdfplumber
import pandas as pd

import nltk
from nltk import tokenize
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
  
lemmatizer = WordNetLemmatizer()
pd.options.display.max_colwidth = 1000

In [18]:
def clean_text(word):
    # split text phrases into words
    words  = nltk.word_tokenize(word)
    
    # Remove all the special characters
    punctuations = re.sub(r'\W', ' ', str(word))
    
    custom_puntuations = ['.', ',', '/', '!', '?', ';', ':', '(',')', '[',']', '-', '_', '%', 'et', 'al', "et al", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',  'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    # Initialize the stopwords variable, which is a list of words ('and', 'the', 'i', 'yourself', 'is') that do not hold much values as key words
    stop_words  = stopwords.words('english')
    stop_words.extend(custom_puntuations)
    # Getting rid of all the words that contain numbers in them
    w_num = re.sub('\w*\d\w*', '', word).strip()

    
    # Return keywords which are not in stop words 
    #keyword = word if not word in stop_words and word in w_num and word in punctuations
    
    if not word in stop_words and word in w_num and word in punctuations:
        return word
    
def POS(sentence):
    tokens = word_tokenize(sentence)
    parts_of_speech = nltk.pos_tag(tokens)
    chunking = r"""
        NP: {<JJ>*<NN.*>}          # Noun phrase
            {<NN.*><IN><DT>?<JJ>*<NN.*>}  # Noun phrase with preposition
            {<DT><JJ>*<NN.*>}          # Determiner + Noun phrase
            {<VBG><IN><DT>?<JJ>*<NN.*>}  # Gerund phrase
            {<TO><VB><DT>?<JJ>*<NN.*>}   # Infinitive phrase
            {<VB.*><IN>?<DT>?<JJ>*<NN.*>}
            {<IN><DT>?<JJ>*<NN.*>}
    """
    expression = nltk.RegexpParser(chunking)
    chunked = expression.parse(parts_of_speech)
    
    others = []
    for subtree in chunked.subtrees(filter=lambda t: t.label() == 'NP'):
        word =" ".join([a for (a,b) in subtree.leaves()])
        if(not any(char.isdigit() for char in word)):
            others.append(word)
    
    return others



def lines2sentences(lines):
    cleaned_lines = []
    sentences = []
    
    for i in lines:
        cleaned_lines.append(i)
    join_lines = ''.join(cleaned_lines)
    sentences = tokenize.sent_tokenize(join_lines)
    return sentences




def unique_count(dfs, column):
    df = dfs[dfs[column].apply(lambda x: isinstance(x, list) and len(x) > 0)]

    lists = df[column].explode().to_list()
    list_count = [[x,lists.count(x), len(x.split())] for x in set(lists)]
    df_words = pd.DataFrame(columns = ['word', 'occurance', 'word_count'])
    for i in range(len(list_count)):
        df_words.loc[i, "word"] = list_count[i][0]
        df_words.loc[i, "occurance"] = list_count[i][1]
        df_words.loc[i, "word_count"] = list_count[i][2]
    df_words = df_words.sort_values(by=['word_count'], ascending=False)
    return df_words


In [19]:
directories = ['Project proposals', 'Policies']
paths = []
for directory in directories:
    files = os.listdir(directory)
    for file in files:
        path = os.path.join(directory, file)
        paths.append(path)

for file in paths:
    baseName = os.path.basename(file)
    fileName = os.path.splitext(baseName)[0]
    extention = os.path.splitext(baseName)[1]
    if extention == '.pdf':
        print(file, " - start working: ")
        pdf = pdfplumber.open(file)
        
        pages_content = []
        lines = []
        for page in pdf.pages:
            pages_content += page.extract_text().split("\n")


        for i in pages_content:
            if i[-1] == "-":
                i = i[:-1]
            lines.append(i)
        df = pd.DataFrame(columns = ['sentence', 'others'])
        print(file, " - total lines: ", len(lines))
        
        sentences = lines2sentences(lines)
        print(file, " - total sentences: ", len(sentences))
        for i in range(len(sentences)):
            sentence = sentences[i]
            pos = POS(sentence)
            df.loc[i, "sentence"] = sentence
            df.loc[i, "others"] = pos
        
        others = unique_count(df,'others')
        
        with pd.ExcelWriter('expressions.xlsx', engine ='openpyxl', mode = 'a', if_sheet_exists = "replace") as writer:
            others.to_excel(writer, sheet_name = fileName,index=False)
            writer.save()
        print("complete others sheet - ", file)

Project proposals/LIRLAP.pdf  - start working: 
Project proposals/LIRLAP.pdf  - total lines:  911
Project proposals/LIRLAP.pdf  - total sentences:  451
complete others sheet -  Project proposals/LIRLAP.pdf
Project proposals/MYrisk.pdf  - start working: 
Project proposals/MYrisk.pdf  - total lines:  689
Project proposals/MYrisk.pdf  - total sentences:  280
complete others sheet -  Project proposals/MYrisk.pdf
Project proposals/emplement!.pdf  - start working: 
Project proposals/emplement!.pdf  - total lines:  975
Project proposals/emplement!.pdf  - total sentences:  486
complete others sheet -  Project proposals/emplement!.pdf
Project proposals/GreenCityLabHue.pdf  - start working: 
Project proposals/GreenCityLabHue.pdf  - total lines:  903
Project proposals/GreenCityLabHue.pdf  - total sentences:  428
complete others sheet -  Project proposals/GreenCityLabHue.pdf
Project proposals/URA.pdf  - start working: 
Project proposals/URA.pdf  - total lines:  1035
Project proposals/URA.pdf  - to

In [26]:
#New Issues
# - extend puntuations from previous list
# "–", ".∙", ".", "‘", "→", "∙", "-", "i.e", "%", '“', "‘", "[", "]", "{", "}", "•"
# "©","<", ">", "€", 