In [145]:
import re
import os
import pdfplumber
import pandas as pd
import string

import nltk
from nltk import tokenize
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
  
lemmatizer = WordNetLemmatizer()
pd.options.display.max_colwidth = 1000

In [146]:
def clean_text(word):
    # split text phrases into words
    tokens  = nltk.word_tokenize(word.lower())
    
    
    
    custom_puntuations = ['.', ',', '/', '!', '?', ';', ':', '(',')', '[',']', '-', '_', '%', 'et', 'al', "et al", 
                          'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',  'l', 'm', 'n', 'o', 'p', 
                          'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "’", "", "’", '”', "”","‐", "WP", "wp", "Wp",
                          "|","•", ".nr", "„", "”", "", "ws", "e.g", "eg", "e g", "▪", "…", "§", "à", "±", "’", "°c",
                          "ns", "hoc", "aalml", "aatse", 
                         "–", ".∙", ".", "‘", "→", "∙", "-", "i.e", "%", '“', "‘", "[", "]", "{", "}", "•","©","<", ">", "€"]
    # Initialize the stopwords variable, which is a list of words ('and', 'the', 'i', 'yourself', 'is') that do not hold much values as key words
    stop_words  = stopwords.words('english')
    stop_words.extend(custom_puntuations)
    # Getting rid of all the words that contain numbers in them
    w_num = re.sub('\w*\d\w*', '', word).strip()

    
    # Return keywords which are not in stop words 
    #keyword = word if not word in stop_words and word in w_num and word in punctuations
    
    #new_words = all(not word in stop_words and word in w_num and word in punctuations for word in words)
    
    clean_tokens = []
    for token in tokens:
        token = re.sub(r'\W', ' ', str(token))
        token = re.sub('\w*\d\w*', '', token).strip()
        # Remove leading/trailing punctuation and convert to lowercase
        #token = token.strip(string.punctuation).lower()
        # Check if token is not a stop word, contains at least one alphabetic character, and is not a single character
        if not token in stop_words:
            # Remove all the special characters
            if len(token) > 0:
                clean_tokens.append(token)
    words = " ".join(clean_tokens)
    return words
    
def POS(sentence):
    tokens = word_tokenize(sentence)
    parts_of_speech = nltk.pos_tag(tokens)
    chunking = r"""
      Target: {<NN><NN><NN>}
      Target: {<NN><NN>}  
      Target: {<JJ>*<NN>}
      Target: {<JJ><JJ><JJ>}
      Target: {<JJ><JJ>}
      Target: {<JJ>}
      Verb: {<VB.*>}
    """
    expression = nltk.RegexpParser(chunking)
    chunked = expression.parse(parts_of_speech)
    
    verbs = []
    others = []
    for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Target'):
        word =" ".join([a for (a,b) in subtree.leaves()])
        cleaned = clean_text(word)
        if(not any(char.isdigit() for char in word) ):
            if len(cleaned) > 0:
                others.append(cleaned)

    for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Verb'):
        word =" ".join([a for (a,b) in subtree.leaves()])
        cleaned = clean_text(word)
        if(not any(char.isdigit() for char in word)):
            if cleaned is not None:
                verbs.append(lemmatizer.lemmatize(cleaned, wordnet.VERB))
    
    return verbs, others



def lines2sentences(lines):
    cleaned_lines = []
    sentences = []
    
    for i in lines:
        cleaned_lines.append(i)
    join_lines = ''.join(cleaned_lines)
    sentences = tokenize.sent_tokenize(join_lines)
    return sentences




def unique_count(dfs, column):
    df = dfs[dfs[column].apply(lambda x: isinstance(x, list) and len(x) > 0)]

    lists = df[column].explode().to_list()
    list_count = [[x,lists.count(x), len(x.split())] for x in set(lists)]
    df_words = pd.DataFrame(columns = ['word', 'occurance', 'word_count'])
    for i in range(len(list_count)):
        df_words.loc[i, "word"] = list_count[i][0]
        df_words.loc[i, "occurance"] = list_count[i][1]
        df_words.loc[i, "word_count"] = list_count[i][2]
    df_words = df_words.sort_values(by=['word_count'], ascending=False)
    return df_words


In [148]:
directories = ['Project proposals', 'Policies']
paths = []
for directory in directories:
    files = os.listdir(directory)
    for file in files:
        path = os.path.join(directory, file)
        paths.append(path)

for file in paths:
    baseName = os.path.basename(file)
    fileName = os.path.splitext(baseName)[0]
    extention = os.path.splitext(baseName)[1]
    if extention == '.pdf':
        print(file, " - start working: ")
        pdf = pdfplumber.open(file)
        
        pages_content = []
        lines = []
        for page in pdf.pages:
            pages_content += page.extract_text().split("\n")


        for i in pages_content:
            if i[-1] == "-":
                i = i[:-1]
            lines.append(i)
        df = pd.DataFrame(columns = ['sentence', 'others', 'verbs'])
        print(file, " - total lines: ", len(lines))
        
        sentences = lines2sentences(lines)
        print(file, " - total sentences: ", len(sentences))
        for i in range(len(sentences)):
            sentence = sentences[i]
            pos = POS(sentence)
            df.loc[i, "sentence"] = sentence
            df.loc[i, "verbs"] = pos[0]
            df.loc[i, "others"] = pos[1]
        
        verbs = unique_count(df,'verbs')
        others = unique_count(df,'others')
        
        with pd.ExcelWriter('verbs.xlsx', engine ='openpyxl', mode = 'a', if_sheet_exists = "replace") as writer:
            verbs.to_excel(writer, sheet_name = fileName,index=False)
            writer.save()
        print("complete verb sheet - ", file)
        
        with pd.ExcelWriter('others.xlsx', engine ='openpyxl', mode = 'a', if_sheet_exists = "replace") as writer:
            others.to_excel(writer, sheet_name = fileName,index=False)
            writer.save()
        print("complete others sheet - ", file)


Project proposals/LIRLAP.pdf  - start working: 
Project proposals/LIRLAP.pdf  - total lines:  911
Project proposals/LIRLAP.pdf  - total sentences:  451
complete verb sheet -  Project proposals/LIRLAP.pdf
complete others sheet -  Project proposals/LIRLAP.pdf
Project proposals/MYrisk.pdf  - start working: 
Project proposals/MYrisk.pdf  - total lines:  689
Project proposals/MYrisk.pdf  - total sentences:  280
complete verb sheet -  Project proposals/MYrisk.pdf
complete others sheet -  Project proposals/MYrisk.pdf
Project proposals/emplement!.pdf  - start working: 
Project proposals/emplement!.pdf  - total lines:  975
Project proposals/emplement!.pdf  - total sentences:  486
complete verb sheet -  Project proposals/emplement!.pdf
complete others sheet -  Project proposals/emplement!.pdf
Project proposals/GreenCityLabHue.pdf  - start working: 
Project proposals/GreenCityLabHue.pdf  - total lines:  903
Project proposals/GreenCityLabHue.pdf  - total sentences:  428
complete verb sheet -  Proj

In [26]:
#New Issues
# - extend puntuations from previous list
# "–", ".∙", ".", "‘", "→", "∙", "-", "i.e", "%", '“', "‘", "[", "]", "{", "}", "•","©","<", ">", "€", 

In [54]:
dffull.shape


(39180, 3)

In [83]:
df_unique = dffull.groupby('word').agg({'occurance': 'sum', 'word_count': 'first'})
df_unique = df_unique.sort_values(by=['occurance'], ascending=False)
df_unique = df_unique.reset_index()
with pd.ExcelWriter('full.xlsx', engine ='openpyxl', mode = 'a', if_sheet_exists = "replace") as writer:
    df_unique.to_excel(writer,index=False)
    writer.save()

In [84]:
df_unique

Unnamed: 0,word,occurance,word_count
0,urban,1150,1
1,city,1004,1
2,local,869,1
3,development,799,1
4,research,760,1
...,...,...,...
27664,great strain,1,2
27665,great spatial,1,2
27666,great social heterogeneity,1,3
27667,great social diversity,1,3


In [134]:
p = "this -is # great ...that's project 9 sho9uld be ’projects10"
r = p.split()
r

['this',
 '-is',
 '#',
 'great',
 "...that's",
 'project',
 '9',
 'sho9uld',
 'be',
 '’projects10']

In [128]:
nltk.word_tokenize(p)

['this',
 'is',
 '#',
 'great',
 'that',
 "'s",
 'project',
 'should',
 'be',
 '’',
 'projects']

In [129]:
clean_text(p)

"# great 's project projects"

In [135]:
re.sub(r'\W', ' ', str(p))

'this  is   great    that s project 9 sho9uld be  projects10'

In [133]:
re.sub('\w*\d\w*', '', p).strip()

"this is # great that's project   be ’"

In [136]:
re.sub('\w*\d\w*', '', ['this',
 '-is',
 '#',
 'great',
 "...that's",
 'project',
 '9',
 'sho9uld',
 'be',
 '’projects10']).strip()

TypeError: expected string or bytes-like object

In [140]:
len(p)

59

In [141]:
type(len(p))

int

In [142]:
if len(p) > 0:
    print(True)

True
