In [None]:
import pandas as pd
import pickle
import re
import numpy as np

In [None]:
import nltk
from nltk import bigrams
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import MWETokenizer
from pattern.en import lemma

In [None]:
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Download the NLTK data sets named "stopwords" and "wordnet".
# NOTE(review): sent_tokenize and nltk.pos_tag are also called in this notebook and
# presumably need their own NLTK data packages -- confirm they are installed.
nltk.download("stopwords")
nltk.download("wordnet")

In [None]:
def remove_phrases(text):
    """Strip boilerplate phrases from a document's text.

    Removed, in this order:
      * copyright notices: an optional "Copyright", then " © ", then everything
        up to and including the next full stop on the same line;
      * "et al" written with a regular, thin (U+2009) or non-breaking (U+00A0)
        space;
      * the phrase "de la";
      * the abbreviations "i.e." and "e.g.";
      * the literal strings " Todos los derechos reservados." and " -H.Clout".

    "et al" and "de la" are matched as whole words only. A plain substring
    replacement would also eat the middle of ordinary text, e.g. turning
    "set all" into "sl" and "made last" into "mast".

    Args:
        text: the document text (str).

    Returns:
        The text with the phrases deleted. Whitespace and punctuation around a
        deleted phrase are left as they were.
    """
    # Copyright notice up to the end of its sentence.
    text = re.sub("(Copyright)?( © )(.+?)(\\.)", '', text)
    # \b keeps the match from firing inside longer words.
    text = re.sub("\\bet[ \u2009\xa0]al\\b", '', text)
    text = re.sub("\\bde la\\b", '', text)
    text = text.replace("i.e.", "").replace("e.g.", "")
    text = text.replace(" Todos los derechos reservados.", "").replace(" -H.Clout", "")
    return text

In [None]:
def remove_citations(text):
    """Delete every parenthesised span from ``text``.

    Each match runs from an opening bracket to the nearest closing bracket on
    the same line, brackets included. Nothing else (e.g. surrounding spaces)
    is touched.
    """
    parenthetical = re.compile("\\((.*?)\\)")
    return parenthetical.sub('', text)

In [None]:
# Context terms: despite the name, a sentence is KEPT only if it contains at
# least one of these substrings (see sent_with_keyword).
remove = ['destination', 'experience', 'industry', 'product', 'segment']

def sent_with_keyword(text, keyword):
    """Return the sentences of ``text`` that mention the keyword and a context term.

    A sentence is selected when it contains ``keyword`` as a substring and also
    contains at least one of the substrings listed in ``remove``. Matching is
    case-sensitive.

    Args:
        text: the document text (str), split into sentences with sent_tokenize.
        keyword: substring every selected sentence must contain. ``None``
            disables the keyword test (only the context terms are checked)
            instead of raising TypeError.

    Returns:
        The selected sentences joined by single spaces; "" when none qualify.
    """
    sentences = sent_tokenize(text)
    selected = [
        sent for sent in sentences
        if (keyword is None or keyword in sent)
        and any(term in sent for term in remove)
    ]
    return " ".join(selected)

In [None]:
# English stop words from the NLTK stopwords corpus, held in a set.
stop_words = set(stopwords.words('english'))
# Token pattern handed to RegexpTokenizer in tokenize(): a run of word
# characters and hyphens delimited by word boundaries; (?u) requests Unicode matching.
regex = "(?u)\\b[\\w-]+\\b"

def tokenize(text, keyword):
    """Tokenize the keyword-bearing sentences of ``text``.

    The text is lower-cased, reduced to the sentences chosen by
    sent_with_keyword, split into tokens with the module-level ``regex``
    pattern, and finally the token runs u-s-a, u-s and b-b are merged into
    the single tokens "usa", "us" and "bb".

    Args:
        text: the document text (str).
        keyword: keyword forwarded to sent_with_keyword. A string keyword is
            lower-cased first: the text is lower-cased, so a keyword containing
            capitals could otherwise never match. Non-string values are
            forwarded unchanged.

    Returns:
        A list of lower-case tokens.
    """
    if isinstance(keyword, str):
        keyword = keyword.lower()
    text = sent_with_keyword(text.lower(), keyword)
    tokens = RegexpTokenizer(regex).tokenize(text)
    # Re-join abbreviations that the word pattern splits at their full stops.
    mwe_tokenizer = MWETokenizer([('u', 's', 'a'), ('u', 's'), ('b', 'b')], separator='')
    return mwe_tokenizer.tokenize(tokens)

In [None]:
# The try/except is needed because pattern's lemma() can fail with a
# StopIteration-related error under Python 3.7.
# NOTE(review): the failure is reportedly limited to the first call(s) while
# pattern loads its data -- confirm against the installed pattern version.
def lemmatize(text):
    """Lemmatize each token with pattern's ``lemma``.

    A token whose lemmatization raises is retried once, so a transient failure
    does not silently drop that token. A token that fails both attempts is
    skipped (best effort). Only ``Exception`` subclasses are caught, so
    KeyboardInterrupt and SystemExit still propagate.

    Args:
        text: iterable of token strings.

    Returns:
        A list with the lemma of every token that could be lemmatized, in
        input order.
    """
    lemmas = []
    for token in text:
        for _attempt in range(2):
            try:
                lemmas.append(lemma(token))
                break
            except Exception:
                continue
    return lemmas

In [None]:
def remove_stopwords(text, to_ignore):
    """Return the tokens of ``text`` that do not occur in ``to_ignore``.

    Order and duplicates of the remaining tokens are preserved.
    """
    kept = []
    for token in text:
        if token in to_ignore:
            continue
        kept.append(token)
    return kept

In [None]:
def pos_tagger(text):
    """Return only the tokens that NLTK tags as verbs.

    The token list is tagged with ``nltk.pos_tag`` using the universal tag set;
    words tagged 'VERB' are returned in their original order.
    """
    verbs = []
    for word, tag in nltk.pos_tag(text, tagset='universal'):
        if tag == 'VERB':
            verbs.append(word)
    return verbs

In [None]:
def preprocess(df, to_ignore=None, keyword=None, save_to_file=None):
    """Clean the documents in ``df`` and reduce each one to a list of verb lemmas.

    Steps:
      1. keep the Title, Date, Abstract, URL and Content columns and drop rows
         whose Content is empty or missing;
      2. clean Content with remove_phrases and remove_citations;
      3. tokenize each document with ``tokenize(content, keyword)`` and store
         the result in the 'original tokens' column;
      4. keep only verbs (pos_tagger), lemmatize them, and drop ignored words,
         English stop words, tokens starting with a digit and empty tokens;
         store the result in the 'tokens' column.

    Side effects: the two token lists are pickled to
    'stakeholders-original-tokens.pkl' and 'stakeholders-tokens.pkl' in the
    current working directory, and the frame is written as CSV when
    ``save_to_file`` is given. ``df`` itself is not modified.

    Args:
        df: DataFrame with at least the five columns listed above.
        to_ignore: collection of lemmas to discard; defaults to the
            notebook-level ``ignore_words``.
        keyword: keyword passed on to ``tokenize``.
        save_to_file: optional CSV path for the resulting frame.

    Returns:
        Tuple ``(original, tokens, new_df)``: token lists per document before
        and after verb filtering, and the processed DataFrame.
    """
    if to_ignore is None:
        # use default
        # NOTE(review): ignore_words must already be defined at notebook level
        # when this default is used -- it is not defined alongside this function.
        to_ignore = ignore_words

    columns = ['Title', 'Date', 'Abstract', 'URL', 'Content']
    content = df['Content']
    # Missing content (NaN/None) passes a plain != '' test and then breaks the
    # string clean-up below, so it is filtered out together with empty strings.
    has_content = content.notna() & (content != '')
    # Explicit copy: the columns assigned below must not target a view of df.
    new_df = df.loc[has_content, columns].reset_index(drop=True).copy()

    # remove copyright phrases and citations
    new_df['Content'] = new_df['Content'].apply(remove_phrases).apply(remove_citations)

    # create tokens from selected sentences only, based on keyword
    tokens = [tokenize(t, keyword) for t in new_df['Content']]

    # keep all tokens for later
    original = tokens
    new_df['original tokens'] = original
    with open('stakeholders-original-tokens.pkl', 'wb') as fh:
        pickle.dump(original, fh)

    # keep only verbs
    tokens = [remove_stopwords(lemmatize(pos_tagger(t)), to_ignore) for t in tokens]
    # The emptiness test comes first so a falsy token never reaches re.match.
    tokens = [
        [t for t in doc if t and t not in stop_words and not re.match("[0-9]", t)]
        for doc in tokens
    ]

    new_df['tokens'] = tokens
    with open('stakeholders-tokens.pkl', 'wb') as fh:
        pickle.dump(tokens, fh)

    if save_to_file:
        new_df.to_csv(save_to_file, index=False)

    return original, tokens, new_df

### Word Frequencies

Find the most frequent words in the corpus.

In [None]:
from nltk.probability import FreqDist

def word_frequencies(tokens):
    """Count how often each token occurs across all documents.

    ``tokens`` is a list of per-document token lists; they are flattened into
    one list and handed to ``FreqDist``, which is returned.
    """
    flattened = []
    for doc in tokens:
        flattened.extend(doc)
    return FreqDist(flattened)

### Context of words

Find the contexts that the words in the given list have in common.

In [None]:
from nltk import Text

def word_contexts(tokens, words, num=20):
    """Look up the contexts shared by ``words`` across all documents.

    The per-document token lists are flattened into a single NLTK ``Text`` and
    ``common_contexts(words, num)`` is called on it; its return value is passed
    through unchanged.
    NOTE(review): NLTK's common_contexts appears to print its findings rather
    than return them -- confirm before relying on the return value.
    """
    stream = []
    for document in tokens:
        stream.extend(document)
    corpus = Text(stream)
    return corpus.common_contexts(words, num)

## Concordances

Print every sentence that contains the selected word, one document at a time.

In [None]:
def concordance(data, word):
    """Print every sentence containing ``word``, document by document.

    Each entry of ``data['Content']`` is split into sentences with
    sent_tokenize; sentences that contain ``word`` as a substring are printed
    in order, followed by an end-of-document marker line.
    """
    for document in data['Content']:
        matching = (sentence for sentence in sent_tokenize(document) if word in sentence)
        for sentence in matching:
            print(sentence)
        print("---- End of document ----")