In [7]:
# import packages #
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import time
from tqdm import tqdm_notebook
from numba import jit

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import ToktokTokenizer
from nltk.corpus import stopwords

# for removing accented characters
import unicodedata

# for expanding contractions
from contractions import CONTRACTION_MAP
from pycontractions import Contractions

# for lemmatization
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# multiple outputs in one chunk (jupyter notebook)
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = 'all'



# load corpus #
def load_corpus(articles):
    '''
    Load corpus, 100 articles.
    '''
    df = pd.read_csv(articles)
    df = df.to_dict()
    return df 



# load Loughran & McDonald finance dictionary #
jit(nopython = True, parallel = True)
def load_LM_dict(dictionary, pos = True, neg = True):
    '''
    Load positive/negative words list. Adopt dictionary to improve efficiency.
    '''
    if pos: 
        poslst = pd.read_excel(dictionary, sheet_name = "Positive", header = None)[0].tolist()
        posdict = {pos:1 for pos in poslst}
    if neg:
        neglst = pd.read_excel(dictionary, sheet_name = "Negative", header = None)[0].tolist()
        negdict = {neg:1 for neg in neglst}
    return posdict, negdict



# text pre-processing #

## remove HTML tags ##
def remove_html_tags(text):
    '''
    Remove the noise, html tags.
    '''
    bs = BeautifulSoup(text, 'html.parser')
    text = bs.get_text()
    return text


## remove accented characters ##
def remove_accented_characters(text):
    '''
    Convert the corpus into English pharsings.
    '''
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


## expand contractions ##
def expand_contractions(text, contraction_mapping = CONTRACTION_MAP):
    '''
    Expand contractions. If necessary, add additional contraction-expansion pairs into contraction.py.
    '''
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags = re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
            if contraction_mapping.get(match)\
            else contraction_mapping.get(match.lower())
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'","", expanded_text)
    return expanded_text


## remove special characters ##
def remove_special_characters(text):
    '''
    Remove puntuations, numbers, redundant spaces.
    '''
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(' +' ,' ', text)
    return text


## tokenize words ##
def tokenize_words(text, wt = False, wpt = False, wst = False, tbwt = False, tkt = False):
    '''
    Multiple tokenizers are available in NLTK library. Create a tokenizer container.
    '''
    if wt:
        return word_tokenize(text)
    if wpt:
        return WordPunctTokenizer().tokenize(text)
    if wst:
        return WhitespaceTokenizer().tokenize(text)
    if tbwt:
        return TreebankWordTokenizer().tokenize(text)
    if tkt:
        return ToktokTokenizer().tokenize(text)
    
    
## lemmatization ##
def wordnet_pos(tag):
    '''
    Obtain Part-Of-Speech tags (in lowercase).
    '''
    if tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

jit(nopython = True, parallel = True)
def lemmatization(tokens):
    '''
    Convert multiple word forms backs into lemmas.
    '''
    wnl = WordNetLemmatizer()
    lemmas = [] 
    for token, tag in pos_tag(tokens):
        pos = wordnet_pos(tag) or wordnet.NOUN
        lemmas.append(wnl.lemmatize(token, pos)) 
    return lemmas


## remove stopwords ##
jit(nopython = True, parallel = True)
def remove_stopwords(stopwords_list, tokens):
    '''
    Remove stopwords in the corpus.
    '''
    cleaned_tokens = list(filter(lambda x: x not in stopwords_list, tokens))
#     cleaned_tokens = [token for token in tokens if not token in stopwords_list]
    return cleaned_tokens



# build a text normalizer #
jit(nopython = True, parallel = True)
def normalizer(text, stopwords_list, wtk = False, wptk = False, wstk = False, tbwtk = False, tktk = False,
               no_accented_chars = True, no_contracted_chars = True, no_lammas = True, no_stopwords = False):
    '''
    Build a text normalizer. Assign the Boolean value True to the applied tokenizer. 
    By defualt, do not remove accented characters, do not expand contractions, 
    do not do words lemmatization and remove stopwords.
    '''
    text = remove_html_tags(text)
    if not no_accented_chars:
        text = remove_accented_characters(text)   
    if not no_contracted_chars:
        text = expand_contractions(text, contraction_mapping = CONTRACTION_MAP)
    text = remove_special_characters(text).lower()
    tokens = tokenize_words(text, wt = wtk, wpt = wptk, wst = wstk, tbwt = tbwtk, tkt = tktk) 
    if not no_lammas:
        tokens = lemmatization(tokens)    
    if not no_stopwords:
        tokens = remove_stopwords(stopwords_list, tokens)   
    normalized_tokens = [token.upper() for token in tokens if len(token) > 1]
    return normalized_tokens



# sentiment analysis #
jit(nopython = True, parallel = True)
def sentiment(words, posdict, negdict):
    '''
    Count positive/negative words. 
    Calculate sentiment score.
    Sentiment score is positive words fraction minus negative words fraction in the article content
    '''
    poswds, negwds = 0, 0
    poslst, neglst = [], []
    totwds = len(words)
    for word in words:
        try:
            posdict[word] == 1
            poswds += 1
            poslst.append(word)
        except:
            try:
                negdict[word] == 1
                negwds += 1
                neglst.append(word)
            except:
                continue        
    sentisc = (poswds - negwds)/totwds 
    return poswds, negwds, totwds, sentisc, poslst, neglst



if __name__ == "__main__":
    
    # record initial time
    time_start = time.time()
    
    # load the corpus, dictionary
    corpus = load_corpus("articles.csv")
    posdict, negdict = load_LM_dict("LoughranMcDonald_SentimentWordLists_2018.xlsx")
    
    # update stopwords list, remove overlap: stopwords corpus, LM dictionary
    stop_words = nltk.corpus.stopwords.words('english')
    stopwords_list = [word for word in stop_words if not word.upper() in posdict and not word.upper() in negdict]
    
    # prepare a dictionary for the output
    opdict = {'article_id': list(corpus['aid'].values()), 'pos_word': [], 'neg_word': [], 
              'total_word': [], 'sentiment': [], 'pos_lst': [], 'neg_lst': []}
    
    jit(nopython = True, parallel = True)
    def output():
        for i in tqdm_notebook(range(len(corpus['aid']))):

            # text pre-processing
            text = corpus['content'][i]
            normalized_tokens = normalizer(text, stopwords_list, wtk = True)

            # sentiment analysis
            poswds, negwds, totwds, sentisc, poslst, neglst = sentiment(normalized_tokens, posdict, negdict)

            # update output
            opdict['pos_word'].append(poswds)
            opdict['neg_word'].append(negwds)
            opdict['total_word'].append(totwds)
            opdict['sentiment'].append(sentisc)
            opdict['pos_lst'].append(poslst)
            opdict['neg_lst'].append(neglst)
        
        return pd.DataFrame(opdict)
    
    opdf = output()
    opdf[['article_id', 'pos_word', 'neg_word', 'total_word', 'sentiment']].to_csv("results_nltk.csv", index = False)
    
    
    # record finish time
    time_end = time.time()
    print(time_end - time_start)

<function numba.decorators._jit.<locals>.wrapper(func)>

<function numba.decorators._jit.<locals>.wrapper(func)>

<function numba.decorators._jit.<locals>.wrapper(func)>

<function numba.decorators._jit.<locals>.wrapper(func)>

<function numba.decorators._jit.<locals>.wrapper(func)>

<function numba.decorators._jit.<locals>.wrapper(func)>

HBox(children=(IntProgress(value=0), HTML(value='')))


1.8030929565429688
