In [2]:
# import packages #
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import time
from tqdm import tqdm_notebook
from numba import jit

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# for removing accented characters
import unicodedata

# for expanding contractions
from contractions import CONTRACTION_MAP
from pycontractions import Contractions

# multiple outputs in one chunk (jupyter notebook)
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = 'all'



# load corpus #
def load_corpus(articles):
    '''
    Load corpus, 100 articles.
    '''
    df = pd.read_csv(articles)
    df = df.to_dict()
    return df 



# load Loughran & McDonald finance dictionary #
jit(nopython = True, parallel = True)
def load_LM_dict(dictionary, pos = True, neg = True):
    '''
    Load positive/negative words list. Adopt dictionary to improve efficiency.
    '''
    if pos: 
        poslst = pd.read_excel(dictionary, sheet_name = "Positive", header = None)[0].tolist()
        posdict = {pos:1 for pos in poslst}
    if neg:
        neglst = pd.read_excel(dictionary, sheet_name = "Negative", header = None)[0].tolist()
        negdict = {neg:1 for neg in neglst}
    return posdict, negdict



# text pre-processing #

## remove HTML tags ##
def remove_html_tags(text):
    '''
    Remove the noise, html tags.
    '''
    bs = BeautifulSoup(text, 'html.parser')
    text = bs.get_text()
    return text


## remove accented characters ##
def remove_accented_characters(text):
    '''
    Convert the corpus into English pharsings.
    '''
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


## expand contractions ##
def expand_contractions(text, contraction_mapping = CONTRACTION_MAP):
    '''
    Expand contractions. If necessary, add additional contraction-expansion pairs into contraction.py.
    '''
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags = re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
            if contraction_mapping.get(match)\
            else contraction_mapping.get(match.lower())
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'","", expanded_text)
    return expanded_text


## remove special characters ##
def remove_special_characters(text):
    '''
    Remove puntuations, numbers, redundant spaces.
    '''
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(' +' ,' ', text)
    return text


## intialize a nlp model ##
jit(nopython = True, parallel = True)
def set_custom_boundaries(doc):
    '''
    Fix the bug in spacy: specify $. and }. as additional forms to end a sentence.
    '''
    for token in doc[:-1]:
        if "$." in token.text or "}." in token.text or token.text == ";":
            doc[token.i+1].is_sent_start = True
    return doc

def initialization(model):
    '''
    Multiple statistical models are available in the spaCy library,
    including en_core_web_sm, en_core_web_md, en_core_web_lg, en.
    '''
    nlp = spacy.load(model, disable = ['parser', 'tagger', 'ner'])
    try:
        nlp.add_pipe(set_custom_boundaries, before = "parser")
    except:
        pass
    return nlp


# build a text normalizer #
jit(nopython = True, parallel = True)
def normalizer(text, nlp, stopwords_list,
               no_accented_chars = True, no_contracted_chars = True, no_lammas = True, no_stopwords = False):
    '''
    Build a text normalizer. Assign the Boolean value True to the applied tokenizer. 
    By defualt, do not remove accented characters, do not expand contractions, 
    do not do words lemmatization and remove stopwords.
    '''
    text = remove_html_tags(text)
    if not no_accented_chars:
        text = remove_accented_characters(text)    
    if not no_contracted_chars:
        text = expand_contractions(text, contraction_mapping = CONTRACTION_MAP)   
    text = remove_special_characters(text).lower()
    
    text = nlp(text)
    if not no_lammas:
        
        tokens = [word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text]
    else:
        tokens = [token.text for token in text]
    if not no_stopwords:
        tokens = [token for token in tokens if not token in stopwords_list]
#         tokens = list(filter(lambda x: x not in stopwords_list, tokens))
    normalized_tokens = [token.upper() for token in tokens if len(token) > 1]
    return normalized_tokens



# sentiment analysis #
jit(nopython = True, parallel = True)
def sentiment(words, posdict, negdict):
    '''
    Count positive/negative words. 
    Calculate sentiment score.
    Sentiment score is positive words fraction minus negative words fraction in the article content
    '''
    poswds, negwds = 0, 0
    poslst, neglst = [], []
    totwds = len(words)
    for word in words:
        try:
            posdict[word] == 1
            poswds += 1
            poslst.append(word)
        except:
            try:
                negdict[word] == 1
                negwds += 1
                neglst.append(word)
            except:
                continue        
    sentisc = (poswds - negwds)/totwds 
    return poswds, negwds, totwds, sentisc, poslst, neglst



if __name__ == "__main__":
    
    # record initial time
    time_start = time.time()
    
    # load the corpus, dictionary
    corpus = load_corpus("articles.csv")
    posdict, negdict = load_LM_dict("LoughranMcDonald_SentimentWordLists_2018.xlsx")
    
    # update stopwords list, remove overlap: stopwords corpus, LM dictionary
    stopwords_list = [word for word in STOP_WORDS if not word.upper() in posdict and not word.upper() in negdict]
    
    # prepare a dictionary for the output
    opdict = {'article_id': list(corpus['aid'].values()), 'pos_word': [], 'neg_word': [], 
              'total_word': [], 'sentiment': [], 'pos_lst': [], 'neg_lst': []}
    
    # initialize nlp model
    nlp = initialization('en')
    
    jit(nopython = True, parallel = True)
    def output():
        for i in tqdm_notebook(range(len(corpus['aid']))):

            # text pre-processing
            text = corpus['content'][i]
            normalized_tokens = normalizer(text, nlp, stopwords_list)

            # sentiment analysis
            poswds, negwds, totwds, sentisc, poslst, neglst = sentiment(normalized_tokens, posdict, negdict)

            # update output
            opdict['pos_word'].append(poswds)
            opdict['neg_word'].append(negwds)
            opdict['total_word'].append(totwds)
            opdict['sentiment'].append(sentisc)
            opdict['pos_lst'].append(poslst)
            opdict['neg_lst'].append(neglst)
        
        return pd.DataFrame(opdict)
    
    opdf = output()
    opdf[['article_id', 'pos_word', 'neg_word', 'total_word', 'sentiment']].to_csv("results_spacy.csv", index = False)
    
    
    # record finish time
    time_end = time.time()
    print(time_end - time_start)

<function numba.decorators._jit.<locals>.wrapper(func)>

<function numba.decorators._jit.<locals>.wrapper(func)>

<function numba.decorators._jit.<locals>.wrapper(func)>

<function numba.decorators._jit.<locals>.wrapper(func)>

<function numba.decorators._jit.<locals>.wrapper(func)>

HBox(children=(IntProgress(value=0), HTML(value='')))


2.482034921646118
