# In [39]:
"""
Created on Wed Dec 29 15:53:19 2019

@author: zionzhou

Introduction:
    This file provides a function that calculates the sentiment of articles. The result is exported as
    a CSV file containing the columns (article_id, pos_word, neg_word, total_word, sentiment).
    
    The function covers tokenization, contraction expansion, lemmatization and stop-word exclusion.
    By default, contraction expansion and lemmatization are switched off, because in my analysis the two
    contributed little to the sentiment results. However, both methods are included in the function
    in case anyone wants to use them in his/her project.
    
    It is worth noting that the total word count is taken after the stop words have been excluded from
    the article. Most stop words carry no sentiment, yet they would still influence the result, because the
    sentiment score is the difference between the positive and negative word counts divided by the total
    word count. The words counted in the total should have sentiment potential, unlike the meaningless stop
    words, so the stop words are excluded before the total words are counted.
    
"""
# import the packages needed first
import re
import pandas as pd
import time
import string
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from contractions import CONTRACTION_MAP
from bs4 import BeautifulSoup

# Start the clock so the total run time can be reported at the end of the script
time1 = time.time()
# Load the articles and turn the table into a plain dict ({column: {row: value}});
# the dict form is intended to speed up the per-article access in the analysis below
df = pd.read_csv('articles.csv').to_dict()
# Load the negative and positive word lists: first column of each sheet, no header row
negative_list_pre = tuple(
    pd.read_excel(
        'LoughranMcDonald_SentimentWordLists_2018.xlsx',
        sheet_name='Negative',
        header=None,
    )[0]
)
positive_list_pre = tuple(
    pd.read_excel(
        'LoughranMcDonald_SentimentWordLists_2018.xlsx',
        sheet_name='Positive',
        header=None,
    )[0]
)
# Dict versions of both lists (word -> 1), used for the word lookups during scoring
negative_list = dict.fromkeys(negative_list_pre, 1)
positive_list = dict.fromkeys(positive_list_pre, 1)
# English stop words shipped with NLTK
StopWords = stopwords.words('english')
# The stop-word list we use is not customized for our analysis: some of its words also appear in the negative/positive lists
# Therefore we build a cleaned stop-word list that excludes every word found in either the negative list or the positive list
def cleaned_stopwords(pos_list,neg_list,stopwords):
    """Return the stop words that are not sentiment words.

    Args:
        pos_list: iterable of positive sentiment words, in upper case.
        neg_list: iterable of negative sentiment words, in upper case.
        stopwords: iterable of candidate stop words (any case).

    Returns:
        A list with every entry of ``stopwords`` whose upper-cased form is in
        neither ``pos_list`` nor ``neg_list``, in the original order.
    """
    # Use the lists that were passed in (not module globals) and merge them
    # into one set up front, so each stop word costs a single hash lookup.
    sentiment_words = set(pos_list) | set(neg_list)
    return [word for word in stopwords if word.upper() not in sentiment_words]

# Stop words actually used by the analysis: the NLTK list minus every sentiment word
CleanedStopWords = cleaned_stopwords(
    pos_list=positive_list_pre,
    neg_list=negative_list_pre,
    stopwords=StopWords,
)

# Define the function of calculating the sentiment score of a collection of articles
# Here, by default, we don't use the Expand_Contraction method and Lemmatization method
# If you want to use any of these two methods, set the corresponding status value equal to 1

def _score_tokens(words, pos_list, neg_list):
    """Count sentiment words in *words* and return (pos, neg, total, score)."""
    pos = neg = 0
    for word in words:
        # A word present in both lists is counted as positive only.
        if word in pos_list:
            pos += 1
        elif word in neg_list:
            neg += 1
    total = len(words)
    # An article without any countable word carries no sentiment signal:
    # score it as neutral instead of dividing by zero.
    score = (pos - neg) / total if total else 0.0
    return pos, neg, total, score


def _compile_contraction_pattern(contraction_mapping):
    """Build one case-insensitive regex that matches any key of the mapping."""
    # Longest keys first: regex alternation takes the first alternative that
    # matches, so a key that is a prefix of a longer key must come after it.
    keys = sorted(contraction_mapping, key=len, reverse=True)
    # Escape the keys so they are always matched literally.
    return re.compile('({})'.format('|'.join(re.escape(key) for key in keys)),
                      flags=re.IGNORECASE | re.DOTALL)


def _expand_contractions(text, pattern, contraction_mapping):
    """Replace each contraction in *text* by its expansion, then drop apostrophes."""
    def expand_match(found):
        matched = found.group(0)
        expansion = (contraction_mapping.get(matched)
                     or contraction_mapping.get(matched.lower()))
        if not expansion:
            # No usable expansion for this spelling: keep the text as written.
            return matched
        # Keep the first character as typed so the capitalisation survives.
        return matched[0] + expansion[1:]

    return pattern.sub(expand_match, text).replace("'", "")


def _clean_tokens(doc, stop_words, contraction_pattern, lemmatizer):
    """Turn one raw article into upper-cased tokens with the stop words removed."""
    doc = BeautifulSoup(doc, 'html.parser').get_text()
    if contraction_pattern is not None:
        doc = _expand_contractions(doc, contraction_pattern, CONTRACTION_MAP)
    # Everything except ASCII letters and whitespace becomes a space.
    doc = re.sub(r'[^a-zA-Z\s]', ' ', doc).lower()
    tokens = word_tokenize(doc)
    if lemmatizer is None:
        return [token.upper() for token in tokens
                if token not in stop_words and len(token) > 1]
    cleaned_tokens = []
    for token, tag in pos_tag(tokens):
        # Map the tag prefix to the part-of-speech code given to the lemmatizer.
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)
        # NOTE(review): unlike the branch above, one-letter tokens are kept
        # here, which affects total_word; confirm whether that is intended.
        if len(token) > 0 and token not in string.punctuation and token not in stop_words:
            cleaned_tokens.append(token.upper())
    return cleaned_tokens


def calculate_sentiment(DataFrame, negative_list, positive_list,
                        expand_contraction=0, Lemmatization=0, *, stop_words=None):
    """Compute word counts and a sentiment score for every article.

    The score of an article is (pos_word - neg_word) / total_word, where
    total_word is the number of tokens left after cleaning.

    Args:
        DataFrame: mapping with the entries 'aid' and 'content', each of them
            a mapping from row key to value (the layout produced by
            ``pandas.DataFrame.to_dict()``). Rows are processed in the order
            of DataFrame['aid']; the keys need not be 0..n-1.
        negative_list: container of upper-case negative words.
        positive_list: container of upper-case positive words.
        expand_contraction: contractions are expanded only when this equals 1.
        Lemmatization: tokens are lemmatized when this is anything but 0.
        stop_words: optional iterable of lower-case stop words to drop.
            Defaults to the module-level ``CleanedStopWords``.

    Returns:
        A pandas DataFrame with one row per article and the columns
        article_id, pos_word, neg_word, total_word, sentiment. Articles whose
        content is blank or not text (e.g. NaN from an empty CSV cell) get
        zero counts and a sentiment of 0.0.

    Raises:
        KeyError: if 'aid' or 'content' is missing, or a row key of 'aid' has
            no matching entry in 'content'.
        Errors raised while parsing or tokenizing an article propagate
        unchanged. Catching them here would leave the result columns with
        different lengths and hide the real cause.
    """
    if stop_words is None:
        stop_words = CleanedStopWords
    # Built once per call: hash lookups instead of a list scan for every token.
    stop_words = frozenset(stop_words)
    # The optional helpers are created once, not once per article or per token.
    contraction_pattern = (_compile_contraction_pattern(CONTRACTION_MAP)
                           if expand_contraction == 1 else None)
    lemmatizer = WordNetLemmatizer() if Lemmatization != 0 else None

    contents = DataFrame['content']
    result = {'article_id': [], 'pos_word': [], 'neg_word': [], 'total_word': [], 'sentiment': []}
    # Ids come from the DataFrame argument itself and are paired with their
    # content by row key, so every column stays aligned.
    for row_key, article_id in DataFrame['aid'].items():
        content = contents[row_key]
        if isinstance(content, (str, bytes)) and content.strip():
            tokens = _clean_tokens(content, stop_words, contraction_pattern, lemmatizer)
        else:
            # Blank or missing content: nothing to parse.
            tokens = []
        pos, neg, tot_words, senti = _score_tokens(tokens, positive_list, negative_list)
        result['article_id'].append(article_id)
        result['pos_word'].append(pos)
        result['neg_word'].append(neg)
        result['total_word'].append(tot_words)
        result['sentiment'].append(senti)

    return pd.DataFrame(data=result)    # transfer the dictionary into dataframe
# Run the sentiment analysis over all loaded articles.
# Pass expand_contraction=1 and/or Lemmatization=1 as extra arguments to switch on the optional steps.
result = calculate_sentiment(
    DataFrame=df,
    negative_list=negative_list,
    positive_list=positive_list,
)
# Write the per-article results to disk, without the index column
result.to_csv('result.csv', index=False)
# Stop the clock
time2 = time.time()
# Report the elapsed time in seconds
print(time2 - time1)

# Output of the run above (elapsed seconds): 1.362084150314331
