This notebook is for the development of the Sentiment Analysis script that was used in the Trading Systems Development IQP. The indicator attempts to measure the sentiment of news headlines, which will be used to predict upcoming movements in the markets. 

The headlines are scraped using Selenium from the Investing.com website. The headlines are then preprocessed and their sentiment analyzed using the lexicon approach described by Loughran and McDonald [1]. The lexicon approach entails the use of two lists of words, one negative and one positive, counting the number of each type of word as well as the frequency of each individual word. This method yields a score for each headline, which is then aggregated on a daily basis to determine the score of the day.

[1] "When is a liability not a liability? Textual Analysis, Dictionaries, and 10-Ks" https://papers.ssrn.com/sol3/papers.cfm?abstract_id=1331573 


The sentiment score is calculated using the following formula:

$$
w_{i,j} = \left\{
        \begin{array}{ll}
            \frac{1+\log(tf_{i,j})}{1+\log(a)}\,\log\left(\frac{N}{df_i}\right) & tf_{i,j} \geq 1\\
            0 & otherwise
        \end{array}
    \right.
$$

Where the following variables are used:

$$
\text{N, the total number of headlines in the sample.}
\\
\text{a, the average word count in the sample.}
\\
\text{ $tf_{i,j}$, the raw count of the $i^{th}$ word in the $j^{th}$ document.}
\\
\text{$df_i$, the number of headlines containing at least one occurrence of the $i^{th}$ word.}
$$


The following lines preload the data, import necessary libraries and create user defined functions.

In [None]:
import os
import pandas as pd
#import collocation_analysis as ca
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
#from investing_scrape import scrape_headlines
import numpy as np
import re
from decimal import Decimal
import json
import time
import unicodedata
from pandas import DataFrame
from selenium import webdriver
from bs4 import BeautifulSoup
import datetime

#region FUNCTION DEFINITIONS
def clean_Text(text):
    """
    Tokenize a headline into lowercase words with numbers and stopwords removed.

    text: the raw headline. Anything that is not a string (for example the
          float NaN that pandas uses for an empty cell) is treated as a
          headline without words.

    Returns a list of lowercase word tokens. The list is empty for non-string
    input, so callers that iterate or concatenate the result never pick up
    the characters of a placeholder string.
    """
    # Missing cells are not strings; there is nothing to tokenize.
    if not isinstance(text, str):
        return []

    # Blank out whitespace-delimited numbers before tokenizing.
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text)
    tokens = RegexpTokenizer(r'\w+').tokenize(text.lower())

    # Build the stopword set once per call instead of reloading the list for
    # every token.
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

# Load the list of positive words
def loadPositive(path='C:\\Users\\Alan Fernandez\\IQPython\\fxnews\\fxnews\\csv_files\\LMC_Positive.csv'):
    """
    Read the positive lexicon, one entry per line.

    path: file to read. Defaults to the location the notebook was developed
          against, so existing zero-argument calls keep working; pass another
          path to run the notebook on a different machine.

    Returns the lines of the file, in file order, with surrounding whitespace
    stripped.
    """
    with open(path, 'r') as f:
        return [line.strip() for line in f]


# Load the list of negative words
def loadNegative(path='C:\\Users\\Alan Fernandez\\IQPython\\fxnews\\fxnews\\csv_files\\LMC_Negative.csv'):
    """
    Read the negative lexicon, one entry per line.

    path: file to read. Defaults to the location the notebook was developed
          against, so existing zero-argument calls keep working; pass another
          path to run the notebook on a different machine.

    Returns the lines of the file, in file order, with surrounding whitespace
    stripped.
    """
    with open(path, 'r') as f:
        return [line.strip() for line in f]


def countNegative(sentence, neg_lex_list, total_headline_count, doc_average_word_count,
                  frequency_table=None):
    """
    Score the negative-lexicon words of one headline.

    Each token of `sentence` whose upper-case form is in `neg_lex_list` adds
    the Loughran & McDonald weight

        Wij = ((1 + log(tfij)) / (1 + log(a))) * log(N / dfi)

    to the score, where
        N    is the total number of headlines in the sample,
        dfi  is the number of headlines containing the word at least once,
        tfij is the raw count of the word in this headline,
        a    is the average word count.
    A word whose weight is exactly zero (log(N / dfi) == 0) adds 1 instead.

    sentence:               the headline as a list of word tokens.
    neg_lex_list:           upper-case negative lexicon entries.
    total_headline_count:   N in the formula.
    doc_average_word_count: a in the formula; either a plain number or an
                            object with .item() (numpy scalar, one-element
                            pandas Series).
    frequency_table:        DataFrame with "word" and "dfi" columns. Defaults
                            to the notebook-level df_frequency.

    Returns the summed score as a number (0 when nothing matches).
    """
    if frequency_table is None:
        frequency_table = df_frequency

    N = total_headline_count
    a = doc_average_word_count
    if hasattr(a, "item"):
        a = a.item()

    # Loop invariants: the joined headline and the word column.
    joined_sentence = ' '.join(sentence)
    known_words = frequency_table["word"]

    headline_score = 0
    for word in sentence:
        if word.upper() not in neg_lex_list:
            continue

        # A word without a document frequency cannot be weighted; skip it
        # instead of failing with an IndexError.
        positions = np.where(known_words == word.lower())[0]
        if len(positions) == 0:
            continue
        # np.where yields positions, so the lookup has to be positional too.
        dfi = frequency_table["dfi"].iloc[positions[0]]

        # Frequency of this word within this particular headline.
        local_freq = sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(word), joined_sentence))
        if local_freq < 1:
            # The weight is defined as 0 when the term count is below 1.
            continue

        weight = ((1+np.log(local_freq))/(1+np.log(a)))*np.log(N/dfi)
        if weight != 0:
            headline_score += float(weight)
        else:
            headline_score += 1

    return headline_score


def countPositive(sentence, pos_lex_list, total_headline_count, doc_average_word_count,
                  frequency_table=None):
    """
    Score the positive-lexicon words of one headline.

    Each token of `sentence` whose upper-case form is in `pos_lex_list` adds
    the Loughran & McDonald weight

        Wij = ((1 + log(tfij)) / (1 + log(a))) * log(N / dfi)

    to the score, where
        N    is the total number of headlines in the sample,
        dfi  is the number of headlines containing the word at least once,
        tfij is the raw count of the word in this headline,
        a    is the average word count.
    A word whose weight is exactly zero (log(N / dfi) == 0) adds 1 instead.

    sentence:               the headline as a list of word tokens.
    pos_lex_list:           upper-case positive lexicon entries.
    total_headline_count:   N in the formula.
    doc_average_word_count: a in the formula; either a plain number or an
                            object with .item() (numpy scalar, one-element
                            pandas Series).
    frequency_table:        DataFrame with "word" and "dfi" columns. Defaults
                            to the notebook-level df_frequency.

    Returns the summed score as a number (0 when nothing matches).
    """
    if frequency_table is None:
        frequency_table = df_frequency

    N = total_headline_count
    a = doc_average_word_count
    if hasattr(a, "item"):
        a = a.item()

    # Loop invariants: the joined headline and the word column.
    joined_sentence = ' '.join(sentence)
    known_words = frequency_table["word"]

    headline_score = 0
    for word in sentence:
        if word.upper() not in pos_lex_list:
            continue

        # A word without a document frequency cannot be weighted; skip it
        # instead of failing with an IndexError.
        positions = np.where(known_words == word.lower())[0]
        if len(positions) == 0:
            continue
        # np.where yields positions, so the lookup has to be positional too.
        dfi = frequency_table["dfi"].iloc[positions[0]]

        # Frequency of this word within this particular headline.
        local_freq = sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(word), joined_sentence))
        if local_freq < 1:
            # The weight is defined as 0 when the term count is below 1.
            continue

        weight = ((1+np.log(local_freq))/(1+np.log(a)))*np.log(N/dfi)
        if weight != 0:
            headline_score += float(weight)
        else:
            headline_score += 1

    return headline_score


def calculateSentiment(clean_headline, positive_list, negative_list, awc):
    """
    Net lexicon sentiment of one tokenized headline.

    The negative score is computed first, then the positive score; both use
    the row count of the notebook-level df_headlines as the sample size and
    `awc` as the average word count. Returns positive minus negative.
    """
    sample_size = len(df_headlines)
    negative_score = countNegative(clean_headline, negative_list, sample_size, awc)
    positive_score = countPositive(clean_headline, positive_list, sample_size, awc)
    return positive_score - negative_score

# Scrape Investing.com currency-news headlines with Selenium + BeautifulSoup.
# NOTE(review): this file defines scrape_headlines twice; Python keeps only the
# last definition it executes.
def scrape_headlines(starting_page, ending_page, section):
    """
    Collect the headlines of pages `starting_page`..`ending_page` (inclusive)
    of https://www.investing.com/currencies/<section>/.

    A Chrome webdriver loads each page so its JavaScript can run, the rendered
    body HTML is parsed with BeautifulSoup, and every <article class="articleItem">
    except the last three on the page becomes one row with the columns
    title, date, url and source.

    Side effects: prints each title as it is read, writes the result to
    'headlines_InvestingCom_<start>_to_<end>.csv' in the working directory,
    and prints a completion banner.

    Returns the scraped rows as a DataFrame indexed 0..n-1.
    """
    browser = webdriver.Chrome()
    url_upto_item = "https://www.investing.com/currencies/" + section + '/'

    # Row number -> {"title", "date", "url", "source"}
    news = {}
    article_total = 0

    try:
        for page_number in range(starting_page, ending_page + 1):
            browser.get(url_upto_item + str(page_number))

            # Give the page's JavaScript two seconds to finish rendering.
            time.sleep(2)
            inner_html = browser.execute_script("return document.body.innerHTML")
            soup = BeautifulSoup(inner_html, 'html.parser')
            items = soup.find_all("article", class_="articleItem")

            # The trailing entries of each page are not real articles.
            # NOTE(review): the original comment said "last two", but the
            # original condition skipped the last three; the code's behavior
            # is kept here - confirm against the live page.
            for element in items[:max(len(items) - 3, 0)]:
                link = element.div.a
                entry = {"title": link["title"]}
                print(link["title"])
                # [3:] drops the first three characters of the span text
                # (presumably a separator prefix - confirm).
                entry["date"] = element.find_all("span", {"class": "date"})[0].get_text()[3:]

                # hrefs come in two forms: full short links and site-relative paths.
                if "https://invst.ly" in link["href"]:
                    entry["url"] = link["href"]
                else:
                    entry["url"] = "https://investing.com" + link["href"]

                try:
                    entry["source"] = element.span.span.get_text()[3:]
                except AttributeError:
                    # No nested span: fall back to the outer one.
                    entry["source"] = element.span.get_text()[3:]

                # Count only stored articles so the row labels have no gaps.
                news[article_total] = entry
                article_total += 1
    finally:
        # Always end the webdriver session, even when a page fails to parse.
        browser.quit()

    # One row per article: build from the dictionary, then transpose.
    df = pd.DataFrame.from_dict(news).T

    filename = 'headlines_InvestingCom_' + str(starting_page) + '_to_' + str(ending_page) + '.csv'
    df.to_csv(filename, encoding='utf-8')

    print("***********************")
    print("Headline Scraping Done.")
    print("***********************")

    return df

def wordFrequency(df, content_column):
    """
    Count how often each cleaned word occurs across a column of text.

    df:             DataFrame holding the text.
    content_column: name of the column to read; each cell is passed through
                    clean_Text before counting.

    Side effect: writes the result to 'word_frequency.csv' in the working
    directory (no header row).

    Returns a two-row DataFrame: row 0 holds the words, row 1 their counts,
    in order of first appearance.
    """
    frequency = {}

    for _, row in df.iterrows():
        tokens = clean_Text(row[content_column])
        # clean_Text may hand back a non-list placeholder for cells that are
        # not strings; such a cell contributes no words.
        if not isinstance(tokens, list):
            continue
        for word in tokens:
            frequency[word] = frequency.get(word, 0) + 1

    word_df = pd.DataFrame(list(frequency.items())).T

    # Let pandas open the file itself so newline handling is correct on every
    # platform (a handle opened in plain text mode produces blank rows on
    # Windows).
    word_df.to_csv('word_frequency.csv', header=False)

    return word_df

# Scrape Investing.com currency-news headlines with Selenium + BeautifulSoup.
# NOTE(review): this file defines scrape_headlines twice; Python keeps only the
# last definition it executes.
def scrape_headlines(starting_page, ending_page, section):
    """
    Collect the headlines of pages `starting_page`..`ending_page` (inclusive)
    of https://www.investing.com/currencies/<section>/.

    A Chrome webdriver loads each page so its JavaScript can run, the rendered
    body HTML is parsed with BeautifulSoup, and every <article class="articleItem">
    except the last three on the page becomes one row with the columns
    title, date, url and source.

    Side effects: prints each title as it is read, writes the result to
    'headlines_InvestingCom_<start>_to_<end>.csv' in the working directory,
    and prints a completion banner.

    Returns the scraped rows as a DataFrame indexed 0..n-1.
    """
    browser = webdriver.Chrome()
    url_upto_item = "https://www.investing.com/currencies/" + section + '/'

    # Row number -> {"title", "date", "url", "source"}
    news = {}
    article_total = 0

    try:
        for page_number in range(starting_page, ending_page + 1):
            browser.get(url_upto_item + str(page_number))

            # Give the page's JavaScript two seconds to finish rendering.
            time.sleep(2)
            inner_html = browser.execute_script("return document.body.innerHTML")
            soup = BeautifulSoup(inner_html, 'html.parser')
            items = soup.find_all("article", class_="articleItem")

            # The trailing entries of each page are not real articles.
            # NOTE(review): the original comment said "last two", but the
            # original condition skipped the last three; the code's behavior
            # is kept here - confirm against the live page.
            for element in items[:max(len(items) - 3, 0)]:
                link = element.div.a
                entry = {"title": link["title"]}
                print(link["title"])
                # [3:] drops the first three characters of the span text
                # (presumably a separator prefix - confirm).
                entry["date"] = element.find_all("span", {"class": "date"})[0].get_text()[3:]

                # hrefs come in two forms: full short links and site-relative paths.
                if "https://invst.ly" in link["href"]:
                    entry["url"] = link["href"]
                else:
                    entry["url"] = "https://investing.com" + link["href"]

                try:
                    entry["source"] = element.span.span.get_text()[3:]
                except AttributeError:
                    # No nested span: fall back to the outer one.
                    entry["source"] = element.span.get_text()[3:]

                # Count only stored articles so the row labels have no gaps.
                news[article_total] = entry
                article_total += 1
    finally:
        # Always end the webdriver session, even when a page fails to parse.
        browser.quit()

    # One row per article: build from the dictionary, then transpose.
    df = pd.DataFrame.from_dict(news).T

    filename = 'headlines_InvestingCom_' + str(starting_page) + '_to_' + str(ending_page) + '.csv'
    df.to_csv(filename, encoding='utf-8')

    print("***********************")
    print("Headline Scraping Done.")
    print("***********************")

    return df


# Remove advert rows from a scraped headline frame and replace relative
# "N hours ago" dates with today's date.
def clean_dataframe(df):
    """
    Return a cleaned copy of a scraped headline DataFrame.

    df: frame with at least the string columns 'date' and 'url'. It is not
        modified.

    Steps, in order:
      1. every date containing 'hour' is replaced with today's date
         formatted as '%b %d, %Y';
      2. rows whose url contains 'https://investing.com/education/' are dropped;
      3. rows whose date contains 'EDT' are dropped.

    Cells that are not strings (e.g. NaN) match none of the patterns and are
    kept unchanged.
    """
    # Imported locally: a later notebook cell rebinds the global name
    # `datetime` to the class of the same name.
    import datetime as datetime_module

    cleaned = df.copy()

    # Boolean masks with .loc update the frame itself; the previous chained
    # `.iloc[index]['date'] = ...` only wrote to a temporary row.
    is_relative = cleaned['date'].astype(str).str.contains('hour', regex=False)
    cleaned.loc[is_relative, 'date'] = datetime_module.datetime.now().strftime('%b %d, %Y')

    is_advert = cleaned['url'].astype(str).str.contains(
        'https://investing.com/education/', regex=False)
    # Evaluated after step 1, so a replaced date is judged by its new value.
    has_timezone = cleaned['date'].astype(str).str.contains('EDT', regex=False)

    return cleaned.loc[~(is_advert | has_timezone)]


In [None]:
# NOTE(review): `df_headline` is not assigned in any visible cell (the other
# cells use `df_headlines`), so this cell raises NameError as written.
df_headline

Scrape the Investing.com website and obtain a list of headlines.

In [None]:
# Scrape pages 1-943 of the USD/CHF news section, then clean the result.
# clean_dataframe takes the frame as its argument and returns the cleaned
# copy, so its result has to be assigned back.
df_headlines = clean_dataframe(scrape_headlines(1, 943, 'usd-chf-news'))
df_headlines

Next, create/import the dataframes containing the data to analyze. In this case the headlines have been downloaded previously, so they are imported from a csv file. The date column is converted to a datetime object for easy manipulation.


In [None]:
# Load the price history, the AAII survey data and the previously scraped headlines.
df_prices = pd.read_csv("C:\\Users\\Alan Fernandez\\IQPython\\fxnews\\fxnews\\csv_files\\USD_CHF_Sent_Prices.csv")
df_aaii = pd.read_csv("C:\\Users\\Alan Fernandez\\IQPython\\fxnews\\fxnews\\csv_files\\aaii_sentiment.csv")
df_headlines = pd.read_csv("C:\\Users\\Alan Fernandez\\Documents\\GitHub\\Investing-IQP-18\\Jupyter_MLX\\headlines_InvestingCom_Stock_1_to_1000.csv")

# Only the price and survey frames get their 'date' column parsed here.
df_prices = df_prices.assign(date=pd.to_datetime(df_prices['date']))
df_aaii = df_aaii.assign(date=pd.to_datetime(df_aaii['date']))

print(df_headlines)

Create lists to store calculations. These lists will eventually become the columns in the final dataframe. The positive and negative lexicon are loaded as well.

In [None]:
# Sentiment lexicons
pos_lex_list = loadPositive()
neg_lex_list = loadNegative()

# Per-indicator result columns, filled by later cells
aaii_sent_list, lexicon_sent_list, market_sent_list = [], [], []

# Matched ngrams (ngrams carry extra weight in the analysis)
ngram_match_list = []

# Final aggregate sentiment
final_sent_list, final_sent_list_word = [], []

# Squares of the even numbers below 5: [0, 4, 16]
new_range = [i * i for i in range(0, 5, 2)]

Determine word frequency in the entire database of headlines. Frequency will be used as a weight to balance 
terms with different levels of popularity.

In [None]:
# Row 0 of the returned frame holds the words, row 1 their total counts.
df_frequency = wordFrequency(df_headlines, "title")

N = len(df_headlines)
# One-element Series: sum of all word counts divided by the number of headlines.
average_word_count = df_frequency[1:2].sum(axis=1)/N
headlines_list = df_headlines['title'].tolist()
word_list = df_frequency[0:1].values.tolist()[0]

# Document frequency: tokenize every headline exactly once and count, per
# upper-cased token, the number of headlines that contain it.
tokenizer = RegexpTokenizer(r'\w+')
document_frequency = {}
for headline in headlines_list:
    # Missing titles are not strings and contain no words.
    if not isinstance(headline, str):
        continue
    for token in {tok.upper() for tok in tokenizer.tokenize(headline)}:
        document_frequency[token] = document_frequency.get(token, 0) + 1

word_occurrence = [document_frequency.get(word.upper(), 0) for word in word_list]

# Add the number of occurrences as a third row, then flip to one row per word.
# (DataFrame.append no longer exists in pandas 2.x; pd.concat is the replacement.)
df_frequency = pd.concat([df_frequency, pd.DataFrame([word_occurrence])], ignore_index=True)
df_frequency = df_frequency.T
df_frequency.columns = ["word", "frequency", "dfi"]

# Placeholder for per-word weights, one slot per word.
# NOTE(review): the previous sizing expression was evaluated after the
# transpose and produced 3 slots; one per word looks like the intent.
word_weight = [0] * len(df_frequency)

print(df_frequency)

Analyze the sentiment of the headlines.

In [None]:
%%capture
# Score every headline and collect the scores, in row order, in lexicon_sent_list.
# The cell magic above suppresses the output printed while this cell runs.
lexicon_sent_list = []
for index, row in df_headlines.iterrows():
    # First clean and tokenize the text
    # print("INDEX: " + str(index))
    word_list = clean_Text(row["title"])
    # A ValueError raised while scoring is caught below so that one bad row
    # does not stop the loop.
    try:
        # row_index = df_prices.loc[df_prices['date'] == df_headlines.iloc[index]['date']]
        # LEXICON SENTIMENT OF THE TEXT
        # The score is calculateSentiment's positive-minus-negative value.
        # flag records which stage was running when an error occurred; only the
        # lexicon stage (0) is active because the market stage is commented out.
        flag = 0
        s = calculateSentiment(word_list, pos_lex_list, neg_lex_list, average_word_count)

        lexicon_sent_list += [s]

        #
        # # NGRAM SENTIMENT OF THE TEXT
        # ngram_sent_temp = checkNgram(word_list, pos_ngram_list, neg_ngram_list)
        # if (ngram_sent_temp[0] > ngram_sent_temp[1]):
        #     ngram_sent_list += [-1]
        # elif (ngram_sent_temp[1] > ngram_sent_temp[0]):
        #     ngram_sent_list += [1]
        # else:
        #     ngram_sent_list += [0]
        # # Store the matching ngrams
        # ngram_match_list += [ngram_sent_temp[2]]
        #
        # # MARKET SENTIMENT OF THE TEXT
        # flag = 1
        # market_sent_list += [1 if row_index['close'].item() - row_index['open'].item() > 0 else -1]


    except ValueError:
        # Lexicon stage failed: record a neutral 0 for this headline.
        if flag == 0:
            market_sent_list += [0]
            lexicon_sent_list += [0]
        # Market stage failed (not reachable while `flag = 1` stays commented out).
        else:
            market_sent_list += [0]
            lexicon_sent_list[index] = 0
        # skipcount += 1
        # print("Skip: " + str(skipcount))
        continue
#endregion

Now combine the sentiment results with the corresponding headlines. Since each element in the sentiment list is the sentiment of the corresponding headline (index wise), this should be as simple as appending both lists to a DF. This new DF should be a copy of df_headlines with the sentiment column appended. 

In [None]:
# Copy of df_headlines with the per-headline scores attached as column 'sent'.
df_headlines_sent = df_headlines.assign(sent=lexicon_sent_list)

Now, with the DF ready with both headlines and sentiment, aggregate those sentiment scores into a final DF that includes one entry per date with the aggregate sentiment for that day. This will be the final output of the sentiment analyzer and will be imported into TradeStation. The date column should be date objects and the sentiment column should be titled 'Close' to match the format required by TS for third party data.

In [None]:
# NOTE(review): this import rebinds the notebook-level name `datetime` from the
# module (imported in the first code cell) to the class, so code run after this
# cell that calls `datetime.datetime...` on that global will fail.
from datetime import datetime
from dateutil.parser import parse

# Sum the headline scores per date value.
sentiment_dict = {}
for index, row in df_headlines_sent.iterrows():
    # NOTE(review): rows labelled 0-5 are left out; the reason is not visible here.
    if index <= 5:
        continue
    day = row['date']
    if day in sentiment_dict:
        # Date seen before: add to its running total.
        sentiment_dict[day] += row['sent']
    else:
        # First headline for this date: start its total.
        sentiment_dict[day] = row['sent']

sentiment_dict

In [None]:
# One row per date: turn the {date: score} mapping into a two-column frame,
# parse the dates and order the rows chronologically.
df_final = pd.Series(sentiment_dict).to_frame().reset_index()
df_final.columns = ['date', 'sent']
df_final['date'] = pd.to_datetime(df_final['date'])
df_final = df_final.sort_values(by='date')
df_final

In [None]:
# Display the headline frame for inspection.
df_headlines

### Headline and Lexicon Visualization 
Create a pie chart to visualize the percentage of headlines in each category. Also, visualize the entire lexicon by category. This comparison should help in determining whether the sentiment calculation is heavily biased towards one end of the spectrum.

Percentage of headlines that are positive

In [None]:
# Fraction of the rows of df_final whose 'sent' value is above zero.
positive_rows = sum(1 for score in df_final['sent'] if score > 0)
a = positive_rows / len(df_final)
a

Percentage of headlines that are negative

In [None]:
# Fraction of the rows of df_final whose 'sent' value is below zero.
negative_rows = sum(1 for score in df_final['sent'] if score < 0)
b = negative_rows / len(df_final)
b

Percentage of headlines that are neutral

In [None]:
# Fraction of the rows of df_final whose 'sent' value is exactly zero.
neutral_rows = sum(1 for score in df_final['sent'] if score == 0)
c = neutral_rows / len(df_final)
c

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Pie chart of the positive / negative / neutral fractions computed above.
import matplotlib.pyplot as plt

labels = ('Positive', 'Negative', 'Neutral')
sizes = [a, b, c]
colors = ['green', 'red', 'yellow']
explode = (0.2, 0, 0)  # pull the first (positive) slice out of the pie

plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
plt.axis('equal')
plt.title('Percentage of Headlines by Sentiment')
plt.show()

In [None]:
# Pie chart of how the lexicon splits between positive and negative entries.
pos = len(pos_lex_list)/(len(neg_lex_list)+len(pos_lex_list))
neg = len(neg_lex_list)/(len(neg_lex_list)+len(pos_lex_list))

labels = ('Positive Words', 'Negative Words')
sizes = [pos, neg]
colors = ['green', 'red']
explode = (0.2, 0)  # pull the first (positive) slice out of the pie

plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
plt.axis('equal')
plt.title('Percentage of Words in the Lexicon in Each Sentiment Category')
plt.show()

## Sentiment Analysis With NGRAM

In this second sentiment analysis, NGRAMs are identified and added to the lexicon. With the ngrams it will be possible to identify and to correct for the negation of sentiment words.
This requires a list of negation words. The script should iterate through the headline dataframe, find each instance of a negation word, check if it is followed by a sentiment word and if it is, then add the ngram to a list.
When the sentiment calculation is performed, the headline will be checked for each ngram to correct for negation.

Create new functions necessary for NGRAM analysis and import the necessary libraries.

In [None]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import names
import os
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from os import path
from nltk.collocations import *

def csvToList(df):
    """
    Flatten the "text" column of a DataFrame into one list of cleaned tokens.

    df: frame with a "text" column; each cell is passed through clean_Text.

    Returns a single list holding the tokens of every row, in row order.
    """
    word_list = []

    for _, row in df.iterrows():
        tokens = clean_Text(row["text"])
        # clean_Text may hand back a non-list placeholder for cells that are
        # not strings; such a cell contributes no words.
        if isinstance(tokens, list):
            word_list.extend(tokens)
    return word_list

def prepareSources():
    """
    Load the green-day and red-day headline files and tokenize each.

    Both CSVs are read (green first, then red) with iso-8859-1 encoding and
    flattened with csvToList.

    Returns [red_tokens, green_tokens].
    """
    green_frame = pd.read_csv('C:\\Users\\Alan Fernandez\\IQPython\\fxnews\\fxnews\\csv_files\\headline_daycolor_green.csv', encoding='iso-8859-1')
    red_frame = pd.read_csv('C:\\Users\\Alan Fernandez\\IQPython\\fxnews\\fxnews\\csv_files\\headline_daycolor_red.csv', encoding='iso-8859-1')

    # The red list is built first and comes first in the result.
    return [csvToList(red_frame), csvToList(green_frame)]


# Rank the bigram collocations of a token list by PMI.
def getBigrams(word_list, freq_filter, best_filter):
    """
    word_list:   flat list of tokens to search.
    freq_filter: bigrams occurring fewer than this many times are ignored.
    best_filter: how many top-scoring bigrams to return.

    Returns the `best_filter` best bigrams according to the PMI measure.
    """
    scorer = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(word_list)
    finder.apply_freq_filter(freq_filter)
    return finder.nbest(scorer.pmi, best_filter)

def getTrigrams(word_list, freq_filter, best_filter):
    """
    Rank the trigram collocations of a token list by PMI.

    word_list:   flat list of tokens to search.
    freq_filter: trigrams occurring fewer than this many times are ignored.
    best_filter: how many top-scoring trigrams to return.

    Both filters are now honored; the previous body ignored its arguments and
    always used 3 and 30. Pass those values explicitly to keep the old result.

    Returns the `best_filter` best trigrams according to the PMI measure.
    """
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    trigram_finder = TrigramCollocationFinder.from_words(word_list)
    trigram_finder.apply_freq_filter(freq_filter)
    return trigram_finder.nbest(trigram_measures.pmi, best_filter)

def generateNgrams(pos_text_list, neg_text_list, minOcurrence, topN):
    """
    Extract the top bigrams from the positive and from the negative texts.

    Each list of raw texts is cleaned with listCleanUp and handed to
    getBigrams with `minOcurrence` as the frequency filter and `topN` as the
    number of bigrams to keep. The positive texts are processed first.

    Returns [positive_bigrams, negative_bigrams].
    """
    positive_bigrams = getBigrams(listCleanUp(pos_text_list), minOcurrence, topN)
    negative_bigrams = getBigrams(listCleanUp(neg_text_list), minOcurrence, topN)
    return [positive_bigrams, negative_bigrams]


# print("preparing sources")
# lists = prepareSources()
# print("generating green list")
# green_list = getGreenBigrams(lists[1], 5, 20)
# print(green_list)
# print("generating red list")
# red_list = getRedBigrams(lists[0], 5, 20)
# print(red_list)
# print("Done")

def listCleanUp(dirty_list):
    """
    Clean every text in `dirty_list` and return all tokens as one flat list.

    dirty_list: iterable of raw texts; each is passed through clean_Text.

    Returns the concatenated tokens in input order.
    """
    clean_list = []
    for item in dirty_list:
        tokens = clean_Text(item)
        # clean_Text may hand back a non-list placeholder for items that are
        # not strings; such an item contributes no words.
        if isinstance(tokens, list):
            clean_list.extend(tokens)
    return clean_list

def is_in_lexicon(word):
    """
    Report whether `word` appears in the negative and/or positive lexicon.

    The upper-cased word is looked up in the notebook-level neg_lex_list and
    pos_lex_list. For each lexicon, its label and then "True" or "False" is
    printed (negative first).

    Returns True when the word is in at least one lexicon, False otherwise,
    so the function can also be used as a predicate.
    """
    target = word.upper()
    found_anywhere = False
    for label, lexicon in (("Neg_Lex ", neg_lex_list), ("Pos_Lex", pos_lex_list)):
        found = target in lexicon
        print(label)
        print("True" if found else "False")
        found_anywhere = found_anywhere or found
    return found_anywhere

In [None]:
# Split the titles by the sign of their score; zero-score titles go in neither list.
pos_title_list = df_headlines_sent.loc[df_headlines_sent['sent'] > 0, 'title'].tolist()
neg_title_list = df_headlines_sent.loc[df_headlines_sent['sent'] < 0, 'title'].tolist()

# Top 100 bigrams per group, each seen at least 5 times.
ngram_lists = generateNgrams(pos_title_list, neg_title_list, 5, 100)

The addition of ngrams to the lexicon analysis is sometimes considered unnecessary because one of the words in the ngram is typically already included in the lexicon, so the ngram does not bring any additional value. It is possible to account for negation using ngrams although simply searching for a negation word before a sentiment word achieves the same purpose. There are, however, certain ngrams that should contribute to the analysis such as 'trade war' which is always negative in a financial context.

In the cells below the entire list of headlines is searched for ngrams that could potentially bring some value to the analysis. A comparison of how many words in the list of ngrams are also found in the lexicon is performed.

In [None]:
# Bigrams drawn from every headline, regardless of sentiment score:
# keep the 100 best that occur at least 5 times.
minOccurence, topN = 5, 100
ngram_list_1 = getBigrams(listCleanUp(headlines_list), minOccurence, topN)

In [None]:
# Count how many distinct bigram words also appear in a lexicon.
# count[0] is the negative-lexicon tally, count[1] the positive one. A word is
# tallied only once, for the first lexicon it is found in.
list_of_lex = [neg_lex_list, pos_lex_list]
count = [0, 0]
matches = []

for tupl in ngram_list_1:
    for word in tupl:
        for index, lst in enumerate(list_of_lex):
            if word in matches or word.upper() not in lst:
                continue
            matches.append(word)
            print(word)
            count[index] += 1

Now, re-define the countPositive and countNegative functions to include negation detection as well as ngram detection.

In [None]:
def countPositive_extra(sentence, pos_lex_list, total_headline_count, doc_average_word_count,
                        frequency_table=None, ngram_list=None):
    """
    Score the positive-lexicon words of one headline with negation and ngram handling.

    Each token whose upper-case form is in `pos_lex_list` contributes the
    Loughran & McDonald weight

        Wij = ((1 + log(tfij)) / (1 + log(a))) * log(N / dfi)

    where N is the total number of headlines, dfi the number of headlines
    containing the word, tfij the count of the word in this headline and a the
    average word count. A weight of exactly zero contributes 1 instead. The
    contribution is multiplied by -1 when the preceding token is 'not'.
    Every ngram of `ngram_list` found in the headline adds 1.

    sentence:               the headline as a list of word tokens.
    pos_lex_list:           upper-case positive lexicon entries.
    total_headline_count:   N in the formula.
    doc_average_word_count: a in the formula; a plain number or an object
                            with .item() (numpy scalar, one-element Series).
    frequency_table:        DataFrame with "word" and "dfi" columns. Defaults
                            to the notebook-level df_frequency.
    ngram_list:             ngrams to search for, each a tuple of tokens or a
                            single-token string. Defaults to the notebook-level
                            pos_ngram_list.

    Returns the summed score as a number.
    """
    if frequency_table is None:
        frequency_table = df_frequency
    if ngram_list is None:
        ngram_list = pos_ngram_list

    N = total_headline_count
    a = doc_average_word_count
    if hasattr(a, "item"):
        # Reduce a one-element Series / numpy scalar to a plain number.
        a = a.item()

    negation_word_list = ['not']
    # Drop stopwords but keep the negation words so they can be detected.
    # NOTE(review): a sentence that already went through clean_Text has
    # presumably lost 'not' as a stopword - confirm what callers pass in.
    stop_list = {word for word in stopwords.words('english') if word not in negation_word_list}
    tokenized_sentence = [word for word in sentence if word not in stop_list]

    joined_sentence = ' '.join(sentence)
    known_words = frequency_table["word"]
    headline_score = 0

    # ****** Word search ******
    for index, word in enumerate(tokenized_sentence):
        if word.upper() not in pos_lex_list:
            continue

        # Flip the sign when the word directly follows a negation word.
        preceded_by_negation = index > 0 and tokenized_sentence[index-1] in negation_word_list
        negated = -1 if preceded_by_negation else 1

        # A word without a document frequency cannot be weighted; skip it.
        positions = np.where(known_words == word.lower())[0]
        if len(positions) == 0:
            continue
        # np.where yields positions, so the lookup has to be positional too.
        dfi = frequency_table["dfi"].iloc[positions[0]]

        # Frequency of this word within this particular headline.
        local_freq = sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(word), joined_sentence))
        if local_freq < 1:
            # The weight is defined as 0 when the term count is below 1.
            continue

        weight = ((1+np.log(local_freq))/(1+np.log(a)))*np.log(N/dfi)
        if weight != 0:
            headline_score += float(weight) * negated
        else:
            headline_score += 1 * negated

    # ****** NGRAM Search ******
    # A tuple ngram matches when its tokens appear consecutively; a plain `in`
    # test against a list of strings can never match a tuple.
    for ngram in ngram_list:
        parts = [ngram] if isinstance(ngram, str) else list(ngram)
        width = len(parts)
        if width == 0:
            continue
        last_start = len(tokenized_sentence) - width
        if any(tokenized_sentence[start:start + width] == parts for start in range(last_start + 1)):
            headline_score += 1

    return headline_score

def countNegative_extra(sentence, neg_lex_list, total_headline_count, doc_average_word_count):
    """Score the negative-lexicon content of one tokenized headline.

    Each remaining token (after stop-word removal, keeping 'not') whose
    upper-cased form is in ``neg_lex_list`` contributes the Loughran &
    McDonald weight

        Wij = ((1 + log(tfij)) / (1 + log(a))) * log(N / dfi)

    where N is the total number of headlines, dfi the number of headlines
    containing the word, tfij the count of the word in this headline and
    a the average word count. A weight of exactly zero is replaced by 1.
    The contribution is multiplied by -1 when the token directly before it
    (in the stop-word-filtered sentence) is 'not'.

    Reads the module-level ``df_frequency`` table (columns "word" and "dfi")
    and the module-level ``neg_ngram_list``.

    Args:
        sentence: The headline as a list of words.
        neg_lex_list: Collection of negative lexicon words; tokens are
            upper-cased before the membership test.
        total_headline_count: N in the formula above.
        doc_average_word_count: a in the formula above; either a plain
            number or a one-element object exposing ``.item()``.

    Returns:
        The accumulated negative score of the headline.

    Raises:
        IndexError: If a matched word has no row in ``df_frequency``.
    """
    N = total_headline_count
    # Accept a plain number as well as a one-element Series/array.
    if hasattr(doc_average_word_count, "item"):
        a = doc_average_word_count.item()
    else:
        a = doc_average_word_count

    negation_words = {'not'}
    # Keep the negation word in the sentence so it can flip the sign below.
    stop_words = set(stopwords.words('english')) - negation_words
    tokenized_sentence = [word for word in sentence if word not in stop_words]
    joined_sentence = ' '.join(sentence)

    headline_score = 0
    # ****** Word search ******
    for index, word in enumerate(tokenized_sentence):
        if word.upper() not in neg_lex_list:
            continue

        # Flip the sign when the word is directly preceded by a negation word.
        preceded_by_negation = index > 0 and tokenized_sentence[index - 1] in negation_words
        negated = -1 if preceded_by_negation else 1

        # Locate the word's row to obtain its document frequency.
        wIndex = np.where(df_frequency["word"] == word.lower())[0][0]
        dfi = df_frequency["dfi"][wIndex]

        # Whole-word occurrences of this word in the unfiltered headline.
        local_freq = sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(word), joined_sentence))

        weight = float(((1 + np.log(local_freq)) / (1 + np.log(a))) * np.log(N / dfi))

        # A zero weight (e.g. the word occurs in every headline) still counts as 1.
        if weight != 0:
            headline_score += weight * negated
        else:
            headline_score += 1 * negated

    # ****** NGRAM Search ******
    # NOTE(review): this compares each n-gram string against individual tokens,
    # so a multi-word n-gram can only match if a single token equals the whole
    # phrase. Behaviour is kept as-is; confirm the intended matching rule.
    for ngram in neg_ngram_list:
        if ngram in tokenized_sentence:
            headline_score += 1

    return headline_score


def calculateSentiment_extra(clean_headline, positive_list, negative_list, awc):
    """Return the net lexicon sentiment of one cleaned headline.

    The negative score comes from ``countNegative_extra`` and the positive
    score from ``countPositive_extra``; both are given ``len(df_headlines)``
    as the headline count and ``awc`` as the average word count. The result
    is the positive score minus the negative score.
    """
    headline_total = len(df_headlines)
    negative_score = countNegative_extra(clean_headline, negative_list, headline_total, awc)
    positive_score = countPositive_extra(clean_headline, positive_list, headline_total, awc)
    return positive_score - negative_score

In [None]:
# Load the positive and negative lexicons.
pos_lex_list = loadPositive()
neg_lex_list = loadNegative()

# One accumulator list per indicator.
aaii_sent_list, lexicon_sent_list, market_sent_list = [], [], []

# Matches against the n-grams that carry extra weight in the analysis.
ngram_match_list = []

# Containers for the final aggregate sentiment.
final_sent_list, final_sent_list_word = [], []

# Squares of the even numbers below 5.
new_range = [value * value for value in range(0, 5, 2)]

In [None]:
# Build the word table used by the weighting formula: one row per word with
# its overall frequency and the number of headlines that contain it (dfi).
df_frequency = wordFrequency(df_headlines, "title")

N = len(df_headlines)
# Row 1 is summed and divided by the number of headlines.
# NOTE(review): presumably row 0 holds the words and row 1 their counts,
# which makes this the average word count per headline. Verify against wordFrequency.
average_word_count = df_frequency[1:2].sum(axis=1)/N
headlines_list = df_headlines['title'].tolist()
word_list = df_frequency[0:1].values.tolist()[0]

# Tokenize each headline once and count, per upper-cased token, how many
# headlines contain it (a headline counts at most once per token).
headline_tokenizer = RegexpTokenizer(r'\w+')
document_frequency = {}
for headline in headlines_list:
    for token in {tok.upper() for tok in headline_tokenizer.tokenize(headline)}:
        document_frequency[token] = document_frequency.get(token, 0) + 1
word_occurrence = [document_frequency.get(word.upper(), 0) for word in word_list]

# Add the number of occurrences to the data frame as a third row.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df_frequency = pd.concat(
    [df_frequency, pd.DataFrame([word_occurrence], columns=df_frequency.columns)],
    ignore_index=True,
)
df_frequency = df_frequency.T
df_frequency.columns = ["word", "frequency", "dfi"]

# Calculate the weight
# NOTE(review): after the transpose every row has three columns, so this sizes
# the list from the non-null count of the row labelled 1 rather than from the
# number of words. len(df_frequency) was probably intended; confirm before use.
word_weight = [0 for x in range(df_frequency.count(axis='columns')[1])]

In [None]:
# Score every headline; lexicon_sent_list receives exactly one entry per row
# of df_headlines, in row order.
lexicon_sent_list = []
for _, row in df_headlines.iterrows():
    # First clean and tokenize the text. A dedicated name is used so the
    # vocabulary list `word_list` built earlier is not overwritten.
    headline_tokens = clean_Text(row["title"])
    try:
        headline_sentiment = calculateSentiment_extra(
            headline_tokens, pos_lex_list, neg_lex_list, average_word_count
        )
    except ValueError:
        # A headline whose score cannot be computed is recorded as neutral.
        # NOTE(review): market_sent_list only grows on this error path, as in
        # the original cell; confirm whether that is still wanted.
        market_sent_list.append(0)
        headline_sentiment = 0
    lexicon_sent_list.append(headline_sentiment)

In [None]:
df_headlines_sent.iloc[0]['date']

In [None]:
# Copy the headlines table and attach the per-headline scores as a 'sent'
# column; lexicon_sent_list is assigned row-for-row to the copy of df_headlines.
df_headlines_sent = df_headlines.copy()
df_headlines_sent['sent'] = lexicon_sent_list
# Display the result in the notebook.
df_headlines_sent

In [None]:
from datetime import datetime
from dateutil.parser import parse

# Sum the headline scores per date, keeping dates in first-seen order.
sentiment_dict = {}
for _, entry in df_headlines_sent.iterrows():
    day = entry['date']
    if day in sentiment_dict:
        # Date already seen: accumulate onto the running total.
        sentiment_dict[day] += entry['sent']
    else:
        # First headline for this date: start its total.
        sentiment_dict[day] = entry['sent']

# Turn the totals into a two-column frame ordered chronologically.
df_final_ngram = pd.Series(sentiment_dict).to_frame().reset_index()
df_final_ngram.columns = ['date', 'sent']
df_final_ngram['date'] = pd.to_datetime(df_final_ngram.date)
df_final_ngram.sort_values(by='date', inplace=True)

#### TESTING
This section is for testing the results of the script. Delete this section when finished.


In [None]:
# Fraction of rows in df_final whose 'sent' equals the value at the same
# position in df_final_ngram; prints 'False' once per differing row.
match = 0
for position, final_row in df_final.iterrows():
    if final_row['sent'] != df_final_ngram.iloc[position]['sent']:
        print('False')
        continue
    match += 1
match/len(df_final)

In [None]:
df_comparison = pd.DataFrame()
df_comparison = pd.concat([df_final])
df_comparison

In [None]:
df_comparison = df_comparison.T

In [None]:
df_comparison

In [None]:
# Print the position of every headline containing the substring 'not'
# and count how many there are.
found = 0
for position, headline_text in enumerate(headlines_list):
    if 'not' not in headline_text:
        continue
    print(position)
    found += 1

In [None]:
tokenizer = RegexpTokenizer(r'\w+')
text = tokenizer.tokenize(headlines_list[4])
countPositive_extra(text, pos_lex_list, len(headlines_list), average_word_count)

In [None]:
# Print every token of the sample headline that is in the negative lexicon.
tokenizer = RegexpTokenizer(r'\w+')
text = tokenizer.tokenize(headlines_list[4])
for word in text:
    # Compare the upper-cased token, the same way countNegative_extra does,
    # so this probe reports the words the scorer would actually match.
    if word.upper() in neg_lex_list:
        print(word)

In [None]:
a = calculateSentiment(text, pos_lex_list, neg_lex_list, average_word_count)
b = calculateSentiment_extra(text, pos_lex_list, neg_lex_list, average_word_count)
print(a)

In [None]:
df_headlines

In [None]:
for index, row in df_comparison.iterrows():
    if row['sent'] < -10:
        print(index)

In [None]:
df_final_ngram

In [None]:
df_final

In [None]:
df_comparison = pd.concat([df_final])

In [None]:
df_comparison = pd.concat([df_final_ngram])

In [None]:
df_final_ngram = df_final_ngram.rename(index=str, columns={"date": "date_2", "sent": "sent_2"})


In [None]:
df_final_ngram

In [None]:
df_comparison = pd.concat([df_final, df_final_ngram], axis=1, ignore_index=True)

In [None]:
df_comparison

In [None]:
ngram_lists

In [None]:
# Multi-word phrases that add to the negative / positive headline score.
neg_ngram_list = [
    'missing forecasts',
    'lowest since',
    'shorts increase',
    'trade war',
    'extends losses',
    'losing streak',
    'government shutdown',
    'retraces weakness',
]
# 'consensus' was misspelled as 'concensus', so the phrase could never match
# a correctly spelled headline.
pos_ngram_list = ['winning streak', 'job creation', 'matches consensus']

In [None]:
ngram_list_1

In [None]:
lexicon_sent_list

In [None]:
df_headlines_sent = clean_dataframe(df_headlines_sent)

In [None]:
df_headlines_sent