In [1]:
import re
import sys

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA  # for compactness

In [3]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/alexanderdesouza/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alexanderdesouza/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alexanderdesouza/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

NLTK's sentiment intensity analyzer is based on a rule-based model that implements the model described here  _Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014._

In [6]:
pd.set_option('max.columns', 999)

In [7]:
raw_headlines = pd.read_csv('../data/abcnews_million_headlines.csv')

In [8]:
raw_headlines.sample(3)

Unnamed: 0,date,headline
185464,20050831,leslie tests positive for drug use police
657266,20111202,missing man
622744,20110624,push on for indigenous constitutional recognition


In [9]:
raw_headlines.shape

(1048575, 2)

One million documents presents an extremely large corpus, that for the sake of this demonstration will make training excessively long without a smarter way in which to build up the TF-IDF matrix. For the present context then we randomly sample 1,000 documents for the experiments below.

In [10]:
raw_headlines = raw_headlines.sample(1000)
raw_headlines.reset_index(drop=True, inplace=True)

In [11]:
raw_headlines.sample(3)

Unnamed: 0,date,headline
23,20130819,senex energy granted exploration licence for c...
500,20140603,woman found dead at lithgow in suspicious circ...
216,20050814,principals seek review of report card system


In [12]:
raw_headlines.shape

(1000, 2)

A semantic intensity analyzer is constructed using NLTK's SIA (a robust, but rule-based, lexicographic heuristic).

In [13]:
sia = SIA()  # initialize a nltk semantic intensity analyzer

In [14]:
scored_headlines = []

for headline in raw_headlines['headline'].values:
    sia_scores = sia.polarity_scores(headline)
    sia_scores['headline'] = headline
    scored_headlines.append(sia_scores)
    
headlines = pd.DataFrame(scored_headlines)
# headlines['date'] = raw_headlines['date']  # in application the date is irrelevant and thus omitted here from further consideration

In [15]:
headlines.sample(5)

Unnamed: 0,compound,headline,neg,neu,pos
33,0.0,six month reprieve for dubbo grandstand,0.0,1.0,0.0
433,0.4215,lolo jones admits to emotional roller coaster ...,0.0,0.648,0.352
197,0.0,binge drinking on peoples minds ama,0.0,1.0,0.0
649,0.0,the road ahead for the relationship between,0.0,1.0,0.0
974,-0.3612,man dead another questioned by police sunshine...,0.41,0.36,0.23


In [16]:
# The memory footprint of this object is...
sys.getsizeof(headlines)

129615

Roughly a single Mb, which is sufficiently small as to be maintainable in memory by the application.

For compactness the set is divided into positively and negatively associated corpuses...

In [17]:
pve_articles = headlines[headlines['compound'] >= 0.0]
nve_articles = headlines[headlines['compound'] < 0.0]

In [18]:
# Distribution of the articles
print("{:.1f}% articles have positive sentiment".format(len(pve_articles)/(len(pve_articles)+len(nve_articles))*100))
print("{:.1f}% articles have negative sentiment".format(len(nve_articles)/(len(pve_articles)+len(nve_articles))*100))

64.4% articles have positive sentiment
35.6% articles have negative sentiment


In [19]:
# Total length is preserved
len(pve_articles)+len(nve_articles)==len(raw_headlines)

True

We construct some regular expressions to be able to clean any text that is input into the system, and define a preprocessor method that lower cases the input text, "cleans" abbreviations, and removes general special characters, and strips dashes and underscores.

In [20]:
re_abbr = re.compile(r'(?:^|\s)((?:\w(\.\s|\s|\.))(?:\w\2)+)', re.UNICODE)
re_abbr_separator = re.compile(r'(\s|\.)', re.UNICODE)
re_numword = re.compile(r'(\s\d*\s)|\s\d*\.\d*\s', re.UNICODE)
re_specialchar_removal = re.compile(r'(!|@|#|&|\(|\)|\+|=|\{|\}|\[|\]|:|;|\"|\'|,|\.$|\?)', re.UNICODE)
re_specialchar_numsymbremoval = re.compile(r'(\$|%)', re.UNICODE)
re_dash_removal = re.compile(r'-|_', re.UNICODE)


def abbreviations_to_words(text):
    """
    Converts all abbreviations found in the input string to a single word format.
    """
    text += " "
    all_abbreviations = [x[0] for x in re_abbr.findall(text + " ")]
    for abbreviation in all_abbreviations:
        new_form = re_abbr_separator.sub('', abbreviation)
        text = text.replace(abbreviation, new_form)
    return text.strip()


def preprocessor(text):
    """
    Applies the following preprocessing steps to any input text:
        - lowercases all text
        - maps abbreviations to same format (e.g., A.D., A. D., A D to AD)
        - removes general special characters (e.g., an '!' or an '&' symbol)
        - splits words that contains dashes or underscores
        - strips the any newline characters
    """
    text = text.lower()
    text = abbreviations_to_words(text)
    text = re_specialchar_removal.sub('', text)
    text = re_dash_removal.sub(' ', text)
    text = re_specialchar_numsymbremoval.sub('', text) # depending on intent, this should be optional
    text = re_numword.sub(' numword ', text)           # and this one
    return text.strip()


def score_text(text):
    """
    """
    scored_text = []
    sia_score = sia.polarity_scores(text)
    sia_score['headline'] = text
    scored_text.append(sia_score)
    return pd.DataFrame(scored_text)


def add_article(text):
    """
    """
    global pve_articles
    global nve_articles
    text = preprocessor(text)
    scored_text = score_text(text)
    if scored_text['compound'].values[0] >= 0:
        pve_articles = pd.concat([pve_articles, scored_text]).reset_index(drop=True)
        print("\"{}\" added to the pve article set with score {}.".format(scored_text['headline'].values[0], scored_text['compound'].values[0]))
        return 1
    else:
        nve_articles = pd.concat([nve_articles, scored_text]).reset_index(drop=True)
        print("\"{}\" added to the nve article set with score {}.".format(scored_text['headline'].values[0], scored_text['compound'].values[0]))
        return -1

I Googled around for some headlines from CBC.ca/news, NPR.org, Bloomberg.com, and the MIT Technology Review (https://www.technologyreview.com/) to test the functionality of this preprocessing; the resulting examples below.

In [21]:
headline_tests = ["Hi! My name is Alexander.",  # not actually a news item :P
                  "\'Storm of a lifetime\': 1.7 million ordered to flee approaching fury of Florence",
                  "Trump Administration Transferred $9.8-Million From F.E.M.A. To I.C.E.",
                  "A $100 Million Haircut for the Buyout Crowd",
                  "Crypto Plunges 80%! Now Worse Than the Dot-Com Crash!",
                  "How Bank Workers Emerged From the Crash $12.5 Billion Richer",
                  "H.N.A.'s Debt Declines for First Time, Shrinking by $8.3-Billion"]

In [22]:
headline_processed_tests = []
for hl in headline_tests:
    headline_processed_tests.append(preprocessor(hl))

headline_processed_tests

['hi my name is alexander',
 'storm of a lifetime numword million ordered to flee approaching fury of florence',
 'trump administration transferred numword million from fema to ice',
 'a numword million haircut for the buyout crowd',
 'crypto plunges numword now worse than the dot com crash',
 'how bank workers emerged from the crash numword billion richer',
 'hnas debt declines for first time shrinking by numword billion']

Looks good enough!

Next we add the articles, performing the preprocessing as an intermediate step.

In [23]:
for hl in headline_tests:
    add_article(hl)

"hi my name is alexander" added to the pve article set with score 0.0.
"storm of a lifetime numword million ordered to flee approaching fury of florence" added to the nve article set with score -0.5719.
"trump administration transferred numword million from fema to ice" added to the pve article set with score 0.0.
"a numword million haircut for the buyout crowd" added to the pve article set with score 0.0.
"crypto plunges numword now worse than the dot com crash" added to the nve article set with score -0.7003.
"how bank workers emerged from the crash numword billion richer" added to the pve article set with score 0.1779.
"hnas debt declines for first time shrinking by numword billion" added to the nve article set with score -0.3612.


Next, a new headline is introduced that is roughly equivalent to one of the above to act as a comparative reference in evaluting the efficacy of what follows.

In [24]:
new_headline = preprocessor("Big Bank Employees Came Out Wealthier from the Great Recession!")
new_headline

'big bank employees came out wealthier from the great recession'

In order to match different, but analogous, sentences the body text should be simplified. One way to accomplish this is to remove stopwords, stem, and lemmatize all the sentences to be matched. Here, a tokenizer and lemmatizer are applied to the text. Text objects can then be compared by calculating and evaluating the cosine distance between the resultant document vectors.

In [25]:
# This initial attempt utilized Porter Stemming instead of Lemmatization
def tokenize_and_stem(text):
    """
    """
    stemmer = nltk.stem.porter.PorterStemmer()  # porter stemming, rule-based word reduction
    tokens = nltk.word_tokenize(text)  # generate word tokens
    return [i for i in [stemmer.stem(t) for t in tokens] if len(i) > 2]

In [26]:
# The above method is recomposed as a class that can be passed directly to the TF-IDF vectorizer that follows;
# lemmatization is selected and applied in place of the Porter stemming
class TokenLemmatizer(object):
    """
    """
    def __init__(self):
        self.lemmatizer = nltk.stem.WordNetLemmatizer()
    def __call__(self, text):
        tokens = nltk.word_tokenize(text)
        return [i for i in [self.lemmatizer.lemmatize(t) for t in tokens] if len(i) > 2]

In [27]:
# Visual inspection of the output from the TokenLemmatizer() on the `new_headline`
tokenize_and_stem(new_headline)

['big',
 'bank',
 'employe',
 'came',
 'out',
 'wealthier',
 'from',
 'the',
 'great',
 'recess']

TF-IDF is used to construct the vectorized forms of documents.

Performance while varying `ngram_range` between one and three resulted in significantly decreased performance. This is likely due to the fact that we're specifically analyzing analogue sentences, where not only are the words between two documents potentially in a different order, but synonyms for words may effect the groupings as well. In such a case the construction of n-grams will artificially increase the distance between associated word vectors.

Choice of `max_df` and `min_df` below made intuitively with several manual iterations used to optimize performance (default values are [1.0, 1.0]).

In [28]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=TokenLemmatizer(), max_df=0.5, min_df=0.01)  # hence, no n-grams here
train_tfidf = tfidf_vectorizer.fit_transform(headline_processed_tests)

In [29]:
# And let's take a look at what the train_tfidf matrix looks like, as a dataframe
pd.DataFrame(train_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names())

Unnamed: 0,administration,alexander,approaching,bank,billion,buyout,com,crash,crowd,crypto,debt,decline,dot,emerged,fema,flee,florence,fury,haircut,hnas,ice,lifetime,million,ordered,plunge,richer,shrinking,storm,time,transferred,trump,worker,worse
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.365065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.365065,0.365065,0.365065,0.0,0.0,0.0,0.365065,0.259024,0.365065,0.0,0.0,0.0,0.365065,0.0,0.0,0.0,0.0,0.0
2,0.426268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.426268,0.0,0.0,0.0,0.0,0.0,0.426268,0.0,0.30245,0.0,0.0,0.0,0.0,0.0,0.0,0.426268,0.426268,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.534261,0.0,0.0,0.534261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.534261,0.0,0.0,0.0,0.379074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.419257,0.348019,0.0,0.419257,0.0,0.0,0.419257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419257
5,0.0,0.0,0.0,0.431207,0.357939,0.0,0.0,0.357939,0.0,0.0,0.0,0.0,0.0,0.431207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.431207,0.0,0.0,0.0,0.0,0.0,0.431207,0.0
6,0.0,0.0,0.0,0.0,0.348019,0.0,0.0,0.0,0.0,0.0,0.419257,0.419257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419257,0.0,0.0,0.0,0.0,0.0,0.0,0.419257,0.0,0.419257,0.0,0.0,0.0,0.0


Casting a new sentence against this corpus, the sentence's similarity is evaluated as the cosine similarity between the new sentence and the existing documents.

In [30]:
response = tfidf_vectorizer.transform([new_headline])
print(response)  # CSR result, feature(s) enumerated and significance, matching entirely due to the presence of the word 'bank'

  (0, 3)	1.0


In [31]:
# Checking our matricies...

In [32]:
train_tfidf

<7x33 sparse matrix of type '<class 'numpy.float64'>'
	with 37 stored elements in Compressed Sparse Row format>

In [33]:
response

<1x33 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [34]:
similarity = linear_kernel(response, train_tfidf).flatten()
similarity

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.43120736, 0.        ])

In [35]:
# Retreive a tuple of the maximally similar result
np.argmax(similarity), max(similarity)

(5, 0.4312073587067964)

Below we rearrange this code for future implementation...

In [36]:
# Rename and reprint our corpus and reframe the `new_headline` in the form of a query
pve_corpus = pve_articles['headline'].values
nve_corpus = nve_articles['headline'].values
sample_query = "did bank employees come out wealthier from the recession"

In [37]:
class TokenLemmatizer(object):
    """
    """
    def __init__(self):
        self.lemmatizer = nltk.stem.WordNetLemmatizer()

    def __call__(self, text):
        tokens = nltk.word_tokenize(text)
        return [i for i in [self.lemmatizer.lemmatize(t) for t in tokens] if len(i) > 2]


class TfIdfer(object):
    """
    """
    def __init__():
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=TokenLemmatizer(), max_df=0.5, min_df=0.0)
        self.tfidf_corpus = None
        self.vocab = None
        
    def train(corpus):
        self.tfidf_corpus = tfidf_vectorizer.fit_transform(corpus)
        self.vocab = self.tfidf_vectorizer.vocabulary_
    
    def transform(doc):
        return self.tfidf_vectorizer.transform([doc])
    
    # def update():
        # To do...
        # self.tfidf_vectorizer = TfidfVectorizer(vocabulary=self.vocab,
        #                                         stop_words='english',
        #                                         tokenizer=TokenLemmatizer(),
        #                                         max_df=0.5, min_df=0.0)


def calculate_similarities(doc, corpus):
    """
    """
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=TokenLemmatizer(), max_df=0.5, min_df=0.0)
    tfidf_corpus = tfidf_vectorizer.fit_transform(corpus)
    
    doc_vector = tfidf_vectorizer.transform([doc])

    return linear_kernel(doc_vector, tfidf_corpus).flatten()


def retreive_results(similarities, corpus):
    """
    """
    # argmax_index = np.argmax(similarities)
    argmax_indicies = [np.where(similarities!=0)[0][:]]
    score = max(similarities)
    return {"index": argmax_indicies,
            "item": corpus[np.argmax(similarities)],
            "score": score}


def query_articles(query_statement):
    """
    """
    results = {}
    
    # processed_query_statement = query_statement  # will first have to process the query_statement, but here it's assumed that's been done
    processed_query_statement = preprocessor(query_statement)  # forgot to make this call
    print(processed_query_statement)
    
    pve_similarities = calculate_similarities(processed_query_statement, pve_articles['headline'].values)
    nve_similarities = calculate_similarities(processed_query_statement, nve_articles['headline'].values)
    
    results["positive"] = retreive_results(pve_similarities, pve_articles['headline'].values)
    results["negative"] = retreive_results(nve_similarities, nve_articles['headline'].values)
    
    return results

Randomized testing below...

In [38]:
pve_articles['headline'].sample(1)

294    three in court over melbourne cocaine bust
Name: headline, dtype: object

In [39]:
nve_articles['headline'].sample(1)

350    tasmania police seek information on two seriou...
Name: headline, dtype: object

In [40]:
query_articles("Is Baddeley confident?")

is baddeley confident


{'positive': {'index': [array([ 44, 230, 577])],
  'item': 'watson confident tigers will release him',
  'score': 0.504988628218755},
 'negative': {'index': [array([], dtype=int64)],
  'item': 'andrew michael burke guilty of joan ryther murder life in prison',
  'score': 0.0}}

In [41]:
query_articles("upper hunter winery's waste warning")



{'positive': {'index': [array([396])],
  'item': 'flash flooding in the hunter',
  'score': 0.6097046913553162},
 'negative': {'index': [array([ 89, 156, 187, 213, 223, 310, 314, 316])],
  'score': 0.29499644109645606}}