In [None]:
import re

import numpy as np
import pandas as pd

In [None]:
import nltk

In [None]:
# import nltk.corpus
# from nltk.tokenize import TreebankWordTokenizer
# import nltk.stem.snowball
# from nltk.corpus import wordnet
# import string

In [None]:
nltk.download('vader_lexicon')

In [None]:
nltk.download('punkt')

In [None]:
# nltk.download('averaged_perceptron_tagger')

In [None]:
nltk.download('wordnet')  # WordNet data is required by the WordNetLemmatizer used further down

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

NLTK's sentiment intensity analyzer implements the rule-based VADER model described in: _Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014._

In [None]:
# Use the full option key: the bare pattern 'max.columns' is matched as a regex and
# can resolve to more than one option (raising OptionError) in newer pandas releases.
pd.set_option('display.max_columns', 999)

In [None]:
raw_headlines = pd.read_csv('../data/abcnews_million_headlines.csv')

In [None]:
raw_headlines.shape

In [None]:
raw_headlines.columns = ['date', 'headline']  # rename columns 'cause the others were verbose

In [None]:
raw_headlines.sample(20)['headline'].values

In [None]:
sia = SIA()  # initialize an NLTK sentiment intensity analyzer (VADER)

In [None]:
# Score a fresh random sample of 20 headlines; each record holds the
# analyzer's polarity scores plus the headline text itself.
scored_headlines = [
    dict(sia.polarity_scores(text), headline=text)
    for text in raw_headlines.sample(20)['headline'].values
]

In [None]:
pd.DataFrame(scored_headlines)

We construct some regular expressions to clean any text that is input into the system, and define a preprocessor method that lowercases the input text, "cleans" abbreviations, removes general special characters, and replaces dashes and underscores with spaces.

This is one step up from rudimentary; the trouble with regex is you're never done.

In [None]:
# Abbreviation: single characters joined by one consistent separator ('. ', ' ' or '.'),
# e.g. "a.d.", "a. d. " or "a d ".  The separator is captured as group 2 and must repeat
# after every character; group 1 is the whole abbreviation including its final separator.
re_abbr = re.compile(r'(?:^|\s)((?:\w(\.\s|\s|\.))(?:\w\2)+)', re.UNICODE)
re_abbr_separator = re.compile(r'(\s|\.)', re.UNICODE)
# Stand-alone number token ("12", "12.5", "12." or ".5") bounded by whitespace or by the
# string edges.  At least one digit is required so that a plain run of whitespace or a
# lone "." is not mistaken for a number.
re_numword = re.compile(r'(?:^|\s)(?:\d+(?:\.\d*)?|\.\d+)(?:\s|$)', re.UNICODE)
re_specialchar_removal = re.compile(r'(!|@|#|&|\(|\)|\+|=|\{|\}|\[|\]|:|;|\"|\'|,|\.$|\?)', re.UNICODE)
re_specialchar_numsymbremoval = re.compile(r'(\$|%)', re.UNICODE)
re_dash_removal = re.compile(r'-|_', re.UNICODE)


def abbreviations_to_words(text):
    """
    Converts all abbreviations found in the input string to a single word format.

    Every span matched by `re_abbr` has its separators (dots and whitespace) removed,
    e.g. "f.e.m.a." -> "fema" and "u. s. " -> "us ".  Only the matched spans are
    rewritten, and the whitespace that follows a space-separated abbreviation is kept
    so the abbreviation is not glued onto the next word.

    Returns the converted text with leading and trailing whitespace stripped.
    """
    def _collapse(match):
        abbreviation = match.group(1)
        # group(0) is an optional leading whitespace character followed by the abbreviation
        leading = match.group(0)[:-len(abbreviation)]
        collapsed = re_abbr_separator.sub('', abbreviation)
        # When the separator contains whitespace, the last separator doubles as the gap
        # between the abbreviation and the following word: put it back.
        trailing = abbreviation[-1] if abbreviation[-1].isspace() else ''
        return leading + collapsed + trailing

    # The appended space lets a space-separated abbreviation at the very end of the text
    # match (its last character must be followed by the separator as well).
    return re_abbr.sub(_collapse, text + ' ').strip()


def preprocessor(text):
    """
    Applies the following preprocessing steps to any input text:
        - lowercases all text
        - maps abbreviations to same format (e.g., A.D., A. D., A D to AD)
        - removes general special characters (e.g., an '!' or an '&' symbol)
        - splits words that contain dashes or underscores
        - removes '$' and '%' symbols
        - replaces stand-alone numbers with the placeholder token 'numword'
        - strips leading and trailing whitespace
    """
    text = text.lower()
    text = abbreviations_to_words(text)
    text = re_specialchar_removal.sub('', text)
    text = re_dash_removal.sub(' ', text)
    text = re_specialchar_numsymbremoval.sub('', text) # depending on intent, this should be optional
    text = re_numword.sub(' numword ', text)           # and this one
    return text.strip()

I Googled around for some headlines from CBC.ca/news, NPR.org, Bloomberg.com, and the MIT Technology Review (https://www.technologyreview.com/) to test the functionality of this preprocessing; the resulting examples below.

In [None]:
headline_tests = ["Hi! My name is Alexander.",  # not actually a news item :P
                  "\'Storm of a lifetime\': 1.7 million ordered to flee approaching fury of Florence",
                  "Trump Administration Transferred $9.8-Million From F.E.M.A. To I.C.E.",
                  "A $100 Million Haircut for the Buyout Crowd",
                  "Crypto Plunges 80%! Now Worse Than the Dot-Com Crash!",
                  "How Bank Workers Emerged From the Crash $12.5 Billion Richer",
                  "H.N.A.'s Debt Declines for First Time, Shrinking by $8.3-Billion"]

In [None]:
# Run every sample headline through the preprocessor, keeping the original order
headline_processed_tests = [preprocessor(raw_text) for raw_text in headline_tests]

headline_processed_tests

Looks good enough!

Introduce a new headline that is roughly equivalent to one of the above; this will be used for comparison in evaluating the efficacy of what follows.

In [None]:
new_headline = preprocessor("Big Bank Employees Came Out Even Wealthier from the Great Recession!")
new_headline

In order to match different, but analogous, sentences the body text should be simplified. One way to accomplish this is to remove stopwords, stem, and lemmatize all the sentences to be matched. Here, a tokenizer and lemmatizer are applied to the text. Text objects can then be compared by calculating and evaluating the cosine distance between the resultant document vectors.

In [None]:
# This initial attempt utilized Porter Stemming instead of Lemmatization
def tokenize_and_stem(text):
    """
    Tokenize `text` into words and reduce each token to its Porter stem.

    Stems of two characters or fewer are dropped; the remaining stems are
    returned as a list in token order.
    """
    porter = nltk.stem.porter.PorterStemmer()  # rule-based word reduction
    stems = []
    for token in nltk.word_tokenize(text):
        stem = porter.stem(token)
        if len(stem) > 2:
            stems.append(stem)
    return stems

In [None]:
# The above method is recomposed as a class that can be passed directly to the TF-IDF vectorizer that follows;
# lemmatization is selected and applied in place of the Porter stemming
class TokenLemmatizer(object):
    """
    Callable tokenizer: splits a text into word tokens and lemmatizes each one.

    Instances can be passed as the `tokenizer` argument of a TF-IDF vectorizer.
    """
    def __init__(self):
        self.lemmatizer = nltk.stem.WordNetLemmatizer()

    def __call__(self, text):
        """Return the lemmas of `text` that are longer than two characters."""
        lemmas = map(self.lemmatizer.lemmatize, nltk.word_tokenize(text))
        return [lemma for lemma in lemmas if len(lemma) > 2]

In [None]:
# 
tokenize_and_stem(new_headline)

TF-IDF is used for vectorization.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

Varying `ngram_range` between one and three significantly decreased performance. This is likely because we're specifically analyzing analogous sentences, where not only are the words of the two documents in a different order, but synonyms may be in use as well.

The choice of `max_df` and `min_df` below was made intuitively, over several manual iterations optimizing for performance (the defaults are `max_df=1.0` and `min_df=1`).

In [None]:
# The tokenizer is the TokenLemmatizer class defined above
tfidf_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=TokenLemmatizer(), max_df=0.5, min_df=0.01)  # hence, no n-grams here
train_tfidf = tfidf_vectorizer.fit_transform(headline_processed_tests)

In [None]:
# And let's take a look at what the train_tfidf matrix looks like, as a dataframe.
# get_feature_names() was removed in scikit-learn 1.2, so prefer get_feature_names_out()
# and fall back to the old method only on versions that do not have it.
feature_names = (tfidf_vectorizer.get_feature_names_out()
                 if hasattr(tfidf_vectorizer, 'get_feature_names_out')
                 else tfidf_vectorizer.get_feature_names())
pd.DataFrame(train_tfidf.toarray(), columns=feature_names)

Casting a new sentence against this corpus, the sentence's similarity is evaluated as the cosine similarity between the new sentence and the existing documents.

In [None]:
response = tfidf_vectorizer.transform([new_headline])
print(response)  # CSR result, feature(s) enumerated and significance

In [None]:
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# Checking our matrices...

In [None]:
train_tfidf

In [None]:
response

In [None]:
similarity = linear_kernel(response, train_tfidf).flatten()
similarity

In [None]:
# Retrieve a tuple of the maximally similar result
np.argmax(similarity), max(similarity)

Below we rearrange this code for future implementation...

In [None]:
# Rename and reprint our corpus and reframe the `new_headline` in the form of a query
corpus = headline_processed_tests
sample_query = "did bank employees come out even wealthier from the recession"

In [None]:
class TokenLemmatizer(object):
    """
    Tokenizer callable that word-tokenizes a text and lemmatizes every token.
    """
    def __init__(self):
        self.lemmatizer = nltk.stem.WordNetLemmatizer()

    def __call__(self, text):
        """Return the lemmatized tokens of `text`, keeping only lemmas longer than two characters."""
        kept = []
        for token in nltk.word_tokenize(text):
            lemma = self.lemmatizer.lemmatize(token)
            if len(lemma) > 2:
                kept.append(lemma)
        return kept


class TfIdfer(object):
    """
    Thin wrapper around a TfidfVectorizer that keeps the fitted corpus matrix and vocabulary.

    Attributes:
        tfidf_vectorizer: the underlying TfidfVectorizer (English stop words,
            TokenLemmatizer tokenizer, max_df=0.5, min_df=0.01).
        tfidf_corpus: result of fitting the vectorizer on the training corpus;
            None until `train` is called.
        vocab: the fitted vectorizer's `vocabulary_`; None until `train` is called.
    """
    def __init__(self):
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=TokenLemmatizer(), max_df=0.5, min_df=0.01)
        self.tfidf_corpus = None
        self.vocab = None

    def train(self, corpus):
        """Fit the vectorizer on `corpus` and store the resulting matrix and vocabulary."""
        # Fit the instance's own vectorizer, not a module-level one
        self.tfidf_corpus = self.tfidf_vectorizer.fit_transform(corpus)
        self.vocab = self.tfidf_vectorizer.vocabulary_

    def transform(self, doc):
        """Vectorize a single document `doc` with the fitted vectorizer."""
        return self.tfidf_vectorizer.transform([doc])

    # def update(self):
        # To do...
        # self.tfidf_vectorizer = TfidfVectorizer(vocabulary=self.vocab,
        #                                         stop_words='english',
        #                                         tokenizer=TokenLemmatizer(),
        #                                         max_df=0.5, min_df=0.01)


def calculate_similarities(query_statement):
    """
    Score `query_statement` against every document in the module-level `corpus`.

    A TF-IDF vectorizer is fit on `corpus` on every call, the query is vectorized
    with that same vectorizer, and the linear kernel (dot product) between the
    query vector and the corpus matrix is returned flattened.
    """
    # Required imports, collected here for reference:
    # from sklearn.feature_extraction.text import TfidfVectorizer
    # from sklearn.metrics.pairwise import linear_kernel  # vectorized dot product

    vectorizer = TfidfVectorizer(
        stop_words='english',
        tokenizer=TokenLemmatizer(),
        max_df=0.5,
        min_df=0.01,
    )  # no n-grams here
    corpus_matrix = vectorizer.fit_transform(corpus)
    query_matrix = vectorizer.transform([query_statement])
    return linear_kernel(query_matrix, corpus_matrix).flatten()


def retreive_results(similarities):
    """
    Pick the best-matching corpus entry for a sequence of similarity scores.

    Returns a tuple of (index of the highest score, the entry of the
    module-level `corpus` at that index, the highest score).
    """
    best_index = np.argmax(similarities)
    best_score = max(similarities)
    return best_index, corpus[best_index], best_score


def query_articles(query_statement):
    """
    Find the corpus entry most similar to `query_statement`.

    The query is used exactly as given; it is assumed to have been preprocessed
    already. Returns whatever `retreive_results` returns for the similarities
    computed by `calculate_similarities`.
    """
    # NOTE: no preprocessing happens here yet; callers pass an already-processed query
    return retreive_results(calculate_similarities(query_statement))

In [None]:
query_articles("how bank workers emerged from the crash numword billion richer")  # on the original doc itself

In [None]:
query_articles(sample_query)  # on the sample headline query

In [None]:
# Perhaps split positive and negative sentiments...