In [31]:
import re

import pandas as pd

In [8]:
import nltk
# Download the 'vader_lexicon' resource into the local nltk_data directory
# (presumably the lexicon used by the VADER analyzer imported in a later cell).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/alexanderdesouza/nltk_data...


True

In [9]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

In [10]:
# Show up to 999 columns when displaying wide frames. The fully-qualified key is used
# because option names are matched as patterns: the short form 'max.columns' is ambiguous
# (and raises OptionError) on pandas versions that also define 'styler.render.max_columns'.
pd.set_option('display.max_columns', 999)

In [16]:
# Load the headlines CSV; the path is relative to the notebook's working directory.
raw_headlines = pd.read_csv('./data/abcnews_million_headlines.csv')

In [17]:
# Replace the CSV's verbose column names with short ones (requires exactly two columns).
raw_headlines.columns = ['date', 'headline']

In [63]:
# Peek at 20 randomly drawn headlines.
# NOTE(review): no random_state is passed, so the draw differs on every run.
raw_headlines.sample(20)['headline'].values

array(['crown posts profit rise', 'interview des hasler',
       'remembering albert henry jackson and the 1929 floods',
       'conflicting claims over long bay escapee',
       'interview daniel de silva',
       'sunshine coast film hopes to be cut above the rest',
       'listen to remote communities on climate change say women',
       '10k needed to set up millicent penola menswatch',
       'cup runneth over in madrid football parade',
       'former malaysian pm sick in melbourne hospital',
       'ukraine tension is more combustible than ever un told',
       'rural qld john cox 2709', 'new dogs for biosecurity tasmania',
       'warm weather puts crops in doubt',
       'workers rally against ohs changes',
       'new labelling laws proposed to reduce binge',
       'dark nosed lions are fair game researchers say',
       'socceroos happy to settle for germany draw',
       'police probe road rage bashing',
       'australian market abandon oil stocks as global'], dtype=object)

In [19]:
sia = SIA()  # initialize an NLTK VADER sentiment intensity analyzer

In [28]:
# Score a random draw of 20 headlines: each record holds the analyzer's polarity
# scores for one headline, plus the headline text under the 'headline' key.
scored_headlines = [
    {**sia.polarity_scores(text), 'headline': text}
    for text in raw_headlines.sample(20)['headline'].values
]

In [29]:
# Tabulate the scored headlines: one row per record, one column per dict key.
pd.DataFrame(scored_headlines)

Unnamed: 0,compound,headline,neg,neu,pos
0,0.0,flying doctors future still up in the air,0.0,1.0,0.0
1,0.0,meredith hellicar quits last corporate post,0.0,1.0,0.0
2,-0.5267,s korea slams n korean threat to civilian flights,0.362,0.638,0.0
3,-0.7003,kimberley residents warned of tax phone scam,0.537,0.463,0.0
4,-0.4215,lamb price struggling,0.583,0.417,0.0
5,0.0,country hour podcast 26 march,0.0,1.0,0.0
6,0.0516,stosur reaches lucrative season ender,0.0,0.769,0.231
7,0.0516,shooters miss double trap pairs medal,0.39,0.3,0.31
8,0.4019,15m to help open many rivers office,0.0,0.69,0.31
9,0.0,sa carryover,0.0,1.0,0.0


We construct a few regular expressions for cleaning any text that is input into the system, and define a preprocessor function that lowercases the input text, collapses abbreviations into single words, removes common special characters, and replaces dashes and underscores with spaces.

This is one step up from rudimentary; the trouble with regex is you're never done.

In [157]:
# An abbreviation is a run of two or more single word-characters that all share one
# separator style: "a.d." (dot), "a. d. " (dot + whitespace) or "a d " (whitespace).
#   group 1 - the whole abbreviation, including its final separator
#   group 2 - the separator, back-referenced so styles cannot be mixed in one abbreviation
# The lookbehind requires start-of-text or whitespace before the abbreviation without
# consuming it, so an abbreviation that directly follows another one is still found.
re_abbr = re.compile(r'(?<!\S)((?:\w(\.\s|\s|\.))(?:\w\2)+)', re.UNICODE)
# The characters that may separate the letters of an abbreviation.
re_abbr_separator = re.compile(r'(\s|\.)', re.UNICODE)
# Punctuation that is deleted outright by the preprocessor.
re_specialchar_removal = re.compile(r'(!|@|#|&|\(|\)|\+|=|\{|\}|\[|\]|:|;|\"|\'|,|\?)', re.UNICODE)
# Dashes and underscores, which the preprocessor turns into spaces.
re_dash_removal = re.compile(r'-|_', re.UNICODE)


def abbreviations_to_words(text):
    """
    Converts all abbreviations found in the input string to a single word format.

    Every abbreviation matched by ``re_abbr`` (e.g., "f.e.m.a.", "a. d." or "a d") has
    its separators removed (giving "fema", "ad", "ad"). Only the matched spans are
    rewritten, and whitespace that terminated an abbreviation is kept, so the
    abbreviation is never glued to the word that follows it.

    :param text: the string to scan; it is not modified.
    :return: a new string of the same text with abbreviations collapsed. No padding
        is added: the result has the same trailing whitespace as the input.
    """
    def _collapse(match):
        # Strip the separators, but re-emit a trailing whitespace character if the
        # abbreviation's final separator contained one (e.g., "a. d. " -> "ad ").
        abbreviation = match.group(1)
        collapsed = re_abbr_separator.sub('', abbreviation)
        trailing = abbreviation[-1] if abbreviation[-1].isspace() else ''
        return collapsed + trailing

    # A whitespace-separated abbreviation at the very end of the text only matches if a
    # separator follows its last letter, so pad with one sentinel space. _collapse always
    # preserves trailing whitespace, hence the sentinel is still the final character.
    padded = text + " "
    return re_abbr.sub(_collapse, padded)[:-1]


def preprocessor(text):
    """
    Normalises raw input text into a cleaned, lowercase form.

    The steps are applied in this order:
        1. the text is lowercased
        2. abbreviations are collapsed to one word (e.g., a.d., a. d., a d all become ad)
        3. general special characters (e.g., an '!' or an '&' symbol) are deleted
        4. every dash or underscore is replaced by a space
        5. leading and trailing whitespace is stripped
    """
    collapsed = abbreviations_to_words(text.lower())
    without_specials = re_specialchar_removal.sub('', collapsed)
    return re_dash_removal.sub(' ', without_specials).strip()

I Googled around for some headlines from CBC.ca/news, NPR.org, Bloomberg.com, and the MIT Technology Review (https://www.technologyreview.com/) to test the functionality of this preprocessing; the test inputs and the resulting outputs are shown below.

In [164]:
# Hand-picked real-world headlines covering quotes, dotted abbreviations,
# currency amounts, percentages and hyphenated words.
headline_tests = [
    "'Storm of a lifetime': 1.7 million ordered to flee approaching fury of Florence",
    "Trump Administration Transferred $9.8-Million From F.E.M.A. To I.C.E.",
    "A $100 Million Haircut for the Buyout Crowd",
    "Crypto Plunges 80%! Now Worse Than the Dot-Com Crash!",
    "How Bank Workers Emerged From the Crash $12.5 Billion Richer",
    "H.N.A.'s Debt Declines for First Time, Shrinking by $8.3-Billion",
]

In [165]:
# Run every sample headline through the preprocessor. The list defined in this notebook
# is named `headline_tests`; the previous reference to `hl_tests` only worked with a
# stale kernel variable and raises NameError on a fresh Restart & Run All.
hl_processed_tests = [preprocessor(hl) for hl in headline_tests]

hl_processed_tests

['storm of a lifetime 1.7 million ordered to flee approaching fury of florence',
 'trump administration transferred $9.8 million from fema to ice',
 'a $100 million haircut for the buyout crowd',
 'crypto plunges 80% now worse than the dot com crash',
 'how bank workers emerged from the crash $12.5 billion richer',
 'hnas debt declines for first time shrinking by $8.3 billion']

Looks good enough! :)

In [None]:
# To Do:
#    - TF-IDF the input headline against the corpus of existing headlines
#    - return the top-n matches
#    - from the top-n matches, return the ones with the most positive and the most
#      negative sentiment (SIA) scores