# Data Pipeline
Initial data analysis pipeline including per-sentence sentiment analysis using a trained ```flair``` text classifier.

In [1]:
import re
import spacy
import pandas as pd
import numpy as np
from pathlib import Path
from string import punctuation

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import calmap # for making GitHub-style calendar plots of time-series
# Plot using pandas datetime objects
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
# Apply the style sheet first: a style sheet overwrites every rcParam it
# defines, so calling it after the update below would discard any of our
# font sizes that the 'ggplot' sheet also sets (it defines axes.labelsize).
plt.style.use('ggplot')
# Larger axis-label and tick-label fonts for all figures in this notebook
rc_fonts = {'axes.labelsize': 20,
            'xtick.labelsize': 16,
            'ytick.labelsize': 16}
plt.rcParams.update(rc_fonts)

In [4]:
# Scikit-learn for TF-IDF and similarity detection
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import MDS

## Use ```spaCy``` for tokenization and sentence segmentation

In [5]:
import spacy
from spacy import displacy
# Load spaCy language model (blank model to which we add pipeline components).
# Only the rule-based 'sentencizer' component is added, so this pipeline is
# used for tokenization and sentence splitting.
# NOTE(review): create_pipe(...) passed to add_pipe(...) looks like the
# spaCy 2.x calling convention -- confirm against the installed version.
sentencizer = spacy.blank('en')
sentencizer.add_pipe(sentencizer.create_pipe('sentencizer'))

## Use ```flair``` NLP library for sentiment prediction

In [6]:
from flair.models import TextClassifier
from flair.data import Sentence

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


#### Specify named entity of interest

In [7]:
# Full name of the entity to track; later cells match it as a case-sensitive
# substring of article text and split it on whitespace into name parts.
name = "Ryan Lochte"

#### Write data: Boolean
Specify if we want to write the output data to csv or not.

In [8]:
# When True, the result CSVs and the calendar-map PNG are written to disk
write_ = True

In [None]:
# Location of the news dataset, relative to this notebook
datafile = 'all_the_news_v2.csv'
datapath = Path('../') / 'data' / datafile 
# Only these columns are read from the CSV
colnames = ['title', 'author', 'date', 'content', 'year', 'month', 'publication', 'length']

# 'date' is parsed into datetime values at load time
news = pd.read_csv(datapath, usecols=colnames, parse_dates=['date'])
# Remove leading/trailing whitespace from author names
news['author'] = news['author'].str.strip()
news.head()

Unnamed: 0,title,author,date,content,year,month,publication,length
0,Agent Cooper in Twin Peaks is the audience: on...,Tasha Robinson,2017-05-31,And never more so than in Showtime’s new...,2017.0,5.0,Verge,2376
1,"AI, the humanity!",Sam Byford,2017-05-30,AlphaGo’s victory isn’t a defeat for hum...,2017.0,5.0,Verge,2125
2,The Viral Machine,Kaitlyn Tiffany,2017-05-25,Super Deluxe built a weird internet empi...,2017.0,5.0,Verge,3310
3,How Anker is beating Apple and Samsung at thei...,Nick Statt,2017-05-22,Steven Yang quit his job at Google in th...,2017.0,5.0,Verge,3632
4,Tour Black Panther’s reimagined homeland with ...,Kwame Opam,2017-05-15,Ahead of Black Panther’s 2018 theatrical...,2017.0,5.0,Verge,262


In [None]:
# Discard rows that have no date or no title, then show the remaining row count.
# Note: rows with missing 'content' are NOT dropped here.
news = news.dropna(subset=['date', 'title'])
news.shape[0]

143156

In [None]:
# Summary of the publication dates (count, most frequent date, first/last)
news['date'].describe()

count                  143156
unique                   1480
top       2017-01-13 00:00:00
freq                      415
first     2000-05-15 00:00:00
last      2017-07-05 00:00:00
Name: date, dtype: object

### Filter articles based on name match
In this section we select only those news articles whose content contains the full name we input as ```name``` (a case-sensitive match).

In [None]:
def check_name(content, name):
    """Return True when `name` occurs verbatim in `content`.

    Args:
        content: Article text. Non-string values (for example the float NaN
            that pandas uses for a missing cell) count as "no match" instead
            of raising a TypeError.
        name: Exact, case-sensitive substring to look for.

    Returns:
        bool: True if `content` is a string that contains `name`.
    """
    return isinstance(content, str) and name in content

def filter_df(df, target=None):
    """Return the rows of `df` whose 'content' mentions the target name.

    The input frame is left untouched (no helper column is added to it) and
    an independent copy is returned, so columns can be assigned on the result
    without pandas chained-assignment warnings.

    Args:
        df: DataFrame with a 'content' column of article text.
        target: Name to search for. Defaults to the notebook-level `name`.

    Returns:
        DataFrame: copy of the matching rows, with the same columns as `df`.
    """
    if target is None:
        target = name
    # astype(bool) keeps the mask usable for indexing even when `df` is empty
    mask = df['content'].apply(lambda text: check_name(text, target)).astype(bool)
    return df.loc[mask].copy()

# Keep only articles that mention the target name and report how many remain
news_relevant = filter_df(news)
print(news_relevant.shape[0])
news_relevant.head()

182


Unnamed: 0,title,author,date,content,year,month,publication,length
3095,"After Olympic-sized goof, Ryan Lochte begins a...",David Wharton,2017-04-27,Don’t cringe or shake your head or stop readin...,2017.0,4.0,Los Angeles Times,1925
8946,Michael Phelps is a touch off in 100 butterfly...,Everett Cook,2014-08-08,Michael Phelps\' bid for his most important wi...,2014.0,8.0,Los Angeles Times,996
13789,Michael Phelps Powers U.S. to Victory in 4x100...,"Victor Mather, Karen Crouse and Doug Mills",2016-08-10,RIO DE JANEIRO — Michael Phelps won his 19th O...,2016.0,8.0,New York Times,276
13825,Rio Olympics: Simone Manuel Makes History in t...,Karen Crouse,2016-08-16,RIO DE JANEIRO — Simone Manuel managed to make...,2016.0,8.0,New York Times,1468
13839,"Rio Olympics: A Phelps Upset, a Judo Snub, and...",Sam Manchester and Victor Mather,2016-08-16,Katie Ledecky did what Katie Ledecky does best...,2016.0,8.0,New York Times,1614


### Perform sentence segmentation
Store the relevant sentences of each news article as a list, from which we can easily extract per-sentence sentiment.

In [None]:
def get_relevant(text, name):
    """Collect the sentences of `text` that mention any part of `name`.

    The text is split into sentences with the notebook-level `sentencizer`.
    A sentence is kept when at least one whitespace-separated part of `name`
    occurs in it (case-sensitive substring test). Kept sentences have
    newlines and non-breaking spaces replaced by spaces and stray brackets,
    quotes and commas stripped from both ends.

    Returns:
        list[str]: cleaned sentences in order of first appearance, without
        duplicates.
    """
    name_parts = name.split()
    # A dict is used as an insertion-ordered set so duplicates are dropped
    seen = {}
    for sentence in sentencizer(text).sents:
        if not any(part in sentence.text for part in name_parts):
            continue
        cleaned = sentence.text.replace("\n", " ").replace("\xa0", " ")
        # Strip bad characters at the start and end of sentences
        cleaned = cleaned.strip("[\'").strip("\']").strip('\"').strip("\'\"")
        cleaned = cleaned.strip(",\'").strip("\',").strip('\"').strip("\'\"").strip()
        seen.setdefault(cleaned, None)
    return list(seen)

In [None]:
# One list of name-mentioning sentences per article
news_relevant['relevant'] = news_relevant['content'].apply(lambda x: get_relevant(x, name))

In [None]:
# Eyeball the extracted sentence lists of the first five articles
for i in news_relevant['relevant'][:5]:
    print(i, '\n--')

["That’s all Ryan Lochte wants as he ambles across the pool deck on a bright Southern California day, looking tanned and relaxed, if a bit weary from his morning workout.', '“", 'Lochte now finds himself living in Los Angeles with his pregnant fiancée and training at USC with thoughts of a comeback.', "Here’s the condensed version:', 'After a fifth-place finish in the 200-meter individual medley — his dissatisfaction hardly assuaged by a gold in the 800 freestyle relay — Lochte partied with three teammates at the French team’s hospitality house.',", "Lochte initially told authorities they were pulled over and robbed by armed men posing as police officers.',", "Lochte suffered an additional blow as Speedo and other corporate sponsors walked away.',", 'Ryan Lochte didn’t have to get drunk and vandalize a Rio de Janeiro gas station during the Summer Olympics in August.', "But he did, and it’s at times like this when people really need their...', 'Ryan Lochte didn’t have to get drunk and v

### Sentiment scoring using ```flair```

In [None]:
# Path to trained flair PyTorch model (relative to this notebook)
model_path = '../modules/classification/flair/models/elmo_md/final-model.pt'

In [None]:
# Load model; `classifier` is used as a notebook-level global by get_score_flair
classifier = TextClassifier.load_from_file(model_path)

2019-04-07 06:36:31,076 loading file ../modules/classification/flair/models/elmo_md/final-model.pt


#### Preprocess text and tokenize as per FastText requirements

In [None]:
# reviews = [
#     "This restaurant literally changed my life. This is the best food I've ever eaten!",
#     "I do not like this place at all. They were very rude.",
#     "I don't know. It was ok, I guess. Not really sure what to say.",
# ]

In [None]:
def get_score_flair(text_list):
    """Score a list of sentences with the notebook-level flair `classifier`.

    Each sentence is classified on its own; the first predicted label is read
    as an integer class `s` and mapped to `(s - 3) / 2` (a class of 0 maps to
    0). NOTE(review): presumably the classes are 1..5, which would put the
    mapped values in [-1, 1] -- confirm against the trained model.

    Args:
        text_list: iterable of sentence strings.

    Returns:
        tuple: (mean, standard deviation) of the mapped per-sentence values.
    """
    def _predicted_class(text):
        # Run the classifier on one sentence and read its top label as an int
        sentence = Sentence(text)
        classifier.predict(sentence)
        return int(sentence.labels[0].value)

    sentiment_list = []
    for text in text_list:
        label = _predicted_class(text)
        sentiment_list.append((label - 3) / 2 if label else 0)
    return np.mean(sentiment_list), np.std(sentiment_list)

In [None]:
# get_score_flair returns a (score, deviation) pair per article; zip(*) turns
# the sequence of pairs into one sequence per output column.
news_relevant['score'], news_relevant['deviation'] = zip(*news_relevant['relevant'].map(get_score_flair))
news_relevant.head(5)

### Lemmatize relevant sentences for comparison
This is to remove duplicates.

In [None]:
# Lower-cased parts of the target name, to be treated as extra stop words
add_removed_words = set(map(str.lower, name.split()))
# spaCy's default English stop words plus the name parts
stopwords = sentencizer.Defaults.stop_words.union(add_removed_words)

In [None]:
# Tokenize and lemmatize text
def lemmatize(text):
    """Tokenize `text` and return lower-cased lemmas without stop words.

    Tokens are dropped when their lower-cased text is in the notebook-level
    `stopwords` set or when their text occurs in `string.punctuation`. The
    stop-word test is done on the lower-cased token because `stopwords`
    holds lower-case entries (including the lower-cased parts of the target
    name); comparing the raw token text would let "Ryan" or a
    sentence-initial "The" slip through.

    Args:
        text: string to process with the notebook-level `sentencizer`.

    Returns:
        list[str]: lower-cased lemmas of the remaining tokens, in order.
    """
    doc = sentencizer(text)
    return [
        str(tok.lemma_).lower()
        for tok in doc
        if tok.text.lower() not in stopwords and tok.text not in punctuation
    ]

In [None]:
# Join each article's relevant sentences into one string, lemmatize it, and
# store the lemmas as a single space-separated string per article.
news_relevant['lemmas'] = news_relevant['relevant'].str.join(' ').apply(lemmatize).str.join(' ')
news_relevant[['relevant', 'lemmas']].head()

### Drop duplicates

In [None]:
# Articles whose lemmatized relevant text is identical count as duplicates;
# the first occurrence is kept. The last line shows the remaining row count.
news_relevant = news_relevant.drop_duplicates(subset=['lemmas'])
news_relevant.shape[0]

#### Positive sentiment group

In [None]:
# Articles with a score above zero, highest score first, renumbered from 0
pos = (
    news_relevant.loc[news_relevant['score'].gt(0.0)]
    .sort_values(by=['score'], ascending=False)
    .reset_index(drop=True)
)
print("Found {} overall positive articles for {}".format(len(pos), name))
pos.head(3)

#### Write positive results

In [None]:
if write_:
    # e.g. "Ryan Lochte" -> "ryan_lochte_pos.csv"
    out_filename = '_'.join(name.split()).lower() + '_pos.csv'
    out_path = Path('./') / "results/flair" / out_filename
    # to_csv does not create missing directories, so make sure the target exists
    out_path.parent.mkdir(parents=True, exist_ok=True)
    pos.sort_values(by='publication')[['publication', 'title', 'date', 'relevant', 'score', 'deviation']] \
                    .to_csv(out_path, index=False, header=True)

#### Negative sentiment group

In [None]:
# Articles with a score below zero, lowest score first, renumbered from 0
neg = (
    news_relevant.loc[news_relevant['score'].lt(0.0)]
    .sort_values(by=['score'])
    .reset_index(drop=True)
)
print("Found {} overall negative articles for {}".format(len(neg), name))
neg.head(3)

#### Write negative results

In [None]:
if write_:
    # e.g. "Ryan Lochte" -> "ryan_lochte_neg.csv"
    out_filename = '_'.join(name.split()).lower() + '_neg.csv'
    out_path = Path('./') / "results/flair" / out_filename
    # to_csv does not create missing directories, so make sure the target exists
    out_path.parent.mkdir(parents=True, exist_ok=True)
    neg.sort_values(by='publication')[['publication', 'title', 'date', 'relevant', 'score', 'deviation']] \
                    .to_csv(out_path, index=False, header=True)

In [None]:
# Articles whose score is exactly zero, renumbered from 0
mixed = news_relevant.loc[news_relevant['score'].eq(0.0)].reset_index(drop=True)
print("Found {} overall mixed articles for {}".format(len(mixed), name))
mixed.head(3)

### Highlight relevant named entities using ```spaCy```

In [None]:
from IPython.display import Markdown, display
# displaCy rendering options: which entity labels to highlight and the
# highlight colour per label ('EVENT' is listed without a custom colour).
options = {'ents': ['PERSON', 'ORG', 'GPE', 'EVENT'], 
           'colors': {'PERSON': '#9fafe5', 'ORG': '#d59b9b', 'GPE':'#81cba6'}}
def printmd(string):
    """Render `string` as Markdown in the notebook output."""
    rendered = Markdown(string)
    display(rendered)
    
def display_entities(nlp, df, max_entries=5):
    """Show named entities for the first `max_entries` articles of `df`.

    For each article the title is printed in bold and the article's relevant
    sentences (joined into one string) are rendered with displaCy, using the
    notebook-level `options` to choose which entity labels are highlighted.

    Titles and texts are paired by position, so the function also works on
    frames whose index is not 0..n-1 (looking the title up by the loop
    counter would use it as an index label).

    Args:
        nlp: loaded spaCy pipeline used to parse the text.
        df: DataFrame with 'title' and 'relevant' (list of sentences) columns.
        max_entries: maximum number of articles to display.
    """
    texts = df['relevant'].str.join(' ').iloc[:max_entries]
    titles = df['title'].iloc[:max_entries]
    for title, text in zip(titles, texts):
        doc = nlp(text)
        printmd('**{}**'.format(title))
        displacy.render(doc, style='ent', jupyter=True, options=options)
        print('\n')
        
def vis(pos, neg, mixed, spacy_lang='en_core_web_md'):
    """Render entity highlights for the positive, negative and mixed groups.

    Loads the spaCy model named by `spacy_lang`, then prints a coloured
    Markdown heading followed by the entity display for each group in turn.

    Args:
        pos, neg, mixed: DataFrames accepted by `display_entities`.
        spacy_lang: name of the spaCy model to load.
    """
    nlp = spacy.load(spacy_lang)
    sections = (
        ('<font color=green>**Positive**</font>', pos),
        ('<font color=red>**Negative**</font>', neg),
        ('<font color=yellow>**Mixed**</font>', mixed),
    )
    for heading, frame in sections:
        printmd(heading)
        display_entities(nlp, frame)

In [None]:
# vis(pos, neg, mixed, spacy_lang='en_core_web_md')

## Visualization

### Plot sentiment score and magnitude versus time of publishing of the article
In this section, sentiment "score" is the mean of the per-sentence sentiment values (positive or negative) predicted by the ```flair``` classifier for an article. Sentiment "magnitude" (the ```deviation``` column) is the standard deviation of those per-sentence values. Both are then averaged over all articles published on the same day.

In [None]:
# Per-day mean of the article scores and of the article deviations
news_avg_score, news_avg_dev = (
    news_relevant.groupby('date')[column].mean()
    for column in ('score', 'deviation')
)

#### Get article count per day

In [None]:
# Number of articles (non-null titles) published on each date
news_count = news_relevant.groupby(['date'])['title'].count()

#### Get peak polar article per day (min negative or max positive score)

In [None]:
# Absolute score: how strongly polar an article is, regardless of sign
news_relevant['abs'] = news_relevant['score'].abs()
news_relevant[['date', 'score', 'abs']].head(3)

In [None]:
# Pick, for every date, the single article with the largest absolute score.
# (A plain groupby('date').max() takes the maximum of each column separately,
# so the title, publication and sentences could come from different articles
# and the score would play no part in the choice.)
# Missing scores are ranked lowest so every date still yields a row.
polarity = news_relevant['score'].abs().fillna(0.0)
# idxmax returns the index label of the most polar article per date; this
# relies on news_relevant having unique index labels.
peak_rows = polarity.groupby(news_relevant['date']).idxmax()
news_peak_polar = (
    news_relevant.loc[peak_rows, ['date', 'title', 'publication', 'relevant']]
    .set_index('date')
)
# Extract just the first 3 relevant sentences from the article and convert to single string
news_peak_polar['relevant'] = news_peak_polar['relevant'].apply(lambda x: x[:3]).str.join(' ')
print(news_peak_polar.shape[0])

#### Combine scores, magnitudes and article counts per day

In [None]:
# Align the three per-date series side by side (one row per date), ordered by date
scores = pd.concat((news_avg_score, news_avg_dev, news_count), axis=1).sort_values(by=['date'])
scores.columns = ['mean_score', 'mean_dev', 'count']
scores.head()

#### Concatenate scores/counts DataFrame with most polar news content for that day

In [None]:
# Both frames are indexed by date, so concatenating along columns joins them per day
data = pd.concat((news_peak_polar, scores), axis=1).sort_index()
data.head()

#### Reindex data to show daily scores
Since we have really sparse data (news articles about the target are not written every day), we reindex the time series to a daily frequency and fill missing values with zeros.

In [None]:
# Daily calendar for the plotted period; dates in `data` outside this
# hard-coded range are not carried over into `daily`.
idx = pd.date_range('1/1/2014', '7/5/2017')
# Days without articles get 0.0 in EVERY column, including the text columns
# ('title', 'publication', 'relevant').
daily = data.reindex(idx, fill_value=0.0)

In [None]:
%matplotlib inline

In [None]:
# Two stacked panels: daily mean score (top) and daily mean deviation (bottom)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 8))
ax1.fill_between(daily.index, daily['mean_score'], step='mid', color='black', alpha=0.6, linewidth=4);
ax1.set_ylabel('Mean Score');
ax1.set_title('Sentiment scores and deviations with time for "{}"'.format(name), size=15);
ax2.fill_between(daily.index, daily['mean_dev'], step='mid', color='black', alpha=0.6, linewidth=4);
ax2.set_ylabel('Mean Deviation');
ax2.set_xlabel('Date');
# Initiate a second y-axis with a shared x-axis for the article counts
ax2_2 = ax2.twinx();
ax2_2.plot(daily.index, daily['count'], 'r--', alpha=0.6, linewidth=2);
# Hide the secondary axis' grid
ax2_2.grid(False);
ax2_2.set_ylabel('Article Count');
plt.tight_layout()
# plt.savefig("{}_scores".format('_'.join(name.split()).lower()))

#### Make calendar plot to show periods of activity

In [None]:
# Calendar heat map of the daily mean score; the colour scale is pinned to
# [-1, 1] with the reversed 'coolwarm' colormap.
fig, axes = calmap.calendarplot(data['mean_score'],
                    vmin = -1.0,
                    vmax=1.0,
                    daylabels='MTWTFSS',
                    dayticks=[0, 2, 4, 6],
                    fig_kws=dict(figsize=(12.5, 9)),
                    linewidth=1,
                    fillcolor='lightgrey',
                    cmap='coolwarm_r',
                   );
fig.suptitle("Calendar map of aggregated sentiment for {}".format(name), fontsize=18);
# Leave room at the top of the figure for the suptitle
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

if write_:
    # Saved next to the notebook, e.g. "calmap_ryan_lochte.png"
    out_filename = '_'.join(name.split()).lower()
    plt.savefig('calmap_{}.png'.format(out_filename))

#### Get counts of positive and negative mentions based on Publication

In [None]:
# Count articles per publication, split by whether the score is >= 0.
# NOTE(review): a score of exactly 0 (the "mixed" group) is counted as
# Positive here -- confirm that this is intended.
grouped = (
    news_relevant
    .groupby(['publication', news_relevant['score'].ge(0.0)])
    .size()
    .unstack(fill_value=0)
    # Guarantee both classes exist, in a fixed order, even when every article
    # falls on one side; otherwise the column rename below would fail.
    .reindex(columns=[False, True], fill_value=0)
)
grouped.columns = ['Negative', 'Positive']
grouped = grouped.sort_values(by='Negative')
grouped

#### Plot article breakdown

In [None]:
# Horizontal bars: one pair (Negative / Positive) per publication
grouped.plot(kind='barh', figsize=(12, 8));
plt.title('Count of number of articles with Positive/Negative Sentiment for {}'.format(name));
# plt.savefig("{}_breakdown".format('_'.join(name.split()).lower()))

#### Output result to CSV

In [None]:
if write_:
    # e.g. "Ryan Lochte" -> "ryan_lochte_breakdown.csv"
    out_filename = '_'.join(name.split()).lower() + '_breakdown.csv'
    out_path = Path('./') / "results/flair" / out_filename
    # to_csv does not create missing directories, so make sure the target exists
    out_path.parent.mkdir(parents=True, exist_ok=True)
    grouped.to_csv(out_path, header=True)

In [None]:
if write_:
    # e.g. "Ryan Lochte" -> "ryan_lochte_data.csv"
    data_filename = '_'.join(name.split()).lower() + '_data.csv'
    data_path = Path('./') / "results/flair" / data_filename
    # to_csv does not create missing directories, so make sure the target exists
    data_path.parent.mkdir(parents=True, exist_ok=True)
    # Days without articles were filled with 0 when reindexing; skip them
    daily[~daily['relevant'].eq(0)].to_csv(data_path, header=True)

## Visualize Cosine Similarity Distances
To see how similar or different each article is based on publication, we can compute the cosine distances between articles to generate a "distance matrix" and then visualize these distances in two-dimensional space.

#### Calculate TF-IDF for document similarity
We first define the term frequency-inverse document frequency to vectorize the text for each article into parameters, and generate a ```tf-idf``` matrix. 

Once we compute the ```tf-idf``` matrix, we can find a "distance matrix" that stores how similar or how different two documents are.

In [None]:
# Define vectorizer parameters: ignore terms that appear in more than 80% or
# in fewer than 20% of the documents
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2)
# Fit the vectorizer to the lemmatized relevant text of each article
tfidf_matrix = tfidf_vectorizer.fit_transform(news_relevant['lemmas'])
print(tfidf_matrix.shape)

# Vocabulary terms, in the column order of tfidf_matrix.
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); support both so the cell runs on either.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = list(tfidf_vectorizer.get_feature_names_out())
else:
    terms = tfidf_vectorizer.get_feature_names()

In [None]:
# Get cosine distance matrix
# Pairwise cosine distance between articles: 0 = same direction in TF-IDF space
dist = 1 - cosine_similarity(tfidf_matrix)

In [None]:
# Embed the precomputed distance matrix in 2-D; random_state is fixed so the
# layout is reproducible between runs.
embedding = MDS(n_components=2, dissimilarity="precomputed", random_state=37)
dist_transformed = embedding.fit_transform(dist)
print(dist_transformed.shape)

# Split the 2-D coordinates into separate x and y arrays
xs, ys = dist_transformed[:, 0], dist_transformed[:, 1]

#### Generate an MDS DataFrame for plotting
We combine the x-y distances from the MDS calculation with the original publication labels to see how different the articles are from each other, colored by publication.

In [None]:
# One row per article: publication label plus its MDS coordinates.
# xs/ys are in the row order of news_relevant['lemmas'], which the TF-IDF
# matrix was fitted on, so they pair up with the publication labels.
compare = pd.DataFrame(dict(label=news_relevant['publication'], x=xs, y=ys))
compare.head()

In [None]:
# Number of distinct publications among the relevant articles
L = news_relevant['publication'].nunique()
print("Found {} unique categories for publications".format(L))

In [None]:
# Per publication: number of articles and the centroid of their MDS coordinates.
# The aggregates are computed separately rather than by aggregating the
# grouping column through a dict, which newer pandas releases may reject
# because the grouping key is not among the aggregatable columns.
by_label = compare.groupby('label')
groups = pd.DataFrame({
    'count': by_label.size(),
    'x': by_label['x'].mean(),
    'y': by_label['y'].mean(),
})
groups = groups.sort_values(by='count')
groups

#### Visualize similarities as embedded cosine distances

In [None]:
fig, ax = plt.subplots(figsize=(12, 9))

# One distinct colour index per publication
colors = list(range(len(groups.index)))

# Marker area scales with the number of articles of each publication
ax.scatter(groups['x'], groups['y'], c=colors,
           s=groups['count']*100, linewidths=1.5, alpha=0.7,
           edgecolors='k', cmap=plt.cm.gist_rainbow,
           );

# Label each marker with its publication name. Values are paired by
# iterating the columns together: `groups` is indexed by publication name,
# so looking values up with an integer key would be an ambiguous
# label/positional access.
for txt, x, y in zip(groups.index, groups['x'], groups['y']):
    ax.annotate(txt, (x, y),
                fontsize=18, alpha=0.7);
# The embedded coordinates have no interpretable units, so hide tick labels
ax.set_xticklabels(['']);
ax.set_yticklabels(['']);
plt.tight_layout()