# Data Pipeline
Initial data analysis pipeline including a naive sentiment analysis using TextBlob.

In [1]:
import re
import html
import spacy
import textacy
from textblob import TextBlob
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
import matplotlib.pyplot as plt
# Plot using Pandas datetime objects
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
plt.style.use('ggplot')
%matplotlib inline

In [3]:
# Standard plotly imports
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Using plotly + cufflinks in offline mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

## Use ```spaCy``` for tokenization and sentence segmentation

In [4]:
import spacy
from spacy import displacy
# Build a lightweight English pipeline used only for sentence splitting:
# start from a blank 'en' model and add the 'sentencizer' pipe, so that
# `doc.sents` can be iterated in get_relevant below.
# NOTE(review): `add_pipe(create_pipe(...))` looks like the spaCy v2 API — confirm the pinned spaCy version.
sentencizer = spacy.blank('en')
sentencizer.add_pipe(sentencizer.create_pipe('sentencizer'))

#### Specify named entity of interest

In [5]:
# Full name of the person to analyse. It is used below both to filter
# articles (exact substring match on the whole name) and to pick sentences
# (substring match on any whitespace-separated part of the name).
name = "Ryan Lochte"

In [6]:
# Input corpus lives in ../data relative to this notebook
datafile = 'all_the_news_v2.csv'
datapath = Path('..', 'data', datafile)
# Columns to read from the CSV
colnames = [
    'title', 'author', 'date', 'content',
    'year', 'month', 'publication', 'length',
]

# Read only the listed columns, parse `date` as datetime and strip
# surrounding whitespace from the author names
news = (
    pd.read_csv(datapath, usecols=colnames, parse_dates=['date'])
    .assign(author=lambda frame: frame['author'].str.strip())
)
news.head()

Unnamed: 0,title,author,date,content,year,month,publication,length
0,Agent Cooper in Twin Peaks is the audience: on...,Tasha Robinson,2017-05-31,And never more so than in Showtime’s new...,2017.0,5.0,Verge,2376
1,"AI, the humanity!",Sam Byford,2017-05-30,AlphaGo’s victory isn’t a defeat for hum...,2017.0,5.0,Verge,2125
2,The Viral Machine,Kaitlyn Tiffany,2017-05-25,Super Deluxe built a weird internet empi...,2017.0,5.0,Verge,3310
3,How Anker is beating Apple and Samsung at thei...,Nick Statt,2017-05-22,Steven Yang quit his job at Google in th...,2017.0,5.0,Verge,3632
4,Tour Black Panther’s reimagined homeland with ...,Kwame Opam,2017-05-15,Ahead of Black Panther’s 2018 theatrical...,2017.0,5.0,Verge,262


In [7]:
# Discard rows that lack a publication date or a title, then show the
# number of remaining articles
news = news.dropna(subset=['date', 'title'])
len(news)

143156

### Filter articles based on name match
In this section we keep only those news articles whose content contains the full name we input as ```name``` (an exact, case-sensitive substring match).

In [8]:
def check_name(content, name):
    """Return True if ``name`` occurs verbatim in ``content``.

    The match is a case-sensitive substring test.

    Args:
        content: Article text. Non-string values (for example the float NaN
            that pandas uses for a missing cell, or None) are treated as
            "no match" instead of raising ``TypeError``.
        name: The string to look for.

    Returns:
        bool: True when ``content`` is a string that contains ``name``.
    """
    # Only `date` and `title` are null-filtered before this is applied, so
    # `content` may still be missing; `name in <float>` would raise TypeError.
    if not isinstance(content, str):
        return False
    return name in content

def filter_df(df):
    """Return the rows of ``df`` whose ``content`` mentions the target name.

    The target is the notebook-level variable ``name``; the per-row test is
    delegated to ``check_name``.

    Args:
        df: DataFrame with a ``content`` column.

    Returns:
        A new DataFrame holding only the matching rows, with the same columns
        as ``df``. ``df`` itself is not modified.
    """
    # Keep the mask as a standalone Series rather than writing a temporary
    # 'match' column into the caller's frame, which would leave that column
    # behind on the input after the call.
    is_match = df['content'].apply(lambda text: check_name(text, name))
    # .copy() hands back an independent frame so callers can add columns to it
    return df.loc[is_match.eq(True)].copy()

# Keep only the articles that mention `name`, print how many matched and
# preview the first rows
news_relevant = filter_df(news)
print(news_relevant.shape[0])
news_relevant.head()

182


Unnamed: 0,title,author,date,content,year,month,publication,length
3095,"After Olympic-sized goof, Ryan Lochte begins a...",David Wharton,2017-04-27,Don’t cringe or shake your head or stop readin...,2017.0,4.0,Los Angeles Times,1925
8946,Michael Phelps is a touch off in 100 butterfly...,Everett Cook,2014-08-08,Michael Phelps\' bid for his most important wi...,2014.0,8.0,Los Angeles Times,996
13789,Michael Phelps Powers U.S. to Victory in 4x100...,"Victor Mather, Karen Crouse and Doug Mills",2016-08-10,RIO DE JANEIRO — Michael Phelps won his 19th O...,2016.0,8.0,New York Times,276
13825,Rio Olympics: Simone Manuel Makes History in t...,Karen Crouse,2016-08-16,RIO DE JANEIRO — Simone Manuel managed to make...,2016.0,8.0,New York Times,1468
13839,"Rio Olympics: A Phelps Upset, a Judo Snub, and...",Sam Manchester and Victor Mather,2016-08-16,Katie Ledecky did what Katie Ledecky does best...,2016.0,8.0,New York Times,1614


### Perform sentence segmentation
For each news article, store the sentences that mention the name as a list, from which we can easily extract per-sentence sentiment.

In [9]:
def get_relevant(text, name):
    """Collect the sentences of ``text`` that mention any part of ``name``.

    ``text`` is split into sentences with the notebook-level ``sentencizer``
    pipeline. A sentence qualifies when at least one whitespace-separated
    part of ``name`` occurs in it as a case-sensitive substring.

    Returns:
        list of str: the qualifying sentences, stripped of surrounding
        whitespace, in document order and without duplicates.
    """
    name_parts = name.split()
    matches = [
        sentence.text.strip()
        for sentence in sentencizer(text).sents
        if any(part in sentence.text for part in name_parts)
    ]
    # dict keys keep insertion order, so this de-duplicates while preserving
    # the first occurrence of each sentence
    return list(dict.fromkeys(matches))

In [10]:
# For every matched article, store the list of sentences that mention the name
news_relevant['relevant'] = news_relevant['content'].apply(lambda x: get_relevant(x, name))

In [11]:
# Peek at the extracted sentence lists of the first five articles
for sentence_list in news_relevant['relevant'][:5]:
    print(sentence_list, '\n--')

["That’s all Ryan Lochte wants as he ambles across the pool deck on a bright Southern California day, looking tanned and relaxed, if a bit weary from his morning workout.', '“", 'Lochte now finds himself living in Los Angeles with his pregnant fiancée and training at USC with thoughts of a comeback.', "Here’s the condensed version:', 'After a fifth-place finish in the 200-meter individual medley — his dissatisfaction hardly assuaged by a gold in the 800 freestyle relay — Lochte partied with three teammates at the French team’s hospitality house.', '", "Lochte initially told authorities they were pulled over and robbed by armed men posing as police officers.', '", "Lochte suffered an additional blow as Speedo and other corporate sponsors walked away.', '", 'Ryan Lochte didn’t have to get drunk and vandalize a Rio de Janeiro gas station during the Summer Olympics in August.', "But he did, and it’s at times like this when people really need their...', 'Ryan Lochte didn’t have to get drunk

### Naive sentiment scoring using TextBlob

In [12]:
def get_score_textblob(text_list):
    """Summarise TextBlob polarity over a list of sentences.

    Args:
        text_list: Iterable of sentence strings.

    Returns:
        tuple: ``(score, magnitude)`` where ``score`` is the median and
        ``magnitude`` the standard deviation (NumPy default, ``ddof=0``) of
        the per-sentence polarity values, each rounded to 4 decimals before
        aggregation. ``(nan, nan)`` is returned when ``text_list`` is empty.
    """
    # Calculate polarity for each sentence
    sentiment_list = [round(TextBlob(text).sentiment.polarity, 4) for text in text_list]
    if not sentiment_list:
        # np.median / np.std on an empty list also produce NaN, but emit
        # RuntimeWarnings while doing so; return NaN explicitly instead.
        return np.nan, np.nan
    score = np.median(sentiment_list)
    magnitude = np.std(sentiment_list)
    return score, magnitude

In [13]:
# get_score_textblob returns one (score, magnitude) pair per article;
# zip(*...) transposes those pairs into one sequence per output column.
news_relevant['score'], news_relevant['magnitude'] = zip(*news_relevant['relevant'].map(get_score_textblob))
news_relevant.head(3)

Unnamed: 0,title,author,date,content,year,month,publication,length,relevant,score,magnitude
3095,"After Olympic-sized goof, Ryan Lochte begins a...",David Wharton,2017-04-27,Don’t cringe or shake your head or stop readin...,2017.0,4.0,Los Angeles Times,1925,[That’s all Ryan Lochte wants as he ambles acr...,0.0,0.213016
8946,Michael Phelps is a touch off in 100 butterfly...,Everett Cook,2014-08-08,Michael Phelps\' bid for his most important wi...,2014.0,8.0,Los Angeles Times,996,"[Ryan Lochte, another preeminent American swim...",0.0,0.0
13789,Michael Phelps Powers U.S. to Victory in 4x100...,"Victor Mather, Karen Crouse and Doug Mills",2016-08-10,RIO DE JANEIRO — Michael Phelps won his 19th O...,2016.0,8.0,New York Times,276,[Ryan Held and Nathan Adrian brought the U.S. ...,0.45,0.05


#### Positive sentiment group

In [14]:
# Rank articles from highest to lowest score and preview the top three
pos = (
    news_relevant
    .sort_values(by='score', ascending=False)
    .reset_index(drop=True)
)
pos.head(3)

Unnamed: 0,title,author,date,content,year,month,publication,length,relevant,score,magnitude
0,"Hey, Watch This",Kevin D. Williamson,2016-08-07,"[‘Hey, Dad, watch this!” Here is a scene that ...",2016.0,8.0,National Review,1226,[Ryan Lochte and America’s Man-Boy ProblemAmer...,0.75,0.0
1,Can Ryan Lochte Redeem Himself on Dancing With...,Megan Garber,2016-08-24,[ For us to continue writing great stor...,2016.0,8.0,Atlantic,1358,"[Rumor has it that, now, another celebrity wil...",0.7,0.367423
2,The Rio Olympics: Catch the Fever!,Jim Geraghty,2016-08-05,[The International Olympic Committee wanted to...,2016.0,8.0,National Review,1217,[Ryan Lochte and America’s Man-Boy ProblemAmer...,0.65,0.0


#### Negative sentiment group

In [15]:
# Rank articles from lowest to highest score and preview the bottom three
neg = (
    news_relevant
    .sort_values(by='score')
    .reset_index(drop=True)
)
neg.head(3)

Unnamed: 0,title,author,date,content,year,month,publication,length,relevant,score,magnitude
0,"2016 Gave Us 262,800 'Two Minutes Hates.' Some...",Daniel J. Flynn,2016-12-30,Andy Warhol envisioned a future in which every...,2016.0,12.0,Breitbart,787,[Ryan Lochte suffered a terrible hangover in 2...,-1.0,0.0
1,‘This Only Confirms the Image of the U.S.A.’ -...,Lela Moore and Lindsey Underwood,2016-08-22,Here are the top 10 comments of the week on ou...,2016.0,8.0,New York Times,1596,"[Jose in Rio de Janeiro, reacting to an articl...",-0.2,0.0
2,'Bayou Billionaire' star Valerie Wells dead in...,,2016-11-09,"[According to People, the Shreveport, Louisian...",2016.0,11.0,Fox News,265,"[Playboy model in hot water, Hailey Baldwin st...",-0.1923,0.0


In [16]:
#### Mixed sentiment group

In [17]:
# Rank articles by the spread of their per-sentence polarity (largest
# first) and preview the top three
mixed = (
    news_relevant
    .sort_values(by='magnitude', ascending=False)
    .reset_index(drop=True)
)
mixed.head(3)

Unnamed: 0,title,author,date,content,year,month,publication,length,relevant,score,magnitude
0,Can Ryan Lochte Redeem Himself on Dancing With...,Megan Garber,2016-08-24,[ For us to continue writing great stor...,2016.0,8.0,Atlantic,1358,"[Rumor has it that, now, another celebrity wil...",0.7,0.367423
1,Roundup: Smart Thoughts On Ryan Lochte And Whi...,Leah Donnella,2016-08-19,"[This week, in a tale of Olympic scandal and i...",2016.0,8.0,NPR,1495,"[[This week, in a tale of Olympic scandal and ...",0.0,0.356279
2,Ryan Lochte 'in talks' to join 'Dancing With T...,Chloe Melas,2016-08-24,[ (CNN)Ryan Lochte might be ditching his speed...,2016.0,8.0,CNN,168,[[ (CNN)Ryan Lochte might be ditching his spee...,0.2,0.339935


### Highlight relevant named entities using ```spaCy```

In [18]:
# Load full spaCy language model for NER (the `sentencizer` pipeline built
# earlier only contains the sentence splitter).
# NOTE(review): 'en_core_web_md' presumably has to be downloaded separately — confirm in the setup instructions.
nlp = spacy.load('en_core_web_md')

In [19]:
from IPython.display import Markdown, display
# Options passed to displacy.render in display_entities: 'ents' lists the
# entity labels of interest and 'colors' maps labels to highlight colours.
# 'EVENT' has no entry in 'colors'.
# NOTE(review): presumably it falls back to displaCy's default colour — confirm.
options = {'ents': ['PERSON', 'ORG', 'GPE', 'EVENT'], 
           'colors': {'PERSON': '#9fafe5', 'ORG': '#d59b9b', 'GPE':'#81cba6'}}
def printmd(string):
    """Show ``string`` in the notebook output, rendered as Markdown."""
    rendered = Markdown(string)
    display(rendered)
    
def display_entities(df, max_entries=5):
    """Render named-entity highlights for the first rows of ``df``.

    For each of the first ``max_entries`` rows, the row's ``relevant``
    sentences are joined with single spaces, passed through the
    notebook-level ``nlp`` pipeline and rendered with displaCy (entity
    style, using the notebook-level ``options``) beneath the article title
    printed in bold.

    Args:
        df: DataFrame with ``title`` and ``relevant`` (list of str) columns.
        max_entries: Maximum number of rows to render.
    """
    head = df.iloc[:max_entries]
    # Pair each title with its own row positionally. Looking the title up
    # via df['title'][idx] is label-based and only lines up with the
    # enumerate() counter when the index happens to be 0..n-1.
    for title, sent in zip(head['title'], head['relevant'].str.join(' ')):
        doc = nlp(sent)
        printmd('**{}**'.format(title))
        displacy.render(doc, style='ent', jupyter=True, options=options)
        print('\n')
        
def vis(pos, neg, mixed):
    """Show entity highlights for the positive, negative and mixed groups.

    Each group gets a coloured Markdown heading followed by the output of
    ``display_entities`` for that group's DataFrame.
    """
    sections = (
        ('<font color=green>**Positive**</font>', pos),
        ('<font color=red>**Negative**</font>', neg),
        ('<font color=yellow>**Mixed**</font>', mixed),
    )
    for heading, frame in sections:
        printmd(heading)
        display_entities(frame)

In [20]:
# Render entity highlights for the leading rows of each ranked group
vis(pos, neg, mixed)

<font color=green>**Positive**</font>

**Hey, Watch This**





**Can Ryan Lochte Redeem Himself on Dancing With the Stars? **





**The Rio Olympics: Catch the Fever!**





**How to train like an Olympian – eat loads of peanut butter and listen to robots**





**Will Alicia Machado join ‘Dancing with the Stars’ next season?**





<font color=red>**Negative**</font>

**2016 Gave Us 262,800 'Two Minutes Hates.' Some Lasted Longer Than Others. - Breitbart**





**‘This Only Confirms the Image of the U.S.A.’ - The New York Times**





**'Bayou Billionaire' star Valerie Wells dead in murder-suicide, reports say**





**Stars urge their fans to 'fight' after Trump win**





**Michael Moore to Democrats: I told you Trump was going to win**





<font color=yellow>**Mixed**</font>

**Can Ryan Lochte Redeem Himself on Dancing With the Stars? **





**Roundup: Smart Thoughts On Ryan Lochte And White Privilege**





**Ryan Lochte 'in talks' to join 'Dancing With The Stars'**





**U.S. swimmers' tall tale touches a raw nerve in Brazil**





**Ryan Lochte, ’12 Olympic Champion, Misses Rio Cut in 400 I.M. - The New York Times**





## Visualization

### Plot sentiment score and magnitude versus time of publishing of the article
In this section, the sentiment "score" of an article is the median of the per-sentence polarity values (positive or negative) that TextBlob returns for the article's relevant sentences. The sentiment "magnitude" is the standard deviation of those same per-sentence polarity values.

In [21]:
# Average the article-level score and magnitude over all articles that
# share a publication date
pos_avg_score, pos_avg_mag = (
    pos.groupby('date')[column].mean() for column in ('score', 'magnitude')
)

#### Concatenate scores and magnitudes for plotting

In [22]:
# Place the two daily series side by side (one column each), order the
# rows by date, write them to disk and preview the first rows
scores = pd.concat([pos_avg_score, pos_avg_mag], axis=1)
scores = scores.sort_values(by=['date'])
scores.to_csv('test.csv', header=True)
scores.head()

Unnamed: 0_level_0,score,magnitude
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-08-08,0.0,0.0
2016-06-27,0.0,0.315103
2016-08-05,0.325,0.0
2016-08-06,0.25,0.0
2016-08-07,0.75,0.0


#### Plot data using ```Plotly```

In [23]:
# One filled subplot per column of `scores`, arranged in a 2x1 grid with a
# shared x-axis; the person's name is interpolated into the figure title
scores.iplot(subplots=True, shape=(2, 1), shared_xaxes=True, 
             fill=True,
             xTitle='Date',
             title='Sentiment scores and magnitudes with time for "{}"'.format(name)
            )

#### Matplotlib option

In [24]:
# fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 8))
# ax1.fill_between(pos_avg_score.index, pos_avg_score.values, color='red', alpha=0.5);
# ax1.set_ylabel('Score');
# ax1.set_title('Sentiment scores and magnitudes with time for "{}"'.format(name), size=15);
# ax2.fill_between(pos_avg_mag.index, pos_avg_mag.values, color='skyblue', alpha=0.5);
# ax2.set_ylabel('Magnitude');
# ax2.set_xlabel('Date');
# plt.savefig("{}_scores".format('_'.join(name.split()).lower()))

#### Get counts of positive and negative mentions based on Publication

In [25]:
# Count, per publication, the articles with a negative score (< 0) and the
# articles with a non-negative score (>= 0). The latter are labelled
# 'Positive', which therefore includes neutral articles scoring exactly 0.
grouped = (
    news_relevant
    .assign(sentiment=lambda frame: np.where(frame['score'] >= 0.0, 'Positive', 'Negative'))
    .groupby(['publication', 'sentiment'])
    .size()
    .unstack(fill_value=0)
    # Guarantee both columns, in a fixed order, even when one class is absent.
    # Assigning a hard-coded two-element column list to the unstacked frame
    # fails with a length mismatch in that case.
    .reindex(columns=['Negative', 'Positive'], fill_value=0)
    # Counts are kept as floats, as in the displayed table
    .astype(float)
)
# Drop the 'sentiment' label from the column axis so the table header is flat
grouped.columns.name = None
grouped

Unnamed: 0_level_0,Negative,Positive
publication,Unnamed: 1_level_1,Unnamed: 2_level_1
Atlantic,1.0,22.0
Breitbart,2.0,5.0
Business Insider,0.0,6.0
Buzzfeed News,2.0,9.0
CNN,3.0,17.0
Fox News,20.0,8.0
Guardian,5.0,9.0
Los Angeles Times,0.0,2.0
NPR,3.0,10.0
National Review,0.0,8.0


#### Plot article breakdown using ```Plotly```

In [26]:
# Horizontal bars: publications run along the y-axis and article counts
# along the x-axis, so the axis titles are assigned accordingly.
grouped[['Negative', 'Positive']].iplot(
    kind='barh',
    xTitle='Article Counts',
    yTitle='Publication',
    title='{}: Positive/Negative Sentiment Breakdown vs Publication'.format(name))

#### Matplotlib option

In [27]:
# grouped.plot(x='publication', kind='barh', figsize=(12, 8));
# plt.title('Count of number of articles with Positive/Negative Sentiment for {}'.format(name));
# plt.savefig("{}_breakdown".format('_'.join(name.split()).lower()))

#### Output result to CSV

In [28]:
# Output file name: the words of `name` joined by '_' and lower-cased, plus
# '.csv'; the file is placed under ../modules/results
out_filename = '_'.join(name.split()).lower() + '.csv'
out_path = Path('../') / "modules/results" / out_filename
# The export itself is currently disabled.
# NOTE(review): index=False would omit the publication index from the CSV — confirm before enabling.
# grouped.to_csv(out_path, index=False, header=True)