# Query news and blog data

In [1]:
import numpy as np

#Get all country codes
import pycountry
# Collect the name and ISO alpha-2 code of every country known to pycountry
all_countries = list(pycountry.countries)
name = np.array([country.name for country in all_countries])
iso = np.array([country.alpha_2 for country in all_countries])

# Order both arrays by country name (the permutation is computed once and reused)
by_name = np.argsort(name)
iso = iso[by_name]
name = name[by_name]

# (name, code) pairs, sorted by name
countr = list(zip(name, iso))

In [None]:
from ipywidgets import Layout, Box, FloatText, Textarea, Dropdown, Label, Checkbox, SelectMultiple, Text, interact

form_item_layout = Layout(
    display='flex',
    flex_flow='row',
    justify_content='space-between',
    align_content = "flex-start",
    width = "auto"
)

import os

# NOTE: the order of the rows matters - the cells below read the entered values by position.
form_items = [
    # The API token is taken from the WEBHOSE_TOKEN environment variable (or typed in by hand)
    # so that no credential is stored in the notebook itself.
    Box([Label(value='Webhose Token',layout=Layout(width="auto")), Textarea(value = os.environ.get("WEBHOSE_TOKEN", ""))], layout=form_item_layout),
    Box([Label(value='Main query (in brackets)',layout=Layout(width="auto")), Textarea()], layout=form_item_layout),
    Box([Label(value='Language',layout=Layout(width="auto")),
         Dropdown(options=['english',"german"])], layout=form_item_layout),
    Box([Label(value='Is first (Exclude comments)',layout=Layout(width="auto")),
         Checkbox(value=True)], layout=form_item_layout),
    Box([Label(value='Performance score threshold (greater than x)',layout=Layout(width="auto")),
         FloatText(value = 1)], layout=form_item_layout),
    Box([Label(value='Domain Rank (Top x sites by monthly traffic)',layout=Layout(width="auto")),
         FloatText(value = 10000)], layout=form_item_layout),
    Box([Label(value='Published X days ago',layout=Layout(width="auto")),
         FloatText(value = 30)], layout=form_item_layout),
    Box([Label(value='Site type(s) (in brackets)',layout=Layout(width="auto")),
         Textarea(value='(site_type:news OR site_type:blogs)')], layout=form_item_layout),
    # (label, value) pairs instead of a dict: same entries in the same order
    Box([Label(value='Sort by',layout=Layout(width="auto")),
         Dropdown(options=[('Crawl Date', "crawled"), ('Relevancy', "relevancy"), ('Published', "published")])], layout=form_item_layout),
    Box([Label(value='All countries',layout=Layout(width="auto")),
         Checkbox(value=False)], layout=form_item_layout),
    Box([Label(value='Country'),
         SelectMultiple(options=countr,value = ["GB"],layout=Layout(display="flex", flex_flow='column'))], layout=form_item_layout),
    Box([Label(value='Other (thread.title or organization in brackets)',layout=Layout(width="auto")),
         Textarea()], layout=form_item_layout)
]

form = Box(form_items, layout=Layout(
    flex = "flex-basis",
    flex_flow='column',
    border='solid 1px',
    width='60%',
    height = "auto"
))
form

In [None]:
#Queries
"""
(electric car OR electric cars OR autonomous driving OR autonomous car OR connected car)

((sport OR luxury) AND future AND (car sharing OR car rental OR car ownership OR car club OR car purchase OR car lease))

((sport OR luxury) AND future AND (car OR vehicle) AND (rent OR rental OR lease OR leasing OR own OR buy OR purchase OR club OR subscription OR share OR sharing))

"""

In [20]:
# Set up query with form inputs: every form row is a Box of [Label, input widget],
# so the second child carries the value entered by the user.
params = [el.children[1].value for el in form.children]

# Transform multi select countries into a query clause.
# params[-3] is the "All countries" checkbox, params[-2] the selected country codes.
if params[-3] or not params[-2]:
    # No country restriction. An empty selection is treated the same way; it used to
    # fail with an IndexError when the first clause was looked up.
    params[-2] = ""
else:
    repl = ["thread.country:" + c for c in params[-2]]
    if len(repl) > 1:
        params[-2] = "(" + " OR ".join(repl) + ")"
    else:
        params[-2] = repl[0]

In [None]:
#Build and run query
import webhoseio
import time
import datetime

# Register the token entered in the form with the webhoseio client
webhoseio.config(token=params[0])

# The query string is the space-separated list of the individual filter clauses
query_terms = [
    params[1],
    "language:" + params[2],
    "is_first:" + str(params[3]).lower(),
    "performance_score:>" + str(int(params[4])),
    "domain_rank:<" + str(int(params[5])),
    params[7],
    params[10],
    params[11],
]

# Timestamp (milliseconds since the epoch) of the moment params[6] days before now
earliest = datetime.datetime.now() + datetime.timedelta(-params[6])
earliest_ms = int(time.mktime(earliest.timetuple()) * 1000)

query_params = {
    "q": " ".join(query_terms),
    "ts": str(earliest_ms),
    "sort": params[8],
}

output = webhoseio.query("filterWebContent", query_params)
print("Total number of documents: ",output['totalResults'])

In [22]:
import pandas as pd

def get_list(obj,key):
    """Return the value stored under `key` for every element of `obj`, in order."""
    return [entry[key] for entry in obj]

def _post_to_record(post):
    """Flatten one post of the API response into a flat dict (one DataFrame row)."""
    entities = post["entities"]
    thread = post["thread"]
    social = thread["social"]
    return {
        "entity_persons":get_list(entities["persons"],"name"),
        "entity_organizations":get_list(entities["organizations"],"name"),
        "entity_locations":get_list(entities["locations"],"name"),
        "uuid":post["uuid"],
        "author":post["author"],
        "url":post["url"],
        "language":post["language"],
        "title":post["title"],
        "highlightText":post["highlightText"],
        "text":post["text"],
        "publishedDate":post["published"],
        "crawledDate":post["crawled"],
        "fb_likes":social["facebook"]["likes"],
        "fb_shares":social["facebook"]["shares"],
        "linkedin_shares":social["linkedin"]["shares"],
        "gplus_shares":social["gplus"]["shares"],
        "source":thread["site_full"],
        "country":thread["country"],
        "performance_score":thread["performance_score"],
        "participants_counts":thread["participants_count"],
        "site_type":thread["site_type"],
    }

# Rows are collected in their own list so that `data` only ever names the DataFrame
records = []

#Get all results to pandas dataframe: keep paging until a batch comes back without posts
while output["posts"]:
    records.extend(_post_to_record(post) for post in output["posts"])

    #Get next batch of data
    # NOTE(review): the 30 s pause is presumably there to respect the API rate limit - confirm
    time.sleep(30)
    output = webhoseio.get_next()

data = pd.DataFrame(records)
if data.empty:
    # Without any post there are no date columns to convert; say so instead of
    # failing with an AttributeError on a missing column.
    print("The query returned no posts.")
else:
    # Keep only the calendar date of the crawl / publication timestamps
    data["crawledDate"] = pd.to_datetime(data["crawledDate"]).dt.date
    data["publishedDate"] = pd.to_datetime(data["publishedDate"]).dt.date

In [23]:
#Save data
# Persist the collected posts as a semicolon-separated, UTF-8 encoded CSV
# (with a header row, without the DataFrame index).
data.to_csv("FBCA.csv",header=True, index=False, encoding='utf-8',sep=";")

In [None]:
#data["source"].unique()

# Run topic model

In [None]:
import NewsTrends
import pandas as pd
import locale

#locale.setlocale(locale.LC_ALL, 'deu_deu')
%matplotlib inline

data = pd.read_csv("FBCA.csv",sep=";",encoding='utf-8')
#data.date = pd.to_datetime(data.date,format="%d. %B %Y")
#data.shape

In [None]:
# Set up the topic model on the loaded articles. The key (uuid) and text columns are
# handed over as their positional index within `data.columns`.
# NOTE(review): the meaning of `lang`, `random_state` and `bigram` is defined by
# NewsTrends.topicModel, which is not visible here - confirm there.
t = NewsTrends.topicModel(data,
                          key_idx=list(data.columns).index("uuid"),
                          text_idx=list(data.columns).index("text"),
                          lang="en",random_state = 1,
                          bigram = True)

In [None]:
# Tokenize the documents (passing a list of tokens to remove), then build the bag-of-words corpus.
# NOTE(review): `below` / `above` look like document-frequency cut-offs - confirm in NewsTrends.
tok_dat = t.tokenize_docs(rmv_tokens=["'s","$","datum"])
corpus = t.get_bow(below = 0.05,above = 0.9)

In [None]:
#k,results = t.LDA_tune_k(max_k=30,iterations=800)

In [None]:
#Train topic model
LDA = t.fit_LDA(10,iterations=2000)

In [None]:
#Inspect topic results
topics = t.inspect_topics(n_top_words=20)

# One markdown bullet per topic, listing its top words separated by spaces
bullet_lines = [
    '* **Topic {}:** {}\n'.format(topic_no, ' '.join(words))
    for topic_no, words in enumerate(topics)
]
result = ''.join(bullet_lines)
NewsTrends.MD(result)

In [None]:
# Hand-written topic labels, each wrapped in a single-element list.
# NOTE(review): presumably the labels have to follow the topic order printed by
# inspect_topics and their number has to match the number of fitted topics - confirm.
topics = [["new app and site features"],
          ["politics and social media issue"],
          ["people ask questions, understand problems"],
          ["private data collection by advertisers & tech giants"],
          ["stock price, investors, twitter, apple"],
          ["new internet world"],
          ["Zuckerberg congress hearing"],
          ["Trump campaign investigation, Cambridge Analytica, Russia"],
          ["Cambridge Analytica company, Aleksandr Kogan"],
          ["New data privacy regulation"],
          
         ]
# Displayed as the cell output: the number of labels
len(topics)

In [None]:
# Print urls most representative of topic

In [71]:
# Pass the `topics` labels to the model to obtain the document/topic assignment,
# then generate the document network with the listed columns as node attributes
# and the article title as node label.
# NOTE(review): presumably documents are linked when their "correlation" similarity
# exceeds `similarity_cutoff` - confirm in NewsTrends.
doctopics = t.get_doc_topic(topics)
t.generate_network(node_attr=[ u'country', u'publishedDate',u'fb_likes', u'fb_shares',u'url',u"site_type","source"],
                  node_label="title", similarity_cutoff = 0.8,
                 similarity_measure = "correlation")

In [9]:
#doctopics = t.get_doc_topic(topics)
#t.generate_network(node_attr=[ u'title',u'location'],
 #                 node_label="title", similarity_cutoff = 0.9,
  #               similarity_measure = "correlation")

# In the news: The Cambridge Analytica data scandal

In [99]:
analysiscode="FBCA"

## Summary

1. By using intelligent text processing and learning directly from the textual context, the following analysis finds meaningful connections and relationships in a large corpus of unstructured data.

2. The analysis of recent news articles related to the Facebook and Cambridge Analytica data scandal reveals two large topic clusters:
    * Zuckerberg's congress hearings (Global attention) 
    * The ongoing investigation of the Trump campaign and connections to Cambridge Analytica (Most attention in the U.S.)
       
3. However, in the most recent days, stories about new privacy features in Facebook apps have started to dominate.
4. Finally, news articles that focus on data privacy or the Trump campaign investigation are the most heavily shared on Facebook.

## Data Summary

In [112]:
import pprint
# Headline numbers: size of the data set and the covered publication period
n_stories = data.shape[0]
first_day = data.publishedDate.min()
last_day = data.publishedDate.max()
print("The data set contains {} stories between {} and {}.".format(n_stories, first_day, last_day))
print()
print("Top sources include:")
print()

# Count stories per source and list the five sources with the highest counts
stories_per_source = data.groupby("source")["uuid"].agg("count")
top_sources = stories_per_source.sort_values(ascending=False)[0:5].keys()
for rank, source_name in enumerate(top_sources, 1):
    print('{} {}'.format(rank, source_name))

The data set contains 993 stories between 2018-04-09 and 2018-05-08.

Top sources include:

1 www.theguardian.com
2 www.businessinsider.com
3 www.theverge.com
4 www.cbc.ca
5 www.cnet.com


In [113]:
#Plotly
import numpy as np
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from ipywidgets import widgets
from plotly.widgets import GraphWidget

import os

# Load plotly.js into the notebook so figures can be rendered inline
plotly.offline.init_notebook_mode(connected=True)

# Credentials must not be stored in the notebook: read them from the environment.
# When the variables are not set, nothing is written and plotly keeps using whatever
# is already stored in its credentials file.
_plotly_user = os.environ.get("PLOTLY_USERNAME")
_plotly_key = os.environ.get("PLOTLY_API_KEY")
if _plotly_user and _plotly_key:
    plotly.tools.set_credentials_file(username=_plotly_user, api_key=_plotly_key)

## Document Analysis

*Note: If the following plots are not displayed nicely, hit refresh and wait until fully loaded.*

In [114]:
import os
from shutil import copyfile

# Copy the network viewer templates next to the generated graph data
source = "01_Vorlage/"
dst = "graphics/network/"

for f in ["index.html","css/style.css","js/main.js","js/sigma/sigma.parseJson.js"]:
    target = os.path.join(dst, f)
    # copyfile does not create missing directories (e.g. graphics/network/js/sigma),
    # so make sure the target folder exists first.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    copyfile(os.path.join(source, f), target)


### The discussion is dominated by two distinct clusters, namely 1) the congress hearings of Zuckerberg and 2) the ongoing investigation of the Trump campaign ([Network guide](http://nbviewer.jupyter.org/github/bockjo/Udacity_portfolio/blob/master/networks_how_to.png)).

In [115]:
from IPython.core.display import HTML

HTML('<iframe src="graphics/network/index.html" width="100%" height="480" seamless>Graph not rendered.</iframe>')

## Topics over time

### In week 15 the news coverage focused on Zuckerberg's congress hearings, but shifted to new privacy features and apps such as the announced Facebook dating app during week 18.

In [116]:
def timeline(y,date_col,colorby,aggregate="nunique",timeinterval=None):
    """Plot a stacked bar chart of `y` per calendar week, one trace per `colorby` value.

    The documents are read from the global topic model (`t.df`); the figure is
    handed to `py.iplot` under the file name `analysiscode + 'stacked-bar'`.

    Parameters
    ----------
    y : str
        Column that is aggregated within every (week, group) cell.
    date_col : str
        Column holding the dates. It is parsed with `pd.to_datetime` and reduced
        to its ISO week number, so identical week numbers of different years
        end up in the same bar.
    colorby : str
        Column whose distinct values become the stacked traces.
    aggregate : str or callable, default "nunique"
        Aggregation handed to `GroupBy.agg`.
    timeinterval : optional
        Currently unused; kept so that existing calls keep working.

    Returns
    -------
    The return value of `py.iplot`.
    """
    plot_dat = t.df.copy()

    dates = pd.to_datetime(plot_dat[date_col])
    if hasattr(dates.dt, "isocalendar"):
        # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0
        plot_dat[date_col] = dates.dt.isocalendar().week
    else:
        plot_dat[date_col] = dates.dt.week

    dat = plot_dat.groupby([date_col,colorby])[y].agg(aggregate).reset_index()
    # Rows without a date were dropped by the groupby, so plain integers are safe here
    dat[date_col] = dat[date_col].astype(int)

    dataset = []
    for gr in dat[colorby].unique():
        group_rows = dat[dat[colorby]==gr]
        dataset.append(go.Bar(
            x=group_rows[date_col],
            y=group_rows[y],
            name=gr
        ))

    layout = go.Layout(
        autosize=True,
        barmode='stack',
        showlegend = True,
        legend=dict(orientation="h",x=0.1,y=1.5),
        yaxis = dict(title = "Publish-Count"),
        xaxis = dict(title="Week number")
    )

    fig = go.Figure(data=dataset, layout=layout)
    return py.iplot(fig, filename=analysiscode+'stacked-bar')

timeline(y="uuid", date_col = 'publishedDate',colorby="topic_terms",aggregate="nunique")

## Histogram

### News articles focused on data privacy or the Trump campaign investigation were shared most heavily on Facebook.

In [81]:
def hist(x,colorby="topic_terms",logx = False):
    """Plot a stacked histogram of column `x`, one trace per `colorby` value.

    The documents are read from the global topic model (`t.df`); the figure is
    handed to `py.iplot` under the file name `analysiscode + 'stacked histogram'`.

    Parameters
    ----------
    x : str
        Column whose distribution is shown.
    colorby : str, default "topic_terms"
        Column whose distinct values become the stacked traces.
    logx : bool, default False
        If True the natural logarithm of `x` is plotted. Values that are not
        strictly positive have no logarithm and are left out (NaN).

    Returns
    -------
    The return value of `py.iplot`.
    """
    plot_dat = t.df.copy()

    if logx:
        values = plot_dat[x]
        # log(0) would be -inf (and numpy warns about it); mask such values instead
        plot_dat[x] = np.log(values.where(values > 0))

    traces = []
    for gr in plot_dat[colorby].unique():
        traces.append(go.Histogram(
            x=plot_dat.loc[plot_dat[colorby]==gr,x],
            name=gr
        ))

    # "normal" is not one of plotly's axis types; "-" is the documented
    # "detect from the data" value. The data is already log-transformed when logx is set.
    layout = go.Layout(barmode='stack',legend=dict(orientation="h",x=0.1,y=1.5),
                       autosize=True,
                       xaxis=dict(title=x,type="-",autorange=True,
                                  exponentformat= "e" if logx else "none"),
                       yaxis = dict(title="count"))
    fig = go.Figure(data=traces, layout=layout)

    return py.iplot(fig, filename=analysiscode+'stacked histogram')

hist(x='fb_shares',colorby="topic_terms",logx=True)

## Scatter plot

### While many news stories were concerned with covering the congress hearings, social media users shared news reports about private data collection practices of tech giants more heavily.

In [117]:
def scatter(x,y,groupby,colorby = None,sizeby = "fixed", aggregate = {"x":"sum","y":"sum","colorby":"sum"},
            axistype = {"x":"normal","y":"normal"},
            xtitle="", ytitle=""):
    """Scatter plot of two aggregated columns, one marker per `groupby` value.

    The documents are read from the global topic model (`t.df`); the figure is
    handed to `py.iplot` under the file name `analysiscode + 'bar'`.

    Parameters
    ----------
    x, y : str
        Columns aggregated per group for the horizontal / vertical position.
    groupby : str
        Column defining the groups (one marker each).
    colorby : str, optional
        Column used for colouring; defaults to `groupby`. A non-numeric column
        yields one trace per distinct value, a numeric column is aggregated per
        group and mapped to a colour scale.
    sizeby : str, default "fixed"
        Only "fixed" (marker size 15) is supported.
    aggregate : dict
        Aggregations for "x", "y" and "colorby" (the latter is only used for a
        numeric `colorby`). The default dict is never modified.
    axistype : dict
        Axis types for "x" and "y": "normal" or a plotly axis type such as "log".
    xtitle, ytitle : str
        Axis titles.

    Returns
    -------
    The return value of `py.iplot`.

    Raises
    ------
    ValueError
        If `sizeby` is anything other than "fixed".
    """
    if sizeby != "fixed":
        # Previously this surfaced later as an unbound local variable
        raise ValueError("scatter only supports sizeby='fixed', got {!r}".format(sizeby))
    s = 15
    if colorby is None:
        colorby = groupby

    def axis_spec(name, title):
        # "normal" is not a plotly axis type; "-" is plotly's "detect from the data" value
        kind = axistype[name]
        return {"title": title,
                "type": "-" if kind == "normal" else kind,
                "exponentformat": "e" if kind == "log" else "none"}

    plot_dat = t.df.copy()

    # np.object no longer exists in current NumPy releases; ask pandas instead
    if not pd.api.types.is_numeric_dtype(plot_dat[colorby]):
        plot_datx = plot_dat.groupby([colorby,groupby])[x].agg(aggregate["x"])
        plot_daty = plot_dat.groupby([colorby,groupby])[y].agg(aggregate["y"])

        dat = []
        for cat in plot_dat[colorby].unique():
            dat.append({"x":plot_datx[cat].values,
                        "y":plot_daty[cat].values,
                        "text":plot_daty[cat].index,
                        "name":cat,
                        "mode":"markers",
                        "marker":dict(size=s)})
        fig = {
            'data': dat,
            'layout': {"legend":dict(orientation="h",x=0.1,y=1.5),"showlegend": True,
                       'xaxis': axis_spec("x", xtitle),
                       'yaxis': axis_spec("y", ytitle),
                       "autosize":True
            }
        }
    else:
        plot_datx = plot_dat.groupby(groupby)[x].agg(aggregate["x"])
        plot_daty = plot_dat.groupby(groupby)[y].agg(aggregate["y"])
        plot_datcol = plot_dat.groupby(groupby)[colorby].agg(aggregate["colorby"])

        trace1 = go.Scatter(
            y = plot_daty.values,
            x = plot_datx.values,
            mode='markers',
            marker=dict(
                size=s,
                color = plot_datcol.values, #set color equal to a variable
                colorscale='Viridis',
                showscale=True
            )
        )
        layout = dict(xaxis=axis_spec("x", xtitle),
                      yaxis=axis_spec("y", ytitle))
        fig = go.Figure(data=[trace1],layout = layout)

    return py.iplot(fig, filename=analysiscode+'bar')

scatter(x="uuid",y='fb_shares',groupby="topic_terms",colorby = "topic_terms",
        sizeby = "fixed",xtitle="Publish-Count",ytitle="Median fb-share per article",
        axistype=dict(x="normal",y="normal"),
        aggregate = {"x":"nunique","y":"median","colorby":"sum"})

## Bar Chart

### U.S. news dominated social media, but the Zuckerberg congress hearings were covered and liked more internationally. Indian news coverage of the congress hearings in particular received many likes on Facebook.

In [118]:
def bar_continuous(x, groupby, aggregate="sum"):
    """Plot a horizontal bar chart of column `x` aggregated per `groupby` value.

    The function used to take no arguments while reading `x`, `groupby` and
    `aggregate`, so it could only fail; they are parameters now.

    Parameters
    ----------
    x : str
        Column that is aggregated.
    groupby : str
        Column defining the bars.
    aggregate : str or callable, default "sum"
        Aggregation handed to `GroupBy.agg`.

    Returns
    -------
    The return value of `py.iplot` (file name 'bar_cont').
    """
    plot_dat = t.df.copy()

    plot_dat = plot_dat.groupby(groupby)[x].agg(aggregate)
    traces = [
        go.Bar(
            y=plot_dat.index.values,
            x=plot_dat.values,
            orientation = 'h')
    ]
    fig = go.Figure(data=traces)

    return py.iplot(fig, filename='bar_cont')
        
def bar(x,groupby,xtitle="",colorby = "topic_terms", aggregate = "sum"):
    """Plot a stacked horizontal bar chart of `x` per `groupby` value, split by `colorby`.

    The documents are read from the global topic model (`t.df`); the figure is
    handed to `py.iplot` under the file name `analysiscode + 'bar_cat'`.

    Parameters
    ----------
    x : str
        Column that is aggregated (bar length).
    groupby : str
        Column defining the bars (vertical axis).
    xtitle : str, default ""
        Title of the horizontal axis.
    colorby : str, default "topic_terms"
        Column whose distinct values become the stacked traces.
    aggregate : str or callable, default "sum"
        Aggregation handed to `GroupBy.agg`.

    Returns
    -------
    The return value of `py.iplot`.
    """
    plot_dat = t.df.copy()

    traces = []
    plot_dat = plot_dat.groupby([colorby,groupby])[x].agg(aggregate)
    # First index level = colorby values; each one becomes its own trace
    for gr in plot_dat.index.levels[0]:
        per_group = plot_dat[gr]
        traces.append(go.Bar(
            y=per_group.index.values,
            x=per_group.values,
            name=gr,
            orientation = 'h'
        ))

    layout = go.Layout(
        autosize=False,
        width=800,
        height=550,
        barmode='stack',
        showlegend = True,
        # The former position=-1. was dropped: plotly only accepts axis positions
        # between 0 and 1
        xaxis = dict(title=xtitle),
        yaxis = dict(),
        legend=dict(orientation="h",x=0.1,y=1.2),
        # Wide left margin; a plain dict replaces the deprecated go.Margin
        margin = dict(l=350,r=10)
    )

    fig = go.Figure(data=traces, layout=layout)

    return py.iplot(fig, filename=analysiscode+'bar_cat')

bar(x="fb_likes",groupby="topic_terms",colorby="country",aggregate="sum",xtitle="fb_likes")

In [119]:
# jupyter nbconvert CryptoNews7.ipynb --to slides --template temp --post serve
# http://nbviewer.jupyter.org/format/slides/github/bockjo/Udacity_portfolio/blob/master/filename.ipynb