# Query news and blog data

In [1]:
import numpy as np

#Get all country codes
import pycountry
# Collect name and ISO alpha-2 code of every country known to pycountry.
all_countries = list(pycountry.countries)
name = np.array([country.name for country in all_countries])
iso = np.array([country.alpha_2 for country in all_countries])

# Order both arrays alphabetically by country name (one shared permutation).
alphabetical = np.argsort(name)
iso = iso[alphabetical]
name = name[alphabetical]

# (name, code) pairs, used as the options of the country multi-select below.
countr = list(zip(name, iso))

In [None]:
from ipywidgets import Layout, Box, FloatText, Textarea, Dropdown, Label, Checkbox, SelectMultiple, Text, interact

import os

# Layout shared by every form row: caption and input widget side by side.
form_item_layout = Layout(
    display='flex',
    flex_flow='row',
    justify_content='space-between',
    align_content="flex-start",
    width="auto"
)


def _form_row(caption, control, auto_width_label=True):
    """Return one form row: a caption ``Label`` followed by its input widget.

    The input widget is always the second child of the returned ``Box``;
    the cell that reads the form relies on that position.
    """
    if auto_width_label:
        label = Label(value=caption, layout=Layout(width="auto"))
    else:
        label = Label(value=caption)
    return Box([label, control], layout=form_item_layout)


# NOTE: the query cells address these rows by position (params[0] ... params[11]),
# so rows must not be reordered, inserted or removed without updating them.
form_items = [
    # The API token is a secret: it is pre-filled from the WEBHOSE_TOKEN
    # environment variable (empty when unset) instead of being hard-coded.
    _form_row('Webhose Token',
              Textarea(value=os.environ.get("WEBHOSE_TOKEN", ""))),
    _form_row('Main query (in brackets)', Textarea()),
    _form_row('Language', Dropdown(options=['english', "german"])),
    _form_row('Is first (Exclude comments)', Checkbox(value=True)),
    _form_row('Performance score threshold (greater than x)',
              FloatText(value=1)),
    _form_row('Domain Rank (Top x sites by monthly traffic)',
              FloatText(value=10000)),
    _form_row('Published X days ago', FloatText(value=30)),
    _form_row('Site type(s) (in brackets)',
              Textarea(value='(site_type:news OR site_type:blogs)')),
    # (label, value) pairs keep the option order explicit; passing a dict as
    # `options` is deprecated in ipywidgets 7.
    _form_row('Sort by',
              Dropdown(options=[('Crawl Date', "crawled"),
                                ('Relevancy', "relevancy"),
                                ('Published', "published")])),
    _form_row('All countries', Checkbox(value=False)),
    _form_row('Country',
              SelectMultiple(options=countr, value=["GB"],
                             layout=Layout(display="flex", flex_flow='column')),
              auto_width_label=False),
    _form_row('Other (thread.title or organization in brackets)', Textarea()),
]

form = Box(form_items, layout=Layout(
    flex="flex-basis",
    flex_flow='column',
    border='solid 1px',
    width='60%',
    height="auto"
))
form

In [None]:
# Example queries kept for reference. Presumably they are meant to be pasted
# into the "Main query (in brackets)" field of the form — confirm with the author.
# NOTE(review): this bare string is the last expression of the cell, so running
# the cell echoes it as output; it has no other effect.
"""
(electric car OR electric cars OR autonomous driving OR autonomous car OR connected car)

((sport OR luxury) AND future AND (car sharing OR car rental OR car ownership OR car club OR car purchase OR car lease))

((sport OR luxury) AND future AND (car OR vehicle) AND (rent OR rental OR lease OR leasing OR own OR buy OR purchase OR club OR subscription OR share OR sharing))

"""

In [None]:
# Set up query with form inputs
# Read the current value of every form row in display order; each row is a
# Box whose second child is the input widget.
params = [row.children[1].value for row in form.children]

#Transform multi select countries
# params[-3] is the "All countries" checkbox and params[-2] the selected
# country codes. The selection is collapsed into a single query fragment.
selected_codes = [] if params[-3] else list(params[-2])
country_terms = ["thread.country:" + code for code in selected_codes]

if len(country_terms) > 1:
    params[-2] = "(" + " OR ".join(country_terms) + ")"
elif country_terms:
    params[-2] = country_terms[0]
else:
    # "All countries" is ticked, or nothing is selected: apply no country
    # filter (an empty selection previously raised IndexError).
    params[-2] = ""

In [None]:
#Build and run query
import webhoseio
import time
import datetime

# Authenticate with the token entered in the first form row.
webhoseio.config(token=params[0])

# Pieces of the filter expression, in the order they appear in the query.
# The params indices follow the order of the rows in the form.
query_terms = [
    params[1],
    "language:" + params[2],
    "is_first:" + str(params[3]).lower(),
    "performance_score:>" + str(int(params[4])),
    "domain_rank:<" + str(int(params[5])),
    params[7],
    params[10],
    params[11],
]

# "Now minus X days" as milliseconds since the epoch (computed from local time).
# Presumably Webhose uses it as the lower time bound of the results — confirm.
earliest = datetime.datetime.now() + datetime.timedelta(days=-params[6])
earliest_ms = int(time.mktime(earliest.timetuple()) * 1000)

query_params = {
    "q": " ".join(query_terms),
    "ts": str(earliest_ms),
    "sort": params[8],
}

output = webhoseio.query("filterWebContent", query_params)
print("Total number of documents: ",output['totalResults'])

In [None]:
import pandas as pd

def get_list(obj,key):
    """Return the value stored under ``key`` in each item of ``obj``, in order."""
    return [item[key] for item in obj]

# Columns of the result table. Passing them explicitly keeps the frame usable
# (date conversion, CSV export) even when the query returned no post at all;
# a column-less frame previously raised AttributeError on `data.crawledDate`.
RESULT_COLUMNS = [
    "entity_persons", "entity_organizations", "entity_locations",
    "uuid", "author", "url", "language", "title", "highlightText", "text",
    "publishedDate", "crawledDate",
    "fb_likes", "fb_shares", "linkedin_shares", "gplus_shares",
    "source", "country", "performance_score", "participants_counts",
    "site_type",
]


def _post_to_row(post):
    """Flatten one post of the API response into a dict keyed by RESULT_COLUMNS."""
    entities = post["entities"]
    thread = post["thread"]
    social = thread["social"]
    return {
        "entity_persons": get_list(entities["persons"], "name"),
        "entity_organizations": get_list(entities["organizations"], "name"),
        "entity_locations": get_list(entities["locations"], "name"),
        "uuid": post["uuid"],
        "author": post["author"],
        "url": post["url"],
        "language": post["language"],
        "title": post["title"],
        "highlightText": post["highlightText"],
        "text": post["text"],
        "publishedDate": post["published"],
        "crawledDate": post["crawled"],
        "fb_likes": social["facebook"]["likes"],
        "fb_shares": social["facebook"]["shares"],
        "linkedin_shares": social["linkedin"]["shares"],
        "gplus_shares": social["gplus"]["shares"],
        "source": thread["site_full"],
        "country": thread["country"],
        "performance_score": thread["performance_score"],
        "participants_counts": thread["participants_count"],
        "site_type": thread["site_type"],
    }


#Get all results to pandas dataframe
rows = []

# Keep paging until the API hands back an empty batch.
while output["posts"]:
    rows.extend(_post_to_row(post) for post in output["posts"])

    #Get next batch of data
    # Pause between requests (presumably to respect the API rate limit — confirm).
    time.sleep(30)
    output = webhoseio.get_next()

data = pd.DataFrame(rows, columns=RESULT_COLUMNS)

# Keep only the calendar date of both timestamps.
data["crawledDate"] = pd.to_datetime(data["crawledDate"]).dt.date
data["publishedDate"] = pd.to_datetime(data["publishedDate"]).dt.date

#Save data
data.to_csv("crypto.csv",header=True, index=False, encoding='utf-8',sep=";")

In [136]:
# List the distinct values of the "source" column (filled from each post's
# thread "site_full" field) to see which sites the collected posts come from.
data["source"].unique()

array(['www.bloomberg.com', 'www.independent.co.uk',
       'www.theguardian.com', 'www.marketwatch.com',
       'www.rollingstone.com', 'www.businessinsider.com',
       'economictimes.indiatimes.com', 'www.forbes.com',
       'www.theatlantic.com', 'www.theverge.com', 'medium.com',
       'www.latimes.com', 'timesofindia.indiatimes.com', 'www.wired.com',
       'www.thedailybeast.com', 'www.reuters.com', 'www.wsj.com',
       'www.moneycontrol.com', 'news.abs-cbn.com', 'www.rt.com',
       'www.buzzfeed.com', 'www.cnet.com', 'www.bbc.com',
       'uk.businessinsider.com', 'www.msn.com', 'www.cnbc.com',
       'www.huffingtonpost.com', 'newsinfo.inquirer.net',
       'gadgets.ndtv.com', 'www.thehindu.com', 'business.inquirer.net',
       'www.europol.europa.eu', 'www.washingtonpost.com',
       'sg.news.yahoo.com', 'finance.yahoo.com', 'www.investing.com',
       'www.pcmag.com', 'abcnews.go.com', 'au.finance.yahoo.com',
       'www.reddit.com', 'www.foxnews.com', 'www.telegraph.co.uk

In [None]:
# Show the column names of the collected data.
data.columns

# Run topic model

In [None]:
import NewsTrends
import pandas as pd
import locale
locale.setlocale(locale.LC_ALL, 'deu_deu')
%matplotlib inline

data = pd.read_csv("crypto.csv",sep=";",encoding='utf-8')
#data.date = pd.to_datetime(data.date,format="%d. %B %Y")
#data.shape

In [2]:
# The model is told where the document key and the text live by column
# position, so look both positions up by name.
column_names = list(data.columns)
t = NewsTrends.topicModel(
    data,
    key_idx=column_names.index("uuid"),
    text_idx=column_names.index("text"),
    lang="en",
    random_state=1,
    bigram=False,
)

In [3]:
# Tokenize the documents. `rmv_tokens` presumably lists tokens to drop
# (possessive "'s", "$" and the "mr" honorifics) — confirm against NewsTrends.
tok_dat = t.tokenize_docs(rmv_tokens=["'s",'$',"mr","mr."])
# Build the bag-of-words corpus.
# NOTE(review): `below` / `above` look like document-frequency cut-offs
# (rare / very common terms) — confirm against NewsTrends.get_bow.
corpus = t.get_bow(below = 0.05,above = 0.9)

In [None]:
# Tune the number of topics. `max_k=30` is presumably the largest topic count
# tried and `iterations` the training iterations per candidate — confirm.
# NOTE(review): the meaning of the returned `k` and `results` is defined by
# NewsTrends.LDA_tune_k; neither is used by any other visible cell.
k,results = t.LDA_tune_k(max_k=30,iterations=1000)

In [None]:
#Train topic model
# The first argument (10) is presumably the number of topics; it matches the
# 10 hand-written topic labels defined further down — confirm.
LDA = t.fit_LDA(10,iterations=5000)

In [None]:
#Inspect topic results
topics = t.inspect_topics(n_top_words=15)

# One Markdown bullet per topic: its index followed by its words.
bullet_lines = [
    '* **Topic {}:** {}\n'.format(topic_no, ' '.join(words))
    for topic_no, words in enumerate(topics)
]
result = ''.join(bullet_lines)
NewsTrends.MD(result)

In [None]:
# Hand-written, human-readable labels, one single-string list per entry
# (presumably one per fitted topic, in topic order — confirm).
# NOTE: this rebinds `topics`, replacing the word lists returned by
# t.inspect_topics in the previous cell.
# NOTE(review): the fourth label contains a doubled comma ("bitcoin,,"),
# which looks like a typo and will show up wherever the label is displayed.
topics = [["Trump, new, report"],
          ["Company, new work, help, user"],
          ["business, blockchain, change, create, entrepreneur"],
          ["bitcoin,, police, hacker, steal, criminal"],
          ["facebook, user, data privacy, cambridge analytica"],
          ["fund, bank, invest, cryptocurrency"],
          ["crypto currency exchange, trading, bitcoin, price"],
          ["blockchain technology, platform, smart contracts, token"],
          ["people like, know, think, good thing"],
          ["government, security, attack, Russia, U.S., law, ban"]
          
         ]
# Display the number of labels as a quick sanity check.
len(topics)

In [41]:
#doctopics = t.get_doc_topic(topics)
#t.generate_network(node_attr=[ u'title',u'location'],
 #                 node_label="title", similarity_cutoff = 0.9,
  #               similarity_measure = "correlation")

In [33]:
# Pass the hand-written topic labels to the model, then generate the network.
# NOTE(review): what get_doc_topic returns and how generate_network uses
# `node_attr`, `node_label`, `similarity_cutoff` and `similarity_measure`
# is defined in NewsTrends — confirm there. The listed attribute names are
# columns of the saved result table.
doctopics = t.get_doc_topic(topics)
t.generate_network(node_attr=[ u'country', u'publishedDate',u'fb_likes', u'fb_shares',u'url',u"site_type"],
                  node_label="title", similarity_cutoff = 0.8,
                 similarity_measure = "correlation")

# Visualize

In [14]:
#Plotly
import numpy as np
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from ipywidgets import widgets
from plotly.widgets import GraphWidget

import os

# Plotly credentials are secrets and must not be stored in the notebook.
# When both environment variables are set they are written to the local
# Plotly credentials file; otherwise a previously stored credentials file is
# relied on.
_plotly_username = os.environ.get("PLOTLY_USERNAME")
_plotly_api_key = os.environ.get("PLOTLY_API_KEY")
if _plotly_username and _plotly_api_key:
    plotly.tools.set_credentials_file(username=_plotly_username,
                                      api_key=_plotly_api_key)

<IPython.core.display.Javascript object>

## Network

In [35]:
from IPython.core.display import HTML

# Embed the local network page in the cell output. The text inside the iframe
# tag is the German fallback message ("Network cannot be displayed.").
HTML('<iframe src="01_graphs/network/index.html" width="100%" height="650" seamless>Netzwerk kann nicht angezeigt werden.</iframe>')

In [38]:
from IPython.core.display import HTML

# Embed the second local graph page (Project2) in the cell output. The text
# inside the iframe tag is the German fallback message
# ("Network cannot be displayed.").
HTML('<iframe src="01_graphs/Project2/index.html" width="100%" height="700" seamless>Netzwerk kann nicht angezeigt werden.</iframe>')

In [43]:
# Palette of ten RGB colour strings.
# NOTE(review): `cols` is not referenced by any other visible cell — confirm
# whether it is still needed.
cols = ["rgb(0,203,155)","rgb(72,206,14)","rgb(197,160,0)","rgb(107,155,231)","rgb(67,124,97)","rgb(0,211,255)",
        "rgb(255,136,255)","rgb(152,86,79)","rgb(255,113,49)","rgb(255,102,167)"]

## Topics over time

In [39]:
def timeline(y,date_col,colorby,aggregate="nunique",timeinterval=None):
    """Plot a stacked bar chart of ``y`` per calendar week, split by ``colorby``.

    Works on a copy of ``t.df``, so the model's table is not modified.

    Parameters
    ----------
    y : str
        Column to aggregate within each (week, ``colorby``) group.
    date_col : str
        Column holding the dates. It is reduced to the ISO week number, so
        the same week number from different years ends up in one bar.
    colorby : str
        Column whose distinct values become the stacked series.
    aggregate : str or callable
        Aggregation handed to ``groupby(...).agg``.
    timeinterval : any
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    Whatever ``py.iplot`` returns for the figure (plot name 'stacked-bar').
    """
    plot_dat = t.df.copy()

    dates = pd.to_datetime(plot_dat[date_col])
    if hasattr(dates.dt, "isocalendar"):
        # `Series.dt.week` was removed in pandas 2.0; isocalendar() is its
        # replacement. Convert its nullable integer result back to a plain
        # dtype (int when complete, float with NaN when dates are missing).
        weeks = dates.dt.isocalendar().week.astype("float64")
        if weeks.notna().all():
            weeks = weeks.astype("int64")
    else:
        # Older pandas without isocalendar().
        weeks = dates.dt.week
    plot_dat[date_col] = weeks

    weekly = plot_dat.groupby([date_col,colorby])[y].agg(aggregate).reset_index()

    # One bar trace per distinct `colorby` value.
    traces = []
    for group in weekly[colorby].unique():
        subset = weekly[weekly[colorby] == group]
        traces.append(go.Bar(
            x=subset[date_col],
            y=subset[y],
            name=group
        ))

    layout = go.Layout(
        barmode='stack',
        showlegend = True,
        legend=dict(orientation="h"),
        yaxis = dict(title = y+" ("+aggregate+")"),
        xaxis = dict(title=date_col)
    )

    fig = go.Figure(data=traces, layout=layout)
    return py.iplot(fig, filename='stacked-bar')

timeline(y="uuid", date_col = 'publishedDate',colorby="topic_terms",aggregate="nunique")

## Histogram

In [42]:
def hist(x,colorby="topic_terms",logx = False):
    """Plot stacked histograms of column ``x``, one series per ``colorby`` value.

    Works on a copy of ``t.df``, so the model's table is not modified.

    Parameters
    ----------
    x : str
        Numeric column to bin.
    colorby : str
        Column whose distinct values become the stacked series.
    logx : bool
        When true, the natural logarithm of ``x`` is plotted. Values that
        are zero or negative have no logarithm and are left out.

    Returns
    -------
    Whatever ``py.iplot`` returns for the figure (plot name 'stacked histogram').
    """
    plot_dat = t.df.copy()

    if logx:
        values = plot_dat[x]
        # Mask non-positive values to NaN before taking the log, so no -inf
        # values or runtime warnings are produced.
        plot_dat[x] = np.log(values.where(values > 0))

    # Named `traces` so the notebook-level `data` frame is not shadowed.
    traces = [
        go.Histogram(
            x=plot_dat.loc[plot_dat[colorby]==group, x],
            name=group
        )
        for group in plot_dat[colorby].unique()
    ]

    # "linear" is the valid Plotly name for a plain numeric axis; the data is
    # already log-transformed above, so the axis itself stays linear.
    layout = go.Layout(barmode='stack',legend=dict(orientation="h"),
                       xaxis=dict(title=x,type="linear",autorange=True,
                                  exponentformat="e" if logx else "none"),
                       yaxis=dict(title="count"))
    fig = go.Figure(data=traces, layout=layout)

    return py.iplot(fig, filename='stacked histogram')

hist(x='fb_likes',colorby="topic_terms",logx=True)

## Bar chart

In [45]:
def bar(x,groupby,colorby = None, aggregate = "sum"):
    """Plot a horizontal bar chart of ``x`` aggregated per ``groupby`` value.

    Works on a copy of ``t.df``, so the model's table is not modified.

    Parameters
    ----------
    x : str
        Column to aggregate.
    groupby : str
        Column whose distinct values become the bars.
    colorby : str or None
        Optional column used to split every bar into stacked segments.
        Without it a single, unstyled series is drawn.
    aggregate : str
        Aggregation handed to ``groupby(...).agg``; it is also appended to
        the x-axis title of the stacked chart.

    Returns
    -------
    Whatever ``py.iplot`` returns for the figure (plot name 'bar').
    """
    plot_dat = t.df.copy()

    if colorby is None:
        totals = plot_dat.groupby(groupby)[x].agg(aggregate)
        traces = [
            go.Bar(
                y=totals.index.values,
                x=totals.values,
                orientation='h')
        ]
        fig = go.Figure(data=traces)
    else:
        totals = plot_dat.groupby([colorby,groupby])[x].agg(aggregate)

        # One trace per value of the outer (`colorby`) index level.
        traces = []
        for group in totals.index.levels[0]:
            segment = totals[group]
            traces.append(go.Bar(
                y=segment.index.values,
                x=segment.values,
                name=group,
                orientation='h'
            ))

        layout = go.Layout(
            barmode='stack',
            showlegend = True,
            xaxis = dict(title=x+" ("+aggregate+")"),
            legend=dict(orientation="h"),
            # Wide left margin leaves room for long category labels. A plain
            # dict replaces the deprecated go.Margin class.
            margin = dict(l=330,r=50)
        )

        fig = go.Figure(data=traces, layout=layout)

    return py.iplot(fig, filename='bar')

bar(x="uuid",groupby="topic_terms",colorby="country",aggregate="nunique")

## Scatter plot

In [47]:
def scatter(x,y,groupby,colorby = None,sizeby = "fixed", aggregate = {"x":"sum","y":"sum","colorby":"sum"},
            axistype = {"x":"normal","y":"normal"},
            xtitle="", ytitle=""):
    """Plot a scatter chart with one marker per ``groupby`` value.

    Works on a copy of ``t.df``, so the model's table is not modified.

    Parameters
    ----------
    x, y : str
        Columns aggregated to give each marker's coordinates.
    groupby : str
        Column whose distinct values become the markers.
    colorby : str or None
        Column used for colouring; defaults to ``groupby``. A non-numeric
        column yields one legend entry per value, a numeric column is
        aggregated per marker and mapped to a colour scale.
    sizeby : str
        Only ``"fixed"`` (constant marker size) is implemented.
    aggregate : dict
        Aggregations for the keys ``"x"``, ``"y"`` and ``"colorby"``
        (the latter is only read for a numeric ``colorby``). Not modified.
    axistype : dict
        Axis types for the keys ``"x"`` and ``"y"``. ``"normal"`` is
        accepted as an alias of Plotly's ``"linear"``. Not modified.
    xtitle, ytitle : str
        Axis titles.

    Returns
    -------
    Whatever ``py.iplot`` returns for the figure (plot name 'scatter').

    Raises
    ------
    ValueError
        If ``sizeby`` is anything other than ``"fixed"``.
    """
    if sizeby != "fixed":
        # The marker size used to be left undefined in this case, which
        # surfaced later as an obscure NameError.
        raise ValueError('sizeby=%r is not supported; only "fixed" is implemented' % (sizeby,))
    marker_size = 15

    def _axis(title, kind):
        # "normal" is not a Plotly axis type; translate it to "linear".
        kind = "linear" if kind == "normal" else kind
        return {"title": title, "type": kind,
                "exponentformat": "e" if kind == "log" else "none"}

    xaxis = _axis(xtitle, axistype["x"])
    yaxis = _axis(ytitle, axistype["y"])

    plot_dat = t.df.copy()
    if colorby is None:
        colorby = groupby

    if not pd.api.types.is_numeric_dtype(plot_dat[colorby]):
        # Categorical colouring: one trace (and legend entry) per value.
        grouped = plot_dat.groupby([colorby,groupby])
        plot_datx = grouped[x].agg(aggregate["x"])
        plot_daty = grouped[y].agg(aggregate["y"])

        dat = []
        for cat in plot_dat[colorby].unique():
            dat.append({"x":plot_datx[cat].values,
                        "y":plot_daty[cat].values,
                        "text":plot_daty[cat].index,
                        "name":cat,
                        "mode":"markers",
                        "marker":dict(size=marker_size)})
        fig = {
            'data': dat,
            'layout': {"legend":dict(orientation="h"),"showlegend": True,
                       'xaxis': xaxis,
                       'yaxis': yaxis}
        }
    else:
        # Numeric colouring: a single trace coloured on a continuous scale.
        grouped = plot_dat.groupby(groupby)
        plot_datx = grouped[x].agg(aggregate["x"])
        plot_daty = grouped[y].agg(aggregate["y"])
        plot_datcol = grouped[colorby].agg(aggregate["colorby"])

        trace1 = go.Scatter(
            y = plot_daty.values,
            x = plot_datx.values,
            mode='markers',
            marker=dict(
                size=marker_size,
                color = plot_datcol.values, #set color equal to a variable
                colorscale='Viridis',
                showscale=True
            )
        )
        layout = dict(xaxis=xaxis, yaxis=yaxis)
        fig = go.Figure(data=[trace1],layout = layout)

    # Own plot name, so the scatter no longer shares 'bar' with the bar chart.
    return py.iplot(fig, filename='scatter')

scatter(x="uuid",y='fb_likes',groupby="topic_terms",colorby = "topic_terms",
        sizeby = "fixed",xtitle="Publish-Count",ytitle="fb-likes",
        axistype=dict(x="normal",y="normal"),
        aggregate = {"x":"nunique","y":"sum","colorby":"sum"})

In [48]:
from IPython.display import HTML

# Inject a small script plus a link into the notebook output. The script
# defines code_toggle(), which hides or shows every 'div.input' element, and
# runs it once when the document is ready, so the code starts out hidden.
# NOTE(review): the script uses `$`, so it assumes jQuery is available on the
# rendered page — confirm for the target viewer.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')