# Query news and blog data

In [1]:
import numpy as np

#Get all country codes
import pycountry
# Gather the ISO alpha-2 code and the display name of every pycountry entry.
entries = list(pycountry.countries)
iso = np.array([entry.alpha_2 for entry in entries])
name = np.array([entry.name for entry in entries])

# Reorder both arrays alphabetically by country name (one shared ordering).
name_order = np.argsort(name)
iso = iso[name_order]
name = name[name_order]

# (name, code) pairs, in alphabetical order of the name.
countr = list(zip(name, iso))

In [None]:
from ipywidgets import Layout, Box, FloatText, Textarea, Dropdown, Label, Checkbox, SelectMultiple, Text, interact

import os

# Shared flex layout for every "label + input" row of the form.
form_item_layout = Layout(
    display='flex',
    flex_flow='row',
    justify_content='space-between',
    align_content = "flex-start",
    width = "auto"
)

# NOTE: the order of the rows matters; the cells that build the query read the
# widget values by position.
# The API token is a secret and must not be committed with the notebook: it is
# pre-filled from the WEBHOSE_TOKEN environment variable (empty if unset) and
# can still be typed into the field by hand.
form_items = [
    Box([Label(value='Webhose Token',layout=Layout(width="auto")), Textarea(value = os.environ.get("WEBHOSE_TOKEN", ""))], layout=form_item_layout),
    Box([Label(value='Main query (in brackets)',layout=Layout(width="auto")), Textarea()], layout=form_item_layout),
    Box([Label(value='Language',layout=Layout(width="auto")),
         Dropdown(options=['english',"german"])], layout=form_item_layout),
    Box([Label(value='Is first (Exclude comments)',layout=Layout(width="auto")),
         Checkbox(value=True)], layout=form_item_layout),
    Box([Label(value='Performance score threshold (greater than x)',layout=Layout(width="auto")),
         FloatText(value = 1)], layout=form_item_layout),
    Box([Label(value='Domain Rank (Top x sites by monthly traffic)',layout=Layout(width="auto")),
         FloatText(value = 10000)], layout=form_item_layout),
    Box([Label(value='Published X days ago',layout=Layout(width="auto")),
         FloatText(value = 30)], layout=form_item_layout),
    Box([Label(value='Site type(s) (in brackets)',layout=Layout(width="auto")),
         Textarea(value='(site_type:news OR site_type:blogs)')], layout=form_item_layout),
    Box([Label(value='Sort by',layout=Layout(width="auto")),
         Dropdown(options={'Crawl Date': "crawled", 'Relevancy': "relevancy", 'Published': "published"})], layout=form_item_layout),
    Box([Label(value='All countries',layout=Layout(width="auto")),
         Checkbox(value=False)], layout=form_item_layout),
    Box([Label(value='Country'),
         SelectMultiple(options=countr,value = ["GB"],layout=Layout(display="flex", flex_flow='column'))], layout=form_item_layout),
    Box([Label(value='Other (thread.title or organization in brackets)',layout=Layout(width="auto")),
         Textarea()], layout=form_item_layout)
]

# Stack the rows vertically inside a bordered container.
form = Box(form_items, layout=Layout(
    flex = "flex-basis",
    flex_flow='column',
    border='solid 1px',
    width='60%',
    height = "auto"
))
form

In [None]:
#Queries
"""
(electric car OR electric cars OR autonomous driving OR autonomous car OR connected car)

((sport OR luxury) AND future AND (car sharing OR car rental OR car ownership OR car club OR car purchase OR car lease))

((sport OR luxury) AND future AND (car OR vehicle) AND (rent OR rental OR lease OR leasing OR own OR buy OR purchase OR club OR subscription OR share OR sharing))

"""

In [None]:
# Set up query with form inputs
# Every form row is a Box of [Label, input widget]; collect the input values
# in row order so they can be addressed by position below.
params = [el.children[1].value for el in form.children]

#Transform multi select countries
if params[-3]:
    # "All countries" is ticked: do not restrict the query by country.
    params[-2] = ""
else:
    repl = ["thread.country:" + c for c in params[-2]]
    if len(repl) > 1:
        # Several countries are OR-ed together inside brackets.
        params[-2] = "(" + " OR ".join(repl) + ")"
    else:
        # A single country needs no brackets. An empty selection adds no
        # country filter instead of failing with an IndexError.
        params[-2] = repl[0] if repl else ""

In [None]:
#Build and run query
import webhoseio
import time
import datetime

webhoseio.config(token=params[0])

# Filter expression assembled from the form values (indices follow the order
# of the form rows).
query_text = "".join([
    params[1],
    " language:", params[2],
    " is_first:", str(params[3]).lower(),
    " performance_score:>", str(int(params[4])),
    " domain_rank:<", str(int(params[5])),
    " ", params[7],
    " ", params[10],
    " ", params[11],
])

# Start of the time window: now minus params[6] days, as a local-time epoch
# timestamp in milliseconds.
window_start = datetime.datetime.now() + datetime.timedelta(-params[6])
window_start_ms = int(time.mktime(window_start.timetuple()) * 1000)

query_params = {
    "q": query_text,
    "ts": str(window_start_ms),
    "sort": params[8],
}

output = webhoseio.query("filterWebContent", query_params)
print("Total number of documents: ",output['totalResults'])

In [None]:
import pandas as pd

def get_list(obj,key):
    """Return the value stored under ``key`` in each element of ``obj``, in order."""
    return [item[key] for item in obj]

# One flat record per post, accumulated over all result pages.
records = []

#Get all results to pandas dataframe
# Keep paging until a response comes back without posts.
while len(output["posts"]) != 0:
    for post in output["posts"]:
        entities = post["entities"]
        thread = post["thread"]
        social = thread["social"]
        records.append({
            "entity_persons": get_list(entities["persons"], "name"),
            "entity_organizations": get_list(entities["organizations"], "name"),
            "entity_locations": get_list(entities["locations"], "name"),
            "uuid": post["uuid"],
            "author": post["author"],
            "url": post["url"],
            "language": post["language"],
            "title": post["title"],
            "highlightText": post["highlightText"],
            "text": post["text"],
            "publishedDate": post["published"],
            "crawledDate": post["crawled"],
            "fb_likes": social["facebook"]["likes"],
            "fb_shares": social["facebook"]["shares"],
            "linkedin_shares": social["linkedin"]["shares"],
            "gplus_shares": social["gplus"]["shares"],
            "source": thread["site_full"],
            "country": thread["country"],
            "performance_score": thread["performance_score"],
            "participants_counts": thread["participants_count"],
            "site_type": thread["site_type"],
        })

    #Get next batch of data
    # 30 s pause between requests (presumably to stay within API rate limits).
    time.sleep(30)
    output = webhoseio.get_next()

data = pd.DataFrame.from_dict(records)

# Reduce both timestamps to plain calendar dates.
for date_col in ("crawledDate", "publishedDate"):
    data[date_col] = pd.to_datetime(data[date_col]).dt.date

#Save data
data.to_csv("crypto.csv",header=True, index=False, encoding='utf-8',sep=";")

In [136]:
# Show the distinct values of the "source" column.
data["source"].unique()

array(['www.bloomberg.com', 'www.independent.co.uk',
       'www.theguardian.com', 'www.marketwatch.com',
       'www.rollingstone.com', 'www.businessinsider.com',
       'economictimes.indiatimes.com', 'www.forbes.com',
       'www.theatlantic.com', 'www.theverge.com', 'medium.com',
       'www.latimes.com', 'timesofindia.indiatimes.com', 'www.wired.com',
       'www.thedailybeast.com', 'www.reuters.com', 'www.wsj.com',
       'www.moneycontrol.com', 'news.abs-cbn.com', 'www.rt.com',
       'www.buzzfeed.com', 'www.cnet.com', 'www.bbc.com',
       'uk.businessinsider.com', 'www.msn.com', 'www.cnbc.com',
       'www.huffingtonpost.com', 'newsinfo.inquirer.net',
       'gadgets.ndtv.com', 'www.thehindu.com', 'business.inquirer.net',
       'www.europol.europa.eu', 'www.washingtonpost.com',
       'sg.news.yahoo.com', 'finance.yahoo.com', 'www.investing.com',
       'www.pcmag.com', 'abcnews.go.com', 'au.finance.yahoo.com',
       'www.reddit.com', 'www.foxnews.com', 'www.telegraph.co.uk

In [None]:
# Show the column names of the collected data frame.
data.columns

# Run topic model

In [1]:
import NewsTrends
import pandas as pd
import locale

#locale.setlocale(locale.LC_ALL, 'deu_deu')
%matplotlib inline

# Reload the saved articles; separator and encoding match the to_csv call
# that wrote "crypto.csv".
data = pd.read_csv("crypto.csv",sep=";",encoding='utf-8')
# Disabled leftovers kept for reference:
#data.date = pd.to_datetime(data.date,format="%d. %B %Y")
#data.shape



In [2]:
# Look up the id and text columns by name, so their position in the CSV does
# not matter.
column_names = list(data.columns)
t = NewsTrends.topicModel(
    data,
    key_idx=column_names.index("uuid"),
    text_idx=column_names.index("text"),
    lang="en",
    random_state=1,
    bigram=True,
)

Train bigram model ...
mid april income_tax filing deadline contribute crypto fall price investor win big bet digital_token sell remain holding pay $_25 billion capital_gain levy tom_lee head_research fundstrat_global advisors write note thursday

bitcoin drop 40_percent past_month include 1.2 percent decline thursday $ 6,750 new_york close

massive outflow crypto dollar

lee write



In [3]:
# Tokenize the documents (a few tokens are passed via rmv_tokens), then build
# the bag-of-words corpus.
# NOTE(review): `below` / `above` look like document-frequency cut-offs —
# confirm against the NewsTrends implementation.
tok_dat = t.tokenize_docs(rmv_tokens=["'s",'$',"mr","mr."])
corpus = t.get_bow(below = 0.05,above = 0.9)

In [4]:
#k,results = t.LDA_tune_k(max_k=30,iterations=1000)

In [5]:
#Train topic model
# Fit LDA with 12 topics and 5000 iterations.
# NOTE(review): the topic count presumably has to equal the number of manual
# topic labels passed to get_doc_topic later — confirm.
LDA = t.fit_LDA(12,iterations=5000)

INFO:lda:n_documents: 461
INFO:lda:vocab_size: 947
INFO:lda:n_words: 91367
INFO:lda:n_topics: 12
INFO:lda:n_iter: 5000
INFO:lda:<0> log likelihood: -882722
INFO:lda:<10> log likelihood: -682248
INFO:lda:<20> log likelihood: -657447
INFO:lda:<30> log likelihood: -648925
INFO:lda:<40> log likelihood: -644385
INFO:lda:<50> log likelihood: -640942
INFO:lda:<60> log likelihood: -639500
INFO:lda:<70> log likelihood: -637588
INFO:lda:<80> log likelihood: -637519
INFO:lda:<90> log likelihood: -636031
INFO:lda:<100> log likelihood: -635909
INFO:lda:<110> log likelihood: -635001
INFO:lda:<120> log likelihood: -634224
INFO:lda:<130> log likelihood: -634237
INFO:lda:<140> log likelihood: -634315
INFO:lda:<150> log likelihood: -634488
INFO:lda:<160> log likelihood: -633643
INFO:lda:<170> log likelihood: -633720
INFO:lda:<180> log likelihood: -633399
INFO:lda:<190> log likelihood: -633447
INFO:lda:<200> log likelihood: -633511
INFO:lda:<210> log likelihood: -633533
INFO:lda:<220> log likelihood: -63

INFO:lda:<2040> log likelihood: -628519
INFO:lda:<2050> log likelihood: -628459
INFO:lda:<2060> log likelihood: -628544
INFO:lda:<2070> log likelihood: -628270
INFO:lda:<2080> log likelihood: -628425
INFO:lda:<2090> log likelihood: -628340
INFO:lda:<2100> log likelihood: -628557
INFO:lda:<2110> log likelihood: -629367
INFO:lda:<2120> log likelihood: -627756
INFO:lda:<2130> log likelihood: -627714
INFO:lda:<2140> log likelihood: -627656
INFO:lda:<2150> log likelihood: -627478
INFO:lda:<2160> log likelihood: -627473
INFO:lda:<2170> log likelihood: -626669
INFO:lda:<2180> log likelihood: -626617
INFO:lda:<2190> log likelihood: -626879
INFO:lda:<2200> log likelihood: -626706
INFO:lda:<2210> log likelihood: -627158
INFO:lda:<2220> log likelihood: -627085
INFO:lda:<2230> log likelihood: -627281
INFO:lda:<2240> log likelihood: -626679
INFO:lda:<2250> log likelihood: -627400
INFO:lda:<2260> log likelihood: -627615
INFO:lda:<2270> log likelihood: -627224
INFO:lda:<2280> log likelihood: -627243


INFO:lda:<4080> log likelihood: -627120
INFO:lda:<4090> log likelihood: -626927
INFO:lda:<4100> log likelihood: -627177
INFO:lda:<4110> log likelihood: -626814
INFO:lda:<4120> log likelihood: -627135
INFO:lda:<4130> log likelihood: -627048
INFO:lda:<4140> log likelihood: -626931
INFO:lda:<4150> log likelihood: -627114
INFO:lda:<4160> log likelihood: -627418
INFO:lda:<4170> log likelihood: -627984
INFO:lda:<4180> log likelihood: -627542
INFO:lda:<4190> log likelihood: -627529
INFO:lda:<4200> log likelihood: -627589
INFO:lda:<4210> log likelihood: -627462
INFO:lda:<4220> log likelihood: -627175
INFO:lda:<4230> log likelihood: -627497
INFO:lda:<4240> log likelihood: -627086
INFO:lda:<4250> log likelihood: -627196
INFO:lda:<4260> log likelihood: -626940
INFO:lda:<4270> log likelihood: -627375
INFO:lda:<4280> log likelihood: -626991
INFO:lda:<4290> log likelihood: -626911
INFO:lda:<4300> log likelihood: -626633
INFO:lda:<4310> log likelihood: -626985
INFO:lda:<4320> log likelihood: -627414


In [6]:
#Inspect topic results
# Render the top words of every topic as one Markdown bullet per topic.
topics = t.inspect_topics(n_top_words=10)

bullet_lines = [
    '* **Topic {}:** {}\n'.format(topic_no, ' '.join(words))
    for topic_no, words in enumerate(topics)
]
result = ''.join(bullet_lines)
NewsTrends.MD(result)

In [7]:
# Hand-written labels, one single-element list per topic.
# NOTE(review): presumably index-aligned with the topics returned by
# inspect_topics (label i describes topic i) — confirm before editing the order.
topics = [["People like"],
          ["Company services & payments"],
          ["Companies use blockchain"],
          ["Trump New York"],
          ["Websites and cryptocurrency"],
          ["Russian & security"],
          ["Hacker steal bitcoin"],
          ["India and China"],
          ["Facebook and Cambridge Analytica"],
          ["Crypto exchanges and banks"],
          ["System, network and transactions"],
          ["bitcoin price, investors and returns"]
          
         ]
# Sanity check: number of labels.
len(topics)

12

In [8]:
# Pass the manual topic labels to get_doc_topic, then build the document
# network with the listed columns as node attributes and the title as label.
# NOTE(review): presumably documents are linked when the correlation of their
# topic profiles exceeds similarity_cutoff — confirm in NewsTrends.
doctopics = t.get_doc_topic(topics)
t.generate_network(node_attr=[ u'country', u'publishedDate',u'fb_likes', u'fb_shares',u'url',u"site_type","source"],
                  node_label="title", similarity_cutoff = 0.8,
                 similarity_measure = "correlation")

In [9]:
#doctopics = t.get_doc_topic(topics)
#t.generate_network(node_attr=[ u'title',u'location'],
 #                 node_label="title", similarity_cutoff = 0.9,
  #               similarity_measure = "correlation")

# Visualize

## Executive Summary

* By using intelligent text processing and learning directly from the textual context, the following analysis finds meaningful connections and relationships in a large corpus of unstructured data.
* The analysis of news articles related to cryptocurrencies and blockchain reveals a clearly optimistic narrative.
* A large share of stories is concerned with the development of the *bitcoin* price and cryptocurrency exchanges.
* A smaller share of news is concerned with blockchain technology in relation to Russia, hacker attacks and businesses.

## Data Summary

In [10]:
import pprint
# Headline figures: number of stories and the span of publication dates.
n_stories = data.shape[0]
first_day = data.publishedDate.min()
last_day = data.publishedDate.max()

print("The data set contains {} stories.".format(n_stories), end="\n\n")
print("The date range is {} to {}".format(first_day, last_day), end="\n\n")
print("Top sources include:", end="\n\n")

# Five sources with the most stories, numbered from 1.
stories_per_source = data.groupby("source")["uuid"].agg("count").sort_values(ascending=False)
top_sources = stories_per_source[0:5].keys()
for rank, source_name in enumerate(top_sources, 1):
    print('{} {}'.format(rank, source_name))

The data set contains 461 stories.

The date range is 2018-04-03 to 2018-05-04

Top sources include:

1 www.independent.co.uk
2 www.cnbc.com
3 www.bloomberg.com
4 www.forbes.com
5 medium.com


In [19]:
#Plotly
import numpy as np
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from ipywidgets import widgets
from plotly.widgets import GraphWidget

import os

plotly.offline.init_notebook_mode(connected=True)

# Account credentials are secrets and must not be committed with the notebook.
# They are read from the PLOTLY_USERNAME / PLOTLY_API_KEY environment
# variables; when either is missing the credentials file is left untouched, so
# an already configured plotly installation keeps working.
_plotly_username = os.environ.get("PLOTLY_USERNAME")
_plotly_api_key = os.environ.get("PLOTLY_API_KEY")
if _plotly_username and _plotly_api_key:
    plotly.tools.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)

## Document Analysis

*Note: If the following plots are not displayed nicely, hit refresh and wait until fully loaded.*

In [25]:
import os
from shutil import copyfile

# Move templates
# Copy the static page skeleton (HTML, CSS, JS) from the template folder into
# the folder of the network graph.
source = "01_Vorlage/"
dst = "graphs/network/"

for f in ["index.html","css/style.css","js/main.js"]:
    # copyfile does not create missing folders, so make sure the target
    # directory (including css/ and js/) exists first.
    os.makedirs(os.path.dirname(dst+f), exist_ok=True)
    copyfile(source+f, dst+f)


### The fairly spread out network reveals a diverse discussion of crypto tech with an optimistic narrative at the core ([HowTo](http://nbviewer.jupyter.org/github/bockjo/Udacity_portfolio/blob/master/networks_how_to.png)).

In [28]:
from IPython.core.display import HTML

# Embed the network page in the notebook via an iframe.
# NOTE(review): the templates are copied to "graphs/network/" but this iframe
# loads "graph/network/index.html" — confirm which folder name is correct.
HTML('<iframe src="graph/network/index.html" width="90%" height="520" seamless>Netzwerk kann nicht angezeigt werden.</iframe>')

## Topics over time

### Especially in recent days the impact of blockchain tech on businesses has gained attention.

In [14]:
# Stacked bar chart of an aggregated column per calendar week.
def timeline(y,date_col,colorby,aggregate="nunique",timeinterval=None):
    """Plot ``y`` aggregated per week and per ``colorby`` group as stacked bars.

    Works on a copy of ``t.df``. ``date_col`` is parsed as a date and reduced
    to its week number, ``y`` is aggregated with ``aggregate`` for every
    (week, ``colorby``) pair, and one bar trace is drawn per ``colorby`` value.
    ``timeinterval`` is accepted but not used. Returns the result of
    ``py.iplot`` for the figure, stored under the name 'stacked-bar'.
    """
    frame = t.df.copy()
    # NOTE(review): Series.dt.week is deprecated in newer pandas releases
    # (dt.isocalendar().week is the replacement) — revisit when upgrading.
    frame[date_col] = pd.to_datetime(frame[date_col]).dt.week

    weekly = frame.groupby([date_col, colorby])[y].agg(aggregate).reset_index()

    def _bar_for(group):
        # Rows of a single colour group become one bar trace.
        rows = weekly[weekly[colorby] == group]
        return go.Bar(x=rows[date_col], y=rows[y], name=group)

    traces = [_bar_for(group) for group in weekly[colorby].unique()]

    layout = go.Layout(
        autosize=True,
        barmode='stack',
        showlegend=True,
        legend=dict(orientation="h", x=0.1, y=1.5),
        yaxis=dict(title=y + " (" + aggregate + ")"),
        xaxis=dict(title="Week number"),
    )

    return py.iplot(go.Figure(data=traces, layout=layout), filename='stacked-bar')

timeline(y="uuid", date_col = 'publishedDate',colorby="topic_terms",aggregate="nunique")

## Histogram

### Especially stories of hackers exploiting bitcoin's anonymity to blackmail their victims were shared heavily on Facebook.

In [15]:
# Stacked histogram of one column, split by group.
def hist(x,colorby="topic_terms",logx = False):
    """Plot a stacked histogram of column ``x`` with one trace per ``colorby`` value.

    Works on a copy of ``t.df``. With ``logx`` the column is replaced by its
    natural logarithm before binning and the x axis uses exponent format "e".
    Returns the result of ``py.iplot`` for the figure, stored under the name
    'stacked histogram'.
    """
    frame = t.df.copy()

    if logx:
        # NOTE(review): zeros in the column become -inf under np.log.
        frame[x] = np.log(frame[x])

    traces = [
        go.Histogram(x=frame.loc[frame[colorby] == group, x], name=group)
        for group in frame[colorby].unique()
    ]

    x_axis = dict(
        title=x,
        type="normal",
        autorange=True,
        exponentformat="e" if logx else "none",
    )
    layout = go.Layout(
        barmode='stack',
        legend=dict(orientation="h", x=0.1, y=1.5),
        autosize=True,
        xaxis=x_axis,
        yaxis=dict(title="count"),
    )

    return py.iplot(go.Figure(data=traces, layout=layout), filename='stacked histogram')

hist(x='fb_likes',colorby="topic_terms",logx=True)

## Bar Chart

### Number of unique stories per topic, broken down by country.

In [16]:
def bar(x,groupby,colorby = None, aggregate = "sum"):
    """Plot horizontal bars of ``x`` aggregated per ``groupby`` value.

    Works on a copy of ``t.df``. Without ``colorby`` a single bar series with
    default layout is drawn. With ``colorby`` one stacked series per
    ``colorby`` value is drawn in a fixed 800x550 layout with a wide left
    margin. Returns the result of ``py.iplot`` for the figure, stored under
    the name 'bar'.
    """
    frame = t.df.copy()

    if colorby is not None:
        # Outer index level = colour group, inner level = bar category.
        grouped = frame.groupby([colorby, groupby])[x].agg(aggregate)
        traces = [
            go.Bar(
                y=grouped[level].index.values,
                x=grouped[level].values,
                name=level,
                orientation='h',
            )
            for level in grouped.index.levels[0]
        ]
        layout = go.Layout(
            autosize=False,
            width=800,
            height=550,
            barmode='stack',
            showlegend=True,
            xaxis=dict(title=x + " (" + aggregate + ")", position=-1.),
            yaxis=dict(),
            legend=dict(orientation="h", x=0.1, y=1.2),
            margin=go.Margin(l=330, r=50),
        )
        fig = go.Figure(data=traces, layout=layout)
    else:
        totals = frame.groupby(groupby)[x].agg(aggregate)
        single_series = go.Bar(
            y=totals.index.values,
            x=totals.values,
            orientation='h',
        )
        fig = go.Figure(data=[single_series])

    return py.iplot(fig, filename='bar')

bar(x="uuid",groupby="topic_terms",colorby="country",aggregate="nunique")

## Scatter plot

### There is a linear relationship between the number of articles published and the popularity of the topic on social media.

In [17]:
def scatter(x,y,groupby,colorby = None,sizeby = "fixed", aggregate = {"x":"sum","y":"sum","colorby":"sum"},
            axistype = {"x":"normal","y":"normal"},
            xtitle="", ytitle=""):
    """Plot aggregated ``x`` against aggregated ``y``, one marker per ``groupby`` value.

    Works on a copy of ``t.df``.

    Parameters:
        x, y: columns aggregated with ``aggregate["x"]`` / ``aggregate["y"]``.
        groupby: column whose values define the markers.
        colorby: column used for colouring; defaults to ``groupby``. An
            object-dtype column gives one trace (legend entry) per value; any
            other dtype is aggregated with ``aggregate["colorby"]`` and mapped
            to a Viridis colour scale.
        sizeby: only "fixed" (marker size 15) is implemented; any other value
            currently falls back to the same fixed size.
        aggregate: aggregation names for "x", "y" and "colorby".
        axistype: axis types for "x" and "y"; "log" switches the exponent
            format of that axis to "e".
        xtitle, ytitle: axis titles.

    The dict defaults are only read, never mutated, so sharing them between
    calls is harmless.

    Returns the result of ``py.iplot`` for the figure.
    """
    plot_dat = t.df.copy()

    # Always define the marker size: previously any sizeby other than "fixed"
    # left it unbound and the function failed when building the markers.
    s = 15

    if colorby is None:
        colorby = groupby

    def _axis(title, kind):
        # Axis settings shared by both branches.
        return {"title": title, "type": kind,
                "exponentformat": "e" if kind == "log" else "none"}

    # The builtin `object` replaces the `np.object` alias, which no longer
    # exists in current NumPy releases.
    if plot_dat[colorby].dtype == object:
        plot_datx = plot_dat.groupby([colorby,groupby])[x].agg(aggregate["x"])
        plot_daty = plot_dat.groupby([colorby,groupby])[y].agg(aggregate["y"])

        # One marker trace per category of the colour column.
        dat = []
        for cat in plot_dat[colorby].unique():
            dat.append({"x":plot_datx[cat].values,
                        "y":plot_daty[cat].values,
                        "text":plot_daty[cat].index,
                        "name":cat,
                        "mode":"markers",
                        "marker":dict(size=s)})
        fig = {
            'data': dat,
            'layout': {"legend":dict(orientation="h",x=0.1,y=1.5),"showlegend": True,
                'xaxis': _axis(xtitle, axistype["x"]),
                'yaxis': _axis(ytitle, axistype["y"]),
                "autosize":True
            }
        }
    else:
        plot_datx = plot_dat.groupby(groupby)[x].agg(aggregate["x"])
        plot_daty = plot_dat.groupby(groupby)[y].agg(aggregate["y"])
        plot_datcol = plot_dat.groupby(groupby)[colorby].agg(aggregate["colorby"])

        trace1 = go.Scatter(
            y = plot_daty.values,
            x = plot_datx.values,
            mode='markers',
            marker=dict(
                size=s,
                color = plot_datcol.values, #set color equal to a variable
                colorscale='Viridis',
                showscale=True
            )
        )
        # `dic(...)` was a typo for `dict(...)` and raised a NameError here.
        layout = dict(xaxis=_axis(xtitle, axistype["x"]),
                      yaxis=_axis(ytitle, axistype["y"]))
        fig = go.Figure(data=[trace1],layout = layout)

    # Own file name: reusing 'bar' replaced the figure uploaded by bar().
    return py.iplot(fig, filename='scatter')

scatter(x="uuid",y='fb_likes',groupby="topic_terms",colorby = "topic_terms",
        sizeby = "fixed",xtitle="Publish-Count",ytitle="fb-likes",
        axistype=dict(x="normal",y="normal"),
        aggregate = {"x":"nunique","y":"sum","colorby":"sum"})

In [18]:
# jupyter nbconvert CryptoNews7.ipynb --to slides --template temp --post serve
# http://nbviewer.jupyter.org/format/slides/github/bockjo/Udacity_portfolio/blob/master/filename.ipynb