In [2]:
from pytrends.request import TrendReq
from bs4 import BeautifulSoup
import urllib

import numpy as np
import pandas as pd

import time
import os

# Shared pytrends client used by the exploratory cells right below
# (the helper functions defined later create their own clients).
pytrends = TrendReq(hl='en-US', tz=360)

# Petite exploration du package pytrends

In [3]:
# Search terms compared in the exploratory Google Trends request below.
bitcoin_keywords = ['blockchain', 'bitcoin', 'btc', 'cryptocurrency', 'hodl']

In [4]:
# Prepare a three-month request (2017-05-01 to 2017-08-01) for all keywords at once.
# NOTE(review): `a` is not referenced again in the visible notebook; the next cell
# reads the data back from `pytrends` itself.
a = pytrends.build_payload(bitcoin_keywords, cat=0, timeframe='2017-05-01 2017-08-01', geo='', gprop='')

In [5]:
pytrends.interest_over_time()

Unnamed: 0_level_0,blockchain,bitcoin,btc,cryptocurrency,hodl,isPartial
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-05-01,3,23,8,2,0,False
2017-05-02,4,26,8,1,0,False
2017-05-03,4,26,8,1,0,False
2017-05-04,4,33,10,2,0,False
2017-05-05,4,31,10,2,0,False
2017-05-06,3,25,9,2,0,False
2017-05-07,3,25,8,2,0,False
2017-05-08,4,29,10,2,0,False
2017-05-09,5,37,11,2,0,False
2017-05-10,5,34,10,2,0,False


# Obtenir des chiffres pertinents sur l'année
Par défaut, sur Google Trends, lorsque l'on fait une recherche sur un mot-clé sur une période d'un an, la granularité est hebdomadaire (et il est impossible d'en changer).

On écrit donc quelques fonctions pour « tricher » un peu : pour un mot-clé donné, on prend deux périodes plus courtes (sept à huit mois) qui se chevauchent, puis on remet la deuxième période à l'échelle de la première en se servant de la période de chevauchement intermédiaire.

In [6]:
def google_trends_tables(keyword, timeframe_before = '2017-05-01 2018-01-01', timeframe_after = '2017-10-01 2018-05-01'):
    """Fetch two overlapping Google Trends tables for a single keyword.

    Google Trends does not offer daily granularity over a full year, so the
    year is requested as two shorter windows that overlap. With the default
    windows the overlap runs from 2017-10-01 to 2018-01-01; it is used
    elsewhere in this notebook to compute an average scale that is applied to
    the second window. The two windows do not return exactly the same values
    on the overlap (presumably because of sampling on Google's side), but the
    result is close enough.

    Parameters
    ----------
    keyword : str
        The search term to query.
    timeframe_before : str, optional
        First window, formatted as 'YYYY-MM-DD YYYY-MM-DD'.
    timeframe_after : str, optional
        Second window, same format. It should overlap the first one.
        Note that `trends_merge` skips a fixed number of overlap rows
        (`min_length`, 93 by default), so custom windows need a matching
        `min_length` downstream.

    Returns
    -------
    tuple
        `(table_before, table_after)`, the two tables returned by
        `interest_over_time()` for each window.
    """

    keywords = [keyword] # build_payload expects an iterable of keywords (hence the list)

    tables = []

    for timeframe in (timeframe_before, timeframe_after):
        # One independent client per window, as in the original implementation
        client = TrendReq(hl = 'en-US', tz = 0)
        client.build_payload(keywords, cat = 0, timeframe = timeframe, geo = '', gprop = '')
        tables.append(client.interest_over_time())

    return (tables[0], tables[1])

def scale_generator(table_before, table_after):
    """Return the average ratio between two tables on their common dates.

    Both tables are expected to have exactly two columns (the value column
    followed by a second column such as `isPartial`), because the merged
    result is relabelled with four positional names.

    Rows where the second table's value is 0 are left out of the average:
    dividing by zero would yield `inf` (or `NaN` for 0/0) and make the mean
    scale unusable for the caller.

    Parameters
    ----------
    table_before, table_after : pandas.DataFrame
        Two-column tables indexed by date.

    Returns
    -------
    float
        Mean of `before / after` over the shared index, or `NaN` when no
        usable common row exists.
    """

    # Inner join on the index keeps only the dates present in both tables
    common_table = table_before.merge(table_after, how = "inner", right_index = True, left_index = True)
    common_table.columns = ['before_value', 'filler1', 'after_value', 'filler2']

    # Ignore rows that would divide by zero
    usable = common_table[common_table.after_value != 0]

    return (usable.before_value / usable.after_value).mean()
    
    
def google_trends_common(keyword):
    """Build one table holding both windows for `keyword`, on a common scale.

    The two tables from `google_trends_tables` are relabelled with the
    keyword, the second one is multiplied by the average scale computed on
    the overlapping dates, and both are outer-joined on the date index.

    Parameters
    ----------
    keyword : str
        The search term to query.

    Returns
    -------
    pandas.DataFrame
        The outer merge of both tables; because both share the same column
        names, the merged columns carry pandas' `_x` / `_y` suffixes.
    """

    table_before, table_after = google_trends_tables(keyword)

    # Rename the columns using just the keyword
    table_before.columns = [keyword, "isPartial"]
    table_after.columns = [keyword, "isPartial"]

    # The scale is the same for every row: compute it once instead of
    # re-running the merge inside the lambda for each value.
    scale = scale_generator(table_before, table_after)

    # Apply the scale to the second table (truncated to an integer)
    table_after[keyword] = table_after[keyword].apply(lambda x: int(scale * x))

    return table_before.merge(table_after, how = "outer", right_index = True, left_index = True)


def trends_merge(keyword, min_length = 93, common_table = None):
    """Collapse the two scaled windows into one list rescaled to 0-100.

    Parameters
    ----------
    keyword : str
        The search term; the columns `keyword + '_x'` and `keyword + '_y'`
        are read from the merged table.
    min_length : int, optional
        Number of leading rows of the second window to skip, i.e. the rows
        that overlap the first window.
    common_table : pandas.DataFrame, optional
        A table already produced by `google_trends_common`. When omitted it
        is fetched here; passing it avoids downloading the data a second time.

    Returns
    -------
    list
        The values of the first window followed by the non-overlapping values
        of the second one, floor-divided so that the maximum becomes 100.
        An empty or all-zero series is returned unchanged.
    """

    if common_table is None:
        common_table = google_trends_common(keyword)

    # Drop the missing values introduced by the outer merge
    x_values = common_table[keyword + '_x'].dropna()
    y_values = common_table[keyword + '_y'].dropna()[min_length:]

    merged_values = list(x_values) + list(y_values)

    if not merged_values:
        return merged_values

    # Compute the maximum once rather than once per element
    peak = max(merged_values)

    if peak == 0:
        # Nothing to rescale, and dividing by zero would fail
        return merged_values

    return [value * 100 // peak for value in merged_values]


def google_trends_unique(keyword, min_length = 93):
    """Return a one-column daily trend table for `keyword` (main entry point).

    Parameters
    ----------
    keyword : str
        The search term to query.
    min_length : int, optional
        Forwarded to `trends_merge`.

    Returns
    -------
    pandas.DataFrame
        Indexed like the merged table, with a single column named after the
        keyword.
    """

    common_table = google_trends_common(keyword)

    # Reuse the table fetched above so that the merged values come from the
    # same download as the index they are attached to.
    common_table[keyword] = trends_merge(keyword, min_length, common_table = common_table)

    return common_table.drop(columns = [keyword + '_x', keyword + '_y', "isPartial" + "_x", "isPartial" + "_y"])
    

def google_trends(keywords, min_length = 93):
    """Build one table with a daily trend column per keyword.

    Parameters
    ----------
    keywords : list of str
        Search terms; the first one provides the index onto which the others
        are left-joined. Must contain at least one element.
    min_length : int, optional
        Forwarded to `google_trends_unique` for every keyword.

    Returns
    -------
    pandas.DataFrame
        One column per keyword, indexed by date.
    """

    trends = google_trends_unique(keywords[0], min_length)

    for keyword in keywords[1:]:

        # min_length must be forwarded here too, otherwise only the first
        # keyword honours the caller's value.
        keyword_trends = google_trends_unique(keyword, min_length)
        trends = trends.merge(keyword_trends, how = 'left', left_index = True, right_index = True)

    return trends

## Ça donne quoi en pratique ?

In [7]:
# Build the combined daily table for the five crypto-related search terms and display it.
trends = google_trends(['bitcoin', 'btc', 'cryptocurrency', 'hodl', 'blockchain'])
trends

Unnamed: 0_level_0,bitcoin,btc,cryptocurrency,hodl,blockchain
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-05-01,4.0,10.0,2.0,1.0,14.0
2017-05-02,4.0,9.0,1.0,1.0,17.0
2017-05-03,4.0,10.0,1.0,1.0,18.0
2017-05-04,5.0,12.0,2.0,1.0,18.0
2017-05-05,5.0,11.0,2.0,2.0,17.0
2017-05-06,4.0,10.0,2.0,0.0,13.0
2017-05-07,4.0,8.0,3.0,1.0,13.0
2017-05-08,4.0,11.0,3.0,2.0,17.0
2017-05-09,6.0,12.0,3.0,1.0,20.0
2017-05-10,5.0,12.0,2.0,1.0,21.0


On extrait ensuite les données dans un fichier .csv, qui sera exploité sur le notebook R (plus facile à utiliser pour les modèles de prédiction).

In [8]:
# Make sure the output folder exists so that the export also works on a fresh checkout.
os.makedirs('./data', exist_ok = True)
trends.to_csv('./data/trends.csv')

# Tentative de scraping à partir de résultats Google

In [9]:
import feedparser

# Parse the Reuters technology news feed.
# NOTE(review): `d` is not used anywhere else in the visible notebook.
d = feedparser.parse("http://feeds.reuters.com/reuters/technologyNews")

In [10]:
import requests
# Fetch one Google results page for "bitcoin" (date range 5/1/2017 - 5/1/2018 encoded in the URL)
# while sending a Firefox User-Agent, then parse the returned HTML.
r = requests.get("https://www.google.com/search?q=bitcoin&tbas=0&tbs=cdr:1,cd_min:5/1/2017,cd_max:5/1/2018&tbm=nws&ei=GuygW9O9Gc7LsAHi0Y3gBQ&sa=N&biw=709&bih=821&dpr=1&start=0",
                 headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'})

soup = BeautifulSoup(r.text, "html.parser")

In [22]:
def google_scraper(document_url, min_ind = 0, max_ind = np.inf, step = 1, sleep = 1, header = True, **kwargs):
    """Scrape consecutive result pages and collect the requested HTML elements.

    The page index is appended to `document_url` to build each request. For
    every page, each keyword argument produces one column holding the elements
    returned by `soup.find_all(tag, css_class)`.

    Parameters
    ----------
    document_url : str
        URL prefix to which the page index is appended. A trailing '1' is
        stripped so that the index can be appended cleanly.
    min_ind : int, optional
        First page index to request.
    max_ind : number, optional
        The loop keeps requesting pages while the current index is below this
        value (the index is incremented before each request).
    step : int, optional
        Increment of the page index between two requests.
    sleep : number, optional
        Seconds to wait before each request.
    header : bool, optional
        Whether to send the browser-like User-Agent header.
    **kwargs : tuple
        `column_name = (tag, css_class)` pairs describing what to extract.

    Returns
    -------
    pandas.DataFrame
        One row per extracted element, with one column per keyword argument
        plus a `page` column. The rows collected so far are also returned when
        the user interrupts the run or when a page stays empty after several
        retries.

    Notes
    -----
    Network errors are deliberately not caught here; they propagate to the
    caller.
    """

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3112.113 Safari/537.36'}
    # Header to show that we are not a bot
    # Note that for some reason, the function works much better if we pretend we are using Chrome:
    #  - The time filter (from May 1st 2017 to May 1st 2018) works as intended, which is not the case with Firefox,
    #  - The date in the snippets can be recognised normally, thanks to its HTML tag

    columns = list(kwargs)
    pages = [] # Non-empty per-page tables, concatenated once at the end

    def collected():
        # Assemble everything scraped so far into a single table
        if not pages:
            return pd.DataFrame(columns = columns + ['page'])
        return pd.concat(pages, ignore_index = True)

    ind = min_ind - step # The number of the pages that we iterate through
    attempt = 0

    if document_url.endswith('1'):
        document_url = document_url[:-1] # In order to have the proper number of pages!

    print('Document: ', document_url)

    while ind < max_ind:
        ind += step
        page_url = document_url + str(ind)
        print('Page', ind, " –  URL: ", page_url) # Proves the algorithm is still working

        try:
            # The pause sits inside the try block so that interrupting during
            # the wait still returns the data collected so far.
            time.sleep(sleep) # May be needed against anti-bot measures?

            if header:
                req = requests.get(page_url, headers = headers)
            else: # Sometimes (like with BestBuy), it does not even work if there is a header ...
                req = requests.get(page_url)

            soup = BeautifulSoup(req.text, "html.parser")

            result = pd.DataFrame(columns = columns)

            for key, value in kwargs.items():
                result[str(key)] = soup.find_all(value[0], value[1])

            result['page'] = (ind // step) + (1 - min_ind)

        except KeyboardInterrupt: # If we want to stop the code, well, we still retain our data
            print('Keyboard Interrupt')
            return collected()

        if len(result) > 0:
            pages.append(result)
            attempt = 0

        elif attempt >= 3: # Still empty after the retries: there is nothing more to get -> we stop
            return collected()

        else:
            attempt += 1
            ind -= step # Step back by one page so that the same page is requested again
            print ('Attempt no.' + str(attempt))

    return collected()

In [41]:
def data_scraper(html_doc, html_type, html_class):
    """Return the text of the first matching element, or NaN when none matches.

    Parameters
    ----------
    html_doc : object
        A parsed HTML element exposing `find` (e.g. a BeautifulSoup tag).
    html_type : str
        Tag name to look for.
    html_class : str
        Second argument handed to `find` (used as the CSS class in this notebook).

    Returns
    -------
    str or float
        The `.text` of the first match, or `np.nan` if nothing matches.
    """

    first_match = html_doc.find(html_type, html_class)

    if first_match is None:
        return np.nan

    return first_match.text

def data_interpretator(result, col_name, **kwargs):
    """Add one text column per keyword argument, extracted from `result[col_name]`.

    Each keyword argument is a `new_column = (tag, css_class)` pair; every cell
    of `result[col_name]` is passed to `data_scraper` with that pair. The table
    is modified in place and also returned.
    """

    for new_column, selector in kwargs.items():

        def extract(cell, selector = selector):
            # Text of the first element matching the selector inside this cell
            return data_scraper(cell, selector[0], selector[1])

        result[str(new_column)] = result[col_name].apply(extract)

    return result

In [28]:
# Scrape the Google results restricted to bbc.com, ten results per page (step = 10),
# keeping every <div class="g"> element as a raw snippet.
bbc_results = google_scraper("https://www.google.com/search?q=bitcoin+site:bbc.com&lr=lang_en&rlz=1C5CHFA_enFR566FR566&tbs=lr:lang_1en,cdr:1,cd_min:5/1/2017,cd_max:5/1/2018&ei=3lGhW73pC8r3qwGV6Kn4BQ&sa=N&biw=709&bih=821&start=",
              step = 10, max_ind = 20, snippet = ("div", "g"))

Document:  https://www.google.com/search?q=bitcoin+site:bbc.com&lr=lang_en&rlz=1C5CHFA_enFR566FR566&tbs=lr:lang_1en,cdr:1,cd_min:5/1/2017,cd_max:5/1/2018&ei=3lGhW73pC8r3qwGV6Kn4BQ&sa=N&biw=709&bih=821&start=
Page 0  –  URL:  https://www.google.com/search?q=bitcoin+site:bbc.com&lr=lang_en&rlz=1C5CHFA_enFR566FR566&tbs=lr:lang_1en,cdr:1,cd_min:5/1/2017,cd_max:5/1/2018&ei=3lGhW73pC8r3qwGV6Kn4BQ&sa=N&biw=709&bih=821&start=0
Page 10  –  URL:  https://www.google.com/search?q=bitcoin+site:bbc.com&lr=lang_en&rlz=1C5CHFA_enFR566FR566&tbs=lr:lang_1en,cdr:1,cd_min:5/1/2017,cd_max:5/1/2018&ei=3lGhW73pC8r3qwGV6Kn4BQ&sa=N&biw=709&bih=821&start=10
Page 20  –  URL:  https://www.google.com/search?q=bitcoin+site:bbc.com&lr=lang_en&rlz=1C5CHFA_enFR566FR566&tbs=lr:lang_1en,cdr:1,cd_min:5/1/2017,cd_max:5/1/2018&ei=3lGhW73pC8r3qwGV6Kn4BQ&sa=N&biw=709&bih=821&start=20


In [21]:
bbc_results

Unnamed: 0,page,snippet
0,0.0,"<div class=""g""><!--m--><div data-hveid=""CAAQAA..."
1,0.0,"<div class=""g""><!--m--><div data-hveid=""CAkQAA..."
2,0.0,"<div class=""g""><!--m--><div data-hveid=""CAgQAA..."
3,0.0,"<div class=""g""><!--m--><div data-hveid=""CAcQAA..."
4,0.0,"<div class=""g""><!--m--><div data-hveid=""CAYQAA..."
5,0.0,"<div class=""g""><!--m--><div data-hveid=""CAUQAA..."
6,0.0,"<div class=""g""><!--m--><div data-hveid=""CAMQAA..."
7,0.0,"<div class=""g""><!--m--><div data-hveid=""CAQQAA..."
8,0.0,"<div class=""g""><!--m--><div data-hveid=""CAIQAA..."
9,0.0,"<div class=""g""><!--m--><div data-hveid=""CAEQAA..."


In [203]:
# Keep an independent snapshot: data_interpretator adds columns to bbc_results in place,
# and a plain assignment would only create a second name for the same DataFrame.
bbc_results_bis = bbc_results.copy()

In [207]:
# Extract the title, abstract and date text from each raw snippet
# (bbc_results is modified in place by data_interpretator).
data_interpretator(bbc_results, "snippet",
                   title = ('h3', 'r'),
                   abstract = ('span', 'st'),
                   date = ('span', 'f'))

# Tag every row with the site it was scraped from.
bbc_results['source'] = 'BBC.com'

In [208]:
bbc_results

Unnamed: 0,snippet,abstract,date,title,source
0,"<div class=""g""><!--m--><div data-hveid=""CAIQAA...",7 sept. 2017 - Bitcoin is notorious for its vo...,7 sept. 2017 -,BBC - Capital - What you need to know about th...,BBC.com
1,"<div class=""g""><!--m--><div data-hveid=""CAkQAA...",27 déc. 2017 - Bitcoin. Lots of people are tal...,27 déc. 2017 -,Bitcoin: Would you want to get paid in cryptoc...,BBC.com
2,"<div class=""g""><!--m--><div data-hveid=""CAcQAA...",7 mars 2018 - BBC investigation discovers shad...,7 mars 2018 -,UK company linked to laundered Bitcoin billion...,BBC.com
3,"<div class=""g""><!--m--><div data-hveid=""CAgQAA...","27 nov. 2017 - First, full disclosure - I am a...",27 nov. 2017 -,Bitcoin - risky bubble or the future? - BBC Ne...,BBC.com
4,"<div class=""g""><!--m--><div data-hveid=""CAYQAA...",12 déc. 2017 - The internet has recently been ...,12 déc. 2017 -,Bitcoin: Does it really use more electricity t...,BBC.com
5,"<div class=""g""><!--m--><div data-hveid=""CAUQAA...",11 déc. 2017 - The value of digital currency B...,11 déc. 2017 -,What is Bitcoin? - BBC News,BBC.com
6,"<div class=""g""><!--m--><div data-hveid=""CAQQAA...",5 févr. 2018 - The value of Bitcoin has fallen...,5 févr. 2018 -,"Bitcoin falls below $6,000 - BBC News - BBC.com",BBC.com
7,"<div class=""g""><!--m--><div data-hveid=""CAMQAA...","8 déc. 2017 - In its early days, Bitcoin might...",8 déc. 2017 -,Bitcoin: 'I'm part of a crazy wave' - BBC News,BBC.com
8,"<div class=""g""><!--m--><div data-hveid=""CAEQAA...",25 janv. 2018 - 50 Cent has discovered that he...,25 janv. 2018 -,50 Cent forgot he had a stash of Bitcoin now w...,BBC.com
9,"<div class=""g""><!--m--><div data-hveid=""CAAQAA...",9 févr. 2018 - Crypto-currencies like Bitcoin ...,9 févr. 2018 -,Russian nuclear scientists arrested for 'Bitco...,BBC.com


# Établissement d'un corpus de sites

Ce corpus est composé de trois types de sites Internet, qui offrent chacun une perspective légèrement différente :

- Les sites de journaux à orientation économique (type Financial Times, Wall Street Journal), qui s'intéressent au sujet, mais peut-être de manière un peu conservatrice,

- Les sites de journaux généralistes (The Guardian), qui suivent l'affaire d'un peu plus loin, et qui ont de bonnes chances *a priori* de suivre la hype lorsque le marché monte, et d'en parler beaucoup moins après une chute,

- Les sites spécialisés (Coindesk).

In [24]:
# Business- and finance-oriented news sites.
financial_newspapers = ['ft.com', 'bbc.com', 'cnn.com', 'cnbc.com', 'economist.com', 'forbes.com', 'wsj.com',
                        'bloomberg.com', 'investopedia.com', 'fortune.com', 'foxbusiness.com', 'born2invest.com',
                        'economictimes.indiatimes.com', 'business-standard.com', 'reuters.com']

# General-interest news sites.
general_newspapers = ['theguardian.com', 'nytimes.com', 'washingtonpost.com', 'chicagotribune.com', 'abcnews.go.com',
                      'cbsnews.com', 'nbcnews.com', 'thetimes.co.uk', 'independent.co.uk', 'time.com']

# Sites specialised in cryptocurrencies.
crypto_websites = ['coindesk.com', 'coinjournal.net', 'coininsider.com', 'cointelegraph.com', 'bitcoinmagazine.com',
                   'cryptonews.com']

In [49]:
def table_generator(website_list, max_ind = 400, sleep = 1):
    """Scrape the Google results for "bitcoin" on each site and stack them.

    For every site, the result pages are scraped with `google_scraper`
    (ten results per page), the title, abstract and date are extracted from
    each snippet with `data_interpretator`, and a `source` column records the
    site.

    Parameters
    ----------
    website_list : list of str
        Domain names used in the `site:` filter of the query.
    max_ind : int, optional
        Forwarded to `google_scraper`.
    sleep : number, optional
        Seconds to wait before each request, forwarded to `google_scraper`.

    Returns
    -------
    pandas.DataFrame
        All rows from all sites, with the columns `snippet`, `page`, `date`,
        `title`, `abstract` first, followed by any other column (`source`).
    """

    base_columns = ['snippet', 'page', 'date', 'title', 'abstract']

    per_site = [] # One table per site, concatenated once at the end

    for newspaper in website_list:

        google_url = "https://www.google.com/search?q=bitcoin+site:" + newspaper + "&lr=lang_en&rlz=1C5CHFA_enFR566FR566&tbs=lr:lang_1en,cdr:1,cd_min:5/1/2017,cd_max:5/1/2018&ei=3lGhW73pC8r3qwGV6Kn4BQ&sa=N&biw=709&bih=821&start="

        newspaper_results = google_scraper(google_url, step = 10, max_ind = max_ind, sleep = sleep, snippet = ('div', 'g'))

        data_interpretator(newspaper_results, "snippet",
                   title = ('h3', 'r'),
                   abstract = ('span', 'st'),
                   date = ('span', 'f'))

        newspaper_results['source'] = newspaper

        per_site.append(newspaper_results)

    non_empty = [table for table in per_site if len(table) > 0]

    if not non_empty:
        # Nothing was scraped: return an empty table with the expected columns
        extra_columns = ['source'] if per_site else []
        return pd.DataFrame(columns = base_columns + extra_columns)

    final_output = pd.concat(non_empty, ignore_index = True)

    # Keep the expected columns first, then whatever else was collected
    ordered_columns = base_columns + [column for column in final_output.columns if column not in base_columns]

    return final_output.reindex(columns = ordered_columns)

In [None]:
# Scrape every finance-oriented site, waiting 5 seconds before each request.
financial_table = table_generator(financial_newspapers, max_ind = 400, sleep = 5)

Document:  https://www.google.com/search?q=bitcoin+site:ft.com&lr=lang_en&rlz=1C5CHFA_enFR566FR566&tbs=lr:lang_1en,cdr:1,cd_min:5/1/2017,cd_max:5/1/2018&ei=3lGhW73pC8r3qwGV6Kn4BQ&sa=N&biw=709&bih=821&start=
Page 0  –  URL:  https://www.google.com/search?q=bitcoin+site:ft.com&lr=lang_en&rlz=1C5CHFA_enFR566FR566&tbs=lr:lang_1en,cdr:1,cd_min:5/1/2017,cd_max:5/1/2018&ei=3lGhW73pC8r3qwGV6Kn4BQ&sa=N&biw=709&bih=821&start=0
Page 10  –  URL:  https://www.google.com/search?q=bitcoin+site:ft.com&lr=lang_en&rlz=1C5CHFA_enFR566FR566&tbs=lr:lang_1en,cdr:1,cd_min:5/1/2017,cd_max:5/1/2018&ei=3lGhW73pC8r3qwGV6Kn4BQ&sa=N&biw=709&bih=821&start=10


In [53]:
financial_table.snippet[30].findAll('h3', 'r')

[]