In [1]:
# Data preparation for Ph.D thesis
# @author: Andres L. Suarez-Cetrulo
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
import requests

# Root folder holding all the thesis data sets
PATH = "/home/cetrulin/Desktop/Andres/data"
# Folder for the raw (unprocessed) downloads; the trailing slash is part of the value
RAW_DATA_PATH = "{}/raw/".format(PATH)

In [2]:
"""
Retrieve intraday stock data from Alpha Vantage API.
"""

# Alpha Vantage API to download 15 days of minute data (only if required)
import os

from alpha_vantage.timeseries import TimeSeries

# The API key is read from the environment rather than being stored in the
# notebook, so it is never committed or shared together with the code.
# A key that was previously written in this cell should be considered exposed.
apikey = os.environ.get('ALPHAVANTAGE_API_KEY')
if not apikey:
    raise RuntimeError("Alpha Vantage API key not found: "
                       "set the ALPHAVANTAGE_API_KEY environment variable.")

# Client used by the download helpers; with output_format='pandas' its calls
# return a pandas object with the data plus the call's metadata.
ts = TimeSeries(key=apikey, output_format='pandas')

In [3]:
"""
Retrieve intraday stock data from Google Finance.
"""

import csv
import datetime
import re

import pandas as pd
import requests

def get_google_finance_intraday(ticker, period=60, days=1, exchange='USD', debug=False):
    """
    Retrieve intraday stock data from Google Finance.

    Parameters
    ----------
    ticker : str
        Company ticker symbol.
    period : int
        Interval between stock values in seconds.
    days : int
        Number of days of data to retrieve.
    exchange : str
        Value substituted into the ``x`` placeholder of the request URL.
    debug : bool
        When true, the request URL is printed before the call is made.

    Returns
    -------
    df : pandas.DataFrame
        DataFrame containing the opening price, high price, low price,
        closing price, and volume. The index contains the times associated with
        the retrieved price values. When the response holds no price rows an
        empty DataFrame (without columns) is returned.

    Raises
    ------
    ValueError
        If an offset row appears before any absolute ('a'-prefixed) timestamp
        row, so its time cannot be computed.
    """
    # NOTE(review): the query string contains '?x=' after the first '?'; this
    # looks like it was meant to be '&x=' -- confirm against the service before
    # changing it, since it alters what the server receives.
    uri = 'https://finance.google.com/finance/getprices' \
          '?&p={days}d&f=d,o,h,l,c,v&q={ticker}&i={period}?x={exchange}'.format(
              ticker=ticker, period=period, days=days, exchange=exchange)

    if debug:
        print(uri)

    # A timeout prevents a stalled connection from blocking the notebook forever.
    page = requests.get(uri, timeout=30)

    # csv.reader needs text lines: decode when the body is bytes (Python 3).
    payload = page.content
    if not isinstance(payload, str):
        payload = payload.decode('utf-8', 'replace')

    # NOTE(review): labels assume the values arrive in the order requested by
    # f=d,o,h,l,c,v -- verify against the COLUMNS header of a real response.
    columns = ['Open', 'High', 'Low', 'Close', 'Volume']
    rows = []
    times = []
    start = None  # last absolute timestamp seen; offsets are relative to it
    for row in csv.reader(payload.splitlines()):
        # Skip blank lines and header lines (only rows whose first field starts
        # with 'a' or a digit carry prices).
        if not row or not re.match(r'^[a\d]', row[0]):
            continue
        if row[0].startswith('a'):
            # 'a<unix time>' anchors the following offset rows.
            start = datetime.datetime.fromtimestamp(int(row[0][1:]))
            times.append(start)
        else:
            if start is None:
                raise ValueError("Offset row %r found before any 'a'-prefixed timestamp row" % (row,))
            times.append(start + datetime.timedelta(seconds=period * int(row[0])))
        # A real list (not a lazy map object) so pandas can build the frame.
        rows.append([float(value) for value in row[1:]])

    if len(rows):
        return pd.DataFrame(rows, index=pd.DatetimeIndex(times, name='Date'),
                            columns=columns)
    else:
        return pd.DataFrame(rows, index=pd.DatetimeIndex(times, name='Date'))

In [10]:
"""
Retrieve intraday crypto data from Cryptocompare.
"""

def get_cryptocompare_intraday(symbol, comparison_symbol='USD', limit=9999, aggregate=1, exchange=''):
    """
    Retrieve minute-level crypto data from the Cryptocompare 'histominute' endpoint.

    Parameters
    ----------
    symbol : str
        Symbol to query; sent upper-cased as ``fsym``.
    comparison_symbol : str
        Symbol to compare against; sent upper-cased as ``tsym``.
    limit : int
        Value sent as the ``limit`` query parameter.
    aggregate : int
        Value sent as the ``aggregate`` query parameter.
    exchange : str
        When non-empty, sent as the ``e`` query parameter.

    Returns
    -------
    df : pandas.DataFrame
        One row per record of the response's 'Data' list, plus a 'timestamp'
        column with the 'time' field converted to a local datetime.

    Raises
    ------
    ValueError
        If the response contains no 'Data' records.
    """
    url = 'https://min-api.cryptocompare.com/data/histominute'
    # Let requests build the query string so every value is URL-encoded.
    params = {
        'fsym': symbol.upper(),
        'tsym': comparison_symbol.upper(),
        'limit': limit,
        'aggregate': aggregate,
    }
    if exchange:
        params['e'] = exchange

    # A timeout prevents a stalled connection from blocking the notebook forever.
    page = requests.get(url, params=params, timeout=30)
    payload = page.json()

    data = payload.get('Data') if isinstance(payload, dict) else None
    if not data:
        # Fail with a readable message instead of an AttributeError further down.
        message = payload.get('Message', 'no message') if isinstance(payload, dict) else payload
        raise ValueError("Cryptocompare returned no data for {}/{}: {}".format(
            params['fsym'], params['tsym'], message))

    df = pd.DataFrame(data)
    df['timestamp'] = [datetime.datetime.fromtimestamp(d) for d in df['time']]
    return df

In [11]:
"""
Download price for a given symbol using Google Finance, Alpha Vantage or Crypto Compare
"""

def download_single_price_from(symbol, period=60, days=20, exchange='USD', site="google", debug=True,
                               path="default", name="default"):
    """
    Download the prices of one symbol from the chosen site and save them as CSV.

    The file is written to ``<path>/<name>_<today>.csv`` using ';' as separator.

    Parameters
    ----------
    symbol : str
        Symbol to download.
    period : int
        Interval in seconds (used by the Google Finance download only).
    days : int
        Number of days requested (used by the Google Finance download only).
        The original author notes that the real maximum at 1-minute level is 15.
    exchange : str
        Passed as exchange to Google Finance, or as comparison symbol to
        Cryptocompare.
    site : str
        "google_finance" (or its alias "google"), "alpha_vantage" or
        "crypto_compare".
    debug : bool
        Forwarded to the Google Finance download.
    path : str
        Output directory; created (with parents) when missing.
    name : str
        Prefix of the output file name.

    Raises
    ------
    ValueError
        If ``site`` is not one of the supported values.
    """
    import os  # local import: at module level `os` is only imported in a later cell

    # Download index price
    if site in ("google", "google_finance"):
        # "google" is the default value of `site`; treat it as Google Finance
        # instead of silently writing a placeholder file.
        df = get_google_finance_intraday(symbol, period, days, exchange, debug)
    elif site == "alpha_vantage":
        # The interval is fixed to one minute here; `period` is not used.
        df, _ = ts.get_intraday(symbol, interval='1min', outputsize='full')
    elif site == "crypto_compare":
        # using passed exchange as comparison_symbol (quick fix) and the default limit
        df = get_cryptocompare_intraday(symbol=symbol, comparison_symbol=exchange)
    else:
        raise ValueError("Unknown site %r: expected 'google_finance', 'alpha_vantage' "
                         "or 'crypto_compare'" % (site,))

    # Make sure the output directory itself exists (including missing parents).
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise

    # Save index prices
    output_csv_path = os.path.join(path, name + "_" + str(datetime.date.today()) + ".csv")
    df.to_csv(output_csv_path, sep=';', encoding='utf-8')

In [12]:
"""
Download price for all the symbols of a given list
"""

import os
import sys
import time

def download_list_of_prices(root_path, list_file, symbols_subpath,
                            period=60, days=20, exchange='USD', site="google", debug=True, from_symbol='',
                            max_retries=3, retry_wait=5):
    """
    Download the prices of every symbol listed in a ';'-separated CSV file.

    Each symbol is saved through ``download_single_price_from`` under
    ``<root_path>/<site>/<symbols_subpath>/symbols/<symbol>/``.

    Parameters
    ----------
    root_path : str
        Folder that contains ``list_file`` and receives the downloads.
    list_file : str
        Name of the CSV file; it must have a 'symbol' column.
    symbols_subpath : str
        Sub-folder (below the site folder) for this list of symbols.
    period, days, exchange, site, debug
        Forwarded unchanged to ``download_single_price_from``.
    from_symbol : str
        When non-empty, symbols are skipped until this one is reached; the
        download then continues from it to the end of the list.
    max_retries : int
        Number of additional attempts made for a symbol after a failure.
        Once exhausted the symbol is skipped and the loop moves on.
    retry_wait : int or float
        Seconds to wait before each retry.
    """
    # Load list of stocks (only the 'symbol' column is used)
    symbols = pd.read_csv(root_path + "/" + list_file, sep=';')['symbol'].tolist()
    total = len(symbols)

    # Check output root path (and create it, with parents, if needed)
    symbols_full_path = root_path + "/" + site + "/" + symbols_subpath + "/symbols"
    try:
        os.makedirs(symbols_full_path)
    except OSError:
        if not os.path.isdir(symbols_full_path):
            raise

    # Pointer: stays False until `from_symbol` shows up (if one was given)
    download_symbol = (from_symbol == '')

    # Go through list of prices
    for counter, symbol in enumerate(symbols, 1):
        if not (download_symbol or symbol == from_symbol):
            continue
        download_symbol = True  # It continues when the selected symbol appears

        # Retry in place instead of recursing: the remaining symbols are then
        # visited exactly once and a failing symbol cannot loop forever.
        for attempt in range(max_retries + 1):
            print("{}/{} = {}".format(counter, total, symbol))
            try:
                download_single_price_from(symbol=symbol, period=period, days=days, exchange=exchange,
                                           site=site, debug=debug,
                                           path=symbols_full_path + "/" + symbol + "/", name=symbol)
                break
            except Exception as err:  # KeyboardInterrupt/SystemExit still stop the run
                print("<p>Error: %s</p>" % type(err))
                if attempt == max_retries:
                    print("Skipping {} after {} failed attempts".format(symbol, attempt + 1))
                else:
                    time.sleep(retry_wait)

    print("Done!")

In [13]:
# Check if the path does exist. If it doesnt, create a folder for the given symbol
def check_or_create_path(path):
    """
    Make sure the directory part of ``path`` exists, then return ``path``.

    The directory part is ``os.path.dirname(path)``: for a path ending in '/'
    that is the path itself, otherwise it is the parent of the last component.
    Missing parent directories are created as well. A path without any
    directory part (e.g. "default") needs nothing created.

    Parameters
    ----------
    path : str
        File or directory path.

    Returns
    -------
    str
        ``path``, unchanged.

    Raises
    ------
    OSError
        If the directory cannot be created (for instance because a regular
        file with that name already exists).
    """
    directory = os.path.dirname(path)
    if directory:
        try:
            os.makedirs(directory)
        except OSError:
            # Already existing is fine; anything else is a real error.
            if not os.path.isdir(directory):
                raise
    return path

In [None]:
# 1 Download SPX index (once per data provider)
for site in ("google_finance", "alpha_vantage"):
    download_single_price_from(
        symbol="SPX", period=60, days=20, exchange="USD", site=site, debug=True,
        path="{}{}/S&P500/index".format(RAW_DATA_PATH, site), name="S&P500")

https://finance.google.com/finance/getprices?&p=20d&f=d,o,h,l,c,v&q=SPX&i=60?x=USD


In [None]:
# 2 Download all SPX stocks, starting at `from_symbol`
from_symbol = "LUK"  # other starting points used before: "BAX", "DAL"

for site in ("alpha_vantage",):  # "google_finance" is not included in this run
    download_list_of_prices(
        root_path="{}/raw".format(PATH),
        list_file="SPX_list.csv",
        symbols_subpath="S&P500",
        period=60,
        days=20,
        exchange='USD',
        site=site,
        debug=False,
        from_symbol=from_symbol,
    )

287/505 = LUK
288/505 = LLY
289/505 = LNC
290/505 = LKQ
291/505 = LMT
292/505 = L
293/505 = LOW
<p>Error: <class 'urllib2.HTTPError'></p>
293/505 = LOW
294/505 = LYB
295/505 = MTB
296/505 = MAC
297/505 = M
298/505 = MRO
299/505 = MPC
300/505 = MAR
301/505 = MMC
302/505 = MLM
303/505 = MAS
304/505 = MA
305/505 = MAT
306/505 = MKC
307/505 = MCD
308/505 = MCK
309/505 = MDT
310/505 = MRK
311/505 = MET
312/505 = MTD
313/505 = MGM
314/505 = KORS
315/505 = MCHP
316/505 = MU
317/505 = MSFT
318/505 = MAA
<p>Error: <class 'urllib2.HTTPError'></p>
318/505 = MAA
319/505 = MHK
320/505 = TAP
321/505 = MDLZ
322/505 = MON
323/505 = MNST
324/505 = MCO
325/505 = MS
326/505 = MOS
327/505 = MSI
328/505 = MYL
329/505 = NDAQ
330/505 = NOV
331/505 = NAVI
332/505 = NTAP
333/505 = NFLX
334/505 = NWL
335/505 = NFX
336/505 = NEM
337/505 = NWSA
338/505 = NWS
339/505 = NEE
340/505 = NLSN
341/505 = NKE
342/505 = NI
343/505 = NBL
344/505 = JWN
345/505 = NSC
346/505 = NTRS
347/505 = NOC
348/505 = NRG
349/505 = NUE
35

In [8]:
# 3 Download SPY ETF (once per data provider)
for site in ("google_finance", "alpha_vantage"):
    download_single_price_from(
        symbol="SPY", period=60, days=20, exchange="USD", site=site, debug=True,
        path="{}{}/S&P500/spy_eft".format(RAW_DATA_PATH, site), name="SPY")

https://finance.google.com/finance/getprices?&p=20d&f=d,o,h,l,c,v&q=SPY&i=60?x=USD


In [14]:
# 4 Download all SPY holders (should match the SPX list, as SPY tries to replicate the index)
from_symbol = ''  # empty: start from the first symbol (earlier starting points: DAL, LUK)

# Run it anyway, in case one of the lists is incomplete
for site in ("google_finance",):  # "alpha_vantage" is not included in this run
    download_list_of_prices(
        root_path="{}/raw".format(PATH),
        list_file="SPX_list.csv",
        symbols_subpath="S&P500",
        period=60,
        days=20,
        exchange='USD',
        site=site,
        debug=False,
        from_symbol=from_symbol,
    )

1/505 = MMM
2/505 = ABT
3/505 = ABBV
4/505 = ACN
5/505 = ATVI
6/505 = AYI
7/505 = ADBE
8/505 = AMD
9/505 = AAP
10/505 = AES
11/505 = AET
12/505 = AMG
13/505 = AFL
14/505 = A
15/505 = APD
16/505 = AKAM
17/505 = ALK
18/505 = ALB
19/505 = ARE
20/505 = ALXN
21/505 = ALGN
22/505 = ALLE
23/505 = AGN
24/505 = ADS
25/505 = LNT
26/505 = ALL
27/505 = GOOGL
28/505 = GOOG
29/505 = MO
30/505 = AMZN
31/505 = AEE
32/505 = AAL
33/505 = AEP
34/505 = AXP
35/505 = AIG
36/505 = AMT
37/505 = AWK
38/505 = AMP
39/505 = ABC
40/505 = AME
41/505 = AMGN
42/505 = APH
43/505 = APC
44/505 = ADI
45/505 = ANDV
46/505 = ANSS
47/505 = ANTM
48/505 = AON
49/505 = AOS
50/505 = APA
51/505 = AIV
52/505 = AAPL
53/505 = AMAT
54/505 = ADM
55/505 = ARNC
56/505 = AJG
57/505 = AIZ
58/505 = T
59/505 = ADSK
60/505 = ADP
61/505 = AZO
62/505 = AVB
63/505 = AVY
64/505 = BHGE
65/505 = BLL
66/505 = BAC
67/505 = BK
68/505 = BCR
69/505 = BAX
70/505 = BBT
71/505 = BDX
72/505 = BRK.B
73/505 = BBY
74/505 = BIIB
75/505 = BLK
76/505 = HRB
77/5

In [9]:
# 5 Download EURUSD (once per data provider)
for site in ("google_finance", "alpha_vantage"):
    download_single_price_from(
        symbol="EURUSD", period=60, days=20, exchange="USD", site=site, debug=True,
        path="{}{}/EURUSD".format(RAW_DATA_PATH, site), name="EURUSD")

https://finance.google.com/finance/getprices?&p=20d&f=d,o,h,l,c,v&q=EURUSD&i=60?x=USD


HTTPError: HTTP Error 503: Service Unavailable

In [10]:
# Extra: Downloading Bitcoin-USD (once per data provider)
for site in ("google_finance", "alpha_vantage"):
    download_single_price_from(
        symbol="BTCUSD", period=60, days=20, exchange="USD", site=site, debug=True,
        path="{}{}/BITCOIN".format(RAW_DATA_PATH, site), name="BTCUSD")

https://finance.google.com/finance/getprices?&p=20d&f=d,o,h,l,c,v&q=BTCUSD&i=60?x=USD


In [11]:
# Extra: Downloading IBEX (not available in google finance)
for site in ("google_finance", "alpha_vantage"):
    download_single_price_from(
        symbol="IB", period=60, days=20, exchange="EUR", site=site, debug=True,
        path="{}{}/IBEX35".format(RAW_DATA_PATH, site), name="IB")

https://finance.google.com/finance/getprices?&p=20d&f=d,o,h,l,c,v&q=IB&i=60?x=EUR


In [12]:
# Extra: Downloading NASDAQ (index plus related ETFs, once per data provider)
for site in ("google_finance", "alpha_vantage"):
    # The index itself
    download_single_price_from(
        symbol="IXIC", period=60, days=20, exchange="USD", site=site, debug=True,
        path="{}{}/NASDAQ/index".format(RAW_DATA_PATH, site), name="NASDAQ")
    # NASDAQ ETFs to track volumes (?): QQQ looks like the only complete one at 1 min level
    download_single_price_from(
        symbol="QTEC", period=60, days=20, exchange="USD", site=site, debug=True,
        path="{}{}/NASDAQ/qtec_eft".format(RAW_DATA_PATH, site), name="QTEC")
    download_single_price_from(
        symbol="QQQ", period=60, days=20, exchange="USD", site=site, debug=True,
        path="{}{}/NASDAQ/qqq_eft".format(RAW_DATA_PATH, site), name="QQQ")
    download_single_price_from(
        symbol="IBB", period=60, days=20, exchange="USD", site=site, debug=True,
        path="{}{}/NASDAQ/ibb_eft".format(RAW_DATA_PATH, site), name="IBB")
    download_single_price_from(
        symbol="ONEQ", period=60, days=20, exchange="USD", site=site, debug=True,
        path="{}{}/NASDAQ/oneq_eft".format(RAW_DATA_PATH, site), name="ONEQ")
    # TODO: Download prices NASDAQ 100
    # download_list_of_prices(root_path=PATH+"/raw", list_file="NASDAQ_100_list.csv", symbols_subpath="NASDAQ",
    #                         period=60, days=20, exchange='USD', site=site, debug=False)

https://finance.google.com/finance/getprices?&p=20d&f=d,o,h,l,c,v&q=IXIC&i=60?x=USD
https://finance.google.com/finance/getprices?&p=20d&f=d,o,h,l,c,v&q=QTEC&i=60?x=USD
https://finance.google.com/finance/getprices?&p=20d&f=d,o,h,l,c,v&q=QQQ&i=60?x=USD
https://finance.google.com/finance/getprices?&p=20d&f=d,o,h,l,c,v&q=IBB&i=60?x=USD
https://finance.google.com/finance/getprices?&p=20d&f=d,o,h,l,c,v&q=ONEQ&i=60?x=USD


In [13]:
# Extra: Downloading S (for the craic :) )
for site in ("google_finance", "alpha_vantage"):
    download_single_price_from(
        symbol="S", period=60, days=20, exchange="USD", site=site, debug=True,
        path="{}{}/S".format(RAW_DATA_PATH, site), name="S")

https://finance.google.com/finance/getprices?&p=20d&f=d,o,h,l,c,v&q=S&i=60?x=USD


In [None]:
# Downloading crypto
# Calls try_download once per cryptocurrency with site "crypto_compare",
# each with its own output folder below RAW_DATA_PATH.
# NOTE(review): `try_download` is not defined in any of the visible cells --
# confirm where it comes from; without a definition this cell raises NameError.
# NOTE(review): the first eight arguments line up with the parameters of
# download_single_price_from; the trailing 0 is presumably a retry counter -- verify.
# NOTE(review): only the BITCOIN folder is written with a trailing "/" -- confirm
# whether that difference is intentional.
for site in ["crypto_compare"]:
    try_download("BTC",60,20,"USD",site,True,RAW_DATA_PATH+site+"/"+"BITCOIN/","BTCUSD",0)
    try_download("ETH",60,20,"USD",site,True,RAW_DATA_PATH+site+"/"+"ETHEREUM","ETHUSD",0)
    try_download("XRP",60,20,"USD",site,True,RAW_DATA_PATH+site+"/"+"RIPPLE","XRPUSD",0)
    try_download("LTC",60,20,"USD",site,True,RAW_DATA_PATH+site+"/"+"LITECOIN","LTCUSD",0)
    try_download("IOT",60,20,"USD",site,True,RAW_DATA_PATH+site+"/"+"IOTA","IOTUSD",0)
    try_download("XMR",60,20,"USD",site,True,RAW_DATA_PATH+site+"/"+"MONERO","XMRUSD",0)
    try_download("DASH",60,20,"USD",site,True,RAW_DATA_PATH+site+"/"+"DASH","DASHUSD",0)

In [3]:
# MANUAL CALL

import os

# Load list of stocks (only the 'symbol' column is used below).
# The locations are built from PATH / RAW_DATA_PATH so they follow the
# notebook's configuration instead of repeating the absolute path.
spx_symbols = pd.read_csv(PATH + "/raw/SPX_list.csv", sep=';')

# Go through list of prices
for symbol in spx_symbols['symbol'].tolist():

    # Check if the path does exist. If it doesn't, create the folder for the
    # given symbol, including any missing parent folders.
    file_path = RAW_DATA_PATH + "google_finance/S&P500/symbols/" + symbol + "/"
    directory = os.path.dirname(file_path)

    try:
        os.makedirs(directory)
    except OSError:
        # Already existing is fine; anything else is a real error.
        if not os.path.isdir(directory):
            raise

    # Get each SPX symbol from google finance
    prices = get_google_finance_intraday(ticker=symbol, period=60, days=20, exchange='USD', debug=False)

    # Save it in csv
    prices.to_csv(file_path + symbol + "_" + str(datetime.date.today()) + ".csv", sep=';', encoding='utf-8')
