In [19]:
#Grab data from the Google Finance API.
#Daily stock data is fetched for tickers listed in tickerFile, covering lastDay
#(the oldest day) through today, and written to the Riak bucket 'stocks'.

#Midnight of the current day. The clock is read once: building the date from
#three separate now() calls could mix parts of two different days if the date
#rolls over between the calls.
today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
lastDay = datetime(2000,1,1)
tickerFile = 'NYSE.txt'
dataSource = 'google'

In [20]:
#Ticker list: tab-separated text file without a header row
stocks = pd.read_csv(tickerFile, header=None, sep='\t')

In [21]:
#The ticker symbols are in the first column of the ticker file
tickers = list(stocks.iloc[:, 0])

In [None]:
#Download and store the history of the first 100 tickers through Spark.
#collect() forces the map to run; the displayed list holds the value
#getDataByTicker returned for each ticker.
dataGet = sc.parallelize(tickers[:100])
dataGet.map(lambda ticker: getDataByTicker(ticker, dataSource, lastDay, today)).collect()

In [None]:
#Filtering the data
#1:Load each ticker's rows from Riak as a tuple (TICKER, [[CLOSE, VOLUME, DATE], ...])
#2:Filter out all stocks with minDays or fewer days of data
#and all stocks whose mean daily volume is minVol or less
#3:Sort the data by date with the most recent day first
#4:Cut the data to length minDays
#5:Analyse every ordered pair of two different stocks and store the tradeable ones

minVol = 20000
minDays = 2000


t0 = time.time()
d = sc.parallelize(tickers[0:100]).map(lambda x: (x, riakGetStock(x)))\
    .filter(lambda x: len(x[1]) > minDays)\
    .filter(lambda x: numpy.mean([i[1] for i in x[1]]) > minVol)\
    .map(lambda x: (x[0],mySort(x[1])))\
    .map(lambda x: (x[0],myFilter(x[1],minDays))).cache()

#cartesian(d) also produces (A, A). A stock paired with itself is not a pair
#trade and gives a degenerate regression, so self-pairs are dropped before
#the analysis. Both orders (A, B) and (B, A) are kept.
pairs = d.cartesian(d)\
    .filter(lambda x: x[0][0] != x[1][0])\
    .map(lambda x: pairAnalysis(x,200))\
    .collect()

written = writePairs(pairs)
t1 = time.time()

#elapsed wall-clock seconds for the whole cell
total = t1-t0
total

In [None]:
#Show the raw result of every analysed pair (integer status codes and trade entries)
pairs

In [None]:
#Show the trade entries returned by writePairs
written

In [66]:
import sys
import numpy
import time
from operator import add
from pyspark import SparkContext
import pandas as pd
import datetime
import json
import riak
import urllib2
import pytz
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from pandas.io.data import DataReader
import riak
from riak import RiakClient, RiakNode, RiakObject
import numpy as np
import statsmodels.api as stat
import statsmodels.tsa.stattools as ts

def getData(tickerFile, dataSource, start, end):
    """Fetch daily quotes for the first 100 tickers of a file and store them in Riak.

    tickerFile is read as a tab-separated file without a header row; the
    first column holds the ticker symbols and only the first 100 rows are
    processed. For each ticker, DataReader is asked for the data between
    start and end from dataSource. Every returned row is stored in the
    'stocks' bucket as a JSON string under the key '<TICKER>_<DATE>', with
    the secondary indexes ticker_bin, year_int, month_int and day_int.

    A ticker whose download raises or comes back empty is skipped and
    reported in notAdded instead of aborting the run.

    Returns the tuple (added, notAdded) of ticker lists.
    """
    rc = RiakClient(pb_port=8087, protocol='pbc')
    #the bucket handle is the same for every ticker, so look it up once
    bucket = rc.bucket('stocks')
    added = []
    notAdded = []
    stock = pd.read_csv(tickerFile,sep='\t',header=None)

    #loop over the first 100 stock tickers
    for i in range(0,len(stock.head(100))):
        ticker = stock.iloc[i,0]

        #get daily data for each ticker
        gtemp = pd.DataFrame()
        try:
            gtemp = DataReader(ticker, dataSource, start, end)
        except Exception:
            #best effort: a failed download is handled like "no data" below.
            #Catching Exception instead of using a bare except lets
            #KeyboardInterrupt and SystemExit stop the loop.
            pass

        #didnt get any data
        if len(gtemp) == 0:
            notAdded.append(ticker)
            continue

        #got data
        added.append(ticker)
        tickerName = str(ticker)

        #build the value matrix once instead of once per field per row
        quotes = gtemp.values
        for j in range(0,len(gtemp.index)):

            #upload json to Riak Bucket
            date = gtemp.index[j].date()
            riakKey = tickerName + '_' + str(date)
            #assumes columns 0-4 are Open, High, Low, Close, Volume - TODO confirm for dataSource
            riakVal = {'OPEN': quotes[j,0],
                       'HIGH': quotes[j,1],
                       'LOW': quotes[j,2],
                       'CLOSE': quotes[j,3],
                       'VOLUME': quotes[j,4],
                       'DATE': str(date),
                       'TICKER': tickerName}

            obj = RiakObject(rc, bucket, riakKey)

            obj.add_index("ticker_bin", tickerName)
            obj.add_index("year_int", int(date.year))
            obj.add_index("month_int", int(date.month))
            obj.add_index("day_int", int(date.day))

            obj.content_type = 'text/json'
            #stored as a JSON string; riakGetStock calls json.loads on the data it reads back
            obj.data = json.dumps(riakVal)
            obj.store()

    return added, notAdded

#start is furthest day back and end is closest to today
def getDataByTicker(ticker, dataSource, start, end):
    """Fetch the daily quotes of one ticker and store every row in Riak.

    DataReader is asked for the data of ticker between start and end from
    dataSource. Every returned row is stored in the 'stocks' bucket as a
    JSON string under the key '<TICKER>_<DATE>', with the secondary indexes
    ticker_bin, year_int, month_int and day_int.

    Returns the number of rows stored, or 0 when the download raised or
    returned no rows.
    """
    rc = RiakClient(pb_port=8087, protocol='pbc')
    bucket = rc.bucket('stocks')

    #get daily data for the ticker
    gtemp = pd.DataFrame()
    try:
        gtemp = DataReader(ticker, dataSource, start, end)
    except Exception:
        #best effort: a failed download is handled like "no data" below.
        #Catching Exception instead of using a bare except lets
        #KeyboardInterrupt and SystemExit propagate.
        pass

    #didnt get any data
    if len(gtemp) == 0:
        return 0

    #got data
    #convert before concatenating so a non-string ticker cannot raise TypeError
    tickerName = str(ticker)
    #build the value matrix once instead of once per field per row
    quotes = gtemp.values
    for j in range(0,len(gtemp.index)):

        #upload json to Riak Bucket
        date = gtemp.index[j].date()
        riakKey = tickerName + '_' + str(date)
        #assumes columns 0-4 are Open, High, Low, Close, Volume - TODO confirm for dataSource
        riakVal = {'OPEN': quotes[j,0],
                   'HIGH': quotes[j,1],
                   'LOW': quotes[j,2],
                   'CLOSE': quotes[j,3],
                   'VOLUME': quotes[j,4],
                   'DATE': str(date),
                   'TICKER': tickerName}

        obj = RiakObject(rc, bucket, riakKey)

        obj.add_index("ticker_bin", tickerName)
        obj.add_index("year_int", int(date.year))
        obj.add_index("month_int", int(date.month))
        obj.add_index("day_int", int(date.day))

        obj.content_type = 'text/json'
        #stored as a JSON string; riakGetStock calls json.loads on the data it reads back
        obj.data = json.dumps(riakVal)
        obj.store()

    return len(gtemp.index)

def riakSearchData(searchBucket, searchTerm, searchVal1, searchVal2):
    """Query a secondary index of a bucket and return the decoded values by key.

    searchTerm is the index name (for example 'year_int'). When searchVal2
    is None the index is queried with searchVal1 alone; otherwise both values
    are passed to get_index (start and end of a range - see the Riak client
    documentation). Each matching object is fetched and its data, a JSON
    string, is decoded.

    Returns a dict mapping every matching key to its decoded value.
    """
    myBucket = RiakClient(pb_port=8087, protocol='pbc').bucket(searchBucket)

    #identity test for None, so a falsy bound such as 0 still counts as a bound
    if searchVal2 is None:
        keys = myBucket.get_index(searchTerm, searchVal1)
    else:
        keys = myBucket.get_index(searchTerm, searchVal1, searchVal2)

    myData = {}
    for key in keys:
        myData[key] = json.loads(myBucket.get(key).data)
    return myData

def storeKV(myBucket, myKey, myVal):
    """Store myVal under myKey in the bucket named myBucket. Returns None."""
    client = riak.RiakClient(pb_port=8087, protocol='pbc')
    target = client.bucket(myBucket)
    record = target.new(myKey, data=myVal)
    record.store()
    return

def deleteKey(delBucket, delKey):
    """Delete delKey from the bucket named delBucket and print the outcome.

    After the delete the key is read back: when the read returns no data,
    'Successful delete: <key>' is printed, otherwise 'Failed delete: <key>'.
    Returns None.
    """
    #one client and bucket handle serve both the delete and the read-back
    bucket = riak.RiakClient(pb_port=8087, protocol='pbc').bucket(delBucket)
    bucket.delete(delKey)
    #print(...) with a single argument prints the same text on Python 2 and 3
    if bucket.get(delKey).data is None:
        print('Successful delete: %s' % delKey)
    else:
        print('Failed delete: %s' % delKey)
    return

def quickDeleteKey(delBucket,delKey):
    """Delete delKey from the bucket named delBucket without reading it back."""
    client = riak.RiakClient(pb_port=8087, protocol='pbc')
    target = client.bucket(delBucket)
    target.delete(delKey)
    return
    
def quickDeleteAllKeys(delBucket):
    """Delete every key of the bucket named delBucket, then print 'Done'.

    The deletes are not verified. Returns None.
    """
    #one client and bucket handle for the whole run instead of one per key
    bucket = riak.RiakClient(pb_port=8087, protocol='pbc').bucket(delBucket)
    #stream_keys() yields the keys in batches
    for keys in bucket.stream_keys():
        for delKey in keys:
            bucket.delete(delKey)

    #print(...) with a single argument prints the same text on Python 2 and 3
    print('Done')
    return

def deleteAllKeys(delBucket):
    """Delete every key of the bucket named delBucket through deleteKey.

    deleteKey prints the outcome of each delete. Returns None.
    """
    client = riak.RiakClient(pb_port=8087, protocol='pbc')
    keyBatches = client.bucket(delBucket).stream_keys()
    #stream_keys() yields the keys in batches
    for batch in keyBatches:
        for staleKey in batch:
            deleteKey(delBucket, staleKey)
    return

def getAllKV(myBucket):
    """Read every object of the bucket named myBucket, printing each one.

    Returns a dict mapping each key to the data of its object, exactly as
    the client returned it.
    """
    bucket = riak.RiakClient(pb_port=8087, protocol='pbc').bucket(myBucket)
    allValues = {}
    #stream_keys() yields the keys in batches
    for keyBatch in bucket.stream_keys():
        for key in keyBatch:
            value = bucket.get(key).data
            print('Key: %s Value: %s' % (key, value))
            allValues[key] = value
    return allValues

def getValue(myBucket, myKey):
    """Fetch myKey from the bucket named myBucket and return its JSON-decoded data."""
    client = riak.RiakClient(pb_port=8087, protocol='pbc')
    stored = client.bucket(myBucket).get(myKey)
    return json.loads(stored.data)

#Take a tuple of two (ticker, rows) tuples in and return either a trade entry
#(a list) or an integer status code
def pairAnalysis(pairTuple, ndays):
    """Test one ordered pair of stocks for cointegration over the first ndays rows.

    pairTuple is ((tickerA, rowsA), (tickerB, rowsB)) where every row is
    [close, volume, date]. Only the first ndays rows of each stock are used.
    Row 0 is treated as the current day: the returned closes, date and
    z-score all refer to it, so rows are expected most recent first.

    Returns
        1 when the two stocks do not cover the same dates in those rows,
        2 when eg_test does not report the closes as cointegrated,
        3 when the pair is cointegrated but |z-score| of the spread is <= 1,
        otherwise the list [tickerA, closeA, tickerB, closeB, zscore, beta,
        signalMean, signalStd, date].
    """
    stockA = pairTuple[0]
    stockB = pairTuple[1]
    windowA = list(stockA[1])[0:ndays]
    windowB = list(stockB[1])[0:ndays]

    stockADates = [x[2] for x in windowA]
    stockBDates = [x[2] for x in windowB]
    #the closes can only be compared day by day when the dates line up
    if stockADates != stockBDates:
        return 1

    stockAClose = [x[0] for x in windowA]
    stockBClose = [x[0] for x in windowB]

    coint = eg_test(stockAClose, stockBClose)
    if coint[0] != 1:
        return 2

    #regression parameters of A on B as used here: [0] intercept, [1] slope
    alpha = coint[1][0]
    beta = coint[1][1]

    #spread of each day: pair the closes day by day with zip. A nested
    #"for a ... for b ..." comprehension would build all ndays*ndays
    #combinations instead of one value per day.
    signal = [a - beta*b - alpha for a, b in zip(stockAClose, stockBClose)]
    sigMean = numpy.mean(signal)
    sigStd = numpy.std(signal)
    #z-score of row 0, the same row the returned closes and date are taken from
    zscore = (signal[0] - sigMean)/sigStd
    if abs(zscore) > 1:
        return [stockA[0], stockAClose[0], stockB[0], stockBClose[0], zscore, beta, sigMean, sigStd, stockADates[0]]
    return 3

def writePairs(pairList):
    """Store the trade entries contained in pairList in the 'tradeEntries' bucket.

    pairList is the collected output of pairAnalysis: integer status codes
    mixed with trade entries. Only the entries that are lists are stored;
    each one is [tickerA, closeA, tickerB, closeB, zscore, beta, signalMean,
    signalStd, date] with date formatted 'YYYY-MM-DD'.

    Every entry is written as a JSON string under the key
    '<tickerA>_<tickerB>' with the secondary indexes stocka_bin, stockb_bin,
    year_int, month_int and day_int.

    Returns the list of entries that were stored.
    """
    tradeable = [x for x in pairList if type(x) is list]

    rc = RiakClient(pb_port=8087, protocol='pbc')
    bucket = rc.bucket('tradeEntries')
    for pair in tradeable:
        stockA = str(pair[0])
        stockB = str(pair[2])

        key = stockA + '_' + stockB
        val = {'StockA': pair[0],
               'CloseA': pair[1],
               'StockB': pair[2],
               'CloseB': pair[3],
               'ZScore': pair[4],
               'Beta': pair[5],
               'SignalMean': pair[6],
               'SignalSD': pair[7],
               'Date': pair[8]}
        myDate = pair[8].split('-')
        obj = RiakObject(rc, bucket, key)
        obj.add_index("stocka_bin", stockA)
        #index the ticker of stock B (pair[2]); pair[3] is its closing price
        obj.add_index("stockb_bin", stockB)
        obj.add_index("year_int", int(myDate[0]))
        obj.add_index("month_int", int(myDate[1]))
        obj.add_index("day_int", int(myDate[2]))
        obj.content_type = 'text/json'
        obj.data = json.dumps(val)
        obj.store()

    return tradeable
       
#return [1, params] if the two series are cointegrated, [0, params] if they
#are not, and [2,0,0] if either series is empty
def eg_test(y, x):
    """Two-step cointegration check: regress y on x, then ADF-test the residuals.

    y is regressed on x by OLS after add_constant has added a constant column
    to x. The residuals are passed to adfuller (maxlag=0, autolag=None,
    regression='c') and the test statistic is compared with the 1% critical
    value adfuller reports.

    Returns [1, params] when the statistic is below that critical value,
    [0, params] otherwise, where params are the fitted OLS parameters
    (pairAnalysis reads params[0] as the intercept and params[1] as the
    slope). Returns [2,0,0] when either input is empty.
    """
    #`or`, not `|`: "len(y) == 0 | len(x) == 0" parses as the chained
    #comparison len(y) == (0 | len(x)) == 0, which is only true when BOTH
    #inputs are empty
    if len(y) == 0 or len(x) == 0:
        return [2,0,0]

    x = stat.add_constant(x)
    result = stat.OLS(y, x).fit()
    regPar = result.params
    #with regresults=True adfuller returns (statistic, p-value, critical values, result store)
    adfResults = ts.adfuller(result.resid, maxlag=0, regression='c', autolag=None, store=False, regresults=True)
    tstat = adfResults[0]
    critVal = adfResults[2]['1%']

    #a statistic below the critical value counts as cointegrated
    if tstat < critVal:
        return [1,regPar]
    else:
        return [0,regPar]
    
def riakGetStock(searchVal):
    """Load all stored rows of one ticker from the 'stocks' bucket.

    searchVal is matched against the ticker_bin secondary index. Every
    matching object is fetched and its data, a JSON string, is decoded.

    Returns a list of [close, volume, date] rows, with date as a str, in the
    order the index returned the keys.
    """
    stockBucket = RiakClient(pb_port=8087, protocol='pbc').bucket('stocks')
    rows = []
    for riakKey in stockBucket.get_index('ticker_bin', searchVal):
        record = json.loads(stockBucket.get(riakKey).data)
        row = [record['CLOSE'], record['VOLUME'], str(record['DATE'])]
        rows.append(row)
    return rows

def mySort(s):
    """Return a new list with the rows of s ordered by their third field, largest first.

    The input is not modified. Rows with equal third fields keep their
    original relative order.
    """
    return sorted(s, key=lambda row: row[2], reverse=True)

def myFilter(s,n):
    """Return the first n items of the sequence s as a new list."""
    leading = s[0:n]
    return list(leading)