In [3]:
!pip3 install zipline==1.3.0

Collecting zipline==1.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/be/59/8c5802a7897c1095fdc409fb557f04df8f75c37174e80d2ba58c8d8a6488/zipline-1.3.0.tar.gz (2.5MB)
[K     |████████████████████████████████| 2.5MB 4.2MB/s 
Collecting Logbook>=0.12.5
[?25l  Downloading https://files.pythonhosted.org/packages/2f/d9/16ac346f7c0102835814cc9e5b684aaadea101560bb932a2403bd26b2320/Logbook-1.5.3.tar.gz (85kB)
[K     |████████████████████████████████| 92kB 6.0MB/s 
Collecting requests-file>=1.4.1
  Downloading https://files.pythonhosted.org/packages/77/86/cdb5e8eaed90796aa83a6d9f75cfbd37af553c47a291cd47bc410ef9bdb2/requests_file-1.5.1-py2.py3-none-any.whl
Collecting pandas<=0.22,>=0.18.1
[?25l  Downloading https://files.pythonhosted.org/packages/da/c6/0936bc5814b429fddb5d6252566fe73a3e40372e6ceaf87de3dec1326f28/pandas-0.22.0-cp36-cp36m-manylinux1_x86_64.whl (26.2MB)
[K     |████████████████████████████████| 26.3MB 64.2MB/s 
Collecting cyordereddict>=0.2.2
[?25l  Downloading

In [14]:
import zipline

from collections import OrderedDict
import numpy as np

import pandas as pd
from os import listdir
import sys

import os

from zipline.data import bundles
from zipline.pipeline import Pipeline
from zipline.utils.calendars import get_calendar
from zipline.pipeline.engine import SimplePipelineEngine
from zipline.pipeline.factors import CustomFactor, DailyReturns, AverageDollarVolume


from zipline.pipeline.data import USEquityPricing
from zipline.pipeline.loaders import USEquityPricingLoader
from zipline.assets._assets import Equity
from zipline.api import symbol

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Root folder (on the mounted Google Drive) used for zipline data in this notebook.
# Later cells build the SF1.npy path as zipline_dir + '/data/' + 'SF1.npy'.
zipline_dir = '/content/drive/MyDrive/abnormal-distribution-project-data/zipline'
# Export the folder through the ZIPLINE_ROOT environment variable; later cells
# pass os.environ to bundles.load().
# NOTE(review): this cell uses `os`, so the import cell must have been run first.
os.environ['ZIPLINE_ROOT'] = zipline_dir

Ingest Zipline

In [None]:

#!zipline ingest -b 'sep'

In [3]:


# Column names, in order, for the per-security metadata tuples that
# from_sep_dump() collects and hands to asset_db_writer.write(equities=...).
METADATA_HEADERS = ['start_date', 'end_date', 'auto_close_date',
                    'symbol', 'exchange', 'asset_name']


def check_for_abnormal_returns(df, thresh=3.0):
    """Checks to see if any days have abnormal returns.

    Nothing is returned or raised: when at least one close-to-close return is
    greater than ``thresh``, the ticker of the first row and the offending
    returns are written to stderr.

    :param df: DataFrame with a 'close' column and a 'ticker' column.
    :param thresh: fractional return above which a day is reported
        (3.0 means a gain of more than 300%).
    """
    returns = df['close'].pct_change()
    abnormal_rets = returns[returns > thresh]
    if abnormal_rets.shape[0] > 0:
        # .iloc[0] is the explicit "first row" lookup; the .ix indexer used
        # here before is deprecated and absent from current pandas releases.
        sys.stderr.write('Abnormal returns for: {}\n'.format(df.iloc[0]['ticker']))
        sys.stderr.write('{}\n'.format(str(abnormal_rets)))


def from_sep_dump(file_name, start=None, end=None):
    """
    Build a zipline bundle ``ingest`` function for a Sharadar SEP CSV dump.

    Expected CSV layout (a ``closeunadj`` column must also be present, since
    the code below drops it by name):

    ticker,date,open,high,low,close,volume,dividends,lastupdated
    A,2008-01-02,36.67,36.8,36.12,36.3,1858900.0,0.0,2017-11-01

    To use this make your ~/.zipline/extension.py look similar to this:

    from zipline.data.bundles import register
    from alphacompiler.data.loaders.sep_quandl import from_sep_dump

    register("sep",
         from_sep_dump("/path/to/your/SEP/dump/SHARADAR_SEP_69.csv"),)

    :param file_name: path of the SEP CSV dump, read when ``ingest`` runs.
    :param start: only forwarded as the default of ``ingest``'s ``start``
        argument; it is not used to filter the data.
    :param end: only forwarded as the default of ``ingest``'s ``end``
        argument; it is not used to filter the data.
    :return: the ``ingest`` callable to pass to the bundle ``register`` call.
    """
    us_calendar = get_calendar("NYSE").all_sessions
    # ticker -> sid, filled while packing prices and reused for the dividends
    ticker2sid_map = {}

    def ingest(environ,
               asset_db_writer,
               minute_bar_writer,  # unused
               daily_bar_writer,
               adjustment_writer,
               calendar,
               cache,
               show_progress,
               output_dir,
               # pass these as defaults to make them 'nonlocal' in py2
               start=start,
               end=end):

        print("starting ingesting data from: {}".format(file_name))

        # read in the whole dump (will require ~7GB of RAM)
        df = pd.read_csv(file_name, index_col='date',
                         parse_dates=['date'], na_values=['NA'])

        # Dividend history: take the rows with a non-zero dividend now, while
        # every column is still present, instead of reading the whole CSV a
        # second time further down.
        dfd = df[df["dividends"] != 0.0]
        dfd = dfd.dropna()

        # drop unused columns, dividends were captured above
        df = df.drop(['lastupdated', 'dividends', 'closeunadj'], axis=1)

        # counter of valid securites, this will be our primary key
        sec_counter = 0
        data_list = []  # list to send to daily_bar_writer
        metadata_list = []  # list to send to asset_db_writer (metadata)

        # iterate over all the unique securities and pack data, and metadata
        # for writing
        for tkr, df_tkr in df.groupby('ticker'):
            df_tkr = df_tkr.sort_index()

            # first row by position (.ix is deprecated in pandas)
            row0 = df_tkr.iloc[0]  # get metadata from row

            print(" preparing {}".format(row0["ticker"]))
            check_for_abnormal_returns(df_tkr)

            # check to see if there are missing dates in the middle
            this_cal = us_calendar[(us_calendar >= df_tkr.index[0]) & (us_calendar <= df_tkr.index[-1])]
            if len(this_cal) != df_tkr.shape[0]:
                print('MISSING interstitial dates for: %s using forward fill' % row0["ticker"])
                print('number of dates missing: {}'.format(len(this_cal) - df_tkr.shape[0]))
                # reindex onto every calendar session in range, then carry the
                # previous row forward into the gaps
                df_desired = pd.DataFrame(index=this_cal.tz_localize(None))
                df_desired = df_desired.join(df_tkr)
                df_tkr = df_desired.ffill()

            # update metadata; 'start_date', 'end_date', 'auto_close_date',
            # 'symbol', 'exchange', 'asset_name'
            metadata_list.append((df_tkr.index[0],
                                  df_tkr.index[-1],
                                  df_tkr.index[-1] + pd.Timedelta(days=1),
                                  row0["ticker"],
                                  "SEP",  # all have exchange = SEP
                                  row0["ticker"]  # TODO: can we delete this?
                                  )
                                 )

            # drop metadata columns
            df_tkr = df_tkr.drop(['ticker'], axis=1)

            # pack data to be written by daily_bar_writer
            data_list.append((sec_counter, df_tkr))
            ticker2sid_map[tkr] = sec_counter  # record the sid for use later
            sec_counter += 1

        print("writing data for {} securities".format(len(metadata_list)))
        daily_bar_writer.write(data_list, show_progress=False)

        # write metadata
        asset_db_writer.write(equities=pd.DataFrame(metadata_list,
                                                    columns=METADATA_HEADERS))
        print("a total of {} securities were loaded into this bundle".format(
            sec_counter))

        # the dump carries a single date per dividend row, so it is used for
        # all four dividend date columns
        dfd.loc[:, 'ex_date'] = dfd.loc[:, 'record_date'] = dfd.index
        dfd.loc[:, 'declared_date'] = dfd.loc[:, 'pay_date'] = dfd.index
        dfd.loc[:, 'sid'] = dfd.loc[:, 'ticker'].apply(lambda x: ticker2sid_map[x])
        dfd = dfd.rename(columns={'dividends': 'amount'})
        dfd = dfd.drop(['open', 'high', 'low', 'close', 'volume', 'lastupdated', 'ticker', 'closeunadj'], axis=1)

        # # format dfd to have sid
        adjustment_writer.write(dividends=dfd)

    return ingest

In [4]:

def register_data(start_date, end_date, bundle_name, address):
    """Register a csvdir-backed daily equities bundle on the NYSE calendar.

    :param start_date: first session of the bundle; anything ``pd.Timestamp``
        accepts, interpreted as UTC.
    :param end_date: last session of the bundle; same format as ``start_date``.
    :param bundle_name: name under which the bundle is registered.
    :param address: directory handed to ``csvdir_equities`` as the CSV location.
    """
    start_session = pd.Timestamp(start_date, tz='utc')
    end_session = pd.Timestamp(end_date, tz='utc')

    # The bare names `register` and `csvdir_equities` are never imported in
    # this notebook; go through the imported `bundles` package, the same way
    # the other cells call bundles.register / bundles.csvdir.csvdir_equities.
    bundles.register(
        bundle_name,
        bundles.csvdir.csvdir_equities(['daily'], address),
        calendar_name='NYSE',
        start_session=start_session,
        end_session=end_session,
    )


class PricingLoader(object):
    """Serves one shared USEquityPricingLoader for every USEquityPricing column."""

    def __init__(self, bundle_data):
        """Build the loader from the bundle's daily-bar and adjustment readers."""
        daily_bars = bundle_data.equity_daily_bar_reader
        adjustments = bundle_data.adjustment_reader
        self.loader = USEquityPricingLoader(daily_bars, adjustments)

    def get_loader(self, column):
        """Return the shared loader; raise if ``column`` is not a USEquityPricing column."""
        if column in USEquityPricing.columns:
            return self.loader
        raise Exception('Column not in USEquityPricing')

def build_pipeline_engine(bundle_data, trading_calendar):
    """Create a SimplePipelineEngine over ``bundle_data``.

    Pricing columns are resolved through a PricingLoader, the engine calendar
    is ``trading_calendar.all_sessions`` and assets come from the bundle's
    asset finder.
    """
    return SimplePipelineEngine(
        get_loader=PricingLoader(bundle_data).get_loader,
        calendar=trading_calendar.all_sessions,
        asset_finder=bundle_data.asset_finder,
    )

# Loading stock list from file
def stock_list(file_name):
    """Read a text file with one entry per line and return the entries as a list.

    :param file_name: path of the text file to read.
    :return: list of strings, one per line, without the line terminator.
    """
    with open(file_name, 'r') as f:
        # Strip only the newline. Slicing with line[:-1] removed a real
        # character from a final line that has no trailing newline.
        return [line.rstrip('\n') for line in f]

def get_universe_tickers(engine, universe, end_date):
    """Run a screen-only pipeline for the single day ``end_date`` and return
    the values of the second index level of the result as a list."""
    universe_end_date = pd.Timestamp(end_date, tz='UTC')

    screened = engine.run_pipeline(
        Pipeline(screen=universe),
        universe_end_date,
        universe_end_date,
    )
    asset_level = screened.index.get_level_values(1)

    return asset_level.values.tolist()

In [11]:
def get_tickers_from_bundle(bundle_name):
    """Gets a list of (ticker, sid) tuples for every asset in a given bundle.

    :param bundle_name: name of a registered bundle, loaded with ``os.environ``.
    :return: list of ``(symbol, sid)`` tuples.
    """
    bundle_data = bundles.load(bundle_name, os.environ, None)

    # get a list of all sids
    lifetimes = bundle_data.asset_finder._compute_asset_lifetimes()
    all_sids = lifetimes.sid

    # retrieve all assets in the bundle
    all_assets = bundle_data.asset_finder.retrieve_all(all_sids)

    # Build a real list: the lazy map() returned before could only be
    # iterated once, and a second pass silently produced nothing.
    return [(asset.symbol, asset.sid) for asset in all_assets]


def get_all_assets_for_bundle(bundle_name):
    """For a given bundle get a list of all assets.

    :param bundle_name: name of a registered bundle, loaded with ``os.environ``.
    :return: whatever ``asset_finder.retrieve_all`` returns for every sid.
    """
    # `load` was never imported as a bare name in this notebook; use the
    # imported `bundles` package, as get_tickers_from_bundle does.
    bundle_data = bundles.load(bundle_name, os.environ, None)

    # get a list of all sids
    lifetimes = bundle_data.asset_finder._compute_asset_lifetimes()
    all_sids = lifetimes.sid

    print('all_sids: ', all_sids)

    # retrieve all assets in the bundle
    return bundle_data.asset_finder.retrieve_all(sids=all_sids)


def get_ticker_sid_dict_from_bundle(bundle_name):
    """Return a ``{ticker: sid}`` dict built from the bundle's (ticker, sid) tuples."""
    return {ticker: sid for ticker, sid in get_tickers_from_bundle(bundle_name)}

def pack_sparse_data(N, rawpath, fields, filename):
    """Pack the per-sid CSV files found in ``rawpath`` into one np.recarray
    and persist it to ``filename`` for use by SparseDataFactor.

    Each ``<sid>.csv`` must have a "Date" column (used as the index) plus one
    column per entry of ``fields``. The result has shape ``(N, max_rows + 1)``
    with a 'date' field and one field per entry of ``fields``; unused cells
    stay NaN.
    """
    # one slot per sid; sids without a CSV stay None
    frames_by_sid = [None] * N

    widest = -1
    print("Packing sids")
    csv_names = (name for name in listdir(rawpath) if name.endswith(".csv"))
    for csv_name in csv_names:
        frame = pd.read_csv(os.path.join(rawpath, csv_name),
                            index_col="Date", parse_dates=True).sort_index()
        # the file stem is the sid
        frames_by_sid[int(csv_name.split('.')[0])] = frame

        # width is max number of rows in any file
        widest = max(widest, frame.shape[0])
    print("Finished packing sids")

    # temp workaround for `Array Index Out of Bound` bug
    widest += 1

    # record layout: the date first, then every requested field, all float64
    dtypes = [('date', '<f8')] + [(field, '<f8') for field in fields]

    # NaN-filled backing buffer, viewed as an (N, widest) array of records
    backing = np.full((len(fields) + 1, N, widest), np.nan)
    data = np.recarray(shape=(N, widest), buf=backing, dtype=dtypes)

    # copy every loaded frame into the row of its sid
    for sid, frame in enumerate(frames_by_sid):
        if frame is not None:
            n_rows = frame.index.shape[0]
            data.date[sid, :n_rows] = frame.index
            for field in fields:
                data[field][sid, :n_rows] = frame[field]

    data.dump(filename)  # can be read back with np.load()


def load_sf1(sf1_dir, fields, dimensions=None):
    """
    Loads SF1 data and packs it into SF1.npy under ``zipline_dir``/data.

    One ``<sid>.csv`` is written per ticker of the 'sep' bundle into the
    hard-coded ``stocks_dir``; pack_sparse_data() then packs those files into
    a NumPy recarray that is dumped (pickled) to SF1.npy.

    :param sf1_dir: path of the Sharadar SF1 bulk CSV file (a file, despite the name)
    :param fields: fields to load
    :param dimensions: dimensions to load. One-to-one with fields. If None, assume ARQ if data available,
    ART if not
    """
    stocks_dir = '/content/drive/MyDrive/abnormal-distribution-project-data/stocks/dummy'

    bundles.register('sep', from_sep_dump('.', '.'), )
    # Resolve the ticker -> sid mapping once; every call reloads the bundle.
    tickers = get_ticker_sid_dict_from_bundle('sep')
    num_tickers = len(tickers)
    print('number of tickers: ', num_tickers)

    data = pd.read_csv(sf1_dir)

    # Group once instead of scanning the whole table for every ticker.
    by_ticker = data.groupby('ticker')
    known_tickers = by_ticker.groups
    no_rows = data.iloc[0:0]  # same columns, zero rows: ticker absent from SF1

    for counter, (ticker, sid) in enumerate(tickers.items(), start=1):
        if counter % 100 == 0:
            print("Working on {}-th file".format(counter))

        df = by_ticker.get_group(ticker) if ticker in known_tickers else no_rows
        df = df.rename(columns={'datekey': 'Date'}).set_index('Date')

        if dimensions is None:
            # per field: quarterly (ARQ) values unless every one of them is
            # NaN, in which case fall back to the ART rows
            arq = df[df.dimension == 'ARQ']
            art = df[df.dimension == 'ART']
            series = [art[field] if arq[field].isna().all() else arq[field]
                      for field in fields]
        else:
            series = [df[df.dimension == dimensions[i]][field]
                      for i, field in enumerate(fields)]

        df = pd.concat(series, axis=1)
        df = df.sort_index()
        df.index = df.index.rename('Date')
        df.to_csv(os.path.join(stocks_dir, "{}.csv".format(sid)))

    pack_sparse_data(num_tickers + 1,  # number of tickers in bundle + 1
                     stocks_dir,
                     fields,
                     zipline_dir + '/data/' + 'SF1.npy')  # write directly to the zipline data dir

In [15]:
# Path of the Sharadar SF1 bulk CSV (a file path, despite the `_dir` suffix).
sf1_dir = '/content/drive/MyDrive/abnormal-distribution-project-data/stocks/SHARADAR_SF1.csv'
# SF1 columns to pack. This is the same list as Fundamentals.outputs; keep the two in sync.
fields = ['marketcap', 'assets', 'liabilities', 'pe', 'currentratio', 'netmargin', 'capex', 'fcf', 'roic']
# dimensions=None: per field, load_sf1 uses the ARQ rows unless all of them are NaN, else the ART rows.
load_sf1(sf1_dir, fields, dimensions=None)



number of tickers:  17841
Packing sids
Finished packing sids


In [16]:
from zipline.pipeline.factors import CustomFactor


class SparseDataFactor(CustomFactor):
    """Abstract Base Class to be used for computing sparse data.
    The data is packed and persisted into a NumPy binary data file
    in a previous step.
    This class must be subclassed with class variable 'outputs' set.  The fields
    in 'outputs' should match those persisted.

    Subclasses must also set ``self.N`` (number of rows of the packed array,
    one per sid) and ``self.data_path`` in their ``__init__``; both are read
    by the methods below and neither is given a usable value here."""
    inputs = []
    window_length = 1

    def __init__(self, *args, **kwargs):
        self.time_index = None  # per-sid column index into self.data, built on first compute()
        self.curr_date = None # date for which time_index is accurate
        self.data = None  # packed recarray, loaded lazily in cold_start()
        self.data_path = "please_specify_.npy_file"

    def bs(self, arr):
        """Binary Search.

        Return how many entries of the sorted array ``arr`` are less than or
        equal to ``self.curr_date``. An empty array gives 0 (the hand-written
        recursion used here before raised IndexError on it).
        """
        return int(np.searchsorted(arr, self.curr_date, side='right'))

    def bs_sparse_time(self, sid):
        """For each security find the best range in the sparse data.

        Return the column of the last date that is <= ``self.curr_date``;
        -1 when every date is later, 0 when the sid has no dates at all.
        """
        dates_for_sid = self.data.date[sid]
        if np.isnan(dates_for_sid[0]):
            return 0

        # do a binary search of the dates array finding the index
        # where self.curr_date will lie.
        non_nan_dates = dates_for_sid[~np.isnan(dates_for_sid)]
        return self.bs(non_nan_dates) - 1

    def cold_start(self, today, assets):
        """Load the packed data if needed and build ``time_index`` from scratch."""
        if self.data is None:
            # SECURITY NOTE: allow_pickle=True unpickles the file, which can run
            # arbitrary code; only point data_path at files you produced yourself.
            self.data = np.load(self.data_path, allow_pickle=True)

        # number of columns (time slots) per sid
        self.M = self.data.date.shape[1]

        # for each sid, do binary search of date array to find current index
        # the results can be shared across all factors that inherit from SparseDataFactor
        # this sets an array of ints: time_index
        # Sids that are not in `assets` keep -1.
        self.time_index = np.full(self.N, -1, np.dtype('int64'))
        self.curr_date = today.value
        for asset in assets:  # asset is numpy.int64
            self.time_index[asset] = self.bs_sparse_time(asset)

    def update_time_index(self, today, assets):
        """Ratchet update.
        for each asset check if today >= dates[self.time_index]
        if so then increment self.time_index[asset.sid] += 1

        Every sid advances by at most one column per call; ``assets`` is not
        used, all ``self.N`` sids are updated."""

        # candidate next column for every sid that is not already on the last one
        ind_p1 = self.time_index.copy()
        np.add.at(ind_p1, ind_p1 != (self.M - 1), 1)
        # comparisons against NaN dates are False, so those sids do not advance
        sids_to_increment = today.value >= self.data.date[np.arange(self.N), ind_p1]
        sids_not_max = self.time_index != (self.M - 1)   # create mask of non-maxed
        self.time_index[sids_to_increment & sids_not_max] += 1

        self.curr_date = today.value

    def compute(self, today, assets, out, *arrays):
        """Fill every field of ``out`` with the value at each asset's current column."""
        # for each asset in assets determine index from date (today)
        if self.time_index is None:
            self.cold_start(today, assets)
        else:
            self.update_time_index(today, assets)

        # A time_index of -1 selects the LAST column of the packed array.
        # NOTE(review): presumably that column is the all-NaN padding added by
        # pack_sparse_data (max_len + 1) -- confirm before changing either side.
        ti_used_today = self.time_index[assets]

        for field in self.__class__.outputs:
            out[field][:] = self.data[field][assets, ti_used_today]


class Fundamentals(SparseDataFactor):
    """SparseDataFactor exposing the SF1 fields packed into SF1.npy."""
    outputs = [
        'marketcap', 'assets', 'liabilities',
        'pe', 'currentratio', 'netmargin',
        'capex', 'fcf', 'roic',
    ]

    def __init__(self, *args, **kwargs):
        super(Fundamentals, self).__init__(*args, **kwargs)
        ticker_to_sid = get_ticker_sid_dict_from_bundle("sep")
        # max(sid)+1 get this from the bundle
        self.N = len(ticker_to_sid) + 1
        self.data_path = zipline_dir + '/data/' + 'SF1.npy'

In [28]:
data = pd.read_csv(sf1_dir)

In [32]:
data.columns[0:50]

Index(['ticker', 'dimension', 'calendardate', 'datekey', 'reportperiod',
       'lastupdated', 'accoci', 'assets', 'assetsavg', 'assetsc', 'assetsnc',
       'assetturnover', 'bvps', 'capex', 'cashneq', 'cashnequsd', 'cor',
       'consolinc', 'currentratio', 'de', 'debt', 'debtc', 'debtnc', 'debtusd',
       'deferredrev', 'depamor', 'deposits', 'divyield', 'dps', 'ebit',
       'ebitda', 'ebitdamargin', 'ebitdausd', 'ebitusd', 'ebt', 'eps',
       'epsdil', 'epsusd', 'equity', 'equityavg', 'equityusd', 'ev', 'evebit',
       'evebitda', 'fcf', 'fcfps', 'fxusd', 'gp', 'grossmargin',
       'intangibles'],
      dtype='object')

In [33]:
data.columns[50:]

Index(['intexp', 'invcap', 'invcapavg', 'inventory', 'investments',
       'investmentsc', 'investmentsnc', 'liabilities', 'liabilitiesc',
       'liabilitiesnc', 'marketcap', 'ncf', 'ncfbus', 'ncfcommon', 'ncfdebt',
       'ncfdiv', 'ncff', 'ncfi', 'ncfinv', 'ncfo', 'ncfx', 'netinc',
       'netinccmn', 'netinccmnusd', 'netincdis', 'netincnci', 'netmargin',
       'opex', 'opinc', 'payables', 'payoutratio', 'pb', 'pe', 'pe1',
       'ppnenet', 'prefdivis', 'price', 'ps', 'ps1', 'receivables', 'retearn',
       'revenue', 'revenueusd', 'rnd', 'roa', 'roe', 'roic', 'ros', 'sbcomp',
       'sgna', 'sharefactor', 'sharesbas', 'shareswa', 'shareswadil', 'sps',
       'tangibles', 'taxassets', 'taxexp', 'taxliabilities', 'tbvps',
       'workingcapital'],
      dtype='object')

In [23]:
def run_pipeline(engine, pipeline, start_date, end_date):
    """Run ``pipeline`` on ``engine`` between two dates.

    Both dates are reduced to their calendar day (via ``strftime``) and
    turned into UTC timestamps before being passed to ``engine.run_pipeline``.
    """
    # TODO: adjust for trading days
    end_dt, start_dt = (
        pd.Timestamp(day.strftime('%Y-%m-%d'), tz='UTC')
        for day in (end_date, start_date)
    )
    return engine.run_pipeline(pipeline, start_dt, end_dt)

def get_pipeline_tickers(factors):
    """Return the values of the second level of ``factors``' MultiIndex as a list."""
    asset_level = factors.index.levels[1]
    return asset_level.values.tolist()

def make_pipeline(factors, universe):
    """Build a Pipeline screened by ``universe`` with one column per entry of
    ``factors`` (a name -> factor mapping), keeping the mapping's order."""
    columns = OrderedDict((name, factor) for name, factor in factors.items())
    return Pipeline(screen=universe, columns=columns)


def make_factors():
    """Return the name -> factor mapping used by this notebook.

    Currently a single entry: '1Y_return', a DailyReturns factor built with
    window_length=252.
    """
    one_year_return = DailyReturns(window_length=252)
    return {'1Y_return': one_year_return}

In [21]:

# NYSE calendar; its all_sessions are handed to the pipeline engine below.
trading_calendar = get_calendar('NYSE') 
# NOTE(review): ingest_func is not referenced again anywhere in the visible notebook.
ingest_func = bundles.csvdir.csvdir_equities(['daily'], 'sep')
# Register the 'sep' bundle name with the SEP ingest function before loading it.
# NOTE(review): load_sf1() also registers 'sep'; running both re-registers the name.
bundles.register('sep', from_sep_dump('.'))
bundle_data = bundles.load('sep')
# Engine wired to the bundle's pricing readers and asset finder (see build_pipeline_engine).
engine = build_pipeline_engine(bundle_data, trading_calendar)

  after removing the cwd from sys.path.


In [None]:
# Date range over which the universe screen is evaluated.
start_date = '2016-1-4'
end_date = '2020-5-19'
universe_start_date = pd.Timestamp(start_date, tz='UTC')
universe_end_date = pd.Timestamp(end_date, tz='UTC')
# Select universe of stocks
# Screen = top 500 by marketcap AND top 500 by 120-day average dollar volume.
universe = Fundamentals().marketcap.top(500) & AverageDollarVolume(window_length=120).top(500)
# Screen-only pipeline: no columns are added, so the result carries just the index.
pipeline = Pipeline(screen=universe)
#pipeline.add(Fundamentals().marketcap, 'universe')
all_factors = run_pipeline(engine, pipeline, universe_start_date, universe_end_date)
# Get all tickers for the stocks we're looking at
all_assets = get_pipeline_tickers(all_factors)

In [34]:
all_factors

Unnamed: 0,Unnamed: 1
2016-01-04 00:00:00+00:00,Equity(0 [A])
2016-01-04 00:00:00+00:00,Equity(28 [AAL])
2016-01-04 00:00:00+00:00,Equity(40 [AAPL])
2016-01-04 00:00:00+00:00,Equity(61 [ABBV])
2016-01-04 00:00:00+00:00,Equity(62 [ABC])
2016-01-04 00:00:00+00:00,Equity(80 [ABEV])
2016-01-04 00:00:00+00:00,Equity(119 [ABT])
2016-01-04 00:00:00+00:00,Equity(207 [ACN])
2016-01-04 00:00:00+00:00,Equity(261 [ADBE])
2016-01-04 00:00:00+00:00,Equity(277 [ADI])
