Skip to content

Commit

Permalink
MAINT: Modifies minute bars to use a dict of OHLC ratios (quantopian#…
Browse files Browse the repository at this point in the history
…1428)

For scaling up pricing data before writing to bcolz, the writer now
accepts a dict mapping each sid to the ratio to use. It still accepts a
single ratio as default_ohlc_ratio, which is used as a fallback if no
mapping exists for a given sid. The default is OHLC_RATIO (1000).

This allows better handling of futures pricing data, where the required
precision across root symbols is not consistent.
  • Loading branch information
Andrew Daniels committed Aug 24, 2016
1 parent 20f9241 commit acb2cf1
Showing 1 changed file with 80 additions and 23 deletions.
103 changes: 80 additions & 23 deletions zipline/data/minute_bars.py
Expand Up @@ -24,6 +24,7 @@
import numpy as np
import pandas as pd
from six import with_metaclass
from toolz import keymap, valmap

from zipline.data._minute_bar_internal import (
minute_value,
Expand Down Expand Up @@ -208,7 +209,7 @@ class BcolzMinuteBarMetadata(object):
minutes_per_day : int
The number of minutes per each period.
"""
FORMAT_VERSION = 2
FORMAT_VERSION = 3

METADATA_FILENAME = 'metadata.json'

Expand All @@ -229,7 +230,7 @@ def read(cls, rootdir):
# if version does not match.
version = 0

ohlc_ratio = raw_data['ohlc_ratio']
default_ohlc_ratio = raw_data['ohlc_ratio']

if version >= 1:
minutes_per_day = raw_data['minutes_per_day']
Expand All @@ -254,8 +255,16 @@ def read(cls, rootdir):
raw_data['market_closes'][-1], unit='m', tz='UTC')
)

if version >= 3:
ohlc_ratios_per_sid = raw_data['ohlc_ratios_per_sid']
if ohlc_ratios_per_sid is not None:
ohlc_ratios_per_sid = keymap(int, ohlc_ratios_per_sid)
else:
ohlc_ratios_per_sid = None

return cls(
ohlc_ratio,
default_ohlc_ratio,
ohlc_ratios_per_sid,
calendar,
start_session,
end_session,
Expand All @@ -265,7 +274,8 @@ def read(cls, rootdir):

def __init__(
self,
ohlc_ratio,
default_ohlc_ratio,
ohlc_ratios_per_sid,
calendar,
start_session,
end_session,
Expand All @@ -275,7 +285,8 @@ def __init__(
self.calendar = calendar
self.start_session = start_session
self.end_session = end_session
self.ohlc_ratio = ohlc_ratio
self.default_ohlc_ratio = default_ohlc_ratio
self.ohlc_ratios_per_sid = ohlc_ratios_per_sid
self.minutes_per_day = minutes_per_day
self.version = version

Expand All @@ -288,8 +299,14 @@ def write(self, rootdir):
version : int
The value of FORMAT_VERSION of this class.
ohlc_ratio : int
The factor by which the pricing data is multiplied so that the
float data can be stored as an integer.
The default ratio by which to multiply the pricing data to
convert the floats from floats to an integer to fit within
the np.uint32. If ohlc_ratios_per_sid is None or does not
contain a mapping for a given sid, this ratio is used.
ohlc_ratios_per_sid : dict
A dict mapping each sid in the output to the factor by
which the pricing data is multiplied so that the float data
can be stored as an integer.
minutes_per_day : int
The number of minutes per each period.
calendar_name : str
Expand Down Expand Up @@ -326,7 +343,8 @@ def write(self, rootdir):

metadata = {
'version': self.version,
'ohlc_ratio': self.ohlc_ratio,
'ohlc_ratio': self.default_ohlc_ratio,
'ohlc_ratios_per_sid': self.ohlc_ratios_per_sid,
'minutes_per_day': self.minutes_per_day,
'calendar_name': self.calendar.name,
'start_session': str(self.start_session.date()),
Expand Down Expand Up @@ -365,12 +383,15 @@ class BcolzMinuteBarWriter(object):
The first trading session in the data set.
end_session : datetime
The last trading session in the data set.
ohlc_ratio : int, optional
The ratio by which to multiply the pricing data to convert the
floats from floats to an integer to fit within the np.uint32.
The default is 1000 to support pricing data which comes in to the
thousands place.
default_ohlc_ratio : int, optional
The default ratio by which to multiply the pricing data to
convert from floats to integers that fit within np.uint32. If
ohlc_ratios_per_sid is None or does not contain a mapping for a
given sid, this ratio is used. Default is OHLC_RATIO (1000).
ohlc_ratios_per_sid : dict, optional
A dict mapping each sid in the output to the ratio by which to
multiply the pricing data to convert the floats from floats to
an integer to fit within the np.uint32.
expectedlen : int, optional
The expected length of the dataset, used when creating the initial
bcolz ctable.
Expand Down Expand Up @@ -434,7 +455,8 @@ def __init__(self,
start_session,
end_session,
minutes_per_day,
ohlc_ratio=OHLC_RATIO,
default_ohlc_ratio=OHLC_RATIO,
ohlc_ratios_per_sid=None,
expectedlen=DEFAULT_EXPECTEDLEN):

self._rootdir = rootdir
Expand All @@ -447,13 +469,15 @@ def __init__(self,
self._session_labels = self._schedule.index
self._minutes_per_day = minutes_per_day
self._expectedlen = expectedlen
self._ohlc_ratio = ohlc_ratio
self._default_ohlc_ratio = default_ohlc_ratio
self._ohlc_ratios_per_sid = ohlc_ratios_per_sid

self._minute_index = _calc_minute_index(
self._schedule.market_open, self._minutes_per_day)

metadata = BcolzMinuteBarMetadata(
self._ohlc_ratio,
self._default_ohlc_ratio,
self._ohlc_ratios_per_sid,
self._calendar,
self._start_session,
self._end_session,
Expand All @@ -465,6 +489,17 @@ def __init__(self,
def first_trading_day(self):
return self._start_session

def ohlc_ratio_for_sid(self, sid):
if self._ohlc_ratios_per_sid is not None:
try:
return self._ohlc_ratios_per_sid[sid]
except KeyError:
pass

# If no ohlc_ratios_per_sid dict is passed, or if the specified
# sid is not in the dict, fallback to the general ohlc_ratio.
return self._default_ohlc_ratio

def sidpath(self, sid):
"""
Parameters:
Expand Down Expand Up @@ -772,7 +807,7 @@ def _write_cols(self, sid, dts, cols):
dt_ixs = np.searchsorted(all_minutes_in_window.values,
dts.astype('datetime64[ns]'))

ohlc_ratio = self._ohlc_ratio
ohlc_ratio = self.ohlc_ratio_for_sid(sid)

def convert_col(col):
"""Adapt float column into a uint32 column.
Expand Down Expand Up @@ -832,7 +867,13 @@ def __init__(self, rootdir, sid_cache_size=1000):
self._market_close_values = self._market_closes.values.\
astype('datetime64[m]').astype(np.int64)

self._ohlc_inverse = 1.0 / metadata.ohlc_ratio
self._default_ohlc_inverse = 1.0 / metadata.default_ohlc_ratio
ohlc_ratios = metadata.ohlc_ratios_per_sid
if ohlc_ratios:
self._ohlc_inverses_per_sid = (
valmap(lambda x: 1.0 / x, ohlc_ratios))
else:
self._ohlc_inverses_per_sid = None

self._minutes_per_day = metadata.minutes_per_day

Expand All @@ -856,6 +897,17 @@ def last_available_dt(self):
def first_trading_day(self):
return self._start_session

def _ohlc_ratio_inverse_for_sid(self, sid):
if self._ohlc_inverses_per_sid is not None:
try:
return self._ohlc_inverses_per_sid[sid]
except KeyError:
pass

# If we can not get a sid-specific OHLC inverse for this sid,
# fallback to the default.
return self._default_ohlc_inverse

def _minutes_to_exclude(self):
"""
Calculate the minutes which should be excluded when a window
Expand Down Expand Up @@ -999,8 +1051,9 @@ def get_value(self, sid, dt, field):
return 0
else:
return np.nan

if field != 'volume':
value *= self._ohlc_inverse
value *= self._ohlc_ratio_inverse_for_sid(sid)
return value

def get_last_traded_dt(self, asset, dt):
Expand Down Expand Up @@ -1111,11 +1164,15 @@ def load_raw_arrays(self, fields, start_dt, end_dt, sids):
excl_slice = np.s_[
excl_start - start_idx:excl_stop - start_idx + 1]
values = np.delete(values, excl_slice)

where = values != 0
# first slice down to len(where) because we might not have
# written data for all the minutes requested
out[:len(where), i][where] = values[where]
if field != 'volume':
out *= self._ohlc_inverse
if field != 'volume':
out[:len(where), i][where] = (
values[where] * self._ohlc_ratio_inverse_for_sid(sid))
else:
out[:len(where), i][where] = values[where]

results.append(out)
return results

0 comments on commit acb2cf1

Please sign in to comment.