In [None]:
from input_proc import *
from p1_funs import persist_rets_bin, load_rets_from_bin, floatr_to_uint_minret, uint_ret_offs_tofloatr

# Working directory captured once at start-up; later cells build their output paths from it.
# NOTE: despite its name, `fname` holds a directory, not a file name.
fname = Path.cwd()




In [None]:
# Answer to the question "how to store stock returns in a file":
# without losing precision, the best option in terms of size is a compressed numpy array.
# Column names and indices can be stored separately, e.g. in further arrays inside the compressed container.
#
# The target is built with Path so that the separator is right on every OS; a literal '\'
# glued onto the directory string is only a path separator on Windows.
persist_rets_bin(str(Path(fname, 'floatGzipped')), xret)  # 155 MB

In [None]:
# Other storage formats; some are optimal for particular tasks,
# e.g. columnar storage for PySpark jobs.
import pyarrow as pa
import pyarrow.parquet as pq

# pandas DataFrame -> PyArrow Table, which is then written out as Parquet files.
table = pa.Table.from_pandas(xret)

# Author's size notes:
#   * default settings: already less than pkl, only 20% more than the compressed numpy array
#   * gzip: only 3% improvement over the default file; zstd the same, no improvement with brotli either
# NOTE(review): the default write is described as "uncompressed" in the original notes, but
# pyarrow compresses with snappy unless told otherwise -- confirm what the baseline really was.
for parquet_name, write_opts in (
    ('xret.parquet', {}),
    ('xretgzipped.parquet', {'compression': 'gzip'}),
):
    pq.write_table(table, parquet_name, **write_opts)


# A quick search of compressors suggested blosc2, but it is a suboptimal fit here
# because it requires C-arrays, hence the np.ascontiguousarray conversions below.
import blosc2 as bl2

# Pack the returns into a single buffer (author's note: incl. clvl9 and shuffle).
compressed = bl2.pack_array(np.ascontiguousarray(xret))

# Save the compressed data to a file; the result is > 200 MB.
Path('cr_compressed_blosc.b2frame').write_bytes(compressed)

# Plain compress2 on `arr` (author's note: fails with a Fortran array).
# NOTE(review): `arr` is not defined in the visible cells -- presumably it comes from
# `input_proc`; verify.
blarr = bl2.compress2(np.ascontiguousarray(arr))
# savedsz = bl2.save_array(blarr, str(Path(fname, 'cr_compressed_blosc.bl2')), mode='w')
# The line above works but floods the console; anyway worse than npz (162 MB), so disappointing.


In [None]:
# h5py is also a good option, but not as good as the compressed numpy array;
# left here for reference, so that all tried options are listed.
import h5py

# gzip level 9 with the shuffle filter enabled. Enabling shuffling can potentially improve the
# compression ratio without modifying the original data: after decompression the data is the
# same as the original. Result: 177 MB, rather slow; xret or xret.values makes no difference.
# Variants that were tried instead of the call below:
#   * the same call without shuffle=True -- same size
#   * compression="SZIP" / "ZFP" / "LZF" / "LZMA", each with compression_opts=9
with h5py.File(str(fname / 'crDF_compressed9.h5'), "w") as f:
    dset_coefs = f.create_dataset(
        "a",
        data=xret,
        compression="gzip",
        compression_opts=9,
        shuffle=True,
    )


In [None]:
# Instead of trying more compression tools (incl. lossy ones), we can do our own simplistic one.
# Reducing precision to float32 would have been too simple, but it does decrease the size
# (nobs * ncomps * 8 bytes for float64) from 232 to 160 MB.
persist_rets_bin(
    fname / ('f32' + 'gzipped'),
    xdf.astype(np.float32),
)  # 80MB, half the f64 size as expected


In [None]:
# Also, we can use the min and max values of the data to create a custom dtype, knowing the
# range of our values -- but this is more like data-mining.
# More 'theory'-solid is to make use of the fact that stock returns are > -1
# (a price can't go below 0).
#
# numpy.uint16 covers 0 to 65535, whereas uint8 covers 0 to 255
# (not enough for either company ids or dates).
#
# 32-bit floating-point values cover the range from 1.175494351 * 10^-38 to 3.40282347 * 10^+38,
# which is too much for stock returns, so we may want to limit the negative bound to -1 and
# (questionably) sacrifice precision to 4 or 5 digits after the decimal point. In practice
# it depends on the upper bound, theoretically unlimited for stock returns, but with some
# assumptions even np.uint64 with 19 digits will have a smaller output size than float64
# (see at the bottom).
#
# To sum up, the method: use integers to count the number of steps (e.g. 0.0001 step size for
# 4 digits after the decimal point) from the minimal value, which for stock returns is -1.
# So for the above we can use uint16 given our data, but it is best to use uint32 to be on
# the safe side.
#
# A notebook cell displays only its last expression, so every check is gathered into a single
# mapping; written as separate bare statements, all but the final one are silently discarded.
{
    'max': np.nanmax(arr),
    'min': np.nanmin(arr),
    'uint16 max': np.iinfo(np.uint16).max,
    'smallest positive': np.nanmin(arr[arr > 0]),  # > 10**(-10) step size but < even 10**(-9)...
    'largest negative': np.nanmax(arr[arr < 0]),  # similarly, > -10**(-9) but < -10**(-10)
    'min abs': np.nanmin(np.abs(arr)),  # equivalent test for both bounds, can be also inferred from f64 type
    'max abs': np.nanmax(np.abs(arr)),  # 0.09999999999999998, so 5 digits after the decimal point is enough
}


In [None]:
# First, use the defined function to convert float64 to uint16 per this logic, to show how it works.
# Both conversions are returned as one tuple so that each of them is displayed (a cell shows
# only its last expression). Each call receives its own copy of the values, as the range test
# later in this notebook does, so `xret` cannot be altered should the helper work in place.
(
    floatr_to_uint_minret(xret.values.copy(), prec=5, minval=-0.1),  # can use for our random data
    floatr_to_uint_minret(xret.values.copy(), prec=4, minval=-1),  # appropriate defaults for stock returns (more general case)
)

# now, show saving of data:

In [2]:
# Integer-offset encodings at three precisions. Every target is built with Path so that the
# separator is right on every OS; a literal '\' glued onto the directory string is only a
# path separator on Windows.
persist_rets_bin(str(Path(fname, 'uint16offst')), xdf, to_int=np.uint16, prec=4, minval=-1)  # 34 MB is
#  80% reduction in size, but obviously we lose precision, may be outside of the task scope...
persist_rets_bin(Path(fname, 'ui32_p9' + 'gzipped'), xdf, to_int=np.uint32, prec=9)  # 80MB
persist_rets_bin(Path(fname, 'ui64_p19' + 'gzipped'), xdf, to_int=np.uint64, prec=19)  # 146MB

# now showcase restoring values and checking lost precision is as expected (also, covered in unit test):

NameError: name 'persist_rets_bin' is not defined

In [None]:
# Precision the uint16 file is decoded with; one step of the integer grid is 10**(-prec).
# Defined first so that the load call, the rounding and the tolerances all share one value.
prec = 4
stepsz = 10**(-prec)  # fails with smaller rounding, 3 tests below with increasing "difficulty"

xdf2 = load_rets_from_bin('uint16offst_npcomprsd.npz', val_dtype=np.uint16, prec=prec, minval=-1)  # also converts uint16 to float64

errAssertPrec = "reconstructed values differ from original"
# 1) against the original rounded to the stored precision
assert np.nanmax(np.abs(xdf.values.round(prec) - xdf2.values)) < stepsz, errAssertPrec
# 2) against the unrounded original
assert np.nanmax(np.abs(xdf.values - xdf2.values)) < stepsz, errAssertPrec
# 3) element-wise closeness, with NaNs replaced on both sides so they compare equal
assert np.allclose(np.nan_to_num(xdf.values), np.nan_to_num(xdf2.values), atol=stepsz), errAssertPrec

#  may need improvement before production use
# Both figures are returned in one mapping: a cell displays only its last expression, so a
# bare statement placed before it would be computed and then discarded.
{
    'average error': np.nanmean(np.abs(xdf.values - xdf2.values)),
    'worst error': np.nanmax(np.abs(xdf.values - xdf2.values)),
}

In [None]:
# The function now also works with a specified range of values.
# Test of the range functionality: floats between given bounds (e.g. [-1, 1]) are mapped to
# ints [0, 65535] under the assumption of a fixed step.
# NOTE(review): [-0.07, 0.07] is narrower than the [-0.1, ...] bound used for this data in the
# earlier conversion cell -- TODO confirm how values outside the range are handled.
xret3rng = floatr_to_uint_minret(xret.values.copy(), minval=-0.07, maxval=0.07)
xret3_orng = uint_ret_offs_tofloatr(xret3rng, minval=-0.07, maxval=0.07)

# Same round trip with the offset-from-minimum encoding (prec=4, minval=-1) for comparison.
xret3rngu = floatr_to_uint_minret(xret.values.copy(), prec=4, minval=-1)
xret3_orngu = uint_ret_offs_tofloatr(xret3rngu, prec=4, minval=-1)

# Author's observations: strangely, precision on the range is worse than on minret only (same
# with nanmax); the minret variant indeed yields the smallest error; worst case is 0.0001 for both.
# NOTE(review): every error is measured against values rounded to 4 decimals, which may favour
# the prec=4 grid -- confirm before concluding that the range mode is less precise.
# All four metrics are returned in one mapping because a cell displays only its last
# expression; as separate bare statements the first three are computed and then discarded.
{
    'range mean abs error': np.nanmean(np.abs(xret.values.round(4) - xret3_orng)),
    'minret mean abs error': np.nanmean(np.abs(xret.values.round(4) - xret3_orngu)),
    'minret max abs error': np.nanmax(np.abs(xret.values.round(4) - xret3_orngu)),  # 0.0001
    'range max abs error': np.nanmax(np.abs(xret.values.round(4) - xret3_orng)),  # 0.0001
}