In [1]:
%load_ext watermark
%watermark

%load_ext autoreload
%autoreload 2 


# import standard libs
from IPython.display import display
from IPython.core.debugger import set_trace as bp
from pathlib import PurePath, Path
import sys
import time
from collections import OrderedDict as od
import re
import os
import json
os.environ['THEANO_FLAGS'] = 'device=cpu,floatX=float32'

# import python scientific stack
import pandas as pd
import pandas_datareader.data as web
pd.set_option('display.max_rows', 100)
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
pbar = ProgressBar()
pbar.register()
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
from numba import jit
import math
import pymc3 as pm
from theano import shared, theano as tt

# import visual tools
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
import seaborn as sns

plt.style.use('seaborn-talk')
plt.style.use('bmh')

#plt.rcParams['font.family'] = 'DejaVu Sans Mono'
#plt.rcParams['font.size'] = 9.5
plt.rcParams['font.weight'] = 'medium'
#plt.rcParams['figure.figsize'] = 10,7
blue, green, red, purple, gold, teal = sns.color_palette('colorblind', 6)

# import util libs
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm, tqdm_notebook
import warnings
warnings.filterwarnings("ignore")
import missingno as msno

from src.utils.utils import *
from src.features.bars import get_imbalance
import src.features.bars as brs
import src.features.snippets as snp

RANDOM_STATE = 777


print()
%watermark -p pandas,pandas_datareader,dask,numpy,pymc3,theano,sklearn,statsmodels,scipy,matplotlib,seaborn,pyarrow,fastparquet


2019-01-18T13:10:05+05:00

CPython 3.7.1
IPython 7.2.0

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 15.6.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

pandas 0.23.4
pandas_datareader 0.7.0
dask 1.0.0
numpy 1.15.4
pymc3 3.6
theano 1.0.3
sklearn 0.20.1
statsmodels 0.9.0
scipy 1.1.0
matplotlib 3.0.2
seaborn 0.9.0
pyarrow 0.11.1
fastparquet not installed


In [2]:
infp=PurePath('data//processed//clean_IVE_fut_prices.parq')
df = pd.read_parquet(infp)
cprint(df)

-------------------------------------------------------------------------------
dataframe information
-------------------------------------------------------------------------------
                      price     bid     ask   size      v          dv
dates                                                                
2019-01-11 15:59:58  104.84  104.83  104.85    130    130    13629.20
2019-01-11 16:00:00  104.84  104.83  104.84  11407  11407  1195909.88
2019-01-11 16:10:00  104.84  104.78  105.04      0      0        0.00
2019-01-11 18:30:00  104.84  104.24  105.17      0      0        0.00
2019-01-11 20:00:00  104.84  104.46  105.06      0      0        0.00
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1542701 entries, 2009-09-28 09:30:00 to 2019-01-11 20:00:00
Data columns (total 6 columns):
price    1542701 non-null float64
bid      1542701 non-null float64
ask      1542701 non-null float64
size     1542701 non-null int6

In [5]:
df.head()

Unnamed: 0_level_0,price,bid,ask,size,v,dv
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-09-28 09:30:00,50.79,50.7,50.79,100,100,5079.0
2009-09-28 09:30:00,50.71,50.7,50.79,638,638,32352.98
2009-09-28 09:31:32,50.75,50.75,50.76,100,100,5075.0
2009-09-28 09:31:33,50.75,50.72,50.75,100,100,5075.0
2009-09-28 09:31:50,50.75,50.73,50.76,300,300,15225.0


In [41]:
T = 1000
date = '2009-01-01'
b_t_series = [1]*T
price_series = df[:]['price'].values[:T]


for i in range(1, len(price_series)-1):
    
    if price_series[i] - price_series[i-1] == 0:
        b_t_series[i] = b_t_series[i-1]
    elif price_series[i] - price_series[i-1] !=0:
        b_t_series[i] = abs(price_series[i] - price_series[i-1])/(price_series[i] - price_series[i-1])

print(b_t_series)
theta_t = np.cumsum(b_t_series) #tick imbalance at time t
print(theta_t)
    

[1, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1.0, 1.0, 1.0, -1.0, -1.0, 1.0, -1.0, -1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 1.0, -1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0, -1.0, -1

In [42]:
len(b_t_series), len(price_series)

(1000, 1000)

In [51]:
df1 = pandas.DataFrame(b_t_series)

In [53]:
df1.ewma()

AttributeError: 'DataFrame' object has no attribute 'ewma'

In [58]:
a = df1.ewm(0.98)

In [61]:
a.arg

AttributeError: 'EWM' object has no attribute 'arg'