In [1]:
import numpy as np
import pandas as pd
import gc

from sklearn.linear_model import LinearRegression as LR
from statsmodels.stats.weightstats import DescrStatsW
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib notebook
pd.set_option('display.max_rows', 500)

from valkyrie.tools import HOSTNAME
from valkyrie.securities import stocks_good_dvd, parent
from valkyrie.quants.linear_model import lm_fit, wcorr, analyze_features
from valkyrie.quants.feature_analyzer import FeatureMgr

# Load Data

In [2]:
# Build the feature manager for 20210101..20210630 over the stocks_good_dvd()
# universe, then derive log returns for two price columns.
# Single-name alternative kept for quick experiments:
#feature_mgr = FeatureMgr('20210101', '20210630', ['TWO PRC'] , 'latest')#
feature_mgr = FeatureMgr(
    '20210101',
    '20210630',
    stocks_good_dvd(),
    'eve_features3_2x0_quoter_cooldown_600',
)

# Return horizons: 1, 3, 5 and 10 multiples of 2 * 60 * 6.5 (= 780).
# NOTE(review): presumably 780 samples per trading day — confirm.
ret_ns = [int(multiple * 2 * 60 * 6.5) for multiple in (1, 3, 5, 10)]

# Both return sets share the same s2 column.
s2_col = 'cywa_mkt_s2h'

# Returns of risk_tv.
ret_col = 'risk_tv'
df_tv = feature_mgr.calc_rets('log', ret_col, ret_ns, s2_col)
df_tv['parent'] = df_tv['ticker'].apply(parent)

# Returns of cywa_mkt_xh.
ret_col = 'cywa_mkt_xh'
df_cy = feature_mgr.calc_rets('log', ret_col, ret_ns, s2_col)
df_cy['parent'] = df_cy['ticker'].apply(parent)

gc.collect()

0

# Checking

### against nbbo

In [None]:
# For risk_tv and every cywa_*_tv / tv column, report the fraction of rows in
# which the value sits at or below the bid, and at or above the ask.
hls = [1, 60, 600, 3600]
print('Within bbo %')

# tv-like columns: contain both 'cywa_' and '_tv', excluding the pff_ variants.
tv_cols = [
    name for name in df_tv
    if 'cywa_' in name and '_tv' in name and 'pff_' not in name
]
tv_cols.append('tv')

n_rows = df_tv.shape[0]
nbbo_rows = []
for col in ['risk_tv'] + tv_cols:
    nbbo_rows.append({
        'name': col,
        'below': df_tv.eval(f'{col} <= bid ').sum() / n_rows,
        'above': df_tv.eval(f'{col} >= ask ').sum() / n_rows,
    })

df_against_nbbo = pd.DataFrame(nbbo_rows).set_index('name')
print(df_against_nbbo)

### tv stats

In [None]:
# Store the gap between every tv column and risk_tv as drt_<column>.
for tv_name in tv_cols:
    df_tv[f'drt_{tv_name}'] = df_tv.eval(f'{tv_name} - risk_tv')

dtv_cols = [name for name in df_tv if 'drt_' in name]
cols = tv_cols + dtv_cols

# Summary table: std of row-to-row differences next to describe() output
# (1% / 50% / 99% percentiles), one row per column.
diff_std = pd.DataFrame(df_tv[cols].diff().std(), columns=['diff_std'])
described = df_tv[cols].describe(percentiles=[0.01, 0.5, 0.99]).T
pd.concat([diff_std, described], axis=1)

### dtv corr

In [None]:
# Correlation of each drt_ column with the risk_tv returns and with the other
# drt_ columns; only the rows whose label contains "drt_" are shown.
ret_cols = [f'risk_tv_ret_{horizon}_n' for horizon in ret_ns]
dtv_cols = [name for name in df_tv if 'drt_' in name]
cols = dtv_cols + ret_cols

corr_matrix = df_tv[cols].corr()
corr_matrix[ret_cols + dtv_cols].query('index.str.contains("drt_")')

In [None]:
# Weighted correlation between drt_tv and each risk_tv return horizon, using
# the matching wgt_ column as observation weights.
wcoeff = {}
for horizon in ret_ns:
    ret_name = f'risk_tv_ret_{horizon}_n'
    pair = df_tv[['drt_tv', ret_name]].values
    weighted = DescrStatsW(pair, weights=df_tv[f'wgt_{ret_name}'].values)
    # Off-diagonal entry of the 2x2 weighted correlation matrix.
    wcoeff[f'drt_tv_vs_{ret_name}'] = weighted.corrcoef[0, 1]

pd.DataFrame(wcoeff, index=['corr'])

# Check by name

In [None]:
# Bid / ask / risk_tv for AGNCM on 20210104.
agncm_day = df_tv.query('ticker == "AGNCM" and date == "20210104"')
agncm_day[['bid','ask','risk_tv']].plot()

In [None]:
# Dump the first 5000 AGNCM rows (bid, ask, tv and every cywa_10 column) to CSV.
# NOTE(review): the output path is a hard-coded absolute local path — consider
# making it configurable.
export_cols = 'bid ask tv'.split() + [name for name in df_tv if 'cywa_10' in name]
agncm_rows = df_tv.query('ticker == "AGNCM"')
agncm_rows[export_cols].head(5000).to_csv('/home/bb/tmp/tmp.csv')

In [None]:
# Bid / ask / tv for the first 5000 "TWO PRC" rows.
two_prc_rows = df_tv.query('ticker == "TWO PRC"')
two_prc_rows['bid ask tv'.split()].head(5000).plot(alpha = 0.7)

In [None]:
# "BAC PRB" between 20210428 and 20210429: one figure for cywa_mkt_s2h and one
# for bid / ask / risk_tv. The row selection is identical for both plots.
bac_prb_window = 'ticker == "BAC PRB" and "20210428" <= date <= "20210429"'
for plot_cols in (['cywa_mkt_s2h'], ['bid','ask','risk_tv']):
    df_tv.query(bac_prb_window)[plot_cols].plot(alpha = 0.7)

# Horizon analysis

In [None]:
# Weighted correlation of the mean-reversion signals against cywa_mkt_xh
# returns at every horizon in ret_ns.
half_lives = 'short mid long vlong vvlong vvvlong'.split()

# Build the signal columns (cywa_<hl>_xh minus cywa_mkt_xh) here so the cell
# runs on a fresh kernel instead of relying on columns created elsewhere.
# Re-assigning the same values is idempotent.
for hl in half_lives:
    df_cy[f'mean_revsion_{hl}'] = df_cy.eval(f'cywa_{hl}_xh - cywa_mkt_xh')

xcols = [f'mean_revsion_{hl}' for hl in half_lives]
ycols = [f'cywa_mkt_xh_ret_{hz}_n' for hz in ret_ns]
wcols = [f'wgt_cywa_mkt_xh_ret_{hz}_n' for hz in ret_ns]

wcorr(df_cy, xcols, ycols, wcols)

In [None]:
# Per-signal weighted correlation against each return horizon, with a
# signal-specific weight 1 / (cywa_<hl>_s2h + cywa_mkt_s2h).
half_lives = 'short mid long vlong vvlong vvvlong'.split()

# The feature and weight columns do not depend on the horizon, so they are
# built once up front rather than once per horizon.
for hl in half_lives:
    df_cy[f'feature_mean_revsion_{hl}'] = df_cy.eval(f'cywa_{hl}_xh - cywa_mkt_xh')
    df_cy[f'wgt_mean_revsion_{hl}'] = df_cy.eval(f'1.0 / (cywa_{hl}_s2h + cywa_mkt_s2h)')

# Horizons come from ret_ns (780, 2340, 3900, 7800) so this cell stays in sync
# with the return columns computed at load time.
for hz in ret_ns:
    for hl in half_lives:
        res = wcorr(
            df_cy,
            [f'feature_mean_revsion_{hl}'],
            [f'cywa_mkt_xh_ret_{hz}_n'],
            [f'wgt_mean_revsion_{hl}'],
        )
        display(res)

In [None]:
# One mean_revsion_<half-life> column per half-life: cywa_<hl>_xh - cywa_mkt_xh.
half_life_names = ('short', 'mid', 'long', 'vlong', 'vvlong', 'vvvlong')
for hl in half_life_names:
    signal_expr = f'cywa_{hl}_xh - cywa_mkt_xh'
    df_cy[f'mean_revsion_{hl}'] = df_cy.eval(signal_expr)

# Combined Analysis

In [None]:
def analyze_features(df, exclude_features, xcols, ycols, wcols):
    """Break the fitted r2 down by feature group.

    Note: this definition shadows the ``analyze_features`` imported from
    ``valkyrie.quants.linear_model`` at the top of the notebook.

    Parameters
    ----------
    df : frame handed unchanged to ``lm_fit``.
    exclude_features : iterable of str
        Column-name suffixes. Every entry selects the columns of ``xcols``
        whose name ends with it.
    xcols, ycols, wcols : list of str
        Regressor, target and weight column names forwarded to ``lm_fit``.

    Returns
    -------
    pandas.DataFrame
        Column ``'all'`` holds the r2 of the full fit. For each suffix ``ef``,
        ``'frm <ef>'`` is the r2 lost when the matching columns are removed and
        ``'<ef>'`` is the r2 obtained from the matching columns alone.
    """
    r2_all = lm_fit(df, xcols, ycols, wcols).loc['r2']
    res = {'all': r2_all}
    for ef in exclude_features:
        kept_cols = [c for c in xcols if not c.endswith(ef)]
        # `ef` is a suffix, so the stand-alone fit must use the columns that
        # match it rather than the suffix itself as a column name. If nothing
        # in xcols matches, fall back to treating `ef` as a literal column.
        own_cols = [c for c in xcols if c.endswith(ef)] or [ef]
        # r2 given up by removing the group.
        res[f'frm {ef}'] = r2_all - lm_fit(df, kept_cols, ycols, wcols).loc['r2']
        # r2 achieved by the group on its own.
        res[ef] = lm_fit(df, own_cols, ycols, wcols).loc['r2']
    return pd.DataFrame(res)

In [None]:
# Restrict to rows where at least one of the three signals exceeds `th` in
# absolute value, then break the r2 down by feature group.
# NOTE(review): feature_long_dpff and feature_mean_revsion_vlong are assigned
# in other cells of this notebook — confirm they exist before this cell runs.
th = 1e-4
df = df_cy.query('abs(feature_cluster) > @th or abs(feature_long_dpff) > @th or abs(feature_mean_revsion_vlong) > @th')

exclude_features = ['_cluster', '_dpff', '_vlong']
xcols = [name for name in df if 'feature_' in name]
ycols = [f'cywa_mkt_xh_ret_{horizon}_n' for horizon in ret_ns]
wcols = [f'wgt_{target}' for target in ycols]

res = analyze_features(df, exclude_features, xcols, ycols, wcols)
display(res)

In [None]:
# Every df_cy column whose name contains 'feature_'.
list(filter(lambda name: 'feature_' in name, df_cy.columns))

# Combine Features

In [20]:
# Fit the 3900-step cywa_mkt_xh return on two pff-minus-own price gaps
# (long half-life and mkt), weighted by the inverse of the summed s2 columns.
cywa_hl = 'long'
yhz = 780 * 5

# Expression strings evaluated against df_cy, keyed by destination column.
# Insertion order matters: 'wgt' reads the 'total_s2' column written just before it.
derived_exprs = {
    'feature_long_dpff': f'pff_cywa_{cywa_hl}_xh - cywa_{cywa_hl}_xh',
    'feature_mkt_dpff': f'pff_cywa_mkt_xh - cywa_mkt_xh ',
    'total_s2': f'cywa_mkt_s2h + cywa_{cywa_hl}_s2h',
    'wgt': '1.0 / total_s2',
}
for target_col, expr in derived_exprs.items():
    df_cy[target_col] = df_cy.eval(expr)

# Row filter on the gap between the two features; the threshold is currently
# 0.0 * 1.0e-4, i.e. zero.
df_fit = df_cy.query('abs(feature_long_dpff - feature_mkt_dpff) >= 0.0 * 1.0e-4').copy()
#print(f'{df_fit.shape[0]/df.shape[0]}')

# Alternative regressor sets tried earlier:
#xcols = [c for c in df_fit if 'feature_' in c]
#xcols = ['feature_cluster']
xcols = ['feature_long_dpff', 'feature_mkt_dpff']
ycols = [f'cywa_mkt_xh_ret_{yhz}_n']
res = lm_fit(df_fit, xcols, ycols, ['wgt'])
res

Unnamed: 0,cywa_mkt_xh_ret_3900_n
r2,0.014383
feature_long_dpff,-8.028215
feature_mkt_dpff,8.106622
model,Linefit_False_q=0.03


In [33]:
# Remove the 'feature_log_pff' column if present. errors='ignore' keeps the
# cell re-runnable (and runnable on a fresh kernel) when the column is absent,
# instead of raising KeyError.
df_cy.drop('feature_log_pff', axis = 1, inplace=True, errors='ignore')

In [83]:
# Same regression as the difference version, but on log price ratios and with
# a wider total_s2 (adds the cluster average s2 and the inverse return weight).
cywa_hl = 'long'
yhz = 780 * 5

# log(pff price / own price) for the long half-life and for mkt.
log_ratio_exprs = {
    'feature_log_long_dpff': f'pff_cywa_{cywa_hl}_xh / cywa_{cywa_hl}_xh',
    'feature_log_mkt_dpff': f'pff_cywa_mkt_xh / cywa_mkt_xh ',
}
for feature_name, ratio_expr in log_ratio_exprs.items():
    df_cy[feature_name] = df_cy.eval(ratio_expr).apply(np.log)

# Single combined feature tried earlier:
#df_cy['feature_log_pff'] = df_cy.eval(f'pff_cywa_mkt_xh / cywa_mkt_xh ').apply(np.log) - df_cy.eval(f'pff_cywa_{cywa_hl}_xh / cywa_{cywa_hl}_xh').apply(np.log)

total_s2_expr = f'cywa_mkt_s2h + cywa_{cywa_hl}_s2h + cluster_{cywa_hl}_avg_s2 + (1.0 / wgt_cywa_mkt_xh_ret_{yhz}_n)'
df_cy['total_s2'] = df_cy.eval(total_s2_expr)
df_cy['wgt'] = df_cy.eval('1.0 / total_s2')

# Every column containing 'feature_log_' is used as a regressor.
xcols = [name for name in df_cy if 'feature_log_' in name]
ycols = [f'cywa_mkt_xh_ret_{yhz}_n']
#xcols = ['feature_cluster']
res = lm_fit(df_cy, xcols, ycols, ['wgt'], quantile = 1e-4)
res

Unnamed: 0,cywa_mkt_xh_ret_3900_n
r2,0.017508
feature_log_long_dpff,-0.623403
feature_log_mkt_dpff,0.629421
model,Linefit_False_q=0.0001


In [85]:
# Weight times fitted value, using the coefficients printed by the log-ratio fit.
coef_long = -0.623403
coef_mkt = 0.629421
fitted = df_cy['feature_log_long_dpff'] * coef_long + df_cy['feature_log_mkt_dpff'] * coef_mkt
z = df_cy['wgt'] * fitted

In [87]:
# Unweighted mean of the fitted value, using the coefficients printed by the
# log-ratio fit.
coef_long = -0.623403
coef_mkt = 0.629421
fitted = df_cy['feature_log_long_dpff'] * coef_long + df_cy['feature_log_mkt_dpff'] * coef_mkt
np.mean(fitted)

-0.000711130151431774

In [89]:
# Product of two recorded cell outputs: the mean of df_cy['wgt'] and the
# unweighted mean fitted value.
45776.0703125 * -0.000711130151431774

-32.55274381327966

In [88]:
# Average regression weight over all rows of df_cy.
df_cy['wgt'].mean()

45776.0703125

In [42]:
# Hand-entered sample: one feature pair (x1) and two coefficient pairs (w, w1)
# used by the dot-product checks in this notebook.
x1 = np.array([-0.1208621, -0.1208463])
w = np.array([-0.332835, 0.338029])
w1 = np.array([-0.445879, 0.447191])

In [None]:
# FIXME(review): looks like an unfinished column lookup — the key is an empty
# string; confirm which column was intended.
df_cy['']

In [30]:

# Dot product of the sample feature pair x1 with the coefficient pair w.
np.sum(x1 * w)

-0.0006224168892000059

In [43]:
# Dot product of the sample feature pair x1 with the coefficient pair w1.
np.sum(x1 * w1)

-0.00015150545739999605

In [39]:
# Weighted mean of the first target column, with df_cy['wgt'] as the weights.
np.sum(df_cy[ycols[0]] * df_cy['wgt'])/np.sum(df_cy['wgt'] )

-0.00029226646

In [45]:
# The recorded value of np.sum(x1 * w), scaled by 25.
# NOTE(review): the meaning of the factor 25 is not stated — confirm.
-0.00062241688920 * 25

-0.015560422229999999

In [66]:
# Synthetic 100000 x 2 standard-normal design matrix for the regression sanity
# check. A seeded generator makes the draw, and every number derived from it,
# reproducible across kernel restarts.
rng = np.random.default_rng(0)
x = rng.standard_normal((100000, 2))

In [67]:
# Shape check on the synthetic matrix; expected (100000, 2).
x.shape

(100000, 2)

In [68]:
# Noise-free linear target with known coefficients 0.3 and 0.4.
y = 0.3 *x[:,0] + 0.4*x[:,1]

In [69]:
# Demeaned copy of the target.
ym = y - np.mean(y)

In [70]:
from sklearn import linear_model

In [78]:
# Linear regression without an intercept term.
lr = linear_model.LinearRegression(fit_intercept=False)

In [79]:
# Fit on the synthetic data, then show the model's score on the same data.
lr.fit(x,y)
lr.score(x,y)

1.0

In [80]:
# Fitted coefficients; the target was built with 0.3 and 0.4.
lr.coef_

array([0.3, 0.4])

In [81]:
# Sample mean of the raw target.
np.mean(y)

-0.00020943723786793364

In [82]:
# Sample mean of the demeaned target; expected to be zero up to rounding.
np.mean(ym)

7.815970093361102e-19