In [None]:
import ray

# Start (or attach to) Ray for this kernel. ignore_reinit_error=True lets this
# cell be re-run safely: without it, a second ray.init() in the same kernel
# raises instead of being a no-op.
ray.init(ignore_reinit_error=True)

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
pd.set_option('float_format', '{:f}'.format)
import pandas_bokeh
import market_cluster as mc
import polygon_ds as pds
import polygon_s3 as ps3
import flow_backfill as fbf
import ray_symbol_details as rsd

import matplotlib as mpl
from matplotlib import style
style.use('fivethirtyeight')
mpl.rcParams['figure.figsize'] = [20, 7]

Install CUDA and cudamat (for python) to enable GPU speedups.


In [None]:
# Disabled: the call is commented out, so running this cell does nothing.
# rsd.symbol_details_ray(symbols=['SPY', 'QQQ'])

In [None]:
# Disabled: the call is commented out, so running this cell does nothing.
# NOTE(review): `remove_symbol` presumably deletes the stored 'market' / 'daily'
# data — confirm what it removes before uncommenting.
# ps3.remove_symbol(symbol='market', tick_type='daily')

In [None]:
# Disabled: the call is commented out, so running this cell does nothing.
# When enabled it invokes the backfill for symbol 'market', tick_type 'daily',
# with start_date '2016-01-01'.
# fbf.run_backfill(symbols=['market'], tick_type='daily', start_date='2016-01-01')

In [2]:
# Run parameters: date bounds passed to the workflow, plus the symbol / tick-type
# selectors used by the storage and backfill helpers.
start_date, end_date = '2018-10-24', '2020-11-24'
symbol, tick_type = 'market', 'daily'

In [3]:
# Run the project's market-cluster workflow with the configured start/end dates.
# The numbers in the recorded output are printed during this call. Later cells
# read `df.symbol` and `df.close`, so the result carries at least those columns.
df = mc.market_cluster_workflow(start_date, end_date)

4502494
-0.32428916062964214
-0.40957078454740864
-0.4373749304274475


In [None]:
# Save a local snapshot of `df` as a Feather file. reset_index() moves the
# current index into regular columns first — presumably because to_feather
# requires a default index (see pandas docs; confirm).
df.reset_index().to_feather('market_daily.feather')

In [None]:
# Hand the same flattened frame (index moved into columns) to the project helper
# ps3.put_df_to_s3, tagged tick_type='market_daily' and other='returns'.
# NOTE(review): presumably this uploads to S3, judging by the helper's name — confirm
# the destination and whether it overwrites existing data.
ps3.put_df_to_s3(df.reset_index(), tick_type='market_daily', other='returns')

In [11]:
# Drop the existing index (drop=True discards it rather than keeping it as a
# column) so the fit receives a frame with a fresh default index.
X = df.reset_index(drop=True)
# Fit via the project helper with n_hidden=200; it returns two objects.
# NOTE(review): the name `corex4000` does not match n_hidden=200 and looks like a
# leftover from an earlier run — confirm before renaming, other cells may use it.
corex4000, full_df = mc.corex_fit(X, n_hidden=200)

Linear CorEx with 200 latent factors
2200 iterations to tol: 0.000010, TC=1237.081039
2758 iterations to tol: 0.000010, TC=1983.804016
1535 iterations to tol: 0.000010, TC=2337.475128
1631 iterations to tol: 0.000010, TC=2488.994537
316 iterations to tol: 0.000010, TC=2546.276489
269 iterations to tol: 0.000010, TC=2567.807068
163 iterations to tol: 0.000010, TC=2580.033051


In [124]:
import corex_linearcorex as lc

In [126]:
# Helper that picks the number of hidden factors / clusters to use (per its docstring).
# NOTE(review): the recorded output for this cell is the function's help text, not a
# result, so the cell was probably last executed in its `?` introspection form —
# re-run it to capture the actual output.
lc.pick_n_hidden(X, verbose=True)

Signature: lc.pick_n_hidden(data, repeat=1, verbose=False, **kwargs)
Docstring: A helper function to pick the number of hidden factors / clusters to use.
File:      ~/QuantClarity/pandas-polygon/corex_linearcorex.py
Type:      function


In [80]:
# First 20 rows after ordering by `tcs`, then `symbol`, both descending.
full_df.sort_values(by=['tcs', 'symbol'], ascending=False).head(20)

Unnamed: 0,symbol,cluster,tcs
4493,VGT,0,948.407593
4224,TMFC,0,948.407593
3530,QQQ,0,948.407593
3188,ONEQ,0,948.407593
2359,IYW,0,948.407593
2199,IGM,0,948.407593
2181,IETC,0,948.407593
1736,FTEC,0,948.407593
1643,FLGE,0,948.407593
1550,FBGX,0,948.407593


In [None]:
import corex_gaussianize as g

# Build the Gaussianize transformer with the 'lambert' strategy; it is fitted in a
# later cell via norm.fit(...).
norm = g.Gaussianize(strategy='lambert')

In [None]:
# Keep only the rows whose `symbol` column equals 'GLD'.
gld = df.loc[df['symbol'] == 'GLD']

In [None]:
# Fit the transformer on the `close` column of the GLD rows (row mask and column
# picked in a single .loc selection).
norm.fit(df.loc[df['symbol'] == 'GLD', 'close'])

In [None]:
import h2o
from h2o.estimators import H2OGeneralizedLowRankEstimator

# Initialise the H2O client for this kernel.
h2o.init()

# Load the public USArrests sample dataset into an H2O frame.
arrestsH2O = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")

# Split into train / valid frames (ratio 0.8 for the first part, fixed seed).
# NOTE(review): `valid` is not passed to train() in this cell.
train, valid = arrestsH2O.split_frame(ratios=[0.8], seed=1234)

# GLRM hyper-parameters, gathered in one place and unpacked into the estimator.
glrm_params = dict(
    k=4,
    loss="Huber",
    # loss_by_col={'cat_var': 'Categorical'},
    # regularization options: None (default), Quadratic, L2, L1, NonNegative,
    # OneSparse, UnitOneSparse, Simplex.
    regularization_x="OneSparse",
    regularization_y="OneSparse",
    gamma_x=0.5,
    gamma_y=0.5,
    max_iterations=1000,
    recover_svd=True,
    init="PlusPlus",
    transform="standardize",
)

# Build the estimator and train it on the training frame only.
glrm_model = H2OGeneralizedLowRankEstimator(**glrm_params)
glrm_model.train(training_frame=train)

In [None]:
### market clustering
# Every intermediate result in this cell is read from a parquet file under data/.
# The commented-out line above each read appears to be the computation that
# originally produced it — TODO confirm the cached files match that code.
# Only normalize_market_df is actually executed here (timed with %time).

# %time df = read_market_daily(result_path)
mdf = pd.read_parquet('data/mdf.parquet')

# Returns two objects from the cached market frame; `sharpe_ratios` is only
# referenced by the commented-out join_symbol_data call further down.
%time npdf, sharpe_ratios = mc.normalize_market_df(mdf)

# %time npdf_resid = colwise_linreg(npdf)
npdf_resid = pd.read_parquet('data/npdf_resid.parquet')

# %time par_cor_mat = npdf_resid.corr()
resid_par_cor_mat = pd.read_parquet('data/partial_cor_mat.parquet')

details_df = pd.read_parquet('data/details_df.parquet')

# NOTE: "lables" is misspelled consistently in the variable and the file name;
# both must change together if it is ever corrected.
# cluster_lables = cluster_sim_matrix(similarity=resid_par_cor_mat.abs())
cluster_lables = pd.read_parquet('data/cluster_lables.parquet')

# symbol_meta = join_symbol_data(details_df, cluster_lables, sharpe_ratios, mdf)
symbol_meta = pd.read_parquet('data/symbol_meta.parquet')

# Disabled cohesion computation (kept for reference).
# cluster_coheasion = mc.get_cluster_coheasion(sim_df=resid_par_cor_mat.abs(), symbol_meta=symbol_meta, cluster_col='cluster_n200')

In [None]:
def _sector_counts(frame):
    """Return {sector: number of rows} for one cluster's rows, without the '' sector."""
    counts = dict(frame['sector'].value_counts())
    counts.pop('', None)  # symbols without sector/industry info are not counted
    return counts


# Rank symbols inside each cluster by average daily dollar volume; ties are
# broken by order of appearance (method='first').
symbol_meta['liquidy_rank'] = (
    symbol_meta
    .groupby('cluster_n200')['daily_avg_dollar_volume']
    .rank(method='first')
)

# Order rows by cluster, then by rank within the cluster; reset_index() keeps the
# previous index as a column.
symbol_meta = symbol_meta.sort_values(['cluster_n200', 'liquidy_rank']).reset_index()

# One sector-count dict per cluster, in groupby iteration order.
by_cluster = symbol_meta.groupby('cluster_n200')
result = [_sector_counts(frame) for _, frame in by_cluster]

In [None]:
# Sector composition per cluster: one row per entry of `result` (built in groupby
# iteration order), one column per sector, values = symbol counts.
# NOTE(review): row labels are positional (0..k-1); they equal the cluster ids only
# if the ids are exactly 0..k-1 with no gaps — confirm before reading row n as cluster n.
cluster_sec = pd.DataFrame(result)

# 'Finance' is left out of the breakdown. errors='ignore' keeps the cell runnable
# when no cluster contains a 'Finance' symbol (the column would then not exist).
cluster_sec = cluster_sec.drop(columns=['Finance'], errors='ignore')

# Convert counts to row-wise percentages of the remaining sectors. Vectorised
# equivalent of applying 100 * x / x.sum() to every row.
row_totals = cluster_sec.sum(axis=1)
cluster_sec_pct = (100 * cluster_sec).div(row_totals, axis=0)

# Only the last expression of a cell is rendered automatically, so intermediate
# results are shown explicitly with display() (available without import in IPython).
display(cluster_sec_pct.style.background_gradient(cmap='coolwarm', axis=0))

# Cluster to inspect; edit the value (or use the increment) and re-run to step through.
# n += 1
n = 61
print(n)

# Filter once and reuse for both the sector tally and the full listing.
sym_clust = symbol_meta[symbol_meta.cluster_n200 == n]
display(sym_clust.sector.value_counts())

sym_clust