In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import polygon_backfill as pb
import pandas_bokeh

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
import os
import s3fs
from io import BytesIO


def get_s3_df(symbol:str, date:str, columns=None) -> pd.DataFrame:
    s3 = s3fs.S3FileSystem(
        key=os.environ['B2_ACCESS_KEY_ID'], 
        secret=os.environ['B2_SECRET_ACCESS_KEY'], 
        client_kwargs={'endpoint_url': os.environ['B2_ENDPOINT_URL']}
    )    
    byte_data = s3.cat(f"polygon-equities/data/trades/symbol={symbol}/date={date}/data.feather")
    df_bytes_io = BytesIO(byte_data)
    return pd.read_feather(df_bytes_io, columns=columns)


def condition_filter(condition_array):
    confirmed_bad = [2, 5, 7, 10, 12, 13, 15, 16, 17, 18, 19, 20, 22, 28, 29, 33, 38, 52, 53]
    if condition_array is not None:
        filter = any(np.isin(condition_array, confirmed_bad))
    else: 
        filter = False
    return filter


def outlier_score(df, window_len=7):
    df['outlier_diff'] = abs(df.price - df.price.rolling(window=window_len, center=True).median())
    df['outlier_zs'] = (df['outlier_diff'] - df['outlier_diff'].mean()) / df['outlier_diff'].std(ddof=0)
    return df


def plot_price(df):
    df.price.plot(figsize=[20,7])


In [4]:
df = get_s3_df(symbol='SPY', date='2020-08-25', columns=['exchange_epoch', 'price', 'size', 'condition'])
# df = pd.read_feather('data.feather', columns=['exchange_epoch', 'price', 'size', 'condition'])
# df['date_time'] = pd.to_datetime(df.exchange_epoch)

In [9]:
bad_ticks = df.condition.apply(condition_filter)

df_f = df[~bad_ticks]

In [42]:
df_o = outlier_score(df)

In [43]:
df_f_o = outlier_score(df_f)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# df['price_diff'] = df.price.diff()
# df['price_diff_zscore'] = (df['price_diff'] - df['price_diff'].mean()) / df['price_diff'].std(ddof=0)

df['cond_str'] = df.condition.astype('str')

# df['price_rmed'] = 
# df['price_rmed_diff'] = abs(df.price - df.price.rolling(window=7, center=True).median())
price_rmed_diff = abs(df.price - df.price.rolling(window=7, center=True).median())
df['price_rmed_diff_zs'] = (price_rmed_diff - price_rmed_diff.mean()) / price_rmed_diff.std(ddof=0)

In [None]:
good = [0, 1, 3, 4, 8, 9, 11, 14, 23, 25, 27, 28, 30, 34, 36]
listed_bad = [2, 5, 7, 10, 13, 15, 16, 17, 18, 19, 20, 21, 22, 29, 33, 37, 52, 53]
confirmed_bad = [2, 7, 10, 12, 13, 15, 16, 17, 20, 22, 38, 52, 53]
neverseen_bad = [5, 18, 19, 21, 29, 33]
listed_blank = [6, 17, 18, 19, 24, 26, 32, 35, 39-51, 54, 55, 56, 59]
after_hours = [12]
odd_lot = [37]
neutral = [41]

In [None]:
df.groupby('cond_str')['price_rmed_diff_zs'].describe().sort_values('mean', ascending=False)

In [None]:
dfe = df.explode('condition').dropna()

dfe.groupby('condition')['price_rmed_diff_zs'].describe().sort_values('mean', ascending=False)

In [None]:
df_w = pd.get_dummies(df.explode(column='condition'), columns=['condition']).reset_index().groupby('index').sum()

In [None]:
bad_ticks = df.condition.apply(filter_conditions)

In [None]:
wide_df = pd.get_dummies(df.condition.explode()).reset_index().groupby('index').sum()
wide_df

In [None]:
df.loc[(df.price_rmed_diff_zs < 40), 'price_rmed_diff_zs'] = 0

In [None]:
full_df = wide_df.join(df['price_rmed_diff_zs'])
full_df = full_df.dropna()

In [None]:
full_df_corr = full_df.corr()

In [None]:
full_df_corr['price_rmed_diff_zs'].sort_values(ascending=False)

In [None]:
df.price_rmed_diff_zs.plot_bokeh(kind='hist', sizing_mode="scale_height", bins=500)

In [None]:
df.price_rmed_diff_zs.describe(percentiles=[.7,.8,.9,.99,.999])

In [None]:
full_df_corr.price_rmed_diff_zs.sort_values(ascending=False)

In [None]:
n = df.sort_values('price_rmed_diff_zs', ascending=False).index[80]
n

In [None]:
df[n-7:n+7]

In [None]:
df2 = df
df2.index = pd.to_datetime(df.exchange_epoch)

In [None]:
df2.price.plot(figsize=[20,5])

In [None]:
df2.price_rmed_diff_zs.plot(figsize=[20,5])

In [None]:
df.price_rmed_diff_zs[140000:160000].plot_bokeh(kind='hist', sizing_mode="scale_height", bins=500)

In [None]:
trades = pb.get_ticks_date('GLD', '2020-08-03', 'trades')
df = pb.trades_to_df(trades)
# df = pd.read_feather('GLD_trades.feather')

In [None]:
from market_cluster import *

symbol='DUST'
start_date='2020-06-01'
end_date='2020-07-04'
result_path="/Users/bobcolner/QuantClarity/data"
date_partition='hive'
tick_type='trades'
formats=['feather', 'parquet']
skip=True
# backfill_data(symbol, start_date, end_date, result_path, date_partition, tick_type, formats, skip)

# .plot_bokeh(kind='hist', bins=50, sizing_mode="scale_height", disable_scientific_axes=True)
# df.style.background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)  # 'Blues', 'coolwarm', 'RdBu_r' & 'BrBG' are other good diverging colormaps
# pd.set_option('precision', 2)

In [None]:
# %time df = read_market_daily(result_path)
%time mdf = pd.read_parquet('data/mdf.parquet')

%time npdf, sharpe_ratios = normalize_market_df(mdf)

# %time npdf_resid = colwise_linreg(npdf)
%time npdf_resid = pd.read_parquet('data/npdf_resid.parquet')

# %time par_cor_mat = npdf_resid.corr()
%time resid_par_cor_mat = pd.read_parquet('data/partial_cor_mat.parquet')

%time details_df = pd.read_parquet('data/details_df.parquet')

# cluster_lables = cluster_sim_matrix(similarity=resid_par_cor_mat.abs())
%time cluster_lables = pd.read_parquet('data/cluster_lables.parquet')

# symbol_meta = join_symbol_data(details_df, cluster_lables, sharpe_ratios, mdf)
%time symbol_meta = pd.read_parquet('data/symbol_meta.parquet')

# cluster_coheasion = get_cluster_coheasion(sim_df=resid_par_cor_mat.abs(), symbol_meta=symbol_meta, cluster_col='cluster_n200')

In [None]:
symbol_meta['liquidy_rank'] = symbol_meta.groupby('cluster_n200')['daily_avg_dollar_volume'].rank(method='first')

symbol_meta = symbol_meta.sort_values(['cluster_n200', 'liquidy_rank']).reset_index()

In [None]:
by_cluster = symbol_meta.groupby('cluster_n200')
result = []
for cluster, frame in by_cluster:
    counts = dict(frame['sector'].value_counts())
    counts.pop('', None)  # drop symbols w/o sector/industry info
    result.append(counts)

cluster_sec = pd.DataFrame(result)    
cluster_sec = cluster_sec.drop(columns=['Finance'])
cluster_sec_pct = cluster_sec.apply(lambda x: 100 * x / float(x.sum()), axis=1)

In [None]:
cluster_sec_pct.style.background_gradient(cmap='coolwarm', axis=0)

In [None]:
# n += 1
n = 61
print(n)
symbol_meta[symbol_meta.cluster_n200==n].sector.value_counts()

In [None]:
sym_clust = symbol_meta[symbol_meta.cluster_n200==n]
sym_clust

In [None]:
import random

random.choices(population=list(sym_clust.symbol), weights=list(sym_clust.liquidy_rank), k=len(sym_clust.symbol))

In [None]:
from scipy.cluster.hierarchy import linkage, is_valid_linkage, fcluster
from scipy.spatial.distance import pdist

## Load dataset
X = np.load("dataset.npy")

## Hierarchical clustering
dists = pdist(X)
Z = linkage(dists, method='centroid', metric='euclidean')

print(is_valid_linkage(Z))

## Now let's say we want the flat cluster assignement with 10 clusters.
#  If cut_tree() was working we would do
from scipy.cluster.hierarchy import cut_tree
cut = cut_tree(Z, 10)

clust = fcluster(Z, k, criterion='maxclust')

In [None]:
# from scipy.cluster.hierarchy import cut_tree
from scipy import cluster
np.random.seed(23)
X = np.random.randn(50, 4)
Z = cluster.hierarchy.ward(X, )
cutree = cluster.hierarchy.cut_tree(Z, n_clusters=[5, 10])
cutree[:10]

In [None]:
def compound_interest(principle:float, rate:float, peroids:int): 
    # Calculates compound interest  
    total_return = principle * (pow((1 + rate / 100), peroids)) 
    print("Total Interest $:", round(total_return, 2))
    print("Anualized Peroid %", round(total_return / principle, 1) * 100)

# compount daily for 1 year (market days)
compound_interest(principle=100000, rate=.5, peroids=250)


In [None]:
import os
import pyarrow.dataset as ds
from pyarrow import fs

s3  = fs.S3FileSystem(
    access_key=os.environ['B2_ACCESS_KEY_ID'], 
    secret_key=os.environ['B2_SECRET_ACCESS_KEY'], 
    endpoint_override=os.environ['B2_ENDPOINT_URL']
)

dataset = ds.dataset(
    source='polygon-equities/data/trades/',
    format='feather',
    filesystem=s3,
    partitioning='hive',
    exclude_invalid_files=True
)

subset = (ds.field('symbol') == 'SPY') & (ds.field('date') == '2020-08-05')

df = dataset.to_table(filter=subset).to_pandas()