In [1]:
import pandas as pd
import ta

In [2]:
# Read the three parquet inputs used by the rest of the notebook, in this order:
# all symbols, the market index, and the sector indices.
df, regime_df, sector_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'all_symbols.parquet',
        'market_index.parquet',
        'sector_index.parquet',
    )
)

In [4]:
# Distinct values of the `sector` column, in order of first appearance.
sectors = df['sector'].unique()

In [9]:
# Sector whose rows are selected from `df` in the following cells.
sector = 'Information Technology'

In [10]:
# Keep only the rows whose `sector` column equals the chosen sector.
df_sec = df.loc[df['sector'].eq(sector)]

In [11]:
# Distinct symbols present in the selected sector.
sec_symbols = df_sec['symbol'].unique()

In [16]:
import itertools as itr

# Every unordered pair of distinct sector symbols, in combinations() order.
sec_pairs = [*itr.combinations(sec_symbols, 2)]

In [28]:
import progressbar
import scipy.stats as stat

# Extract each symbol's Date-indexed adjusted-close series once, up front.
# Every symbol takes part in many pairs, so filtering df_sec inside the loop
# repeats the same full-frame scan for every single pair.
closes_by_symbol = {
    sym: df_sec.loc[df_sec.symbol == sym].set_index('Date').adj_close
    for sym in {member for pair in sec_pairs for member in pair}
}

# One (coefficient, p-value) result per entry of sec_pairs, in the same order.
data = []
for sym1, sym2 in progressbar.progressbar(sec_pairs):
    close1 = closes_by_symbol[sym1]
    close2 = closes_by_symbol[sym2]
    # Correlate only over the dates on which both symbols have a row.
    intersection = close1.index.intersection(close2.index)
    data.append(stat.pearsonr(close1.loc[intersection], close2.loc[intersection]))

100% (10440 of 10440) |##################| Elapsed Time: 0:08:09 Time:  0:08:09


In [31]:
# Label the pair list and the correlation results, then place them side by side.
# Both frames carry the default 0..n-1 index, so concat lines them up row by row.
pairs = pd.DataFrame(sec_pairs, columns=['symbol1', 'symbol2'])
corr = pd.DataFrame(data, columns=['coef', 'p_value'])
pair_corr_df = pd.concat([pairs, corr], axis=1)

In [32]:
# Display the pairs ordered from lowest to highest correlation coefficient.
pair_corr_df.sort_values(by='coef', ascending=True)

Unnamed: 0,symbol1,symbol2,coef,p_value
4993,DBD,FLIR,-0.882701,0.0
2207,BMI,DBD,-0.881864,0.0
5043,DBD,PTC,-0.881826,0.0
4987,DBD,EPAY,-0.868772,0.0
4230,CSCO,DBD,-0.868343,0.0
...,...,...,...,...
1059,AMAT,MKSI,0.969898,0.0
2492,CACI,LDOS,0.970346,0.0
2497,CACI,MANT,0.973002,0.0
2874,CCMP,MPWR,0.974335,0.0


In [3]:
import ta

def prep_regime_filter(regime_df, roc_col_name='regime_roc', mv_col_name='regime_ma',
                       close_name='regime_close',
                       ma_period=200,
                       roc_period=45):
    """Add rate-of-change, moving-average and close columns to an index frame.

    The input frame is left untouched: the work is done on a Date-ordered copy,
    which is returned. This matters when the function is used inside
    ``groupby(...).apply``, where mutating the group that pandas passes in is
    not supported.

    Parameters
    ----------
    regime_df : pandas.DataFrame
        Must provide ``Date`` and ``adj_close`` columns.
    roc_col_name : str
        Name of the column receiving ``ROCIndicator(adj_close, n=roc_period).roc()``.
    mv_col_name : str
        Name of the column receiving the rolling mean of ``adj_close`` over a
        time window of ``ma_period`` days (``min_periods=1``).
    close_name : str
        Name of the column receiving a copy of ``adj_close``.
    ma_period : int
        Length of the rolling window, in days.
    roc_period : int
        Look-back passed to the ROC indicator as ``n``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``regime_df`` sorted by ``Date`` (original index labels kept)
        with the three columns added.
    """
    # The ROC is computed over row order and the time-based rolling window needs
    # a monotonic Date index, so order the rows first. A stable sort keeps rows
    # that share a Date in their original relative order.
    regime_df = regime_df.sort_values('Date', kind='mergesort')
    regime_df.loc[:, roc_col_name] = ta.momentum.ROCIndicator(regime_df.adj_close, n=roc_period).roc()
    # .values drops the Date index so the result is assigned positionally.
    regime_df.loc[:, mv_col_name] = regime_df.set_index(
        'Date'
    ).adj_close.rolling('%dd' % ma_period, min_periods=1).mean().values
    regime_df.loc[:, close_name] = regime_df.adj_close
    return regime_df

In [4]:
# Add the regime_roc / regime_ma / regime_close columns to the market index frame.
regime_df = regime_df.pipe(prep_regime_filter)

In [5]:
# Add sector_roc / sector_ma / sector_close to each sector's rows separately.
# group_keys=False keeps the rows' own index: without it, newer pandas prepends
# `sector` as an index level while it is still a column, which makes the later
# merge on ['Date', 'sector'] ambiguous.
sector_df = sector_df.groupby('sector', group_keys=False).apply(
    lambda x: prep_regime_filter(x, roc_col_name='sector_roc',
                                 mv_col_name='sector_ma',
                                 close_name='sector_close')
)

In [6]:
# Attach the market-wide columns by Date, then the sector columns by (Date, sector).
# Left joins keep every row of df even when no index row matches.
df = (
    df
    .merge(regime_df[['Date', 'regime_close', 'regime_ma']], how='left', on='Date')
    .merge(
        sector_df[['Date', 'sector', 'sector_close', 'sector_ma']],
        how='left',
        on=['Date', 'sector'],
    )
)

In [7]:
import ta
import numpy as np

def mean_atr(df, atr_period=14):
    """Compute the per-row true range and its exponentially weighted mean.

    The true range of a row is
    ``max(high, previous close) - min(low, previous close)``.

    Side effect: adds (or overwrites) a ``last_close`` column on ``df`` holding
    the previous row's ``adj_close``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``adj_close`` plus high/low columns, named either
        ``adj_high``/``adj_low`` or ``high_adj``/``low_adj``.
    atr_period : int
        Span of the exponentially weighted mean.

    Returns
    -------
    tuple
        ``(true_range, smoothed)`` where ``smoothed`` is
        ``true_range.ewm(span=atr_period, adjust=False).mean()``. The first row
        of both is NaN because it has no previous close.
    """
    df.loc[:, 'last_close'] = df.adj_close.shift(1)
    # Prefer the adj_high/adj_low names that generate_ta_features reads; fall
    # back to the high_adj/low_adj spelling this function used originally.
    high = df['adj_high'] if 'adj_high' in df.columns else df['high_adj']
    low = df['adj_low'] if 'adj_low' in df.columns else df['low_adj']
    atr_high = np.maximum(high, df.last_close)
    # The lower bound comes from the low; taking it from the high reduces the
    # range to |high - previous close| and ignores the day's low entirely.
    atr_low = np.minimum(low, df.last_close)
    atr = atr_high - atr_low
    return atr, atr.ewm(span=atr_period, adjust=False).mean()

def mean_close_diff_norm(close, ma):
    """Return the gap between ``close`` and ``ma`` as a fraction of ``ma``.

    Works element-wise on anything that supports subtraction and division
    (plain numbers, pandas Series, numpy arrays).
    """
    gap = close - ma
    return gap / ma

def manual_mfi(df, period):
    """Compute a volume-weighted up/down oscillator in the range 0..100.

    For each row the absolute close-to-close change (relative to the current
    close) is multiplied by ``Volume``. Rows that closed higher than the
    previous row feed the "up" series, rows that closed lower feed the "down"
    series, and every other row contributes 0 to both. Each series is smoothed
    with ``ewm(alpha=1 / period, adjust=False)`` and the result is
    ``100 - 100 / (1 + avg_up / avg_down)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``adj_close``, ``Date`` and ``Volume`` columns. It is not
        modified.
    period : int or float
        Smoothing period; the ewm alpha is ``1.0 / period``.

    Returns
    -------
    numpy.ndarray
        One value per row of ``df``, in row order. Rows where both smoothed
        series are 0 yield NaN; rows where only ``avg_down`` is 0 yield 100.
    """
    mfi_df = df[['adj_close', 'Date', 'Volume']].set_index('Date')
    prev_close = mfi_df.adj_close.shift(1)
    perc_change = (prev_close - mfi_df.adj_close).abs() / mfi_df.adj_close
    flow = perc_change * mfi_df.Volume
    # Build the up/down series as floats from the start. Seeding a column with
    # integer 0 and then writing floats into it through .loc is an upcasting
    # assignment, which recent pandas versions deprecate.
    # Comparisons against the NaN previous close of the first row are False,
    # so that row contributes 0.0 to both series.
    up_flow = flow.where(mfi_df.adj_close > prev_close, 0.0)
    down_flow = flow.where(mfi_df.adj_close < prev_close, 0.0)
    avg_up = up_flow.ewm(alpha=1.0 / period, adjust=False).mean()
    avg_down = down_flow.ewm(alpha=1.0 / period, adjust=False).mean()
    mfi = (100.0 - (100.0 / (1 + (avg_up / avg_down)))).values
    return mfi


def generate_ta_features(sym_df, rsi_period=5, roc_period=45, roc_short_period=4, fut_roc_period=5, mfi_period=5,
                         sto_period=14, atr_period=14, volitility_short_period=2, volitility_medium_period=7, bba_period=20,
                         dch_period=20, ma_period=200):
    """Return a Date-sorted copy of ``sym_df`` with technical-indicator columns added.

    Reads ``Date``, ``adj_close``, ``adj_high``, ``adj_low``, ``Volume``,
    ``regime_close``, ``regime_ma``, ``sector_close`` and ``sector_ma``.

    Columns written, in this order: ``mv_avg``, ``rsi``, ``roc``, ``roc_short``,
    ``fut_roc``, ``mfi``, ``sto``, ``bba``, ``dch``, ``bba_norm``, ``dch_norm``,
    ``ma_diff_norm``, ``regime_ma_diff_norm``, ``sector_ma_diff_norm``,
    ``volatility``, ``volatility_short``, ``volatility_medium``,
    ``directional_strength``.

    Notes
    -----
    * ``fut_roc`` is the ``fut_roc_period`` ROC shifted back by the same number
      of rows, so each row holds the value computed ``fut_roc_period`` rows later.
    * ``sto`` is the stochastic oscillator's signal line (``stoch_signal()``).
    * ``directional_strength`` is the negated ewm mean of the row-to-row
      relative change of ``adj_close``.
    """
    sym_df = sym_df.sort_values('Date')

    # Rolling mean over a time window of `ma_period` days; .values drops the
    # Date index so the result is assigned positionally.
    date_indexed_close = sym_df.set_index('Date').adj_close
    sym_df.loc[:, 'mv_avg'] = date_indexed_close.rolling('%dd' % ma_period, min_periods=1).mean().values

    close = sym_df.adj_close
    high = sym_df.adj_high
    low = sym_df.adj_low

    def _roc(period):
        # Rate of change of the adjusted close over `period` rows.
        return ta.momentum.ROCIndicator(close, n=period).roc()

    def _ewm_mean(series, span):
        # Recursive (adjust=False) exponentially weighted mean.
        return series.ewm(span=span, adjust=False).mean()

    bands = ta.volatility.BollingerBands(close=close, n=bba_period)
    band_width = bands.bollinger_hband() - bands.bollinger_lband()

    channel = ta.volatility.DonchianChannel(close=close, n=dch_period)
    channel_width = channel.donchian_channel_hband() - channel.donchian_channel_lband()

    # Row-to-row relative change of the close, and its magnitude.
    relative_change = close.diff() / close.shift(1)
    volatility_base = relative_change.abs()

    # Insertion order of this dict is the order in which columns are added.
    features = {
        'rsi': ta.momentum.RSIIndicator(close=close, n=rsi_period).rsi(),
        'roc': _roc(roc_period),
        'roc_short': _roc(roc_short_period),
        'fut_roc': _roc(fut_roc_period).shift(-fut_roc_period),
        'mfi': ta.momentum.MFIIndicator(
            high=high, low=low, close=close, volume=sym_df.Volume, n=mfi_period
        ).money_flow_index(),
        'sto': ta.momentum.StochasticOscillator(
            high=high, low=low, close=close, n=sto_period
        ).stoch_signal(),
        'bba': band_width,
        'dch': channel_width,
        'bba_norm': band_width / close,
        'dch_norm': channel_width / close,
        'ma_diff_norm': mean_close_diff_norm(close, sym_df.mv_avg),
        'regime_ma_diff_norm': mean_close_diff_norm(sym_df.regime_close, sym_df.regime_ma),
        'sector_ma_diff_norm': mean_close_diff_norm(sym_df.sector_close, sym_df.sector_ma),
        'volatility': _ewm_mean(volatility_base, atr_period),
        'volatility_short': _ewm_mean(volatility_base, volitility_short_period),
        'volatility_medium': _ewm_mean(volatility_base, volitility_medium_period),
        'directional_strength': -_ewm_mean(relative_change, atr_period),
    }
    for column_name, column_values in features.items():
        sym_df.loc[:, column_name] = column_values
    return sym_df

In [8]:
# Build the indicator columns one symbol at a time.
# group_keys=False keeps the rows' own index. With the default, whether `symbol`
# is also prepended as an index level (duplicating the `symbol` column) depends
# on the pandas version and on whether each group was already sorted by Date.
ta_feats = df.groupby('symbol', group_keys=False).apply(generate_ta_features)

In [9]:
# Persist the per-symbol feature table; the path is relative to the working directory.
ta_feats.to_parquet('ta_data.parquet')