In [1]:
from dateutil.relativedelta import relativedelta
from datetime import datetime
import pandas as pd
import numpy as np
import pickle


In [2]:
def _per_asset_quantile(frame: pd.DataFrame, column: str, q: float) -> pd.Series:
    """Return the ``q``-quantile of ``column`` for every asset in ``frame``.

    The result is a Series indexed by asset. An empty ``frame`` yields an
    empty Series instead of raising, so months without data drop nothing.
    """
    if frame.empty:
        return pd.Series(dtype=float)
    # NOTE(review): np.quantile propagates NaN, so an asset with any missing
    # value in the window gets a NaN quantile and is never dropped by the
    # threshold comparisons below -- confirm this is intended.
    return frame.groupby('asset')[column].agg(lambda values: np.quantile(values, q))


def formHourlyAssetUniverse(
    df: pd.DataFrame,
    asset_universe_dict: dict,
    quantile_threshold: float = 0.5,
    volume_threshold: float = 2e5,
    spread_threshold: float = 5e-3,
    start_date: str = "2020-12-01",
    end_date: str = "2022-12-01",
) -> dict:
    """Build a stricter month -> assets mapping by filtering on the prior month.

    For every pair of consecutive month starts between ``start_date`` and
    ``end_date`` the rows of ``df`` dated in the earlier month (and belonging
    to the later month's universe) are grouped by asset. An asset is removed
    from the later month's universe when its ``quantile_threshold``-quantile of
    ``char_volume_t`` is below ``volume_threshold`` or its quantile of
    ``char_spread_bps_t`` is above ``spread_threshold``.

    Args:
        df: Frame with at least the columns ``date``, ``asset``,
            ``char_volume_t`` and ``char_spread_bps_t``. ``date`` is compared
            against ``'%Y-%m-%d'`` strings.
        asset_universe_dict: Maps ``'%Y-%m-%d'`` month-start strings to a
            collection of assets (anything with ``.copy()`` and ``.remove()``,
            e.g. a list). It is NOT modified.
        quantile_threshold: Quantile in [0, 1] used to summarise each asset.
        volume_threshold: Assets whose volume quantile is below this are dropped.
        spread_threshold: Assets whose spread quantile is above this are dropped.
        start_date: First month start (``'%Y-%m-%d'``); only used as the
            look-back window for the following month.
        end_date: Last month (``'%Y-%m-%d'``) to produce a universe for.

    Returns:
        dict keyed by every month start after ``start_date`` up to ``end_date``,
        each value being a filtered copy of the corresponding input universe.

    Raises:
        KeyError: If ``asset_universe_dict`` lacks one of the generated months.
    """
    # NOTE(review): the column name suggests basis points while the default
    # spread threshold (5e-3) looks like a fraction -- confirm the units.

    # month starts to loop over; consecutive pairs give (look-back, target)
    monthly_dates = (
        pd.date_range(start=start_date, end=end_date, freq="MS")
        .strftime("%Y-%m-%d")
        .tolist()
    )

    new_asset_universe_dict = {}
    for previous_month, current_month in zip(monthly_dates[:-1], monthly_dates[1:]):
        current_month_asset_universe = asset_universe_dict[current_month]

        # rows of the look-back month restricted to the target month's universe
        in_window = (
            df.asset.isin(current_month_asset_universe)
            & (df.date >= previous_month)
            & (df.date < current_month)
        )
        temp_df = df.loc[in_window, ['asset', 'char_volume_t', 'char_spread_bps_t']]

        # per-asset summaries, sorted so the report lists the worst offenders first
        volume_by_asset = _per_asset_quantile(
            temp_df, 'char_volume_t', quantile_threshold).sort_values()
        spread_by_asset = _per_asset_quantile(
            temp_df, 'char_spread_bps_t', quantile_threshold).sort_values(ascending=False)

        # determine assets to drop
        assets_below_volume_threshold = volume_by_asset[
            volume_by_asset < volume_threshold].index.tolist()
        assets_above_spread_threshold = spread_by_asset[
            spread_by_asset > spread_threshold].index.tolist()

        # report
        print(current_month)
        print(f"dropping assets below volume threshold: {assets_below_volume_threshold}")
        print(f"dropping assets above spread threshold: {assets_above_spread_threshold}")

        # work on a copy so the caller's universe is left untouched
        strict_universe = current_month_asset_universe.copy()
        for asset in set(assets_below_volume_threshold) | set(assets_above_spread_threshold):
            strict_universe.remove(asset)
        new_asset_universe_dict[current_month] = strict_universe

    return new_asset_universe_dict


In [3]:
if __name__ == "__main__":
    # input and output file locations
    ASSET_IN_FP = '../data/clean/asset_universe_hourly_dict.pickle'
    TRAIN_IN_FP = '../data/clean/panel_train.pkl'
    TEST_IN_FP  = '../data/clean/panel_test.pkl'
    ASSET_OUT_FP = '../data/clean/strict_asset_universe_hourly_dict.pickle'

    # read the pickled asset-universe mapping
    # NOTE: pickle.load executes arbitrary code from the file; only use trusted inputs.
    with open(ASSET_IN_FP, "rb") as asset_in:
        asset_universe_dict = pickle.load(asset_in)

    # read the two pickled frames and stack them row-wise into one frame
    train_df = pd.read_pickle(TRAIN_IN_FP)
    test_df  = pd.read_pickle(TEST_IN_FP)
    df = pd.concat((train_df, test_df))

    # filter every month's universe using the combined frame
    new_asset_universe_dict = formHourlyAssetUniverse(df, asset_universe_dict)

    # write the filtered mapping back out as a pickle
    with open(ASSET_OUT_FP, 'wb') as asset_out:
        pickle.dump(new_asset_universe_dict, asset_out)

2021-01-01
dropping assets below volume threshold: ['kcs']
dropping assets above spread threshold: ['xrp', 'kcs', 'link']
2021-02-01
dropping assets below volume threshold: ['comp', 'kcs']
dropping assets above spread threshold: ['xrp', 'ada', 'kcs', 'link', 'ltc', 'comp']
2021-03-01
dropping assets below volume threshold: ['xlm', 'bch']
dropping assets above spread threshold: ['dnt', 'cvc', 'xlm', 'xrp', 'nu', 'lrc', 'ada', 'zrx', 'band', 'dot', 'link', 'ltc', 'comp', 'bch']
2021-04-01
dropping assets below volume threshold: ['xlm', 'bch', 'uma', 'mkr', 'band', 'lrc', 'comp', 'knc', 'cvc']
dropping assets above spread threshold: ['vet', 'dnt', 'xlm', 'cvc', 'xrp', 'lrc', 'nu', 'mana', 'bat', 'ada', 'zrx', 'grt', 'knc', 'bnt', 'band', 'luna', 'uma', 'link', 'dot', 'ltc', 'comp', 'bch']
2021-05-01
dropping assets below volume threshold: ['bnt', 'uma', 'lrc', 'knc', 'xlm', 'enj', 'matic', 'cvc', 'dnt', 'nu', 'bat']
dropping assets above spread threshold: ['trx', 'vet', 'dnt', 'matic', 'x