In [127]:
import pandas as pd
import numpy as np
import datetime
from typing import Dict
from dateutil import rrule
import pickle
from typing import List

In [128]:
def formCrosswalkDf(cm_panel_df: pd.DataFrame, ca_panel_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the asset-name crosswalk linking the CoinMetrics and CoinAPI panels.

    Asset names from both sources are lower-cased and joined on exact equality,
    after two manual overrides (CoinAPI 'IOTA' -> 'miota', CoinMetrics
    'one_harmony' -> 'one'). A hand-maintained list of assets is then removed.

    Args:
        cm_panel_df: CoinMetrics panel; only its 'asset' column is read.
        ca_panel_df: CoinAPI panel; only its 'asset_id' column is read.

    Returns:
        A pandas DataFrame with columns 'asset_cm' and 'asset_coinapi' holding one
        row per matched asset.
    """
    # Assets removed by hand because they are derivatives or unpurchasable
    excluded_assets = ['paxg', 'elon', 'gnt', 'go',  'msol', 'pax']

    # Unique asset names per source, sorted for a deterministic row order
    cm_names = cm_panel_df[['asset']].drop_duplicates().rename(columns={'asset': 'asset_cm'})
    ca_names = ca_panel_df[['asset_id']].drop_duplicates().rename(columns={'asset_id': 'asset_coinapi'})
    assert cm_names['asset_cm'].is_unique & ca_names['asset_coinapi'].is_unique
    cm_names = cm_names.sort_values(by='asset_cm', ignore_index=True)
    ca_names = ca_names.sort_values(by='asset_coinapi', ignore_index=True)

    # Lower-cased join keys, with the manual overrides applied on top
    cm_names['asset_lower_cm'] = (
        cm_names['asset_cm'].str.lower().mask(cm_names['asset_cm'] == 'one_harmony', 'one'))
    ca_names['asset_lower_coinapi'] = (
        ca_names['asset_coinapi'].str.lower().mask(ca_names['asset_coinapi'] == 'IOTA', 'miota'))
    assert cm_names['asset_lower_cm'].is_unique & ca_names['asset_lower_coinapi'].is_unique

    # Keep only names present in both sources
    matched = cm_names.merge(
        ca_names,
        left_on='asset_lower_cm',
        right_on='asset_lower_coinapi',
        how='inner',
        validate='one_to_one',
    )

    # Drop the excluded assets and return just the two name columns
    is_kept = ~matched['asset_cm'].isin(excluded_assets)
    return matched.loc[is_kept, ['asset_cm', 'asset_coinapi']]

In [129]:
def mergePanels(cw_df: pd.DataFrame, cm_panel_df: pd.DataFrame, ca_panel_df: pd.DataFrame) -> pd.DataFrame:
    """
    Join the CoinMetrics and CoinAPI panels on (date, asset) using the crosswalk.

    Args:
    - cw_df (pd.DataFrame): crosswalk with columns 'asset_cm' and 'asset_coinapi'
    - cm_panel_df (pd.DataFrame): CoinMetrics panel keyed by 'date' and 'asset'
    - ca_panel_df (pd.DataFrame): CoinAPI panel keyed by 'date' and 'asset_id'

    Returns:
    - pd.DataFrame: rows present in both panels, with 'date', 'asset_cm' and
      'asset_coinapi' first followed by every other column, sorted by date then asset_cm.
    """
    id_cols = ['date', 'asset_cm', 'asset_coinapi']

    # Both panels must store their dates as the same element type to be joinable
    assert type(ca_panel_df.date.values[0]) == type(cm_panel_df.date.values[0])

    # Attach the CoinMetrics name to each CoinAPI row (drops CoinAPI assets without a match)
    ca_named_df = ca_panel_df.merge(
        cw_df, left_on='asset_id', right_on='asset_coinapi', how='inner', validate='many_to_one')

    # Join the two panels and discard the now-redundant source-specific name columns
    merged_df = cm_panel_df.merge(
        ca_named_df,
        left_on=['date', 'asset'],
        right_on=['date', 'asset_cm'],
        how='inner',
        validate='one_to_one',
    ).drop(columns=['asset', 'asset_id'])

    # Identifier columns first, then the data columns in their existing order
    value_cols = [col for col in merged_df.columns.values if col not in id_cols]
    merged_df = merged_df[id_cols + value_cols]

    return merged_df.sort_values(by=['date', 'asset_cm'], ignore_index=True)

In [130]:
def _fillAssetDateGaps(asset_df: pd.DataFrame, asset: str, max_gap_days: int = 31) -> pd.DataFrame:
    """ Add rows for days missing inside short gaps of one asset and fill their values.

    Args:
        asset_df (pd.DataFrame): rows of a single asset with columns 'date', 'asset_cm',
                                 'usd_per_token', 'usd_mcap', 'usd_volume', 'trades_count'.
        asset (str): the asset name written into the 'asset_cm' column of added rows.
        max_gap_days (int): only gaps strictly shorter than this many days are filled.

    Returns:
        asset_df (pd.DataFrame): date-sorted rows where price and mcap are forward filled
                                 and missing volume and trade counts are set to zero.
    """
    # gaps are measured between consecutive rows, so order by date first
    asset_df = asset_df.sort_values(by='date', kind='stable')
    dates = asset_df.date.values
    gap_days = np.diff(dates).astype('timedelta64[D]').astype(int)

    # every day strictly between the two ends of a gap longer than one day
    gap_starts = np.flatnonzero((gap_days > 1) & (gap_days < max_gap_days))
    new_days = [dates[i] + np.timedelta64(j, 'D')
                for i in gap_starts
                for j in range(1, gap_days[i])]

    # only concat when there is something to add; an empty frame would carry an
    # object-dtype date column into the concat
    if new_days:
        filler_df = pd.DataFrame({'date': np.array(new_days, dtype=dates.dtype), 'asset_cm': asset})
        asset_df = pd.concat((asset_df, filler_df))
    asset_df = asset_df.sort_values(by='date', kind='stable', ignore_index=True)

    # forward fill the price and mcap columns; replace volume and trades with zeros
    level_cols = ['usd_per_token', 'usd_mcap']
    flow_cols = ['usd_volume', 'trades_count']
    asset_df[level_cols] = asset_df[level_cols].ffill()
    asset_df[flow_cols] = asset_df[flow_cols].fillna(0)

    return asset_df


def cleanPanel(panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Clean the panel data, including forming the price, volume, trade count, and mcap columns.

    The input frame is not modified. Rows without any usable market cap, rows of a few
    manually excluded assets, and rows outside 2015-01-01 to 2023-02-02 are dropped.
    Negative values and values above 2e12 are treated as missing. Within each asset,
    date gaps of 2 to 30 days are filled with one row per missing day.

    Args:
        panel_df (pd.DataFrame): raw merged panel with 'date', 'asset_cm', 'asset_coinapi',
                                 the '*_cm' / '*_coinapi' price, volume and trades columns,
                                 and the CoinMetrics columns 'CapMrktEstUSD', 'CapMrktCurUSD',
                                 'CapRealUSD' and 'SplyCur'.

    Returns:
        panel_df (pd.DataFrame): clean panel with columns 'date', 'asset', 'usd_per_token',
                                 'usd_mcap', 'usd_volume', 'trades_count', sorted by date
                                 then asset.

    Raises:
        AssertionError: if CoinAPI price, volume or trades are missing, or if a price
                        cannot be formed for some row.
    """
    id_cols = ['date', 'asset_cm', 'asset_coinapi']
    data_cols = ['usd_per_token', 'usd_mcap', 'usd_volume', 'trades_count']

    # Work on a copy so the caller's frame is never modified
    panel_df = panel_df.copy()

    # CLEAN PRICE COLUMN

    cm_price = panel_df.usd_per_token_cm
    ca_price = panel_df.usd_per_token_coinapi

    # Confirm no rows where we have no price data
    assert 0 == (cm_price.isnull() & ca_price.isnull()).sum()

    # Confirm usd_per_token_coinapi is always non missing
    assert 0 == ca_price.isnull().sum()

    # Volume weights for the average price; a zero volume gets weight one so the
    # weights can never sum to zero. The volume columns themselves are left untouched.
    cm_weight = panel_df.usd_volume_cm.where(panel_df.usd_volume_cm != 0, 1)
    ca_weight = panel_df.usd_volume_coinapi.where(panel_df.usd_volume_coinapi != 0, 1)

    # Form price column: volume weighted average where both prices exist, else CoinAPI
    weighted_price = (cm_price*cm_weight + ca_price*ca_weight) / (cm_weight + ca_weight)
    panel_df['usd_per_token'] = weighted_price.where(cm_price.notnull(), ca_price)
    assert 0 == panel_df.usd_per_token.isnull().sum()

    # Fix known issues with prices by using the CoinMetrics price alone
    panel_df.loc[(panel_df.asset_cm == 'luna') & cm_price.notnull(), 'usd_per_token'] = cm_price
    in_2022 = panel_df.date.dt.year == 2022
    for asset, month in (('poly', 10), ('alpha', 12), ('ont', 12)):
        affected = in_2022 & (panel_df.date.dt.month == month) & (panel_df.asset_cm == asset)
        panel_df.loc[affected, 'usd_per_token'] = cm_price
    assert 0 == panel_df.usd_per_token.isnull().sum()

    # CLEAN VOLUME COLUMN

    # Confirm coinapi isn't missing volumes
    assert 0 == panel_df.usd_volume_coinapi.isnull().sum()
    assert 0 == panel_df.trades_coinapi.isnull().sum()

    # Form volume column: mean of both sources where CoinMetrics exists, else CoinAPI
    mean_volume = (panel_df.usd_volume_cm + panel_df.usd_volume_coinapi)/2
    panel_df['usd_volume'] = mean_volume.where(panel_df.usd_volume_cm.notnull(),
                                               panel_df.usd_volume_coinapi)
    assert 0 == panel_df.usd_volume.isnull().sum()

    # Form trades column the same way
    mean_trades = (panel_df.trades_cm + panel_df.trades_coinapi)/2
    panel_df['trades_count'] = mean_trades.where(panel_df.trades_cm.notnull(),
                                                 panel_df.trades_coinapi)
    assert 0 == panel_df.trades_count.isnull().sum()

    # CLEAN MCAP COLUMN

    # Take the first available of: estimated mcap, current mcap, realized mcap,
    # and finally price times current supply (missing when the supply is missing)
    panel_df['usd_mcap'] = (panel_df.CapMrktEstUSD
                            .fillna(panel_df.CapMrktCurUSD)
                            .fillna(panel_df.CapRealUSD)
                            .fillna(panel_df.usd_per_token * panel_df.SplyCur)
                            .astype('float64'))

    # Manually fix specific assets; a missing mcap removes their rows just below
    panel_df.loc[panel_df.asset_cm.isin(['xym', 'nft', 'gari', 'tfuel']), 'usd_mcap'] = np.nan

    # FINAL CLEANING

    # Drop rows where mcap is missing and keep final columns
    panel_df = panel_df.loc[panel_df.usd_mcap.notnull(), id_cols + data_cols]

    # drop rows outside the study period or with a missing identifier
    in_period = (panel_df.date.dt.year >= 2015) & (panel_df.date <= '2023-02-02')
    panel_df = panel_df[in_period].dropna(how='any', subset=id_cols).copy()

    # set negative values to missing and too large values to missing
    values_df = panel_df[data_cols]
    panel_df[data_cols] = values_df.mask((values_df < 0) | (values_df > 2e12))

    # Within asset, fill short date gaps; collect the pieces and concat once
    asset_dfs = [_fillAssetDateGaps(asset_df, asset)
                 for asset, asset_df in panel_df.groupby('asset_cm', sort=True)]
    df = pd.concat(asset_dfs) if asset_dfs else panel_df

    # Keep only a single asset column
    df = df.drop(columns='asset_coinapi')
    df = df.rename(columns={'asset_cm': 'asset'})

    # drop duplicated rows across id columns
    df = df.drop_duplicates(subset=['date', 'asset'])

    # sort values and reset index
    df = df.sort_values(by=['date', 'asset'], ignore_index=True)

    return df

In [131]:
def buildAssetUniverse(
    panel_df: pd.DataFrame, weekly_portfolio_size: int, volume_prct_share_threshold: float
    ) -> Dict[str, list]:
    """ build the monthly universe of assets to include in the study.

    For each month start from 2016-10-01 to 2022-12-01, assets are screened on the
    three calendar months before that date. An asset is kept when it has at least
    28*3 rows in the window, its median daily volume is at least 250000 (btc and eth
    are exempt from the volume screens), and its mean mcap meets a year-dependent
    minimum. From 2018 on, the portfolio is additionally spread across the assets that
    pass the volume minimum in proportion to mean mcap, and assets whose allocation
    divided by their median volume is not below volume_prct_share_threshold are removed.

    A report of new and included assets is printed for every month.

    Args:
        panel_df (pd.DataFrame): panel with columns 'date', 'asset', 'usd_mcap', 'usd_volume'.
        weekly_portfolio_size (int): total dollar amount of weekly portfolio trading size.
        volume_prct_share_threshold (float): upper bound on an asset's allocated dollars
                                             as a share of its median volume.

    Returns:
        asset_universe (Dict[str, list]): keys of start of each month in study period with associated value
                                            of sorted list of asset names to include.
    """
    # first day of every month covered, as 'YYYY-MM-DD' strings
    dates = list(pd.date_range(start='2016-07-01', end='2022-12-01', freq='MS').strftime('%Y-%m-%d'))

    # minimum median daily dollar volume and minimum number of rows in the window;
    # 28*3 days ensures at least 12 weeks of data
    volume_threshold = 250000
    min_num_obs = 28*3

    asset_universe_dict = {}
    assets_seen = set()
    for i in range(3, len(dates)):
        # the window is the three months before the month we form the universe for
        start_window = dates[i-3]
        end_window   = dates[i]
        temp_df = panel_df[(panel_df.date >= start_window) & (panel_df.date < end_window)]

        # per-asset row counts, mean mcap, and median volume within the window
        by_asset = temp_df.groupby('asset')
        num_obs = by_asset.size()
        mean_mcap = by_asset['usd_mcap'].mean()
        med_vol = by_asset['usd_volume'].median()

        # minimum mean mcap depends on the year of the month being formed
        current_year = int(end_window[:4])
        if current_year <= 2016:
            mcap_threshold = 0.5e6
        elif current_year == 2017:
            mcap_threshold = 1e6
        elif current_year <= 2020:
            mcap_threshold = 10e6
        else:
            mcap_threshold = 25e6

        # assets passing the volume minimum; from 2018 also cap our share of volume.
        # Skipped when nothing passes, since there is no mcap to spread the portfolio over.
        liquid_vol = med_vol[med_vol >= volume_threshold]
        if current_year >= 2018 and not liquid_vol.empty:
            liquid_mcap = mean_mcap[mean_mcap.index.isin(liquid_vol.index)]
            dollar_per_mcap_invested = weekly_portfolio_size / liquid_mcap.sum()
            weekly_volume = liquid_mcap*dollar_per_mcap_invested
            assert(np.isclose(weekly_portfolio_size, weekly_volume.sum()))
            volume_share = weekly_volume / liquid_vol
            volume_share = volume_share[volume_share < volume_prct_share_threshold]
            liquid_vol = liquid_vol[liquid_vol.index.isin(volume_share.index)]
            print(f"Maximum volume share percentage in panel for this month is: {np.round(volume_share.max(), 2)}")

        # combine the screens, always letting btc and eth through the volume screens
        volume_asset_universe = set(liquid_vol.index) | {'btc', 'eth'}
        assets_with_suff_data = set(num_obs[num_obs >= min_num_obs].index)
        assets_lost_given_mcap_threshold = set(mean_mcap[mean_mcap < mcap_threshold].index)
        assets_included = sorted(
            (assets_with_suff_data & volume_asset_universe) - assets_lost_given_mcap_threshold)

        # Report out assets that were in no earlier month
        print('New assets that we have never had are ')
        print(np.unique(list(set(assets_included) - assets_seen)))
        print('\n')

        # Report out assets for this month
        print(f'This month\'s ({end_window}) {len(assets_included)} assets are:')
        print(np.unique(assets_included))
        print('\n\n')

        # Record this month's assets
        assets_seen.update(assets_included)
        asset_universe_dict[end_window] = assets_included

    return asset_universe_dict


In [132]:
def finalClean(panel_df: pd.DataFrame, asset_universe: List[str]) -> pd.DataFrame:
    """ restrict the panel to the study window and asset universe, then validate it.

    Args:
        panel_df (pd.DataFrame): panel with 'date' and 'asset' columns plus data columns.
        asset_universe (List[str]): asset names to keep.

    Return: the kept rows sorted by date then asset with a fresh index.

    Raises:
        AssertionError: if any value in the kept rows is missing.
    """
    # rows dated 2016-07-01 through 2023-01-02, both ends included
    in_window = panel_df.date.between('2016-07-01', '2023-01-02')

    # rows belonging to an asset in the universe
    in_universe = panel_df.asset.isin(asset_universe)

    kept_df = panel_df[in_window & in_universe]

    # confirm no missing
    assert not kept_df.isnull().values.any()

    return kept_df.sort_values(by=['date', 'asset'], ignore_index=True)

def determineUniqueAssets(asset_universe_dict) -> list:
    """ pool the asset lists of every month and return the distinct names, sorted. """
    pooled = [asset for month_assets in asset_universe_dict.values() for asset in month_assets]
    return sorted(np.unique(np.array(pooled)))

In [133]:
# Script entry point: build the crosswalk, merged panel and asset universe from the
# two raw panels and write the three derived artifacts to disk.
if __name__ == "__main__":
    # Set args: input/output file paths (relative to the working directory)
    CM_PANEL_FP = '../data/raw/coinmetrics_panel_initial.pkl'
    CA_PANEL_FP = '../data/raw/coinapi_panel.pkl'
    PANEL_OUT_FP = '../data/derived/basic_panel.pkl'
    CW_OUT_FP = '../data/derived/cm_to_coinapi_cw.pkl'
    ASSET_OUT_FP = '../data/clean/asset_universe_dict.pickle'
    # Screening parameters handed to buildAssetUniverse
    WEEKLY_PORTFOLIO_SIZE_USD = 5e6
    VOLUME_PRCT_SHARE_THRESHOLD = 0.04

    # Import panels
    # NOTE(review): unpickling executes arbitrary code; only load files from a trusted source
    cm_panel_df = pd.read_pickle(CM_PANEL_FP)
    ca_panel_df = pd.read_pickle(CA_PANEL_FP)

    # Form crosswalk
    cw_df = formCrosswalkDf(cm_panel_df, ca_panel_df)

    # Merge panels
    panel_df = mergePanels(cw_df, cm_panel_df, ca_panel_df)

    # Clean the data
    panel_df = cleanPanel(panel_df)

    # Build asset universe and save
    asset_universe_dict = buildAssetUniverse(
        panel_df, WEEKLY_PORTFOLIO_SIZE_USD, VOLUME_PRCT_SHARE_THRESHOLD)
    with open(ASSET_OUT_FP, 'wb') as f:
        pickle.dump(asset_universe_dict, f)
    # Flatten the per-month universe into one list of every asset that ever appears
    asset_universe = determineUniqueAssets(asset_universe_dict)

    # Final clean and save
    panel_df = finalClean(panel_df, asset_universe)
    panel_df.to_pickle(PANEL_OUT_FP)
    # Save the crosswalk restricted to assets that appear in the universe
    cw_df = cw_df[cw_df.asset_cm.isin(asset_universe)].reset_index(drop=True)
    cw_df.to_pickle(CW_OUT_FP)


New assets that we have never had are 
['btc' 'eth']


This month's (2016-10-01) 2 assets are:
['btc' 'eth']



New assets that we have never had are 
[set()]


This month's (2016-11-01) 2 assets are:
['btc' 'eth']



New assets that we have never had are 
[set()]


This month's (2016-12-01) 2 assets are:
['btc' 'eth']



New assets that we have never had are 
[set()]


This month's (2017-01-01) 2 assets are:
['btc' 'eth']



New assets that we have never had are 
[set()]


This month's (2017-02-01) 2 assets are:
['btc' 'eth']



New assets that we have never had are 
[set()]


This month's (2017-03-01) 2 assets are:
['btc' 'eth']



New assets that we have never had are 
[set()]


This month's (2017-04-01) 2 assets are:
['btc' 'eth']



New assets that we have never had are 
[set()]


This month's (2017-05-01) 2 assets are:
['btc' 'eth']



New assets that we have never had are 
[{'xrp', 'ltc'}]


This month's (2017-06-01) 4 assets are:
['btc' 'eth' 'ltc' 'xrp']



New assets that we 