In [1]:
import pandas as pd
import numpy as np
import pickle
import datetime

In [None]:
# TODO IMPORT COINMETRICS AND COINAPI PANELS
# TODO FORM CROSSWALK
# TODO MERGE PANELS

In [None]:
# sets args: relative filepaths to the raw pickled inputs
# (crosswalks "cw" and price/volume/mcap panels for cmc, coingecko, coinmetrics, and coinapi)
cmc_asset_universe_fp = "../data/raw/cmc_asset_universe.pkl"
cmc_cw_fp = "../data/raw/cmc_cw.pkl"
cmc_panel_fp = "../data/raw/cmc_price_volume_mcap_panel.pkl"
cg_cw_fp = "../data/raw/coingecko_cmc_cw.pkl"
cg_panel_fp = "../data/raw/coingecko_price_volume_mcap_panel.pkl"
cm_cw_fp = "../data/raw/coinmetrics_cmc_cw.pkl"
cm_asset_info_fp = '../data/raw/coinmetrics_assets_first_tradable.pkl'
cm_panel_fp = "../data/raw/coinmetrics_initial_panel.pkl"
coinapi_panel_fp = '../data/raw/coinapi_panel.pkl'

# import data
# NOTE(review): unpickling can execute arbitrary code; only load these files from a trusted source.
# the asset universe is read with the pickle module directly; every other input goes through pandas.
# presumably a dict of period -> list of asset names (other code in this file iterates it that way) -- confirm
with open(cmc_asset_universe_fp, 'rb') as f:
    cmc_asset_universe_dict = pickle.load(f)
cmc_cw_df =  pd.read_pickle(cmc_cw_fp)
cmc_panel_df = pd.read_pickle(cmc_panel_fp)
cg_cw_df =  pd.read_pickle(cg_cw_fp)
cg_panel_df = pd.read_pickle(cg_panel_fp)
cm_cw_df = pd.read_pickle(cm_cw_fp)
cm_asset_df = pd.read_pickle(cm_asset_info_fp)
cm_panel_df = pd.read_pickle(cm_panel_fp)
ca_panel_df = pd.read_pickle(coinapi_panel_fp)

In [None]:
def formCoinmetricsAssetUniverse(client: "CoinMetricsClient", cmc_assets_fp: str) -> pd.DataFrame:
    """ map cmc universe to coinmetrics universe. 
    (1) pull all cm assets and open my universe of cmc assets.
    (2) adjust cm asset names so they match to my cmc assets.
    (3) merge asset ids together on both the cm full name and the cm asset id.
    (4) clean the merged data.
    (5) add assets from cm that should be in the universe but aren't in cmc.
    (6) remove stablecoins and derivatives.

    Args:
        client (CoinMetricsClient): cm client object for pinging api; only its
                                    catalog_full_assets() method is used here.
        cmc_assets_fp (str): filepath to cmc asset universe pickle.
    
    Returns:
        merged_df (pd.DataFrame): dataframe of crosswalk between cmc id and cm id with
                                  columns asset_cmc and asset_cm. asset_cmc is missing
                                  for the cm assets that are added by hand.

    Raises:
        AssertionError: if the cmc names, cm full names, or cm asset ids are not unique,
                        or if the combined matches repeat a cm asset id.
    """
    # import cmc token universe
    # NOTE(review): pickle.load can execute arbitrary code; only point this at trusted files.
    with open(cmc_assets_fp, 'rb') as f:
        cmc_asset_universe_dict = pickle.load(f)

    # form unique cmc asset df
    cmc_assets = [asset for assets in cmc_asset_universe_dict.values() for asset in assets]
    cmc_assets = list(np.unique(np.array(cmc_assets)))
    cmc_assets_df = pd.DataFrame(data={'asset_cmc': cmc_assets})

    # pull all cm assets
    full_asset_catalog = client.catalog_full_assets()
    cm_assets_df = pd.DataFrame(full_asset_catalog)

    # check that the identifying columns are unique in both dataframes
    assert cmc_assets_df["asset_cmc"].is_unique, 'cmc asset names are not unique'
    assert cm_assets_df["full_name"].is_unique, 'cm full names are not unique'
    assert cm_assets_df['asset'].is_unique, 'cm asset ids are not unique'

    # remove duplicated cm asset; they have a data error
    # copy so the edits below act on an independent frame and not on a slice of the catalog
    cm_assets_df = cm_assets_df[~cm_assets_df.asset.isin(['seed', 'tree', 'aurora'])].copy()

    # change cm full names before merge so they match cmc for known nonmatches
    cm_full_name_fixes = {
        'aave': 'aave-old',
        'alpha': 'alpha-finance-lab',
        'mco': 'crypto-com',
        'fet': 'fetch',
        'clv': 'clover',
        'gno': 'gnosis-gno',
        'glm': 'golem-network-tokens',
        'hive': 'hive-blockchain',
        'rook': 'keeperdao',
        'yffii': 'yearn-finance-ii',
        'btt': 'bittorrent',
        'idex': 'aurora',
        'egld': 'multiversx-egld',
        'cfx': 'confluxnetwork',
        'xch': 'chia-network',
        'syn': 'synapse2',
    }
    for cm_asset, cmc_style_name in cm_full_name_fixes.items():
        cm_assets_df.loc[cm_assets_df.asset == cm_asset, 'full_name'] = cmc_style_name

    # clean the asset names to just low case letters and numbers and merge
    # regex=True is explicit: pandas >= 2.0 treats the pattern literally by default,
    # which would silently leave hyphens and spaces in place and break the name match
    non_alphanumeric = r"[^a-zA-Z0-9]"
    cmc_assets_df["asset_clean"] = (cmc_assets_df["asset_cmc"].str.lower()
                                    .str.replace(non_alphanumeric, "", regex=True))
    cm_assets_df["asset_clean"] = (cm_assets_df["full_name"].str.lower()
                                   .str.replace(non_alphanumeric, "", regex=True))
    merged_df = pd.merge(cmc_assets_df, cm_assets_df, 
                        on="asset_clean", how='inner',
                        validate='one_to_one')

    # repeat but use the unique asset abbreviation id from cm
    cm_assets_df["asset_clean"] = (cm_assets_df["asset"].str.lower()
                                   .str.replace(non_alphanumeric, "", regex=True))
    merged_df2 = pd.merge(cmc_assets_df, cm_assets_df, 
                        on="asset_clean", how='inner',
                        validate='one_to_one')

    # remove duplicated assets from the two merged dataframes and put them together
    # (the full-name match wins when both merges find the same cm asset)
    merged_df2 = merged_df2[~merged_df2.asset.isin(list(merged_df.asset.values))]
    merged_df = pd.concat((merged_df, merged_df2))
    assert merged_df.asset.is_unique

    # clean up the merged data
    merged_df = merged_df[['asset_cmc', 'asset']]
    merged_df = merged_df.rename(columns={'asset': 'asset_cm'})
    merged_df = merged_df.reset_index(drop=True)

    # manually add to my universe of cm assets these assets to consider
    assets_to_add = ['ape', 'apt', 'arpa', 'badger', 'bal', 'cake', 'cel', 'comp', 'cvx',
        'dot', 'etc', 'fil', 'flr', 'flux', 'ftt', 'fun', 'gmx', 'grin', 'hnt',
        'inv', 'knc', 'krl', 'luna', 'luna2', 'mir', 'multi', 'nft', 'nu', 'ocean', 'ohm',
        'op', 'poly', 'qi', 'rndr', 'rpl', 'skl', 'snt', 'theta', 'tru', 'xdc', 'zrx']
    merged_df = pd.concat((merged_df, pd.DataFrame(data={'asset_cmc': np.repeat(np.nan, len(assets_to_add)),
                                                        'asset_cm': assets_to_add})))

    # manually remove stables and derivatives
    merged_df = merged_df[~merged_df.asset_cm.isin(['steth', 'wbtc', 'tusd', 'gusd', 'usdd', 'btcb'])]

    # manually add in both aave old and new
    merged_df = pd.concat((merged_df, pd.DataFrame(data={'asset_cmc': ['aave'],
                                                        'asset_cm': ['aave']}))).reset_index(drop=True)

    return merged_df


In [None]:
#

# clean cws and panels before merge
# NOTE(review): this cell reassigns the frames loaded earlier (renames columns, converts dates),
# so it likely cannot be re-run without first re-running the data loading cell -- confirm

# drop specific assets from the coinmetrics and coingecko crosswalks and rename the cmc key
# so every crosswalk shares the 'slug_cmc' column with the cmc panel
# NOTE(review): the reason each of these assets is excluded is not recorded here -- confirm
cm_cw_df = cm_cw_df[~cm_cw_df.asset_cm.isin(['xno'])]
cm_cw_df = cm_cw_df[~cm_cw_df.asset_cmc.isin(['aave-old'])]
cm_cw_df = cm_cw_df.rename(columns = {'asset_cmc': 'slug_cmc'})
cg_cw_df = cg_cw_df[~cg_cw_df.asset_cmc.isin(['cronos', 'aave-old', 'yearn-finance-ii'])]
cg_cw_df = cg_cw_df.rename(columns={'asset_cmc': 'slug_cmc'})

# put every panel's date column on datetime.date objects so the merges on 'date' line up;
# the coingecko panel is only checked (first value), not converted
cmc_panel_df['date'] = cmc_panel_df['date'].dt.date
assert type(cg_panel_df.date.values[0]) == datetime.date
cm_panel_df['time'] = pd.to_datetime(cm_panel_df['time']).dt.date

# rename coinmetrics keys to the names used for merging
cm_panel_df = cm_panel_df.rename(columns={'asset': 'asset_cm', 'time': 'date'})
cm_asset_df = cm_asset_df.rename(columns={'asset': 'asset_cm'})

# merge panels together
# step 1: attach the coingecko crosswalk to the cmc panel; a left many-to-one merge
#         must not change the row count, which the assert verifies
panel_df = cmc_panel_df.merge(cg_cw_df,
                              on='slug_cmc',
                              how='left',
                              validate='many_to_one')
assert cmc_panel_df.shape[0]==panel_df.shape[0]
# step 2: outer merge with the coingecko panel so rows present in only one source are kept
panel_df = panel_df.merge(cg_panel_df, on=['date', 'asset_gecko'], how='outer', validate='many_to_one')
# step 3: attach the coinmetrics crosswalk, using only crosswalk rows that have a cmc slug
panel_df = panel_df.merge(cm_cw_df[~cm_cw_df.slug_cmc.isnull()], on='slug_cmc', how='left', validate='many_to_one')
# step 4: outer merge with the coinmetrics panel on date and cm asset id
panel_df = panel_df.merge(cm_panel_df, on=['date', 'asset_cm'], how='outer', validate='many_to_one')

# cut down to window of interest: keep rows dated 2015 or later
panel_df = panel_df[panel_df.date.apply(lambda x: x.year) >=2015]


In [None]:
# TODO CLEAN PRICE COLUMN
# TODO CLEAN VOLUME COLUMNS
# TODO CLEAN MCAP COLUMN; USE CM SUPPLY WHERE DONT HAVE MCAP
# TODO subset down to date, asset, price, mcap, and volume and other useful variables

In [None]:
# CLEAN PRICE COLUMN
# works on the merged panel_df, which carries price columns from coinmetrics
# (ReferenceRateUSD), cmc (usd_per_token_cmc), and coingecko (usd_per_token_cg)

# drop rows where we have no price data from any source
# copy so the new columns below are written to an independent frame and not a slice
price_missing_everywhere = (panel_df.ReferenceRateUSD.isnull()
                            & panel_df.usd_per_token_cmc.isnull()
                            & panel_df.usd_per_token_cg.isnull())
panel_df = panel_df[~price_missing_everywhere].copy()

# form the price column: use the cm reference rate where we have it,
# otherwise the average of whichever of the cmc and cg prices are present
cmc_cg_avg_price = panel_df[['usd_per_token_cmc', 'usd_per_token_cg']].mean(axis=1, skipna=True)
panel_df['usd_per_token'] = panel_df['ReferenceRateUSD'].astype(float).fillna(cmc_cg_avg_price)

# remove rows where the price between cmc and cg is different by more than 50%
# (only applies when there is no cm reference rate and both cmc and cg report a price)
price_rel_gap = np.abs((panel_df.usd_per_token_cmc - panel_df.usd_per_token_cg)
                       / panel_df.usd_per_token_cmc)
price_sources_disagree = (panel_df.ReferenceRateUSD.isnull()
                          & ~panel_df.usd_per_token_cmc.isnull()
                          & ~panel_df.usd_per_token_cg.isnull()
                          & (price_rel_gap > 0.5))
panel_df = panel_df[~price_sources_disagree]

# keep just the final price
panel_df = panel_df.drop(columns=['usd_per_token_cmc', 'usd_per_token_cg',
                                  'PriceUSD', 'ReferenceRate', 'ReferenceRateUSD'])

# CLEAN MCAP COLUMN
mcap_source_cols = ['CapMrktEstUSD', 'usd_mcap_cg', 'usd_mcap_cmc']

# convert dtype before any numeric comparison
# NOTE(review): the cast suggests CapMrktEstUSD may not arrive as float -- confirm source dtype
panel_df['CapMrktEstUSD'] = panel_df.CapMrktEstUSD.astype(float)

# set any zeros to missing
# this has to happen before dropping rows without mcap data; otherwise a row whose only
# mcap value is zero survives the drop and then fails the missing-mcap assert below
for mcap_col in mcap_source_cols:
    panel_df.loc[panel_df[mcap_col] == 0, mcap_col] = np.nan

# drop if there is no mcap data
panel_df = panel_df[~panel_df[mcap_source_cols].isnull().all(axis=1)].copy()

# form the mcap column as the average of the available sources
panel_df['usd_mcap'] = panel_df[mcap_source_cols].mean(axis=1, skipna=True)
assert 0 == panel_df.usd_mcap.isnull().sum()

# drop rows where mcaps between cg and cmc are more than order of magnitude off when we are missing CM values
# NOTE(review): the gap is measured relative to cmc, so it can only exceed 10 when cg is far
# above cmc; a cg value far below cmc is not caught -- confirm whether that is intended
mcap_rel_gap = np.abs((panel_df.usd_mcap_cg - panel_df.usd_mcap_cmc) / panel_df.usd_mcap_cmc)
mcap_sources_disagree = (panel_df.CapMrktEstUSD.isnull()
                         & ~panel_df.usd_mcap_cg.isnull()
                         & ~panel_df.usd_mcap_cmc.isnull()
                         & (mcap_rel_gap > 10))
panel_df = panel_df[~mcap_sources_disagree]

# keep just the final mcap
panel_df = panel_df.drop(columns=['usd_mcap_cmc', 'usd_mcap_cg',
                                  'CapMrktCurUSD', 'CapMrktEstUSD', 'CapMrktFFUSD', 'CapRealUSD'])

In [None]:
# TODO USE THIS FOR CLEANING PANELS

def cleanPanel(panel_df: pd.DataFrame,
               start_year: int = 2015,
               end_date: str = '2023-02-02',
               max_value: float = 2e12) -> pd.DataFrame:
    """ apply basic cleaning to a raw cmc panel keyed on date and cmc_id.
    (1) keep rows inside the study window.
    (2) drop rows missing a key or missing all three cmc data columns.
    (3) set negative and implausibly large values to missing.
    (4) drop duplicated keys, sort, and reset the index.

    Args:
        panel_df (pd.DataFrame): panel with a datetime 'date' column, a 'cmc_id' column, and
                                 the columns usd_per_token_cmc, usd_mcap_cmc, and
                                 usd_volume_24h_cmc; every column other than date and
                                 cmc_id is compared numerically.
        start_year (int): first calendar year to keep.
        end_date (str): last date to keep, inclusive.
        max_value (float): data values above this are set to missing.

    Returns:
        panel_df (pd.DataFrame): cleaned panel, unique on date and cmc_id, sorted by them.
                                 the input frame is not modified.
    """
    # drop rows
    # copy so the value edits below never write into the caller's frame
    in_window = (panel_df.date.dt.year >= start_year) & (panel_df.date <= end_date)
    panel_df = panel_df[in_window].copy()
    panel_df = panel_df.dropna(how='any', subset=['date', 'cmc_id'])
    panel_df = panel_df.dropna(how='all', subset=['usd_per_token_cmc', 'usd_mcap_cmc', 'usd_volume_24h_cmc'])

    # form list of data columns to work with
    data_cols = [col for col in panel_df.columns if col not in ('date', 'cmc_id')]

    # set negative values to missing and too large values to missing
    for col in data_cols:
        out_of_range = (panel_df[col] < 0) | (panel_df[col] > max_value)
        panel_df.loc[out_of_range, col] = np.nan

    # drop duplicated rows across id columns; the first occurrence is kept
    panel_df = panel_df.drop_duplicates(subset=['date', 'cmc_id'])

    # sort values and reset index
    panel_df = panel_df.sort_values(by=['date', 'cmc_id'],
                                    ignore_index=True)

    return panel_df

In [None]:
# TODO look for continuity within asset. look at returns to see if anything crazy. look if mcap jump is way diff than price jump.
# TODO make sure ranges of values looks good
# TODO go scope old cleaning scripts to make sure i do all of that too

In [None]:
# TODO apply the inclusion criteria on the first on each month; use code below for it
# TODO write down what i am doing in the JMP
# TODO go scope five best factor model papers to ensure mine is best in class. add how i am better to my write up

In [None]:
def calcGeomAvg(returns: np.array,
    annualized: bool=False,
    periods_in_year: int=None) -> float: 
    """ Compute the geometric mean of a vector of simple returns, optionally annualized.

    Args:
        returns (np.array): simple returns observed at a single frequency.
        annualized (bool): if True, compound the per-period figure up to a yearly one.
        periods_in_year (int): number of periods of the given frequency in one year;
                               required when annualized is True.

    Returns:
        (float): the geometric average return, per period or per year.

    Raises:
        TypeError: if returns is not a NumPy array.
        ValueError: if annualized is True and periods_in_year is not given.
    """
    # validate the inputs before doing any math
    if not isinstance(returns, np.ndarray):
        raise TypeError("Input 'returns' must be a NumPy array")
    missing_periods = annualized and periods_in_year is None
    if missing_periods:
        raise ValueError("Input 'periods_in_year' must be provided if 'annualized' is True")

    # compound all the returns, then take the n-th root to get the per-period growth
    total_growth = np.prod(1 + returns)
    per_period_avg = total_growth ** (1 / np.size(returns)) - 1
    if not annualized:
        return per_period_avg

    # compound the per-period average over a full year
    return (per_period_avg + 1) ** periods_in_year - 1

def prepPanelForInitialInclusiveCriteria(panel_df: pd.DataFrame, cw_df: pd.DataFrame) -> pd.DataFrame:
    """ ad hoc cleaning of the cmc panel ahead of applying the inclusion criteria.
    
    Args:
        panel_df (pd.DataFrame): cmc panel keyed on date and cmc_id with price, mcap, and volume columns.
        cw_df (pd.DataFrame): asset identifiers; must contain cmc_id and slug_cmc.
        
    Returns:
        panel_df (pd.DataFrame): panel keyed on date and slug_cmc with no missing values,
                                 no duplicated keys, and the hand-picked tokens removed.
    """
    # cmc ids that are excluded by hand before anything else
    excluded_cmc_ids = [770, 776, 3787, 8644, 9103]
    is_kept = ~panel_df.cmc_id.isin(excluded_cmc_ids)

    # swap the cmc id for the cmc slug, keep the core columns, and order the rows
    slug_lookup_df = cw_df[['cmc_id', 'slug_cmc']]
    core_cols = ['date', 'slug_cmc', 'usd_per_token_cmc', 'usd_mcap_cmc', 'usd_volume_24h_cmc']
    panel_df = (panel_df[is_kept]
                .merge(slug_lookup_df, on='cmc_id', how='inner', validate='many_to_one')
                .drop('cmc_id', axis=1)[core_cols]
                .sort_values(by=['date', 'slug_cmc'], ignore_index=True))

    # one-off value fix: missing uquid-coin volume is treated as zero volume
    uquid_missing_volume = (panel_df.slug_cmc == 'uquid-coin') & panel_df.usd_volume_24h_cmc.isnull()
    panel_df.loc[uquid_missing_volume, 'usd_volume_24h_cmc'] = 0

    # nothing may be missing at this point
    n_missing = panel_df.isnull().sum().sum()
    assert(0 == n_missing)

    # the panel must be unique on its keys
    key_dups = panel_df.duplicated(subset=['date', 'slug_cmc'])
    keys_are_unique = ~key_dups.any()
    assert keys_are_unique, ('there are duplicates in the data on keys date and slug_cmc')

    # final manual removals
    # NOTES: ampleforth is a stablecoin, pax gold is a gold stablecoin, index, and wrapped tokens
    slugs_to_drop = ['ampleforth', 'cryptoindex-com-100', 'pax-gold',
                     'wrapped-centrifuge', 'wrapped-luna-token', 'wrapped-ncg', 'wrapped-nxm']
    return panel_df[~panel_df.slug_cmc.isin(slugs_to_drop)]
def _addOneMonth(current_date: datetime.date) -> datetime.date:
    """ move a date forward by one calendar month, clipping the day to the end of the target month. """
    import calendar  # stdlib; imported locally so nothing beyond the notebook's import cell is needed

    year = current_date.year + current_date.month // 12
    month = current_date.month % 12 + 1
    last_day_of_month = calendar.monthrange(year, month)[1]
    return current_date.replace(year=year, month=month, day=min(current_date.day, last_day_of_month))


def buildInitialAssetUniverse(panel_df: pd.DataFrame, start_date: datetime.date, end_date: datetime.date) -> dict:
    """ build an initial universe of assets to pull data for.

    the study period is cut into windows that run from one monthly date to the next, both
    ends included. an asset is in the universe for the date that closes a window if it
    (1) has a row on that closing date,
    (2) has at least 28 rows in the window,
    (3) never has 24h volume below 10,000 usd in the window, and
    (4) never has mcap below the threshold for the closing date's year in the window.
    the assets of every window are also printed, together with the never-before-seen ones.
    
    Args:
        panel_df (pd.DataFrame): panel of asset prices, trading volumes, and mcaps from cmc with
                                 columns date, slug_cmc, usd_volume_24h_cmc, and usd_mcap_cmc.
                                 date is compared against 'YYYY-MM-DD' strings, so it needs a
                                 dtype that supports that comparison.
        start_date (datetime.date): start date for the study period.
        end_date (datetime.date): end date for the study period.
    
    Returns:
        asset_universe (dict): keys of start of each month in study period (as 'YYYY-MM-DD',
                               excluding start_date itself) with associated value of sorted
                               list of asset names to include.
    """
    # specify the dates to obtain: start_date, then one calendar month at a time up to end_date
    dates = [start_date.strftime('%Y-%m-%d')]
    current_date = _addOneMonth(start_date)
    while current_date <= end_date:
        dates.append(current_date.strftime('%Y-%m-%d'))
        current_date = _addOneMonth(current_date)

    # apply suff data, volume, and mcap filters window by window
    asset_universe_dict = {}
    assets_seen_before = set()
    for start_window, end_window in zip(dates[:-1], dates[1:]):
        # build temporary dataframe for this time period
        in_window = (panel_df.date >= start_window) & (panel_df.date <= end_window)
        temp_df = panel_df[in_window]

        # obtain set of tokens to consider: those with data on the closing date
        assets_included = set(temp_df[temp_df.date == end_window].slug_cmc.values)

        # figure out tokens removed due to insuff data
        # note: 28 days ensures at least 4 weeks of data 
        asset_ns = temp_df.groupby('slug_cmc').size()
        assets_included -= set(asset_ns[asset_ns < 28].index)

        # figure out tokens removed due to volume threshold
        min_vol = temp_df.groupby('slug_cmc').usd_volume_24h_cmc.min()
        assets_included -= set(min_vol[min_vol < 10000].index)

        # figure out assets removed due to mcap threshold, which depends on the closing year
        current_year = int(end_window[:4])
        if current_year <= 2016:
            mcap_threshold = 750000
        elif current_year == 2017:
            mcap_threshold = 2e6
        elif current_year == 2018:
            mcap_threshold = 30e6
        elif current_year in [2019, 2020]:
            mcap_threshold = 15e6
        else:
            mcap_threshold = 75e6
        min_mcap = temp_df.groupby('slug_cmc').usd_mcap_cmc.min()
        assets_included -= set(min_mcap[min_mcap < mcap_threshold].index)

        # sorted, de-duplicated asset names for this window
        month_assets = list(np.unique(list(assets_included)))

        # report out new assets ever
        # (a list is passed to np.unique; handing it a set would print the set as one element)
        print('New assets that we have never had are ')
        print(np.unique(list(assets_included - assets_seen_before)))
        print('\n')

        # report out assets for this month
        print(f'This month\'s ({end_window}) {len(month_assets)} assets are:')
        print(np.unique(month_assets))
        print('\n\n')

        # remember what has been seen and store this window under its closing date
        assets_seen_before |= assets_included
        asset_universe_dict[end_window] = month_assets

    return asset_universe_dict

def determineUniqueAssets(asset_universe_dict) -> list:
    """ flatten the per-period asset lists of the universe into one sorted list of unique assets. """
    # gather every asset from every period, repeats included
    pooled_assets = [asset for period_assets in asset_universe_dict.values() for asset in period_assets]

    # de-duplicate with numpy, then hand back a sorted python list
    unique_assets = np.unique(np.array(pooled_assets))
    return sorted(unique_assets)

In [None]:
# TODO manually check that the universe makes sense for a random sample of maybe 6-10 months from the first years and 
# a random sample of 3-6 months from the last 2-3 years?

# TODO output the coinapi to cm crosswalk for this universe as well as a dictionary of the cmc ids at the start of each month

# TODO convert all the code to functions with professional documentation
