In [1]:
import pandas as pd
import numpy as np

In [2]:
def cleanCrosswalk(cw_df: pd.DataFrame) -> pd.DataFrame:
    """Validate and tidy the Santiment <-> CoinMetrics asset crosswalk.

    The result keeps only the two identifier columns (``asset_cm`` first,
    then ``asset_san``); any other column in the input is dropped. Rows are
    ordered by ``asset_cm`` and the index is rebuilt as 0..n-1.

    Args:
        cw_df: Crosswalk with at least the columns ``asset_san`` and
            ``asset_cm``.

    Returns:
        A new DataFrame with the cleaned crosswalk; the input is not modified.

    Raises:
        AssertionError: If ``asset_san`` or ``asset_cm`` contains duplicates.
    """
    # Each identifier has to appear at most once for the mapping to be usable.
    assert cw_df['asset_san'].is_unique, "asset_san must be unique"
    assert cw_df['asset_cm'].is_unique, "asset_cm must be unique"

    id_columns = ['asset_cm', 'asset_san']
    return (
        cw_df.loc[:, id_columns]
        .sort_values(by='asset_cm')
        .reset_index(drop=True)
    )

In [3]:
def _missing_hourly_timestamps(dates: np.ndarray, max_gap_hours: int) -> np.ndarray:
    """Return the hourly timestamps missing inside the gaps of ``dates``.

    ``dates`` must be sorted ascending. A gap is expanded only when it is
    longer than one hour and shorter than ``max_gap_hours``; other gaps
    contribute nothing.
    """
    if len(dates) < 2:
        return dates[:0]

    # Whole hours between consecutive observations (fractions are truncated).
    gap_hours = np.diff(dates).astype('timedelta64[h]').astype(int)
    fillable = (gap_hours > 1) & (gap_hours < max_gap_hours)

    # For a gap of n hours starting at `start`, add start+1h ... start+(n-1)h.
    pieces = [
        start + np.arange(1, n_hours) * np.timedelta64(1, 'h')
        for start, n_hours in zip(dates[:-1][fillable], gap_hours[fillable])
    ]
    return np.concatenate(pieces) if pieces else dates[:0]


def _fill_asset_gaps(asset_df: pd.DataFrame, asset, max_gap_hours: int) -> pd.DataFrame:
    """Add missing hourly rows for one asset and fill its missing values.

    ``asset_df`` holds the rows of a single asset sorted by ``date``. Values
    are forward filled (at most ``max_gap_hours`` consecutive rows), and any
    column that has data but starts missing gets 0 up to its first observation.
    """
    new_dates = _missing_hourly_timestamps(asset_df['date'].values, max_gap_hours)
    if len(new_dates):
        # Only concatenate when there is something to add, so the date column
        # never gets mixed with an empty, differently typed frame.
        new_rows = pd.DataFrame({'date': new_dates, 'asset_san': asset})
        asset_df = pd.concat((asset_df, new_rows))

    asset_df = (
        asset_df.sort_values(by='date', ignore_index=True)
        .drop_duplicates(subset=['date'])
        .reset_index(drop=True)
    )
    asset_df = asset_df.ffill(limit=max_gap_hours)

    # Columns with at least one value whose first row is still missing.
    starts_missing = asset_df.columns[asset_df.notna().any() & asset_df.iloc[0].isna()]
    for col in starts_missing:
        first_valid = asset_df[col].first_valid_index()
        # .loc slicing is label based and inclusive; the index is 0..n-1 here.
        asset_df.loc[:first_valid - 1, col] = 0

    return asset_df


def cleanPanel(df: pd.DataFrame, cw_df: pd.DataFrame) -> pd.DataFrame:
    """Clean the hourly Santiment asset panel.

    Steps, in order:
      1. Build a timezone-naive ``date`` column from ``datetime`` (parsed as
         UTC) and drop ``datetime``.
      2. Keep rows with 2016-07-01 00:00 <= date <= 2023-01-02 00:00.
      3. Override ``category_san`` for a few hard-coded assets.
      4. Check that date, asset and category are never missing and that every
         asset appears in the crosswalk.
      5. Drop duplicate (date, asset_san) rows and cast every other column to
         float32, ordered alphabetically after the three id columns.
      6. Per asset, insert the missing hours of every gap shorter than
         32 days, forward fill, and zero-fill the leading missing values of
         columns that have data.
      7. Fill whatever is still missing with the per-date median across
         assets (this applies to every column that has missing values).
      8. Suffix all columns except ``date`` with ``_san`` and sort by
         date, then asset.

    Args:
        df: Raw panel with ``datetime``, ``asset_san``, ``category_san`` and
            numeric metric columns. It is not modified.
        cw_df: Crosswalk containing a unique ``asset_san`` column.

    Returns:
        The cleaned panel with a fresh 0..n-1 index.

    Raises:
        AssertionError: If any of the data checks described above fails,
            including values that remain missing after the median fill.
    """
    max_gap_hours = 32 * 24
    id_cols = ['date', 'asset_san', 'category_san']

    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    df = df.assign(
        date=pd.to_datetime(df['datetime'], utc=True).dt.tz_localize(None)
    ).drop('datetime', axis=1)
    assert (df['date'].dt.minute == 0).all(), "all timestamps must be on the hour"

    # Cut down to the sample window; copy so later writes hit a real frame.
    in_window = (df['date'] >= '2016-07-01') & (df['date'] <= '2023-01-02')
    df = df.loc[in_window].copy()

    # Clean up category
    category_overrides = {
        'bnb-pancakeswap': 'Decentralized Exchange',
        'xrp': 'Cryptocurrency',
        'a-benqi': 'Lending',
        'creditcoin': 'Lending',
    }
    for asset, category in category_overrides.items():
        df.loc[df['asset_san'] == asset, 'category_san'] = category

    # Confirm no missing dates nor asset ids nor category
    for col in id_cols:
        assert 0 == df[col].isnull().sum(), f"{col} must not contain missing values"

    # Confirm all asset ids are in cw
    assert cw_df['asset_san'].is_unique, "asset_san must be unique in the crosswalk"
    assert df['asset_san'].isin(cw_df['asset_san']).all(), \
        "every panel asset must appear in the crosswalk"

    # Drop any duplicates
    df = df.drop_duplicates(subset=['date', 'asset_san'])

    # Convert all metric columns to float32 and set the column order
    value_cols = sorted(col for col in df.columns if col not in id_cols)
    df = df.astype({col: 'float32' for col in value_cols})
    df = df[id_cols + value_cols]

    # Add missing hours and fill per asset; frames are combined once at the end
    df = df.sort_values(by=['date', 'asset_san'], ignore_index=True)
    asset_frames = [
        _fill_asset_gaps(asset_df, asset, max_gap_hours)
        for asset, asset_df in df.groupby('asset_san', sort=True, observed=True)
    ]
    if asset_frames:
        df = pd.concat(asset_frames, ignore_index=True)

    # Fill remaining missingness with the cross-sectional (per-date) median.
    # Assign the result back: an in-place fillna on df[col] acts on a
    # temporary under pandas copy-on-write and would leave df unchanged.
    cols_with_missing = [col for col in df.columns if df[col].isna().any()]
    for col in cols_with_missing:
        medians = df.groupby('date')[col].transform('median')
        df[col] = df[col].fillna(medians)

    # Ensure no missing values
    assert 0 == df.isnull().sum().sum(), "missing values remain after filling"

    # Ensure all columns have san in name
    df.columns = [col if col == 'date' or col.endswith('_san') else col + '_san' for col in df.columns]

    # Ensure no duplicates by date and asset
    assert not df.duplicated(subset=['date', 'asset_san']).any(), \
        "duplicate (date, asset_san) rows"

    # Sort by date then asset and reset index
    return df.sort_values(by=['date', 'asset_san'], ignore_index=True)

In [4]:
def cleanMacro(macro_df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the Santiment per-asset macro data into one hourly time series.

    Steps, in order:
      1. Drop four unused columns, build a timezone-naive ``date`` from
         ``datetime`` (parsed as UTC) and keep
         2016-07-01 00:00 <= date <= 2023-01-02 00:00.
      2. Drop duplicate (date, asset) rows.
      3. Aggregate across assets per date: sums, medians (forward filled,
         then 0) and the mean of ``mcd_liquidation`` (missing -> 0).
      4. Extract the ethereum-only and bitcoin-only series.
      5. Outer-merge everything on ``date`` and check that the result has one
         row per hour with no missing values.
      6. Sort the columns alphabetically, cast them to float32 and suffix
         every column except ``date`` with ``_san``.

    Args:
        macro_df: Raw macro data with ``datetime``, ``asset`` and the metric
            columns referenced below. It is not modified.

    Returns:
        One row per hour, sorted by ``date``, with a fresh 0..n-1 index.

    Raises:
        AssertionError: If timestamps are not on the hour, ``asset`` has
            missing values, hours are missing, or values remain missing.
        pandas.errors.MergeError: If a component has repeated dates.
    """
    # source column -> output name, for each kind of cross-sectional aggregate
    sum_renames = {
        'aave_v2_total_borrowed_usd': 'total_aave_borrowed',
        'aave_v2_total_deposits_usd': 'total_aave_deposits',
        'aave_v2_total_liquidations_usd': 'total_aave_liq',
        'aave_v2_total_new_debt_usd': 'total_aave_new_debt',
        'aave_v2_total_supplied_usd': 'total_aave_supply',
        'compound_total_borrowed_usd': 'total_compound_borrowed',
        'compound_total_deposits_usd': 'total_compound_deposits',
        'compound_total_liquidations_usd': 'total_compound_liq',
        'compound_total_new_debt_usd': 'total_compound_new_debt',
        'compound_total_supplied_usd': 'total_compound_supply',
        'dai_created': 'total_dai_created',
        'dai_repaid': 'total_dai_repaid',
        'makerdao_total_borrowed_usd': 'total_maker_borrowed',
        'makerdao_total_deposits_usd': 'total_maker_deposits',
        'makerdao_total_supplied_usd': 'total_maker_supply',
        'total_trade_volume_by_dex': 'total_dex_volume',
        'nft_retail_trade_volume_usd': 'total_nft_retail_volume',
        'nft_retail_trades_count': 'total_nft_retail_trades',
        'nft_trade_volume_usd': 'total_nft_volume',
        'nft_trades_count': 'total_nft_trades',
        'nft_whale_trade_volume_usd': 'total_nft_whale_volume',
        'nft_whale_trades_count': 'total_nft_whale_trades',
        'uniswap_total_claims_amount': 'total_uni_claims',
        'usdt_binance_open_interest': 'total_open_interest_usdt_binance',
        'usdt_binance_open_value': 'total_open_value_usdt_binance',
    }
    median_renames = {
        'aave_v2_stable_borrow_apy': 'aave_med_borrow_apy',
        'aave_v2_supply_apy': 'aave_med_supply_apy',
        'aave_v2_variable_borrow_apy': 'aave_med_variable_borrow_apy',
        'mcd_collat_ratio': 'mcd_med_collat_ratio',
        'mvrv_usd_intraday': 'mvrv_med',
        'usdt_binance_funding_rate': 'funding_rate_med_usdt_binance',
    }
    eth_renames = {
        'eth2_roi': 'eth_roi',
        'eth2_stakers_count': 'eth_stakers_count',
        'average_fees_usd': 'eth_avg_fee',
        'fees_usd': 'eth_total_fee',
        'median_fees_usd': 'eth_median_fee',
        'mvrv_usd_intraday': 'eth_mvrv',
    }

    # Drop columns
    macro_df = macro_df.drop(['total_assets_issued', 'mcd_locked_token',
                              'uniswap_total_user_claims_amount',
                              'uniswap_total_lp_claims_amount'], axis=1)

    # Convert date column to datetime format and remove timezone information
    macro_df = macro_df.assign(
        date=pd.to_datetime(macro_df['datetime'], utc=True).dt.tz_localize(None)
    ).drop('datetime', axis=1)
    assert (macro_df['date'].dt.minute == 0).all(), "all timestamps must be on the hour"
    macro_df = macro_df.sort_values(by='date', ignore_index=True)

    # Cut down to the sample window
    in_window = (macro_df['date'] >= '2016-07-01') & (macro_df['date'] <= '2023-01-02')
    macro_df = macro_df[in_window]

    # Drop duplicates
    assert 0 == macro_df['asset'].isnull().sum(), "asset must not contain missing values"
    macro_df = macro_df.drop_duplicates(subset=['date', 'asset'])

    by_date = macro_df.groupby('date')

    # Cross-sectional sums (a date whose values are all missing sums to 0)
    x_sec_sum_df = by_date[list(sum_renames)].sum().reset_index().rename(columns=sum_renames)

    # Cross-sectional medians: carry the last value forward, then 0 at the start
    x_sec_med_df = (
        by_date[list(median_renames)].median().reset_index()
        .ffill()
        .fillna(0)
        .rename(columns=median_renames)
    )

    # Cross-sectional means
    x_sec_avg_df = (
        by_date[['mcd_liquidation']].mean().reset_index()
        .fillna(0)
        .rename(columns={'mcd_liquidation': 'mcd_avg_liq'})
    )

    # Single time series just for eth: staking columns default to 0 when
    # missing, every remaining gap is forward filled
    eth_df = (
        macro_df.loc[macro_df['asset'] == 'ethereum', ['date', *eth_renames]]
        .reset_index(drop=True)
        .rename(columns=eth_renames)
        .fillna({'eth_roi': 0, 'eth_stakers_count': 0})
        .ffill()
    )

    # Single time series just for btc
    btc_df = (
        macro_df.loc[macro_df['asset'] == 'bitcoin', ['date', 'mvrv_usd_intraday']]
        .reset_index(drop=True)
        .rename(columns={'mvrv_usd_intraday': 'btc_mvrv'})
    )

    # Merge the components; one_to_one rejects repeated dates on either side
    merged = x_sec_sum_df
    for part in (x_sec_med_df, x_sec_avg_df, eth_df, btc_df):
        merged = merged.merge(part, on='date', how='outer', validate='one_to_one')

    # Ensure all dates are present: one row per hour between first and last.
    # A Timedelta is used for the frequency because the 'H' alias is deprecated.
    full_date_range = pd.date_range(start=merged['date'].min(), end=merged['date'].max(),
                                    freq=pd.Timedelta(hours=1))
    assert len(full_date_range) == merged.shape[0], "missing hours in the macro series"

    # Confirm no missings in the df
    assert merged.isnull().sum().sum() == 0, "missing values remain in the macro series"

    # Set column order and convert all metric columns to float32
    value_cols = sorted(col for col in merged.columns if col != 'date')
    merged = merged[['date'] + value_cols].astype({col: 'float32' for col in value_cols})

    # Ensure all columns have san in name
    merged.columns = [col if col == 'date' or col.endswith('_san') else col + '_san' for col in merged.columns]

    # Ensure no duplicates by date
    assert not merged.duplicated(subset=['date']).any(), "duplicate dates"

    # Sort by date and reset index
    return merged.sort_values(by=['date'], ignore_index=True)

In [5]:
if __name__ == "__main__":
    # Raw inputs
    CW_IN_FP = '../data/raw/san_coinmetrics_cw.pkl'
    PANEL_IN_FP = "../data/raw/san_panel.pkl"
    MACRO_IN_FP = '../data/raw/san_macro.pkl'

    # Cleaned outputs
    CW_OUT_FP = '../data/derived/san_cm_cw.pkl'
    PANEL_OUT_FP = "../data/derived/san_panel.pkl"
    MACRO_OUT_FP = '../data/derived/san_macro.pkl'

    # Load the raw frames.
    # NOTE: unpickling can execute arbitrary code; only read trusted files.
    cw_df = pd.read_pickle(CW_IN_FP)
    df = pd.read_pickle(PANEL_IN_FP)
    macro_df = pd.read_pickle(MACRO_IN_FP)

    # Attach each asset's category to the panel now, because cleanCrosswalk
    # keeps only the two asset id columns and would discard category_san.
    df = (
        df.rename(columns={'asset': 'asset_san'})
        .merge(
            cw_df[['asset_san', 'category_san']],
            on='asset_san',
            how='left',
            validate='many_to_one',
        )
    )

    # Clean: the crosswalk first, since the panel is checked against it
    cw_df = cleanCrosswalk(cw_df)
    df = cleanPanel(df, cw_df)
    macro_df = cleanMacro(macro_df)

    # Persist the cleaned frames
    cw_df.to_pickle(CW_OUT_FP)
    df.to_pickle(PANEL_OUT_FP)
    macro_df.to_pickle(MACRO_OUT_FP)