In [18]:
import pandas as pd
import numpy as np

In [19]:
def cleanCrosswalk(cw_df: pd.DataFrame) -> pd.DataFrame:
    """ Tidy the CMC <-> CM crosswalk.

    Checks that the CMC asset ids are unique, keeps only the two id columns
    (CM first, CMC second), orders the rows by the CM asset name and gives the
    result a fresh 0..n-1 index. Only `asset_cmc` is checked for uniqueness;
    `asset_cm` is not.

    Args:
        cw_df: A pandas DataFrame holding at least the columns
            'asset_cm' and 'asset_cmc'.

    Returns:
        A new pandas DataFrame with columns ['asset_cm', 'asset_cmc'],
        sorted by 'asset_cm'.

    Raises:
        AssertionError: if 'asset_cmc' contains repeated values.
    """
    # A repeated cmc id would make the crosswalk ambiguous
    assert not cw_df['asset_cmc'].duplicated().any(), "asset_cmc must be unique"

    # Keep just the id columns, in a fixed order
    id_columns = ['asset_cm', 'asset_cmc']
    id_pairs = cw_df.loc[:, id_columns]

    # Order rows by the cm name, then renumber the index
    return id_pairs.sort_values(by='asset_cm').reset_index(drop=True)


In [20]:
def cleanPanel(df: pd.DataFrame, cw_df: pd.DataFrame) -> pd.DataFrame:
    """ Clean the CMC panel.

    Steps, in order:
      1. Drop the 'tvl_ratio' and 'platform' columns.
      2. Round 'date' up to the hour and keep 2016-07-01 .. 2023-01-02.
      3. Check every 'asset_cmc' is present in the crosswalk and drop
         repeated (date, asset_cmc) rows.
      4. Blank out price / volume / mcap values above their threshold and
         forward fill them within asset.
      5. Fill rank, number of market pairs and the three supply columns.
      6. Turn 'tags' into a 0/1 'vc_cmc' flag (1 if any tag contains
         'portfolio') and drop 'tags'.
      7. Add the missing hourly rows per asset and forward fill them
         (at most 31*24 consecutive rows).
      8. Order the columns, suffix every column except 'date' with '_cmc',
         and sort by date then asset.

    The input frame is not modified; a new frame is returned.

    Note: integer columns (rank, market pairs, vc flag) come back as float
    whenever step 7 had to insert rows, because the inserted rows are NaN
    before they are filled.

    Args:
        df (pd.DataFrame): raw panel data.
        cw_df (pd.DataFrame): cleaned cmc to cm crosswalk.

    Returns:
        (pd.DataFrame): cleaned panel data.

    Raises:
        AssertionError: if any of the sanity checks on the data fails
            (missing dates/assets, unknown assets, out-of-range values,
            values still missing after filling, duplicated rows).
    """
    # Offset object instead of the 'H' string alias, which newer pandas deprecates
    hourly = pd.offsets.Hour()

    # drop useless col (drop returns a new frame, so the caller's df is untouched)
    df = df.drop(['tvl_ratio', 'platform'], axis=1)

    # date col
    assert 0 == df.date.isnull().sum()
    assert pd.api.types.is_datetime64_any_dtype(df['date']), "date is not the correct type."
    df['date'] = df.date.dt.ceil(hourly)

    # Cut down to relevant dates
    df = df[(df['date'] >= '2016-07-01') & (df['date'] <= '2023-01-02')]

    # asset col
    assert 0 == df.asset_cmc.isnull().sum()
    assert df.asset_cmc.isin(cw_df.asset_cmc.values).all()

    # drop duplicated datetime and asset; copy so the writes below hit an independent frame
    df = df.drop_duplicates(subset=['date', 'asset_cmc']).copy()

    # For price, volume, and mcap columns, ensure in range, ffill if not, no missing, and convert type down
    thresholds = {'usd_per_token': 1e9, 'usd_volume_24h': 1e12, 'usd_mcap': 1e13}
    for col, thresh in thresholds.items():
        # mask() upcasts safely where a .loc assignment of NaN could not
        df[col] = df[col].mask(df[col] > thresh)
        df[col] = df.groupby('asset_cmc')[col].ffill()
        assert 0 == df[col].isnull().sum()
        assert 0 == df[(df[col] < 0) | (df[col] > thresh)].shape[0]
        df[col] = df[col].astype('float32')

    # rank column: ffill within asset, then give still-missing ranks twice the worst rank of that date
    df['rank_cmc'] = df.groupby('asset_cmc')['rank_cmc'].ffill()
    df['rank_cmc'] = df.groupby('date')['rank_cmc'].transform(lambda x: x.fillna(2 * x.max()))
    df['rank_cmc'] = df.rank_cmc.astype(int)
    assert np.min(df.rank_cmc) >= 1
    assert np.max(df.rank_cmc) < 1e4
    assert 0 == df.rank_cmc.isnull().sum()

    # num market pairs
    pairs_col = 'num_market_pairs_cmc'
    df[pairs_col] = df.groupby('asset_cmc')[pairs_col].ffill()  # ffill within asset
    # first still-missing row of each asset; head(1) also copes with "nothing missing"
    first_missing_idx = df[df[pairs_col].isna()].groupby('asset_cmc').head(1).index
    df.loc[first_missing_idx, pairs_col] = 1  # fill first index of each asset with 1 if missing
    df[pairs_col] = df.groupby('date')[pairs_col].transform(lambda x: x.fillna(x.min()))  # fill missing values with min value by date
    df[pairs_col] = df.groupby('asset_cmc')[pairs_col].ffill()  # ffill within asset
    df[pairs_col] = df[pairs_col].astype(int)
    assert np.min(df[pairs_col].values) >= 1
    assert np.max(df[pairs_col].values) <= 1e6
    assert 0 == df[pairs_col].isnull().sum()

    # supply columns
    df.loc[df.max_supply.isnull(), 'max_supply'] = 10*df.total_supply
    supply_cols = ['circulating_supply', 'total_supply', 'max_supply']
    is_egld = df.asset_cmc == 'multiversx-egld'
    egld_supply = df.loc[is_egld, 'usd_mcap'] / df.loc[is_egld, 'usd_per_token']
    for col in supply_cols:
        df.loc[is_egld, col] = egld_supply  # fix this asset: supply implied by mcap / price
        df[col] = df.groupby('asset_cmc')[col].ffill()  # ffill within asset
    df.loc[df.circulating_supply.isnull(), 'circulating_supply'] = df.usd_mcap / df.usd_per_token
    df.loc[df.total_supply.isnull(), 'total_supply'] = 10*df.circulating_supply
    df.loc[df.max_supply.isnull(), 'max_supply'] = 10*df.total_supply
    for col in supply_cols:
        assert 0 == df[col].isnull().sum()
        assert 1e18 > np.max(df[col])
        assert 0 <= np.min(df[col])
        df[col] = df[col].astype('float32')

    # clean missing values in tags column
    def replace_nan_with_empty_list(value):
        if isinstance(value, float) and np.isnan(value):
            return []
        else:
            return value
    df['tags'] = df['tags'].apply(replace_nan_with_empty_list)

    # create vc column: 1 when any tag mentions a portfolio
    df['vc_cmc'] = df['tags'].apply(lambda x: 1 if any('portfolio' in tag for tag in x) else 0)

    # drop tags
    df = df.drop('tags', axis=1)

    # Loop over all assets to add any missing datetimes and fill columns.
    # Frames are collected in a list and concatenated once at the end.
    filled_frames = []
    for asset, asset_df in df.groupby('asset_cmc', sort=False, observed=True):
        asset_df = asset_df.set_index('date')

        # complete hourly index between the asset's first and last observation
        full_date_range = pd.date_range(start=asset_df.index.min(),
                                        end=asset_df.index.max(),
                                        freq=hourly)
        asset_df = asset_df.reindex(full_date_range)

        # Fill asset_cmc for newly added rows (direct assignment; a chained
        # inplace fillna is silently lost under pandas Copy-on-Write)
        asset_df['asset_cmc'] = asset

        # Forward fill gaps of up to 31 days
        asset_df = asset_df.ffill(limit=31*24)

        # Bring the datetime index back as the 'date' column
        asset_df = asset_df.rename_axis('date').reset_index()

        filled_frames.append(asset_df)

    if filled_frames:
        df = pd.concat(filled_frames, ignore_index=True)
    else:
        df = df.reset_index(drop=True)

    # Confirm no missings in the df
    assert df.isnull().sum().sum() == 0

    # Set column order: key columns first, everything else alphabetically
    lead_cols = ['date', 'asset_cmc', 'usd_per_token', 'usd_mcap', 'usd_volume_24h']
    other_cols = sorted(col for col in df.columns if col not in lead_cols)
    df = df[lead_cols + other_cols]

    # Ensure all columns have cmc in name
    df.columns = [col if col == 'date' or col.endswith('_cmc') else col + '_cmc' for col in df.columns]

    # ensure no duplicates by date and asset
    assert not df.duplicated(subset=['date', 'asset_cmc']).any()

    # Sort by date then asset and reset index
    df = df.sort_values(by=['date', 'asset_cmc']).reset_index(drop=True)

    return df


In [21]:
def cleanMacro(global_df: pd.DataFrame, ex_df: pd.DataFrame) -> pd.DataFrame:
    """ Build the hourly CMC macro panel from global metrics and exchange data.

    Steps, in order:
      1. Parse 'date' in both inputs as UTC, strip the timezone and round up
         to the hour.
      2. Drop repeated dates in the global frame and repeated
         (date, ex_slug) rows in the exchange frame.
      3. Drop 'altcoin_usd_mcap', range-check the global totals and blank
         out-of-range 'altcoin_usd_volume_24h' values.
      4. Flag exchanges whose 'ex_slug' is in a fixed DEX list and sum
         volume and market pairs per date separately for CEX and DEX.
      5. Outer-merge the three frames on date, keep 2016-07-01 .. 2023-01-02,
         add any missing hours and forward fill. Count-like columns that are
         missing in the very first row are seeded with 1 before filling.
      6. Order the columns, cast all but 'date' to float32 and suffix them
         with '_cmc'.

    Neither input frame is modified.

    Args:
        global_df (pd.DataFrame): raw global metrics with a 'date' column.
        ex_df (pd.DataFrame): raw exchange panel with 'date', 'ex_slug',
            'ex_usd_volume_24h' and 'ex_num_market_pairs' columns.

    Returns:
        (pd.DataFrame): one row per hour, sorted by 'date'.

    Raises:
        AssertionError: if a date is missing, a value is out of range, the
            exchange frame has missing values, or values are still missing
            after forward filling.
    """
    # Offset object instead of the 'H' string alias, which newer pandas deprecates
    hourly = pd.offsets.Hour()

    # Work on copies so the caller's frames keep their original 'date' values
    global_df = global_df.copy()
    ex_df = ex_df.copy()

    # date col in each
    for frame in (global_df, ex_df):
        frame['date'] = pd.to_datetime(frame['date'], utc=True).dt.tz_localize(None)
        assert 0 == frame.date.isnull().sum()
        frame['date'] = frame.date.dt.ceil(hourly)

    # drop duplicates; the exchange frame has one row per exchange and date, so
    # dedup on both keys (deduping on date alone would keep a single exchange per hour)
    global_df = global_df.drop_duplicates(subset='date')
    ex_df = ex_df.drop_duplicates(subset=['date', 'ex_slug']).copy()

    # drop cols
    global_df = global_df.drop('altcoin_usd_mcap', axis=1)

    # fix ranges of global df values
    assert 0 == global_df[(global_df.total_usd_mcap < 0) | (global_df.total_usd_mcap > 1e13)].shape[0]
    assert 0 == global_df[(global_df.total_usd_volume_24h < 0) | (global_df.total_usd_volume_24h > 1e15)].shape[0]
    alt_volume = global_df['altcoin_usd_volume_24h']
    global_df['altcoin_usd_volume_24h'] = alt_volume.mask((alt_volume < 0) | (alt_volume > 1e15))

    # confirm exchange variables ranges
    ex_df['ex_usd_volume_24h'] = ex_df.ex_usd_volume_24h.astype('float32')
    ex_df['ex_num_market_pairs'] = ex_df.ex_num_market_pairs.astype('float32')
    assert 0 == ex_df.isnull().sum().sum()
    assert 0 == ex_df[(ex_df.ex_usd_volume_24h < 0) | (ex_df.ex_usd_volume_24h > 1e15)].shape[0]
    assert 0 == ex_df[(ex_df.ex_num_market_pairs < 0) | (ex_df.ex_num_market_pairs > 1e7)].shape[0]

    # identify dex and cex
    dexs = ['bancor-network',
            'compound',
            'curve-finance',
            'dydx',
            'pancakeswap-v2',
            'sushiswap',
            'uniswap-v2']
    ex_df['dex'] = ex_df.ex_slug.isin(dexs).astype(int)

    # form dex and cex dfs: per-date totals for each exchange type
    value_cols = ['ex_usd_volume_24h', 'ex_num_market_pairs']
    cex_df = (ex_df[ex_df.dex == 0].groupby('date')[value_cols].sum()
              .rename(columns={'ex_usd_volume_24h': 'ex_usd_volume_24h_cex',
                               'ex_num_market_pairs': 'ex_num_pairs_cex'})
              .reset_index())
    dex_df = (ex_df[ex_df.dex == 1].groupby('date')[value_cols].sum()
              .rename(columns={'ex_usd_volume_24h': 'ex_usd_volume_24h_dex',
                               'ex_num_market_pairs': 'ex_num_pairs_dex'})
              .reset_index())

    # merge
    macro_df = global_df.merge(cex_df, on='date', how='outer', validate='one_to_one')
    macro_df = macro_df.merge(dex_df, on='date', how='outer', validate='one_to_one')

    # Cut down to relevant dates
    macro_df = macro_df[(macro_df['date'] >= '2016-07-01') & (macro_df['date'] <= '2023-01-02')]

    # Ensure all dates are present
    macro_df = macro_df.set_index('date')
    full_date_range = pd.date_range(start=macro_df.index.min(),
                                    end=macro_df.index.max(),
                                    freq=hourly)
    macro_df = macro_df.reindex(full_date_range).rename_axis('date').reset_index()

    # Fill the first value of columns with 1 if missing, so the forward fill has a seed
    first_row = macro_df.index[0]
    for col in ['active_cryptos', 'active_exchanges',
                'active_market_pairs', 'ex_usd_volume_24h_cex',
                'ex_num_pairs_cex', 'ex_usd_volume_24h_dex', 'ex_num_pairs_dex']:
        if pd.isnull(macro_df.loc[first_row, col]):
            macro_df.loc[first_row, col] = 1

    # Forward fill
    macro_df = macro_df.ffill()

    # Order rows by date with a fresh index
    macro_df = macro_df.sort_values(by='date', ignore_index=True)

    # Confirm no missings in the df
    assert macro_df.isnull().sum().sum() == 0

    # Set column order and convert all non-date columns to float32
    cols = sorted(col for col in macro_df.columns if col != 'date')
    macro_df = macro_df[['date'] + cols].astype({col: 'float32' for col in cols})

    # Ensure all columns have cmc in name
    macro_df.columns = [col if col == 'date' or col.endswith('_cmc') else col + '_cmc' for col in macro_df.columns]

    # ensure no duplicates by date
    assert not macro_df.duplicated(subset=['date']).any()

    # Sort by date and reset index
    macro_df = macro_df.sort_values(by=['date']).reset_index(drop=True)

    return macro_df

In [None]:
if __name__ == "__main__":
    # Raw inputs
    PANEL_IN_FP = "../data/raw/cmc_panel.pkl"
    CW_IN_FP = '../data/raw/cmc_coinmetrics_cw.pkl'
    MACRO_IN_FP = '../data/raw/cmc_macro.pkl'
    EX_IN_FP = '../data/raw/cmc_exchange_panel.pkl'

    # Derived outputs
    PANEL_OUT_FP = '../data/derived/cmc_panel.pkl'
    CW_OUT_FP = '../data/derived/cmc_cm_cw.pkl'
    MACRO_OUT_FP = '../data/derived/cmc_macro.pkl'

    # Load all four raw pickles before any cleaning starts
    df, cw_df, global_df, ex_df = (
        pd.read_pickle(in_fp)
        for in_fp in (PANEL_IN_FP, CW_IN_FP, MACRO_IN_FP, EX_IN_FP)
    )

    # Clean: the crosswalk goes first because the panel is validated against it
    cw_df = cleanCrosswalk(cw_df)
    df = cleanPanel(df, cw_df)
    macro_df = cleanMacro(global_df, ex_df)

    # Write the cleaned frames
    pd.to_pickle(df, PANEL_OUT_FP)
    pd.to_pickle(cw_df, CW_OUT_FP)
    pd.to_pickle(macro_df, MACRO_OUT_FP)