In [1]:
from helper_functions import Helper
from typing import Dict
import pandas as pd
import numpy as np
import pickle
import gc


In [2]:
def formPanel(cw_in_fp: str, ca_panel_in_fp: str, cm_panel_in_fp: str, cmc_panel_in_fp: str, cg_panel_in_fp: str,
    san_panel_in_fp: str, asset_ico_in_fp: str, messari_in_fp: str) -> pd.DataFrame:
    """ Form a single panel DataFrame from all the data sources.

    Args:
        cw_in_fp (str): pickle of the crosswalk with the id columns asset_ca,
            asset_cm, asset_san, asset_cmc, and asset_cg.
        ca_panel_in_fp (str): pickle of the panel keyed on date and asset_ca.
        cm_panel_in_fp (str): pickle of the panel keyed on date and asset_cm.
        cmc_panel_in_fp (str): pickle of the panel keyed on date and asset_cmc.
        cg_panel_in_fp (str): pickle of the panel keyed on date and asset_cg.
        san_panel_in_fp (str): pickle of the panel keyed on date and asset_san.
        asset_ico_in_fp (str): pickle of asset-level data keyed on asset_cm.
        messari_in_fp (str): pickle of asset-level data keyed on asset_cm.

    Returns:
        (pd.DataFrame): panel unique on date and asset, sorted by those two
            columns, with every other column prefixed by 'char_'.
    """
    # import
    cw_df = pd.read_pickle(cw_in_fp)
    ca_panel_df = pd.read_pickle(ca_panel_in_fp)
    cm_panel_df = pd.read_pickle(cm_panel_in_fp)
    cmc_panel_df = pd.read_pickle(cmc_panel_in_fp)
    cg_panel_df = pd.read_pickle(cg_panel_in_fp)
    san_panel_df = pd.read_pickle(san_panel_in_fp)
    ico_panel_df = pd.read_pickle(asset_ico_in_fp)
    messari_df = pd.read_pickle(messari_in_fp)

    # Merge CM and CA data together; the CA panel defines the rows that are kept
    panel_df = ca_panel_df.merge(cw_df[['asset_ca', 'asset_cm']], on='asset_ca', how='inner', validate='many_to_one')
    panel_df = panel_df.merge(cm_panel_df, on=['date', 'asset_cm'], how='left', validate='one_to_one')

    # Merge on rest of panel data: map each provider id to asset_cm through the
    # crosswalk, then left merge on to the main panel
    provider_panels = [('asset_san', san_panel_df),
                       ('asset_cmc', cmc_panel_df),
                       ('asset_cg', cg_panel_df)]
    for id_col, provider_df in provider_panels:
        id_map_df = cw_df.loc[cw_df[id_col].notnull(), [id_col, 'asset_cm']]
        provider_df = provider_df.merge(id_map_df, on=id_col, how='inner', validate='many_to_one')
        panel_df = panel_df.merge(provider_df, on=['date', 'asset_cm'], how='left', validate='one_to_one')

    # Merge on the asset-level (static) data
    panel_df = panel_df.merge(ico_panel_df, on=['asset_cm'], how='left', validate='many_to_one')
    panel_df = panel_df.merge(messari_df, on=['asset_cm'], how='left', validate='many_to_one')

    # Drop other asset id columns
    panel_df = panel_df.drop(['asset_ca', 'asset_san', 'asset_cmc', 'asset_cg'], axis=1)

    # Rename id column
    panel_df = panel_df.rename(columns={'asset_cm': 'asset'})

    # Ensure all columns have char in front of the col name
    id_cols = ['date', 'asset']
    panel_df.columns = [col if col in id_cols else 'char_'+col for col in panel_df.columns]

    # Make sure unique on id columns
    assert 0 == panel_df[id_cols].duplicated().sum()

    # Resort
    other_cols = [col for col in panel_df.columns if col not in id_cols]
    panel_df = panel_df[id_cols+other_cols]
    panel_df = panel_df.sort_values(by=id_cols, ignore_index=True)

    # Clean up CoinGecko columns that were at 24h freq to fill missing at 1h freq.
    # The panel is sorted by date so the forward fill runs in time order within asset.
    cg_cols = ['char_usd_per_token_cg', 'char_usd_mcap_cg', 'char_usd_volume_24h_cg', 
        'char_twitter_followers_cg', 'char_github_activity_cg', 'char_reddit_activity_cg']
    for cg_col in cg_cols:
        panel_df[cg_col] = panel_df.groupby('asset')[cg_col].ffill()

    # ensure no missing in critical columns; missing cm values are coded as zero
    for col in ['char_usd_per_token_cm', 'char_usd_volume_cm']:
        panel_df[col] = panel_df[col].fillna(0)
    critical_cols = ['char_usd_per_token_ca',
        'char_usd_volume_ca',
        'char_usd_per_token_cm',
        'char_usd_volume_cm']
    for col in critical_cols:
        assert 0 == panel_df[col].isnull().sum()

    # Clear memory
    del provider_panels, provider_df, id_map_df
    del cw_df, ca_panel_df, cm_panel_df, cmc_panel_df
    del cg_panel_df, san_panel_df, ico_panel_df, messari_df
    gc.collect()

    return panel_df


In [3]:
def formMacro(macro_in_fp: str, ca_macro_in_fp: str, cm_macro_in_fp: str, 
    cmc_macro_in_fp: str, san_macro_in_fp: str, macro_ico_in_fp: str) -> pd.DataFrame:
    """ Form single macro DataFrame out of all macro data.

    The frame at macro_in_fp is expanded to an hourly grid between its first
    and last date (forward filling the values) and the other five frames are
    left merged on to it by date.

    Args:
        macro_in_fp (str): pickle of the base macro data with a 'date' column
            and no missing values.
        ca_macro_in_fp (str): pickle of macro data unique on 'date'.
        cm_macro_in_fp (str): pickle of macro data unique on 'date'.
        cmc_macro_in_fp (str): pickle of macro data unique on 'date'.
        san_macro_in_fp (str): pickle of macro data unique on 'date'.
        macro_ico_in_fp (str): pickle of macro data unique on 'date'.

    Returns:
        (pd.DataFrame): hourly macro data sorted by date where every column
            other than 'date' is prefixed with 'macro_'.
    """
    # import
    macro_df = pd.read_pickle(macro_in_fp)
    ca_macro_df = pd.read_pickle(ca_macro_in_fp)
    cm_macro_df = pd.read_pickle(cm_macro_in_fp)
    cmc_macro_df = pd.read_pickle(cmc_macro_in_fp)
    san_macro_df = pd.read_pickle(san_macro_in_fp)
    ico_macro_df = pd.read_pickle(macro_ico_in_fp)

    # convert macro to hourly dataframe
    assert 0 == macro_df.isnull().sum().sum()
    macro_df = macro_df.set_index('date')
    # a Timedelta frequency avoids the deprecated upper-case 'H' alias
    full_date_range = pd.date_range(start=macro_df.index.min(),
                                    end=macro_df.index.max(),
                                    freq=pd.Timedelta(hours=1))
    macro_df = macro_df.reindex(full_date_range).ffill()
    macro_df = macro_df.rename_axis('date').reset_index()
    assert 0 == macro_df.isnull().sum().sum()

    # merge on other macro data
    for other_df in (ca_macro_df, cm_macro_df, cmc_macro_df, san_macro_df, ico_macro_df):
        macro_df = macro_df.merge(other_df, on='date', how='left', validate='one_to_one')

    # Ensure all columns have macro in front of the col name
    macro_df.columns = [col if col=='date' else 'macro_'+col for col in macro_df.columns]

    # Make sure unique on id columns
    assert macro_df.date.is_unique

    # Resort
    macro_df = macro_df.sort_values(by='date', ignore_index=True)

    # Clear memory
    del other_df, ca_macro_df, cm_macro_df, cmc_macro_df, san_macro_df, ico_macro_df
    gc.collect()

    return macro_df


In [4]:
def finalizeAssetUniverse(panel_df: pd.DataFrame, asset_universe_dict: Dict[str, list]) -> tuple:
    """ Finalize the asset universe by confirming if every asset and start_of_month_date
        have the necessary 90 days of data before that month; subset the panel to these
        included asset-dates 90 days before and the month of that asset-start_of_month_date.

    Args:
        panel_df (pd.DataFrame): the study's panel data, unique on 'date' and 'asset'.
        asset_universe_dict (dict): dictionary with keys for the start of each month of
            the study period and values of included assets for that month.

    Returns: tuple
        panel_df (pd.DataFrame): subsetted panel to the applicable asset-timestamps.
        final_asset_universe_dict (dict): updated included assets and start_of_month_dates.
    """
    lookback = pd.Timedelta(days=90)
    required_num_obs = 90*24  # every hour of the trailing 90 days

    # Split the id columns by asset once so each asset-month only scans that
    # asset's rows instead of the whole panel.
    id_df = panel_df[['date', 'asset']]
    ids_by_asset = {asset: asset_ids for asset, asset_ids in id_df.groupby('asset', sort=False)}
    empty_ids_df = id_df.iloc[0:0]  # keeps the dtypes of the id columns

    # Collect the date-asset pairs to keep and build a new asset_universe_dict
    kept_ids = []
    final_asset_universe_dict = {}
    for date, assets in asset_universe_dict.items():
        # add this date to the new dictionary
        final_asset_universe_dict[date] = []

        # form start and end date for window 90 days before trading date
        end_date = pd.Timestamp(date)
        start_date = end_date - lookback
        end_date_plus_one_month = end_date + pd.DateOffset(months=1)

        # adjust to add the 2023 days if it is last month of study
        if date == '2022-12-01':
            end_date_plus_one_month += pd.DateOffset(days=2)

        # loop over all assets for this start of month date
        for asset in assets:
            # Dates associated with this asset trading window, including the
            # entire month for this month of the study.
            asset_ids = ids_by_asset.get(asset, empty_ids_df)
            asset_df = asset_ids[(asset_ids['date'] >= start_date)
                                 & (asset_ids['date'] < end_date_plus_one_month)]

            # Count number of obs for just the trailing 3 months
            asset_num_obs = int((asset_df['date'] < end_date).sum())

            # If asset has all trailing 90 days of data (at hourly freq)
            if asset_num_obs == required_num_obs:
                # then add it to the final asset universe
                final_asset_universe_dict[date].append(asset)

                # and keep its asset-timestamps to subset the panel down to
                kept_ids.append(asset_df)

    # Combine once at the end; windows of adjacent months overlap so drop repeats.
    # When nothing qualified, fall back to an empty frame with the panel's dtypes
    # so the merge below still works and returns an empty panel.
    if kept_ids:
        dates_df = pd.concat(kept_ids, ignore_index=True).drop_duplicates()
    else:
        dates_df = empty_ids_df

    # cut panel down to just asset-dates of interest
    panel_df = panel_df.merge(dates_df, on=['date', 'asset'], how='inner', validate='one_to_one')

    return panel_df, final_asset_universe_dict


In [5]:
def formPrices(panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Form price column from actually tradable prices and 
        the global price from all data sources. 
    
    Args: 
        panel_df (pd.DataFrame): the panel data frame with all columns.
            Zeros in char_usd_per_token_cm are treated as missing and are
            set to missing on the passed frame.
        
    Returns:
        (pd.DataFrame): updated panel data with new columns char_price_t
                        and char_price_global_t with the raw prices 
                        columns removed.
    """
    # confirm cm and ca tradable prices are good to go
    assert 0 == panel_df.char_usd_per_token_ca.isnull().sum()
    assert 0 == panel_df.char_usd_per_token_cm.isnull().sum()
    assert 0 == (panel_df.char_usd_per_token_ca==0).sum()
    panel_df.loc[panel_df.char_usd_per_token_cm==0, 'char_usd_per_token_cm'] = np.nan
    for col in ['char_usd_per_token_cm', 'char_usd_per_token_ca']:
        assert 1e6 > np.max(panel_df[col])
    for col in ['char_usd_per_token_cm', 'char_usd_per_token_ca']:
        assert 0 < np.min(panel_df[col])

    # form char_price_t: the average of cm and ca, or just ca when cm is missing
    cm_price = panel_df['char_usd_per_token_cm']
    ca_price = panel_df['char_usd_per_token_ca']
    panel_df['char_price_t'] = ((1/2)*(cm_price + ca_price)).where(cm_price.notnull(), ca_price)
    assert 0 == panel_df.char_price_t.isnull().sum()
    assert 0 < np.min(panel_df.char_price_t)
    # the upper bound has to be checked against the largest price
    assert 1e6 > np.max(panel_df.char_price_t)

    # clean up old cm and ca columns
    panel_df = panel_df.drop(['char_usd_per_token_ca', 'char_usd_per_token_cm'], axis=1)

    # clean up the global price cols before combining
    global_price_cols = ['char_usd_ref_price_ca', 'char_reference_rate_usd_cm',
        'char_price_usd_san', 'char_usd_per_token_cmc', 'char_usd_per_token_cg']
    for col in global_price_cols:
        panel_df.loc[(panel_df[col]<=0) | (panel_df[col]>1e6) , col] = np.nan

    # form global price column as the mean of the nonmissing sources
    panel_df['char_price_global_t'] = panel_df[global_price_cols].mean(axis=1)
    assert 0 == panel_df.char_price_global_t.isnull().sum()
    assert 0 == panel_df[(panel_df.char_price_global_t <= 0) | (panel_df.char_price_global_t > 1e6)].shape[0]

    # clean up old global price columns
    panel_df = panel_df.drop(global_price_cols, axis=1)

    return panel_df


In [6]:
def formExcessReturn(panel_df: pd.DataFrame, hours_ahead: int) -> pd.DataFrame:
    """ Form excess return at given number of hours ahead.

    Args:
        panel_df (pd.DataFrame): panel unique on 'date' and 'asset' with the columns
            'char_price_t' and the risk free return 'r_rf_tp<hours_ahead>'.
        hours_ahead (int): number of hours ahead to measure the return over.

    Returns:
        (pd.DataFrame): the panel with the new column 'r_ex_tp<hours_ahead>', which
            is missing wherever the asset has no row hours_ahead hours later.
    """
    # create new column names
    suffix = str(hours_ahead)
    new_price_col = 'char_price_tp'+suffix
    new_ret_col = 'r_ex_tp'+suffix
    rf_ret_col = 'r_rf_tp'+suffix

    # form new df of the prices for given number of hours ahead by moving
    # each price back to the timestamp from which it is hours_ahead away
    temp_df = panel_df[['date', 'asset', 'char_price_t']].rename(columns={'char_price_t': new_price_col})
    temp_df['date'] = temp_df['date'] - pd.Timedelta(hours=hours_ahead)

    # merge it back on
    panel_df = panel_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    del temp_df

    # form return: simple return net of the risk free return
    simple_ret = (panel_df[new_price_col] - panel_df.char_price_t)/panel_df.char_price_t
    panel_df[new_ret_col] = simple_ret - panel_df[rf_ret_col]
    
    # delete the created column
    panel_df = panel_df.drop(new_price_col, axis=1)

    return panel_df

def formLHSs(panel_df: pd.DataFrame, macro_df: pd.DataFrame) -> pd.DataFrame:
    """ Form LHS's of excess returns one hour and day ahead.

    Returns are formed twice: a first pass identifies outlier returns, the prices at
    and one hour around those outliers are replaced by the asset's previous price,
    and the returns are then formed again from the cleaned prices.

    Args:
        panel_df (pd.DataFrame): panel unique on 'date' and 'asset' with 'char_price_t'.
        macro_df (pd.DataFrame): macro data unique on 'date' with 'macro_dgs1mo_fred',
            which is used as an annual percent rate to form the risk free return.

    Returns:
        (pd.DataFrame): panel with new columns 'r_ex_tp1' and 'r_ex_tp24', without
            rows missing either return and without rows dated 2023 or later.
    """
    # Add on one month tbill
    panel_df = panel_df.merge(macro_df[['date', 'macro_dgs1mo_fred']], 
        on=['date'], how='left', validate='many_to_one')
    panel_df['macro_dgs1mo_fred'] = panel_df['macro_dgs1mo_fred'].ffill()

    # Form one hour and one day ahead risk free return
    panel_df['r_rf_tp1']  = (1+panel_df.macro_dgs1mo_fred.values/100)**(1/(365*24))-1
    panel_df['r_rf_tp24'] = (1+panel_df.macro_dgs1mo_fred.values/100)**(1/(365))-1
    panel_df = panel_df.drop('macro_dgs1mo_fred', axis=1)

    # Form one hour and one day ahead excess return
    panel_df = formExcessReturn(panel_df, hours_ahead=1)
    panel_df = formExcessReturn(panel_df, hours_ahead=24)

    # identify outlier returns
    outliers_df = panel_df[(panel_df['r_ex_tp1'] > 2.75) | 
                        (panel_df['r_ex_tp24'] > 10) |
                        (panel_df['r_ex_tp1'] < -0.5) |
                        (panel_df['r_ex_tp24'] < -0.8)][['date', 'asset']]

    # identify all datetime-asset pairs one hour before and after these outlier returns
    one_hour = pd.Timedelta(hours=1)
    before_df = outliers_df.copy()
    before_df['date'] = before_df['date'] - one_hour
    after_df = outliers_df.copy()
    after_df['date'] = after_df['date'] + one_hour
    outliers_df = pd.concat([outliers_df, before_df, after_df])
    del before_df, after_df
    outliers_df = outliers_df.drop_duplicates()
    outliers_df['set_price_missing'] = 1

    # set prices on panel to missing if in this outlier set and replace
    # them with the previous price of the same asset
    panel_df = panel_df.merge(outliers_df, on=['date', 'asset'], how='left', validate='one_to_one')
    del outliers_df
    panel_df.loc[panel_df.set_price_missing==1, 'char_price_t'] = np.nan
    panel_df = panel_df.sort_values(by=['date', 'asset'], ignore_index=True)
    panel_df['char_price_t'] = panel_df.groupby('asset')['char_price_t'].ffill()
    panel_df = panel_df.drop('set_price_missing', axis=1)
    assert 0 == panel_df.char_price_t.isnull().sum()

    # drop the old returns to form new ones
    panel_df = panel_df.drop(columns=['r_ex_tp1', 'r_ex_tp24'])
    panel_df = formExcessReturn(panel_df, hours_ahead=1)
    panel_df = formExcessReturn(panel_df, hours_ahead=24)

    # Drop unnecessary columns
    panel_df = panel_df.drop(columns=['r_rf_tp1', 'r_rf_tp24'])

    # Fix outliers that remain after cleaning the prices
    panel_df.loc[panel_df.r_ex_tp24<-1, 'r_ex_tp24'] = -.999999
    panel_df.loc[panel_df.r_ex_tp24>10, 'r_ex_tp24'] = 9.71
    panel_df.loc[panel_df.r_ex_tp1>2.75, 'r_ex_tp1'] = 2

    # Drop rows with missing lhs
    panel_df = panel_df[panel_df.r_ex_tp24.notnull()]
    panel_df = panel_df[panel_df.r_ex_tp1.notnull()]

    # Confirm all nonmissing
    assert 0 == panel_df.r_ex_tp24.isnull().sum()
    assert 0 == panel_df.r_ex_tp1.isnull().sum()

    # Drop 2023 data
    panel_df = panel_df[panel_df.date.dt.year < 2023]

    return panel_df


In [7]:
def formVolumes(panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Combine the raw volume columns into three volume characteristics:
        usd volume and trade counts in actually tradable markets, and
        global 24h usd volume.
    
    Args: 
        panel_df (pd.DataFrame): the panel data frame with all columns.
        
    Returns:
        (pd.DataFrame): panel with the new columns char_volume_t, char_trades_t,
                        and char_volume_24h_global_t and without the raw
                        volume columns.
    """
    # missing counts and global volumes are treated as zero
    zero_fill_cols = ['char_trades_cm', 'char_usd_volume_24h_cmc',
        'char_usd_volume_24h_cg', 'char_volume_usd_san']
    for fill_col in zero_fill_cols:
        panel_df[fill_col] = panel_df[fill_col].fillna(0)

    # cap the san volume
    san_too_large = panel_df['char_volume_usd_san'] > 1e11
    panel_df.loc[san_too_large, 'char_volume_usd_san'] = 1e11

    def _assert_valid(frame: pd.DataFrame, columns: list) -> None:
        # every column must be nonmissing, weakly positive, and bounded
        for name in columns:
            values = frame[name]
            assert 0 == values.isnull().sum()
            assert 0 == (values<0).sum()
            assert 0 == (values>2e12).sum()

    # validate the raw inputs
    volume_cols  = ['char_usd_volume_ca',
        'char_usd_volume_cm',
        'char_trades_volume_ca',
        'char_trades_cm',
        'char_usd_volume_24h_cmc',
        'char_usd_volume_24h_cg',
        'char_volume_usd_san']
    _assert_valid(panel_df, volume_cols)
    
    # tradable usd volume: average of ca and cm
    tradable_usd = panel_df['char_usd_volume_ca'] + panel_df['char_usd_volume_cm']
    panel_df['char_volume_t'] = (1/2)*tradable_usd

    # tradable trade counts: average of ca and cm
    tradable_trades = panel_df['char_trades_volume_ca'] + panel_df['char_trades_cm']
    panel_df['char_trades_t'] = (1/2)*tradable_trades

    # global 24h usd volume: average of cmc, cg, and san
    global_usd = (panel_df['char_usd_volume_24h_cmc']
                  + panel_df['char_usd_volume_24h_cg']
                  + panel_df['char_volume_usd_san'])
    panel_df['char_volume_24h_global_t'] = (1/3)*global_usd

    # validate the outputs
    _assert_valid(panel_df, ['char_volume_t', 'char_trades_t', 'char_volume_24h_global_t'])

    # the raw columns are no longer needed
    return panel_df.drop(volume_cols, axis=1)


In [8]:
def formMcap(panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Build char_mcap_t as the mean of the nonmissing market cap sources.

    Args:
        panel_df (pd.DataFrame): panel with 'date', 'asset', and the four raw
            market cap columns; zeros in those columns are treated as missing.

    Returns:
        (pd.DataFrame): panel with the new column char_mcap_t and without
            the raw market cap columns.
    """
    mcap_cols = ['char_cap_mrkt_est_usd_cm', 'char_usd_mcap_cmc',
                'char_usd_mcap_cg', 'char_marketcap_usd_san']

    # Zeros are missing values; everything left must be positive and bounded
    for source_col in mcap_cols:
        is_zero = panel_df[source_col]==0
        panel_df.loc[is_zero, source_col] = np.nan
        assert 0 == (panel_df[source_col]<=0).sum()
        assert 0 == (panel_df[source_col]>2e12).sum()

    # Average across the sources that have a value
    panel_df['char_mcap_t'] = panel_df[mcap_cols].mean(axis=1)

    # Known gaps filled by hand: (asset, month, year, value) where a month or
    # year of None means that part of the date is not restricted
    manual_fixes = [('chr', 5, None, 8e6),
                    ('dia', 9, 2020, 8e7),
                    ('movr', 9, 2021, 5e8),
                    ('qrdo', None, 2021, 7.7e7),
                    ('ctc', None, 2022, 2.5e8)]
    for fix_asset, fix_month, fix_year, fix_value in manual_fixes:
        to_fix = (panel_df.asset==fix_asset) & panel_df.char_mcap_t.isnull()
        if fix_month is not None:
            to_fix = to_fix & (panel_df.date.dt.month==fix_month)
        if fix_year is not None:
            to_fix = to_fix & (panel_df.date.dt.year==fix_year)
        panel_df.loc[to_fix, 'char_mcap_t'] = fix_value

    # The combined column must be complete and within bounds
    mcap = panel_df['char_mcap_t']
    assert 0 == mcap.isnull().sum()
    assert 0 == (mcap<0).sum()
    assert 0 == (mcap>2e12).sum()

    # The raw sources are no longer needed
    return panel_df.drop(mcap_cols, axis=1)


In [9]:
def formStaticCharacteristics(panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Form all static characteristics.

    Args:
        panel_df (pd.DataFrame): panel with 'date', 'asset', the messari usage and
            industry columns, the momtaz ico columns, and 'char_category_san'.
            The messari usage columns are renamed on the passed frame.

    Returns:
        (pd.DataFrame): panel where the messari usage columns lose their '_messari'
            suffix, with the new columns 'char_ico_days_since_t', 'char_ico',
            'char_ico_price', and one 'char_industry_<industry>' indicator
            column per industry.
    """
    # ensure messari usage columns are nonmissing and all ones and zeros
    messari_cols = ['char_asset_usage_payments_messari',
        'char_asset_usage_vote_messari',
        'char_asset_usage_work_messari',
        'char_asset_usage_dividends_messari',
        'char_asset_usage_access_messari',
        'char_asset_usage_discount_messari',
        'char_pow_messari',
        'char_pos_messari']
    for col in messari_cols:
        assert len(panel_df) == (panel_df[col].isin([0,1])).sum()
    column_mapping = {col: col.replace('_messari', '') for col in messari_cols}
    panel_df.rename(columns=column_mapping, inplace=True)

    # clean momtaz ico columns: days since the ico is zero when there is no
    # ico date or when the ico date is after the row's date
    days_since = (panel_df.date - panel_df.char_ico_date_momtaz).dt.days
    panel_df['char_ico_days_since_t'] = days_since.where(days_since >= 0, 0).astype(int)
    panel_df = panel_df.drop('char_ico_date_momtaz', axis=1)
    panel_df['char_ico_momtaz'] = panel_df['char_ico_momtaz'].fillna(0)
    # a missing ico price is coded as -1; fill the price column itself rather
    # than overwriting it with the ico indicator
    panel_df['char_ico_price_momtaz'] = panel_df['char_ico_price_momtaz'].fillna(-1)
    panel_df = panel_df.rename(columns={'char_ico_momtaz': 'char_ico',
                                        'char_ico_price_momtaz': 'char_ico_price'})

    # Form industry
    assert 0 == panel_df.char_industry_messari.isnull().sum()
    panel_df['char_industry'] = panel_df.char_industry_messari
    panel_df.loc[panel_df.asset=='forth', 'char_industry'] = 'other_defi'
    panel_df = panel_df.drop(['char_industry_messari', 'char_category_san'], axis=1)
    assert 0 == panel_df.char_industry.isnull().sum()
    indicator_columns = pd.get_dummies(panel_df['char_industry'], prefix='char_industry')
    assert 0 == indicator_columns.isnull().sum().sum()
    panel_df = panel_df.join(indicator_columns)
    # one (asset, industry) pair per asset means industry is constant within asset
    assert (len(panel_df.groupby('asset').char_industry.value_counts().index.get_level_values(0))
        == len(panel_df.asset.unique())), "Industry is not unique within asset."
    panel_df = panel_df.drop('char_industry', axis=1)

    return panel_df


In [10]:
def cleanDistributionCols(panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Clean the san holder distribution columns.

    Args:
        panel_df (pd.DataFrame): panel with 'asset' and the san holder distribution
            columns; rows are expected to be in time order within asset so that the
            forward fill carries the previous value forward.

    Returns:
        (pd.DataFrame): panel without the combined balance and active holder
            distribution columns, and with the 'char_holders_distribution_*_san'
            columns renamed to end in '_t' and filled to be nonmissing.
    """
    # drop columns
    dist_cols_to_drop = ['char_holders_distribution_combined_balance_over_1_san',
        'char_holders_distribution_combined_balance_over_10_san',
        'char_holders_distribution_combined_balance_over_100_san',
        'char_holders_distribution_combined_balance_over_100k_san',
        'char_holders_distribution_combined_balance_over_10k_san',
        'char_holders_distribution_combined_balance_over_1M_san',
        'char_holders_distribution_combined_balance_over_1k_san',
        'char_holders_distribution_combined_balance_total_san',
        'char_active_holders_distribution_combined_balance_over_1_san',
        'char_active_holders_distribution_combined_balance_over_10_san',
        'char_active_holders_distribution_combined_balance_over_100_san',
        'char_active_holders_distribution_combined_balance_over_100k_san',
        'char_active_holders_distribution_combined_balance_over_10k_san',
        'char_active_holders_distribution_combined_balance_over_1M_san',
        'char_active_holders_distribution_combined_balance_over_1k_san',
        'char_active_holders_distribution_combined_balance_total_san',
        'char_active_holders_distribution_over_1_san',
        'char_active_holders_distribution_over_10_san',
        'char_active_holders_distribution_over_100_san',
        'char_active_holders_distribution_over_100k_san',
        'char_active_holders_distribution_over_10k_san',
        'char_active_holders_distribution_over_1M_san',
        'char_active_holders_distribution_over_1k_san',
        'char_active_holders_distribution_total_san']
    panel_df = panel_df.drop(dist_cols_to_drop, axis=1)

    # clean remaining dist cols
    dist_cols = ['char_holders_distribution_over_1_san',
        'char_holders_distribution_over_10_san',
        'char_holders_distribution_over_100_san',
        'char_holders_distribution_over_100k_san',
        'char_holders_distribution_over_10k_san',
        'char_holders_distribution_over_1M_san',
        'char_holders_distribution_over_1k_san',
        'char_holders_distribution_total_san']
    # swap the trailing '_san' for '_t'
    column_mapping = {col: col[:-4]+'_t' for col in dist_cols}
    panel_df = panel_df.rename(columns=column_mapping)
    for new_col in column_mapping.values():
        # carry the last value forward within asset; anything before the
        # first observed value becomes zero
        panel_df[new_col] = panel_df.groupby('asset')[new_col].ffill().fillna(0)
        assert 0 == panel_df[new_col].isnull().sum()

    return panel_df


In [11]:
def formSupplyCols(panel_df: pd.DataFrame, cm_raw_panel_in_fp: str) -> pd.DataFrame:
    """ Form circulating and max supply columns from underlying columns.

    Args:
        panel_df (pd.DataFrame): panel unique on 'date' and 'asset' with the cmc and
            san supply columns; rows are expected to be in time order within asset
            so that the forward fill carries the previous value forward.
        cm_raw_panel_in_fp (str): pickle of a frame unique on 'date' and 'asset'
            with the columns SplyAct10yr, SplyActEver, SplyCur, and SplyFF.

    Returns:
        (pd.DataFrame): panel with new columns 'char_supply_circ_t' and
            'char_supply_max_t' and without the underlying supply columns.
    """
    # import the cm raw panel data and cut down to needed data
    cm_supply_cols = ['SplyAct10yr', 'SplyActEver', 'SplyCur', 'SplyFF']
    cm_raw_panel_df = pd.read_pickle(cm_raw_panel_in_fp)[['date', 'asset']+cm_supply_cols]

    # merge on to main df
    panel_df = panel_df.merge(cm_raw_panel_df, on=['date', 'asset'], how='left', validate='one_to_one')
    del cm_raw_panel_df

    # treat nonpositive supply as missing, carry the last value forward within
    # asset, and ensure what remains is positive and bounded
    supply_cols = ['char_circulating_supply_cmc', 'char_max_supply_cmc',
        'char_total_supply_cmc', 'char_total_supply_san',
        'char_circulation_san', 'char_circulation_5y_san']+cm_supply_cols
    for col in supply_cols:
        panel_df.loc[panel_df[col]<=0, col] = np.nan
        panel_df[col] = panel_df.groupby('asset')[col].ffill()
        assert 0 == (panel_df[col]<=0).sum()
        assert 0 == (panel_df[col]>1e18).sum()

    # create circulating supply column as the mean of the nonmissing sources
    panel_df['char_supply_circ_t'] = panel_df[['char_circulating_supply_cmc',
        'SplyAct10yr', 'SplyActEver', 'SplyCur', 'SplyFF',
        'char_circulation_san', 'char_circulation_5y_san']].mean(axis=1)

    # manually set the circulating supply of assets where every source is missing
    manual_circ_supply = {'chr': 8e8, 'dia': 1.5e8, 'unfi': 2.5e6,
                          'orca': 1.5e7, 't': 3e9, 'gmt': 6e8}
    for asset, supply in manual_circ_supply.items():
        panel_df.loc[(panel_df.asset==asset)
            & (panel_df.char_supply_circ_t.isnull()), 'char_supply_circ_t'] = supply

    # create max supply column by averaging cmc and san, falling back to the
    # circulating supply where neither source has a value
    panel_df['char_supply_max_t'] = panel_df[['char_max_supply_cmc',
                                                'char_total_supply_cmc',
                                                'char_total_supply_san']].mean(axis=1)
    panel_df['char_supply_max_t'] = panel_df['char_supply_max_t'].fillna(panel_df['char_supply_circ_t'])

    # ensure for all positive, nonmissing, and bounded
    new_cols = ['char_supply_circ_t', 'char_supply_max_t']
    for col in new_cols:
        assert 0 == panel_df[col].isnull().sum()
        assert 0 == (panel_df[col]<=0).sum()
        assert 0 == (panel_df[col]>1e18).sum()

    # drop old columns
    panel_df = panel_df.drop(supply_cols, axis=1)

    return panel_df


In [12]:
def formDevActivity(panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Form single developer activity column from underlying column.

    Args:
        panel_df (pd.DataFrame): panel with 'asset' and the cg and san developer
            activity columns; rows are expected to be in time order within asset
            so that the forward fill carries the previous value forward.

    Returns:
        (pd.DataFrame): panel with the new column 'char_dev_activity_t', the mean of
            the normalized underlying columns, and without the underlying columns.
    """
    # set cols to combine
    dev_cols = ['char_github_activity_cg', 'char_github_activity_san',
        'char_github_activity_contributors_count_san', 'char_dev_activity_san',
        'char_dev_activity_contributors_count_san']
    
    # fill missing cg activity with -1; this runs before the forward fill below
    # so the cg column has nothing left to forward fill
    panel_df.loc[panel_df.char_github_activity_cg.isnull(), 'char_github_activity_cg'] = -1

    # ensure columns are all nonmissing, at least -1, and bounded
    for col in dev_cols:
        panel_df[col] = panel_df.groupby('asset')[col].ffill().fillna(0)
        assert 0 == panel_df[col].isnull().sum()
        assert 0 == (panel_df[col]<-1).sum()
        assert 0 == (panel_df[col]>1e18).sum()

    # normalize all columns to -1 to 1
    for col in dev_cols:
        panel_df = Helper.xsecNormalizeToMinusOneOne(panel_df, target_col=col, asset_col='asset')

    # form single dev column from underlying
    panel_df['char_dev_activity_t'] = panel_df[dev_cols].mean(axis=1)
    assert 0 == panel_df.char_dev_activity_t.isnull().sum()

    # drop old columns
    panel_df = panel_df.drop(dev_cols, axis=1)

    return panel_df


In [13]:
def cleanCmcColumns(panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Clean the remaining few cmc columns.

    Args:
        panel_df (pd.DataFrame): panel with 'date', 'asset', 'char_rank_cmc',
            'char_num_market_pairs_cmc', and 'char_vc_cmc'; rows are expected to be
            in time order within asset so that the forward fill carries the
            previous value forward.

    Returns:
        (pd.DataFrame): panel where the three cmc columns are nonmissing and
            renamed to 'char_rank_cmc_t', 'char_num_pairs_t', and 'char_vc_t'.
    """
    # clean rank column with forward filling and then replacing any missing with
    # the cross-sectional min; transform keeps the panel's index so the fill
    # values line up row by row
    panel_df['char_rank_cmc'] = panel_df.groupby('asset')['char_rank_cmc'].ffill()
    xsec_min_rank = panel_df.groupby('date')['char_rank_cmc'].transform('min')
    panel_df['char_rank_cmc'] = panel_df['char_rank_cmc'].fillna(xsec_min_rank)

    # clean num market pairs and vc to be zero if missing
    for col in ['char_num_market_pairs_cmc', 'char_vc_cmc']:
        panel_df[col] = panel_df[col].fillna(0)

    # confirm ranges and missingness
    cmc_cols = ['char_num_market_pairs_cmc', 'char_rank_cmc', 'char_vc_cmc']
    for col in cmc_cols:
        assert 0 == panel_df[col].isnull().sum()
        assert 0 == (panel_df[col]<0).sum()
        assert 0 == (panel_df[col]>1e6).sum()

    # rename
    panel_df = panel_df.rename(columns={'char_num_market_pairs_cmc': 'char_num_pairs_t', 
                                        'char_rank_cmc': 'char_rank_cmc_t', 
                                        'char_vc_cmc': 'char_vc_t'})

    return panel_df


In [14]:
def formSocial(panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Form social columns.

    Args:
        panel_df (pd.DataFrame): panel with 'asset' and the san and cg social
            columns; rows are expected to be in time order within asset so that
            the forward fill carries the previous value forward.

    Returns:
        (pd.DataFrame): panel with the new columns 'char_social_volume_reddit_t' and
            'char_social_volume_twitter_t', without the unneeded and combined social
            columns, and with the remaining san social columns renamed to end in '_t'.
    """
    # set social columns
    social_cols = ['char_unique_social_volume_total_1h_san',
        'char_sentiment_balance_reddit_san',
        'char_sentiment_balance_total_san',
        'char_sentiment_balance_twitter_san',
        'char_sentiment_balance_twitter_crypto_san',
        'char_sentiment_negative_reddit_san',
        'char_sentiment_negative_total_san',
        'char_sentiment_negative_twitter_san',
        'char_sentiment_negative_twitter_crypto_san',
        'char_sentiment_positive_reddit_san',
        'char_sentiment_positive_total_san',
        'char_sentiment_positive_twitter_san',
        'char_sentiment_positive_twitter_crypto_san',
        'char_sentiment_volume_consumed_reddit_san',
        'char_sentiment_volume_consumed_total_san',
        'char_sentiment_volume_consumed_twitter_san',
        'char_sentiment_volume_consumed_twitter_crypto_san',
        'char_social_dominance_reddit_san',
        'char_social_dominance_total_san',
        'char_social_dominance_twitter_san',
        'char_social_dominance_twitter_crypto_san',
        'char_social_volume_reddit_san',
        'char_social_volume_total_san',
        'char_social_volume_twitter_san',
        'char_social_volume_twitter_crypto_san',
        'char_reddit_activity_cg',
        'char_twitter_followers_cg']

    # fill missings with ffill within asset, or zeros where there is no earlier value
    for col in social_cols:
        panel_df[col] = panel_df.groupby('asset')[col].ffill().fillna(0)

    # drop social columns not needed
    drop_cols = ['char_sentiment_balance_total_san',
        'char_sentiment_balance_twitter_crypto_san',
        'char_sentiment_negative_total_san',
        'char_sentiment_negative_twitter_crypto_san',
        'char_sentiment_positive_total_san',
        'char_sentiment_positive_twitter_crypto_san',
        'char_sentiment_volume_consumed_reddit_san',
        'char_sentiment_volume_consumed_twitter_san',
        'char_sentiment_volume_consumed_twitter_crypto_san',
        'char_social_dominance_reddit_san',
        'char_social_dominance_twitter_san',
        'char_social_dominance_twitter_crypto_san',
        'char_social_volume_total_san',
        'char_social_volume_twitter_crypto_san']
    panel_df = panel_df.drop(drop_cols, axis=1)

    # normalize columns to -1 to 1 x-sectionally before combining
    cols_to_norm = ['char_social_volume_reddit_san',
        'char_reddit_activity_cg',
        'char_social_volume_twitter_san',
        'char_twitter_followers_cg']
    for col in cols_to_norm:
        panel_df = Helper.xsecNormalizeToMinusOneOne(panel_df, target_col=col, asset_col='asset')

    # combine columns
    panel_df['char_social_volume_reddit_t'] = panel_df[['char_social_volume_reddit_san',
                                                        'char_reddit_activity_cg']].mean(axis=1)
    panel_df['char_social_volume_twitter_t'] = panel_df[['char_social_volume_twitter_san',
                                                        'char_twitter_followers_cg']].mean(axis=1)
    assert 0 == panel_df.char_social_volume_reddit_t.isnull().sum()
    assert 0 == panel_df.char_social_volume_twitter_t.isnull().sum()

    # drop combined columns
    panel_df = panel_df.drop(cols_to_norm, axis=1)

    # rename the social columns still on the panel to swap the trailing _san for _t
    remaining_cols = set(panel_df.columns)
    column_mapping = {col: col[:-4]+'_t' for col in social_cols if col in remaining_cols}
    panel_df = panel_df.rename(columns=column_mapping)

    return panel_df


In [15]:
def formAddrCols(panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Build the address and transaction `_t` characteristics from the `_san` columns.

    The six source columns are forward filled within each asset and range checked,
    then collapsed into `char_active_addr_t`, `char_network_growth_t`, and
    `char_tx_volume_t`. Values still missing after the forward fill become zero.

    Args:
        panel_df: panel with an `asset` column and the six `_san` columns listed in
            `address_cols`. The first steps write onto this frame in place, so
            callers should keep using the returned frame.

    Returns:
        The panel with the three `_t` columns added and the five other `_san`
        inputs removed.

    Raises:
        AssertionError: if any source column holds a negative value or one above 1e15.
    """
    # set address cols
    address_cols = ['char_daily_active_addresses_san',
        'char_active_addresses_1h_san',
        'char_network_growth_san',
        'char_payments_count_san',
        'char_transaction_volume_san',
        'char_transactions_count_san']

    # forward fill within each asset in a single grouped pass
    # NOTE(review): assumes rows are already in date order within each asset -- confirm
    panel_df[address_cols] = panel_df.groupby('asset')[address_cols].ffill()

    # confirm ranges
    for col in address_cols:
        assert 0 == (panel_df[col]<0).sum()
        assert 0 == (panel_df[col]>1e15).sum()

    # combine active addresses into single column; the daily figure is divided by 24
    # before it is averaged with the 1h figure, and rows missing either input become 0
    panel_df['char_active_addr_t'] = (0.5*(panel_df['char_daily_active_addresses_san']/24
                                           + panel_df['char_active_addresses_1h_san'])).fillna(0)

    # clear network growth
    panel_df['char_network_growth_san'] = panel_df['char_network_growth_san'].fillna(0)
    panel_df = panel_df.rename(columns={'char_network_growth_san': 'char_network_growth_t'})

    # create transaction volume as the row mean of whichever of the three inputs are present
    tx_cols = ['char_payments_count_san',
               'char_transaction_volume_san',
               'char_transactions_count_san']
    panel_df['char_tx_volume_t'] = panel_df[tx_cols].mean(axis=1).fillna(0)

    # drop the old cols (network growth was renamed above, so it is not in this list)
    old_cols = [col for col in address_cols if col != 'char_network_growth_san']
    return panel_df.drop(old_cols, axis=1)


In [16]:
def cleanDepositAndWithdrawCols(panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Reduce the deposit and withdrawal columns to one transaction count each.

    Drops the unused deposit/withdrawal columns, renames the two transaction
    columns to `char_tx_deposit_t` and `char_tx_withdraw_t`, forward fills them
    within each asset, and zero-fills whatever is still missing.

    Args:
        panel_df: panel with an `asset` column and the `_san` deposit and
            withdrawal columns referenced below.

    Returns:
        A new panel holding the two cleaned `_t` columns in place of the `_san` ones.

    Raises:
        AssertionError: if a cleaned column is negative or exceeds its upper bound
            (1e6 for deposits, 1e8 for withdrawals).
    """
    # drop the deposit and withdraw columns that are not used
    unused_cols = ['char_active_deposits_san', 'char_active_deposits_per_exchange_san',
        'char_deposit_transactions_per_exchange_san', 'char_deposit_balance_san',
        'char_active_withdrawals_san', 'char_active_withdrawals_per_exchange_san',
        'char_withdrawal_balance_san']
    panel_df = panel_df.drop(unused_cols, axis=1)

    # rename the two columns that are kept
    panel_df = panel_df.rename(columns={'char_deposit_transactions_san': 'char_tx_deposit_t',
                                        'char_withdrawal_transactions_san': 'char_tx_withdraw_t'})

    # clean each kept column and check it against its own upper bound
    upper_bounds = {'char_tx_deposit_t': 1e6, 'char_tx_withdraw_t': 1e8}
    for col, upper_bound in upper_bounds.items():
        # NOTE(review): assumes rows are already in date order within each asset -- confirm
        panel_df[col] = panel_df.groupby('asset')[col].ffill()
        panel_df[col] = panel_df[col].fillna(0)
        assert 0 == panel_df[col].isnull().sum()
        assert 0 == (panel_df[col]<0).sum()
        assert 0 == (panel_df[col]>upper_bound).sum()

    return panel_df


In [17]:
def cleanAge(panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Clean the age destroyed and mean dollar invested age columns.

    Drops `char_age_consumed_san` and `char_mean_age_san`, renames the two kept
    columns to `char_age_destroyed_t` and `char_age_mean_dollar_t`, forward fills
    them within each asset, and zero-fills whatever is still missing.

    Args:
        panel_df: panel with an `asset` column and the four `_san` age columns.

    Returns:
        A new panel holding the two cleaned `_t` columns.
    """
    # drop the age columns that are not used
    panel_df = panel_df.drop(['char_age_consumed_san', 'char_mean_age_san'], axis=1)

    # rename the columns that are kept
    rename_map = {'char_age_destroyed_san': 'char_age_destroyed_t',
                  'char_mean_dollar_invested_age_san': 'char_age_mean_dollar_t'}
    panel_df = panel_df.rename(columns=rename_map)
    new_cols = list(rename_map.values())

    # forward fill within each asset, then zero fill what is still missing
    # NOTE(review): assumes rows are already in date order within each asset -- confirm
    panel_df[new_cols] = panel_df.groupby('asset')[new_cols].ffill()
    panel_df[new_cols] = panel_df[new_cols].fillna(0)
    assert 0 == panel_df[new_cols].isnull().sum().sum()

    return panel_df


In [18]:
def cleanCryptoValuationCols(panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Clean the valuation columns and rename them from `_san` to `_t`.

    Drops `char_total_supply_in_profit_san`, renames the listed columns, forward
    fills them within each asset, and zero-fills whatever is still missing.

    Args:
        panel_df: panel with an `asset` column and the `_san` valuation columns.

    Returns:
        A new panel in which every listed column carries the `_t` suffix and has
        no missing values.
    """
    # drop cols
    panel_df = panel_df.drop('char_total_supply_in_profit_san', axis=1)

    # clean cols
    cols = ['char_stock_to_flow_san',
        'char_realized_value_usd_san',
        'char_mean_realized_price_usd_san',
        'char_mvrv_long_short_diff_usd_san',
        'char_mvrv_usd_san',
        'char_nvt_san',
        'char_nvt_transaction_volume_san',
        'char_percent_of_total_supply_in_profit_san']

    # swap the trailing `_san` for `_t` in one rename
    rename_map = {col: col[:-4]+'_t' for col in cols}
    panel_df = panel_df.rename(columns=rename_map)
    new_cols = list(rename_map.values())

    # forward fill within each asset in one grouped pass, then zero fill the rest
    # NOTE(review): assumes rows are already in date order within each asset -- confirm
    panel_df[new_cols] = panel_df.groupby('asset')[new_cols].ffill()
    panel_df[new_cols] = panel_df[new_cols].fillna(0)
    assert 0 == panel_df[new_cols].isnull().sum().sum()

    return panel_df


In [19]:
def cleanFlowCols(panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Clean the flow columns and rename them from `_san` to `_t`.

    Every listed column is renamed, forward filled within each asset, and
    zero-filled where still missing.

    Args:
        panel_df: panel with an `asset` column and the `_san` flow columns.

    Returns:
        A new panel in which every listed column carries the `_t` suffix and has
        no missing values.
    """
    cols = ['char_cexes_to_defi_flow_san', 'char_cexes_to_dex_flow_san',
            'char_cexes_to_dex_traders_flow_san', 'char_cexes_to_traders_flow_san',
            'char_cexes_to_whale_flow_san', 'char_defi_to_cexes_flow_san',
            'char_defi_to_dex_traders_flow_san', 'char_defi_to_dexes_flow_san',
            'char_defi_to_exchanges_flow_san', 'char_defi_to_traders_flow_san',
            'char_defi_to_whale_flow_san', 'char_dex_traders_to_cexes_flow_san',
            'char_dex_traders_to_defi_flow_san', 'char_dex_traders_to_dexes_flow_san',
            'char_dex_traders_to_exchanges_flow_san', 'char_dex_traders_to_whale_flow_san',
            'char_dexes_to_defi_flow_san', 'char_dexes_to_dex_traders_flow_san',
            'char_dexes_to_traders_flow_san', 'char_dexes_to_whale_flow_san',
            'char_dex_to_cexes_flow_san', 'char_exchange_inflow_san',
            'char_exchange_inflow_usd_san', 'char_exchange_outflow_san',
            'char_exchange_outflow_usd_san', 'char_exchanges_to_defi_flow_san',
            'char_exchanges_to_dex_traders_flow_san', 'char_exchanges_to_genesis_flow_san',
            'char_exchanges_to_traders_flow_san', 'char_exchanges_to_whales_flow_san',
            'char_traders_to_cexes_flow_san', 'char_traders_to_defi_flow_san',
            'char_traders_to_dexes_flow_san', 'char_traders_to_exchanges_flow_san',
            'char_traders_to_whale_flow_san', 'char_whale_to_cexes_flow_san',
            'char_whale_to_defi_flow_san', 'char_whale_to_dex_traders_flow_san',
            'char_whale_to_dexes_flow_san', 'char_whale_to_traders_flow_san',
            'char_whales_to_exchanges_flow_san']

    # swap the trailing `_san` for `_t` in one rename instead of one frame copy per column
    rename_map = {col: col[:-4]+'_t' for col in cols}
    panel_df = panel_df.rename(columns=rename_map)
    new_cols = list(rename_map.values())

    # forward fill within each asset in one grouped pass, then zero fill the rest
    # NOTE(review): assumes rows are already in date order within each asset -- confirm
    panel_df[new_cols] = panel_df.groupby('asset')[new_cols].ffill()
    panel_df[new_cols] = panel_df[new_cols].fillna(0)
    assert 0 == panel_df[new_cols].isnull().sum().sum()

    return panel_df


In [20]:
def cleanCirculationCols(panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Clean the circulation columns and rename them from `_san` to `_t`.

    Every listed column is renamed, forward filled within each asset, and
    zero-filled where still missing.

    Args:
        panel_df: panel with an `asset` column and the `_san` circulation columns.

    Returns:
        A new panel in which every listed column carries the `_t` suffix and has
        no missing values.
    """
    cols = ['char_circulation_1d_san', 'char_circulation_2y_san',
            'char_circulation_30d_san', 'char_circulation_365d_san',
            'char_circulation_3y_san', 'char_circulation_7d_san',
            'char_circulation_90d_san', 'char_dormant_circulation_180d_san',
            'char_dormant_circulation_365d_san', 'char_dormant_circulation_90d_san']

    # swap the trailing `_san` for `_t` in one rename
    rename_map = {col: col[:-4]+'_t' for col in cols}
    panel_df = panel_df.rename(columns=rename_map)
    new_cols = list(rename_map.values())

    # forward fill within each asset in one grouped pass, then zero fill the rest
    # NOTE(review): assumes rows are already in date order within each asset -- confirm
    panel_df[new_cols] = panel_df.groupby('asset')[new_cols].ffill()
    panel_df[new_cols] = panel_df[new_cols].fillna(0)
    assert 0 == panel_df[new_cols].isnull().sum().sum()

    return panel_df


In [21]:
def cleanBalanceCols(panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Clean the balance columns and rename them from `_san` to `_t`.

    Drops the unused balance columns, renames `char_trader_balance_san` to match
    the other `traders` columns, then renames, forward fills within each asset,
    and zero-fills every column in `cols`.

    Args:
        panel_df: panel with an `asset` column and the `_san` balance columns.

    Returns:
        A new panel in which every listed column carries the `_t` suffix and has
        no missing values.
    """
    # drop cols
    cols_to_drop = ['char_all_known_balance_san',
                    'char_amount_in_exchange_top_holders_san',
                    'char_amount_in_non_exchange_top_holders_san',
                    'char_dex_trader_balance_san',
                    'char_dex_traders_cex_balance_san',
                    'char_dex_traders_defi_balance_san',
                    'char_dex_traders_dex_balance_san',
                    'char_dex_traders_exchange_balance_san',
                    'char_dex_traders_whale_balance_san',
                    'char_whales_exchange_balance_san']
    panel_df = panel_df.drop(cols_to_drop, axis=1)

    # rename typo
    panel_df = panel_df.rename(columns={'char_trader_balance_san': 'char_traders_balance_san'})

    # clean cols
    cols = ['char_amount_in_top_holders_san', 'char_defi_balance_san',
        'char_defi_cex_balance_san', 'char_defi_dex_balance_san',
        'char_defi_exchange_balance_san', 'char_cex_balance_san',
        'char_dex_balance_san', 'char_dex_cex_balance_san',
        'char_exchange_balance_san', 'char_percent_of_total_supply_on_exchanges_san',
        'char_supply_on_exchanges_san', 'char_supply_outside_exchanges_san',
        'char_traders_balance_san', 'char_traders_cex_balance_san',
        'char_traders_defi_balance_san', 'char_traders_dex_balance_san',
        'char_traders_exchange_balance_san', 'char_traders_whale_balance_san',
        'char_whale_balance_san', 'char_whale_cex_balance_san',
        'char_whale_defi_balance_san', 'char_whale_dex_balance_san']

    # swap the trailing `_san` for `_t` in one rename
    rename_map = {col: col[:-4]+'_t' for col in cols}
    panel_df = panel_df.rename(columns=rename_map)
    new_cols = list(rename_map.values())

    # forward fill within each asset in one grouped pass, then zero fill the rest
    # NOTE(review): assumes rows are already in date order within each asset -- confirm
    panel_df[new_cols] = panel_df.groupby('asset')[new_cols].ffill()
    panel_df[new_cols] = panel_df[new_cols].fillna(0)
    assert 0 == panel_df[new_cols].isnull().sum().sum()

    return panel_df


In [22]:
def formStablecoinDeviation(macro_df: pd.DataFrame) -> pd.DataFrame:
    """ Form `macro_stablecoin_dev_t`, the stablecoin deviation from one dollar.

    For each row the `_ca` quote (USDC or USDT) closer to 1 supplies the deviation;
    ties go to USDC, and if one quote is missing the other is used. That deviation
    is then averaged with `macro_usdt_usdc_dev_from_one_cm`, skipping whichever of
    the two is missing.

    Args:
        macro_df: macro frame with `macro_usd_per_usdc_ca`, `macro_usd_per_usdt_ca`,
            and `macro_usdt_usdc_dev_from_one_cm`. The new column is also written
            onto this frame in place.

    Returns:
        A new frame with `macro_stablecoin_dev_t` added and the three source
        columns dropped.

    Raises:
        AssertionError: if the deviation is missing on any row or falls outside
            [-0.5, 0.5].
    """
    # deviation of each quote from the one dollar peg
    usdc_dev = macro_df['macro_usd_per_usdc_ca'] - 1
    usdt_dev = macro_df['macro_usd_per_usdt_ca'] - 1

    # use usdt when usdc is missing or strictly further from the peg; comparisons with a
    # missing usdt quote are False, so those rows keep the usdc deviation
    use_usdt = usdc_dev.isnull() | (usdc_dev.abs() > usdt_dev.abs())
    macro_df['macro_stablecoin_dev_t'] = usdc_dev.where(~use_usdt, usdt_dev)

    # avg with existing deviation column from coinmetrics
    macro_df['macro_stablecoin_dev_t'] = macro_df[['macro_stablecoin_dev_t',
                                                   'macro_usdt_usdc_dev_from_one_cm']].mean(axis=1)

    # ensure clean
    assert 0 == macro_df.macro_stablecoin_dev_t.isnull().sum()
    assert 0 == macro_df[(macro_df.macro_stablecoin_dev_t < -0.5)
        | (macro_df.macro_stablecoin_dev_t > 0.5)].shape[0]

    # drop old columns
    macro_df = macro_df.drop(['macro_usd_per_usdc_ca', 'macro_usd_per_usdt_ca',
        'macro_usdt_usdc_dev_from_one_cm'], axis=1)

    return macro_df


In [23]:
def formICO(macro_df: pd.DataFrame) -> pd.DataFrame:
    """ Rename the ICO column to `macro_ico_count_t` and zero out its missing values.

    Args:
        macro_df: macro frame holding `macro_ico_sum_momtaz`.

    Returns:
        A renamed copy of the frame whose `macro_ico_count_t` has no missing and
        no negative values.
    """
    # give the column its final name
    ico_df = macro_df.rename(columns={'macro_ico_sum_momtaz': 'macro_ico_count_t'})

    # rows without a value become zero
    is_missing = ico_df.macro_ico_count_t.isnull()
    ico_df.loc[is_missing, 'macro_ico_count_t'] = 0

    # sanity checks: fully populated and non-negative
    ico_counts = ico_df.macro_ico_count_t
    assert ico_counts.isnull().sum() == 0
    assert (ico_counts < 0).sum() == 0

    return ico_df


In [24]:
def formMcCrakenColumns(macro_df: pd.DataFrame) -> pd.DataFrame:
    """ Keep the selected `_fed` macro columns and rename them to end in `_t`.

    Columns with awkward names are renamed first; every other column containing
    `_fed` that is not on the keep list is dropped. No filling happens here: each
    kept column must already be free of missing values.

    Args:
        macro_df: macro frame containing the `_fed` columns.

    Returns:
        A new frame holding only the kept `_fed` columns (now `_t`) plus every
        column that did not contain `_fed`.

    Raises:
        AssertionError: if a kept column has a missing value.
    """
    # snapshot the fed columns under their incoming names
    incoming_fed_cols = {name for name in macro_df.columns if '_fed' in name}

    # tidy the names that carry series codes, spaces, or punctuation
    tidy_names = {
        'macro_ces0600000008_fed': 'macro_ces06_fed',
        'macro_ces2000000008_fed': 'macro_ces20_fed',
        'macro_ces3000000008_fed': 'macro_ces30_fed',
        'macro_s&p 500_fed': 'macro_snp500_fed',
        'macro_s&p pe ratio_fed': 'macro_snp_pe_fed',
        'macro_s&p div yield_fed': 'macro_snp_div_yield_fed',
        'macro_s&p: indust_fed': 'macro_snp_indust_fed',
    }
    macro_df = macro_df.rename(columns=tidy_names)

    # the fed columns that survive
    kept_fed_cols = [
        'macro_aaa_fed', 'macro_acogno_fed', 'macro_amdmnox_fed',
        'macro_andenox_fed', 'macro_awhman_fed', 'macro_awotman_fed',
        'macro_baa_fed', 'macro_businvx_fed', 'macro_busloans_fed',
        'macro_ce16ov_fed', 'macro_claimsx_fed', 'macro_clf16ov_fed',
        'macro_cmrmtsplx_fed', 'macro_compapffx_fed', 'macro_conspi_fed',
        'macro_cp3mx_fed', 'macro_cpiaucsl_fed', 'macro_cusr0000sac_fed',
        'macro_cusr0000sad_fed', 'macro_cusr0000sas_fed',
        'macro_ddurrg3m086sbea_fed', 'macro_dndgrg3m086sbea_fed',
        'macro_dpcera3m086sbea_fed', 'macro_dserrg3m086sbea_fed',
        'macro_dtcthfnm_fed', 'macro_excausx_fed', 'macro_exjpusx_fed',
        'macro_exszusx_fed', 'macro_exusukx_fed', 'macro_fedfunds_fed',
        'macro_gs1_fed', 'macro_gs10_fed', 'macro_gs5_fed',
        'macro_houst_fed', 'macro_hwiuratio_fed', 'macro_indpro_fed',
        'macro_invest_fed', 'macro_m1sl_fed', 'macro_m2real_fed',
        'macro_m2sl_fed', 'macro_bogmbase_fed', 'macro_manemp_fed',
        'macro_nonrevsl_fed', 'macro_oilpricex_fed', 'macro_payems_fed',
        'macro_pcepi_fed', 'macro_permit_fed', 'macro_realln_fed',
        'macro_rpi_fed', 'macro_tb3ms_fed', 'macro_tb6ms_fed',
        'macro_totresns_fed', 'macro_twexafegsmthx_fed', 'macro_uempmean_fed',
        'macro_umcsentx_fed', 'macro_unrate_fed', 'macro_uscons_fed',
        'macro_usfire_fed', 'macro_usgood_fed', 'macro_vixclsx_fed',
        'macro_ces06_fed', 'macro_ces20_fed', 'macro_ces30_fed', 'macro_snp500_fed',
        'macro_snp_pe_fed', 'macro_snp_div_yield_fed', 'macro_snp_indust_fed',
    ]

    # discard incoming fed columns that are still present and are not on the keep list;
    # the tidied columns no longer exist under their old names, so they are never hit
    keep_lookup = set(kept_fed_cols)
    unused_fed_cols = [name for name in macro_df.columns
                       if name in incoming_fed_cols and name not in keep_lookup]
    macro_df = macro_df.drop(unused_fed_cols, axis=1)

    # swap `_fed` for `_t` on the kept columns, then confirm each is fully populated
    final_names = {name: name[:-4]+'_t' for name in kept_fed_cols}
    macro_df = macro_df.rename(columns=final_names)
    for final_name in final_names.values():
        assert macro_df[final_name].isnull().sum() == 0

    return macro_df


In [25]:
def cleanCustomMacroColumns(macro_df: pd.DataFrame) -> pd.DataFrame:
    """ Keep the selected `_fred` and `_ui` macro columns, rename them to `_t`, and forward fill.

    The two hyphenated `_ui` names are normalised first. Every other column
    containing `_fred` or `_ui` that is not on a keep list is dropped. Kept columns
    are forward filled down the frame and must then be free of missing values.

    Args:
        macro_df: macro frame containing the `_fred` and `_ui` columns.

    Returns:
        A new frame with the kept columns renamed to end in `_t`.

    Raises:
        AssertionError: if a kept column still has a missing value after the
            forward fill.
    """
    # snapshot the fred and ui columns under their incoming names
    incoming_cols = [name for name in macro_df.columns if '_fred' in name or '_ui' in name]

    # replace the hyphens in the two ui names
    macro_df = macro_df.rename(columns={'macro_teu-sca_ui': 'macro_teu_sca_ui',
                                        'macro_tmu-sca_ui': 'macro_tmu_sca_ui'})

    # columns that survive, by source
    kept_fred_cols = ['macro_dgs1mo_fred', 'macro_expinf10yr_fred',
                      'macro_expinf1yr_fred', 'macro_expinf20yr_fred',
                      'macro_expinf2yr_fred', 'macro_expinf30yr_fred',
                      'macro_expinf3yr_fred', 'macro_expinf5yr_fred',
                      'macro_t10yie_fred', 'macro_t20yiem_fred',
                      'macro_t30yiem_fred', 'macro_t5yie_fred']
    kept_ui_cols = ['macro_teu_sca_ui', 'macro_tmu_sca_ui', 'macro_emv_ui',
                    'macro_emv_inflation_ui', 'macro_gepu_ui', 'macro_us_mpu_ui']

    # discard incoming columns that are still present and are on neither keep list
    keep_lookup = set(kept_fred_cols) | set(kept_ui_cols)
    present = set(macro_df.columns)
    unused_cols = [name for name in incoming_cols
                   if name in present and name not in keep_lookup]
    macro_df = macro_df.drop(unused_cols, axis=1)

    # strip the source suffix (`_fred` is five characters, `_ui` is three) and add `_t`
    final_names = {name: name[:-5]+'_t' for name in kept_fred_cols}
    final_names.update({name: name[:-3]+'_t' for name in kept_ui_cols})
    macro_df = macro_df.rename(columns=final_names)

    # forward fill each kept column and confirm nothing is left missing
    for final_name in final_names.values():
        macro_df[final_name] = macro_df[final_name].ffill()
        assert macro_df[final_name].isnull().sum() == 0

    return macro_df


In [26]:
def cleanRemainingMacroColumns(macro_df: pd.DataFrame) -> pd.DataFrame:
    """ Finish the macro frame: drop unused columns, average paired sources, rename the rest.

    After this runs every column except `date` ends in `_t`. Paired source columns
    are replaced by their forward-filled row mean. Every other column has its final
    underscore-delimited suffix replaced with `_t` and is forward filled, then
    zero-filled where still missing.

    Args:
        macro_df: macro frame with a `date` column, the columns in the drop list,
            and the paired source columns.

    Returns:
        A new frame with the cleaned `_t` columns.

    Raises:
        AssertionError: if an averaged column still has a missing value after the
            forward fill.
    """
    # discard the columns that are not used
    unused_cols = ['macro_btc_adr_act_rec_cnt_cm', 'macro_btc_adr_act_sent_cnt_cm',
                   'macro_eth_fee_rev_pct_cm', 'macro_btc_fee_rev_pct_cm',
                   'macro_eth_avg_fee_san', 'macro_btc_puell_mul_tot_cm',
                   'macro_eth_puell_mul_tot_cm', 'macro_altcoin_usd_volume_24h_cmc',
                   'macro_btc_dominance_cmc', 'macro_total_adr_act_rec_cnt_cm',
                   'macro_total_adr_act_sent_cnt_cm']
    macro_df = macro_df.drop(unused_cols, axis=1)

    # divide the two 24h volume columns by 24 before they are averaged with their partners
    for volume_col in ('macro_total_usd_volume_24h_cmc', 'macro_ex_usd_volume_24h_dex_cmc'):
        macro_df[volume_col] = macro_df[volume_col]/24

    # new column -> the pair of source columns it averages
    averaged_cols = {
        'macro_btc_mvrv_t': ['macro_btc_cap_mvrv_cur_cm', 'macro_btc_mvrv_san'],
        'macro_eth_fee_med_t': ['macro_eth_fee_med_usd_cm', 'macro_eth_median_fee_san'],
        'macro_eth_mvrv_t': ['macro_eth_mvrv_san', 'macro_eth_cap_mvrv_cur_cm'],
        'macro_eth_total_fee_t': ['macro_eth_total_fee_san', 'macro_eth_fee_tot_usd_cm'],
        'macro_ex_volume_t': ['macro_ex_volume_spot_usd_cm', 'macro_total_usd_volume_24h_cmc'],
        'macro_dex_volume_t': ['macro_total_dex_volume_san', 'macro_ex_usd_volume_24h_dex_cmc'],
    }
    for target_col, source_pair in averaged_cols.items():
        row_mean = macro_df[source_pair].mean(axis=1)
        macro_df[target_col] = row_mean
        macro_df[target_col] = macro_df[target_col].ffill()
        assert macro_df[target_col].isnull().sum() == 0
        macro_df = macro_df.drop(source_pair, axis=1)

    # whatever does not end in `_t` yet, other than the date, still needs cleaning
    leftover_cols = [name for name in macro_df.columns if name[-2:] != '_t']
    leftover_cols.remove('date')

    # replace the last suffix with `_t`, forward fill, then zero whatever is still missing
    for old_name in leftover_cols:
        cleaned_name = old_name[:old_name.rfind('_')]+'_t'
        macro_df = macro_df.rename(columns={old_name: cleaned_name})
        macro_df[cleaned_name] = macro_df[cleaned_name].ffill()
        still_missing = macro_df[cleaned_name].isnull()
        macro_df.loc[still_missing, cleaned_name] = 0
        assert macro_df[cleaned_name].isnull().sum() == 0

    return macro_df


In [27]:
def combineAndCleanPanelAndMacro(panel_df: pd.DataFrame, macro_df: pd.DataFrame,
    asset_universe_dict: Dict[str, list]) -> pd.DataFrame:
    """ Perform various checks and combine the panel and macro data.

    The macro columns are left-merged onto the panel by `date`. The merged frame is
    then validated, sorted by date and asset, and its columns are reordered: id and
    return columns first, then the static characteristics, then everything else
    alphabetically.

    Args:
        panel_df: asset-level panel with `date` and `asset` columns.
        macro_df: macro frame with at most one row per `date`.
        asset_universe_dict: passed to `Helper.findUniqueAssets` to obtain the
            assets allowed in the panel.

    Returns:
        The merged, validated, sorted panel.

    Raises:
        AssertionError: if an hour is missing between the first and last date, any
            value is missing, a timestamp is off the hour, an asset is outside the
            universe, a (date, asset) pair repeats, or the reordering changes the
            number of columns.
        ValueError: if one of the expected leading columns is absent.
    """
    # merge on macro variables
    panel_df = panel_df.merge(macro_df, on='date', how='left', validate='many_to_one')

    # ensure i have all hours; a Timedelta step avoids the deprecated 'H' frequency alias
    min_dt, max_dt = panel_df.date.min(), panel_df.date.max()
    full_date_range = pd.date_range(start=min_dt, end=max_dt, freq=pd.Timedelta(hours=1))
    assert len(panel_df.date.unique()) == len(full_date_range), \
        "panel does not contain every hour between its first and last date"

    # ensure no missing values
    assert 0 == panel_df.isnull().sum().sum(), "panel contains missing values"

    # ensure date column is clean (checked on the boolean mask, without copying the panel)
    assert (panel_df.date.dt.minute==0).all(), "panel contains dates that are not on the hour"

    # ensure asset column is clean
    asset_universe = Helper.findUniqueAssets(asset_universe_dict)
    assert panel_df.asset.isin(asset_universe).all(), "panel contains assets outside the universe"

    # ensure no duplicates
    assert not panel_df.duplicated(subset=['date', 'asset']).any(), \
        "panel contains duplicate date-asset rows"

    # sort by date then asset and reset index
    panel_df = panel_df.sort_values(by=['date', 'asset'], ignore_index=True)

    # sort columns
    cols = list(panel_df.columns.values)
    first_cols = ['date', 'asset', 'r_ex_tp1', 'r_ex_tp24',
        'char_price_t', 'char_volume_t', 'char_trades_t', 'char_mcap_t']
    asset_usage_cols = sorted(col for col in panel_df.columns if 'char_asset_usage' in col)
    static_cols = [col for col in panel_df.columns if 'char_industry' in col]
    static_cols += ['char_pow', 'char_pos']
    static_cols += asset_usage_cols
    static_cols += ['char_ico_price', 'char_ico', 'char_ico_days_since_t']
    cols_to_remove = first_cols + static_cols

    # pull the leading columns out of the alphabetical remainder; a missing leading
    # column raises here
    for col in cols_to_remove:
        cols.remove(col)
    cols.sort()
    cols_before = panel_df.shape[1]
    panel_df = panel_df[cols_to_remove+cols]
    assert panel_df.shape[1] == cols_before, "column reordering changed the number of columns"

    return panel_df


In [None]:
if __name__ == "__main__":
    # input and output locations
    CW_IN_FP            = '../data/clean/cw.pkl'
    ASSET_IN_FP         = '../data/derived/asset_universe_dict.pickle'
    ASSET_OUT_FP        = '../data/clean/asset_universe_dict.pickle'
    CA_PANEL_IN_FP      = '../data/derived/ca_panel.pkl'
    CA_MACRO_IN_FP      = '../data/derived/ca_macro.pkl'
    CM_PANEL_IN_FP      = "../data/derived/cm_panel.pkl"
    CM_MACRO_IN_FP      = '../data/derived/cm_macro.pkl'
    CG_PANEL_IN_FP      = '../data/derived/cg_panel.pkl'
    CMC_PANEL_IN_FP     = '../data/derived/cmc_panel.pkl'
    CMC_MACRO_IN_FP     = '../data/derived/cmc_macro.pkl'
    SAN_PANEL_IN_FP     = "../data/derived/san_panel.pkl"
    SAN_MACRO_IN_FP     = '../data/derived/san_macro.pkl'
    MACRO_IN_FP         = '../data/derived/macro.pkl'
    ASSET_ICO_IN_FP     = '../data/derived/momtaz_ico_asset.pkl'
    MACRO_ICO_IN_FP     = '../data/derived/momtaz_ico_macro.pkl'
    MESSARI_IN_FP       = '../data/derived/messari.pkl'
    CM_RAW_PANEL_IN_FP  = '../data/raw/coinmetrics_panel_hourly.pkl'
    PANEL_OUT_FP        = '../data/clean/panel.pkl'

    # load the asset universe built upstream
    with open(ASSET_IN_FP, "rb") as asset_file:
        asset_universe_dict = pickle.load(asset_file)

    # assemble the raw panel and macro frames from their sources
    panel_df = formPanel(CW_IN_FP, CA_PANEL_IN_FP, CM_PANEL_IN_FP, CMC_PANEL_IN_FP,
                         CG_PANEL_IN_FP, SAN_PANEL_IN_FP, ASSET_ICO_IN_FP, MESSARI_IN_FP)
    macro_df = formMacro(MACRO_IN_FP, CA_MACRO_IN_FP, CM_MACRO_IN_FP,
                         CMC_MACRO_IN_FP, SAN_MACRO_IN_FP, MACRO_ICO_IN_FP)

    # finalize the asset universe and save the updated dictionary
    panel_df, asset_universe_dict = finalizeAssetUniverse(panel_df, asset_universe_dict)
    with open(ASSET_OUT_FP, 'wb') as asset_file:
        pickle.dump(asset_universe_dict, asset_file)

    # panel cleaning steps, applied in this order; formLHSs receives the macro frame
    # as it stands before the macro cleaning steps below have run
    panel_steps = (
        formPrices,
        lambda df: formLHSs(df, macro_df),
        formVolumes,
        formMcap,
        formStaticCharacteristics,
        cleanDistributionCols,
        lambda df: formSupplyCols(df, CM_RAW_PANEL_IN_FP),
        formDevActivity,
        cleanCmcColumns,
        formSocial,
        formAddrCols,
        cleanDepositAndWithdrawCols,
        cleanAge,
        cleanCryptoValuationCols,
        cleanFlowCols,
        cleanCirculationCols,
        cleanBalanceCols,
    )
    for panel_step in panel_steps:
        panel_df = panel_step(panel_df)

    # macro cleaning steps, applied in this order
    macro_steps = (
        formStablecoinDeviation,
        formICO,
        formMcCrakenColumns,
        cleanCustomMacroColumns,
        cleanRemainingMacroColumns,
    )
    for macro_step in macro_steps:
        macro_df = macro_step(macro_df)

    # join the cleaned macro columns onto the cleaned panel and validate the result
    panel_df = combineAndCleanPanelAndMacro(panel_df, macro_df, asset_universe_dict)

    # write the final panel
    panel_df.to_pickle(PANEL_OUT_FP)
