In [16]:
import pandas as pd
import numpy as np

In [17]:
def _fillAssetDateGaps(asset_df: pd.DataFrame, asset, max_gap_days: int = 32) -> pd.DataFrame:
    """Insert rows for the skipped days of a single asset and forward fill them.

    Args:
        asset_df: Rows of the panel belonging to one asset; must hold a
            datetime64 'date' column without missing values.
        asset: The asset id written into the 'asset_cg' column of new rows.
        max_gap_days: Only gaps of more than one and fewer than this many
            whole days are filled; longer gaps are left untouched.

    Returns:
        The asset's rows sorted by date, with the added rows forward filled.
    """
    # Gaps are only meaningful between chronologically adjacent observations.
    asset_df = asset_df.sort_values(by='date', ignore_index=True)
    dates = asset_df['date'].to_numpy()

    # Whole days between consecutive observations (partial days are floored).
    gap_days = (np.diff(dates) // np.timedelta64(1, 'D')).astype(int)

    # A gap of g days needs g-1 new rows, spaced 24 hours apart after the
    # observation that opens the gap.
    new_datetimes = [dates[i] + np.timedelta64(24 * j, 'h')
                     for i, gap in enumerate(gap_days)
                     if 1 < gap < max_gap_days
                     for j in range(1, gap)]

    # Skip the concat entirely when nothing is missing so an empty, untyped
    # frame never takes part in deciding the dtype of the date column.
    if new_datetimes:
        new_rows = pd.DataFrame({'date': pd.to_datetime(new_datetimes), 'asset_cg': asset})
        asset_df = pd.concat((asset_df, new_rows)).sort_values(by='date', ignore_index=True)

    return asset_df.ffill()


def cleanPanel(df: pd.DataFrame, cw_df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw asset-date panel.

    Drops 'alexa_rank', renames the volume and github columns, casts every
    column other than 'date' and 'asset_cg' to float32, restricts the panel to
    2016-07-01 through 2023-01-02, removes rows at or above the price, volume
    and market cap thresholds, zero fills missing values for assets with at
    least one observation in a column, and adds forward-filled rows for gaps
    of fewer than 32 days within each asset. The row order of the input does
    not matter: rows are ordered chronologically within each asset before any
    forward fill or gap detection.

    Args:
        df: Raw panel with at least the columns 'date', 'asset_cg',
            'alexa_rank', 'usd_per_token_cg', 'usd_volume_cg', 'usd_mcap_cg'
            and 'code_deletions_4_weeks'.
        cw_df: Crosswalk with a unique 'asset_cg' column covering every asset
            in the panel.

    Returns:
        The cleaned panel sorted by date then asset with a fresh index. Every
        column other than 'date' carries the '_cg' suffix.

    Raises:
        AssertionError: If a date or asset id is missing, the crosswalk ids
            are not unique or do not cover the panel, a price, volume or
            market cap value is missing, any value is negative, or an
            asset-date pair is duplicated.
    """
    # Fix columns with known issues
    df = df.drop(columns='alexa_rank')
    df = df.rename(columns={'usd_volume_cg': 'usd_volume_24h_cg',
                            'forks': 'github_forks',
                            'stars': 'github_stars',
                            'subscribers': 'github_subscribers',
                            'total_issues': 'github_total_issues',
                            'closed_issues': 'github_closed_issues',
                            'pull_requests_merged': 'github_pull_requests_merged',
                            'pull_requests_contributors': 'github_pull_requests_contributors',
                            'code_additions_4_weeks': 'github_code_additions_4_weeks',
                            'code_deletions_4_weeks': 'github_code_deletions_4_weeks',
                            'commit_count_4_weeks': 'github_commit_count_4_weeks'})
    df['github_code_deletions_4_weeks'] = np.abs(df['github_code_deletions_4_weeks'])

    # Convert date column to datetime format and remove timezone information
    df['date'] = pd.to_datetime(df['date'], utc=True).dt.tz_localize(None)

    # Confirm no missing dates nor asset ids
    assert 0 == df.date.isnull().sum(), "date must not be missing"
    assert 0 == df.asset_cg.isnull().sum(), "asset_cg must not be missing"

    # Order rows chronologically within each asset so the market cap forward
    # fill carries the previous day's value regardless of input row order
    df = df.sort_values(by=['asset_cg', 'date'], ignore_index=True)
    df['usd_mcap_cg'] = df.groupby('asset_cg')['usd_mcap_cg'].ffill()

    # Convert all columns to float32
    cols = [col for col in df.columns if col not in ('date', 'asset_cg')]
    df = df.astype({col: 'float32' for col in cols})

    # Confirm all asset ids are in cw
    assert cw_df.asset_cg.is_unique, "crosswalk asset_cg must be unique"
    assert df.asset_cg.isin(cw_df.asset_cg).all(), "panel has assets missing from the crosswalk"

    # Cut down to relevant dates between July 1, 2016 and January 2, 2023
    df = df[(df['date'] >= '2016-07-01') & (df['date'] <= '2023-01-02')]

    # For price, volume, and mcap columns, ensure no missing obs and below thresholds
    thresholds = {'usd_per_token_cg': 1e9, 'usd_volume_24h_cg': 1e11, 'usd_mcap_cg': 1e13}
    for col, thresh in thresholds.items():
        assert 0 == df[col].isnull().sum(), col + " must not be missing"
        df = df[df[col] < thresh]

    # Detach from the filtered slices so the .loc writes below hit a real frame
    df = df.copy()

    # For each asset_cg, replace missing values with zero if there's at least
    # one non-missing observation in the column
    for col in cols:
        has_nonmissing = df.groupby('asset_cg')[col].transform('count') > 0
        df.loc[has_nonmissing, col] = df.loc[has_nonmissing, col].fillna(0)

    # Ensure no negative numbers
    assert 0 == (df[cols] < 0).sum().sum(), "panel contains negative values"

    # Add any missing days per asset and forward fill them; frames are
    # collected and concatenated once instead of growing a frame in the loop
    frames = [_fillAssetDateGaps(asset_df, asset)
              for asset, asset_df in df.groupby('asset_cg', sort=True)]
    if frames:
        df = pd.concat(frames)

    # Set column order
    lead_cols = ['date', 'asset_cg', 'usd_per_token_cg', 'usd_mcap_cg', 'usd_volume_24h_cg']
    other_cols = sorted(col for col in cols if col not in lead_cols)
    df = df[lead_cols + other_cols]

    # Ensure all columns have cg in name
    df.columns = [col if col == 'date' or col.endswith('_cg') else col + '_cg' for col in df.columns]

    # Ensure no duplicates by date and asset
    assert not df.duplicated(subset=['date', 'asset_cg']).any(), "duplicate date and asset_cg pairs"

    # Sort by date then asset and reset index
    df = df.sort_values(by=['date', 'asset_cg']).reset_index(drop=True)

    return df

In [18]:
def cleanCrosswalk(cw_df: pd.DataFrame) -> pd.DataFrame:
    """ Cleans a crosswalk DataFrame by ensuring that both asset IDs are unique,
    keeping only the two asset ID columns, sorting by the asset_cm name, and
    resetting the index.

    Args:
        cw_df: A pandas DataFrame representing the crosswalk; must contain the
            columns 'asset_cm' and 'asset_san'.

    Returns:
        A pandas DataFrame with the columns 'asset_cm' and 'asset_san' (in that
        order), sorted by 'asset_cm' with a fresh index. Any other column of
        the input is dropped.

    Raises:
        AssertionError: If either of the asset IDs are not unique.
    """
    # Ensure both asset ids are unique; each message names the column it checks
    assert cw_df['asset_san'].is_unique, "asset_san must be unique"
    assert cw_df['asset_cm'].is_unique, "asset_cm must be unique"

    # Keep the id columns in a fixed order, sort by cm asset name and reset index
    return (cw_df[['asset_cm', 'asset_san']]
            .sort_values(by='asset_cm')
            .reset_index(drop=True))

In [19]:
if __name__ == "__main__":
    # File locations, grouped per dataset: raw input first, derived output second
    CW_IN_FP = '../data/raw/san_coinmetrics_cw.pkl'
    CW_OUT_FP = '../data/derived/san_cm_cw.pkl'

    PANEL_IN_FP = "../data/raw/san_panel.pkl"
    PANEL_OUT_FP = "../data/derived/san_panel.pkl"

    MACRO_IN_FP = '../data/raw/san_macro.pkl'
    MACRO_OUT_FP = '../data/derived/san_macro.pkl'

    # Load the raw crosswalk, panel and macro frames.
    # NOTE: unpickling can execute arbitrary code; only read trusted files.
    cw_df = pd.read_pickle(CW_IN_FP)
    df = pd.read_pickle(PANEL_IN_FP)
    macro_df = pd.read_pickle(MACRO_IN_FP)

    # The cleaning and saving steps below are disabled for now
    # (cleanMacro is not defined in the visible part of the notebook).
    # df = cleanPanel(df, cw_df)
    # cw_df = cleanCrosswalk(cw_df)
    # macro_df = cleanMacro(macro_df)

    # cw_df.to_pickle(CW_OUT_FP)
    # df.to_pickle(PANEL_OUT_FP)
    # macro_df.to_pickle(MACRO_OUT_FP)

In [20]:
# Rename the asset id column and attach each asset's category from the crosswalk
df = df.rename(columns={'asset': 'asset_san'})

# Guard keeps the cell idempotent: merging again once category_san is present
# would add suffixed category_san_x / category_san_y columns instead.
if 'category_san' not in df.columns:
    df = df.merge(cw_df[['asset_san', 'category_san']], on='asset_san', how='left', validate='many_to_one')


In [22]:
# Show every column name of the merged panel
df.columns.tolist()

['datetime',
 'asset_san',
 'active_addresses_1h',
 'active_deposits',
 'active_deposits_per_exchange',
 'active_holders_distribution_combined_balance_over_1',
 'active_holders_distribution_combined_balance_over_10',
 'active_holders_distribution_combined_balance_over_100',
 'active_holders_distribution_combined_balance_over_100k',
 'active_holders_distribution_combined_balance_over_10k',
 'active_holders_distribution_combined_balance_over_1M',
 'active_holders_distribution_combined_balance_over_1k',
 'active_holders_distribution_combined_balance_total',
 'active_holders_distribution_over_1',
 'active_holders_distribution_over_10',
 'active_holders_distribution_over_100',
 'active_holders_distribution_over_100k',
 'active_holders_distribution_over_10k',
 'active_holders_distribution_over_1M',
 'active_holders_distribution_over_1k',
 'active_holders_distribution_total',
 'active_withdrawals',
 'active_withdrawals_per_exchange',
 'age_consumed',
 'age_destroyed',
 'all_known_balance',
 '

In [None]:
# Clean datetime column to form proper date column
# TODO: scope out the raw datetime values to ensure the timestamp is handled correctly when it is cleaned

In [None]:
['active_addresses_1h',
 'active_deposits',
 'active_deposits_per_exchange',
 'active_holders_distribution_combined_balance_over_1',
 'active_holders_distribution_combined_balance_over_10',
 'active_holders_distribution_combined_balance_over_100',
 'active_holders_distribution_combined_balance_over_100k',
 'active_holders_distribution_combined_balance_over_10k',
 'active_holders_distribution_combined_balance_over_1M',
 'active_holders_distribution_combined_balance_over_1k',
 'active_holders_distribution_combined_balance_total',
 'active_holders_distribution_over_1',
 'active_holders_distribution_over_10',
 'active_holders_distribution_over_100',
 'active_holders_distribution_over_100k',
 'active_holders_distribution_over_10k',
 'active_holders_distribution_over_1M',
 'active_holders_distribution_over_1k',
 'active_holders_distribution_total',
 'active_withdrawals',
 'active_withdrawals_per_exchange',
 'age_consumed',
 'age_destroyed',
 'all_known_balance',
 'amount_in_exchange_top_holders',
 'amount_in_non_exchange_top_holders',
 'amount_in_top_holders',
 'cex_balance',
 'cexes_to_defi_flow',
 'cexes_to_dex_flow',
 'cexes_to_dex_traders_flow',
 'cexes_to_traders_flow',
 'cexes_to_whale_flow',
 'circulation',
 'circulation_1d',
 'circulation_2y',
 'circulation_30d',
 'circulation_365d',
 'circulation_3y',
 'circulation_5y',
 'circulation_7d',
 'circulation_90d',
 'daily_active_addresses',
 'defi_balance',
 'defi_cex_balance',
 'defi_dex_balance',
 'defi_exchange_balance',
 'defi_to_cexes_flow',
 'defi_to_dex_traders_flow',
 'defi_to_dexes_flow',
 'defi_to_exchanges_flow',
 'defi_to_traders_flow',
 'defi_to_whale_flow',
 'deposit_balance',
 'deposit_transactions',
 'deposit_transactions_per_exchange',
 'dev_activity',
 'dev_activity_contributors_count',
 'dex_balance',
 'dex_cex_balance',
 'dex_to_cexes_flow',
 'dex_trader_balance',
 'dex_traders_cex_balance',
 'dex_traders_defi_balance',
 'dex_traders_dex_balance',
 'dex_traders_exchange_balance',
 'dex_traders_to_cexes_flow',
 'dex_traders_to_defi_flow',
 'dex_traders_to_dexes_flow',
 'dex_traders_to_exchanges_flow',
 'dex_traders_to_whale_flow',
 'dex_traders_whale_balance',
 'dexes_to_defi_flow',
 'dexes_to_dex_traders_flow',
 'dexes_to_traders_flow',
 'dexes_to_whale_flow',
 'dormant_circulation_180d',
 'dormant_circulation_365d',
 'dormant_circulation_90d',
 'exchange_balance',
 'exchange_inflow',
 'exchange_inflow_usd',
 'exchange_outflow',
 'exchange_outflow_usd',
 'exchanges_to_defi_flow',
 'exchanges_to_dex_traders_flow',
 'exchanges_to_genesis_flow',
 'exchanges_to_traders_flow',
 'exchanges_to_whales_flow',
 'github_activity',
 'github_activity_contributors_count',
 'holders_distribution_combined_balance_over_1',
 'holders_distribution_combined_balance_over_10',
 'holders_distribution_combined_balance_over_100',
 'holders_distribution_combined_balance_over_100k',
 'holders_distribution_combined_balance_over_10k',
 'holders_distribution_combined_balance_over_1M',
 'holders_distribution_combined_balance_over_1k',
 'holders_distribution_combined_balance_total',
 'holders_distribution_over_1',
 'holders_distribution_over_10',
 'holders_distribution_over_100',
 'holders_distribution_over_100k',
 'holders_distribution_over_10k',
 'holders_distribution_over_1M',
 'holders_distribution_over_1k',
 'holders_distribution_total',
 'marketcap_usd',
 'mean_age',
 'mean_dollar_invested_age',
 'mean_realized_price_usd',
 'mvrv_long_short_diff_usd',
 'mvrv_usd',
 'network_growth',
 'nvt',
 'nvt_transaction_volume',
 'payments_count',
 'percent_of_total_supply_in_profit',
 'percent_of_total_supply_on_exchanges',
 'price_usd',
 'realized_value_usd',
 'sentiment_balance_reddit',
 'sentiment_balance_total',
 'sentiment_balance_twitter',
 'sentiment_balance_twitter_crypto',
 'sentiment_negative_reddit',
 'sentiment_negative_total',
 'sentiment_negative_twitter',
 'sentiment_negative_twitter_crypto',
 'sentiment_positive_reddit',
 'sentiment_positive_total',
 'sentiment_positive_twitter',
 'sentiment_positive_twitter_crypto',
 'sentiment_volume_consumed_reddit',
 'sentiment_volume_consumed_total',
 'sentiment_volume_consumed_twitter',
 'sentiment_volume_consumed_twitter_crypto',
 'social_dominance_reddit',
 'social_dominance_total',
 'social_dominance_twitter',
 'social_dominance_twitter_crypto',
 'social_volume_reddit',
 'social_volume_total',
 'social_volume_twitter',
 'social_volume_twitter_crypto',
 'stock_to_flow',
 'supply_on_exchanges',
 'supply_outside_exchanges',
 'total_supply',
 'total_supply_in_profit',
 'trader_balance',
 'traders_cex_balance',
 'traders_defi_balance',
 'traders_dex_balance',
 'traders_exchange_balance',
 'traders_to_cexes_flow',
 'traders_to_defi_flow',
 'traders_to_dexes_flow',
 'traders_to_exchanges_flow',
 'traders_to_whale_flow',
 'traders_whale_balance',
 'transaction_volume',
 'transactions_count',
 'unique_social_volume_total_1h',
 'volume_usd',
 'whale_balance',
 'whale_cex_balance',
 'whale_defi_balance',
 'whale_dex_balance',
 'whale_to_cexes_flow',
 'whale_to_defi_flow',
 'whale_to_dex_traders_flow',
 'whale_to_dexes_flow',
 'whale_to_traders_flow',
 'whales_exchange_balance',
 'whales_to_exchanges_flow',
 'withdrawal_balance',
 'withdrawal_transactions',
 'category_san']

In [10]:

# TODO LOOK TO SEE IF SAN PRICE, MCAP, OR VOLUME IS MISSING

# NOTE(review): the column listing shown earlier in the notebook has `datetime` /
# `asset_san`, while this cell expects `date` / `asset_cg` and `*_cg` price, volume
# and mcap columns -- confirm the frame has been renamed before running.

# Convert all columns to float32
cols = list(df.columns.values)
cols.remove('date')
cols.remove('asset_cg')
df = df.astype({col: 'float32' for col in cols})

# Convert date column to datetime format and remove timezone information
df['date'] = pd.to_datetime(df['date'], utc=True).dt.tz_localize(None)

# Confirm no missing dates nor asset ids
assert 0 == df.date.isnull().sum()
assert 0 == df.asset_cg.isnull().sum()

# Confirm all asset ids are in cw
assert cw_df.asset_cg.is_unique
asset_ids = list(cw_df.asset_cg.values)
assert df.shape[0] == df[df.asset_cg.isin(asset_ids)].shape[0]

# Cut down to relevant dates between July 1, 2016 and January 2, 2023
df = df[(df['date'] >= '2016-07-01') & (df['date'] <= '2023-01-02')]

# For price, volume, and mcap columns, ensure no missing obs and below thresholds
thresholds = {'usd_per_token_cg': 1e9, 'usd_volume_24h_cg': 1e11, 'usd_mcap_cg': 1e13}
for col, thresh in thresholds.items():
    assert 0 == df[col].isnull().sum()
    df = df[df[col] < thresh]

# Detach from the filtered slices so the .loc writes below hit a real frame
df = df.copy()

# For each asset_cg, replace missing values with zero if there's at least one non-missing observation in the column
for col in cols:
    has_nonmissing = df.groupby('asset_cg')[col].transform(lambda x: x.notnull().any())
    df.loc[has_nonmissing, col] = df.loc[has_nonmissing, col].fillna(0)

# Ensure no negative numbers
assert 0 == (df[cols]<0).sum().sum()

# Loop over all assets to add any missing days and fill missing price, volume, and mcap data.
# Per-asset frames are collected in a list and concatenated once after the loop.
asset_frames = []
assets = list(np.unique(df.asset_cg.values))
for asset in assets:
    # subset to asset of interest, in chronological order so that gaps are
    # measured between adjacent days regardless of the input row order
    asset_df = df[df.asset_cg==asset].sort_values(by='date', ignore_index=True)

    # determine the date gaps in whole days
    dates = asset_df.date.values
    date_gaps = [np.timedelta64(dates[i]-dates[i-1], 'D').astype(int) for i in range(1, len(dates))]

    # determine new days to add: a gap of g days (1 < g < 32) needs g-1 new
    # rows spaced 24 hours apart after the observation that opens the gap
    new_datetimes = []
    for i, gap in enumerate(date_gaps):
        if 1 < gap < 32:
            for j in range(1, gap):
                new_datetimes.append(dates[i]+np.timedelta64(24*j, 'h'))

    # add the new days to the asset df (skipped when there is nothing to add,
    # so an empty untyped frame never takes part in the concat)
    if new_datetimes:
        new_asset_df = pd.DataFrame(data={'date': new_datetimes})
        new_asset_df['asset_cg'] = asset
        asset_df = pd.concat((asset_df, new_asset_df))
        asset_df = asset_df.sort_values(by='date', ignore_index=True)

    # forward fill the price col
    asset_df['usd_per_token_cg'] = asset_df['usd_per_token_cg'].ffill()

    # replace missing volume and mcap with zeros; the volume mask must look at
    # the volume column itself (price is already forward filled) and write to
    # the same usd_volume_24h_cg column the rest of the cell uses
    asset_df.loc[asset_df.usd_volume_24h_cg.isnull(), 'usd_volume_24h_cg'] = 0
    asset_df.loc[asset_df.usd_mcap_cg.isnull(), 'usd_mcap_cg'] = 0

    # add data to the list of per-asset frames
    asset_frames.append(asset_df)

# combine the per-asset frames; fall back to the (empty) panel when there are no assets
final_df = pd.concat(asset_frames) if asset_frames else df.copy()

# reset df name
del df
df = final_df.copy()

# Set column order
cols.remove('usd_per_token_cg')
cols.remove('usd_volume_24h_cg')
cols.remove('usd_mcap_cg')
cols.sort()
df = df[['date', 'asset_cg', 'usd_per_token_cg', 'usd_mcap_cg', 'usd_volume_24h_cg']+cols]

# Ensure all columns have cg in name
df.columns = [col if col == 'date' or col.endswith('_cg') else col + '_cg' for col in df.columns]

# ensure no duplicates by date and asset
assert not df.duplicated(subset=['date', 'asset_cg']).any()

# Sort by date then asset and reset index
df = df.sort_values(by=['date', 'asset_cg']).reset_index(drop=True)


(4842144, 182)

In [None]:
# TODO CLEAN CROSSWALK
# TODO CLEAN MACRO DATA
# TODO CLEAN PANEL
# TODO also how to handle the missing values for it
# TODO make sure the timestamp is good

In [None]:
# TODO UPDATE OLD CLEANING SCRIPT

In [None]:
# CLEAN THE MACRO DATA
# Builds one 'macro' row per date holding cross-token aggregates (mcap-weighted
# averages, sums) and single-token series, then keeps only those rows.
# NOTE(review): `ts_df` and `scd_tokens` are not defined in the visible part of
# the notebook -- confirm they exist before running this cell.

# Move over some data from the timeseries df to the macro df
temp_df = ts_df[['date', 'san_slug', 'stock_to_flow', 'traders_to_defi_flow', 'traders_defi_balance']]
ts_df = ts_df.drop(['stock_to_flow', 'traders_to_defi_flow', 'traders_defi_balance'], axis=1)
macro_df = macro_df.merge(temp_df,
                          on=['date', 'san_slug'],
                          how='outer',
                          validate='one_to_one')

# Form mcap weighted average variables
temp_df = macro_df[['date', 'san_slug', 'stock_to_flow', 'mvrv_usd_intraday', 'mcd_collat_ratio']]
temp_df = temp_df.dropna(how='all', subset=['stock_to_flow', 'mvrv_usd_intraday', 'mcd_collat_ratio'])
stf_tokens  = list(np.unique(temp_df[~temp_df.stock_to_flow.isnull()].san_slug.values))
mvrv_tokens = list(np.unique(temp_df[~temp_df.mvrv_usd_intraday.isnull()].san_slug.values))
mcd_tokens  = list(np.unique(temp_df[~temp_df.mcd_collat_ratio.isnull()].san_slug.values))
mcap_needed_tokens = list(np.unique(np.array(stf_tokens + mvrv_tokens + mcd_tokens + scd_tokens)))
temp_mcap_df = ts_df[ts_df.san_slug.isin(mcap_needed_tokens)][['date', 'san_slug', "marketcap_usd"]]
temp_df = temp_df.merge(temp_mcap_df,
                        on=['date', 'san_slug'],
                        how='inner',
                        validate='one_to_one')
for col in ['stock_to_flow', 'mvrv_usd_intraday', 'mcd_collat_ratio']:
    mcap_avg_temp_df = temp_df[['date', 'san_slug', col, 'marketcap_usd']]
    # copy so the new columns below are added to an independent frame, not a slice
    mcap_avg_temp_df = mcap_avg_temp_df.dropna().copy()
    # weight of each token = its share of that date's total market cap
    mcap_avg_temp_df['total_mcap'] = mcap_avg_temp_df.groupby('date')['marketcap_usd'].transform('sum')
    mcap_avg_temp_df['mcap_fraction'] = mcap_avg_temp_df.marketcap_usd / mcap_avg_temp_df.total_mcap
    mcap_avg_temp_df['temp'] = mcap_avg_temp_df.mcap_fraction * mcap_avg_temp_df[col]
    mcap_avg_temp_df = mcap_avg_temp_df.groupby('date')[['temp']].sum()
    mcap_avg_temp_df['san_slug'] = 'macro'
    mcap_avg_temp_df = mcap_avg_temp_df.reset_index()
    mcap_avg_temp_df = mcap_avg_temp_df.rename(columns = {'temp': ('santiment_token_mcap_avg_'+col)})
    macro_df = macro_df.merge(mcap_avg_temp_df,
                              on=['date', 'san_slug'],
                              how='outer',
                              validate='one_to_one')

# Form columns of dollar sum across tokens
# NOTE(review): sum() yields 0 for a date whose values are all missing -- confirm
# that 0 rather than NaN is wanted for such dates.
sum_columns = ['cexes_to_dex_flow', 'exchanges_to_defi_flow', 'whale_to_defi_flow', 'dex_traders_to_defi_flow',
               'whale_defi_balance', 'traders_to_defi_flow', 'traders_defi_balance']
for col in sum_columns:
    temp_df = macro_df[['date', col]]
    temp_df = temp_df.groupby('date')[[col]].sum()
    temp_df = temp_df.rename(columns={col: 'santiment_token_sum_'+col})
    temp_df['san_slug'] = 'macro'
    temp_df = temp_df.reset_index()
    macro_df = macro_df.merge(temp_df,
                              on=['date', 'san_slug'],
                              how='outer',
                              validate='one_to_one')

# Form macro variables from column with just bitcoin and/or ethereum
columns = ['defi_total_value_locked_usd', 'nft_trade_volume_usd',
           'nft_trades_count', 'nft_retail_trade_volume_usd',
           'nft_whale_trade_volume_usd', 'nft_whale_trades_count',
           'percent_of_whale_stablecoin_total_supply',
           'average_fees_usd', 'fees_usd', 'eth2_roi', 'median_fees_usd',
           'miners_to_exchanges_flow', 'miners_exchange_balance',
           'traders_to_defi_flow', 'traders_defi_balance',
           'mvrv_usd_intraday', 'stock_to_flow']
for col in columns:
    temp_df = macro_df[['date', 'san_slug', col]]
    temp_df = temp_df.dropna()
    # by default use every token that has data for this column ...
    btc_eth_token = list(np.unique(temp_df.san_slug.values))
    # ... except for these columns, where only the ethereum series is kept
    if col in ['traders_defi_balance', 'traders_to_defi_flow',
               'mvrv_usd_intraday', 'stock_to_flow']:
        btc_eth_token = ['ethereum']
    for token in btc_eth_token:
        # copy so relabelling san_slug does not write into a filtered slice
        temp_token_df = temp_df[temp_df.san_slug == token].copy()
        temp_token_df['san_slug'] = 'macro'
        temp_token_df = temp_token_df.rename(columns = {col: 'santiment_'+token+'_'+col})
        macro_df = macro_df.merge(temp_token_df,
                                  on=['date', 'san_slug'],
                                  how='outer',
                                  validate='one_to_one')

# Drop sum_columns and columns from macro_df
macro_df = macro_df.drop((sum_columns + columns), axis=1)
macro_df = macro_df.drop(['mcd_collat_ratio'], axis=1)

# Keep just the new rows
macro_df = macro_df[macro_df.san_slug == 'macro']
macro_df = macro_df.drop('san_slug', axis=1)

# Clean it up: sort first, then reset, so the index follows the date order
macro_df = macro_df.sort_values(by='date')
macro_df = macro_df.reset_index(drop=True)