In [2]:
import pandas as pd
import numpy as np
import time
import san
import pickle
from typing import Dict, List, Any, Callable
from helper_functions import Helper
import retrying

In [2]:
@retrying.retry(
    wait_exponential_multiplier=1000,
    wait_exponential_max=20000,
    stop_max_attempt_number=3
)
def callSanFunction(san_function: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
    """Call a function from the 'san' library, retrying when it raises.

    The retry policy is set entirely by the decorator above: exponential
    back-off between attempts and at most three attempts in total. This body
    runs once per attempt.

    Args:
        san_function: The function to call from the 'san' library.
        *args: Positional arguments to pass to the function.
        **kwargs: Keyword arguments to pass to the function.

    Returns:
        The return value of the 'san_function' call.

    Raises:
        Exception: Whatever 'san_function' raised; it is re-raised unchanged
            so the retry decorator can decide whether to try again.
    """
    try:
        return san_function(*args, **kwargs)
    except Exception as exc:
        # Only genuine exception classes may be named in an `except` clause.
        # The previous handlers listed `retrying.Retrying`, which is not an
        # exception, so any failure here was replaced by a TypeError that hid
        # the real error. Report the actual failure and let it propagate.
        print(f"An error occurred while calling the 'san_function': {type(exc).__name__}: {exc}")
        raise

In [3]:
def formSantimentAssetUniverse(asset_universe: List[str]) -> pd.DataFrame:
    """Form the crosswalk from Santiment slugs to Coinmetrics asset IDs.

    Pulls the full Santiment project list, keeps projects whose lower-cased
    ticker appears in `asset_universe`, removes a hard-coded list of duplicate
    slugs, and manually adds the 'nano' slug under the Coinmetrics ID 'nano'.

    The input list is not modified.

    Args:
        asset_universe (List[str]): Coinmetrics asset IDs.

    Returns:
        df (pd.DataFrame): crosswalk with columns asset_san, asset_cm and
            category_san, sorted by asset_cm.

    Raises:
        AssertionError: If the API result is not a DataFrame, or if the set of
            mapped Coinmetrics IDs differs from `asset_universe` without 'orca'.
    """
    # Slugs to drop because they duplicate another slug with the same ticker
    slugs_to_drop = ['o-balancer', 'p-balancer', 'farmatrust', 'planet', 'plair', 'invacio', 'nftx-hashmasks-index', 
                    'truebit', 'game', 'bonded-finance', 'o-aave', 'bnb-aave', 'p-aave', 'p-chainlink', 
                    'arb-chainlink', 'bnb-chainlink', 'o-chainlink', 'mir-coin', 'p-matic-network', 'bnb-ankr',
                    'bnb-cardano', 'o-perpetual-protocol', 'bnb-mines-of-dalarnia', 'p-quickswap', 
                    'bnb-synthetix-network-token', 'o-synthetix-network-token', 'arb-curve', 'p-uniswap',
                    'bnb-uniswap', 'arb-stargate-finance', 'arb-sushi', 'bnb-sushi', 'gomining-token', 
                    'bnb-swipe', 'bnb-1inch', 'bnb-chromia', 'bnb-myneighboralice', 'bnb-alpha-finance-lab', 
                    'bnb-avalanche', 'bnb-axie-infinity']

    # TODO FIND CRO/CRONOS, GMT/STEPN, ORCA

    # Pull the list of all Santiment projects
    result = callSanFunction(san.get, "projects/all")
    assert isinstance(result, pd.DataFrame), \
        f"expected a DataFrame from 'projects/all', got {type(result).__name__}"
    projects_df = result.copy()

    # Manually add the asset that ticker matching misses. `.assign` returns a
    # new frame, so nothing is written onto a slice of `projects_df`.
    manual_df = (
        projects_df.loc[projects_df['slug'].isin(['nano']), ['marketSegment', 'slug']]
        .assign(symbol_lower='nano')
    )

    # Subset down to ticker-matched assets and remove the duplicated slugs
    projects_df['symbol_lower'] = projects_df['ticker'].str.lower()
    is_matched = projects_df['symbol_lower'].isin(asset_universe)
    is_duplicate = projects_df['slug'].isin(slugs_to_drop)
    matched_df = projects_df.loc[is_matched & ~is_duplicate, ['marketSegment', 'slug', 'symbol_lower']]

    # Append the manual asset(s) and rename to the crosswalk column names
    df = pd.concat([manual_df, matched_df])
    df = df.rename(columns={'marketSegment': 'category_san', 'slug': 'asset_san', 'symbol_lower': 'asset_cm'})

    # Confirm every requested asset is mapped, knowing we are missing orca.
    # Work on a filtered copy so the caller's list is left untouched and a
    # universe without 'orca' does not raise.
    expected_assets = {asset for asset in asset_universe if asset != 'orca'}
    mapped_assets = set(df['asset_cm'])
    assert mapped_assets == expected_assets, (
        f"crosswalk mismatch; unmapped: {sorted(expected_assets - mapped_assets)}, "
        f"unexpected: {sorted(mapped_assets - expected_assets)}"
    )

    return df[['asset_san', 'asset_cm', 'category_san']].sort_values(by='asset_cm', ignore_index=True)

In [4]:
def formAssetMetricsDicts(san_slug_universe: List[str]) -> tuple:
    """Form dictionaries of asset slugs and the metrics available / to pull.

    For every slug the list of available metrics is requested from Santiment.
    A slug whose request does not return a list is reported and skipped; the
    remaining slugs are still processed.

    Args:
        san_slug_universe (List[str]): list of strings of Santiment slugs in study universe.

    Returns:
        (tuple): `(assets_metrics_to_pull_dict, assets_metrics_dict)`. The
            second maps each slug to every metric reported for it; the first
            keeps only the metrics named in the include list below, in the
            order they were reported.
    """
    # define metrics of interest at asset level
    asset_metrics_to_include = ['active_addresses_1h', 'active_deposits', 'active_deposits_per_exchange',
        'active_holders_distribution_combined_balance_over_1', 'active_holders_distribution_combined_balance_over_10',
        'active_holders_distribution_combined_balance_over_100', 'active_holders_distribution_combined_balance_over_100k',
        'active_holders_distribution_combined_balance_over_10k', 'active_holders_distribution_combined_balance_over_1M',
        'active_holders_distribution_combined_balance_over_1k', 'active_holders_distribution_combined_balance_total',
        'active_holders_distribution_over_1', 'active_holders_distribution_over_10', 'active_holders_distribution_over_100',
        'active_holders_distribution_over_100k', 'active_holders_distribution_over_10k', 'active_holders_distribution_over_1M',
        'active_holders_distribution_over_1k', 'active_holders_distribution_total', 'active_withdrawals',
        'active_withdrawals_per_exchange', 'age_consumed', 'age_destroyed', 'all_known_balance',
        'amount_in_exchange_top_holders', 'amount_in_non_exchange_top_holders', 'amount_in_top_holders',
        'cex_balance', 'cexes_to_defi_flow', 'cexes_to_dex_flow', 'cexes_to_dex_traders_flow', 'cexes_to_traders_flow',
        'cexes_to_whale_flow', 'circulation', 'circulation_1d', 'circulation_2y', 'circulation_30d', 'circulation_365d',
        'circulation_3y', 'circulation_5y', 'circulation_7d', 'circulation_90d', 
        'daily_active_addresses', 'defi_balance', 'defi_cex_balance', 'defi_dex_balance', 'defi_exchange_balance', 
        'defi_to_cexes_flow', 'defi_to_dex_traders_flow', 'defi_to_dexes_flow', 'defi_to_exchanges_flow', 'defi_to_traders_flow',
        'defi_to_whale_flow', 'deposit_balance', 'deposit_transactions', 'deposit_transactions_per_exchange', 'dev_activity',
        'dev_activity_contributors_count', 'dex_balance', 'dex_cex_balance', 'dex_to_cexes_flow', 'dex_trader_balance',
        'dex_traders_cex_balance', 'dex_traders_defi_balance', 'dex_traders_dex_balance',
        'dex_traders_exchange_balance', 'dex_traders_to_cexes_flow', 'dex_traders_to_defi_flow', 'dex_traders_to_dexes_flow', 
        'dex_traders_to_exchanges_flow', 'dex_traders_to_whale_flow', 'dex_traders_whale_balance', 'dexes_to_defi_flow',
        'dexes_to_dex_traders_flow', 'dexes_to_traders_flow', 'dexes_to_whale_flow', 'dormant_circulation_180d', 
        'dormant_circulation_365d', 'dormant_circulation_90d', 'exchange_balance', 'exchange_inflow', 'exchange_inflow_usd',
        'exchange_outflow', 'exchange_outflow_usd', 'exchanges_to_defi_flow', 'exchanges_to_dex_traders_flow',
        'exchanges_to_genesis_flow', 'exchanges_to_traders_flow', 'exchanges_to_whales_flow',
        'github_activity', 'github_activity_contributors_count',
        'holders_distribution_combined_balance_over_1', 'holders_distribution_combined_balance_over_10',
        'holders_distribution_combined_balance_over_100', 'holders_distribution_combined_balance_over_100k',
        'holders_distribution_combined_balance_over_10k', 'holders_distribution_combined_balance_over_1M',
        'holders_distribution_combined_balance_over_1k', 'holders_distribution_combined_balance_total',
        'holders_distribution_over_1', 'holders_distribution_over_10', 'holders_distribution_over_100',
        'holders_distribution_over_100k', 'holders_distribution_over_10k', 'holders_distribution_over_1M', 
        'holders_distribution_over_1k', 'holders_distribution_total',
        'marketcap_usd', 'mean_age', 'mean_dollar_invested_age', 'mean_realized_price_usd',
        'mvrv_long_short_diff_usd', 'mvrv_usd',
        'network_growth', 'nvt', 'nvt_transaction_volume', 'payments_count', 'percent_of_total_supply_in_profit',
        'percent_of_total_supply_on_exchanges', 'price_usd', 'realized_value_usd', 'sentiment_balance_reddit', 'sentiment_balance_total',
        'sentiment_balance_twitter', 'sentiment_balance_twitter_crypto', 'sentiment_negative_reddit', 'sentiment_negative_total',
        'sentiment_negative_twitter', 'sentiment_negative_twitter_crypto', 'sentiment_positive_reddit', 'sentiment_positive_total',
        'sentiment_positive_twitter', 'sentiment_positive_twitter_crypto', 'sentiment_volume_consumed_reddit',
        'sentiment_volume_consumed_total','sentiment_volume_consumed_twitter', 'sentiment_volume_consumed_twitter_crypto', 
        'social_dominance_reddit', 'social_dominance_total', 'social_dominance_twitter', 'social_dominance_twitter_crypto',
        'social_volume_reddit', 'social_volume_total', 'social_volume_twitter', 'social_volume_twitter_crypto', 'stock_to_flow',
        'supply_on_exchanges', 'supply_outside_exchanges', 'total_supply', 'total_supply_in_profit', 'trader_balance',
        'traders_cex_balance', 'traders_defi_balance', 'traders_dex_balance', 'traders_exchange_balance', 'traders_to_cexes_flow',
        'traders_to_defi_flow', 'traders_to_dexes_flow', 'traders_to_exchanges_flow', 'traders_to_whale_flow',
        'traders_whale_balance', 'transaction_volume', 'transactions_count', 'unique_social_volume_total_1h',
        'volume_usd', 'whale_balance', 'whale_cex_balance', 'whale_defi_balance', 'whale_dex_balance', 'whale_to_cexes_flow',
        'whale_to_defi_flow', 'whale_to_dex_traders_flow', 'whale_to_dexes_flow', 'whale_to_traders_flow', 'whales_exchange_balance',
        'whales_to_exchanges_flow', 'withdrawal_balance', 'withdrawal_transactions']

    # Set copy of the include list so each membership test is a hash lookup
    metrics_to_include = set(asset_metrics_to_include)

    # specify san endpoint
    san_function = san.available_metrics_for_slug

    # master dictionary of every metric reported for each slug
    assets_metrics_dict = {}

    # loop over all assets to build list of all available metrics
    n_assets = len(san_slug_universe)
    for i, san_slug in enumerate(san_slug_universe):
        # update progress
        progress = (i + 1) / n_assets * 100
        print(f"Processing asset number #{i+1} ({progress:.2f}%): {san_slug}")

        # make the call
        result = callSanFunction(san_function, san_slug)
        if isinstance(result, list):
            assets_metrics_dict[san_slug] = result
        else:
            # Skip only this slug. Stopping the loop here would silently drop
            # every slug that comes after it in the universe.
            print(f"Did not obtain data for asset {san_slug}")

        # space out the calls
        time.sleep(0.1)

    # form dictionary of assets and metrics to pull
    assets_metrics_to_pull_dict = {
        slug: [metric for metric in available if metric in metrics_to_include]
        for slug, available in assets_metrics_dict.items()
    }

    # return both what to pull and a master list
    return assets_metrics_to_pull_dict, assets_metrics_dict

In [5]:
def pullAssetMetrics(study_start: str, study_end: str, 
    assets_metrics_dict_to_pull: Dict[str, List[str]], san_df: pd.DataFrame, daily_panel_df: pd.DataFrame) -> pd.DataFrame:
    """
    Pulls asset metrics from the Sanbase API for a specified date range and set of assets.

    For each asset the requested window is the study window clipped to the
    first and last 'date' that asset has in `daily_panel_df`. Each metric is
    requested hourly, falling back to daily when the hourly pull is empty.
    Metrics with no data are reported and skipped.

    Args:
        study_start (str): The start date of the study in string format ('YYYY-MM-DD').
        study_end (str): The end date of the study in string format ('YYYY-MM-DD').
        assets_metrics_dict_to_pull (Dict[str, List[str]]): A dictionary mapping asset names to lists of metric names to pull for each asset.
        san_df (pd.DataFrame): A dataframe containing mappings between asset names used by Sanbase and Coinmetrics
            (columns 'asset_san' and 'asset_cm' are read).
        daily_panel_df (pd.DataFrame): A dataframe containing daily panel data for all assets
            (columns 'asset' and 'date' are read).

    Returns:
        pd.DataFrame: One row per asset and datetime with a column per pulled
            metric. When nothing was pulled, an empty frame with the columns
            'datetime' and 'asset'.

    """
    key_cols = ['datetime', 'asset']

    # Convert strings to datetimes
    study_start_dt = np.datetime64(study_start)
    study_end_dt = np.datetime64(study_end)

    # Form list of assets
    assets = list(assets_metrics_dict_to_pull.keys())

    # Per-asset results, concatenated once at the end
    asset_frames = []

    # Loop over assets
    for i, asset in enumerate(assets):
        # Obtain CoinMetrics name for obtaining dates
        asset_cm = san_df.loc[san_df['asset_san'] == asset, 'asset_cm'].values[0]

        # Monitor progress
        print(f"Processing asset #{i+1} ({(i+1)/len(assets)*100:.2f}%): {asset}")

        # Obtain metrics for this asset
        metrics = assets_metrics_dict_to_pull[asset]

        # Clip the study window to the dates this asset has in the daily panel
        asset_dates = daily_panel_df.loc[daily_panel_df['asset'] == asset_cm, 'date'].values
        asset_min_date = np.min(asset_dates)
        asset_max_date = np.max(asset_dates)
        start_dt = study_start_dt if study_start_dt >= asset_min_date else asset_min_date
        end_dt = study_end_dt if study_end_dt <= asset_max_date else asset_max_date
        start_date = np.datetime_as_string(start_dt, unit='D')
        end_date = np.datetime_as_string(end_dt, unit='D')

        # Loop over metrics to pull
        asset_df = None
        for metric in metrics:
            # Monitor
            print(metric)

            temp_df = _pullAssetMetricFrame(metric, asset, start_date, end_date)

            # If still no results then report and carry on
            if temp_df is None:
                print(f"No data for {asset} and {metric}.")
                continue

            # Seed the accumulator with an empty slice of the first frame so
            # the merge keys start with the same dtypes as the data.
            if asset_df is None:
                asset_df = temp_df.loc[:, key_cols].iloc[:0]

            # Merge onto this asset's frame
            asset_df = asset_df.merge(temp_df, on=key_cols, how='outer', validate='one_to_one')

            # Space out the calls
            time.sleep(0.1)

        # Keep the asset's results when any metric returned data
        if asset_df is not None:
            asset_frames.append(asset_df)

    if not asset_frames:
        return pd.DataFrame(data={'datetime': [], 'asset': []})
    return pd.concat(asset_frames)


def _pullAssetMetricFrame(metric: str, asset: str, start_date: str, end_date: str):
    """Pull one metric for one slug, hourly first and daily as a fallback.

    Returns a frame with the columns 'datetime', 'asset' and `metric`, or
    None when neither interval returned a non-empty DataFrame.
    """
    san_kwargs = {'slug': asset,
                  'from_date': start_date,
                  'to_date': end_date,
                  'interval': '1h'}
    for interval in ('1h', '1d'):
        san_kwargs['interval'] = interval
        temp_df = callSanFunction(san.get, metric, **san_kwargs)
        # `and` short-circuits, so len() is never taken of a non-DataFrame
        if isinstance(temp_df, pd.DataFrame) and len(temp_df) > 0:
            break
    else:
        return None

    # quick cleaning
    temp_df = temp_df.rename(columns={'value': metric})
    temp_df['asset'] = asset
    temp_df = temp_df.reset_index()
    return temp_df[['datetime', 'asset', metric]]

In [6]:
def pullMacroMetrics(study_start: str, study_end: str, san_slug_universe: List[str]) -> pd.DataFrame:
    """ Pull the panel data for specified macro metrics for the relevant study time period.

    Metadata is requested for each metric first. Metrics reported as not
    accessible are dropped. Metrics whose name starts with 'usdt_' are only
    pulled for slugs that are also in `san_slug_universe`; all other metrics
    are pulled for every slug the metadata lists. A metric with no slugs is
    pulled once without a slug and stored under the asset label 'all'.

    Args:
        study_start (str): The start date of the study in string format ('YYYY-MM-DD').
        study_end (str): The end date of the study in string format ('YYYY-MM-DD').
        san_slug_universe (List[str]): A list of strings of Santiment unique slug IDs in the study universe.

    Returns:
        (pd.DataFrame): A panel dataframe containing the pulled macro metrics for all assets and datetimes.
            When nothing was pulled, an empty frame with the columns 'datetime' and 'asset'.
    """
    key_cols = ['datetime', 'asset']

    # Macro metrics of interest
    macro_metrics = ['aave_v2_stable_borrow_apy', 'aave_v2_supply_apy', 'aave_v2_total_borrowed_usd',
        'aave_v2_total_deposits_usd', 'aave_v2_total_liquidations_usd', 'aave_v2_total_new_debt_usd',
        'aave_v2_total_supplied_usd', 'aave_v2_variable_borrow_apy', 'average_fees_usd',
        'compound_total_borrowed_usd', 'compound_total_deposits_usd', 'compound_total_liquidations_usd',
        'compound_total_new_debt_usd', 'compound_total_supplied_usd', 'dai_created', 'dai_repaid',
        'eth2_roi', 'eth2_stakers_count', 'fees_usd',
        'makerdao_total_borrowed_usd', 'makerdao_total_deposits_usd', 'makerdao_total_supplied_usd',
        'mcd_collat_ratio', 'mcd_liquidation', 'mcd_locked_token', 'median_fees_usd',
        'mvrv_usd_intraday', 'nft_retail_trade_volume_usd', 'nft_retail_trades_count',
        'nft_trade_volume_usd', 'nft_trades_count', 'nft_whale_trade_volume_usd',
        'nft_whale_trades_count', 'total_assets_issued', 'total_trade_volume_by_dex',
        'uniswap_total_claims_amount', 'uniswap_total_lp_claims_amount', 'uniswap_total_user_claims_amount',
        'usdt_binance_funding_rate', 'usdt_binance_open_interest', 'usdt_binance_open_value']

    # Obtain assets for each accessible metric. The dictionary is built up
    # rather than pruned, because removing keys from a dict while iterating
    # over it raises RuntimeError.
    universe = set(san_slug_universe)
    macro_metrics_assets_dict = {}
    for metric in macro_metrics:
        metric_metadata_dict = callSanFunction(san.metadata, metric, arr=["availableSlugs", "isAccessible"])
        if not metric_metadata_dict['isAccessible']:
            # Inaccessible metrics are left out entirely, 'usdt_' ones included
            continue
        slugs = list(metric_metadata_dict['availableSlugs'] or [])
        if metric[:5] == 'usdt_':
            # Restrict to the study universe, de-duplicated, in reported order
            slugs = [slug for slug in dict.fromkeys(slugs) if slug in universe]
        macro_metrics_assets_dict[metric] = slugs

    # Form list of metrics
    metrics = list(macro_metrics_assets_dict.keys())

    # Accumulator for the results; seeded from the first frame that has data
    df = None

    # Loop over macro metrics
    for i, metric in enumerate(metrics):
        # Monitor progress
        print(f"Processing metric #{i+1} ({(i+1)/len(metrics)*100:.2f}%): {metric}")

        # Obtain assets for this metric
        assets = macro_metrics_assets_dict[metric]

        if len(assets) == 0:
            # No slugs: pull without a slug name and label the rows 'all'
            metric_df = _pullMacroMetricFrame(metric, None, study_start, study_end)
            if metric_df is None:
                print(f"No data for {metric}.")
                continue
        else:
            # Loop over assets to pull
            frames = []
            for asset in assets:
                temp_df = _pullMacroMetricFrame(metric, asset, study_start, study_end)

                # If still no results then report and carry on
                if temp_df is None:
                    print(f"No data for {asset} and {metric}.")
                    continue

                frames.append(temp_df)

                # Space out the calls
                time.sleep(0.1)

            if not frames:
                continue
            metric_df = pd.concat(frames)

        # Seed the accumulator with an empty slice of the first frame so the
        # merge keys start with the same dtypes as the data.
        if df is None:
            df = metric_df.loc[:, key_cols].iloc[:0]

        # Merge together metrics for given assets-datetimes. This now also
        # runs for slug-less metrics, which were previously pulled and then
        # dropped before reaching the merge.
        df = df.merge(metric_df, on=key_cols, how='outer', validate='one_to_one')

    if df is None:
        return pd.DataFrame(data={'datetime': [], 'asset': []})
    return df


def _pullMacroMetricFrame(metric: str, asset, study_start: str, study_end: str):
    """Pull one macro metric, hourly first and daily as a fallback.

    `asset` is a slug, or None to request the metric without a slug (rows are
    then labelled 'all'). Returns a frame with the columns 'datetime', 'asset'
    and `metric`, or None when neither interval returned a non-empty DataFrame.
    """
    san_kwargs = {'from_date': study_start,
                  'to_date': study_end,
                  'interval': '1h'}
    if asset is not None:
        san_kwargs['slug'] = asset

    for interval in ('1h', '1d'):
        san_kwargs['interval'] = interval
        temp_df = callSanFunction(san.get, metric, **san_kwargs)
        # `and` short-circuits, so len() is never taken of a non-DataFrame
        if isinstance(temp_df, pd.DataFrame) and len(temp_df) > 0:
            break
    else:
        return None

    # quick cleaning
    temp_df = temp_df.rename(columns={'value': metric})
    temp_df['asset'] = 'all' if asset is None else asset
    temp_df = temp_df.reset_index()
    return temp_df[['datetime', 'asset', metric]]

In [None]:
if __name__ == "__main__":
    # --- Configuration -----------------------------------------------------
    # Inputs
    CW_IN_FP = '../data/derived/cm_to_coinapi_cw.pkl'
    ASSET_IN_FP = '../data/derived/asset_universe_dict.pickle'
    PANEL_DAILY_IN_FP = '../data/derived/basic_panel.pkl'
    API_FP = '../../admin/santiment.txt'
    # Study window
    STUDY_START = '2016-07-01'
    STUDY_END = '2023-01-02'
    # Outputs
    PANEL_OUT_FP = "../data/raw/san_panel.pkl"
    CW_OUT_FP = '../data/raw/san_coinmetrics_cw.pkl'
    MACRO_OUT_FP = '../data/raw/san_macro.pkl'

    # --- Load inputs -------------------------------------------------------
    # NOTE(review): these are pickle files; only load them from a trusted source.
    cw_df = pd.read_pickle(CW_IN_FP)
    with open(ASSET_IN_FP, "rb") as f:
        asset_universe_dict = pickle.load(f)
    asset_universe = Helper.findUniqueAssets(asset_universe_dict)
    daily_panel_df = pd.read_pickle(PANEL_DAILY_IN_FP)

    # --- Authenticate ------------------------------------------------------
    # The API key is the first line of the key file, stripped of whitespace
    with open(API_FP) as f:
        API_KEY = f.readlines()[0].strip()
    san.ApiConfig.api_key = API_KEY

    # Report the remaining API call budget before starting
    print(san.api_calls_remaining())

    # --- Crosswalk ---------------------------------------------------------
    san_df = formSantimentAssetUniverse(asset_universe)
    san_df.to_pickle(CW_OUT_FP)
    san_slug_universe = list(san_df['asset_san'].values)

    # --- Metrics available per asset, and the subset to pull ---------------
    assets_metrics_dict_to_pull, assets_metrics_dict = formAssetMetricsDicts(san_slug_universe)

    # --- Macro metrics -----------------------------------------------------
    macro_df = pullMacroMetrics(STUDY_START, STUDY_END, san_slug_universe)
    macro_df.to_pickle(MACRO_OUT_FP)

    # --- Asset-level metrics -----------------------------------------------
    panel_df = pullAssetMetrics(
        STUDY_START,
        STUDY_END,
        assets_metrics_dict_to_pull,
        san_df,
        daily_panel_df,
    )
    panel_df.to_pickle(PANEL_OUT_FP)