In [1]:
import pandas as pd
import numpy as np
import time
import pickle
from helper_functions import Helper
from typing import Dict, List


In [2]:
def formCmcAssetUniverse(base_url: str, base_headers: Dict[str, str], asset_universe: List[str]) -> pd.DataFrame:
    """ Form universe of CMC assets mapped to coinmetrics asset ids.

    Pages through the CMC `/v1/cryptocurrency/map` endpoint, keeps the rows whose
    lower-cased symbol is in `asset_universe`, drops a hand-curated list of slugs
    that share a symbol with the intended asset, manually appends `nano`, and
    verifies that the result is a one-to-one mapping onto `asset_universe`.

    Args:
        base_url (str): Base URL for the API.
        base_headers (Dict[str, str]): Base headers for the API.
        asset_universe (List[str]): list of strings of coinmetrics asset IDs.

    Returns:
        df (pd.DataFrame): crosswalk between asset_cmc IDs and asset_cm IDs with
            columns `id`, `asset_cmc`, and `asset_cm`, sorted by `asset_cm`.

    Raises:
        AssertionError: if any asset in `asset_universe` is unmatched, if an
            unexpected asset is present, or if an `asset_cm` maps to more than
            one CMC asset.
    """
    # Set slugs to drop that are duplicates
    slugs_to_drop = ['ethercoin', 'unitecoin', 'universe', 'uni-coin', 'unicorn-token', 'bantam',
        'stealthcash', 'stox', 'staykx', 'icon-futures', 'global-tour-coin', 'game', 'gastrocoin',
        'midnight', 'meta-dance', 'polybit', 'farmpoly', 'poly-maximus', 'blazecoin', 'cartercoin',
        'credit-tag-chain', 'culture-ticket-chain', 'cybertronchain', 'cryptocoin', 'calltocombat',
        'solcoin', 'sola-token', 'sol-rune---rune-game', 'wrapped-solana', 'plair', 'playchip',
        'planet', 'atomic-coin', 'burstocean', 'farmatrust', 'freetip', 'cronos-coin', 'arcoin',
        'hinto', 'compound-coin', 'global-rental-token', 'golden-ratio-token', 'aca-token',
        'retawars-goldrose-token', 'flux', 'flux-protocol', 'crowdvilla-ownership', 'acash-coin',
        'six-dragons-mft', 'metaface', 'my-farm', 'anoncoin', 'aragon-china-token', 'synereo',
        'ach', 'supercoin', 'superswap', 'superciety', 'bond', 'bonded-finance', 'bondly-',
        'truebit', 'mir-coin', 'mir-token', 'oxy-fi', 'oxycoin', 'rare', 'unique-one',
        'rare-finance', 'thorchain-erc20', 'rune', 'rune-farm', 'clevercoin', 'nftx-hashmasks-index',
        'investoland', 'apecoin', 'apestrong-finance', 'ape-finance', 'wall-street-apes', 'apelab',
        'just-ape', 'miss-ape-yacht-club', 'apemove', 'mercury-protocol', 'gambit-finance',
        'gomining-token', 'greekmythology', 'qiswap', 'qidao', 'impact', 'impermax', 'fitmin-finance',
        'bat-finance', 'orca-alliance', 'orcadao', 'operand', 'onplanet', 'galatasaray-fan-token',
        'gallant', 'polyalpha-finance', 'alpha', 'aavegotchi-alpha', 'subgame', 'rinnegan', 'robinos',
        'covicoin', 'covid-cutter', 'coinviewcap', 'genesis-mana', 'uniswap-finance', 'cardanomics',
        'kart-racing-league', 'pyroblock', 'atlantis', 'atlas-cloud', 'the-atlas-coin',
        'atlas-fc-fan-token', 'snt', 'shib-ninja-token', 'share-nft-token', 'flower-solana', 'cake',
        'agrofarm', 'anontoken', 'scarpacoin', 'silver-coin', 'shibchain', 'gas-dao', 'rose',
        'metaplanet', 'meta-plane', 'rari-games', 'icecream-finance', 'bobatama',
        'virtual-reality-asset', 'playground-waves-floor-index', 'ecowatt', 'stargod', 'jumpn',
        'quickswap-new', 'kaisen-inu', 'rising-sun', 'smartlands-network-new', 'omega-finance',
        'mechaverse', 'musicfi', 'listenify', 'onlymemes', 'avatly', 'avalon', 't', 'twitfi',
        'synergy-diamonds']

    # obtain cmc asset ids; collect every page first and concatenate once
    # rather than growing a frame from an empty DataFrame inside the loop
    endpoint = '/v1/cryptocurrency/map'
    url = f"{base_url}{endpoint}"
    pages = []
    for start in [1, 5001, 10001, 15001, 20001]:
        params = {'listing_status': 'active,inactive,untracked', 'start': start, 'limit': 5000}
        response_json = Helper.makeApiCall(url, headers=base_headers, params=params)
        pages.append(pd.DataFrame(response_json['data']))
    df = pd.concat(pages, ignore_index=True)

    # subset down to matched assets
    df['symbol_lower'] = df.symbol.str.lower()
    df = df[df.symbol_lower.isin(asset_universe)]

    # remove duplicated assets
    df = df[~df.slug.isin(slugs_to_drop)]

    # manually add one missing asset
    nano_row = pd.DataFrame(data={'id': [1567], 'slug': ['nano'], 'symbol_lower': ['nano']})
    df = pd.concat([df, nano_row])

    # rename
    df = df.rename(columns={'slug': 'asset_cmc', 'symbol_lower': 'asset_cm'})

    # confirm full one to one mapping: same set of assets on both sides AND no
    # asset_cm appearing on more than one row (np.unique alone would hide that)
    expected = set(asset_universe)
    mapped = set(df.asset_cm.values)
    missing = expected - mapped
    unexpected = mapped - expected
    duplicated = sorted(df.asset_cm[df.asset_cm.duplicated()].unique())
    assert not (missing or unexpected or duplicated), (
        f"CMC crosswalk is not one-to-one: missing={sorted(missing)}, "
        f"unexpected={sorted(unexpected)}, duplicated={duplicated}")

    # return
    return df[['id', 'asset_cmc', 'asset_cm']].sort_values(by='asset_cm', ignore_index=True)

In [3]:
def pullAssetHistoricalMetadata(base_url: str, base_headers: Dict[str, str], 
    cmc_slug_universe: List[str], study_start: str, study_end: str) -> pd.DataFrame:
    """ Pull metadata on cmc asset universe.

    For every day from the day before `study_start` up to (but excluding) the
    last generated date, the CMC `/v1/cryptocurrency/listings/historical`
    endpoint is paged through and the rows whose slug is in `cmc_slug_universe`
    are kept. Each row is stamped with the *following* day's date.

    Args:
        base_url (str): The base URL for the CMC API.
        base_headers (Dict[str, str]): A dictionary containing the basic headers for the CMC API call.
        cmc_slug_universe (List[str]): A list of cmc slugs that are in our asset universe.
        study_start (str): string time for the start of the study window in format 'YYYY-MM-DD'.
        study_end (str): string time for the end of the study window in format 'YYYY-MM-DD'.

    Returns:
        asset_covars_df (pd.DataFrame): panel data with asset covariates.
    """
    # extract all dates and add one on the front as we will lag dates by one day;
    # the extra day is derived from study_start instead of being hard-coded
    # (assumes generateDailyDateList returns a list of 'YYYY-MM-DD' strings)
    all_dates = Helper.generateDailyDateList(study_start, study_end)
    day_before_start = np.datetime_as_string(
        np.datetime64(study_start, 'D') - np.timedelta64(1, 'D'), unit='D')
    all_dates = [day_before_start] + all_dates

    # output column -> key in each API result
    field_map = (('slug', 'slug'),
                 ('rank_cmc', 'cmc_rank'),
                 ('num_market_pairs_cmc', 'num_market_pairs'),
                 ('circulating_supply', 'circulating_supply'),
                 ('total_supply', 'total_supply'),
                 ('max_supply', 'max_supply'),
                 ('tags', 'tags'),
                 ('platform', 'platform'),
                 ('tvl_ratio', 'tvl_ratio'))

    # initialize metadata for the results
    results_dict = {'date': []}
    for column, _ in field_map:
        results_dict[column] = []

    # form url
    endpoint = '/v1/cryptocurrency/listings/historical'
    url = f"{base_url}{endpoint}"

    # form params
    params = {'convert': 'USD',
              'limit': 5000,
              'aux': 'platform,tags,circulating_supply,total_supply,max_supply,cmc_rank,num_market_pairs'}

    # set gives O(1) membership tests inside the per-result loop
    slug_set = set(cmc_slug_universe)

    # loop over all dates except the last date we are lagging days by one
    num_pulls = len(all_dates) - 1
    for i, (current_date, label_date) in enumerate(zip(all_dates[:-1], all_dates[1:])):
        # update date to pull
        params['date'] = current_date

        # monitor progress
        print(f"Processing date number #{i+1} ({(i+1)/num_pulls*100:.2f}%): {current_date}")

        # make the call for all assets and append
        for start in [1, 5001, 10001, 15001, 20001]:
            params['start'] = start
            response_json = Helper.makeApiCall(url, headers=base_headers, params=params)
            try:
                for result in response_json['data']:
                    if result['slug'] not in slug_set:
                        continue
                    # read every field before appending anything so a missing
                    # key can never leave the result lists with unequal lengths
                    row = {column: result[key] for column, key in field_map}
                    # note: this info is updated at end of this utc so midnight of next
                    results_dict['date'].append(label_date)
                    for column, value in row.items():
                        results_dict[column].append(value)
            except (KeyError, TypeError):
                # malformed / empty response; keep going but let Ctrl-C through
                print(f"No data for starting at {start} for date {current_date}")

            # space out calls
            time.sleep(0.5)

    # build dataframe to return
    return pd.DataFrame(results_dict)

In [4]:
def pullAssetPriceVolumeMcap(base_url: str, base_headers: Dict[str, str], 
                             study_start: str, study_end: str, 
                             cmc_df: pd.DataFrame, daily_panel_df: pd.DataFrame) -> pd.DataFrame:
    """ Pull price, volume, and mcap data at hourly freq for cmc asset universe in cmc_df.

    For each asset the requested window is the study window clipped to the
    first and last date on which the asset appears in `daily_panel_df`; the
    window is split with `Helper.generateYearlyCalendarYearDateList` and one
    call is made per consecutive pair of dates.

    Args:
        base_url (str): The base URL for the CMC API.
        base_headers (Dict[str, str]): A dictionary containing the basic headers for the CMC API call.
        study_start (str): string time for the start of the study window in format 'YYYY-MM-DD'.
        study_end (str): string time for the end of the study window in format 'YYYY-MM-DD'.
        cmc_df (pd.DataFrame): crosswalk between cmc ids and CoinMetrics asset ids.
        daily_panel_df (pd.DataFrame): panel data with CoinMetrics asset ids to use for
                                       start and end date of each asset.

    Returns:
        df (pd.DataFrame): panel data with asset price, 24h volume, and mcap at hourly frequency.
    """
    # convert strings to datetimes
    study_start_dt = np.datetime64(study_start)
    study_end_dt = np.datetime64(study_end)

    # initialize asset list to pull
    cmc_asset_ids = list(cmc_df.id.values)

    # initialize dict for the results
    results_dict = {'date': [],
                    'cmc_id': [],
                    'usd_per_token': [],
                    'usd_volume_24h': [],
                    'usd_mcap': []}

    # form url
    endpoint = '/v3/cryptocurrency/quotes/historical'
    url = f"{base_url}{endpoint}"

    # form parameters dictionary
    params = {'count': 10000,
              'interval': '1h',
              'aux': 'price,volume,market_cap,quote_timestamp'}

    # loop over the assets
    num_assets = len(cmc_asset_ids)
    for i, cmc_asset_id in enumerate(cmc_asset_ids):
        # update asset
        params['id'] = str(cmc_asset_id)

        # monitor progress
        print(f"Processing asset number #{i+1} ({(i+1)/num_assets*100:.2f}%).")

        # clip the study window to the dates this asset has in the daily panel
        asset_cm = cmc_df[cmc_df.id==cmc_asset_id].asset_cm.values[0]
        asset_dates = daily_panel_df[daily_panel_df.asset==asset_cm].date.values
        asset_min_date = np.min(asset_dates)
        asset_max_date = np.max(asset_dates)
        start_dt = study_start_dt if study_start_dt >= asset_min_date else asset_min_date
        end_dt = study_end_dt if study_end_dt <= asset_max_date else asset_max_date
        start_date = np.datetime_as_string(start_dt, 'D')
        end_date = np.datetime_as_string(end_dt, 'D')

        # extract dates for this asset
        date_list = Helper.generateYearlyCalendarYearDateList(start_date, end_date)

        # loop over consecutive date pairs for this asset
        for time_start, time_end in zip(date_list[:-1], date_list[1:]):
            # update params for these dates
            params['time_start'] = time_start
            params['time_end'] = time_end

            # make the call
            response_json = Helper.makeApiCall(url, headers=base_headers, params=params)

            # process the data; a non-dict response, a dict without 'data', and
            # a null 'data' are all reported as "no data" instead of raising
            data = response_json.get('data') if isinstance(response_json, dict) else None
            if data is None:
                print(f'No data for {cmc_asset_id} for date {time_start}')
            else:
                for quote in data[str(cmc_asset_id)]['quotes']:
                    usd_quote = quote['quote']['USD']
                    results_dict['date'].append(usd_quote['timestamp'])
                    results_dict['cmc_id'].append(cmc_asset_id)
                    results_dict['usd_per_token'].append(usd_quote['price'])
                    results_dict['usd_volume_24h'].append(usd_quote['volume_24h'])
                    results_dict['usd_mcap'].append(usd_quote['market_cap'])

            # space out the calls (also after a failed call)
            time.sleep(0.2)

    # build dataframe to return
    df = pd.DataFrame(results_dict)
    df = df.drop_duplicates(subset=['date', 'cmc_id'])
    df = df.merge(cmc_df[['id', 'asset_cmc']], left_on='cmc_id', right_on='id', how='left', validate='many_to_one')
    df = df[['date', 'asset_cmc', 'usd_per_token', 'usd_volume_24h', 'usd_mcap']]
    df = df.sort_values(by=['date', 'asset_cmc'], ignore_index=True)

    return df


In [5]:
def pullMacro(base_url: str, base_headers: Dict[str, str], 
              study_start: str, study_end: str) -> pd.DataFrame:
    """ Pull macro data for the study period.

    Calls the CMC `/v1/global-metrics/quotes/historical` endpoint at an hourly
    interval, one call per consecutive pair of dates returned by
    `Helper.generateYearlyCalendarYearDateList`.

    Args:
        base_url (str): The base URL for the CMC API.
        base_headers (Dict[str, str]): A dictionary containing the basic headers for the CMC API call.
        study_start (str): string time for the start of the study window in format 'YYYY-MM-DD'.
        study_end (str): string time for the end of the study window in format 'YYYY-MM-DD'.

    Returns:
        df (pd.DataFrame): timeseries data for the available macro covariates from CMC,
            de-duplicated on and sorted by `date`.

    Raises:
        AssertionError: if a response is not a dict or has no 'data' key.
    """
    # initialize dict for the results
    results_dict = {'date': [],
                    'total_usd_mcap': [],
                    'altcoin_usd_mcap': [],
                    'total_usd_volume_24h': [],
                    'altcoin_usd_volume_24h': [],
                    'active_cryptos': [],
                    'active_exchanges': [],
                    'active_market_pairs': [],
                    'btc_dominance': []}

    # form url
    endpoint = '/v1/global-metrics/quotes/historical'
    url = f"{base_url}{endpoint}"

    # form parameters dictionary
    params = {'count': 10000,
              'interval': '1h',
              'aux': "btc_dominance,active_cryptocurrencies,active_exchanges,active_market_pairs,total_volume_24h,altcoin_market_cap,altcoin_volume_24h"}

    # split the study window into the date boundaries to request between
    date_list = Helper.generateYearlyCalendarYearDateList(study_start, study_end)

    # loop over consecutive date pairs to pull
    for time_start, time_end in zip(date_list[:-1], date_list[1:]):
        # set dates
        params['time_start'] = time_start
        params['time_end'] = time_end

        # make the call
        response_json = Helper.makeApiCall(url, headers=base_headers, params=params)
        assert isinstance(response_json, dict), "macro response is not a dict"
        assert 'data' in response_json, "macro response has no 'data' key"
        for quote in response_json['data']['quotes']:
            usd_quote = quote['quote']['USD']
            results_dict['date'].append(quote['timestamp'])
            results_dict['total_usd_mcap'].append(usd_quote['total_market_cap'])
            # altcoin market cap (previously filled from altcoin_volume_24h by mistake)
            results_dict['altcoin_usd_mcap'].append(usd_quote['altcoin_market_cap'])
            results_dict['total_usd_volume_24h'].append(usd_quote['total_volume_24h'])
            results_dict['altcoin_usd_volume_24h'].append(usd_quote['altcoin_volume_24h'])
            results_dict['active_cryptos'].append(quote['active_cryptocurrencies'])
            results_dict['active_exchanges'].append(quote['active_exchanges'])
            results_dict['active_market_pairs'].append(quote['active_market_pairs'])
            results_dict['btc_dominance'].append(quote['btc_dominance'])

    # build the dataframe to return
    df = pd.DataFrame(results_dict)
    df = df.drop_duplicates(subset=['date'])
    df = df.sort_values(by=['date'], ignore_index=True)

    return df


In [6]:
def pullCMCExchangeHistoricalData(base_url: str, base_headers: Dict[str, str], 
                                  study_start: str, study_end: str) -> pd.DataFrame:
    """ Pull exchange data.

    Looks up the CMC ids of a fixed list of exchanges via `/v1/exchange/map`,
    then pulls hourly quotes for each from `/v1/exchange/quotes/historical`
    over the study window clipped to the exchange's first/last historical
    data dates. A call that returns no usable data is reported and skipped.

    Args:
        base_url (str): The base URL for the CMC API.
        base_headers (Dict[str, str]): A dictionary containing the basic headers for the CMC API call.
        study_start (str): string time for the start of the study window in format 'YYYY-MM-DD'.
        study_end (str): string time for the end of the study window in format 'YYYY-MM-DD'.

    Returns:
        df (pd.DataFrame): panel data frame of exchange covariates with columns
            `date`, `ex_slug`, `ex_usd_volume_24h`, and `ex_num_market_pairs`.
    """
    # convert strings to datetimes
    study_start_dt = np.datetime64(study_start)
    study_end_dt = np.datetime64(study_end)

    # specify legit exchanges that we will consider
    legit_exchanges = ['aave', 'balancer-v2', 'bancor-network', 'binance', 'binance-us',
                       'bitfinex', 'bitmex', 'bitstamp', 'coinbase-exchange',
                       'compound', 'crypto-com-exchange', 'curve-finance', 'deribit',
                       'dydx', 'ftx', 'ftx-us', 'gemini', 'huobi', 'kraken', 'kucoin',
                       'okcoin', 'pancakeswap-v2', 'poloniex', 'sushiswap',
                       'uniswap-v2', 'uniswap-v3']

    # obtain exchange mapping ids
    endpoint = '/v1/exchange/map'
    url      = f"{base_url}{endpoint}"
    params   = {'listing_status': 'active,inactive'}
    response_json = Helper.makeApiCall(url, headers=base_headers, params=params)
    exchanges_dict = {'slug': [],
                      'id': [],
                      'first_date': [],
                      'last_date': []}
    for ex in response_json['data']:
        if ex['slug'] in legit_exchanges:
            exchanges_dict['slug'].append(ex['slug'])
            exchanges_dict['id'].append(ex['id'])
            exchanges_dict['first_date'].append(ex['first_historical_data'])
            exchanges_dict['last_date'].append(ex['last_historical_data'])
    exchanges_df = pd.DataFrame(exchanges_dict)

    # build url and params for exchange historical data
    endpoint = '/v1/exchange/quotes/historical'
    url = f"{base_url}{endpoint}"
    params = {'interval': '1h',
              'count': 10000}

    # initialize dict for ex data
    ex_data_dict = {'date': [],
                    'ex_slug': [],
                    'ex_usd_volume_24h': [],
                    'ex_num_market_pairs': []}

    # loop over all exchanges, reading each row once
    for ex_row in exchanges_df.itertuples(index=False):
        # Monitor progress
        ex_id = ex_row.id
        ex_slug = ex_row.slug
        print(f"Working on {ex_slug}.")

        # Add ex id to params
        params['id'] = ex_id

        # Determine start and end date and break into years;
        # only the leading 'YYYY-MM-DD' part of each timestamp string is used
        ex_min_date_dt = np.datetime64(ex_row.first_date[:10])
        ex_max_date_dt = np.datetime64(ex_row.last_date[:10])
        start_dt = study_start_dt if study_start_dt >= ex_min_date_dt else ex_min_date_dt
        end_dt = study_end_dt if study_end_dt <= ex_max_date_dt else ex_max_date_dt
        start_date = np.datetime_as_string(start_dt, 'D')
        end_date = np.datetime_as_string(end_dt, 'D')

        # extract dates for this exchange
        date_list = Helper.generateYearlyCalendarYearDateList(start_date, end_date)

        # loop over consecutive date pairs
        for time_start, time_end in zip(date_list[:-1], date_list[1:]):
            # update params
            params['time_start'] = time_start
            params['time_end'] = time_end

            # make the call
            response_json = Helper.makeApiCall(url, headers=base_headers, params=params)

            # extract the data to a dict; a failed call is reported and skipped
            # rather than aborting the whole pull
            data = response_json.get('data') if isinstance(response_json, dict) else None
            if data is None:
                print(f'No data for {ex_slug} for date {time_start}')
            else:
                for ex_quote in data['quotes']:
                    usd_quote = ex_quote['quote']['USD']
                    ex_data_dict['date'].append(usd_quote['timestamp'])
                    ex_data_dict['ex_slug'].append(ex_slug)
                    ex_data_dict['ex_usd_volume_24h'].append(usd_quote['volume_24h'])
                    ex_data_dict['ex_num_market_pairs'].append(ex_quote['num_market_pairs'])

            # space out the calls
            time.sleep(0.2)

    # build the dataframe to return
    df = pd.DataFrame(ex_data_dict)
    df = df.drop_duplicates(subset=['date', 'ex_slug'])
    df = df.sort_values(by=['date', 'ex_slug'], ignore_index=True)

    return df

In [125]:
def formFinalPanel(panel_df: pd.DataFrame, assets_df: pd.DataFrame) -> pd.DataFrame:
    """ add the asset covars to the main panel of price, volume, and mcap.

    Each panel row is matched to the asset metadata row whose date equals the
    panel timestamp floored to the day minus 24 hours. Neither input frame is
    modified; a new frame is returned.

    Args:
        panel_df (pd.DataFrame): panel with asset price, trading volume, and mcap;
            must have `date` and `asset_cmc` columns.
        assets_df (pd.DataFrame): panel with asset metadata; must have `date`
            and `slug` columns.

    Returns:
        panel_df (pd.DataFrame): panel with all asset covariates, with `date`
            as timezone-naive datetimes, sorted by `date` then `asset_cmc`.
    """
    # prep the asset metadata: one row per (date, slug), keyed on a naive UTC day
    assets_df = assets_df.drop_duplicates(subset=['date', 'slug'])
    assets_df = assets_df.rename(columns={'slug': 'asset_cmc'})
    assets_day = pd.to_datetime(assets_df['date'], utc=True).dt.tz_localize(None)
    assets_df = assets_df.assign(date_day=assets_day).drop('date', axis=1)

    # prep the panel on a new frame (assign) so the caller's panel_df is untouched;
    # date_day is the previous calendar day of each timestamp
    panel_ts = pd.to_datetime(panel_df['date'], utc=True).dt.tz_localize(None)
    panel_df = panel_df.assign(date=panel_ts,
                               date_day=panel_ts.dt.floor("D") - pd.Timedelta(hours=24))

    # merge
    panel_df = panel_df.merge(assets_df,
                            on=['date_day', 'asset_cmc'],
                            validate='many_to_one',
                            how='left')

    # clean up
    panel_df = panel_df.drop('date_day', axis=1)
    panel_df = panel_df.sort_values(by=['date', 'asset_cmc'], ignore_index=True)

    return panel_df

In [7]:
if __name__ == "__main__":
    # ---- study window and API root
    STUDY_START = '2016-07-01'
    STUDY_END = '2023-01-02'
    BASE_URL = "https://pro-api.coinmarketcap.com"

    # ---- input files
    CW_IN_FP = '../data/derived/cm_to_coinapi_cw.pkl'
    ASSET_IN_FP = '../data/clean/asset_universe_dict.pickle'
    PANEL_DAILY_IN_FP = '../data/derived/basic_panel.pkl'
    API_FP = '../../admin/cmc.txt'

    # ---- output files
    PANEL_OUT_FP = "../data/raw/cmc_panel.pkl"
    CW_OUT_FP = '../data/raw/cmc_coinmetrics_cw.pkl'
    MACRO_OUT_FP = '../data/raw/cmc_macro.pkl'
    EX_OUT_FP = '../data/raw/cmc_exchange_panel.pkl'

    # Load the existing crosswalk, the asset universe, and the daily panel
    cw_df = pd.read_pickle(CW_IN_FP)
    with open(ASSET_IN_FP, "rb") as f:
        asset_universe_dict = pickle.load(f)
    asset_universe = Helper.findUniqueAssets(asset_universe_dict)
    daily_panel_df = pd.read_pickle(PANEL_DAILY_IN_FP)

    # The API key is the first line of the key file, stripped of whitespace
    with open(API_FP) as f:
        API_KEY = f.readlines()[0].strip()
    BASE_HEADERS = {'Accepts': 'application/json', 'X-CMC_PRO_API_KEY': API_KEY}

    # Build the CMC <-> CoinMetrics crosswalk and save its two name columns
    cmc_df = formCmcAssetUniverse(base_url=BASE_URL,
                                  base_headers=BASE_HEADERS,
                                  asset_universe=asset_universe)
    cmc_df[['asset_cmc', 'asset_cm']].to_pickle(CW_OUT_FP)
    cmc_slug_universe = list(cmc_df['asset_cmc'].values)

    # Daily asset metadata for the slugs in the crosswalk
    assets_df = pullAssetHistoricalMetadata(base_url=BASE_URL,
                                            base_headers=BASE_HEADERS,
                                            cmc_slug_universe=cmc_slug_universe,
                                            study_start=STUDY_START,
                                            study_end=STUDY_END)

    # Hourly price, volume, and market cap for every asset in the crosswalk
    panel_df = pullAssetPriceVolumeMcap(base_url=BASE_URL,
                                        base_headers=BASE_HEADERS,
                                        study_start=STUDY_START,
                                        study_end=STUDY_END,
                                        cmc_df=cmc_df,
                                        daily_panel_df=daily_panel_df)

    # Attach the metadata to the hourly panel and write it out
    panel_df = formFinalPanel(panel_df=panel_df, assets_df=assets_df)
    panel_df.to_pickle(PANEL_OUT_FP)

    # Market-wide macro series
    macro_df = pullMacro(base_url=BASE_URL, base_headers=BASE_HEADERS,
                         study_start=STUDY_START, study_end=STUDY_END)
    macro_df.to_pickle(MACRO_OUT_FP)

    # Exchange-level panel
    ex_df = pullCMCExchangeHistoricalData(base_url=BASE_URL, base_headers=BASE_HEADERS,
                                          study_start=STUDY_START, study_end=STUDY_END)
    ex_df.to_pickle(EX_OUT_FP)