In [403]:
import pandas as pd
import numpy as np
from typing import List, Dict
from helper_functions import Helper

In [404]:
def pullExchangeInfo(base_url: str, base_params: dict, target_exchanges: List[str]) -> pd.DataFrame:
    """
    Fetch the Coinmetrics exchange catalog and keep only the exchanges under study.

    Args:
        base_url (str): Root URL of the Coinmetrics service; the catalog path is appended to it.
        base_params (dict): Query parameters sent with the request (a copy is sent).
        target_exchanges (List[str]): Exchange identifiers to retain.

    Returns:
        pd.DataFrame: Rows of the response's 'data' payload whose `exchange` value is listed in
        `target_exchanges`, re-indexed from zero.
    """
    # Assemble the request
    endpoint = f"{base_url}catalog-all/exchanges"
    query = base_params.copy()

    # Fetch the catalog and tabulate it
    payload = Helper.makeApiCall(endpoint, headers={}, params=query)
    catalog = pd.DataFrame(payload['data'])

    # Keep the exchanges of interest only
    wanted = catalog['exchange'].isin(target_exchanges)
    return catalog[wanted].reset_index(drop=True)

In [405]:
def pullAssetInfo(base_url: str, base_params: dict) -> pd.DataFrame:
    """
    Fetch the Coinmetrics asset catalog and keep the assets that list markets.

    Args:
        base_url: Root URL of the Coinmetrics service; the catalog path is appended to it.
        base_params: Query parameters sent with the request (a copy is sent).

    Returns:
        A Pandas DataFrame built from the response's 'data' payload, restricted to rows whose
        `markets` field is not null. The original row index is kept.
    """
    # Assemble the request
    endpoint = f"{base_url}catalog-all/assets"
    query = base_params.copy()

    # Fetch the catalog and tabulate it
    payload = Helper.makeApiCall(endpoint, headers={}, params=query)
    catalog = pd.DataFrame(payload['data'])

    # Assets without a `markets` entry have no trading data to pull
    has_markets = catalog['markets'].notnull()
    return catalog[has_markets]

In [406]:
def pullAndFormRelevantMarkets(exchanges_df: pd.DataFrame, assets_df: pd.DataFrame,
        base_url: str, base_params: Dict[str, str]) -> pd.DataFrame:
    """
    Build the table of spot markets on the target exchanges that are quoted in USD, USDT or USDC.

    Args:
    - exchanges_df (pd.DataFrame): exchange data; its `markets` column holds a list of market ids per row
    - assets_df (pd.DataFrame): asset data with `asset` and `metrics` columns
    - base_url (str): a string containing the base url for the API
    - base_params (Dict[str, str]): a dictionary containing the base parameters for the API

    Returns:
    - pd.DataFrame: one row per retained market with `market`, `exchange`, `base`, `quote` and the
      flattened fields of the catalog's `quotes` entry; rows whose `min_time` is null are dropped
    """
    # gather every market id listed by the exchanges, once each
    all_markets = [market_id
                   for exchange_markets in exchanges_df.markets.values
                   for market_id in exchange_markets]
    candidates = pd.DataFrame(data={'market': all_markets})
    candidates = candidates.drop_duplicates(subset='market')

    # market ids are dash separated: exchange, asset, quote, type
    pieces = candidates['market'].str.split('-', n=4, expand=True)
    for position, field in enumerate(['exchange', 'asset', 'quote', 'type']):
        candidates[field] = pieces[position]

    # spot markets only; the type column has served its purpose afterwards
    candidates = candidates[candidates['type'] == 'spot']
    candidates = candidates.drop(columns='type')

    # quote asset must be USD, USDT or USDC
    accepted_quotes = ['usd', 'usdt', 'usdc']
    candidates = candidates[candidates['quote'].isin(accepted_quotes)]

    # discard assets that are derivatives of other symbols or stablecoins
    excluded_assets = [
        'wbtc', 'wluna', 'wnxm', 'tbtc', 'cusd', 'musd', 'nusd', 'dai', 'busd',
        'cusdt', 'gusd', 'lusd', 'ousd', 'usdj', 'usdk', 'usdn', 'usdt', 'usdc',
        'aoa', 'ausd', 'ern', 'krw', 'mtl', 'tusd', 'susd', 'usdd', 'ust', 'ustc',
        'eur', 'aud', 'gbp', 'cad', 'cbeth', 'lbp', 'sos', 'usdp', '00', 'bifi_beef',
        'bifi_bifr', 'btcauction', 'cix100',
    ]
    candidates = candidates[~candidates['asset'].isin(excluded_assets)]
    # also discard any symbol containing 3l / 3s / 2s / 2l (presumably leveraged tokens)
    leveraged = candidates['asset'].str.contains('3l|3s|2s|2l')
    candidates = candidates[~leveraged]

    # an asset must have coinmetrics metrics to stay in the study
    has_metrics = assets_df['metrics'].notnull()
    assets_with_metrics = list(assets_df[has_metrics].asset.values)
    candidates = candidates[candidates['asset'].isin(assets_with_metrics)]

    # request the market catalog for the market meta data
    endpoint = f"{base_url}catalog-all/markets"
    query = base_params.copy()
    payload = Helper.makeApiCall(endpoint, headers={}, params=query)
    catalog = pd.DataFrame(payload['data'])

    # keep the retained markets and the columns of interest, flattening `quotes` into columns
    in_scope = catalog['market'].isin(candidates['market'].values)
    catalog = catalog[in_scope].reset_index(drop=True)
    catalog = catalog[['market', 'exchange', 'base', 'quote', 'quotes']]
    quote_details = pd.json_normalize(catalog['quotes'])
    catalog = catalog.drop('quotes', axis=1).join(quote_details)

    # markets without quote data have no min_time
    return catalog[catalog['min_time'].notnull()]

In [407]:
def pullUSDTandUSDCexchangeRates(base_url: str, base_params: Dict[str, str]) -> pd.DataFrame:
    """
    Pull the daily Coinmetrics USD reference rates for USDC and USDT.

    Args:
        base_url (str): Base URL for the API.
        base_params (Dict[str, str]): Base parameters for the API.

    Returns:
        pd.DataFrame: One row per date with columns `date`, `usd_per_usdc` and `usd_per_usdt`,
        sorted by date.

    Raises:
        AssertionError: If any rate lies outside [0.8, 2] or a coin's dates are not consecutive.
    """
    # Request: endpoint-specific settings win over base_params on any key clash
    url = f"{base_url}timeseries/asset-metrics"
    params = {
        **base_params,
        'page_size': 10000,
        'metrics': 'ReferenceRateUSD',
        'assets': 'usdt,usdc',
        'frequency': '1d',
        'limit_per_asset': 5000,
    }

    # Fetch and tabulate
    payload = Helper.makeApiCall(url, headers={}, params=params)
    rates = pd.DataFrame(payload['data'])

    # Normalise the date and price columns
    rates['date'] = pd.to_datetime(rates.time, format='%Y-%m-%d').dt.date
    rates['price'] = rates.ReferenceRateUSD.astype(float)
    rates = rates.sort_values('date', ignore_index=True)

    # Sanity check: a USD stablecoin should never be quoted outside [0.8, 2]
    implausible = rates[(rates.price > 2) | (rates.price < 0.8)]
    assert 0 == implausible.shape[0]

    # One frame per stablecoin, with the price column named after the coin
    price_columns = {'usdc': 'usd_per_usdc', 'usdt': 'usd_per_usdt'}
    per_coin = {}
    for coin, column in price_columns.items():
        coin_rates = rates[rates.asset == coin][['date', 'price']]
        per_coin[coin] = coin_rates.rename(columns={'price': column})

    # Each series must cover every calendar day between its first and last date
    expected_dates = {
        coin: pd.Series(pd.date_range(frame['date'].iloc[0], frame['date'].iloc[-1]))
        for coin, frame in per_coin.items()
    }
    for coin, frame in per_coin.items():
        matching_days = expected_dates[coin].values == pd.to_datetime(frame['date'])
        assert frame.shape[0] == matching_days.sum()

    # Combine both series on date
    merged = per_coin['usdc'].merge(per_coin['usdt'], on='date', how='outer', validate='one_to_one')
    merged = merged.sort_values(by='date', ignore_index=True)
    merged['date'] = pd.to_datetime(merged.date).dt.date

    return merged

In [395]:
def _fetch_market_candles(url: str, base_params: Dict[str, str], api_params: dict,
                          markets_list: List[str]) -> pd.DataFrame:
    """Download the candles of every market and stack them into one DataFrame."""
    keep_cols = ['market', 'time', 'price_close', 'candle_usd_volume', 'candle_trades_count']
    frames = []
    num_markets = len(markets_list)

    for i, market in enumerate(markets_list, start=1):
        params = {**base_params, **api_params, 'markets': market}

        # monitor progress
        print(f"Processing market #{i} ({i/num_markets*100:.2f}%): {market}")

        response_json = Helper.makeApiCall(url, headers={}, params=params)

        # A missing/empty payload surfaces as a KeyError (no 'data' key or none of the expected
        # columns), a TypeError (non-subscriptable response) or a ValueError (malformed records).
        try:
            result_df = pd.DataFrame(response_json['data'])[keep_cols]
        except (KeyError, TypeError, ValueError):
            print(f"{market} did not have data")
            continue
        frames.append(result_df)

    if not frames:
        return pd.DataFrame(columns=keep_cols)
    # single concat instead of growing a DataFrame inside the loop
    return pd.concat(frames, ignore_index=True)


def _convert_candles_to_usd(df: pd.DataFrame, markets_df: pd.DataFrame,
                            usd_df: pd.DataFrame) -> pd.DataFrame:
    """Attach market meta data and express prices, volumes and trade counts as numeric USD columns."""
    # Add market meta data for exchange, base asset, and quote asset
    df = df.merge(markets_df, on='market', how='inner', validate='many_to_one')

    # Form date column
    df['date'] = pd.to_datetime(df.time, format='%Y-%m-%d').dt.date
    df = df.drop(columns='time')

    # Every stablecoin-quoted candle needs a reference rate on or before its first date.
    # Quotes that do not occur at all are skipped instead of comparing against NaN.
    for quote, rate_col in (('usdt', 'usd_per_usdt'), ('usdc', 'usd_per_usdc')):
        quote_dates = df.loc[df['quote'] == quote, 'date']
        if quote_dates.empty:
            continue
        first_rate_date = np.min(usd_df[~usd_df[rate_col].isnull()].date)
        assert np.min(quote_dates) >= first_rate_date
    df = df.merge(usd_df, on='date', how='left', validate='many_to_one')

    # Form price column: USD quotes pass through, stablecoin quotes are scaled by their USD rate
    df['price_close'] = df['price_close'].astype(float)
    usd_per_quote = np.select(
        [df['quote'] == 'usd', df['quote'] == 'usdc', df['quote'] == 'usdt'],
        [1.0, df['usd_per_usdc'], df['usd_per_usdt']],
        default=np.nan,
    )
    df['usd_per_token_cm'] = df['price_close'] * usd_per_quote
    assert 0 == df['usd_per_token_cm'].isnull().sum()

    # Form volume columns
    df['usd_volume_per_24h_cm'] = df['candle_usd_volume'].astype(float)
    df['trades_cm'] = df['candle_trades_count'].astype(int)
    assert 0 == df['usd_volume_per_24h_cm'].isnull().sum()

    return df


def _collapse_to_asset_date(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse market rows to one row per (date, base) with a volume-weighted price."""
    volume = df['usd_volume_per_24h_cm']

    # Zero-volume markets get a weight of 1 so an all-zero group still has a defined average.
    # The weight lives in its own column so the reported volume total stays untouched.
    weight = volume.where(volume != 0, 1.0)
    work = pd.DataFrame({
        'date': df['date'],
        'base': df['base'],
        'weighted_price': df['usd_per_token_cm'] * weight,
        'weight': weight,
        'usd_volume_per_24h_cm': volume,
        'trades_cm': df['trades_cm'],
    })
    sums = work.groupby(['date', 'base'])[
        ['weighted_price', 'weight', 'usd_volume_per_24h_cm', 'trades_cm']
    ].sum()

    return pd.DataFrame({
        'usd_per_token_cm': sums['weighted_price'] / sums['weight'],
        'usd_volume_per_24h_cm': sums['usd_volume_per_24h_cm'],
        'trades_cm': sums['trades_cm'],
    }).reset_index()


def _fill_short_date_gaps(df: pd.DataFrame, max_gap_days: int = 32) -> pd.DataFrame:
    """Insert rows for missing days inside gaps of 2 to max_gap_days-1 days, per asset."""
    panel_cols = ['date', 'asset', 'usd_per_token_cm', 'usd_volume_per_24h_cm', 'trades_cm']
    filled_frames = []

    for asset, asset_df in df.groupby('asset', sort=True):
        # whole-day distance between consecutive observations (df is sorted by date)
        dates = asset_df['date'].to_numpy()
        gaps = np.diff(dates).astype('timedelta64[D]').astype(int)

        # days strictly between the two ends of every short gap
        new_days = []
        for start_day, gap in zip(dates[:-1], gaps):
            if 1 < gap < max_gap_days:
                new_days.extend(start_day + np.timedelta64(j, 'D') for j in range(1, gap))

        if new_days:
            filler = pd.DataFrame({'date': new_days})
            filler['asset'] = asset
            asset_df = pd.concat((asset_df, filler)).sort_values(by='date', ignore_index=True)

            # carry the last price forward; no trading happened on the inserted days
            asset_df['usd_per_token_cm'] = asset_df['usd_per_token_cm'].ffill()
            asset_df['usd_volume_per_24h_cm'] = asset_df['usd_volume_per_24h_cm'].fillna(0)
            asset_df['trades_cm'] = asset_df['trades_cm'].fillna(0)

        filled_frames.append(asset_df)

    if not filled_frames:
        return pd.DataFrame(columns=panel_cols)

    out = pd.concat(filled_frames, ignore_index=True)
    out = out.sort_values(by=['date', 'asset'], ignore_index=True)
    assert not out.duplicated(subset=['date', 'asset']).any()
    return out


def pullOHLCV(base_url: str, base_params: Dict[str, str], 
              markets_df: pd.DataFrame, usd_df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a panel DataFrame containing market prices, volumes, and trade counts.

    Daily candles are pulled market by market, converted to USD with the stablecoin reference
    rates, collapsed to one volume-weighted row per (date, asset), range-filtered, and finally
    padded with forward-filled rows for gaps shorter than 32 days.

    Args:
        base_url (str): Base URL for the API.
        base_params (Dict[str, str]): Base parameters for the API.
        markets_df (pd.DataFrame): A pandas DataFrame containing information about relevant markets;
            must hold unique `market` ids plus `base` and `quote` columns.
        usd_df (pd.DataFrame): A pandas DataFrame containing USDT and USDC price timeserieses
            (`date`, `usd_per_usdc`, `usd_per_usdt`).

    Returns:
        A Pandas DataFrame panel of dates and assets with their usd_per_token_cm prices,
        usd_volume_per_24h_cm, and trades_cm, sorted by date and asset.

    Raises:
        ValueError: If no market returned any candle data.
        AssertionError: If a data-integrity check fails (duplicate markets, missing values,
            stablecoin candles that pre-date the reference rates, duplicated date/asset rows).
    """
    # Form list of markets
    assert markets_df.market.is_unique
    markets_list = list(markets_df.market.values)

    # Define API parameters
    api_params = {
        'frequency': '1d',
        'start_time': '2015-01-01',
        'end_time': '2023-02-02',
        'page_size': 10000,
        'limit_per_market': 5000
    }

    # Build API URL
    api_endpoint = "timeseries/market-candles"
    url = f"{base_url}{api_endpoint}"

    # Pull all markets
    df = _fetch_market_candles(url, base_params, api_params, markets_list)
    if df.empty:
        raise ValueError("No candle data was returned for any of the requested markets.")

    # Confirm no missing obs
    assert 0 == df.isnull().sum().sum()

    # Convert to USD and collapse to the asset date level
    df = _convert_candles_to_usd(df, markets_df, usd_df)
    df = _collapse_to_asset_date(df)

    # Check for valid ranges and dtypes
    assert 0 == df.usd_per_token_cm.isnull().sum()
    assert 0 == df.usd_volume_per_24h_cm.isnull().sum()
    df = df[(df['usd_per_token_cm'] >= 0) & (df['usd_per_token_cm'] < 1e9)]
    df = df[(df['usd_volume_per_24h_cm'] >= 0) & (df['usd_volume_per_24h_cm'] < 1e11)]
    df = df[(df['trades_cm'] >= 0) & (df['trades_cm'] < 1e9)]

    # Ensure dtypes are set
    df = df.astype({'usd_per_token_cm': 'float32',
                    'usd_volume_per_24h_cm': 'float32',
                    'trades_cm': 'float32'})

    # ensure panel is sorted
    df = df.rename(columns={'base': 'asset'})
    df = df.sort_values(by=['date', 'asset'], ignore_index=True)
    df['date'] = pd.to_datetime(df.date)

    # Add any missing days inside short gaps and return the final panel
    return _fill_short_date_gaps(df)


In [429]:
def pullAssetMetrics(base_url: str, base_params: Dict[str, str], 
                     asset_universe: List[str], target_asset_metrics: List[str],
                     assets_df: pd.DataFrame = None) -> pd.DataFrame:
    """
    Pulls asset metrics for assets in the given asset universe using the given base URL and parameters.

    Args:
    - base_url (str): Base URL for the API.
    - base_params (Dict[str, str]): Base parameters for the API.
    - asset_universe (List[str]): list of strings representing the assets to pull metrics for.
    - target_asset_metrics (List[str]): list of strings of asset metrics of interest.
    - assets_df (pd.DataFrame, optional): asset catalog with `asset` and `metrics` columns. When
      omitted, the module-level `assets_df` is used, which is what callers relied on before this
      parameter existed.

    Returns:
    - results_df: Pandas DataFrame containing the asset metrics as panel data, with `asset` and
      `date` columns plus one float32 column per pulled metric. Empty (only `asset` and `date`)
      when nothing could be pulled.

    Raises:
    - ValueError: if no asset catalog is passed and no module-level `assets_df` exists.
    - AssertionError: if an asset lists a target metric twice or the result has duplicated
      (date, asset) rows.
    """
    # Resolve the asset catalog; fall back to the module-level frame for existing callers
    if assets_df is None:
        assets_df = globals().get('assets_df')
        if assets_df is None:
            raise ValueError("assets_df was not passed and no module-level assets_df is defined.")

    # Define API parameters
    api_params = {
        'start_time': '2015-01-01',
        'end_time': '2023-02-02',
        'page_size': 10000,
        'limit_per_asset': 10000
    }

    # Build API URL
    api_endpoint = "timeseries/asset-metrics"
    url = f"{base_url}{api_endpoint}"

    # Per-asset results are collected here and concatenated once at the end
    asset_frames = []
    num_assets = len(asset_universe)

    # Loop over every asset
    for i, asset in enumerate(asset_universe, start=1):
        # monitor progress
        print(f"Processing the {i}th asset ({i/num_assets*100:.2f}%): {asset}")

        # initialize object for this asset results (object keys so they merge with string keys)
        asset_results_df = pd.DataFrame({'asset': pd.Series(dtype='object'),
                                         'time': pd.Series(dtype='object')})

        # update params for this asset
        params = {**base_params, **api_params, 'assets': asset}

        # determine metrics to pull; assets missing from the catalog or without a metric list are skipped
        catalog_rows = assets_df.loc[assets_df['asset'] == asset, 'metrics']
        metrics = catalog_rows.iloc[0] if len(catalog_rows) else None
        if not isinstance(metrics, list) or not metrics:
            continue
        metrics_df = pd.DataFrame(metrics)
        metrics_df = metrics_df[metrics_df.metric.isin(target_asset_metrics)]

        # pull data for each metric
        assert metrics_df.metric.is_unique
        for metric, frequencies in zip(metrics_df['metric'], metrics_df['frequencies']):
            # form dataframe of different freq options for this metric
            metric_options_df = pd.DataFrame(frequencies)

            # set frequency
            if '1d' in list(metric_options_df.frequency.values):
                params['frequency'] = '1d'
            else:
                print(metric_options_df)
                print(f"The metric {metric} for asset {asset} does not have a 1d frequency option.")
                continue

            # make the API call
            params['metrics'] = metric
            response_json = Helper.makeApiCall(url, headers={}, params=params)
            try:
                metric_df = pd.DataFrame(response_json['data'])
            except (KeyError, TypeError, ValueError):
                metric_df = None

            # skip this metric rather than merging an undefined or stale frame from a previous one
            if metric_df is None or not {'asset', 'time'}.issubset(metric_df.columns):
                print(f'The metric {metric} was not available.')
                continue

            # add data to asset results data frame
            asset_results_df = asset_results_df.merge(metric_df, on=['asset', 'time'],
                                                      how='outer', validate='one_to_one')

        # add this data to results
        asset_frames.append(asset_results_df)

    # combine all assets; keep the asset/time schema even when nothing was pulled
    if asset_frames:
        results_df = pd.concat(asset_frames)
    else:
        results_df = pd.DataFrame({'asset': pd.Series(dtype='object'),
                                   'time': pd.Series(dtype='object')})

    # form date column
    results_df['date'] = pd.to_datetime(results_df['time'], format='%Y-%m-%d')
    results_df = results_df.drop(columns='time')

    # convert other columns to float32
    for col in results_df.columns:
        if col not in ('asset', 'date'):
            results_df[col] = results_df[col].astype('float32')

    # ensure not duplicated
    assert not results_df.duplicated(subset=['date', 'asset']).any()

    return results_df

In [273]:
if __name__ == "__main__":
    # Script entry point: pulls exchange, asset and market catalogs, stablecoin reference rates,
    # market candles and supply/market-cap metrics, then merges them into one asset-date panel
    # and pickles it to PANEL_FP.
    # NOTE: every name assigned below is module-level. In particular `assets_df` is looked up by
    # name inside pullAssetMetrics, so it must keep this exact name.

    # set args
    # CM_API_FP: text file whose first line holds the API key
    # PANEL_FP: output path of the pickled panel
    # TARGET_US_EXCHANGES: exchange ids kept by pullExchangeInfo
    # MCAP_METRICS: asset metrics requested from pullAssetMetrics
    CM_API_FP = '../../admin/coinmetrics.txt'
    BASE_URL = 'https://api.coinmetrics.io/v4/'
    PANEL_FP = '../data/raw/coinmetrics_panel_initial.pkl'
    TARGET_US_EXCHANGES = ['binance.us',  'bitstamp', 'coinbase', 'crypto.com', 'ftx.us', 
        'gemini', 'kraken', 'kucoin']
    MCAP_METRICS = ['SplyAct1yr', 'SplyActEver', 'SplyCur', 'SplyFF', 
                      'CapMrktCurUSD', 'CapMrktEstUSD', 'CapMrktFFUSD', 'CapRealUSD']
    
    # import api key (first line of the file, surrounding whitespace stripped)
    with open(CM_API_FP) as f:
        API_KEY = f.readlines()
        API_KEY = API_KEY[0].strip()
    # the key is sent as the `api_key` request parameter on every call
    BASE_PARAMS = {'api_key': API_KEY}

    # pull meta data on target exchanges
    exchanges_df = pullExchangeInfo(BASE_URL, BASE_PARAMS, TARGET_US_EXCHANGES)

    # pull meta data on coinmetrics assets
    assets_df = pullAssetInfo(BASE_URL, BASE_PARAMS)

    # pull meta data markets and subset down to target markets
    markets_df = pullAndFormRelevantMarkets(exchanges_df, assets_df, BASE_URL, BASE_PARAMS)

    # pull usdt and usdc exchange rates
    usd_df = pullUSDTandUSDCexchangeRates(BASE_URL, BASE_PARAMS)

    # pull ohlcv data
    panel_df = pullOHLCV(BASE_URL, BASE_PARAMS, markets_df, usd_df)

    # pull mcap data for exactly the assets that made it into the price/volume panel
    asset_universe = list(np.unique(panel_df.asset.values))
    results_df = pullAssetMetrics(BASE_URL, BASE_PARAMS, asset_universe, MCAP_METRICS)

    # merge and save the panel
    # outer join keeps asset-dates present in only one of the two frames;
    # validate='one_to_one' raises if either side repeats an (asset, date) pair
    panel_df = panel_df.merge(results_df, 
                          on=['asset', 'date'],
                          how='outer',
                          validate='one_to_one')
    panel_df = panel_df.sort_values(by=['date', 'asset'], ignore_index=True)
    panel_df.to_pickle(PANEL_FP)