In [1]:
import requests
import pandas as pd
import numpy as np
import time
from typing import Any, Dict, Optional
import logging
logger = logging.getLogger(__name__)

In [2]:
def makeApiCall(url: str, headers: dict, params: Optional[dict] = None, retries: int = 4) -> Optional[Dict[str, Any]]:
    """
    Makes a GET request to the given endpoint, retrying with exponential backoff on failure.

    Args:
    - url (str): string representing the URL for the API.
    - headers (dict): dictionary containing the headers for the API call.
    - params (dict, optional): dictionary containing the query parameters for the API call.
        Defaults to no parameters.
    - retries (int): total number of attempts to make before giving up.

    Returns:
    - response (dict): the parsed JSON body of the API response, or None if every attempt
        failed (or if `retries` is less than one, in which case no request is made).
    """
    # Build a fresh dict per call; a `{}` default would be one object shared by all calls.
    if params is None:
        params = {}

    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, params=params, timeout=5)
            response.raise_for_status()
            return response.json()
        # ValueError covers a non-JSON body on versions of requests that do not wrap
        # the decode error in a RequestException subclass.
        except (requests.exceptions.RequestException, ValueError) as e:
            logger.warning(f'The API call failed with error: {str(e)}')
            if attempt == retries - 1:
                logger.error(f'The API call failed after {retries} attempts.')
                return None

            # Back off 1, 4, 16, ... seconds between attempts.
            sleep_time = 4 ** attempt
            logger.warning(f'Retrying after {sleep_time} seconds.')
            time.sleep(sleep_time)

    # Only reached when the loop body never ran (retries < 1).
    return None

In [3]:
def pullAssetInfo(base_url: str, base_headers: dict) -> pd.DataFrame:
    """
    Pulls the asset listing and keeps the cryptocurrency assets with enough history.

    Args:
        base_url: A string representing the base URL of the CoinAPI service.
        base_headers: A dictionary representing the headers to be sent with the API request.

    Returns:
        A Pandas DataFrame with one row per cryptocurrency asset that has a data window
        longer than 120 days starting on or before 2022-09-01. Besides the columns
        returned by the API it carries `data_start` / `data_end` parsed to datetimes
        and a `duration_days` column.
    """
    # Query the asset listing endpoint
    assets_json = makeApiCall(f"{base_url}assets", headers=base_headers.copy())
    assets = pd.DataFrame(assets_json)

    # Parse the data window and measure its length in days
    assets['data_start'] = pd.to_datetime(assets['data_start'])
    assets['data_end'] = pd.to_datetime(assets['data_end'])
    assets['duration_days'] = (assets['data_end'] - assets['data_start']).dt.days

    # One named condition per selection rule, combined into a single row mask
    is_crypto = assets['type_is_crypto'] == 1
    has_trading_data = assets['data_start'].notnull() & assets['data_end'].notnull()
    has_four_months = assets['duration_days'] > 120
    starts_in_window = assets['data_start'] <= '2022-09-01'

    return assets[is_crypto & has_trading_data & has_four_months & starts_in_window]

In [4]:
def pullExchangeInfo(base_url: str, base_headers: dict, target_exchanges: list[str]) -> pd.DataFrame:
    """
    Pulls the exchange listing and keeps only the exchanges targeted by this study.

    Args:
        base_url: A string representing the base URL of the CoinAPI service.
        base_headers: A dictionary representing the headers to be sent with the API request.
        target_exchanges: A list of strings with the target exchanges for this study.

    Returns:
        A Pandas DataFrame with the API's exchange records whose `exchange_id` is in
        `target_exchanges`.
    """
    # Query the exchange listing endpoint
    endpoint = f"{base_url}exchanges"
    exchanges = pd.DataFrame(makeApiCall(endpoint, headers=base_headers.copy()))

    # Keep the rows for the exchanges of interest
    is_target = exchanges.exchange_id.isin(target_exchanges)
    return exchanges.loc[is_target]

In [5]:
def pullMarketInfo(base_url: str, base_headers: dict, target_exchanges: list) -> pd.DataFrame:
    """
    Returns a DataFrame containing information about coinapi markets that are on a target exchange with
        USD or stablecoin quote asset.

    Args:
        base_url: A string representing the base URL of the CoinAPI service.
        base_headers: A dictionary representing the headers to be sent with the API request.
        target_exchanges: A list of strings with the target exchanges for this study.

    Returns:
        A Pandas DataFrame containing information about cryptocurrency markets. Only spot
        markets quoted in USD, USDC, or USDT are kept, with a data window longer than 120
        days that starts on or before 2022-09-01, and whose base asset is not in the
        exclusion list below and does not contain '3L' or '3S'.

    Raises:
        RuntimeError: if the API call returned no data.
    """
    # Build target URL
    url = f"{base_url}symbols"

    # Call API and convert to DataFrame
    response_json = makeApiCall(url, headers=base_headers.copy())
    if not response_json:
        raise RuntimeError(f"No symbol data was returned from {url}")
    df = pd.DataFrame(response_json)

    # subset to exchanges of interest; copy so the column assignments below write to an
    # independent frame rather than to a slice of the full symbol table
    df = df[df.exchange_id.isin(target_exchanges)].copy()

    # clean columns
    df['data_start'] = pd.to_datetime(df.data_start)
    df['data_end'] = pd.to_datetime(df.data_end)
    df['duration_days'] = (df.data_end - df.data_start).dt.days

    # symbols that are derivatives of other symbols or stablecoins
    assets_to_remove = ['WBTC', 'WLUNA', 'WNXM', 'TBTC', 'CUSD', 'MUSD', 'NUSD', 'DAI', 'BUSD', 'CUSDT',
        'GUSD', 'LUSD', 'OUSD', 'USDJ', 'USDK', 'USDN', 'USDT', 'USDC', 'AOA', 'AUSD', 'ERN', 'KRW', 'MTL',
        'TUSD', 'SUSD', 'USDD', 'UST', 'USTC', 'EUR', 'AUD', 'GBP', 'CAD', 'CBETH', 'LBP', 'SOS']
    target_date = pd.Timestamp('2022-09-01')

    # subset to assets of interest
    is_spot = df.symbol_type == 'SPOT'  # spot markets
    is_usd_quoted = df.asset_id_quote.isin(['USD', 'USDC', 'USDT'])  # fiat USD or stablecoin USD
    has_data = df.data_start.notnull() & df.data_end.notnull()  # have data
    has_four_months = df.duration_days > 120  # have at least four months of data
    starts_in_window = df.data_start <= target_date  # data begins inside the target window
    is_excluded = df.asset_id_base.isin(assets_to_remove)
    # na=False: a missing base asset yields False here instead of NaN, which `~` cannot negate
    has_3l_or_3s = df.asset_id_base.str.contains('3L|3S', na=False)

    keep = (is_spot & is_usd_quoted & has_data & has_four_months & starts_in_window
            & ~is_excluded & ~has_3l_or_3s)

    return df[keep]


In [6]:
def _cleanStablecoinRates(raw_df: pd.DataFrame, rate_col: str, drop_zero_rates: bool = False) -> pd.DataFrame:
    """
    Turns a raw exchange-rate history into a gap-free daily series stored in `rate_col`.

    Args:
        raw_df: DataFrame of API records with `time_period_end` and `rate_close` columns.
        rate_col: Name of the output column holding the closing rate.
        drop_zero_rates: If True, rows with a zero closing rate are removed before the
            daily date range is determined.

    Returns:
        A Pandas DataFrame with a `date` column (midnight timestamps, one row per calendar
        day between the first and last remaining observation) and `rate_col`.
    """
    rates = raw_df
    if drop_zero_rates:
        rates = rates[rates.rate_close != 0]

    # drop the placeholder timestamp; copy so the assignments below do not write to a slice
    rates = rates[rates.time_period_end != '0001-01-01T00:00:00.0000000Z'].copy()

    # The API sends full ISO timestamps, so no `format` is passed; keep only the calendar day.
    rates['date'] = pd.to_datetime(rates.time_period_end, utc=True).dt.tz_localize(None).dt.normalize()
    rates[rate_col] = rates.rate_close
    rates = rates.set_index('date')[[rate_col]]

    # expand to every calendar day between the first and last observation
    full_range = pd.date_range(start=rates.index.min(), end=rates.index.max(), freq='D')
    rates = rates.reindex(full_range)

    # blank out rates outside [0.8, 2] and carry the last accepted rate forward
    out_of_band = (rates[rate_col] > 2) | (rates[rate_col] < 0.8)
    rates.loc[out_of_band, rate_col] = np.nan
    rates[rate_col] = rates[rate_col].ffill()
    assert 0 == rates[rate_col].isnull().sum(), f"{rate_col} still has missing values after forward fill"

    return rates.rename_axis('date').reset_index()


def pullUSDTandUSDCexchangeRates(base_url: str, base_headers: dict) -> pd.DataFrame:
    """
    Returns a DataFrame containing prices of usdc and usdt.

    Args:
        base_url: A string representing the base URL of the CoinAPI service.
        base_headers: A dictionary representing the headers to be sent with the API request.

    Returns:
        A Pandas DataFrame containing usdt and usdc price timeserieses, with columns
        `date` (datetime.date objects), `usd_per_usdc`, and `usd_per_usdt`, sorted by date.
        The two series are outer-merged, so a rate is missing on dates outside that
        stablecoin's own observed range.

    Raises:
        RuntimeError: if the API returned no data for either stablecoin.
    """
    # set params
    headers = base_headers.copy()
    params = {'period_id': '1DAY',
        'time_start': '2015-01-01',
        'time_end': '2023-02-02',
        'limit': 5000}

    # pull tether, then usdc
    raw_dfs = {}
    for asset_id in ('USDT', 'USDC'):
        url = f"{base_url}exchangerate/{asset_id}/USD/history"
        response_json = makeApiCall(url, headers=headers, params=params)
        if not response_json:
            raise RuntimeError(f"No exchange rate data was returned for {asset_id}")
        raw_dfs[asset_id] = pd.DataFrame(response_json)

    # clean both series; zero rates are dropped up front for usdc only, as before
    usdc_df = _cleanStablecoinRates(raw_dfs['USDC'], 'usd_per_usdc', drop_zero_rates=True)
    usdt_df = _cleanStablecoinRates(raw_dfs['USDT'], 'usd_per_usdt')

    # merge
    macro_df = usdc_df.merge(usdt_df, on='date', how='outer', validate='one_to_one')
    macro_df = macro_df.sort_values(by='date', ignore_index=True)
    macro_df['date'] = pd.to_datetime(macro_df.date).dt.date

    return macro_df

In [7]:
def _fillShortDateGaps(asset_df: pd.DataFrame, max_gap_days: int = 31) -> pd.DataFrame:
    """
    Adds a row for every missing day inside short gaps of a single asset's daily series.

    A gap is the number of days between two consecutive observed dates; only gaps of more
    than one day and at most `max_gap_days` days are filled. Added rows carry the previous
    price forward and have zero volume and zero trades.

    Args:
        asset_df: Rows of one asset with columns `date`, `asset_id`, `usd_per_token_coinapi`,
            `usd_volume_per_24h_coinapi`, and `trades_count`.
        max_gap_days: Largest gap, in days, that is filled.

    Returns:
        A Pandas DataFrame sorted by date containing the original and the added rows.
    """
    asset_df = asset_df.sort_values(by='date', ignore_index=True)

    # days elapsed since the previous observation (NaN for the first row)
    gap_days = asset_df['date'].diff().dt.days
    one_day = pd.Timedelta(days=1)

    missing_days = []
    for prev_day, next_day, gap in zip(asset_df['date'].shift(1), asset_df['date'], gap_days):
        # comparisons with NaN are False, so the first row never qualifies
        if 1 < gap <= max_gap_days:
            missing_days.extend(pd.date_range(prev_day + one_day, next_day - one_day, freq='D'))

    if not missing_days:
        return asset_df

    # build the added rows with the same dtypes as the observed rows
    filler = pd.DataFrame({'date': pd.DatetimeIndex(missing_days)})
    filler['asset_id'] = asset_df['asset_id'].iloc[0]
    filler['usd_per_token_coinapi'] = np.full(len(filler), np.nan, dtype='float32')
    filler['usd_volume_per_24h_coinapi'] = np.zeros(len(filler), dtype='float32')
    filler['trades_count'] = np.zeros(len(filler), dtype='float32')

    filled = pd.concat([asset_df, filler], ignore_index=True)
    filled = filled.sort_values(by='date', ignore_index=True)

    # forward fill the price column
    filled['usd_per_token_coinapi'] = filled['usd_per_token_coinapi'].ffill()

    return filled


def pullMarketData(base_url: str, base_headers: dict, markets_list: list, macro_df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a panel DataFrame containing market prices, volumes, and trade counts.

    Daily bars are pulled per market, converted to USD with the usdc/usdt rates in
    `macro_df`, collapsed to one row per date and asset (volume-weighted price, summed
    volume and trades), range-checked, and then gaps of 2 to 31 days are filled per asset.

    Args:
        base_url: A string representing the base URL of the CoinAPI service.
        base_headers: A dictionary representing the headers to be sent with the API request.
        markets_list: A list of strings of market names to pull. The exchange, base asset,
            and quote asset are read from the 1st, 3rd, and 4th '_'-separated fields.
        macro_df: A Pandas DataFrame containing usdt and usdc price timeserieses, with
            columns `date`, `usd_per_usdc`, and `usd_per_usdt`.

    Returns:
        A Pandas DataFrame panel of dates and markets with their usd_per_token prices,
            usd_volume_per_24h, and trades. Columns: `date`, `asset_id`,
            `usd_per_token_coinapi`, `usd_volume_per_24h_coinapi`, `trades_coinapi`.

    Raises:
        RuntimeError: if none of the markets returned data.
    """
    # set up args
    params = {'period_id': '1DAY',
            'time_start': '2015-01-01T00:00:00',
            'time_end': '2023-02-02T00:00:00',
            'include_empty_items': True,
            'limit': 4000}
    bar_cols = ['time_period_end', 'price_close', 'volume_traded', 'trades_count']

    # pull all markets; frames are collected and concatenated once after the loop
    market_frames = []
    num_markets = len(markets_list)
    for i, market in enumerate(markets_list):
        # monitor progress
        print(f"Processing market #{i+1} ({(i+1)/num_markets*100:.2f}%): {market}")

        # make the call
        url = f"{base_url}ohlcv/{market}/history"
        response_json = makeApiCall(url, headers=base_headers.copy(), params=params)

        # a payload that is not a list of records cannot be framed; treat it as no data
        try:
            market_df = pd.DataFrame(response_json)
        except (ValueError, TypeError):
            market_df = pd.DataFrame()

        # catch if there is no data
        if market_df.empty or not set(bar_cols).issubset(market_df.columns):
            print(f"{market} did not have data")
            continue

        # clean the market_df
        market_df = market_df[bar_cols].copy()
        market_df.insert(0, 'symbol_id', market)
        market_frames.append(market_df)

    if not market_frames:
        raise RuntimeError("None of the requested markets returned data.")
    df = pd.concat(market_frames, ignore_index=True)

    # remove asset-dates where there is a missing price and zero volume
    is_empty_bar = df.price_close.isnull() & (df.volume_traded == 0) & (df.trades_count == 0)
    df = df[~is_empty_bar].copy()

    # extract names of exchange, base asset, and quote asset
    symbol_parts = df['symbol_id'].str.split('_', n=4, expand=True)
    df['exchange'] = symbol_parts[0]
    df['asset_id'] = symbol_parts[2]
    df['quote_id'] = symbol_parts[3]

    # form the date column; the API sends full ISO timestamps, so no `format` is passed
    df['date'] = pd.to_datetime(df.time_period_end, utc=True).dt.tz_localize(None).dt.normalize()
    df = df.drop(columns='time_period_end')

    # merge on usdt and usdc prices; the date key is normalized to timestamps on both sides
    rates_df = macro_df.assign(date=pd.to_datetime(macro_df['date']))
    df = df.merge(rates_df, on='date', how='left', validate='many_to_one')

    # form the price column: USD quotes are taken as is, stablecoin quotes are converted
    df['usd_per_token_coinapi'] = np.nan
    for quote_id, rate_col in (('USD', None), ('USDC', 'usd_per_usdc'), ('USDT', 'usd_per_usdt')):
        rows = df.quote_id == quote_id
        usd_per_quote = 1.0 if rate_col is None else df.loc[rows, rate_col]
        df.loc[rows, 'usd_per_token_coinapi'] = df.loc[rows, 'price_close'] * usd_per_quote
    assert 0 == df.usd_per_token_coinapi.isnull().sum(), "some bars could not be priced in USD"

    # form volume column
    df['usd_volume_per_24h_coinapi'] = df.volume_traded * df.usd_per_token_coinapi

    # collapse to the asset date level; the volume-weighted price is
    # sum(price * volume) / sum(volume) within each date-asset group
    df['usd_price_x_volume'] = df.usd_per_token_coinapi * df.usd_volume_per_24h_coinapi
    totals = df.groupby(['date', 'asset_id'])[
        ['usd_price_x_volume', 'usd_volume_per_24h_coinapi', 'trades_count']].sum()
    df = pd.DataFrame({
        'usd_per_token_coinapi': totals['usd_price_x_volume'] / totals['usd_volume_per_24h_coinapi'],
        'usd_volume_per_24h_coinapi': totals['usd_volume_per_24h_coinapi'],
        'trades_count': totals['trades_count']}).reset_index()

    # check for valid ranges (this also drops groups whose price is NaN from zero volume)
    in_range = (
        (df['usd_per_token_coinapi'] > 0) & (df['usd_per_token_coinapi'] < 1e6)
        & (df['usd_volume_per_24h_coinapi'] > 0) & (df['usd_volume_per_24h_coinapi'] < 1e9)
        & (df['trades_count'] > 0) & (df['trades_count'] < 1e9)
    )
    df = df[in_range].copy()

    # ensure dtypes are set
    df = df.astype({'usd_per_token_coinapi': 'float32',
                    'usd_volume_per_24h_coinapi': 'float32',
                    'trades_count': 'float32'})
    df['date'] = pd.to_datetime(df.date)

    # ensure panel is sorted
    df = df.sort_values(by=['date', 'asset_id'], ignore_index=True)

    # loop over all assets to add missing days
    filled_frames = [_fillShortDateGaps(asset_df) for _, asset_df in df.groupby('asset_id', sort=True)]
    if filled_frames:
        df = pd.concat(filled_frames, ignore_index=True)

    # replace any missing volume and trades with zeros
    df['usd_volume_per_24h_coinapi'] = df['usd_volume_per_24h_coinapi'].fillna(0)
    df['trades_count'] = df['trades_count'].fillna(0)

    # final clean
    df = df.rename(columns={'trades_count': 'trades_coinapi'})
    df = df.sort_values(by=['date', 'asset_id'], ignore_index=True)
    assert not df.duplicated(subset=['date', 'asset_id']).any()

    return df

In [8]:
def pullExchangeRates(base_url: str, base_headers: dict, asset_ids: list[str]) -> pd.DataFrame:
    """
    Returns a panel DataFrame containing exchange rates.

    Args:
        base_url: A string representing the base URL of the CoinAPI service.
        base_headers: A dictionary representing the headers to be sent with the API request.
        asset_ids: A list of strings of the asset ids to pull exchange rates for.

    Returns:
        A Pandas DataFrame panel of dates and assets with their clean exchange rate from coinapi.
        Columns are `date` (datetime.date objects), `asset_id`, and `usd_per_token_ref`,
        sorted by date and asset. An empty `asset_ids` yields an empty frame with these columns.

    Raises:
        AssertionError: if no usable exchange rate rows were pulled for an asset.
    """
    # set api args
    headers = base_headers.copy()
    params = {'period_id': '1DAY',
                'time_start': '2015-01-01T00:00:00',
                'time_end': '2023-02-02T00:00:00',
                'limit': 5000}
    out_cols = ['date', 'asset_id', 'usd_per_token_ref']

    # pull for all assets; frames are collected and concatenated once after the loop
    asset_frames = []
    num_assets = len(asset_ids)
    for i, asset_id in enumerate(asset_ids):
        # monitor progress
        print(f"Processing asset #{i+1} ({(i+1)/num_assets*100:.2f}%): {asset_id}")

        # make the call
        url = f"{base_url}exchangerate/{asset_id}/USD/history"
        response_json = makeApiCall(url, headers=headers, params=params)
        asset_df = pd.DataFrame(response_json)

        # ensure i pulled data, before any column of the response is touched
        assert 0 < asset_df.shape[0], f"No exchange rate data was returned for {asset_id}"

        # clean the df; copy so the assignments below do not write to a slice
        asset_df = asset_df[asset_df.time_period_end != '0001-01-01T00:00:00.0000000Z'].copy()
        assert 0 < asset_df.shape[0], f"No usable exchange rate rows were returned for {asset_id}"
        asset_df['date'] = pd.to_datetime(asset_df.time_period_end).dt.date
        asset_df['usd_per_token_ref'] = asset_df.rate_close
        asset_df['asset_id'] = asset_id
        asset_frames.append(asset_df[out_cols])

    # nothing was requested
    if not asset_frames:
        return pd.DataFrame(columns=out_cols)

    ref_df = pd.concat(asset_frames)
    return ref_df.sort_values(by=['date', 'asset_id'], ignore_index=True)

In [None]:
if __name__ == "__main__":
    # Driver: reads the API key, checks the API responds, then pulls the asset, exchange,
    # and market listings, the stablecoin rates, the market panel, and the reference
    # exchange rates. The stablecoin rates and the panel are pickled to disk.

    # import api key (first line of the key file)
    API_KEY_FP = '../../admin/coinapi.txt'
    with open(API_KEY_FP) as f:
        API_KEY = f.readline().strip()
    if not API_KEY:
        raise ValueError(f"No API key found on the first line of {API_KEY_FP}")

    # set args
    PANEL_FP = '../data/raw/coinapi_panel.pkl'
    MACRO_FP = '../data/raw/coinapi_macro.pkl'
    BASE_URL   = 'https://rest.coinapi.io/v1/'
    BASE_HEADERS = {'X-CoinAPI-Key': API_KEY}
    LEGIT_US_EXCHANGES = ['BINANCEUS', 'BITSTAMP', 'COINBASE', 'CRYPTOCOM', 'FTXUS', 
        'GEMINI', 'KRAKEN', 'KUCOIN', 'OKCOINUSD']

    # confirm api is working; the timeout keeps the cell from hanging on an unresponsive server
    url = 'https://www.coinapi.io/api/subscriptions/usage/rest/history'
    response = requests.get(url, headers=BASE_HEADERS, timeout=5)
    print(response.json())

    # pull initial asset universe
    asset_info_df = pullAssetInfo(BASE_URL, BASE_HEADERS)

    # pull exchange info
    exchanges_df = pullExchangeInfo(BASE_URL, BASE_HEADERS, LEGIT_US_EXCHANGES)

    # pull relevant markets
    symbols_df = pullMarketInfo(BASE_URL, BASE_HEADERS, LEGIT_US_EXCHANGES)

    # pull usdt and usdc exchange rates
    macro_df = pullUSDTandUSDCexchangeRates(BASE_URL, BASE_HEADERS)
    macro_df.to_pickle(MACRO_FP)

    # pull market data; each market must appear once so no market is pulled twice
    assert symbols_df.symbol_id.is_unique
    markets_list = symbols_df.symbol_id.values
    panel_df = pullMarketData(BASE_URL, BASE_HEADERS, markets_list, macro_df)
    panel_df.to_pickle(PANEL_FP)

    # pull exchange rates for every asset that made it into the panel
    asset_ids = list(np.unique(panel_df.asset_id.values))
    ref_df = pullExchangeRates(BASE_URL, BASE_HEADERS, asset_ids)



In [None]:
# Compare the reference exchange rates (ref_df) with the market-derived prices (panel_df):
# for each asset, report dates whose two prices differ by more than 50% and dates that
# have a reference rate but no market price.

# confirm dates are the same type
ref_df['date'] = pd.to_datetime(ref_df['date'])

# merge exchange rate and market prices together
df = ref_df.merge(panel_df, on=['date', 'asset_id'], how='outer', validate='one_to_one')
df = df.sort_values(by=['date', 'asset_id'], ignore_index=True)

# form list of assets
asset_ids = list(np.unique(df.asset_id.values))

# loop over assets to find dates that are missing
for i, asset_id in enumerate(asset_ids):
    # monitor progress
    print(f"Processing asset #{i+1} ({(i+1)/len(asset_ids)*100:.2f}%): {asset_id}")

    # subset to this asset
    asset_df = df[df.asset_id==asset_id]

    # find asset dates that matched; copy so the new column is not written to a slice of df
    has_both_prices = asset_df.usd_per_token_coinapi.notnull() & asset_df.usd_per_token_ref.notnull()
    matched_df = asset_df[has_both_prices].copy()
    matched_df['prct_diff'] = np.abs((matched_df.usd_per_token_coinapi-matched_df.usd_per_token_ref)/matched_df.usd_per_token_ref)

    # report matched dates with different prices
    print('These are prices that are off:')
    print(matched_df[matched_df.prct_diff > 0.5][['date', 'asset_id', 'usd_per_token_ref', 'usd_per_token_coinapi']])

    # report dates missing from my data
    miss_df = asset_df[asset_df.usd_per_token_coinapi.isnull()]
    missing_dates = miss_df.date.values
    print(f"I am missing {len(missing_dates)} days in my data as compared to ref exchange rates. Specifically:")
    print(missing_dates)
    print("\n\n\n")

    # space out calls
    # NOTE(review): no API call is made inside this loop, so this pause only slows the
    # report -- confirm whether it is still needed.
    time.sleep(3)

In [None]:
# TODO: make a task for tomorrow to re-run the coinapi notebook to ensure it works and to confirm [note is cut off here -- state what needs confirming]