In [1]:
import pandas as pd
import numpy as np
import time
from typing import Any, Dict, Optional
import requests

In [2]:
def makeApiCall(url: str, headers: dict, params: Optional[dict] = None, retries: int = 4) -> Optional[Dict[str, Any]]:
    """
    Makes a GET request to the given endpoint, retrying with a back-off on failure.

    Args:
    - url (str): string representing the URL for the API.
    - headers (dict): dictionary containing the headers for the API call.
    - params (dict): dictionary containing the query parameters for the API call;
      None (the default) sends no query parameters.
    - retries (int): total number of attempts to make before giving up.

    Returns:
    - response: the parsed JSON body of the first successful response (whatever
      `response.json()` yields for the endpoint), or None if every attempt failed.
    """
    # Seconds to wait after the 1st, 2nd, 3rd, ... failed attempt. The last
    # value is reused when `retries` exceeds the length of the schedule.
    backoff_delays = (1, 10, 30)

    # Avoid a shared mutable default argument.
    query = {} if params is None else params

    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, params=query, timeout=5)
            # Raises for 4xx/5xx responses, so reaching the next line means success.
            response.raise_for_status()
            return response.json()
        except requests.exceptions.Timeout:
            print('The API call timed out, retrying...')
        except requests.exceptions.RequestException as e:
            print(f'The API call failed with error: {str(e)}, retrying...')

        # Back off before the next attempt; there is nothing to wait for after the last one.
        if attempt < retries - 1:
            time.sleep(backoff_delays[min(attempt, len(backoff_delays) - 1)])

    print(f'The api call failed after {retries} attempts.')
    return None

In [163]:
if __name__ == "__main__":
    # Read the API key from the first line of a local text file.
    API_KEY_FP = '../../admin/coinapi.txt'
    with open(API_KEY_FP) as f:
        API_KEY = f.readline().strip()
    if not API_KEY:
        # Fail here with a clear message instead of sending an empty key to the API.
        raise ValueError(f'No API key found on the first line of {API_KEY_FP}')

    # set args
    BASE_URL   = 'https://rest.coinapi.io/v1/'
    BASE_HEADERS = {'X-CoinAPI-Key': API_KEY}
    LEGIT_US_EXCHANGES = ['BINANCEUS', 'BITSTAMP', 'COINBASE', 'CRYPTOCOM', 'FTXUS',
                          'GEMINI', 'KRAKEN', 'KUCOIN', 'OKCOINUSD']

    # confirm api is working
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() surfaces a rejected key here rather than in a later cell.
    url = 'https://www.coinapi.io/api/subscriptions/usage/rest/history'
    response = requests.get(url, headers=BASE_HEADERS, timeout=5)
    response.raise_for_status()
    print(response.json())
    

[{'date': '2023-03-07', 'requests': 1, 'apicalls': 1}, {'date': '2023-03-06', 'requests': 65646, 'apicalls': 2194}]


In [191]:
# TODO GET ALL ASSET ICONS

# fetch the full asset listing from the API
target_url = 'assets'
url = f"{BASE_URL}{target_url}"
response_json = makeApiCall(url, headers=BASE_HEADERS)
df = pd.DataFrame(response_json)

# parse the coverage dates so they can be subtracted and compared
for date_col in ('data_start', 'data_end'):
    df[date_col] = pd.to_datetime(df[date_col])
df['duration_days'] = (df['data_end'] - df['data_start']).dt.days

# keep crypto assets only
is_crypto = df['type_is_crypto'] == 1
df = df.loc[is_crypto]

# keep assets that report both a start and an end date for their data
has_coverage = df['data_start'].notnull() & df['data_end'].notnull()
df = df.loc[has_coverage]

# keep assets with more than 120 days (about four months) of history
df = df.loc[df['duration_days'] > 120]

# keep assets whose data starts on or before 2022-09-01
df = df.loc[df['data_start'] <= '2022-09-01']

# store the filtered listing under its own name
assets_df = df.copy()

In [190]:
# TODO figure out all legit exchanges

# fetch the exchange listing and keep only the exchanges named in LEGIT_US_EXCHANGES
target_url = 'exchanges'
url = f"{BASE_URL}{target_url}"
response_json = makeApiCall(url, headers=BASE_HEADERS)
exchanges_df = pd.DataFrame(response_json)
exchanges_df = exchanges_df.loc[exchanges_df['exchange_id'].isin(LEGIT_US_EXCHANGES)]


In [296]:
# TODO determine all markets on legit exchanges with USD or stablecoin quote

# fetch every market (symbol) known to the API
target_url = 'symbols'
url = f"{BASE_URL}{target_url}"
response_json = makeApiCall(url, headers=BASE_HEADERS)
symbols_df = pd.DataFrame(response_json)

# keep markets on the exchanges of interest
symbols_df = symbols_df.loc[symbols_df['exchange_id'].isin(LEGIT_US_EXCHANGES)]

# keep spot markets quoted in USD, USDC or USDT
symbols_df = symbols_df.loc[symbols_df['symbol_type'] == 'SPOT']
symbols_df = symbols_df.loc[symbols_df['asset_id_quote'].isin(['USD', 'USDC', 'USDT'])]

# parse the coverage dates and measure the covered span in days
symbols_df = symbols_df.assign(
    data_start=pd.to_datetime(symbols_df['data_start']),
    data_end=pd.to_datetime(symbols_df['data_end']),
)
symbols_df = symbols_df.assign(
    duration_days=(symbols_df['data_end'] - symbols_df['data_start']).dt.days
)

# keep markets that have data, more than 120 days of it, starting on or before 2022-09-01
has_coverage = symbols_df['data_start'].notnull() & symbols_df['data_end'].notnull()
symbols_df = symbols_df.loc[has_coverage]
symbols_df = symbols_df.loc[symbols_df['duration_days'] > 120]
symbols_df = symbols_df.loc[symbols_df['data_start'] <= '2022-09-01']

# remove symbols that are derivatives of other symbols
excluded_base_groups = (
    ['WBTC', 'WLUNA', 'WNXM', 'TBTC'],
    ['CUSD', 'MUSD', 'NUSD', 'DAI', 'BUSD', 'CUSDT',
     'GUSD', 'LUSD', 'OUSD', 'USDJ', 'USDK', 'USDN', 'USDT', 'USDC'],
    ['AOA', 'AUSD', 'ERN', 'KRW', 'MTL', 'TUSD',
     'SUSD', 'USDD', 'UST', 'USTC', 'EUR', 'AUD', 'GBP', 'CAD', 'CBETH', 'LBP', 'SOS'],
)
for excluded_bases in excluded_base_groups:
    symbols_df = symbols_df.loc[~symbols_df['asset_id_base'].isin(excluded_bases)]

# drop base assets whose id contains '3L' or '3S'
symbols_df = symbols_df.loc[~symbols_df['asset_id_base'].str.contains('3L|3S')]

# drop USDT/USDC and USDC/USDT pairs; USDT and USDC bases were already excluded
# above, so this is not expected to remove any further rows
is_usdt_in_usdc = (symbols_df['asset_id_base'] == 'USDT') & (symbols_df['asset_id_quote'] == 'USDC')
symbols_df = symbols_df.loc[~is_usdt_in_usdc]
is_usdc_in_usdt = (symbols_df['asset_id_quote'] == 'USDT') & (symbols_df['asset_id_base'] == 'USDC')
symbols_df = symbols_df.loc[~is_usdc_in_usdt]

In [None]:
# TODO pull USDT and USDC exchange rates

def _pull_usd_rate_history(asset_id: str, params: dict) -> pd.DataFrame:
    """Request the exchange-rate history of `asset_id` against USD and return it as a DataFrame.

    Raises RuntimeError when the API call fails, so the failure is reported here
    rather than as a missing-column error further down.
    """
    url = f"{BASE_URL}exchangerate/{asset_id}/USD/history"
    response_json = makeApiCall(url, headers=BASE_HEADERS, params=params)
    if response_json is None:
        raise RuntimeError(f'Could not pull the {asset_id}/USD exchange rate history')
    return pd.DataFrame(response_json)


def _clean_stablecoin_rates(raw_df: pd.DataFrame, rate_col: str) -> pd.DataFrame:
    """Turn a raw exchange-rate history into a gap-free daily series.

    Args:
    - raw_df (pd.DataFrame): frame with `time_period_end` and `rate_close` columns.
    - rate_col (str): name to give the cleaned rate column.

    Returns:
    - pd.DataFrame with a `date` column (one row per calendar day between the
      first and last observed date) and `rate_col`, where rates outside
      [0.8, 2] are treated as missing and missing days are forward filled.

    Raises:
    - ValueError if no usable rows remain after filtering.
    - AssertionError if a rate is still missing after forward filling (i.e. the
      first day of the range has no valid rate).
    """
    # drop zero rates and the API's placeholder timestamp
    valid = raw_df[(raw_df['rate_close'] != 0)
                   & (raw_df['time_period_end'] != '0001-01-01T00:00:00.0000000Z')]
    if valid.empty:
        raise ValueError(f'No usable exchange rate rows for {rate_col}')

    # Parse only the leading YYYY-MM-DD of the timestamp: passing the full ISO
    # string together with format='%Y-%m-%d' is rejected by pandas >= 2.
    clean_df = pd.DataFrame({
        'date': pd.to_datetime(valid['time_period_end'].str[:10], format='%Y-%m-%d'),
        rate_col: valid['rate_close'],
    }).set_index('date')

    # one row per calendar day between the first and last observation
    full_range = pd.date_range(start=clean_df.index.min(), end=clean_df.index.max(), freq='D')
    clean_df = clean_df.reindex(full_range)

    # treat implausible rates as missing, then carry the last valid rate forward
    out_of_range = (clean_df[rate_col] > 2) | (clean_df[rate_col] < 0.8)
    clean_df.loc[out_of_range, rate_col] = np.nan
    clean_df[rate_col] = clean_df[rate_col].ffill()
    assert 0 == clean_df[rate_col].isnull().sum()

    return clean_df.rename_axis('date').reset_index()


# the same request parameters are used for both stablecoins
params = {'period_id': '1DAY',
    'time_start': '2015-01-01',
    'time_end': '2023-02-02',
    'limit': 5000}

# pull and clean tether and usdc with the same rules
usdt_df = _clean_stablecoin_rates(_pull_usd_rate_history('USDT', params), 'usd_per_usdt')
usdc_df = _clean_stablecoin_rates(_pull_usd_rate_history('USDC', params), 'usd_per_usdc')

# merge
macro_df = usdc_df.merge(usdt_df, on='date', how='outer', validate='one_to_one')
macro_df = macro_df.sort_values(by='date', ignore_index=True)
# store plain `date` objects; the results cell merges on a `.dt.date` column
macro_df['date'] = pd.to_datetime(macro_df.date).dt.date

In [None]:
# TODO pull data for all markets

# form markets to pull
assert symbols_df.symbol_id.is_unique
markets_list = symbols_df.symbol_id.values

# the request parameters are identical for every market, so build them once
params = {'period_id': '1DAY',
        'time_start': '2015-01-01T00:00:00',
        'time_end': '2023-02-02T00:00:00',
        'include_empty_items': True,
        'limit': 4000}
ohlcv_cols = ['symbol_id', 'time_period_end', 'price_close', 'volume_traded', 'trades_count']

# collect one frame per market and concatenate once at the end; concatenating
# inside the loop re-copies everything pulled so far on every iteration
market_frames = []
failed_markets = []

# pull all markets
for i, market in enumerate(markets_list):
    # monitor progress
    print(f"Processing market #{i+1} ({(i+1)/len(markets_list)*100:.2f}%): {market}")

    # make the call
    url = f"{BASE_URL}ohlcv/{market}/history"
    response_json = makeApiCall(url, headers=BASE_HEADERS, params=params)

    # a failed or empty response must not abort the whole pull; record it and move on
    if not response_json:
        failed_markets.append(market)
        continue

    # clean the df; reindex (rather than column selection) so a response that
    # lacks one of the columns yields NaN instead of raising KeyError
    df = pd.DataFrame(response_json)
    df['symbol_id'] = market
    market_frames.append(df.reindex(columns=ohlcv_cols))

# save data
results_df = pd.concat(market_frames) if market_frames else pd.DataFrame(columns=ohlcv_cols)

if failed_markets:
    print(f"No data returned for {len(failed_markets)} market(s): {failed_markets}")

In [306]:
# write the raw pull to disk so it can be recovered without calling the API again
backup_fp = 'temp_backup.csv'
results_df.to_csv(backup_fp)

In [None]:
# TODO clean the results

# remove asset-dates where there is a missing price and zero volume
is_empty_day = (results_df.price_close.isnull()
                & (results_df.volume_traded==0)
                & (results_df.trades_count==0))
# copy so the column assignments below act on an independent frame, not a slice
results_df = results_df[~is_empty_day].copy()

# extract names of exchange, base asset, and quote asset (split the id once)
symbol_parts = results_df['symbol_id'].str.split('_', n=4, expand=True)
results_df['exchange'] = symbol_parts[0]
results_df['asset_id'] = symbol_parts[2]
results_df['quote_id'] = symbol_parts[3]

# form the date column
# Parse only the leading YYYY-MM-DD of the timestamp: passing the full ISO
# string together with format='%Y-%m-%d' is rejected by pandas >= 2.
results_df['date'] = pd.to_datetime(results_df['time_period_end'].str[:10], format='%Y-%m-%d').dt.date
results_df = results_df.drop(columns='time_period_end')

# merge on usdt and usdc prices
results_df = results_df.merge(macro_df, on='date', how='left', validate='many_to_one')
results_df['date'] = pd.to_datetime(results_df['date'])

# form the price column: convert the quoted close into USD using the quote asset's USD rate
usd_per_quote = np.select(
    [results_df.quote_id=='USD', results_df.quote_id=='USDC', results_df.quote_id=='USDT'],
    [1.0, results_df.usd_per_usdc, results_df.usd_per_usdt],
    default=np.nan,
)
results_df['usd_per_token_coinapi'] = results_df.price_close * usd_per_quote
assert 0 == results_df.usd_per_token_coinapi.isnull().sum()

# form volume column
results_df['usd_volume_per_24h_coinapi'] = results_df.volume_traded*results_df.usd_per_token_coinapi

# collapse to the asset date level
# The volume-weighted price is sum(price * volume) / sum(volume); computing it
# from grouped sums avoids a Python-level apply over every (date, asset) group.
group_sums = (
    results_df
    .assign(price_x_volume=results_df.usd_per_token_coinapi * results_df.usd_volume_per_24h_coinapi)
    .groupby(['date', 'asset_id'])[['price_x_volume', 'usd_volume_per_24h_coinapi', 'trades_count']]
    .sum()
)
panel_df = pd.DataFrame({
    'usd_per_token_coinapi': group_sums['price_x_volume'] / group_sums['usd_volume_per_24h_coinapi'],
    'usd_volume_per_24h_coinapi': group_sums['usd_volume_per_24h_coinapi'],
    'trades_count': group_sums['trades_count'],
}).reset_index()
panel_df['date'] = pd.to_datetime(panel_df.date)

# check for valid ranges and dtypes
# (groups with zero total volume have an undefined weighted price and are dropped here)
panel_df = panel_df[(panel_df['usd_per_token_coinapi'] > 0) & (panel_df['usd_per_token_coinapi'] < 1e6)]
panel_df = panel_df[(panel_df['usd_volume_per_24h_coinapi'] > 0) & (panel_df['usd_volume_per_24h_coinapi'] < 1e9)]
panel_df = panel_df[(panel_df['trades_count'] > 0) & (panel_df['trades_count'] < 1e6)]

# ensure dtypes are set
panel_df = panel_df.astype({'usd_per_token_coinapi': 'float32',
                            'usd_volume_per_24h_coinapi': 'float32',
                            'trades_count': 'float32'})

In [454]:
# order the panel by date, then asset, with a fresh 0..n-1 index
panel_df = panel_df.sort_values(['date', 'asset_id']).reset_index(drop=True)
# make sure the date column holds pandas datetimes
panel_df = panel_df.assign(date=pd.to_datetime(panel_df['date']))

In [447]:
# initialize a new df
df = pd.DataFrame(data={'date': [], 'asset_id': [], 'usd_per_token_coinapi': [], 'usd_volume_per_24h_coinapi': [], 'trades_count': []})

# ensure panel is sorted
panel_df = panel_df.sort_values(by=['date', 'asset_id'], ignore_index=True)

# gaps (in days) between consecutive observations, kept per asset
date_gaps_by_asset = {}
# `date_gaps` always holds the gaps of the most recently processed asset
date_gaps = []

# loop over all assets
assets = list(np.unique(panel_df.asset_id.values))
for asset in assets:
    # subset to asset of interest
    asset_df = panel_df[panel_df.asset_id==asset].copy()

    # determine the date gaps
    # Series.diff keeps pandas Timedeltas, which expose `.dt.days`; subtracting
    # entries of `.values` yields numpy.timedelta64, which has no `.days`.
    gap_days = pd.to_datetime(asset_df['date']).diff().dt.days.dropna()
    date_gaps = gap_days.astype(int).tolist()
    date_gaps_by_asset[asset] = date_gaps

In [448]:
# point `asset` at the first entry of `assets` (presumably to inspect a single asset by hand)
asset = assets[0]

In [None]:
# for each date gap above 1 and below 32, 
# grab the start date
# add to a new array the number of days after corresponding to diff array
# add these days to the asset df
# sort
# forward fill the price column
# set volume and trades to zero

In [460]:
# tabulate how often each gap length occurs in `date_gaps`
pd.DataFrame(data=date_gaps).value_counts()

1    422
2      4
4      1
5      1
dtype: int64

In [287]:
# TODO for all assets, go pull exchange rates
# check that i am not missing tons of asset-dates between my panel and these exchange rates
# check also that my prices are not way off

# request the daily USDT/USD exchange-rate history
asset_id = 'USDT'
params = dict(
    period_id='1DAY',
    time_start='2015-01-01',
    time_end='2023-02-02',
    limit=5000,
)
url = BASE_URL + 'exchangerate/' + asset_id + '/USD/history'
headers = BASE_HEADERS.copy()
response_json = makeApiCall(url, headers=BASE_HEADERS, params=params)
# one row per returned rate record
usdt_df = pd.DataFrame(response_json)

# try the asset and USD
# if that is not available, try the asset against USDC; then USDT; then BTC; then ETH; otherwise just report that no rate is available


In [4]:
# TODO pull best ask and bid from those markets to obtain volume weighted spread

# quotes-history endpoint for one example market; nothing in this cell requests it yet
url = 'https://rest.coinapi.io/v1/quotes/BITSTAMP_SPOT_BTC_USD/history?time_start=2016-01-01T00:00:00'

# placeholder values; nothing in this cell reads them yet
time_start, time_end = 1, 2
limit = 50_000

In [None]:
# TODO make task to pull hourly data