In [125]:
# Import packages
import pandas as pd
import numpy as np
import requests
import time
from datetime import date
from requests import Request, Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
import json
from dateutil.relativedelta import relativedelta
import json.decoder
from typing import Dict, Any, Optional

In [118]:
def initiateAPI(base_url: str) -> Session:
    """ open a session carrying the cmc api key and print the key-info response.

    Args:
        base_url (str): the url for the pro api at cmc.

    Returns:
        session (requests.Session): session with the api-key headers set, for reuse on later calls.
    """
    # API_KEY is read from module scope, so it has to be defined before this is called
    auth_headers = {'Accepts': 'application/json',
                    'X-CMC_PRO_API_KEY': API_KEY}

    session = Session()
    session.headers.update(auth_headers)

    # ping the key-info endpoint and echo the decoded payload
    key_info_response = session.get(base_url + '/v1/key/info')
    print(key_info_response.json())

    return session


In [126]:
def makeCMCApiCall(session: "Session", url: str, params: dict, retries: int=3) -> Optional[Dict[str, Any]]:
    """ makes an API call to CoinMarketCap using the provided requests.Session object.

    An attempt counts as failed when the response status is not ok, when the body is not
    valid JSON, or when the decoded body has no 'data' entry. Every failed attempt except
    the last one is followed by a short pause before retrying.

    Args:
        session (requests.Session): A requests.Session object that will be used to make the API call.
        url (str): The API endpoint URL to call.
        params (dict): A dictionary of parameters to include in the API call.
        retries (int): The total number of attempts to make before giving up. Default is 3.

    Returns:
        data (dict): the 'data' entry of the api response, or None if every attempt failed.
    """
    retry_delay_seconds = 0.5

    for attempt in range(1, retries + 1):
        response = session.get(url, params=params)
        if response.ok:
            try:
                return response.json()['data']
            except ValueError as e:
                # JSONDecodeError is a ValueError subclass, so this also covers
                # decoders other than the stdlib json module
                print(f'Error decoding JSON response: {str(e)}')
            except (KeyError, TypeError):
                print("The API response did not contain a 'data' entry, retrying...")
        else:
            print(f'The API call failed with status code {response.status_code}, retrying...')

        # pause between attempts, but do not waste time after the final one
        if attempt < retries:
            time.sleep(retry_delay_seconds)

    print(f'The api call failed after {retries} attempts.')
    return None

In [120]:
def obtainTopCMCTokens(base_url: str, session: Session, start_date: date, end_date: date,
                       ids_to_drop: Optional[list] = None) -> list:
    """ obtain the top cmc tokens for each month of the study period.

    One historical listing is requested per month, starting at start_date. The number of
    tokens requested per snapshot depends on the snapshot year (200 before 2017, 400 before
    2020, 650 afterwards). Snapshots whose api call fails are skipped with a printed warning.

    Args:
        base_url (str): The url for the pro api at cmc.
        session (Session): A requests.Session object that will be used to make the API call.
        start_date (datetime.date): start of study period.
        end_date (datetime.date): end of study period.
        ids_to_drop (list, optional): cmc ids to exclude from the result. Default is None,
                                      which drops nothing.

    Returns:
        unique_token_cmc_ids (list): sorted unique cmc token integer ids.
    """
    # specify the dates to obtain; each offset is taken from start_date so that a
    # late-month start does not drift after passing through a shorter month
    snapshot_dates = [start_date]
    months_ahead = 1
    while start_date + relativedelta(months=months_ahead) <= end_date:
        snapshot_dates.append(start_date + relativedelta(months=months_ahead))
        months_ahead += 1

    # set up target url
    endpoint = '/v1/cryptocurrency/listings/historical'
    url = f"{base_url}{endpoint}"

    # obtain the top tokens by cmc ranking for each month in the study period
    token_cmc_ids = []
    for snapshot_date in snapshot_dates:
        # request more tokens for later years
        if snapshot_date.year < 2017:
            limit = 200
        elif snapshot_date.year < 2020:
            limit = 400
        else:
            limit = 650
        params = {'date': snapshot_date,
                  'limit': limit,
                  'convert': 'USD',
                  'aux': 'cmc_rank'}

        # make the call
        data = makeCMCApiCall(session, url, params)

        # extract the token ids; a failed call returns None and is skipped
        if data is None:
            print(f'Skipping the listing for {snapshot_date}: the api call failed.')
        else:
            token_cmc_ids.extend(token['id'] for token in data)

        # space out calls
        time.sleep(0.5)

    # drop redundant tokens and any ids the caller asked to exclude
    excluded_ids = set(ids_to_drop or [])
    unique_token_cmc_ids = [cmc_id for cmc_id in sorted(set(token_cmc_ids))
                            if cmc_id not in excluded_ids]

    return unique_token_cmc_ids


In [121]:
def formDataframeOfTopCMCTokens(base_url: str, session: Session, cmc_ids: list) -> pd.DataFrame:
    """ pull all cmc meta data for tokens and merge onto universe of top tokens in cmc_ids.

    The cmc id map is read in four pages of 5000 entries. A page whose api call fails is
    skipped with a printed warning.

    Args:
        base_url (str): the url for the pro api at cmc.
        session (Session): A requests.Session object that will be used to make the API call.
        cmc_ids (list): top tokens by cmc ranking.

    Returns:
        token_df (pd.DataFrame): dataframe of token meta data for top tokens by cmc ranking,
                                 sorted by 'cmc_id'.

    Raises:
        RuntimeError: if no page of the id map could be retrieved.
    """

    # set up target url for obtaining mapping from id to token info
    endpoint = '/v1/cryptocurrency/map'
    url = f"{base_url}{endpoint}"

    # obtain the CMC mapping of IDs to token info
    full_data = []
    starts = [1, 5001, 10001, 15001]
    for start in starts:
        # set up params for call
        params = {'listing_status': 'active,inactive,untracked',
                  'limit': 5000,
                  'start': start,
                  'aux': 'platform,first_historical_data,last_historical_data'}

        # make the call
        data = makeCMCApiCall(session, url, params)

        # append the results; a failed call returns None and is skipped
        if data is None:
            print(f'Skipping the id map page starting at {start}: the api call failed.')
        else:
            full_data.extend(data)

        # space out calls
        time.sleep(0.5)

    if not full_data:
        raise RuntimeError('No token metadata could be pulled from the cmc id map.')

    # clean up token info dictionaries; optional fields fall back to None when absent
    clean_full_data = []
    for token_dict in full_data:
        platform = token_dict.get('platform')
        clean_full_data.append({
            'cmc_id': token_dict['id'],
            'cmc_symbol': token_dict['symbol'],
            'name': token_dict['name'],
            'cmc_slug': token_dict['slug'],
            'cmc_first_date': token_dict.get('first_historical_data'),
            'cmc_last_date': token_dict.get('last_historical_data'),
            'platform_cmc_slug': platform['slug'] if platform is not None else None,
        })

    cmc_tokens_df = pd.DataFrame(clean_full_data)

    # Merge down to just the tokens of interest
    target_tokens_df = pd.DataFrame(data = {'cmc_id': cmc_ids})
    token_df = cmc_tokens_df.merge(target_tokens_df,
                                on='cmc_id',
                                how='inner',
                                validate='one_to_one')

    # reset index and sort
    token_df = token_df.sort_values(by='cmc_id', ignore_index=True)

    return token_df

In [122]:
def pullPriceMcapVolume(base_url: str, session: "Session", 
        token_df: pd.DataFrame, start_date: date, end_date: date) -> pd.DataFrame:
    """ pulls historical price, volume, and mcap data for token ids in token_df.

    One api call is made per token. Tokens whose call fails, and tokens the api flags as
    fiat, contribute no rows.

    Args:
        base_url (str): The base URL for the CoinMarketCap API.
        session (requests.Session): A requests.Session object to be used to make the API calls.
        token_df (pd.DataFrame): A pandas DataFrame that contains information about the tokens to 
                                 retrieve data for. Must include columns 'cmc_id' and 'cmc_slug'.
        start_date (date): A datetime.date object representing the start date for the study period.
        end_date (date): A datetime.date object representing the end date for the study period.
    Returns:
        df (pd.DataFrame): price, volume, and mcap for target tokens within specified date range. 
                           The DataFrame has columns 'date', 'usd_per_token', 'usd_volume_24h',
                           'usd_mcap', and 'cmc_id'. It is empty (with those columns) when no
                           token returned any data.
    """
    quote_columns = ['date', 'usd_per_token', 'usd_volume_24h', 'usd_mcap']

    # initialize list to build
    per_token_frames = []

    # set up target url
    endpoint = '/v1/cryptocurrency/quotes/historical'
    url = f"{base_url}{endpoint}"

    # loop over tokens
    token_ids = list(token_df.cmc_id.values)
    token_names = list(token_df.cmc_slug.values)
    for i, (token_id, token_name) in enumerate(zip(token_ids, token_names)):
        # monitor progress
        print(f"Processing the {i+1}th token ({(i+1)/len(token_ids)*100:.2f}%): {token_name}")

        # build parameters
        params = {'id': str(token_id),
                'time_start': start_date.strftime('%Y-%m-%d'),
                'time_end': end_date.strftime('%Y-%m-%d'),
                'count': 1,
                'interval': '1d',
                'convert': 'USD'} 

        # make the api call
        data = makeCMCApiCall(session, url, params, retries=3)

        # clean the data
        if data is not None:
            if data['is_fiat'] == 0:
                quote_rows = []
                for quote in data['quotes']:
                    usd_quote = quote['quote']['USD']
                    quote_rows.append({'date':           usd_quote['timestamp'][:10],
                                       'usd_per_token':  usd_quote['price'],
                                       'usd_volume_24h': usd_quote['volume_24h'],
                                       'usd_mcap':       usd_quote['market_cap']})

                # a separate name keeps the token_df argument intact
                quotes_df = pd.DataFrame(quote_rows, columns=quote_columns)
                quotes_df['cmc_id'] = data['id']
                per_token_frames.append(quotes_df)
            else:
                print(f"{data['name']} is fiat")

        # space out calls
        time.sleep(1)

    # build final dataframe; pd.concat raises on an empty list, so return an empty frame instead
    if not per_token_frames:
        return pd.DataFrame(columns=quote_columns + ['cmc_id'])

    df = pd.concat(per_token_frames, ignore_index=True)

    return df

In [129]:
if __name__ == "__main__":
    # api root, credentials file, and study window
    base_url = "https://pro-api.coinmarketcap.com"
    api_fp = '../../admin/cmc.txt'
    start_date = date(2015, 1, 1)
    end_date   = date(2023, 2, 1)

    # the api key is the first line of the credentials file
    with open(api_fp) as f:
        API_KEY = list(f)[0].strip()

    # open the session; this also prints the key-info response
    session = initiateAPI(base_url)

    # token universe: ids from the monthly top listings, then their metadata
    cmc_ids  = obtainTopCMCTokens(base_url, session, start_date, end_date)
    token_df = formDataframeOfTopCMCTokens(base_url, session, cmc_ids)

    # price, volume, and mcap panel for the token universe
    cmc_df = pullPriceMcapVolume(base_url, session, token_df, start_date, end_date)

Processing the 0th token (1/2930.00%): bitcoin
Processing the 1th token (2/2930.00%): litecoin
Processing the 2th token (3/2930.00%): namecoin
Processing the 3th token (4/2930.00%): terracoin
Processing the 4th token (5/2930.00%): peercoin
Processing the 5th token (6/2930.00%): novacoin
Processing the 6th token (7/2930.00%): devcoin
Processing the 7th token (8/2930.00%): feathercoin
Processing the 8th token (9/2930.00%): freicoin
Processing the 9th token (10/2930.00%): bbqcoin
Processing the 10th token (11/2930.00%): ixcoin
Processing the 11th token (12/2930.00%): bitbar
Processing the 12th token (13/2930.00%): worldcoin
Processing the 13th token (14/2930.00%): yacoin
Processing the 14th token (15/2930.00%): digitalcoin
Processing the 15th token (16/2930.00%): franko
Processing the 16th token (17/2930.00%): goldcoin
Processing the 17th token (18/2930.00%): bottlecaps
Processing the 18th token (19/2930.00%): argentum
Processing the 19th token (20/2930.00%): fastcoin
Processing the 20th 

In [None]:
# preview the first rows only; rendering the whole panel would bloat the notebook
cmc_df.head()

## (3) Pull other data

In [531]:
# OBTAIN CMC COVARIATES AT DAILY LEVEL FOR ALL TOKENS
# NOTE: THIS TAKES 40K CREDITS AND ABOUT 60 MINUTES!

# Form list of strings of all dates in study period
dates = list(pd.date_range('2015-01-01', '2022-01-07', freq='D').strftime('%Y-%m-%d'))

# Initialize dictionary for the data
cmc_covars_dict = {'date': [],
                   'cmc_id': [],
                   'num_market_pairs': [],
                   'max_supply': [],
                   'circulating_supply': [],
                   'total_supply': [],
                   'cmc_rank': [],
                   'tags': []}

# The endpoint is the same for every day, so build the url once
endpoint = '/v1/cryptocurrency/listings/historical'
final_url = base_url+endpoint

# NOTE: the loop variable is deliberately not named `date`; that would overwrite the
# `datetime.date` class imported in the first cell and break re-running earlier cells
for date_str in dates:
    # Update where we are
    print(date_str)
    print('\n')

    # Set up the call
    parameters = {'date': date_str,
                  'limit': 5000,
                  'convert': 'USD',
                  'aux': 'tags,circulating_supply,total_supply,max_supply,cmc_rank,num_market_pairs'}

    # Make the call, retrying on connection problems and on the "out of range" error
    nb_tries = 3
    while True:
        nb_tries -= 1
        try:
            response = session.get(final_url, params=parameters)
            r_json = json.loads(response.text)
            error_message = r_json['status']['error_message']
            if error_message is None:
                break
            elif error_message[:29] == 'Search query is out of range.':
                print('error due to out of range')
                # raise explicitly: an assert would be stripped when running with -O
                if nb_tries <= 0:
                    raise RuntimeError('out of range error occured several times')
                time.sleep(1)
            else:
                raise RuntimeError(f'json has error: {error_message}')

        except (ConnectionError, Timeout, TooManyRedirects) as err:
            if nb_tries <= 0:
                raise err
            else:
                print('error due to connection, timeout, or redirect')
                time.sleep(1)

    # Add the data for that day to the dictionary
    for token in r_json['data']:
        cmc_covars_dict['date'].append(date_str)
        cmc_covars_dict['cmc_id'].append(token['id'])
        cmc_covars_dict['num_market_pairs'].append(token['num_market_pairs'])
        cmc_covars_dict['max_supply'].append(token['max_supply'])
        cmc_covars_dict['circulating_supply'].append(token['circulating_supply'])
        cmc_covars_dict['total_supply'].append(token['total_supply'])
        cmc_covars_dict['cmc_rank'].append(token['cmc_rank'])
        cmc_covars_dict['tags'].append(token['tags'])

    # Delay next call to not break limits
    time.sleep(1)
    

2020-09-22


2020-09-23


2020-09-24


2020-09-25


2020-09-26


2020-09-27


2020-09-28


2020-09-29


2020-09-30


2020-10-01


2020-10-02


2020-10-03


2020-10-04


2020-10-05


2020-10-06


2020-10-07


2020-10-08


2020-10-09


2020-10-10


2020-10-11


2020-10-12


2020-10-13


2020-10-14


2020-10-15


2020-10-16


2020-10-17


2020-10-18


2020-10-19


2020-10-20


2020-10-21


2020-10-22


2020-10-23


2020-10-24


2020-10-25


2020-10-26


2020-10-27


2020-10-28


2020-10-29


2020-10-30


2020-10-31


2020-11-01


2020-11-02


2020-11-03


2020-11-04


2020-11-05


2020-11-06


2020-11-07


2020-11-08


2020-11-09


2020-11-10


2020-11-11


2020-11-12


2020-11-13


2020-11-14


2020-11-15


2020-11-16


2020-11-17


2020-11-18


2020-11-19


2020-11-20


2020-11-21


2020-11-22


2020-11-23


2020-11-24


2020-11-25


2020-11-26


2020-11-27


2020-11-28


2020-11-29


2020-11-30


2020-12-01


2020-12-02


2020-12-03


2020-12-04


2020-12-05


2020-12-06


2020-12-07



In [532]:
# Turn the per-column lists collected above into the token covariates panel
cmc_covars_df = pd.DataFrame(data=cmc_covars_dict)

In [533]:
# Subset data down to IDs common in the two different pulls of top tokens.
# NOTE: the token universe built earlier in this notebook is named `token_df`; no visible
# cell defines `final_df` before this point, so it is created here from `token_df`.
unique_ids_1 = token_df.cmc_id.values
unique_ids_2 = np.unique(cmc_covars_df.cmc_id.values)
unique_ids_common = list(set(unique_ids_2).intersection(set(unique_ids_1)))
cmc_covars_df = cmc_covars_df[cmc_covars_df.cmc_id.isin(unique_ids_common)]
final_df = token_df[token_df.cmc_id.isin(unique_ids_common)]

In [534]:
# OBTAIN GLOBAL COINMARKETCAP DATA

# Request daily market-wide quotes from the global-metrics endpoint
endpoint = '/v1/global-metrics/quotes/historical'
final_url = base_url+endpoint
parameters = dict(
    time_start='2014-12-30',
    time_end='2022-01-07',
    count=10,
    interval='1d',
    convert='USD',
    aux='btc_dominance,active_cryptocurrencies,active_exchanges,active_market_pairs,total_volume_24h,total_volume_24h_reported,altcoin_market_cap,altcoin_volume_24h,altcoin_volume_24h_reported',
)

response = session.get(final_url, params=parameters)
r_json = json.loads(response.text)

# Fields read from the USD quote of each entry, and fields read from the entry itself
usd_quote_keys = ['total_market_cap',
                  'total_volume_24h',
                  'total_volume_24h_reported',
                  'altcoin_market_cap',
                  'altcoin_volume_24h',
                  'altcoin_volume_24h_reported']
snapshot_keys = ['btc_dominance',
                 'active_cryptocurrencies',
                 'active_exchanges',
                 'active_market_pairs']

# One list per output column, in the column order of the final frame
cmc_macro_dict = {column: [] for column in ['date'] + usd_quote_keys + snapshot_keys}

# Flatten every returned entry into the column lists
for token in r_json['data']['quotes']:
    cmc_macro_dict['date'].append(token['timestamp'][:10])
    for column in usd_quote_keys:
        cmc_macro_dict[column].append(token['quote']['USD'][column])
    for column in snapshot_keys:
        cmc_macro_dict[column].append(token[column])

# Trim the first row and the last six rows
# NOTE(review): presumably these fall outside the study window; confirm against the data
macro_df = pd.DataFrame(cmc_macro_dict).iloc[1:-6]

# Keep the last row per date (one duplicated row to drop)
macro_df = macro_df.drop_duplicates(subset=['date'], keep='last')

# Outer-merge onto every study-period date so missing days show up as empty rows
dates = list(pd.date_range('2015-01-01', '2021-12-31', freq='D').strftime('%Y-%m-%d'))
dates_df = pd.DataFrame(data = {'date': dates})
macro_df = macro_df.merge(dates_df,
                          on='date',
                          how='outer',
                          validate='one_to_one')

# Order by calendar date, then fill the missing days by interpolation
macro_df['date'] = macro_df['date'].astype('datetime64[ns]')
macro_df = macro_df.sort_values(by='date').interpolate()

In [535]:
# DETERMINE RELEVANT EXCHANGES TO PULL HISTORICAL DATA ON

# Set up the call
endpoint = '/v1/exchange/map'
final_url = base_url+endpoint
parameters = {'listing_status': 'active',
              'limit': 500,
              'aux': 'first_historical_data'}

# Make the call
response = session.get(final_url, params=parameters)
r_json = json.loads(response.text)

# Clean it up: build the frame from the list of exchange records in one step (instead of
# concatenating one single-row frame per exchange) and keep only the id and slug.
# Passing `columns` also yields an empty two-column frame when no exchange is returned.
exchange_df = pd.DataFrame(r_json['data'], columns=['id', 'slug'])
exchange_df = exchange_df.rename(columns = {'id': 'exchange_id',
                                            'slug': 'exchange_slug'})

In [536]:
# OBTAIN METADATA

# Ask the exchange info endpoint about every exchange id in a single comma-separated call
exchange_ids = ','.join(str(ex_id) for ex_id in exchange_df.exchange_id.values)
endpoint = '/v1/exchange/info'
final_url = base_url+endpoint
parameters = dict(id=exchange_ids, aux='date_launched')

# Make the call
response = session.get(final_url, params=parameters)
r_json = json.loads(response.text)

# The response is keyed by exchange id (as a string); copy each launch date onto its row
for key, exchange_info in r_json['data'].items():
    is_this_exchange = exchange_df.exchange_id == int(key)
    exchange_df.loc[is_this_exchange, 'date_launched'] = exchange_info['date_launched']

In [537]:
# Remove the exchanges listed below, which do not have historical data
exchange_names_to_drop = [
    'feg-exchange', 'uniswap-v3-arbitrum', 'huckleberry',
    'photonswap-finance', 'maiar-exchange', 'katana',
    'kine-protocol-polygon', 'bit2me', 'balancer-v2-polygon',
    'balancer-v2-arbitrum', 'uniswap-v3-polygon', 'tinyman',
    'algebra', 'kine-protocol-bsc', 'btcex-exchange',
]
is_dropped = exchange_df.exchange_slug.isin(exchange_names_to_drop)
exchange_df = exchange_df.loc[~is_dropped]

In [538]:
# OBTAIN EXCHANGE HISTORICAL DATA

# One list per output column of the exchange panel
ex_hist_data_dict = {column: [] for column in ['exchange_id',
                                               'date',
                                               'exchange_volume_24h',
                                               'num_market_pairs']}

# One call per exchange
for exchange_id in exchange_df.exchange_id.values:
    # show progress by printing the slug of the exchange being pulled
    print(exchange_df.loc[exchange_df.exchange_id == exchange_id, 'exchange_slug'].iloc[0])

    # Set up the call
    endpoint = '/v1/exchange/quotes/historical'
    final_url = base_url+endpoint
    parameters = dict(id=exchange_id,
                      time_start='2015-01-01',
                      time_end='2021-12-31',
                      interval='1d',
                      count=10000,
                      convert='USD')

    # Make the call
    response = session.get(final_url, params=parameters)
    r_json = json.loads(response.text)

    # Copy every returned quote into the column lists
    for ex_data in r_json['data']['quotes']:
        ex_hist_data_dict['exchange_id'].append(exchange_id)
        usd_quote = ex_data['quote']['USD']
        ex_hist_data_dict['date'].append(usd_quote['timestamp'])
        ex_hist_data_dict['exchange_volume_24h'].append(usd_quote['volume_24h'])
        ex_hist_data_dict['num_market_pairs'].append(ex_data['num_market_pairs'])

    # Pause between exchanges to space out the calls
    time.sleep(1)

poloniex
bittrex
kraken
bleutrade
bittylicious
cex-io
bitfinex
hitbtc
exmo
okcoin
indodax
bitstamp
itbit
zaif
therocktrading
coinmate
zonda
coinbase-exchange
bitex-la
bitonic
yobit
huobi-global
litebit
coincheck
liquid
southxchange
bitso
btcbox
coincorner
bitflyer
isx
gemini
dex-trade
exrates
bitmex
independent-reserve
luno
coinone
bisq
korbit
bithumb
lykke-exchange
kuna
mercatox
p2pb2b
tidex
heat-wallet
freiexchange
btc-markets
paribu
btc-alpha
coingi
ripplefox
gatehub
coss
btcturk-pro
stex
waves-exchange
koinim
stellar-decentralized-exchange
buda
btc-trade-ua
localtrade
bitbank
mercado-bitcoin
altcoin-trader
bancor-network
binance
bits-blockchain
tidebit
cryptomarket
okx
gate-io
idex
kucoin
bitcointrade
topbtc
aex
coinfalcon
coinut
satang-pro
zb-com
bigone
lbank
gopax
bibox
coinbene
coinex
upbit
tradeogre
c-patex
crxzone
fatbtc
paymium
ddex
rudex
zebpay
bitbns
unocoin
latoken
crex24
bithesap
cryptonex
cointiger
b2bx
dragonex
hotbit
switcheo
bitforex
kyber-network
coindeal
bitmart
dig

In [539]:
# Turn the per-column lists collected above into the exchange history panel
ex_historical_df = pd.DataFrame(data=ex_hist_data_dict)

## (4) Save all the data

In [540]:
# Write the cmc token id crosswalk to csv, without the index column
final_df.to_csv(path_or_buf='../3-data/raw/cmc_token_universe.csv', index=False)

In [541]:
# Save cmc price volume mcap panel.
# NOTE: the panel returned by pullPriceMcapVolume is stored in `cmc_df`; no visible cell
# defines a notebook-level `df`.
cmc_df.to_csv('../3-data/raw/cmc_price_vol_mcap_panel.csv', index=False)

In [542]:
# Write the cmc token covariates panel to csv, without the index column
cmc_covars_df.to_csv(path_or_buf='../3-data/raw/cmc_token_covars_panel.csv', index=False)

In [543]:
# Write the cmc macro timeseries to csv, without the index column
macro_df.to_csv(path_or_buf='../3-data/raw/cmc_macro_timeseries.csv', index=False)

In [544]:
# Write the cmc exchange covariates to csv, without the index column
exchange_df.to_csv(path_or_buf='../3-data/raw/cmc_exchange_covar.csv', index=False)

In [545]:
# Write the cmc exchange history panel to csv, without the index column
ex_historical_df.to_csv(path_or_buf='../3-data/raw/cmc_exchange_panel.csv', index=False)

In [None]:
# MOVE THESE NOTES TO CLEANING

# manually look through it to confirm they are legit tokens
# or maybe give this task to jacob
# or maybe schedule a time to do this with jacob so we 2x the speed

# Let's check whether the 0.01% mcap rule is reasonable for the entire time period

# Jan 1 2015 - $5B - $500k
# Jan 1 2016 - $7B - $700k
# Jan 1 2017 - $18B - $1.8M
# Jan 1 2018 - $600B - $60M
# Apr 1 2018 - $300B - $30M
# Jul 1 2018 - $250B - $25M
# Jan 1 2019 - $125B - $12M
# Apr 1 2019 - $145B - $14M
# Jul 1 2019 - $330B - $33M
# Oct 1 2019 - $220B - $22M
# Jan 1 2020 - $200B - $20M
# Apr 1 2020 - $175B - $17M
# Jul 1 2020 - $260B - $26M
# Oct 1 2020 - $340B - $34M
# Jan 1 2021 - $770B - $77M
# Apr 1 2021 - $1.9T - $190M
# Jul 1 2021 - $1.4T - $140M
# Oct 1 2021 - $2T - $200M