In [46]:
# import packages
import pandas as pd
import numpy as np
import time
from datetime import date
from requests import Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
import json
from dateutil.relativedelta import relativedelta
import json.decoder
from typing import Dict, Any, Optional
import datetime


In [33]:
def initiateAPI(base_url: str) -> Session:
    """ open a session against the cmc pro api and print the key-info response.

    Args:
        base_url (str): the url for the pro api at cmc.

    Returns:
        session (requests.Session): session carrying the api-key headers, for reuse in later calls.
    """
    # the headers live on the session so that every later request carries the key
    session = Session()
    session.headers.update({'Accepts': 'application/json',
                            'X-CMC_PRO_API_KEY': API_KEY})

    # query the key-info endpoint once and echo whatever comes back
    key_info_response = session.get(base_url + '/v1/key/info')
    print(key_info_response.json())

    return session


In [34]:
def makeCMCApiCall(session: Session, url: str, params: dict, retries: int=3,
                   retry_delay: float=0.5) -> Optional[Dict[str, Any]]:
    """ makes an API call to CoinMarketCap using the provided requests.Session object.
    
    Args:
        session (requests.Session): A requests.Session object that will be used to make the API call.
        url (str): The API endpoint URL to call.
        params (dict): A dictionary of parameters to include in the API call.
        retries (int): The number of times to attempt the API call before giving up. Default is 3.
        retry_delay (float): Seconds to wait between failed attempts. Default is 0.5.
        
    Returns:
        data: the value under the 'data' key of the JSON response, or None if every attempt failed.
    """
    for attempt in range(retries):
        try:
            response = session.get(url, params=params)
        except (ConnectionError, Timeout, TooManyRedirects) as e:
            # transient network problems are retried like any other failed attempt
            print(f'The API call failed with a network error ({str(e)}), retrying...')
        else:
            if response.ok:
                try:
                    return response.json()['data']
                except KeyError:
                    print("The API response did not contain a 'data' field, retrying...")
                except ValueError as e:
                    # json.decoder.JSONDecodeError is a subclass of ValueError
                    print(f'Error decoding JSON response: {str(e)}')
            else:
                print(f'The API call failed with status code {response.status_code}, retrying...')

        # back off before the next attempt; no point in waiting after the final one
        if attempt < retries - 1:
            time.sleep(retry_delay)
    
    print(f'The api call failed after {retries} attempts.')
    return None

In [35]:
def obtainTopCMCAssets(base_url: str, session: Session, start_date: date, end_date: date) -> list:
    """ obtain the top cmc assets for each month of the study period.

    Args:
        base_url (str): The url for the pro api at cmc. 
        session (Session): A requests.Session object that will be used to make the API call.
        start_date (date): A datetime.date object representing the start date for the study period.
        end_date (date): A datetime.date object representing the end date for the study period.
    
    Returns:
        unique_asset_cmc_ids (list): unique cmc asset integer ids.

    Raises:
        RuntimeError: if the listings call for any snapshot date returns no data.
    """
    # specify the dates to obtain: start_date, then one-month steps while still <= end_date
    snapshot_dates = [start_date]
    current_date = start_date+relativedelta(months=1)
    while current_date <= end_date:
        snapshot_dates.append(current_date)
        current_date += relativedelta(months=1)

    # set up target url
    endpoint = '/v1/cryptocurrency/listings/historical'
    url = f"{base_url}{endpoint}"

    # obtain the top assets by cmc ranking for each snapshot date;
    # the number requested grows with the year (50 / 300 / 500)
    asset_cmc_ids = []
    for snapshot_date in snapshot_dates:
        # set up params for call
        if snapshot_date.year <= 2016:
            limit = 50
        elif snapshot_date.year <= 2019:
            limit = 300
        else:
            limit = 500
        params = {'date': snapshot_date,
                  'limit': limit,
                  'convert': 'USD',
                  'aux': 'cmc_rank'}

        # make the call
        data = makeCMCApiCall(session, url, params)
        if data is None:
            # fail loudly: silently skipping a month would bias the asset universe
            raise RuntimeError(f"CMC listings call returned no data for {snapshot_date}.")

        # extract the asset ids
        asset_cmc_ids.extend(asset['id'] for asset in data)

        # space out calls
        time.sleep(1)
        print(snapshot_date)

    # drop redundant assets
    unique_asset_cmc_ids = list(np.unique(np.array(asset_cmc_ids)))

    return unique_asset_cmc_ids


In [36]:
def formDataframeOfTopCMCAssets(base_url: str, session: Session, cmc_ids: list) -> pd.DataFrame:
    """ pull all cmc meta data for assets and merge onto universe of top assets in cmc_ids.

    Args:
        base_url (str): the url for the pro api at cmc. 
        session (Session): A requests.Session object that will be used to make the API call.
        cmc_ids (list): top assets by cmc ranking.

    Returns:
        cw_df (pd.DataFrame): dataframe of asset meta data for top assets by cmc ranking,
                              sorted by 'cmc_id'.

    Raises:
        RuntimeError: if any page of the map endpoint returns no data.
    """

    # set up target url for obtaining mapping from id to asset info
    endpoint = '/v1/cryptocurrency/map'
    url = f"{base_url}{endpoint}"

    # obtain the CMC mapping of IDs to asset info
    # NOTE: four pages of 5000 cover at most the first 20000 map entries; ids listed
    #       beyond that are not retrieved and would drop out of the inner merge below.
    full_data = []
    starts = [1, 5001, 10001, 15001]
    for start in starts:
        # set up params for call
        params = {'listing_status': 'active,inactive,untracked',
                  'limit': 5000,
                  'start': start,
                  'aux': 'platform,first_historical_data,last_historical_data'}

        # make the call
        data = makeCMCApiCall(session, url, params)
        if data is None:
            raise RuntimeError(f"CMC map call returned no data for start={start}.")

        # Append the results
        full_data.extend(data)

        # space out calls
        time.sleep(1)

    # clean up asset info dictionaries
    clean_columns = ['cmc_id', 'cmc_symbol', 'name', 'cmc_slug',
                     'cmc_first_date', 'cmc_last_date', 'platform_cmc_slug']
    clean_full_data = []
    for asset_dict in full_data:
        platform = asset_dict.get('platform')
        clean_full_data.append({
            'cmc_id': asset_dict['id'],
            'cmc_symbol': asset_dict['symbol'],
            'name': asset_dict['name'],
            'cmc_slug': asset_dict['slug'],
            # each date falls back to None independently when absent
            'cmc_first_date': asset_dict.get('first_historical_data'),
            'cmc_last_date': asset_dict.get('last_historical_data'),
            'platform_cmc_slug': platform['slug'] if platform is not None else None,
        })

    # explicit columns keep the merge key present even if no assets came back
    cmc_assets_df = pd.DataFrame(clean_full_data, columns=clean_columns)

    # Merge down to just the assets of interest
    target_assets_df = pd.DataFrame(data = {'cmc_id': cmc_ids})
    cw_df = cmc_assets_df.merge(target_assets_df,
                                on='cmc_id',
                                how='inner',
                                validate='one_to_one')

    # reset index and sort
    cw_df = cw_df.sort_values(by='cmc_id', ignore_index=True)

    return cw_df

In [37]:
def pullPriceMcapVolume(base_url: str, session: Session, 
        cw_df: pd.DataFrame, start_date: date, end_date: date) -> pd.DataFrame:
    """ pulls historical price, volume, and mcap data for asset ids in cw_df.
    
    Args:
        base_url (str): The base URL for the CoinMarketCap API.
        session (requests.Session): A requests.Session object to be used to make the API calls.
        cw_df (pd.DataFrame): A pandas DataFrame that contains information about the assets to 
                                 retrieve data for. Must include columns 'cmc_id' and 'cmc_slug'.
        start_date (date): A datetime.date object representing the start date for the study period.
        end_date (date): A datetime.date object representing the end date for the study period.
        
    Returns:
        df (pd.DataFrame): price, volume, and mcap for target assets within specified date range. 
                           The DataFrame has columns 'date', 'usd_per_asset', 'usd_volume_24h',
                           'usd_mcap', and 'cmc_id'. Assets whose call failed or that are flagged
                           as fiat are omitted; if nothing was retrieved the frame is empty.
    """
    output_columns = ['date', 'usd_per_asset', 'usd_volume_24h', 'usd_mcap', 'cmc_id']

    # initialize list to build
    asset_dfs = []

    # set up target url
    endpoint = '/v1/cryptocurrency/quotes/historical'
    url = f"{base_url}{endpoint}"

    # loop over assets
    asset_ids = list(cw_df.cmc_id.values)
    asset_names = list(cw_df.cmc_slug.values)
    for i, (asset_id, asset_name) in enumerate(zip(asset_ids, asset_names)):
        # monitor progress
        print(f"Processing the {i+1}th asset ({(i+1)/len(asset_ids)*100:.2f}%): {asset_name}")

        # build parameters
        params = {'id': str(asset_id),
                'time_start': start_date.strftime('%Y-%m-%d'),
                'time_end': end_date.strftime('%Y-%m-%d'),
                'count': 1,
                'interval': '1d',
                'convert': 'USD'} 
        
        # make the api call
        data = makeCMCApiCall(session, url, params, retries=3)

        # clean the data
        if data is not None:
            if data['is_fiat'] == 0:
                asset_quote_dict_list = []
                for quote in data['quotes']:
                    usd_quote = quote['quote']['USD']
                    asset_quote_dict_list.append({
                        # keep only the YYYY-MM-DD part of the timestamp string
                        'date': usd_quote['timestamp'][:10],
                        'usd_per_asset': usd_quote['price'],
                        'usd_volume_24h': usd_quote['volume_24h'],
                        'usd_mcap': usd_quote['market_cap'],
                    })

                # a separate local name so the cw_df argument is never overwritten
                asset_df = pd.DataFrame(asset_quote_dict_list)
                asset_df['cmc_id'] = data['id']
                asset_dfs.append(asset_df)
            else:
                print(f"{data['name']} is fiat")        

        # space out calls
        time.sleep(0.2)

    # build final dataframe; pd.concat raises on an empty list, so handle that explicitly
    if not asset_dfs:
        return pd.DataFrame(columns=output_columns)
    df = pd.concat(asset_dfs)

    return df            

In [38]:
def initialCleanAssetMetadata(cw_df: pd.DataFrame, column_map: dict, dropna: bool = True) -> pd.DataFrame:
    """ Clean asset metadata to return cleaned dataframe. 

    Args:
        cw_df (pd.DataFrame): DataFrame containing asset metadata to be cleaned. Must contain
                              'cmc_id' and every key of column_map.
        column_map (dict): a mapping of the current column names to the desired column names.
                           After renaming, the columns 'slug_cmc', 'symbol_cmc', 'first_date_cmc',
                           and 'last_date_cmc' must exist.
        dropna (bool): whether to drop any rows with missing values in key columns.
    
    Returns:
        cw_df (pd.DataFrame): cleaned DataFrame sorted by 'cmc_id'. The input is not modified.

    Raises:
        ValueError: if a required column is missing or 'cmc_id' / 'slug_cmc' are not unique.
    """
    # confirm has required columns
    for required_col in ['cmc_id', *column_map.keys()]:
        if required_col not in cw_df.columns:
            raise ValueError(f"Input DataFrame must contain '{required_col}' column.")
        
    # apply column map renaming
    cw_df = cw_df.rename(columns=column_map)

    # subset to useful columns; copy so the assignments below act on an independent frame
    cw_df = cw_df[['cmc_id', 'slug_cmc', 'symbol_cmc', 'first_date_cmc', 'last_date_cmc']].copy()

    # convert date columns to date type
    cw_df['first_date_cmc'] = pd.to_datetime(cw_df.first_date_cmc, format='%Y-%m-%d', utc=False)
    cw_df['last_date_cmc'] = pd.to_datetime(cw_df.last_date_cmc, format='%Y-%m-%d', utc=False)

    # drop rows with missing values in key columns
    if dropna:
        cw_df = cw_df.dropna(subset=['cmc_id', 'slug_cmc'])
    
    # each row must have a unique `cmc_id` and `slug_cmc` value
    if not cw_df['cmc_id'].is_unique:
        raise ValueError("Input DataFrame has non-unique 'cmc_id' values.")
    if not cw_df['slug_cmc'].is_unique:
        raise ValueError("Input DataFrame has non-unique 'slug_cmc' values.")
    
    # sort values and reset index
    cw_df = cw_df.sort_values(by='cmc_id', ignore_index=True)

    return cw_df
    


In [39]:
def initialCleanPanel(panel_df: pd.DataFrame, start_year: int=2015, end_date: str='2023-02-02', ) -> pd.DataFrame:
    """ clean panel of cmc prices, volume, and mcap data.
     
    Args:
        panel_df (pandas.DataFrame): panel data to clean. Must contain 'date', 'cmc_id',
                                     'usd_mcap', 'usd_volume_24h', and a price column named
                                     'usd_per_token' (or 'usd_per_asset', which is accepted as
                                     an alias because pullPriceMcapVolume emits that name).
        start_year (int): the minimum year to include in the DataFrame (default: 2015).
        end_date (str): the maximum date (inclusive) to include in the DataFrame (default: '2023-02-02').

    Returns:
        (pd.DataFrame): The cleaned DataFrame with columns 'date', 'cmc_id', 'usd_per_token_cmc',
                        'usd_mcap_cmc', 'usd_volume_24h_cmc', sorted by date then cmc_id.

    Raises:
        ValueError: if any expected column is missing.
    """
    # accept the price column name produced upstream by pullPriceMcapVolume
    if 'usd_per_asset' in panel_df.columns and 'usd_per_token' not in panel_df.columns:
        panel_df = panel_df.rename(columns={'usd_per_asset': 'usd_per_token'})

    # confirm has the right columns
    expected_cols = ['date', 'cmc_id', 'usd_per_token', 'usd_mcap', 'usd_volume_24h']
    missing_cols = [col for col in expected_cols if col not in panel_df.columns]
    if missing_cols:
        raise ValueError(f"Missing expected columns: {missing_cols}")
    
    # rename columns to standard convention (with data source name in it)
    panel_df = panel_df.rename(columns = {'usd_per_token': 'usd_per_token_cmc',
                                          'usd_mcap': 'usd_mcap_cmc',
                                          'usd_volume_24h': 'usd_volume_24h_cmc'})

    # convert columns to correct data type
    panel_df['date'] = pd.to_datetime(panel_df.date, format='%Y-%m-%d', utc=False)

    # set column order
    panel_df = panel_df[['date', 'cmc_id', 'usd_per_token_cmc', 'usd_mcap_cmc', 'usd_volume_24h_cmc']]

    # drop rows outside the requested window (uses the arguments, not fixed literals);
    # copy so the in-place edits below act on an independent frame
    in_window = (panel_df.date.dt.year >= start_year) & (panel_df.date <= pd.Timestamp(end_date))
    panel_df = panel_df[in_window].copy()
    panel_df = panel_df.dropna(how='any', subset=['date', 'cmc_id'])
    panel_df = panel_df.dropna(how='all', subset=['usd_per_token_cmc', 'usd_mcap_cmc', 'usd_volume_24h_cmc'])

    # form list of data columns to work with
    data_cols = [col for col in panel_df.columns if col not in ('date', 'cmc_id')]

    # set negative values to missing and too large values to missing
    for col in data_cols:
        panel_df.loc[panel_df[col] < 0, col] = np.nan
        panel_df.loc[panel_df[col] > 2e12, col] = np.nan

    # drop duplicated rows across id columns (the first occurrence is kept)
    panel_df = panel_df.drop_duplicates(subset=['date', 'cmc_id'])

    # sort values and reset index
    panel_df = panel_df.sort_values(by=['date', 'cmc_id'], 
                                    ignore_index=True)

    return panel_df

In [40]:
def calcGeomAvg(returns: np.array,
    annualized: bool=False,
    periods_in_year: int=None) -> float: 
    """ Calculate the geometric average of a vector of simple returns.

    Args:
        returns (np.array): vector of a simple returns at any frequency.
        annualized (bool): whether to annualize the statistic.
        periods_in_year (int): how many periods of the given frequency are in a year.
                               Required when annualized is True.

    Returns:
        (float): scalar geometric average.

    Raises:
        ValueError: if returns is empty, or if annualized is True without periods_in_year.
    """
    # NOTE(review): a later cell in this notebook defines calcGeomAvg again; whichever
    # cell runs last is the one in effect.
    if annualized and periods_in_year is None:
        raise ValueError("Input 'periods_in_year' must be provided if 'annualized' is True")
    n_periods = len(returns)
    if n_periods == 0:
        raise ValueError("Input 'returns' must contain at least one observation")

    # compound all periods, then take the n-th root to get the per-period growth rate
    geom_avg_at_given_freq = np.prod(1+returns)**(1/n_periods)-1
    if not annualized:
        return geom_avg_at_given_freq
    return (geom_avg_at_given_freq+1)**periods_in_year-1

In [41]:
def prepPanelForInitialInclusiveCriteria(panel_df: pd.DataFrame, cw_df: pd.DataFrame) -> pd.DataFrame:
    """ performs various ad hoc cleaning to prep the panel further for applying inclusion criteria.
    
    Args:
        panel_df (pd.DataFrame): panel of asset prices, trading volumes, and mcaps from cmc, with
                                 columns 'date', 'cmc_id', 'usd_per_token_cmc', 'usd_mcap_cmc',
                                 and 'usd_volume_24h_cmc'.
        cw_df (pd.DataFrame): identifying variables for the assets; must contain 'cmc_id'
                              and 'slug_cmc'.
        
    Returns:
        panel_df (pd.DataFrame): cleaned panel keyed by 'date' and 'slug_cmc'.

    Raises:
        AssertionError: if the panel has missing values or duplicate (date, slug_cmc) rows.
    """
    # manually remove tokens from panel
    tokens_to_remove = [770, 776, 3787, 8644, 9103]
    panel_df = panel_df[~panel_df.cmc_id.isin(tokens_to_remove)]

    # merge on cmc slug and drop the cmc id
    panel_df = panel_df.merge(cw_df[['cmc_id', 'slug_cmc']],
                              on='cmc_id',
                              how='inner',
                              validate='many_to_one')
    panel_df = panel_df.drop('cmc_id', axis=1)
    panel_df = panel_df[['date', 'slug_cmc', 'usd_per_token_cmc', 'usd_mcap_cmc', 'usd_volume_24h_cmc']]
    panel_df = panel_df.sort_values(by=['date', 'slug_cmc'], ignore_index=True)

    # adjust particular values: missing uquid-coin volume is treated as zero
    panel_df.loc[(panel_df.slug_cmc=='uquid-coin') & 
                  panel_df.usd_volume_24h_cmc.isnull(), 'usd_volume_24h_cmc'] = 0

    # ensure no missing in the df
    assert 0 == panel_df.isnull().sum().sum(), 'there are missing values in the panel'

    # ensure unique on key columns
    # `not` is used rather than `~`: bitwise-inverting a Python bool yields -1/-2, which is
    # always truthy and would make the check pass unconditionally
    dups = panel_df.duplicated(subset=['date', 'slug_cmc'])
    assert not dups.any(), 'there are duplicates in the data on keys date and slug_cmc'

    # drop more tokens manually
    # NOTES: ampleforth is a stablecoin, pax gold is a gold stablecoin, index, and wrapped tokens
    wrapped_tokens_to_drop = ['ampleforth', 'cryptoindex-com-100', 'pax-gold',
                            'wrapped-centrifuge', 'wrapped-luna-token', 'wrapped-ncg', 'wrapped-nxm']
    panel_df = panel_df[~panel_df.slug_cmc.isin(wrapped_tokens_to_drop)]

    return panel_df

In [None]:
def calcGeomAvg(returns: np.array,
    annualized: bool=False,
    periods_in_year: int=None) -> float: 
    """ Geometric mean of a vector of simple returns, optionally annualized.

    Args:
        returns (np.array): simple returns at a single, arbitrary frequency.
        annualized (bool): if True, compound the per-period figure up to a yearly one.
        periods_in_year (int): number of periods of that frequency in one year.

    Returns:
        (float): the geometric average as a scalar.

    Raises:
        TypeError: when returns is not a NumPy array.
        ValueError: when annualized is requested without periods_in_year.
    """
    # validate inputs up front
    if not isinstance(returns, np.ndarray):
        raise TypeError("Input 'returns' must be a NumPy array")
    if periods_in_year is None and annualized:
        raise ValueError("Input 'periods_in_year' must be provided if 'annualized' is True")

    # n-th root of the compounded growth, expressed as a rate
    n_obs = np.size(returns)
    per_period_rate = np.prod(1 + returns) ** (1 / n_obs) - 1
    if not annualized:
        return per_period_rate

    # compound the per-period rate over one year
    return (per_period_rate + 1) ** periods_in_year - 1

In [89]:
def buildInitialAssetUniverse(panel_df: pd.DataFrame, start_date: date, end_date: date) -> dict:
    """ build an initial universe of assets to pull data for.
    
    Args:
        panel_df (pd.DataFrame): panel of asset prices, trading volumes, and mcaps from cmc, with
                                 columns 'date', 'slug_cmc', 'usd_volume_24h_cmc', 'usd_mcap_cmc'.
        start_date (date): A datetime.date object representing the start date for the study period.
        end_date (date): A datetime.date object representing the end date for the study period.
    
    Returns:
        asset_universe (dict): keyed by the end date ('YYYY-MM-DD') of each one-month window in
                               the study period, with the list of asset names that pass the
                               data, volume, and mcap filters over that window.
    """
    # specify the dates to obtain
    dates = [start_date.strftime('%Y-%m-%d')]
    current_date = start_date+relativedelta(months=1)
    while current_date <= end_date:
        dates.append(current_date.strftime('%Y-%m-%d'))
        current_date += relativedelta(months=1)

    # apply suff data, volume, and mcap filters
    asset_universe_per_month = []
    seen_assets = set()   # every asset included in any earlier window
    for i in range(len(dates)-1):
        # determine start and end dates for window (both ends inclusive)
        start_window = dates[i]
        end_window   = dates[i+1]

        # build temporary dataframe for this time period
        temp_df = panel_df[(panel_df.date >= start_window) & (panel_df.date <= end_window)].copy()

        # obtain list of tokens to consider: those with an observation on the window's last day
        assets_included = list(np.unique(temp_df[temp_df.date == end_window].slug_cmc.values))

        # figure out tokens removed due to insuff data
        # note: 28 days ensures at least 4 weeks of data 
        asset_ns_df = temp_df.groupby('slug_cmc').size()
        assets_lost_given_insuff_data = set(asset_ns_df[asset_ns_df < 28].index.values)

        # Figure out tokens removed due to volume threshold (minimum daily volume in window)
        temp_vol_df = temp_df.groupby('slug_cmc').usd_volume_24h_cmc.min()
        assets_lost_given_insuff_vol = set(temp_vol_df[temp_vol_df < 10000].index.values)

        # Figure out assets removed due to mcap threshold (minimum daily mcap in window)
        current_year = int(end_window[:4]) 
        if current_year <= 2016:
            mcap_threshold = 750000
        elif current_year == 2017:
            mcap_threshold = 2e6
        elif current_year == 2018:
            mcap_threshold = 30e6
        elif current_year in [2019, 2020]:
            mcap_threshold = 15e6
        else:
            mcap_threshold = 75e6
        temp_mcap_df = temp_df.groupby('slug_cmc').usd_mcap_cmc.min()
        assets_lost_given_mcap_threshold = set(temp_mcap_df[temp_mcap_df < mcap_threshold].index.values)

        # apply all three exclusions at once
        assets_excluded = (assets_lost_given_insuff_data
                           | assets_lost_given_insuff_vol
                           | assets_lost_given_mcap_threshold)
        assets_included = [asset for asset in assets_included if asset not in assets_excluded]

        # Report out new asset ever
        # (a list is passed to np.unique; passing a set would wrap it in a 0-d object array)
        print('New assets that we have never had are ')
        new_assets = [asset for asset in assets_included if asset not in seen_assets]
        print(np.unique(new_assets))
        print('\n')

        # Report out assets for this month
        print(f'This month\'s ({end_window}) {len(assets_included)} assets are:')
        print(np.unique(assets_included))
        print('\n\n')

        # Add assets to list
        seen_assets.update(assets_included)
        asset_universe_per_month.append(list(np.unique(assets_included)))

    # build asset universe
    asset_universe_dict = dict(zip(dates[1:], asset_universe_per_month))

    return asset_universe_dict

In [None]:
def determineUniqueAssets(asset_universe_dict) -> list:
    """ flatten the per-date asset lists of the universe into one sorted list of unique assets. """
    # pool every month's assets, ignoring the date keys
    pooled_assets = [asset
                     for monthly_assets in asset_universe_dict.values()
                     for asset in monthly_assets]

    # de-duplicate with numpy, then hand back a sorted plain list
    return sorted(np.unique(np.array(pooled_assets)))

In [None]:
def pullCMCMacro(base_url: str, session: Session, start_date: date, end_date: date) -> pd.DataFrame:
    """ pull historical global market metrics from cmc as a daily time series.

    Args:
        base_url (str): The base URL for the CoinMarketCap API.
        session (requests.Session): A requests.Session object to be used to make the API calls.
        start_date (date): A datetime.date object representing the start date for the study period.
        end_date (date): A datetime.date object representing the end date for the study period.

    Returns:
        macro_df (pd.DataFrame): time series data of cmc macro covariates, one row per quote
                                 returned by the API, with 'date' as a 'YYYY-MM-DD' string.

    Raises:
        RuntimeError: if the API call returns no data.
    """
        
    # set up the call
    endpoint = '/v1/global-metrics/quotes/historical'
    url      = f"{base_url}{endpoint}"
    params = {'time_start': start_date.strftime('%Y-%m-%d'),
            'time_end': end_date.strftime('%Y-%m-%d'),
            'count': 10,
            'interval': '1d',
            'convert': 'USD',
            'aux': 'btc_dominance,active_cryptocurrencies,active_exchanges,active_market_pairs,total_volume_24h,altcoin_market_cap,altcoin_volume_24h'}

    # make the call
    data = makeCMCApiCall(session, url, params, retries=3)
    if data is None:
        raise RuntimeError("CMC global-metrics call returned no data.")

    # flatten each day's JSON into one record
    macro_columns = ['date', 'total_market_cap', 'total_volume_24h', 'altcoin_market_cap',
                     'altcoin_volume_24h', 'btc_dominance', 'active_cryptocurrencies',
                     'active_exchanges', 'active_market_pairs']
    records = []
    for days_data in data['quotes']:
        usd_quote = days_data['quote']['USD']
        records.append({
            'date': days_data['timestamp'],
            'total_market_cap': usd_quote['total_market_cap'],
            'total_volume_24h': usd_quote['total_volume_24h'],
            'altcoin_market_cap': usd_quote['altcoin_market_cap'],
            'altcoin_volume_24h': usd_quote['altcoin_volume_24h'],
            'btc_dominance': days_data['btc_dominance'],
            'active_cryptocurrencies': days_data['active_cryptocurrencies'],
            'active_exchanges': days_data['active_exchanges'],
            'active_market_pairs': days_data['active_market_pairs'],
        })

    # normalize the timestamp to a date string; ceil('D') rounds any intraday timestamp up
    # to the following midnight. No re-indexing or interpolation of missing dates happens here.
    macro_df = pd.DataFrame(records, columns=macro_columns)
    macro_df['date'] = pd.to_datetime(macro_df.date).dt.ceil('D')
    macro_df['date'] = macro_df.date.dt.strftime('%Y-%m-%d')

    return macro_df

In [None]:
# Driver: pulls the cmc asset universe and price/volume/mcap panel, cleans both,
# applies the initial inclusion criteria, and pickles the cleaned metadata and panel.
if __name__ == "__main__":
    # set args
    api_fp = '../../admin/cmc.txt'
    start_date = date(2015, 1, 1)
    end_date   = date(2023, 2, 1)
    base_url = "https://pro-api.coinmarketcap.com"
    asset_fp = "../data/raw/cmc_asset_universe.pkl"
    panel_fp = "../data/raw/cmc_price_volume_mcap_panel.pkl"
    # keys are the column names produced by formDataframeOfTopCMCAssets, values the names
    # used by the cleaning functions
    cw_new_old_col_mapping  = {'cmc_symbol': 'symbol_cmc',
                               'cmc_slug': 'slug_cmc',
                               'cmc_first_date': 'first_date_cmc',
                               'cmc_last_date': 'last_date_cmc'}

    # import api key (first line of the file, whitespace stripped)
    # NOTE: API_KEY becomes a module-level global here; initiateAPI reads it as a global
    #       rather than receiving it as an argument.
    with open(api_fp) as f:
        API_KEY = f.readlines()
        API_KEY = API_KEY[0].strip()
    
    # confirm api is working
    session = initiateAPI(base_url)

    # obtain potential asset ids to include in study
    cmc_ids  = obtainTopCMCAssets(base_url, session, start_date, end_date)
    cw_df = formDataframeOfTopCMCAssets(base_url, session, cmc_ids)

    # obtain price, volume, and mcap data for target assets
    panel_df = pullPriceMcapVolume(base_url, session, cw_df, start_date, end_date)

    # clean the data
    # NOTE(review): pullPriceMcapVolume names its price column 'usd_per_asset' while
    #               initialCleanPanel checks for 'usd_per_token' -- confirm these agree.
    cw_df = initialCleanAssetMetadata(cw_df, cw_new_old_col_mapping)
    panel_df = initialCleanPanel(panel_df)

    # cut down to initial inclusion criteria so i pull just these across other providers
    panel_df = prepPanelForInitialInclusiveCriteria(panel_df, cw_df)
    asset_universe_dict = buildInitialAssetUniverse(panel_df, start_date, end_date)
    asset_universe_list = determineUniqueAssets(asset_universe_dict)

    # pull remaining cmc data
    # TODO historical crypto metadata
    macro_df = pullCMCMacro(base_url, session, start_date, end_date)
    # TODO historical exchange data

    # save the data
    # NOTE(review): only cw_df and panel_df are written; macro_df, asset_universe_dict, and
    #               asset_universe_list are computed above but not saved in this cell.
    cw_df.to_pickle(asset_fp)
    panel_df.to_pickle(panel_fp)

In [None]:
# EXPLORE RETURNS OF EQUAL- AND MCAP- WEIGHTED PORTFOLIOS
#
# NOTE(review): this cell relies on names that are not defined in any cell visible here:
#   `token_universe_per_month`, `df`, `dates`, and `geometricAverageSimpleReturns`.
#   It will fail on a fresh kernel unless those are created elsewhere -- confirm their
#   source (presumably `df` is the cleaned panel keyed by date and slug_cmc).

# Cut the panel down to just the assets of interest
asset_universe_unique = list(np.unique([asset 
                                        for sublist in token_universe_per_month 
                                        for asset in sublist]))
df = df[df.slug_cmc.isin(asset_universe_unique)]

# Drop rows that do not have previous day information
# day_diff  = days since the previous row (rows are sorted by asset, then date)
# day_diff2 = day_diff of the following row
# NOTE(review): day_diff is computed across the whole sorted frame, so the first row of
#               each asset is differenced against the last row of the previous asset.
df = df.sort_values(by=['slug_cmc', 'date'], ignore_index=True)
df.loc[1:, 'day_diff'] = (df.date[1:].values - df.date[:-1]).values.astype('timedelta64[D]').astype(int)
df['day_diff2'] = df.day_diff.shift(-1)
num_rows = df[df.day_diff == 1].shape[0]
df = df[(df.day_diff == 1) | (df.day_diff2 == 1)]
assert(num_rows <= df.shape[0])
df = df.drop(['day_diff2'], axis=1)

# Calculate day over day return
# rows without a one-day gap are dropped after the return is computed; any asset that
# still has a missing return is removed entirely
df['r_t'] = df.groupby('slug_cmc')['usd_per_token_cmc'].apply(pd.Series.pct_change)
df = df[df.day_diff == 1]
tokens_to_drop = np.unique(df[df.r_t.isnull()].slug_cmc.values)
df = df[~df.slug_cmc.isin(tokens_to_drop)]
df = df.drop('day_diff', axis=1)

# Cut down to time period of interest
df = df[df.date.dt.year >= 2016]
df = df[df.date.dt.year <= 2022]

# Ensure no missings
assert(0 == df.isnull().sum().sum())

# Clean up index and resort
df = df.sort_values(by=['date', 'slug_cmc'], ignore_index=True)

# Calculate equal and mcap weighted returns over the one-month window starting at each
# entry of `dates` (from index 1 on), using that date's asset universe
equal_df = pd.DataFrame()
mcap_df  = pd.DataFrame()
for i in range(1,len(dates)):
    # Set up dates and asset universe
    # NOTE(review): assigning to `date` rebinds the module-level name imported via
    #               `from datetime import date`; later calls such as date(2015, 1, 1)
    #               would then fail in the same kernel.
    date = dates[i]
    date_plus_1mo = datetime.datetime.strptime(date, '%Y-%m-%d') + relativedelta(months=1)
    asset_universe = asset_universe_dict[date]

    # Subset to relevant data
    temp_df = df[(df.date >= date) & (df.date < date_plus_1mo)]
    temp_df = temp_df[temp_df.slug_cmc.isin(asset_universe)]

    # Form equal weighted returns
    temp_eq_df = temp_df.groupby('date')[['r_t']].mean()
    equal_df = pd.concat((equal_df, temp_eq_df))

    # Form mcap weighted returns: weight = asset mcap / total mcap on that date
    temp_df['mcap_sum'] = temp_df.groupby('date')['usd_mcap_cmc'].transform('sum')
    temp_df['mcap_weight'] = temp_df.usd_mcap_cmc / temp_df.mcap_sum
    temp_df['mcap_r_t'] = temp_df.r_t * temp_df.mcap_weight
    temp_mcap_df = temp_df.groupby('date')[['mcap_r_t']].sum()
    mcap_df = pd.concat((mcap_df, temp_mcap_df))

# Ensure no missing
assert(0==equal_df.isnull().sum().values)
assert(0==mcap_df.isnull().sum().values)

# Report returns
# "sharpe" below is mean / standard deviation of the return series, with no risk-free
# rate subtracted and no annualization
print('equal weighted return:')
print(equal_df.apply(geometricAverageSimpleReturns, axis=0).values[0])
print('sharpe:')
print(np.mean(equal_df.r_t.values)/np.std(equal_df.r_t.values))
print('mcap weighted return:')
print(mcap_df.apply(geometricAverageSimpleReturns, axis=0).values[0])
print('sharpe:')
print(np.mean(mcap_df.mcap_r_t.values)/np.std(mcap_df.mcap_r_t.values))

# Form the returns by year
# the helper 'year' column is added for grouping and dropped again afterwards
equal_df['year'] = equal_df.index.year
mcap_df['year'] = mcap_df.index.year
print('equal weighted return:')
print(equal_df.groupby('year').apply(geometricAverageSimpleReturns))
print('mcap weighted return:')
print(mcap_df.groupby('year').apply(geometricAverageSimpleReturns))
equal_df = equal_df.drop('year', axis=1)
mcap_df  = mcap_df.drop('year', axis=1)


In [None]:
# FOR CLEANING SCRIPT:

# ensure each asset does not appear before first date nor after last date
# cmc_ids = np.unique(panel_df.cmc_id.values)
# for cmc_id in cmc_ids:
#     print(cmc_id)
#     first_date = cw_df[cw_df.cmc_id==cmc_id].first_date_cmc.values[0]
#     last_date  = cw_df[cw_df.cmc_id==cmc_id].last_date_cmc.values[0]
#     assert(0==panel_df[(panel_df.cmc_id==cmc_id)&(panel_df.date<first_date)].shape[0])
#     assert(0==panel_df[(panel_df.cmc_id==cmc_id)&(panel_df.date>last_date)].shape[0])

# ensure each asset has consecutive data, interpolate where needed with forward fill
# NOTE(review): relies on `panel_df` still having a 'cmc_id' column -- confirm which stage
#               of the pipeline this cell is meant to follow.

# group the data by cmc_id to loop over
grouped = panel_df.groupby('cmc_id')

# iterate through each cmc_id
dfs = []
for name, group in grouped:
    # find the first and last dates for the current id
    first_date = group['date'].min()
    last_date  = group['date'].max()

    # create a new dataframe with all the possible combinations of cmc_id and date
    dates = pd.date_range(first_date, last_date)
    index = pd.MultiIndex.from_product([[name], dates], names=['cmc_id', 'date'])
    full_df = pd.DataFrame(index=index).reset_index()

    # merge the full dataframe with the original dataframe to fill in missing values with NaNs
    merged_df = pd.merge(full_df, group, on=['cmc_id', 'date'], how='left')

    # interpolate the missing values using forward fill for up to 21 consecutive observations
    # (DataFrame.ffill replaces the deprecated fillna(method='ffill'))
    interpolated_df = merged_df.ffill(limit=21)

    # Check if there are any missing values in the remaining columns for the current id and date range
    # the count is divided by 3 on the assumption that the three data columns are missing together
    # NOTE: the loop stops at the first id that still has gaps; later ids are not processed
    if interpolated_df.isnull().values.any():
        print(f"ID {name} has missing values in the given date range, precisely: {int(interpolated_df.isnull().sum().sum()/3)}.")
        break

    # combine    
    dfs.append(interpolated_df)

# Combine all the dataframes
# NOTE(review): reset_index() without drop=True keeps the old per-asset row positions as an
#               extra 'index' column -- confirm whether that column is wanted.
result_df = pd.concat(dfs).reset_index()




In [None]:
# FOR INCLUSION CRIT SCRIPT


# Jan 1 2015 - $5B - $500k
# Jan 1 2016 - $7B - $700k
# Jan 1 2017 - $18B - $1.8M
# Jan 1 2018 - $600B - $60M
# Apr 1 2018 - $300B - $30M
# Jul 1 2018 - $250B - $25M
# Jan 1 2019 - $125B - $12M
# Apr 1 2019 - $145B - $14M
# Jul 1 2019 - $330B - $33M
# Oct 1 2019 - $220B - $22M
# Jan 1 2020 - $200B - $20M
# Apr 1 2020 - $175B - $17M
# Jul 1 2020 - $260B - $26M
# Oct 1 2020 - $340B - $34M
# Jan 1 2021 - $770B - $77M
# Apr 1 2021 - $1.9T - $190M
# Jul 1 2021 - $1.4T - $140M
# Oct 1 2021 - $2T - $200M

In [531]:
# OBTAIN CMC COVARIATES AT DAILY LEVEL FOR ALL TOKENS
# NOTE: THIS TAKES 40K CREDITS AND ABOUT 60 MINUTES!
#
# NOTE(review): this cell reads `session` and `base_url` from the kernel; in the visible
#               cells they are only created inside the `if __name__ == "__main__":` block.
# NOTE(review): the loop variable `date` rebinds the module-level name imported via
#               `from datetime import date`, and `dates` is also reused by other cells.

# Form list of strings of all dates in study period
dates = list(pd.date_range('2015-01-01', '2022-01-07', freq='D').strftime('%Y-%m-%d'))
             
# Initialize dictionary for the data (one list per output column, appended row by row)
cmc_covars_dict = {'date': [],
                   'cmc_id': [],
                   'num_market_pairs': [],
                   'max_supply': [],
                   'circulating_supply': [],
                   'total_supply': [],
                   'cmc_rank': [],
                   'tags': []}

for date in dates: 
    # Update where we are
    print(date)
    print('\n')
    
    # Set up the call
    endpoint = '/v1/cryptocurrency/listings/historical'
    final_url = base_url+endpoint
    parameters = {'date': date,
                  'limit': 5000,
                  'convert': 'USD',
                  'aux': 'tags,circulating_supply,total_supply,max_supply,cmc_rank,num_market_pairs'}

    # Make the call
    # up to nb_tries attempts: leave the loop on a response with no error message, retry on
    # an "out of range" error or a connection-level exception, and fail on any other error
    nb_tries = 3
    while True:
        nb_tries -= 1
        try:
            response = session.get(final_url, params=parameters)
            r_json = json.loads(response.text)
            if (r_json['status']['error_message'] == None):
                break
            elif (r_json['status']['error_message'][:29] == 'Search query is out of range.'):
                print('error due to out of range')
                time.sleep(1)
                if nb_tries <= 0:
                    assert(1==0),'out of range error occured several times'
            else:
                # any other API-reported error aborts immediately
                assert(1==0),'json has error'

        except (ConnectionError, Timeout, TooManyRedirects) as err:
            # re-raise once the attempts are used up, otherwise wait and try again
            if nb_tries <= 0:
                raise err
            else:
                print('error due to connection, timeout, or redirect')
                time.sleep(1)

    # Add the data for that day to the dictionary
    for token in r_json['data']:
        cmc_covars_dict['date'].append(date)
        cmc_covars_dict['cmc_id'].append(token['id'])
        cmc_covars_dict['num_market_pairs'].append(token['num_market_pairs'])
        cmc_covars_dict['max_supply'].append(token['max_supply'])
        cmc_covars_dict['circulating_supply'].append(token['circulating_supply'])
        cmc_covars_dict['total_supply'].append(token['total_supply'])
        cmc_covars_dict['cmc_rank'].append(token['cmc_rank'])
        cmc_covars_dict['tags'].append(token['tags'])

    # Delay next call to not break limits
    time.sleep(1)
    

2020-09-22


2020-09-23


2020-09-24


2020-09-25


2020-09-26


2020-09-27


2020-09-28


2020-09-29


2020-09-30


2020-10-01


2020-10-02


2020-10-03


2020-10-04


2020-10-05


2020-10-06


2020-10-07


2020-10-08


2020-10-09


2020-10-10


2020-10-11


2020-10-12


2020-10-13


2020-10-14


2020-10-15


2020-10-16


2020-10-17


2020-10-18


2020-10-19


2020-10-20


2020-10-21


2020-10-22


2020-10-23


2020-10-24


2020-10-25


2020-10-26


2020-10-27


2020-10-28


2020-10-29


2020-10-30


2020-10-31


2020-11-01


2020-11-02


2020-11-03


2020-11-04


2020-11-05


2020-11-06


2020-11-07


2020-11-08


2020-11-09


2020-11-10


2020-11-11


2020-11-12


2020-11-13


2020-11-14


2020-11-15


2020-11-16


2020-11-17


2020-11-18


2020-11-19


2020-11-20


2020-11-21


2020-11-22


2020-11-23


2020-11-24


2020-11-25


2020-11-26


2020-11-27


2020-11-28


2020-11-29


2020-11-30


2020-12-01


2020-12-02


2020-12-03


2020-12-04


2020-12-05


2020-12-06


2020-12-07



In [535]:
# DETERMINE RELEVANT EXCHANGES TO PULL HISTORICAL DATA ON
#
# Requests the map of active exchanges and keeps only each exchange's id and slug
# in `exchange_df`. Relies on `session` and `base_url` from earlier cells.

# Set up the call
endpoint = '/v1/exchange/map'
final_url = base_url+endpoint
parameters = {'listing_status': 'active',
              'limit': 500,
              'aux': 'first_historical_data'}

# Make the call
response = session.get(final_url, params=parameters)
r_json = json.loads(response.text)

# Fail with the API's own status instead of a bare KeyError when no data came back
if 'data' not in r_json:
    raise RuntimeError(f"exchange map call returned no data: {r_json.get('status')}")

# Clean it up: build the frame from the list of records in a single constructor
# call rather than concatenating one single-row frame per exchange
exchange_df = pd.DataFrame(r_json['data'])
exchange_df = exchange_df.reset_index(drop=True)
exchange_df = exchange_df.rename(columns={'id': 'exchange_id',
                                          'slug': 'exchange_slug'})
exchange_df = exchange_df[['exchange_id', 'exchange_slug']]

In [536]:
# OBTAIN METADATA
# Requests 'date_launched' for every exchange in `exchange_df` and writes it into
# a 'date_launched' column of that frame.

# Set up the call; all exchange ids are sent as one comma-separated string
endpoint = '/v1/exchange/info'
final_url = base_url+endpoint
exchange_ids = ','.join(map(str, exchange_df.exchange_id.values))
parameters = {'id': exchange_ids,
              'aux': 'date_launched'}

# Make the call
response = session.get(final_url, params=parameters)
r_json = json.loads(response.text)

# Add date launched to the data frame; each key of the response's 'data' is
# converted with int() before being matched against `exchange_id`
for key, ex_info in r_json['data'].items():
    is_this_exchange = exchange_df.exchange_id == int(key)
    exchange_df.loc[is_this_exchange, 'date_launched'] = ex_info['date_launched']

In [537]:
# Dropping exchanges that do not have historical data
exchange_names_to_drop = [
    'feg-exchange',
    'uniswap-v3-arbitrum',
    'huckleberry',
    'photonswap-finance',
    'maiar-exchange',
    'katana',
    'kine-protocol-polygon',
    'bit2me',
    'balancer-v2-polygon',
    'balancer-v2-arbitrum',
    'uniswap-v3-polygon',
    'tinyman',
    'algebra',
    'kine-protocol-bsc',
    'btcex-exchange',
]

# Keep only the rows whose slug is not on the drop list
is_on_drop_list = exchange_df['exchange_slug'].isin(exchange_names_to_drop)
exchange_df = exchange_df[~is_on_drop_list]

In [538]:
# OBTAIN EXCHANGE HISTORICAL DATA
#
# For every exchange in `exchange_df`, request its daily quote history and append
# one row per returned quote to `ex_hist_data_dict`.
# Relies on `session`, `base_url` and `exchange_df` from earlier cells.

ex_hist_data_dict = {'exchange_id': [],
                     'date': [],
                     'exchange_volume_24h': [],
                     'num_market_pairs': []}

# The endpoint is the same for every exchange, so build the URL once
endpoint = '/v1/exchange/quotes/historical'
final_url = base_url+endpoint

# Loop over all exchanges, reading id and slug side by side instead of
# re-filtering the frame on every iteration just to print the slug
for exchange_id, exchange_slug in zip(exchange_df['exchange_id'].values,
                                      exchange_df['exchange_slug'].values):
    print(exchange_slug)

    # Set up the call
    parameters = {'id': exchange_id,
                  'time_start': '2015-01-01',
                  'time_end': '2021-12-31',
                  'interval': '1d',
                  'count': 10000,
                  'convert': 'USD'}

    # Make the call
    response = session.get(final_url, params=parameters)
    r_json = json.loads(response.text)

    # Fail with the exchange name and the API status rather than an opaque
    # KeyError/TypeError when the response carries no quotes
    ex_payload = r_json.get('data')
    if not isinstance(ex_payload, dict) or 'quotes' not in ex_payload:
        raise RuntimeError(f"no historical quotes returned for exchange {exchange_slug} "
                           f"(id {exchange_id}): {r_json.get('status')}")

    # Add the data to the dictionary
    for ex_data in ex_payload['quotes']:
        usd_quote = ex_data['quote']['USD']
        ex_hist_data_dict['exchange_id'].append(exchange_id)
        ex_hist_data_dict['date'].append(usd_quote['timestamp'])
        ex_hist_data_dict['exchange_volume_24h'].append(usd_quote['volume_24h'])
        ex_hist_data_dict['num_market_pairs'].append(ex_data['num_market_pairs'])

    # Sleep between calls
    time.sleep(1)

poloniex
bittrex
kraken
bleutrade
bittylicious
cex-io
bitfinex
hitbtc
exmo
okcoin
indodax
bitstamp
itbit
zaif
therocktrading
coinmate
zonda
coinbase-exchange
bitex-la
bitonic
yobit
huobi-global
litebit
coincheck
liquid
southxchange
bitso
btcbox
coincorner
bitflyer
isx
gemini
dex-trade
exrates
bitmex
independent-reserve
luno
coinone
bisq
korbit
bithumb
lykke-exchange
kuna
mercatox
p2pb2b
tidex
heat-wallet
freiexchange
btc-markets
paribu
btc-alpha
coingi
ripplefox
gatehub
coss
btcturk-pro
stex
waves-exchange
koinim
stellar-decentralized-exchange
buda
btc-trade-ua
localtrade
bitbank
mercado-bitcoin
altcoin-trader
bancor-network
binance
bits-blockchain
tidebit
cryptomarket
okx
gate-io
idex
kucoin
bitcointrade
topbtc
aex
coinfalcon
coinut
satang-pro
zb-com
bigone
lbank
gopax
bibox
coinbene
coinex
upbit
tradeogre
c-patex
crxzone
fatbtc
paymium
ddex
rudex
zebpay
bitbns
unocoin
latoken
crex24
bithesap
cryptonex
cointiger
b2bx
dragonex
hotbit
switcheo
bitforex
kyber-network
coindeal
bitmart
dig

In [539]:
# Turn the collected exchange history into a dataframe (one column per dictionary key)
ex_historical_df = pd.DataFrame(data=ex_hist_data_dict)

## (4) Save all the data

In [542]:
# Write the cmc token covars panel to CSV, leaving out the row index
cmc_covars_path = '../3-data/raw/cmc_token_covars_panel.csv'
cmc_covars_df.to_csv(cmc_covars_path, index=False)

In [543]:
# Write the cmc macro timeseries data to CSV, leaving out the row index
macro_path = '../3-data/raw/cmc_macro_timeseries.csv'
macro_df.to_csv(macro_path, index=False)

In [544]:
# Write the cmc exchange covariates to CSV, leaving out the row index
exchange_covar_path = '../3-data/raw/cmc_exchange_covar.csv'
exchange_df.to_csv(exchange_covar_path, index=False)

In [545]:
# Write the cmc exchange panel data to CSV, leaving out the row index
exchange_panel_path = '../3-data/raw/cmc_exchange_panel.csv'
ex_historical_df.to_csv(exchange_panel_path, index=False)

In [None]:
# MOVE THESE NOTES TO CLEANING

# manually look through it to confirm they are legit tokens
# or maybe give this task to jacob
# or maybe schedule a time to do this with jacob so we 2x the speed

# Let's check whether the 0.01% mcap rule is good for the entire time period

# Jan 1 2015 - $5B - $500k
# Jan 1 2016 - $7B - $700k
# Jan 1 2017 - $18B - $1.8M
# Jan 1 2018 - $600B - $60M
# Apr 1 2018 - $300B - $30M
# Jul 1 2018 - $250B - $25M
# Jan 1 2019 - $125B - $12M
# Apr 1 2019 - $145B - $14M
# Jul 1 2019 - $330B - $33M
# Oct 1 2019 - $220B - $22M
# Jan 1 2020 - $200B - $20M
# Apr 1 2020 - $175B - $17M
# Jul 1 2020 - $260B - $26M
# Oct 1 2020 - $340B - $34M
# Jan 1 2021 - $770B - $77M
# Apr 1 2021 - $1.9T - $190M
# Jul 1 2021 - $1.4T - $140M
# Oct 1 2021 - $2T - $200M