In [187]:
import pandas as pd
import numpy as np
import pickle
from dateutil.relativedelta import relativedelta
from datetime import date

In [182]:
# Helper function to ensure one-to-one mapping between two ID columns
def isOneToOne(temp_df, col1, col2):
    """Return True when col1 and col2 of temp_df map to each other one-to-one.

    The mapping is one-to-one when every distinct value of col1 is paired with
    exactly one distinct value of col2 and vice versa. Repeated rows carrying
    the same (col1, col2) pair are allowed.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Frame holding both ID columns.
    col1, col2 : str
        Names of the two ID columns to compare.

    Returns
    -------
    bool
        True if the mapping is one-to-one, False otherwise (including when the
        two columns have a different number of distinct values).

    Notes
    -----
    Rows with a missing value in either column are ignored.
    """
    # Comparing sorted group sizes is not sufficient: a full cross product such
    # as (a,x), (a,y), (b,x), (b,y) has matching group sizes on both sides but
    # is many-to-many. Checking uniqueness on the distinct pairs is exact.
    pairs = temp_df[[col1, col2]].dropna().drop_duplicates()
    return bool(pairs[col1].is_unique and pairs[col2].is_unique)

In [183]:
if __name__ == "__main__":
    # Locations of the two pickled inputs: the token universe (crosswalk)
    # and the price / volume / mcap panel.
    cw_in_fp, panel_in_fp = (
        "../data/derived/cmc_token_universe.pkl",
        "../data/derived/cmc_price_volume_mcap_panel.pkl",
    )

    # Load both frames, crosswalk first and panel second.
    # (Pickle files should only ever be loaded from trusted sources.)
    cw_df, panel_df = (pd.read_pickle(in_fp) for in_fp in (cw_in_fp, panel_in_fp))


In [184]:
# manually remove tokens from panel
tokens_to_remove = [770, 776, 3787, 8644, 9103]
panel_df = panel_df[~panel_df.cmc_id.isin(tokens_to_remove)]

# attach the cmc slug to every panel row; cmc_id is not carried forward
# because only the columns selected below are kept.
# NOTE: the inner join silently drops panel rows whose cmc_id is absent from
# cw_df; validate='many_to_one' raises if cmc_id is duplicated in cw_df.
df = panel_df.merge(cw_df[['cmc_id', 'slug_cmc']],
                    on='cmc_id',
                    how='inner',
                    validate='many_to_one')
df = df[['date', 'slug_cmc', 'usd_per_token_cmc', 'usd_mcap_cmc', 'usd_volume_24h_cmc']]
df = df.sort_values(by=['date', 'slug_cmc'], ignore_index=True)

# adjust particular values: missing volume for uquid-coin is set to zero
df.loc[(df.slug_cmc == 'uquid-coin') &
       df.usd_volume_24h_cmc.isnull(), 'usd_volume_24h_cmc'] = 0

# ensure no missing in the df
assert df.isnull().sum().sum() == 0, 'there are missing values in the data'

# ensure unique on key columns
# `not` is used instead of `~`: bitwise negation of a plain Python bool is
# always truthy (~True == -2, ~False == -1), which would disable the check.
dups = df.duplicated(subset=['date', 'slug_cmc'])
assert not dups.any(), 'there are duplicates in the data on keys date and slug_cmc'

# drop more tokens manually
# NOTES: ampleforth is a stablecoin, pax gold is a gold stablecoin, index, and wrapped tokens
wrapped_tokens_to_drop = ['ampleforth', 'cryptoindex-com-100', 'pax-gold',
                          'wrapped-centrifuge', 'wrapped-luna-token', 'wrapped-ncg', 'wrapped-nxm']
df = df[~df.slug_cmc.isin(wrapped_tokens_to_drop)]


In [194]:
# Build the list of month-start dates (as 'YYYY-MM-DD' strings) running from
# start_date through end_date, both inclusive.
start_date = date(2015, 1, 1)
end_date   = date(2022, 12, 1)

# the first date is always recorded; later months are added while in range
current_date = start_date
dates = [current_date.strftime('%Y-%m-%d')]
current_date = current_date + relativedelta(months=1)
while not (current_date > end_date):
    dates += [current_date.strftime('%Y-%m-%d')]
    current_date = current_date + relativedelta(months=1)

In [None]:
# APPLY VOLUME AND MCAP FILTERS
#
# For every pair of consecutive entries in `dates` the window is
# [dates[i-1], dates[i]] (both ends inclusive). The candidate tokens are those
# with a row on the window's first day; tokens failing any filter below are
# removed. One sorted list of surviving slugs per window is appended to
# token_universe_per_quarter, so the list ends up with len(dates)-1 entries.
min_obs_per_window  = 28       # 28 days ensures at least 4 weeks of data
min_usd_volume_24h  = 10000    # lowest acceptable daily volume inside the window
mcap_share_of_total = 0.00001  # min average mcap as a share of total market cap
max_cmc_rank        = 1000     # worst acceptable CMC rank inside the window

token_universe_per_quarter = []

for i in range(1, len(dates)):
    # determine start and end dates for window
    start_date = dates[i-1]
    end_date   = dates[i]

    # build temporary dataframe for this time period
    temp_df = df[(df.date >= start_date) & (df.date <= end_date)].copy()
    start_rows = temp_df[temp_df.date == start_date]

    # obtain set of tokens to consider
    assets_included = set(start_rows.slug_cmc.values)

    # figure out tokens removed due to insuff data
    asset_ns_df = temp_df.groupby('slug_cmc').size()
    assets_lost_given_insuff_data = list(asset_ns_df[asset_ns_df < min_obs_per_window].index.values)
    print('Coins lost given less than 28 days of data:')
    print(np.unique(assets_lost_given_insuff_data))
    print('\n')
    assets_included.difference_update(assets_lost_given_insuff_data)

    # Figure out tokens removed due to volume threshold
    # (a token is dropped if its lowest daily volume in the window is too small)
    temp_vol_df = temp_df.groupby('slug_cmc').usd_volume_24h_cmc.min()
    assets_lost_given_insuff_vol = list(temp_vol_df[temp_vol_df < min_usd_volume_24h].index.values)
    print('Coins lost given less than $10,000 on some day:')
    print(np.unique(assets_lost_given_insuff_vol))
    print('\n')
    assets_included.difference_update(assets_lost_given_insuff_vol)

    # Figure out tokens removed due to mcap threshold
    # The total market cap is read on the window's first day.
    # NOTE(review): total_market_cap_cmc is not among the columns kept when df
    # is built, so when it is absent the summed market cap of the tokens in df
    # on that day is used instead. That sum understates the true total (tokens
    # outside the panel are missing) - confirm this fallback is acceptable.
    if 'total_market_cap_cmc' in start_rows.columns and len(start_rows) > 0:
        total_mcap = start_rows.total_market_cap_cmc.values[0]
    else:
        total_mcap = start_rows.usd_mcap_cmc.sum()
    mcap_threshold = mcap_share_of_total*total_mcap
    print('MCAP threshold is: ' + str(round(mcap_threshold, 0)))
    temp_mcap_df = temp_df.groupby('slug_cmc').usd_mcap_cmc.mean()
    assets_lost_given_mcap_threshold = list(temp_mcap_df[temp_mcap_df < mcap_threshold].index.values)
    print('Coins lost given less than mcap threshold:')
    print(np.unique(assets_lost_given_mcap_threshold))
    print('\n')
    assets_included.difference_update(assets_lost_given_mcap_threshold)

    # Figure out tokens removed due to cmc rank threshold
    # NOTE(review): rank_cmc is not among the columns kept when df is built;
    # the filter is skipped (and says so) when the column is unavailable.
    print('Coins removed due to being below CMC rank of 1000:')
    if 'rank_cmc' in temp_df.columns:
        assets_below_rank_threshold = np.unique(temp_df[temp_df.rank_cmc > max_cmc_rank].slug_cmc.values)
        for asset in assets_below_rank_threshold:
            if asset in assets_included:
                print(asset)
                assets_included.remove(asset)
    else:
        print('(skipped: rank_cmc is not a column of df)')
    print('\n')

    # Report out new tokens ever
    # Every universe appended so far belongs to an earlier window; on the very
    # first window nothing has been seen yet, so all tokens are reported as new.
    print('New tokens that we have never had are ')
    all_tokens = set().union(*token_universe_per_quarter)
    print(np.array(sorted(assets_included.difference(all_tokens))))
    print('\n')

    # Report out tokens lost from last quarter
    # (the most recently appended universe is the previous window's)
    if token_universe_per_quarter:
        print('Tokens lost from last quarter are ')
        print(set(token_universe_per_quarter[-1]).difference(assets_included))
        print('\n')

    # Report out tokens for this quarter
    universe = sorted(assets_included)
    print('This quarter\'s tokens are:')
    print(np.array(universe))
    print(len(universe))
    print('\n\n\n')

    # Add tokens to list
    token_universe_per_quarter.append(universe)

In [None]:
# ensure each asset does not appear before first date nor after last date
#
# Done with a single merge instead of re-filtering both frames once per
# cmc_id, and with assertion messages that name the offending ids.
cmc_ids = np.unique(panel_df.cmc_id.values)

# indicator=True adds a '_merge' column so ids missing from cw_df can be told
# apart from ids that are present but have a null first/last date.
bounds_df = panel_df[['cmc_id', 'date']].merge(
    cw_df[['cmc_id', 'first_date_cmc', 'last_date_cmc']],
    on='cmc_id',
    how='left',
    validate='many_to_one',
    indicator=True)

unmatched_ids = np.unique(bounds_df.loc[bounds_df['_merge'] == 'left_only', 'cmc_id'].values)
assert len(unmatched_ids) == 0, f'cmc_ids in panel_df but missing from cw_df: {unmatched_ids}'

# pd.to_datetime makes the row-wise comparison work whether the bounds are
# stored as datetimes or as date strings; null bounds compare as False.
obs_dates = pd.to_datetime(bounds_df['date'])
too_early = obs_dates < pd.to_datetime(bounds_df['first_date_cmc'])
too_late  = obs_dates > pd.to_datetime(bounds_df['last_date_cmc'])

assert not too_early.any(), \
    f'rows before first_date_cmc for cmc_ids: {np.unique(bounds_df.loc[too_early, "cmc_id"].values)}'
assert not too_late.any(), \
    f'rows after last_date_cmc for cmc_ids: {np.unique(bounds_df.loc[too_late, "cmc_id"].values)}'

In [None]:
# BUILD TOKEN UNIVERSE DICTIONARY
#
# token_universe_per_quarter holds one list per window between consecutive
# dates, i.e. len(dates)-1 entries, so indexing it with range(len(dates))
# runs one past the end. zip pairs dates[k] with universe k and stops at the
# shorter sequence; the final date therefore gets no entry.
# NOTE(review): universe k is selected from data in the window that STARTS on
# dates[k]; keying it by dates[k] means the universe uses information from
# after its key date - confirm this is intended.
asset_universe_dict = dict(zip(dates, token_universe_per_quarter))

In [None]:
# EXPLORE RETURNS OF EQUAL- AND MCAP- WEIGHTED PORTFOLIOS
#
# NOTE(review): geometricAverageSimpleReturns is not defined in the visible
# part of this notebook; it has to be defined before this cell is run.

# Cut the panel down to just the assets of interest and the columns needed.
# A separate frame (returns_df) is used so that df itself is left untouched
# and re-running this cell gives the same result.
asset_universe_unique = list(np.unique([asset
                                        for sublist in token_universe_per_quarter
                                        for asset in sublist]))
returns_df = df.loc[df.slug_cmc.isin(asset_universe_unique),
                    ['date', 'slug_cmc', 'usd_per_token_cmc', 'usd_mcap_cmc', 'usd_volume_24h_cmc']]
returns_df = returns_df.sort_values(by=['slug_cmc', 'date'], ignore_index=True)

# Days since the previous observation of the SAME token (NaN on each token's
# first row), so a gap is never measured across two different tokens.
returns_df['day_diff'] = returns_df.groupby('slug_cmc')['date'].diff().dt.days

# Calculate day over day return within each token, then keep only rows whose
# previous observation is exactly one day earlier.
returns_df['r_t'] = returns_df.groupby('slug_cmc')['usd_per_token_cmc'].pct_change()
returns_df = returns_df[returns_df.day_diff == 1]
tokens_to_drop = np.unique(returns_df[returns_df.r_t.isnull()].slug_cmc.values)
returns_df = returns_df[~returns_df.slug_cmc.isin(tokens_to_drop)]
returns_df = returns_df.drop('day_diff', axis=1)

# Cut down to time period of interest
returns_df = returns_df[(returns_df.date.dt.year >= 2016) & (returns_df.date.dt.year <= 2021)]

# Ensure no missings
assert returns_df.isnull().sum().sum() == 0

# Clean up index and resort
returns_df = returns_df.sort_values(by=['date', 'slug_cmc'], ignore_index=True)

# Calculate equal and mcap weighted returns per universe date
# NOTE(review): the holding window is one month because the universe dates are
# built one month apart; a longer window would overlap the next universe's
# window and count the same calendar days more than once. Confirm.
holding_period = pd.DateOffset(months=1)
equal_frames = []
mcap_frames  = []
for universe_date in sorted(asset_universe_dict):
    # Set up window and asset universe
    window_start = pd.Timestamp(universe_date)
    window_end = window_start + holding_period
    asset_universe = asset_universe_dict[universe_date]

    # Subset to relevant data (copy so new columns can be added safely)
    in_window = (returns_df.date >= window_start) & (returns_df.date < window_end)
    temp_df = returns_df[in_window & returns_df.slug_cmc.isin(asset_universe)].copy()
    if temp_df.empty:
        continue

    # Form equal weighted returns
    equal_frames.append(temp_df.groupby('date')[['r_t']].mean())

    # Form mcap weighted returns
    temp_df['mcap_sum'] = temp_df.groupby('date')['usd_mcap_cmc'].transform('sum')
    temp_df['mcap_weight'] = temp_df.usd_mcap_cmc / temp_df.mcap_sum
    temp_df['mcap_r_t'] = temp_df.r_t * temp_df.mcap_weight
    mcap_frames.append(temp_df.groupby('date')[['mcap_r_t']].sum())

assert equal_frames, 'no returns fall inside any universe window'
equal_df = pd.concat(equal_frames)
mcap_df  = pd.concat(mcap_frames)

# Ensure no missing
assert equal_df.isnull().sum().sum() == 0
assert mcap_df.isnull().sum().sum() == 0

# Report returns
print('equal weighted return:')
print(equal_df.apply(geometricAverageSimpleReturns, axis=0).values[0])
print('sharpe:')
print(np.mean(equal_df.r_t.values)/np.std(equal_df.r_t.values))
print('mcap weighted return:')
print(mcap_df.apply(geometricAverageSimpleReturns, axis=0).values[0])
print('sharpe:')
print(np.mean(mcap_df.mcap_r_t.values)/np.std(mcap_df.mcap_r_t.values))

# Form the returns by year
equal_df['year'] = equal_df.index.year
mcap_df['year'] = mcap_df.index.year
print('equal weighted return:')
print(equal_df.groupby('year').apply(geometricAverageSimpleReturns))
print('mcap weighted return:')
print(mcap_df.groupby('year').apply(geometricAverageSimpleReturns))
equal_df = equal_df.drop('year', axis=1)
mcap_df  = mcap_df.drop('year', axis=1)

In [None]:
# Persist the date -> token list dictionary as a pickle, then release the
# panel dataframe so its memory can be reclaimed.
f = open('../3-data/clean/asset_universe_dates_and_lists.pkl', 'wb')
try:
    pickle.dump(asset_universe_dict, f)
finally:
    f.close()
del df

In [None]:
# MOVE THESE NOTES TO CLEANING

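# TODO: manually look through the resulting token list to confirm they are legit tokens
# (or maybe give this task to Jacob,
# or maybe schedule a time to do this with Jacob so we 2x the speed)

# Let's check whether the 0.01% mcap rule is reasonable for the entire time period.
# Each line below reads: date - total market cap - 0.01% of the total market cap.
# NOTE: the filter code multiplies the total market cap by 0.00001 (0.001%),
# not 0.0001 (0.01%) - confirm which threshold is intended.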

# Jan 1 2015 - $5B - $500k
# Jan 1 2016 - $7B - $700k
# Jan 1 2017 - $18B - $1.8M
# Jan 1 2018 - $600B - $60M
# Apr 1 2018 - $300B - $30M
# Jul 1 2018 - $250B - $25M
# Jan 1 2019 - $125B - $12M
# Apr 1 2019 - $145B - $14M
# Jul 1 2019 - $330B - $33M
# Oct 1 2019 - $220B - $22M
# Jan 1 2020 - $200B - $20M
# Apr 1 2020 - $175B - $17M
# Jul 1 2020 - $260B - $26M
# Oct 1 2020 - $340B - $34M
# Jan 1 2021 - $770B - $77M
# Apr 1 2021 - $1.9T - $190M
# Jul 1 2021 - $1.4T - $140M
# Oct 1 2021 - $2T - $200M

In [None]:
# ensure each asset has consecutive data, interpolate where needed with forward fill

# longest run of consecutive missing days that is forward filled
max_ffill_days = 21

# group the data by cmc_id to loop over
grouped = panel_df.groupby('cmc_id')

# iterate through each cmc_id
dfs = []
for name, group in grouped:
    # find the first and last dates for the current id
    first_date = group['date'].min()
    last_date  = group['date'].max()

    # one row per calendar day between the id's first and last observation
    # (named all_days, not dates, so the notebook-level list of universe
    # dates is not overwritten when this cell runs)
    all_days = pd.date_range(first_date, last_date)
    full_df = pd.DataFrame({'cmc_id': name, 'date': all_days})

    # merge the full dataframe with the original dataframe to fill in missing values with NaNs
    merged_df = pd.merge(full_df, group, on=['cmc_id', 'date'], how='left')

    # forward fill gaps of up to max_ffill_days consecutive days; longer gaps keep NaNs
    interpolated_df = merged_df.ffill(limit=max_ffill_days)

    # Stop at the first id that still has missing values after filling.
    # NOTE(review): because of the break, result_df below only contains the
    # ids processed before the failing one - confirm this is intended.
    # The division by 3 presumably converts a cell count into a row count for
    # three value columns - confirm against panel_df's columns.
    if interpolated_df.isnull().values.any():
        print(f"ID {name} has missing values in the given date range, precisely: {int(interpolated_df.isnull().sum().sum()/3)}.")
        break

    # combine
    dfs.append(interpolated_df)

# Combine all the dataframes under a fresh 0..n-1 index (ignore_index avoids
# carrying the per-id row numbers along as an extra 'index' column)
if dfs:
    result_df = pd.concat(dfs, ignore_index=True)
else:
    # nothing was collected (no ids, or the first id already failed the check)
    result_df = pd.DataFrame(columns=panel_df.columns)


