In [324]:
import pandas as pd
import numpy as np
import pickle
import datetime

In [325]:
# file paths of the raw inputs (all pickles under ../data/raw)
cmc_asset_universe_fp = "../data/raw/cmc_asset_universe.pkl"
cmc_cw_fp = "../data/raw/cmc_cw.pkl"
cmc_panel_fp = "../data/raw/cmc_price_volume_mcap_panel.pkl"
cg_cw_fp = "../data/raw/coingecko_cmc_cw.pkl"
cg_panel_fp = "../data/raw/coingecko_price_volume_mcap_panel.pkl"
cm_cw_fp = "../data/raw/coinmetrics_cmc_cw.pkl"
cm_asset_info_fp = "../data/raw/coinmetrics_assets_first_tradable.pkl"
cm_panel_fp = "../data/raw/coinmetrics_initial_panel.pkl"

# read the asset universe with the plain pickle module
# NOTE: unpickling executes arbitrary code, so these files must come from a trusted source
with open(cmc_asset_universe_fp, "rb") as universe_file:
    cmc_asset_universe_dict = pickle.load(universe_file)

# read the crosswalks (cw), the panels, and the coinmetrics asset info, in the order listed
(cmc_cw_df, cmc_panel_df,
 cg_cw_df, cg_panel_df,
 cm_cw_df, cm_asset_df, cm_panel_df) = map(
    pd.read_pickle,
    (cmc_cw_fp, cmc_panel_fp,
     cg_cw_fp, cg_panel_fp,
     cm_cw_fp, cm_asset_info_fp, cm_panel_fp))


In [326]:
# TODO FORM SINGLE PANEL

# clean cws and panels before merge
#
# Every rename is applied BEFORE a column is read by name, and both date conversions go through
# pd.to_datetime, so this cell gives the same result when it is re-executed in a live kernel.
# (Previously a second run failed: `asset_cmc` / `time` no longer existed after the rename and
# `.dt` is not available once the column already holds datetime.date objects.)

# coinmetrics <-> cmc crosswalk: harmonise the key name, then drop the excluded mappings
# NOTE(review): presumably these ids are excluded so that slug_cmc is unique for the
# many_to_one merges further down -- confirm
cm_cw_df = cm_cw_df.rename(columns={'asset_cmc': 'slug_cmc'})
cm_cw_df = cm_cw_df[~cm_cw_df.asset_cm.isin(['xno'])]
cm_cw_df = cm_cw_df[~cm_cw_df.slug_cmc.isin(['aave-old'])]

# coingecko <-> cmc crosswalk: same treatment
cg_cw_df = cg_cw_df.rename(columns={'asset_cmc': 'slug_cmc'})
cg_cw_df = cg_cw_df[~cg_cw_df.slug_cmc.isin(['cronos', 'aave-old', 'yearn-finance-ii'])]

# all three panels must carry plain datetime.date objects in `date` so the merge keys compare equal
cmc_panel_df['date'] = pd.to_datetime(cmc_panel_df['date']).dt.date
# exact type check on purpose: datetime.datetime is a subclass of date but would not match the keys
assert type(cg_panel_df.date.values[0]) is datetime.date
cm_panel_df = cm_panel_df.rename(columns={'asset': 'asset_cm', 'time': 'date'})
cm_panel_df['date'] = pd.to_datetime(cm_panel_df['date']).dt.date

# coinmetrics asset info uses the same asset key name as the coinmetrics panel
cm_asset_df = cm_asset_df.rename(columns={'asset': 'asset_cm'})

# merge panels together

# attach the coingecko ids to the cmc panel through the crosswalk; a left many-to-one join
# must not change the number of rows, which the assert double checks
panel_df = pd.merge(cmc_panel_df, cg_cw_df,
                    how='left', on='slug_cmc', validate='many_to_one')
assert panel_df.shape[0] == cmc_panel_df.shape[0]

# then, in order: the coingecko panel (outer), the coinmetrics ids for crosswalk rows that
# have a cmc slug (left), and finally the coinmetrics panel (outer)
panel_df = (
    panel_df
    .merge(cg_panel_df,
           how='outer', on=['date', 'asset_gecko'], validate='many_to_one')
    .merge(cm_cw_df[cm_cw_df.slug_cmc.notnull()],
           how='left', on='slug_cmc', validate='many_to_one')
    .merge(cm_panel_df,
           how='outer', on=['date', 'asset_cm'], validate='many_to_one')
)

# cut down to window of interest
panel_df = panel_df[panel_df.date.apply(lambda x: x.year) >=2015]

# subset to assets with tradable market on coinmetrics and the dates after it is first tradable
# (the asset info frame keys its assets by `asset_cm`, the same column the loop below reads;
#  there is no `asset` column on it anymore after the rename in the cleaning step)
panel_df = panel_df[panel_df.asset_cm.isin(cm_asset_df.asset_cm.values)].copy()
assets = list(np.unique(panel_df.asset_cm.values))

# Collect the too-early rows of every asset in one boolean mask and filter once at the end,
# instead of re-slicing the whole panel for each asset. The per-asset masks are disjoint, so
# the reported counts and the surviving rows are the same as when filtering asset by asset.
before_first_tradable = pd.Series(False, index=panel_df.index, dtype=bool)
for asset in assets:
    # first matching row of the asset info frame; the date is stored as a 'YYYY-MM-DD' string
    date_first_tradable = cm_asset_df[cm_asset_df.asset_cm==asset].date_first_tradable.values[0]
    date_first_tradable = datetime.datetime.strptime(date_first_tradable, '%Y-%m-%d').date()
    asset_too_early = (panel_df.asset_cm==asset) & (panel_df.date<date_first_tradable)
    rows_lost = int(asset_too_early.sum())
    if rows_lost:
        print(f"For {asset} we lost {rows_lost} rows.")
    before_first_tradable |= asset_too_early

panel_df = panel_df[~before_first_tradable]


In [None]:
# CLEAN PRICE COLUMN

# keep only the rows for which at least one of the three sources reports a price
panel_df = panel_df[panel_df.ReferenceRateUSD.notnull()
                    | panel_df.usd_per_token_cmc.notnull()
                    | panel_df.usd_per_token_cg.notnull()]

# form the price column: the reference rate where it exists,
# otherwise the average of whichever of the cmc / cg prices are available
panel_df['usd_per_token'] = np.nan
panel_df.loc[panel_df.ReferenceRateUSD.notnull(), 'usd_per_token'] = panel_df['ReferenceRateUSD']
cmc_cg_avg_price = panel_df[['usd_per_token_cmc', 'usd_per_token_cg']].mean(axis=1, skipna=True)
panel_df.loc[panel_df.usd_per_token.isnull(), 'usd_per_token'] = cmc_cg_avg_price

# remove rows where the price between cmc and cg is different by more than 50% (relative to cmc);
# only applies when there is no reference rate and both cmc and cg report a price
cmc_cg_price_gap = np.abs((panel_df.usd_per_token_cmc - panel_df.usd_per_token_cg)
                          / panel_df.usd_per_token_cmc)
cmc_cg_disagree = (panel_df.ReferenceRateUSD.isnull()
                   & panel_df.usd_per_token_cmc.notnull()
                   & panel_df.usd_per_token_cg.notnull()
                   & (cmc_cg_price_gap > 0.5))
panel_df = panel_df[~cmc_cg_disagree]

# keep just the final price
source_price_cols = ['usd_per_token_cmc', 'usd_per_token_cg',
                     'PriceUSD', 'ReferenceRate', 'ReferenceRateUSD']
panel_df = panel_df.drop(columns=source_price_cols)

# convert dtype
panel_df['usd_per_token'] = panel_df['usd_per_token'].astype(float)

# CLEAN MCAP COLUMN

# cast the coinmetrics estimate to float BEFORE testing it against zero, so the zero check
# compares numbers even when the raw column is not numeric (no-op if it already is float)
panel_df['CapMrktEstUSD'] = panel_df.CapMrktEstUSD.astype(float)

# set any zeros to missing
panel_df.loc[panel_df.CapMrktEstUSD==0, 'CapMrktEstUSD'] = np.nan
panel_df.loc[panel_df.usd_mcap_cg==0, 'usd_mcap_cg'] = np.nan
panel_df.loc[panel_df.usd_mcap_cmc==0, 'usd_mcap_cmc'] = np.nan

# drop if there is no mcap data; this runs AFTER zeros became missing so that rows whose only
# mcap values were zeros are dropped here instead of tripping the not-null assert below
panel_df = panel_df[~(panel_df.usd_mcap_cmc.isnull()
                    & panel_df.usd_mcap_cg.isnull()
                    & panel_df.CapMrktEstUSD.isnull())].copy()

# form the mcap column as the average of whichever sources are available
panel_df['usd_mcap'] = panel_df[['CapMrktEstUSD', 'usd_mcap_cg', 'usd_mcap_cmc']].mean(axis=1, skipna=True)
assert 0 == panel_df.usd_mcap.isnull().sum()

# drop rows where mcaps between cg and cmc are more than order of magnitude off when we are missing CM values.
# The gap is measured relative to the SMALLER of the two values: relative to cmc alone the ratio
# can never exceed 1 when cg is below cmc, so a cg value far too small was never caught.
# For cg above cmc this is the same test as before.
cg_cmc_mcap_gap = (np.abs(panel_df.usd_mcap_cg - panel_df.usd_mcap_cmc)
                   / panel_df[['usd_mcap_cg', 'usd_mcap_cmc']].min(axis=1))
panel_df = panel_df[~(panel_df.CapMrktEstUSD.isnull() & ~panel_df.usd_mcap_cg.isnull() & ~panel_df.usd_mcap_cmc.isnull()
                      & (cg_cmc_mcap_gap > 10))]

# keep just the final mcap
panel_df = panel_df.drop(columns=['usd_mcap_cmc', 'usd_mcap_cg',
                                  'CapMrktCurUSD', 'CapMrktEstUSD', 'CapMrktFFUSD', 'CapRealUSD'])

In [None]:
# TODO come back to create the cm market cap measure using supply for where it is missing if useful

In [202]:
# TODO COINAPI
# for a broad cm asset universe
# pull all exchange prices from a set of legit exchanges (maybe the cm set?)
# pull associated volume data
# use this as my legit volume estimate 
# pull any mcap data that they have too

In [None]:
# TODO use coinapi for my volume estimate of LEGIT exchanges; maybe cross check with others.

In [None]:

# TODO subset down to date, asset, price, mcap, and volume and other useful variables
# TODO look for continuity within asset. look at returns to see if anything crazy. look if mcap jump is way diff than price jump.
# TODO make sure ranges of values looks good
# TODO go scope old cleaning scripts to make sure I do all of that too
# TODO apply the inclusion criteria on the first of each month

In [None]:

# TODO manually check that the universe makes sense for maybe 6-10 random samples from the first years and
# 3-6 random samples from the last 2-3 years?


# TODO output the cmc, coingecko, and cm cw for this universe as well as a dictionary of the cmc ids at the start of each month

# TODO convert all the code to functions with professional documentation