In [301]:
import pandas as pd
import pickle
from helper_functions import Helper
from typing import List

In [302]:
# Messari sector -> industry label. A row has exactly one sector, so these
# rules are mutually exclusive; they take precedence over the fallbacks below.
_MESSARI_SECTOR_TO_INDUSTRY = {
    'Smart Contract Platforms': 'smart_contract',
    'Data Management': 'data_mgmt',
    'File Storage': 'data_mgmt',
    'Interoperability': 'interop',
    'Shared Compute': 'cloud_compute',
    'Shared compute': 'cloud_compute',
    'Decentralized Exchanges': 'dex',
    'Lending': 'lending',
    'Centralized Exchanges': 'cex',
    'Gaming': 'gaming',
    'Asset Management': 'asset_mgmt',
    'Currencies': 'currency',
}

# Messari category -> industry label, used only when no sector rule matched.
_MESSARI_CATEGORY_TO_INDUSTRY = {
    'Infrastructure': 'infra',
    'Financial': 'other_defi',
    'Media and Entertainment': 'media',
}

# Last-resort sector fallback, used only when neither table above matched.
_MESSARI_FALLBACK_SECTOR_TO_INDUSTRY = {
    'Payment Platforms': 'infra',
}

# Manual per-asset industry labels; these always win over the rules above.
_MESSARI_ASSET_INDUSTRY_OVERRIDES = {
    'xdc': 'infra', 'cvx': 'dex', 'jasmy': 'infra', 'bico': 'interop',
    'flux': 'cloud_compute', 'rbn': 'dex', 'stg': 'interop', 'knc': 'dex',
    'tribe': 'other_defi', 'pundix': 'infra', 'people': 'media',
    'c98': 'other_defi', 'mbox': 'gaming', 'gal': 'media', 'prom': 'gaming',
    'pha': 'cloud_compute', 'tlm': 'gaming', 'xyo': 'interop', 'klv': 'infra',
    'ogn': 'infra', 'sun': 'infra', 'super': 'media', 'kp3r': 'infra',
    'boson': 'media', 'tvk': 'media', 'dar': 'gaming',
    'aleph': 'cloud_compute', 'krl': 'asset_mgmt', 'inv': 'lending',
    'luna': 'infra', 'samo': 'media', 'gmt': 'gaming', 't': 'lending',
    'fx': 'infra', 'loka': 'gaming', 'crpt': 'infra', 'ctc': 'lending',
    'grin': 'currency', 'kcs': 'cex', 'cro': 'cex', 'mpl': 'asset_mgmt',
    'ava': 'smart_contract', 'shib': 'media', 'miota': 'infra',
    'cvc': 'infra', 'sgb': 'interop', 'powr': 'infra', 'ata': 'cloud_compute',
    'stmx': 'currency', 'qsp': 'infra', 'suku': 'infra',
}

# (keyword searched for in tokenUsage, indicator column it switches on).
# Order defines the column order of the output frame.
_MESSARI_USAGE_INDICATORS = [
    ('Payments', 'asset_usage_payments_messari'),
    ('Vote', 'asset_usage_vote_messari'),
    ('Work', 'asset_usage_work_messari'),
    ('Dividends', 'asset_usage_dividends_messari'),
    ('Access', 'asset_usage_access_messari'),
    ('Discount', 'asset_usage_discount_messari'),
]

# Manual per-asset usage flags: indicator column -> assets forced to 1.
_MESSARI_USAGE_OVERRIDES = {
    'asset_usage_vote_messari': ['fxs', 'luna', 'cream', 'aleph',
                                 'knc', 'rbn', 'cvx'],
    'asset_usage_discount_messari': ['kcs'],
    'asset_usage_work_messari': ['kp3r'],
    'asset_usage_access_messari': ['flux'],
}

# Messari names that collide with a CoinMetrics id but are a different asset.
_MESSARI_NAMES_TO_DROP = [
    'THORChain (ERC20)', 'MASK Vault (NFTX)', 'POLY Maximus',
    'Kyber Network', 'PlayChip', 'SubGame', 'Rare', 'Truebit', 'MIR COIN',
    'Acash Coin', 'Reef', 'Aztec Nodes SUN', 'SuperCoin', 'Qi Dao', 'ARPA',
    'Gas DAO', 'Jupiter', 'Anoncoin', 'Oxycoin', 'Wrapped Terra',
    'GoMining Token', 'Mercury Protocol', 'f(x) Coin',
]


def _messari_contains(text_col: pd.Series, needle: str) -> pd.Series:
    """ Boolean mask of rows whose text contains `needle` literally;
        missing values count as no match. """
    # cast to object so an all-missing (float) column still exposes `.str`
    return text_col.astype(object).str.contains(needle, regex=False, na=False)


def cleanMessari(m_df: pd.DataFrame, asset_universe_cm: List[str]) -> pd.DataFrame:
    """ Clean the Messari industry and consensus mech covariates with a ton of
        manual fixes... :(

    The input frame is not modified; all work happens on a copy.

    Args:
        m_df: raw Messari data with (at least) the columns `name`, `symbol`,
            `sector`, `category`, `tokenUsage` and `consensusAlgorithm`.
        asset_universe_cm: asset ids to keep. They are matched against the
            lower-cased Messari symbol and must be unique, because the merge
            is validated as many-to-one.

    Returns:
        One row per matched asset, sorted by `asset_cm` with a fresh index,
        with columns `asset_cm`, `pow_messari`, `pos_messari`, the six
        `asset_usage_*_messari` 0/1 indicators and `industry_messari`.

    Raises:
        AssertionError: if the cleaned frame contains any missing value.
        pandas.errors.MergeError: if `asset_universe_cm` has duplicate ids.
    """
    # work on a copy so the symbol fixes below do not leak into the caller's frame
    m_df = m_df.copy()

    # fix some asset errors
    m_df.loc[m_df['symbol'] == 'XDCE', 'symbol'] = 'XDC'
    m_df.loc[m_df['name'] == 'Flux', 'symbol'] = 'FLUX'

    # append missing assets (their covariates are filled by the manual overrides)
    missing_df = pd.DataFrame(data={'name': ['Crypterium', 'Creditcoin', 'Grin', 'Kucoin'],
                                    'symbol': ['CRPT', 'CTC', 'GRIN', 'KCS']})
    m_df = pd.concat([m_df, missing_df], ignore_index=True)

    # clean up messari names
    m_df['asset_m'] = m_df['symbol'].str.lower()
    m_df = m_df[~m_df.duplicated(subset='name')]
    # Literal substring matches. The previous pattern '(Wormhole)' was read as
    # a regex capture group, i.e. it already matched any name containing
    # 'Wormhole' (and raised a UserWarning); that matching is kept as-is.
    is_bridged = (_messari_contains(m_df['name'], 'Wormhole')
                  | _messari_contains(m_df['name'], 'Peg'))
    m_df = m_df[~is_bridged]

    # form coinmetrics asset ids
    cm_df = pd.DataFrame(data={'asset_cm': asset_universe_cm})

    # merge
    m_df = m_df.merge(cm_df, left_on='asset_m', right_on='asset_cm',
                      how='inner', validate='many_to_one')

    # drop duplicates
    m_df = m_df[~m_df['name'].isin(_MESSARI_NAMES_TO_DROP)]

    # keep only necessary columns; copy so the column assignments below
    # operate on an independent frame rather than a slice
    m_df = m_df[['asset_cm', 'sector', 'category', 'tokenUsage', 'consensusAlgorithm']].copy()

    # clean consensus algo (an asset may be flagged as both PoW and PoS)
    consensus = m_df['consensusAlgorithm']
    is_pow = (_messari_contains(consensus, 'Proof-of-Work')
              | _messari_contains(consensus, 'PoW'))
    is_pos = (_messari_contains(consensus, 'Proof-of-Stake')
              | _messari_contains(consensus, 'PoS'))
    m_df['pow_messari'] = is_pow.astype('int64')
    m_df['pos_messari'] = is_pos.astype('int64')
    m_df = m_df.drop('consensusAlgorithm', axis=1)

    # clean token usage to indicators
    usage = m_df['tokenUsage']
    for keyword, column in _MESSARI_USAGE_INDICATORS:
        m_df[column] = _messari_contains(usage, keyword).astype('int64')
    # assets without any usage info are treated as payment assets
    m_df.loc[usage.isnull(), 'asset_usage_payments_messari'] = 1
    for column, assets in _MESSARI_USAGE_OVERRIDES.items():
        m_df.loc[m_df['asset_cm'].isin(assets), column] = 1
    m_df = m_df.drop('tokenUsage', axis=1)

    # clean industry covars: sector rules first, then category fallbacks and
    # the payment-platform fallback for rows still unlabelled, then overrides
    m_df['industry_messari'] = ''
    for sector, industry in _MESSARI_SECTOR_TO_INDUSTRY.items():
        m_df.loc[m_df['sector'] == sector, 'industry_messari'] = industry
    for category, industry in _MESSARI_CATEGORY_TO_INDUSTRY.items():
        m_df.loc[(m_df['industry_messari'] == '')
                 & (m_df['category'] == category), 'industry_messari'] = industry
    for sector, industry in _MESSARI_FALLBACK_SECTOR_TO_INDUSTRY.items():
        m_df.loc[(m_df['industry_messari'] == '')
                 & (m_df['sector'] == sector), 'industry_messari'] = industry
    for asset, industry in _MESSARI_ASSET_INDUSTRY_OVERRIDES.items():
        m_df.loc[m_df['asset_cm'] == asset, 'industry_messari'] = industry
    m_df = m_df.drop(['sector', 'category'], axis=1)

    # ensure no missing
    # NOTE(review): an asset that matched no industry rule keeps the empty
    # string '' and is NOT caught here -- confirm whether that is intended.
    n_missing = m_df.isnull().sum().sum()
    assert 0 == n_missing, f'{n_missing} missing values left in cleaned Messari data'

    # sort and reindex
    m_df = m_df.sort_values(by='asset_cm', ignore_index=True)

    return m_df

In [303]:
# Script entry point: read the raw Messari CSV and the pickled asset universe,
# clean them with cleanMessari, and write the cleaned frame back out as a pickle.
if __name__ == "__main__":
    # set args (all paths are relative to the current working directory)
    M_IN_FP = '../data/raw/messari.csv'
    ASSET_IN_FP = '../data/derived/asset_universe_dict.pickle'
    OUT_FP = '../data/derived/messari.pkl'

    # import
    m_df = pd.read_csv(M_IN_FP)
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only point ASSET_IN_FP at trusted, locally generated pickles.
    with open(ASSET_IN_FP, "rb") as f:
        asset_universe_dict = pickle.load(f)
    # presumably collapses the dict into a list of unique CoinMetrics asset
    # ids -- Helper lives in helper_functions; confirm its contract there
    asset_universe_cm = Helper.findUniqueAssets(asset_universe_dict)

    # clean
    m_df = cleanMessari(m_df, asset_universe_cm)
    
    # output
    m_df.to_pickle(OUT_FP)


  m_df = m_df[~m_df.name.str.contains('(Wormhole)')]
