In [None]:
# TODO map asset universe to santiment assets
# TODO find all available metrics for these assets
# TODO manually cut down those metrics to just ones of interest to me
# TODO pull all metrics for all assets
# TODO figure out all possible macro metrics and cut down to ones of interest
# TODO pull all the macro metrics to form the macro timeseries

In [1]:
# Import packages
import pandas as pd
import numpy as np
import requests
import time
import datetime as dt
from requests.exceptions import ConnectionError
import san
from graphql import GraphQLError
import multiprocessing


In [2]:
# Read the Santiment API key: it is the first line of a local text file
fp = '../0-admin/santiment.txt'

with open(fp) as key_file:
    key_lines = key_file.readlines()

# Strip the trailing newline / surrounding whitespace from the first line
API_KEY = key_lines[0].strip()


In [3]:
# Set up API: hand the key read from file to the san client configuration
san.ApiConfig.api_key = API_KEY


In [4]:
# Test it is working: request the full project table and display it as the cell output
san.get("projects/all")


Unnamed: 0,marketSegment,name,slug,ticker,totalSupply
0,Blockchain Network,Enecuum,enecuum,ENQ,202238971.618
1,Interoperability,Cosmos,cosmos,ATOM,0
2,Financial,ZeuxCoin,zeuxcoin,ZUC,750000000
3,Energy,Treelion,treelion,TRN,1000000000
4,Lending,Kava,kava,KAVA,251944719
...,...,...,...,...,...
2463,DeFi,Synthetix [on Optimism],o-synthetix-network-token,SNX,215258834.2449152
2464,Stablecoin,USD Coin [on Optimism],o-usd-coin,USDC,53609131421.414055
2465,Stablecoin,Tether [on Optimism],o-tether,USDT,69158976373.90933
2466,Stablecoin,Dai [on Optimism],o-multi-collateral-dai,DAI,7268771320.62765


## (1) Determine the relevant universe from the coinmarketcap data

In [6]:
# Pull in the cmc data: the token universe and the macro timeseries
# (only the date and total market cap columns of the latter are needed)
cmc_df = pd.read_csv('../3-data/raw/cmc_token_universe.csv')
macro_df = pd.read_csv('../3-data/raw/cmc_macro_timeseries.csv')[['date', 'total_market_cap']]

# Attach the total market cap to every row of the price/volume/mcap panel by date;
# validate raises if a date appears more than once on the macro side
cmc_mcap_df = pd.merge(pd.read_csv('../3-data/raw/cmc_price_vol_mcap_panel.csv'),
                       macro_df,
                       how='inner',
                       on='date',
                       validate='many_to_one')


In [7]:
# Create the mcap threshold and drop tokens below the mcap and volume thresholds
cmc_mcap_df['mcap_threshold'] = 0.0001 * cmc_mcap_df['total_market_cap']

# A row survives only if it clears both the market cap and the 24h volume floor
passes_mcap   = cmc_mcap_df['usd_mcap'] > cmc_mcap_df['mcap_threshold']
passes_volume = cmc_mcap_df['usd_volume_24h'] >= 100000
cmc_mcap_df = cmc_mcap_df[passes_mcap & passes_volume]


In [8]:
# Cut down the cmc identifier df to those that pass threshold
passing_cmc_ids = np.unique(cmc_mcap_df['cmc_id'].values).tolist()
cmc_df = cmc_df[cmc_df['cmc_id'].isin(passing_cmc_ids)]


## (2) Map cmc tokens to santiment token crosswalk

In [253]:
# Pull in santiment ids: the project table returned by the "projects/all" query
# (the columns used later are name, slug, ticker, marketSegment and totalSupply)
san_ids_df = san.get("projects/all")


In [10]:
# Build the two slim crosswalk frames, prefixing column names with their source
# so that san_* and cmc_* columns stay distinguishable after merging
san_cw_df = (san_ids_df[['name', 'slug', 'ticker']]
             .copy()
             .rename(columns={'name': 'san_name',
                              'slug': 'san_slug',
                              'ticker': 'san_ticker'}))
cmc_cw_df = (cmc_df[['cmc_symbol', 'name', 'cmc_slug']]
             .copy()
             .rename(columns={'name': 'cmc_name'}))


In [11]:
# Merge on cmc data by slug

# Lower-case both slug columns so the join is case-insensitive
san_cw_df['san_slug'] = san_cw_df['san_slug'].str.lower()
cmc_cw_df['cmc_slug'] = cmc_cw_df['cmc_slug'].str.lower()

# Left join keeps every cmc token; san_slug is null where no slug matched
san_slug_only = san_cw_df[['san_slug']]
cw_df = pd.merge(cmc_cw_df,
                 san_slug_only,
                 how='left',
                 left_on='cmc_slug',
                 right_on='san_slug',
                 validate='one_to_one')


In [12]:
# Split the slug merge result: rows with a san_slug are matched and stay in cw_df,
# the rest go back into cmc_cw_df (without the empty san_slug column) for the next pass
slug_matched = cw_df['san_slug'].notnull()
cmc_cw_df = cw_df[~slug_matched].copy().drop(columns='san_slug').reset_index(drop=True)
cw_df     = cw_df[slug_matched].copy().reset_index(drop=True)


In [13]:
# Bring the remaining santiment columns (name, ticker) onto the slug-matched rows
cw_df = pd.merge(cw_df,
                 san_cw_df,
                 how='inner',
                 on='san_slug',
                 validate='one_to_one')

In [14]:
# Clean up the cmc side to drop duplicated symbols: these cmc slugs are removed by hand
duplicated_cmc_slugs = ['next', 'metacoin', 'bittup', 'bitclave', 'blockcat']
cmc_cw_df = cmc_cw_df[~cmc_cw_df['cmc_slug'].isin(duplicated_cmc_slugs)]


In [15]:
# Merge on cmc data by ticker
# Upper-case both sides so the ticker join is case-insensitive. assign() builds a new
# frame, which avoids writing a column into the boolean-filtered cmc_cw_df.
san_cw_df['san_ticker'] = san_cw_df['san_ticker'].str.upper()
cmc_cw_df = cmc_cw_df.assign(cmc_symbol=cmc_cw_df['cmc_symbol'].str.upper())
cw_df2 = cmc_cw_df.merge(san_cw_df,
                         left_on='cmc_symbol',
                         right_on='san_ticker',
                         how='left',
                         validate='one_to_many')

# Update the dataframes with matched and unmatched
cmc_cw_df = cw_df2[cw_df2.san_ticker.isnull()].copy()
cmc_cw_df = cmc_cw_df[['cmc_symbol', 'cmc_name', 'cmc_slug']].reset_index(drop=True)
cw_df2    = cw_df2[~cw_df2.san_ticker.isnull()].reset_index(drop=True)

# Hand-picked santiment slugs to discard from the ticker matches
cw_df2    = cw_df2[~cw_df2.san_slug.isin(['stox', 'zerobank',
                                          'six-domain-chain', 'sdchain',
                                          'wonderland', 'mint-club', 'public-mint'])]

# Stack the ticker matches under the slug matches. pd.concat replaces
# DataFrame.append, which is deprecated and removed in pandas 2.0.
cw_df     = pd.concat([cw_df, cw_df2])

# Hand-picked cmc slugs to discard from the combined crosswalk
cw_df     = cw_df[~cw_df.cmc_slug.isin(['nas', 'synereo', 'the-dao',
                                        'cos', 'envion', 'cube',
                                        'lina-network', 'compound-coin', 'soda-coin',
                                        'metaverse-dualchain-network-architecture'])]

In [16]:
# Merge on cmc data by name
# Lower-case both name columns so the join is case-insensitive
san_cw_df['san_name'] = san_cw_df['san_name'].str.lower()
cmc_cw_df['cmc_name'] = cmc_cw_df['cmc_name'].str.lower()
cw_df3 = cmc_cw_df.merge(san_cw_df,
                         left_on='cmc_name',
                         right_on='san_name',
                         how='left',
                         validate='one_to_many')

# Update the dataframes
# Rows with no name match remain in cmc_cw_df as the still-unmatched tokens
cmc_cw_df = cw_df3[cw_df3.san_name.isnull()].copy()
cmc_cw_df = cmc_cw_df[['cmc_symbol', 'cmc_name', 'cmc_slug']].reset_index(drop=True)

# Of the name-merge rows, only the one whose cmc slug is 'firo' is added to the crosswalk
cw_df3 = cw_df3[cw_df3.cmc_slug == 'firo'].reset_index(drop=True)

# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0
cw_df  = pd.concat([cw_df, cw_df3])
cw_df = cw_df.reset_index(drop=True)

In [17]:
# Fix one slug that was lower cased: san_slug was lower-cased for the slug merge, but the
# metadata merge joins san_slug against the un-lowered slug column of san_ids_df, so the
# upper-case form is restored here (presumably 'BTRST' is the exact Santiment slug - confirm)
cw_df.loc[cw_df.san_slug == 'btrst', 'san_slug'] = 'BTRST'

In [257]:
# Add back the Santiment metadata (market segment and total supply) by slug,
# then check that the inner join did not lose any crosswalk row
n_b4 = len(cw_df)
san_meta_df = san_ids_df[['marketSegment', 'slug', 'totalSupply']]
cw_df = pd.merge(cw_df,
                 san_meta_df,
                 how='inner',
                 left_on='san_slug',
                 right_on='slug',
                 validate='one_to_one')
assert len(cw_df) == n_b4

883

## (3) Find the available metrics for each asset

In [18]:
# Santiment slugs of every asset in the crosswalk, in crosswalk order
san_slugs_list = cw_df['san_slug'].tolist()

In [19]:
def obtainMetricsForSlug(san_slug):
    """Store the metrics available for one slug in the shared results dict.

    Writes the result of san.available_metrics_for_slug(san_slug) into the
    module-level san_slug_metrics_dict under the key san_slug, then sleeps
    briefly. Nothing is returned; callers read the result from the dict.
    """
    san_slug_metrics_dict[san_slug] = san.available_metrics_for_slug(san_slug)
    # Short pause after each call (presumably to respect the API rate limit - confirm)
    time.sleep(.121)

# Initiate object to store results
manager = multiprocessing.Manager()
san_slug_metrics_dict = manager.dict()
failed_pulls_list = []

# Each slug gets up to three attempts. The entries are the seconds to wait for the
# attempt to finish and the message printed when that attempt has to be killed.
attempt_timeouts = (2, 3, 4)
timeout_messages = ("api call is still running... let's kill it...",
                    'api call failed twice',
                    'api call failed thrice\n')

num_slugs = len(san_slugs_list)
for i, san_slug in enumerate(san_slugs_list):
    print('Completed ' + str(np.round(i/num_slugs * 100, 2)) + '% of the slugs.')
    print(san_slug)

    for timeout, timeout_message in zip(attempt_timeouts, timeout_messages):
        # Initiate the call as a process so that a hanging request can be killed
        p = multiprocessing.Process(target=obtainMetricsForSlug, args=(san_slug, ))
        p.start()

        # Block until the call finishes, but never longer than the timeout
        # NOTE(review): calls now run back to back instead of one per >= 2 seconds;
        # confirm this stays within the API rate limit
        p.join(timeout)

        if p.is_alive():
            # The call hung: kill it and reap the process before retrying
            print(timeout_message)
            p.terminate()
            p.join()
        elif san_slug in san_slug_metrics_dict:
            # The worker stored a result, so this slug is done
            break
        else:
            # The worker exited without storing a result (it raised); retry
            print('api call exited with an error (exit code ' + str(p.exitcode) + ')')
    else:
        # No attempt succeeded: record the slug so the miss is not silent
        failed_pulls_list.append(san_slug)


Completed 0.0% of the slugs.
bitcoin
Completed 0.11% of the slugs.
litecoin
Completed 0.23% of the slugs.
namecoin
Completed 0.34% of the slugs.
peercoin
Completed 0.45% of the slugs.
novacoin
Completed 0.57% of the slugs.
feathercoin
Completed 0.68% of the slugs.
infinitecoin
Completed 0.79% of the slugs.
primecoin
Completed 0.91% of the slugs.
nxt
Completed 1.02% of the slugs.
unobtanium
Completed 1.13% of the slugs.
dogecoin
Completed 1.25% of the slugs.
omni
Completed 1.36% of the slugs.
mooncoin
Completed 1.47% of the slugs.
dimecoin
Completed 1.59% of the slugs.
vertcoin
Completed 1.7% of the slugs.
digibyte
Completed 1.81% of the slugs.
dash
Completed 1.93% of the slugs.
counterparty
Completed 2.04% of the slugs.
dnotes
Completed 2.15% of the slugs.
einsteinium
Completed 2.27% of the slugs.
eccoin
Completed 2.38% of the slugs.
monacoin
Completed 2.49% of the slugs.
faircoin
Completed 2.6% of the slugs.
solarcoin
Completed 2.72% of the slugs.
gulden
Completed 2.83% of the slugs.


Completed 22.88% of the slugs.
etherparty
Completed 22.99% of the slugs.
enjin-coin
Completed 23.1% of the slugs.
power-ledger
Completed 23.22% of the slugs.
revain
Completed 23.33% of the slugs.
electroneum
Completed 23.44% of the slugs.
minexcoin
Completed 23.56% of the slugs.
shield-xsh
Completed 23.67% of the slugs.
aeron
Completed 23.78% of the slugs.
raiden-network-token
Completed 23.9% of the slugs.
delphy
Completed 24.01% of the slugs.
decent-bet
Completed 24.12% of the slugs.
genesis-vision
Completed 24.24% of the slugs.
encrypgen
Completed 24.35% of the slugs.
ink
Completed 24.46% of the slugs.
bodhi
Completed 24.58% of the slugs.
quantstamp
Completed 24.69% of the slugs.
qash
Completed 24.8% of the slugs.
spankchain
Completed 24.92% of the slugs.
bitcoin-diamond
Completed 25.03% of the slugs.
blockv
Completed 25.14% of the slugs.
time-new-bank
Completed 25.25% of the slugs.
dragonchain
Completed 25.37% of the slugs.
presearch
Completed 25.48% of the slugs.
cybermiles
Complet

Completed 45.19% of the slugs.
mvl
Completed 45.3% of the slugs.
arbitrage
Completed 45.41% of the slugs.
stasis-euro
Completed 45.53% of the slugs.
nix
Completed 45.64% of the slugs.
vethor-token
Completed 45.75% of the slugs.
bhp-coin
Completed 45.87% of the slugs.
zel
Completed 45.98% of the slugs.
you-coin
Completed 46.09% of the slugs.
bitcapitalvendor
Completed 46.21% of the slugs.
litex
Completed 46.32% of the slugs.
vechain
Completed 46.43% of the slugs.
proximax
Completed 46.55% of the slugs.
dxchain-token
Completed 46.66% of the slugs.
ubex
Completed 46.77% of the slugs.
hycon
api call is still running... let's kill it...
Completed 46.89% of the slugs.
davinci-coin
Completed 47.0% of the slugs.
quant
Completed 47.11% of the slugs.
pumapay
Completed 47.23% of the slugs.
maro
Completed 47.34% of the slugs.
nasdacoin
Completed 47.45% of the slugs.
ontology-gas
Completed 47.57% of the slugs.
energi
Completed 47.68% of the slugs.
timicoin
Completed 47.79% of the slugs.
amo-coin
Co

Process Process-525:
Traceback (most recent call last):
  File "/home/baybutt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 699, in urlopen
    httplib_response = self._make_request(
  File "/home/baybutt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 382, in _make_request
    self._validate_conn(conn)
  File "/home/baybutt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 1010, in _validate_conn
    conn.connect()
  File "/home/baybutt/anaconda3/lib/python3.8/site-packages/urllib3/connection.py", line 411, in connect
    self.sock = ssl_wrap_socket(
  File "/home/baybutt/anaconda3/lib/python3.8/site-packages/urllib3/util/ssl_.py", line 428, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(
  File "/home/baybutt/anaconda3/lib/python3.8/site-packages/urllib3/util/ssl_.py", line 472, in _ssl_wrap_socket_impl
    return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
  File "/home/baybutt/anaconda3

Completed 57.76% of the slugs.
algorand
Completed 57.87% of the slugs.
contentos
Completed 57.98% of the slugs.
moviebloc
Completed 58.1% of the slugs.
arpa-chain
Completed 58.21% of the slugs.
mx-token
Completed 58.32% of the slugs.
ampleforth
Completed 58.44% of the slugs.
chiliz
Completed 58.55% of the slugs.
silverway
Completed 58.66% of the slugs.
wirex-token
Completed 58.78% of the slugs.
dusk-network
Completed 58.89% of the slugs.
coinmetro-token
Completed 59.0% of the slugs.
prometeus
Completed 59.12% of the slugs.
sapphire
Completed 59.23% of the slugs.
akropolis
Completed 59.34% of the slugs.
thorchain
Completed 59.46% of the slugs.
realio-network
Completed 59.57% of the slugs.
ultra
Completed 59.68% of the slugs.
ftx-token
Completed 59.8% of the slugs.
sharetoken
Completed 59.91% of the slugs.
chainx
Completed 60.02% of the slugs.
wink
Completed 60.14% of the slugs.
mb8-coin
Completed 60.25% of the slugs.
bosagora
Completed 60.36% of the slugs.
klaytn
Completed 60.48% of the

Completed 79.61% of the slugs.
wemix
Completed 79.73% of the slugs.
dvision-network
Completed 79.84% of the slugs.
saffron-finance
Completed 79.95% of the slugs.
oasis-network
Completed 80.07% of the slugs.
redfox-labs
Completed 80.18% of the slugs.
radix
Completed 80.29% of the slugs.
truefi-token
Completed 80.41% of the slugs.
bifrost
Completed 80.52% of the slugs.
vai
Completed 80.63% of the slugs.
mirror-protocol
Completed 80.75% of the slugs.
badger-dao
Completed 80.86% of the slugs.
mobilecoin
Completed 80.97% of the slugs.
efforce
Completed 81.09% of the slugs.
bonfida
Completed 81.2% of the slugs.
biopassport-token
Completed 81.31% of the slugs.
rally
Completed 81.43% of the slugs.
steth
Completed 81.54% of the slugs.
1inch
Completed 81.65% of the slugs.
safepal
api call is still running... let's kill it...
Completed 81.77% of the slugs.
shopping
Completed 81.88% of the slugs.
nftx
Completed 81.99% of the slugs.
zkswap
Completed 82.11% of the slugs.
quickswap
Completed 82.22% o

In [20]:
# Gather the metrics reported for every slug into one flat list ...
san_metric_universe = [metric_name
                       for san_slug_key in san_slug_metrics_dict.keys()
                       for metric_name in san_slug_metrics_dict[san_slug_key]]

# ... and reduce it to the distinct metric names (np.unique returns them sorted)
distinct_metrics = np.unique(np.array(san_metric_universe))
san_metric_universe = list(distinct_metrics)

In [21]:
# Drop anything with 5m (minute) or 1h (hour) in the name confirming the drops
# Hourly metrics: print each name as it is removed from the universe
metrics_1h_to_drop = list(filter(lambda name: '1h' in name, san_metric_universe))
for hourly_metric in metrics_1h_to_drop:
    print(hourly_metric)
    san_metric_universe.remove(hourly_metric)

# Five-minute metrics: same treatment, computed on the already-reduced universe
metrics_5m_to_drop = list(filter(lambda name: '5m' in name, san_metric_universe))
for five_minute_metric in metrics_5m_to_drop:
    print(five_minute_metric)
    san_metric_universe.remove(five_minute_metric)

active_addresses_1h
price_usd_change_1h
social_dominance_bitcointalk_1h_moving_average
social_dominance_reddit_1h_moving_average
social_dominance_telegram_1h_moving_average
social_dominance_total_1h_moving_average
social_dominance_total_1h_moving_average_change_1d
social_dominance_total_1h_moving_average_change_30d
social_dominance_total_1h_moving_average_change_7d
social_dominance_twitter_1h_moving_average
unique_social_volume_total_1h
active_deposits_5m
active_withdrawals_5m
average_fees_usd_5m
average_transfer_5m
deposit_transactions_5m
median_fees_usd_5m
median_transfer_5m
nvt_5min
price_usd_5m
unique_social_volume_total_5m
withdrawal_transactions_5m


In [22]:
# Go thru the metrics to drop ones that I manually just don't want
metrics_to_drop = ["30d_moving_avg_dev_activity_change_1d",
                   "active_addresses_24h_change_1d",
                   "active_addresses_24h_change_30d",
                   "active_addresses_24h_change_7d",
                   "bitmex_perpetual_funding_rate_change_1d",
                   "bitmex_perpetual_funding_rate_change_30d",
                   "bitmex_perpetual_funding_rate_change_7d",
                   "cexes_to_defi_flow_change_1d",
                   "cexes_to_defi_flow_change_30d",
                   "cexes_to_defi_flow_change_7d",
                   "cexes_to_dex_flow_change_30d",
                   "cexes_to_dex_traders_flow_change_1d",
                   "cexes_to_dex_traders_flow_change_30d",
                   "cexes_to_dex_traders_flow_change_7d",
                   "cexes_to_other_flow_change_1d",
                   "cexes_to_other_flow_change_30d",
                   "cexes_to_other_flow_change_7d",
                   "cexes_to_whale_flow_change_1d",
                   "cexes_to_whale_flow_change_30d",
                   "cexes_to_whale_flow_change_7d",
                   "circulation_180d_change_1d",
                   "circulation_180d_change_30d",
                   "circulation_180d_change_7d",
                   "circulation_change_1d",
                   "circulation_change_30d",
                   "circulation_change_7d",
                   "circulation_usd_180d_change_1d",
                   "circulation_usd_180d_change_30d",
                   "circulation_usd_180d_change_7d",
                   "daily_closing_marketcap_usd",
                   "daily_closing_price_usd",
                   "daily_high_price_usd",
                   "daily_low_price_usd",
                   "daily_opening_price_usd",
                   "defi_to_cexes_flow_change_1d",
                   "defi_to_cexes_flow_change_30d",
                   "defi_to_cexes_flow_change_7d",
                   "defi_to_dex_traders_flow_change_1d",
                   "defi_to_dex_traders_flow_change_30d",
                   "defi_to_dex_traders_flow_change_7d",
                   "defi_to_dexes_flow_change_1d",
                   "defi_to_dexes_flow_change_30d",
                   "defi_to_dexes_flow_change_7d",
                   "defi_to_other_flow_change_1d",
                   "defi_to_other_flow_change_30d",
                   "defi_to_other_flow_change_7d",
                   "defi_to_whale_flow_change_1d",
                   "defi_to_whale_flow_change_30d",
                   "defi_to_whale_flow_change_7d",
                   "dev_activity_change_1d",
                   "dev_activity_change_30d",
                   "dev_activity_change_7d",
                   'dex_to_cexes_flow_change_30d',
                   'dex_to_cexes_flow_change_7d',
                   "dex_traders_to_cexes_flow_change_1d",
                   "dex_traders_to_cexes_flow_change_30d",
                   "dex_traders_to_cexes_flow_change_7d",
                   "dex_traders_to_defi_flow_change_1d",
                   "dex_traders_to_defi_flow_change_30d",
                   "dex_traders_to_defi_flow_change_7d",
                   "dex_traders_to_dexes_flow_change_1d",
                   "dex_traders_to_dexes_flow_change_30d",
                   "dex_traders_to_dexes_flow_change_7d",
                   "dex_traders_to_other_flow_change_1d",
                   "dex_traders_to_other_flow_change_30d",
                   "dex_traders_to_other_flow_change_7d",
                   "dex_traders_to_whale_flow_change_1d",
                   "dex_traders_to_whale_flow_change_30d",
                   "dex_traders_to_whale_flow_change_7d",
                   "dexes_to_defi_flow_change_1d",
                   "dexes_to_defi_flow_change_30d",
                   "dexes_to_defi_flow_change_7d",
                   "dexes_to_dex_traders_flow_change_1d",
                   "dexes_to_dex_traders_flow_change_30d",
                   "dexes_to_dex_traders_flow_change_7d",
                   "dexes_to_other_flow_change_1d",
                   "dexes_to_other_flow_change_30d",
                   "dexes_to_other_flow_change_7d",
                   "dexes_to_whale_flow_change_1d",
                   "dexes_to_whale_flow_change_30d",
                   "dexes_to_whale_flow_change_7d",
                   "dormant_circulation_365d_change_1d",
                   "dormant_circulation_365d_change_30d",
                   "dormant_circulation_365d_change_7d",
                   "dormant_circulation_usd_180d_change_1d",
                   "dormant_circulation_usd_180d_change_30d",
                   "dormant_circulation_usd_180d_change_7d",
                   "exchange_balance_change_1d",
                   "exchange_balance_change_30d",
                   "exchange_balance_change_7d",
                   "exchange_inflow_change_1d",
                   "exchange_inflow_change_30d",
                   "exchange_inflow_change_7d",
                   "exchange_inflow_usd_change_1d",
                   "exchange_inflow_usd_change_30d",
                   "exchange_inflow_usd_change_7d",
                   "exchange_outflow_change_1d",
                   "exchange_outflow_change_30d",
                   "exchange_outflow_change_7d",
                   "exchange_outflow_usd_change_1d",
                   "exchange_outflow_usd_change_30d",
                   "exchange_outflow_usd_change_7d",
                   'historical_balance',
                   'historical_balance_changes',
                   "marketcap_usd_change_1d",
                   "marketcap_usd_change_30d",
                   "marketcap_usd_change_7d",
                   "mean_dollar_invested_age_change_1d",
                   "mean_dollar_invested_age_change_30d",
                   "mean_dollar_invested_age_change_7d",
                   "mvrv_usd_180d_change_1d",
                   "mvrv_usd_180d_change_30d",
                   "mvrv_usd_180d_change_7d",
                   "mvrv_usd_30d_change_1d",
                   "mvrv_usd_30d_change_30d",
                   "mvrv_usd_30d_change_7d",
                   "mvrv_usd_365d_change_1d",
                   "mvrv_usd_365d_change_30d",
                   "mvrv_usd_365d_change_7d",
                   "mvrv_usd_change_1d",
                   "mvrv_usd_change_30d",
                   "mvrv_usd_change_7d",
                   "network_profit_loss_change_1d",
                   "network_profit_loss_change_30d",
                   "network_profit_loss_change_7d",
                   "other_to_cexes_flow_change_1d",
                   "other_to_cexes_flow_change_30d",
                   "other_to_cexes_flow_change_7d",
                   "other_to_defi_flow_change_1d",
                   "other_to_defi_flow_change_30d",
                   "other_to_defi_flow_change_7d",
                   "other_to_dexes_flow_change_1d",
                   "other_to_dexes_flow_change_30d",
                   "other_to_dexes_flow_change_7d",
                   "other_to_traders_flow_change_1d",
                   "other_to_traders_flow_change_30d",
                   "other_to_traders_flow_change_7d",
                   "other_to_whale_flow_change_1d",
                   "other_to_whale_flow_change_30d",
                   "other_to_whale_flow_change_7d",
                   "price_btc_change_1d",
                   "price_btc_change_30d",
                   "price_btc_change_7d",
                   "price_eth_change_1d",
                   "price_eth_change_30d",
                   "price_eth_change_7d",
                   "price_usd_change_1d",
                   "price_usd_change_30d",
                   "price_usd_change_7d",
                   "sentiment_balance_total_change_1d",
                   "sentiment_balance_total_change_30d",
                   "sentiment_balance_total_change_7d",
                   "sentiment_volume_consumed_total_change_1d",
                   "sentiment_volume_consumed_total_change_30d",
                   "sentiment_volume_consumed_total_change_7d",
                   'social_active_users',
                   "social_dominance_bitcointalk_24h_moving_average",
                   "social_dominance_reddit_24h_moving_average",
                   "social_dominance_telegram_24h_moving_average",
                   "social_dominance_total_24h_moving_average",
                   "social_dominance_total_24h_moving_average_change_1d",
                   "social_dominance_total_24h_moving_average_change_30d",
                   "social_dominance_total_24h_moving_average_change_7d",
                   "social_dominance_total_change_1d",
                   "social_dominance_total_change_30d",
                   "social_dominance_total_change_7d",
                   "social_dominance_twitter_24h_moving_average",
                   "social_volume_total_change_1d",
                   "social_volume_total_change_30d",
                   "social_volume_total_change_7d",
                   "traders_to_other_flow_change_1d",
                   "traders_to_other_flow_change_30d",
                   "traders_to_other_flow_change_7d",
                   "traders_to_whale_flow_change_1d",
                   "traders_to_whale_flow_change_30d",
                   "traders_to_whale_flow_change_7d",
                   "transaction_volume_change_1d",
                   "transaction_volume_change_30d",
                   "transaction_volume_change_7d",
                   "transaction_volume_usd_change_1d",
                   "transaction_volume_usd_change_30d",
                   "transaction_volume_usd_change_7d",
                   "volume_usd_change_1d",
                   "volume_usd_change_30d",
                   "volume_usd_change_7d",
                   "whale_to_cexes_flow_change_1d",
                   "whale_to_cexes_flow_change_30d",
                   "whale_to_cexes_flow_change_7d",
                   "whale_to_defi_flow_change_1d",
                   "whale_to_defi_flow_change_30d",
                   "whale_to_defi_flow_change_7d",
                   "whale_to_dex_traders_flow_change_1d",
                   "whale_to_dex_traders_flow_change_30d",
                   "whale_to_dex_traders_flow_change_7d",
                   "whale_to_dexes_flow_change_1d",
                   "whale_to_dexes_flow_change_30d",
                   "whale_to_dexes_flow_change_7d",
                   "whale_to_other_flow_change_1d",
                   "whale_to_other_flow_change_30d",
                   "whale_to_other_flow_change_7d",
                   "whale_to_traders_flow_change_1d",
                   "whale_to_traders_flow_change_30d",
                   "whale_to_traders_flow_change_7d",
                   "whale_transaction_count_100k_usd_to_inf_change_1d",
                   "whale_transaction_count_100k_usd_to_inf_change_30d",
                   "whale_transaction_count_100k_usd_to_inf_change_7d",
                   "whale_transaction_count_1m_usd_to_inf_change_1d",
                   "whale_transaction_count_1m_usd_to_inf_change_30d",
                   "whale_transaction_count_1m_usd_to_inf_change_7d",
                   'withdrawal_balance']

# Filter the unwanted metrics out of the universe in one pass. Unlike list.remove(),
# this does not raise ValueError when a listed metric is absent, so the cell can be
# re-run and cannot stop half way with the list partially edited.
unwanted_metrics = set(metrics_to_drop)
absent_metrics = unwanted_metrics.difference(san_metric_universe)
if absent_metrics:
    print(str(len(absent_metrics)) + ' of the metrics to drop were not in the universe.')

# Slice assignment keeps san_metric_universe the same list object, as remove() did
san_metric_universe[:] = [metric for metric in san_metric_universe
                          if metric not in unwanted_metrics]

In [23]:
# Create a data frame of how many assets are available for each metric

# Snapshot the per-slug metric lists once as sets. Indexing san_slug_metrics_dict
# inside the nested loop would repeat the lookup (and a linear list search) for
# every metric/asset pair.
metrics_by_asset = {asset: set(asset_metrics)
                    for asset, asset_metrics in san_slug_metrics_dict.copy().items()}

assets_per_metric_dict = {'metric': [],
                          'num_assets': []}

for metric in san_metric_universe:
    # Number of assets whose metric set contains this metric
    num_assets = sum(1 for asset_metrics in metrics_by_asset.values()
                     if metric in asset_metrics)

    # Update result dictionary with number of assets that have the metric
    assets_per_metric_dict['metric'].append(metric)
    assets_per_metric_dict['num_assets'].append(num_assets)

# Clean up into a df, widest coverage first
assets_per_metric_df = pd.DataFrame(assets_per_metric_dict)
assets_per_metric_df = assets_per_metric_df.sort_values(by='num_assets', ascending=False).reset_index(drop=True)


In [25]:
# Drop the lower metrics
# assets_per_metric_df is sorted by descending asset count, so keeping rows
# 0..index_to_keep_to keeps the metrics covered by the most assets.
# The cutoff position is a hand-picked constant.
index_to_keep_to = 366
# Guard for the hard-coded cutoff: fail if the row at that position is not 'stock_to_flow'
assert assets_per_metric_df.metric.values[index_to_keep_to] == 'stock_to_flow'
timeseries_metrics = list(assets_per_metric_df[:(index_to_keep_to+1)].metric.values)
# age_distribution is taken out of the timeseries list and tracked as a histogram metric
timeseries_metrics.remove('age_distribution')
histogram_metrics = ['age_distribution']


In [41]:
def obtainSlugsForMetric(metric):
    """Store the slugs for which a metric is available in the shared dict.

    Calls san.metadata(metric, arr=['availableSlugs']) and writes its
    'availableSlugs' entry into the module-level san_slugs_for_metric_dict
    under the key 'san_slugs', then sleeps. Nothing is returned; callers
    read the result from the dict.
    """
    san_slugs_for_metric_dict['san_slugs'] = san.metadata(metric, arr=['availableSlugs'])['availableSlugs']
    # Pause after the call (presumably to respect the API rate limit - confirm)
    time.sleep(1.21)

def obtainStartTimeforMetric(metric, san_slug):
    """Store the time since which a metric is available for a slug.

    Writes the result of san.available_metric_for_slug_since(metric=metric,
    slug=san_slug) into the module-level time_start_dict under the key
    san_slug, then sleeps. Nothing is returned; callers read the result
    from the dict.
    """
    time_start_dict[san_slug] = san.available_metric_for_slug_since(metric=metric, slug=san_slug)
    # Pause after the call (presumably to respect the API rate limit - confirm)
    time.sleep(1.21)

# Drop metrics that are not available early enough

# Seconds to wait for the first and the second attempt of every API call
_CALL_TIMEOUTS = (2, 3)


def _run_api_call_with_retry(target, args):
    """Run target(*args) in a child process, retrying once if it hangs or crashes.

    Each attempt is given its timeout from _CALL_TIMEOUTS. A hung attempt is
    terminated and reaped. Returns True as soon as an attempt exits cleanly
    within its timeout, and False when every attempt hung or exited with an error.
    """
    last_attempt = len(_CALL_TIMEOUTS)
    for attempt, timeout in enumerate(_CALL_TIMEOUTS, start=1):
        p = multiprocessing.Process(target=target, args=args)
        p.start()

        # Wait only as long as the call needs, up to the timeout
        p.join(timeout)
        if p.is_alive():
            if attempt < last_attempt:
                print("api call is still running... let's kill it...")
            p.terminate()
            p.join()
        elif p.exitcode == 0:
            return True
        else:
            # The worker raised; its result was never stored
            print('api call exited with an error (exit code ' + str(p.exitcode) + ')')

    print('api call failed twice\n')
    return False


valid_timeseries_metrics     = []
failed_pulls_list            = []
failed_pulls_time_start_list = []

# Slugs tried, in this order, as the reference asset for a metric's start date
reference_slugs = ('bitcoin', 'ethereum', 'cardano', 'decentraland')

# One manager serves every shared dict below instead of starting two per metric
manager = multiprocessing.Manager()

num_metrics = len(timeseries_metrics)
for i, metric in enumerate(timeseries_metrics):
    print(metric)
    print('Completed ' + str(np.round(i/num_metrics * 100, 2)) + '% of the metrics.')

    # Obtain a slug that is available for the metric
    san_slugs_for_metric_dict = manager.dict()
    if not _run_api_call_with_retry(obtainSlugsForMetric, (metric, )):
        failed_pulls_list.append(metric)
        continue

    san_slugs_for_metric = san_slugs_for_metric_dict['san_slugs']
    san_slug = next((slug for slug in reference_slugs if slug in san_slugs_for_metric), None)
    if san_slug is None:
        # None of the reference assets carries this metric: show what is available and stop
        print(san_slugs_for_metric)
        raise AssertionError('No reference slug is available for ' + metric + '.')

    # Obtain the time the metric starts
    time_start_dict = manager.dict()
    if not _run_api_call_with_retry(obtainStartTimeforMetric, (metric, san_slug)):
        failed_pulls_time_start_list.append(metric)
        continue

    time_start = dt.datetime.strptime(time_start_dict[san_slug], '%Y-%m-%dT%H:%M:%SZ')

    # Keep only the metrics whose history starts in 2017 or earlier
    if time_start.year >= 2018:
        print('We are losing ' + metric + ' given it starts in ' + str(time_start.year) + '.')
    else:
        valid_timeseries_metrics.append(metric)

    time.sleep(2)
    print('\n')

print(failed_pulls_list)
print(failed_pulls_time_start_list)

ftx_perpetual_funding_rate
Completed 95.9% of the metrics.
We are losing ftx_perpetual_funding_rate given it starts in 2019.


ftx_perpetual_open_interest
Completed 96.17% of the metrics.
api call is still running... let's kill it...
api call is still running... let's kill it...
We are losing ftx_perpetual_open_interest given it starts in 2022.


traders_defi_balance
Completed 96.45% of the metrics.
api call is still running... let's kill it...


usdt_binance_funding_rate
Completed 96.72% of the metrics.
api call is still running... let's kill it...
We are losing usdt_binance_funding_rate given it starts in 2019.


usdt_bnb_funding_rates
Completed 96.99% of the metrics.
api call is still running... let's kill it...
We are losing usdt_bnb_funding_rates given it starts in 2019.


usdt_binance_open_value
Completed 97.27% of the metrics.
api call is still running... let's kill it...
api call is still running... let's kill it...
We are losing usdt_binance_open_value given it starts in 2022.

In [44]:
# Clean it up: keep each surviving metric name once (np.unique also sorts them)
distinct_valid_metrics = np.unique(np.array(valid_timeseries_metrics))
valid_timeseries_metrics = list(distinct_valid_metrics)

In [None]:
# Function to obtain timeseries data
def obtainTimeseriesData(metric, san_slug):
    """Pull one daily santiment series and publish it through `metric_dict`.

    Used as the target of a `multiprocessing.Process`, so the result is handed
    back through the module-level `metric_dict` (a manager dict the caller
    creates right before starting the process) rather than a return value.

    Args:
        metric: santiment metric name.
        san_slug: santiment asset slug.

    When the API returns an empty frame, `metric_dict` is left untouched.
    """
    query = metric + '/' + san_slug
    pulled_df = san.get(query,
                        from_date="2015-01-01",
                        to_date="2022-01-07",
                        interval="1d")
    time.sleep(.121)

    # Nothing to publish for an empty response
    if pulled_df.empty:
        return

    pulled_df['san_slug'] = san_slug
    metric_dict['date'] = pulled_df.index.values
    metric_dict['san_slug'] = pulled_df['san_slug'].values
    metric_dict[metric] = pulled_df['value'].values

# Initialize dataframe to store all results
ts_df = pd.DataFrame(data={'date': [],
                           'san_slug': []})

# Initialize dictionary to keep track of the data we missed:
# metric name -> list of slugs whose pull timed out or came back empty
failed_pulls_dict = {}

# Every api call gets up to three attempts. Each entry is
# (seconds to wait before checking on the call,
#  message printed when the call is still running then and gets killed).
pull_attempts = [(3, "api call is still running... let's kill it..."),
                 (4, 'api call failed twice'),
                 (5, 'api call failed thrice\n')]

# Loop over all the metrics to pull the data
# (valid_timeseries_metrics[i:] skips the first i metrics; i = 0 skips none)
i = 0
num_metrics = len(valid_timeseries_metrics)
for metric in valid_timeseries_metrics[i:]:
    print(metric)
    print('Completed ' + str(np.round(i/num_metrics * 100, 2)) + '% of the metrics.\n')
    i += 1

    # One dataframe per successfully pulled slug; concatenated once per metric
    # instead of growing a dataframe row block by row block
    slug_dfs = []
    failed_pulls_dict[metric] = []

    # Obtain the list of slugs that have this metric
    san_slugs = [slug for slug in san_slug_metrics_dict
                 if metric in san_slug_metrics_dict[slug]]

    # Drop san slugs that are not in our universe
    san_slugs = list(set(san_slugs).intersection(set(san_slugs_list)))

    # Loop over all the slugs to pull the data
    j = 0
    num_slugs = len(san_slugs)
    for san_slug in san_slugs:
        print('Completed ' + str(np.round(j/num_slugs * 100, 2)) + '% of the slugs for this metric.')
        j += 1

        # Run the call in a separate process so a hanging request can be killed
        pull_finished = False
        for wait_secs, still_running_msg in pull_attempts:
            # Fresh manager dict per attempt: obtainTimeseriesData writes its
            # result into the module-level name `metric_dict`
            manager = multiprocessing.Manager()
            metric_dict = manager.dict()
            p = multiprocessing.Process(target=obtainTimeseriesData, args=(metric, san_slug))
            p.start()

            time.sleep(wait_secs)
            if not p.is_alive():
                pull_finished = True
                break

            # Still running: kill it and reap the process before retrying
            print(still_running_msg)
            p.terminate()
            p.join()

        if not pull_finished:
            failed_pulls_dict[metric].append(san_slug)
            continue

        # Cleanup
        p.join()

        # Keep results for this slug and metric only if the pull was successful
        if len(metric_dict.keys()) >= 1:
            slug_dfs.append(pd.DataFrame(data={'date': metric_dict['date'],
                                               'san_slug': metric_dict['san_slug'],
                                               metric: metric_dict[metric]}))
        else:
            failed_pulls_dict[metric].append(san_slug)
            print('For '+san_slug+' pull of '+metric+', we received no data so it is skipped.\n')

    # Without any data there are no key columns to merge on, so skip the merge
    if len(slug_dfs) == 0:
        print('No data was pulled for ' + metric + ', so nothing is merged.')
        print('\n\n\n')
        continue

    metric_df = pd.concat(slug_dfs)
    assert(0==metric_df[metric_df.duplicated(subset=['date', 'san_slug'])].shape[0]),('duped rows')

    # Merge each metrics dataframe onto the master df
    ts_df = ts_df.merge(metric_df,
                        on=['date', 'san_slug'],
                        how='outer',
                        validate='one_to_one')

    # Space out prints
    print('\n\n\n')

In [99]:
# Keep the first row of every (date, san_slug) pair, then order the panel by those keys
ts_df = (ts_df
         .drop_duplicates(subset=['date', 'san_slug'], keep='first')
         .sort_values(by=['date', 'san_slug']))


In [126]:
# Build dictionary of macro metrics and corresponding slugs to pull data for

# Metric pulled for a hand-picked list of slugs
macro_metric_slugs_dict = {
    'mcd_collat_ratio': ["wrapped-bitcoin", "gemini-dollar",
                         "usd-coin", "paxos-standard",
                         "decentraland", "trueusd",
                         "yearn-finance", "chainlink",
                         "weth", "balancer"],
}

# Metrics pulled for ethereum only (each metric gets its own list object)
macro_metric_slugs_dict.update({
    ethereum_metric: ['ethereum']
    for ethereum_metric in ('defi_total_value_locked_usd',
                            'nft_trade_volume_usd',
                            'nft_trades_count',
                            'nft_retail_trade_volume_usd',
                            'nft_whale_trade_volume_usd',
                            'nft_whale_trades_count',
                            'percent_of_whale_stablecoin_total_supply',
                            'average_fees_usd',
                            'fees_usd',
                            'eth2_roi',
                            'median_fees_usd')
})

# Metrics pulled for both ethereum and bitcoin
macro_metric_slugs_dict.update({
    miner_metric: ['ethereum', 'bitcoin']
    for miner_metric in ('miners_to_exchanges_flow',
                         'miners_exchange_balance')
})

# Macro metrics whose available slugs still have to be looked up
macro_metrics = [
    'defi_to_dexes_flow',
    'defi_dex_balance',
    'cexes_to_dex_flow',
    'dexes_to_defi_flow',
    'defi_to_cexes_flow',
    'defi_to_exchanges_flow',
    'exchanges_to_defi_flow',
    'whale_to_defi_flow',
    'dex_traders_to_defi_flow',
    'whale_defi_balance',
    'mvrv_usd_intraday',
]

def obtainSlugsForMetric(metric):
    """Look up the slugs santiment lists for `metric`.

    Used as a `multiprocessing.Process` target: the list is stored under the
    'san_slugs' key of the module-level `san_slugs_for_metric_dict` instead of
    being returned.

    Args:
        metric: santiment metric name.
    """
    metric_metadata = san.metadata(metric, arr=['availableSlugs'])
    san_slugs_for_metric_dict['san_slugs'] = metric_metadata['availableSlugs']
    time.sleep(1.21)

def obtainStartTimeforMetric(metric, san_slug):
    """Look up since when `metric` is available for `san_slug`.

    Used as a `multiprocessing.Process` target: the value returned by the api
    is stored in the module-level `time_start_dict` under the slug instead of
    being returned.

    Args:
        metric: santiment metric name.
        san_slug: santiment asset slug.
    """
    available_since = san.available_metric_for_slug_since(metric=metric, slug=san_slug)
    time_start_dict[san_slug] = available_since
    time.sleep(1.21)

# Drop metrics that are not available early enough
failed_pulls_list_macro            = []   # metrics whose available-slugs lookup failed
failed_pulls_time_start_list_macro = []   # metrics whose start-date lookup failed
i=0
num_metrics = len(macro_metrics)
for metric in macro_metrics[i:]:
    print(metric)
    print('Completed ' + str(np.round(i/num_metrics * 100, 2)) + '% of the metrics.')
    i += 1

    # Obtain the slugs that are available for the metric.
    # The call runs in its own process so that a hanging request can be killed;
    # obtainSlugsForMetric writes into the module-level san_slugs_for_metric_dict.
    manager = multiprocessing.Manager()
    san_slugs_for_metric_dict = manager.dict()
    p = multiprocessing.Process(target=obtainSlugsForMetric, args=(metric, ))
    p.start()

    # if the process is still active after the wait, kill it and retry once
    time.sleep(2)
    if p.is_alive():
        print("api call is still running... let's kill it...")
        p.terminate()
        p = multiprocessing.Process(target=obtainSlugsForMetric, args=(metric, ))
        p.start()
        time.sleep(3)
        if p.is_alive():
            print('api call failed twice\n')
            p.terminate()
            # Record the failure in the macro list that this cell prints below
            failed_pulls_list_macro.append(metric)
            continue

    p.join()

    # The process can exit without storing a result (e.g. the call raised
    # inside the child); treat that as a failed pull instead of a KeyError
    san_slugs_for_metric = san_slugs_for_metric_dict.get('san_slugs')
    if san_slugs_for_metric is None:
        print('api call returned no slugs\n')
        failed_pulls_list_macro.append(metric)
        continue

    # Pick the first preferred slug that is actually available for the metric;
    # it is used to probe the date the metric starts.
    # NOTE(review): this branch used to test for 'tether' but then probe with
    # 'decentraland'; it now probes with the slug that was checked -- confirm intent.
    san_slug = None
    for candidate_slug in ('bitcoin', 'ethereum', 'cardano', 'tether'):
        if candidate_slug in san_slugs_for_metric:
            san_slug = candidate_slug
            break
    if san_slug is None:
        print(san_slugs_for_metric)
        assert (1==0), ('no probe slug available for ' + metric)

    # Obtain the time the metric starts
    manager = multiprocessing.Manager()
    time_start_dict = manager.dict()
    p = multiprocessing.Process(target=obtainStartTimeforMetric, args=(metric, san_slug))
    p.start()

    # if the process is still active after the wait, kill it and retry once
    time.sleep(2)
    if p.is_alive():
        print("api call is still running... let's kill it...")
        p.terminate()
        p = multiprocessing.Process(target=obtainStartTimeforMetric, args=(metric, san_slug))
        p.start()
        time.sleep(3)
        if p.is_alive():
            print('api call failed twice\n')
            p.terminate()
            # Record the failure in the macro list that this cell prints below
            failed_pulls_time_start_list_macro.append(metric)
            continue

    p.join()

    # Same guard as above: no stored start time means the lookup failed
    time_start_str = time_start_dict.get(san_slug)
    if time_start_str is None:
        print('api call returned no start time\n')
        failed_pulls_time_start_list_macro.append(metric)
        continue
    time_start = dt.datetime.strptime(time_start_str, '%Y-%m-%dT%H:%M:%SZ')

    # Keep the metric (with every slug it is available for) only if it starts
    # in 2017 or earlier
    if time_start.year >= 2018:
        print('We are losing ' + metric + ' given it starts in ' + str(time_start.year) + '.')
    else:
        macro_metric_slugs_dict[metric] = san_slugs_for_metric

    time.sleep(2)
    print('\n')

print(failed_pulls_list_macro)
print(failed_pulls_time_start_list_macro)


defi_to_dexes_flow
Completed 0.0% of the metrics.
api call is still running... let's kill it...
api call is still running... let's kill it...
We are losing defi_to_dexes_flow given it starts in 2018.


defi_dex_balance
Completed 9.09% of the metrics.
api call is still running... let's kill it...
api call is still running... let's kill it...
We are losing defi_dex_balance given it starts in 2018.


cexes_to_dex_flow
Completed 18.18% of the metrics.


dexes_to_defi_flow
Completed 27.27% of the metrics.
api call is still running... let's kill it...
api call is still running... let's kill it...
We are losing dexes_to_defi_flow given it starts in 2018.


defi_to_cexes_flow
Completed 36.36% of the metrics.
api call is still running... let's kill it...
We are losing defi_to_cexes_flow given it starts in 2018.


defi_to_exchanges_flow
Completed 45.45% of the metrics.
api call is still running... let's kill it...
api call is still running... let's kill it...
We are losing defi_to_exchanges_flow

In [128]:
# Obtain the macro timeseries data
def obtainTimeseriesData(metric, san_slug):
    """Pull one daily santiment series and publish it through `metric_dict`.

    Used as the target of a `multiprocessing.Process`, so the result is handed
    back through the module-level `metric_dict` (a manager dict the caller
    creates right before starting the process) rather than a return value.

    Args:
        metric: santiment metric name.
        san_slug: santiment asset slug.

    When the API returns an empty frame, `metric_dict` is left untouched.
    """
    query = metric + '/' + san_slug
    pulled_df = san.get(query,
                        from_date="2015-01-01",
                        to_date="2022-01-07",
                        interval="1d")
    time.sleep(.121)

    # Nothing to publish for an empty response
    if pulled_df.empty:
        return

    pulled_df['san_slug'] = san_slug
    metric_dict['date'] = pulled_df.index.values
    metric_dict['san_slug'] = pulled_df['san_slug'].values
    metric_dict[metric] = pulled_df['value'].values

# Initialize dataframe to store all results
macro_df = pd.DataFrame(data={'date': [],
                              'san_slug': []})

# Initialize dictionary to keep track of the data we missed:
# metric name -> list of slugs whose pull timed out or came back empty
failed_pulls_dict = {}

# Every api call gets up to three attempts. Each entry is
# (seconds to wait before checking on the call,
#  message printed when the call is still running then and gets killed).
pull_attempts = [(3, "api call is still running... let's kill it..."),
                 (4, 'api call failed twice'),
                 (5, 'api call failed thrice\n')]

# Loop over all the metrics to pull the data
# (the [i:] slice skips the first i metrics; i = 0 skips none)
i = 0
num_metrics = len(list(macro_metric_slugs_dict.keys()))
for metric in list(macro_metric_slugs_dict.keys())[i:]:
    print(metric)
    print('Completed ' + str(np.round(i/num_metrics * 100, 2)) + '% of the metrics.\n')
    i += 1

    # One dataframe per successfully pulled slug; concatenated once per metric
    # instead of growing a dataframe row block by row block
    slug_dfs = []
    failed_pulls_dict[metric] = []

    # Obtain the list of slugs that have this metric
    san_slugs = macro_metric_slugs_dict[metric]

    # Drop wrapped-bitcoin when bitcoin itself is available
    # (this edits the list stored in macro_metric_slugs_dict in place)
    if ('bitcoin' in san_slugs) and ('wrapped-bitcoin' in san_slugs):
        san_slugs.remove('wrapped-bitcoin')

    # Loop over all the slugs to pull the data
    j = 0
    num_slugs = len(san_slugs)
    for san_slug in san_slugs:
        print('Completed ' + str(np.round(j/num_slugs * 100, 2)) + '% of the slugs for this metric.')
        j += 1

        # Run the call in a separate process so a hanging request can be killed
        pull_finished = False
        for wait_secs, still_running_msg in pull_attempts:
            # Fresh manager dict per attempt: obtainTimeseriesData writes its
            # result into the module-level name `metric_dict`
            manager = multiprocessing.Manager()
            metric_dict = manager.dict()
            p = multiprocessing.Process(target=obtainTimeseriesData, args=(metric, san_slug))
            p.start()

            time.sleep(wait_secs)
            if not p.is_alive():
                pull_finished = True
                break

            # Still running: kill it and reap the process before retrying
            print(still_running_msg)
            p.terminate()
            p.join()

        if not pull_finished:
            failed_pulls_dict[metric].append(san_slug)
            continue

        # Cleanup
        p.join()

        # Keep results for this slug and metric only if the pull was successful
        if len(metric_dict.keys()) >= 1:
            slug_dfs.append(pd.DataFrame(data={'date': metric_dict['date'],
                                               'san_slug': metric_dict['san_slug'],
                                               metric: metric_dict[metric]}))
        else:
            failed_pulls_dict[metric].append(san_slug)
            print('For '+san_slug+' pull of '+metric+', we received no data so it is skipped.\n')

    # Without any data there are no key columns to merge on, so skip the merge
    if len(slug_dfs) == 0:
        print('No data was pulled for ' + metric + ', so nothing is merged.')
        print('\n\n\n')
        continue

    metric_df = pd.concat(slug_dfs)
    assert(0==metric_df[metric_df.duplicated(subset=['date', 'san_slug'])].shape[0]),('duped rows')

    # Merge each metrics dataframe onto the master df
    macro_df = macro_df.merge(metric_df,
                              on=['date', 'san_slug'],
                              how='outer',
                              validate='one_to_one')

    # Space out prints
    print('\n\n\n')

mcd_collat_ratio
Completed 0.0% of the metrics.

Completed 0.0% of the slugs for this metric.
Completed 10.0% of the slugs for this metric.
Completed 20.0% of the slugs for this metric.
Completed 30.0% of the slugs for this metric.
Completed 40.0% of the slugs for this metric.
Completed 50.0% of the slugs for this metric.
Completed 60.0% of the slugs for this metric.
Completed 70.0% of the slugs for this metric.
Completed 80.0% of the slugs for this metric.
Completed 90.0% of the slugs for this metric.




scd_collat_ratio
Completed 4.76% of the metrics.

Completed 0.0% of the slugs for this metric.




defi_total_value_locked_usd
Completed 9.52% of the metrics.

Completed 0.0% of the slugs for this metric.




nft_trade_volume_usd
Completed 14.29% of the metrics.

Completed 0.0% of the slugs for this metric.




nft_trades_count
Completed 19.05% of the metrics.

Completed 0.0% of the slugs for this metric.




nft_retail_trade_volume_usd
Completed 23.81% of the metrics.

Completed 0.0

Completed 94.44% of the slugs for this metric.
Completed 95.37% of the slugs for this metric.
api call is still running... let's kill it...
Completed 96.3% of the slugs for this metric.
Completed 97.22% of the slugs for this metric.
Completed 98.15% of the slugs for this metric.
Completed 99.07% of the slugs for this metric.




dex_traders_to_defi_flow
Completed 85.71% of the metrics.

Completed 0.0% of the slugs for this metric.
Completed 0.83% of the slugs for this metric.
Completed 1.67% of the slugs for this metric.
Completed 2.5% of the slugs for this metric.
Completed 3.33% of the slugs for this metric.
Completed 4.17% of the slugs for this metric.
Completed 5.0% of the slugs for this metric.
Completed 5.83% of the slugs for this metric.
Completed 6.67% of the slugs for this metric.
Completed 7.5% of the slugs for this metric.
Completed 8.33% of the slugs for this metric.
Completed 9.17% of the slugs for this metric.
Completed 10.0% of the slugs for this metric.
Completed 10.83%

Completed 30.66% of the slugs for this metric.
Completed 31.39% of the slugs for this metric.
Completed 32.12% of the slugs for this metric.
Completed 32.85% of the slugs for this metric.
Completed 33.58% of the slugs for this metric.
Completed 34.31% of the slugs for this metric.
Completed 35.04% of the slugs for this metric.
Completed 35.77% of the slugs for this metric.
Completed 36.5% of the slugs for this metric.
Completed 37.23% of the slugs for this metric.
Completed 37.96% of the slugs for this metric.
Completed 38.69% of the slugs for this metric.
Completed 39.42% of the slugs for this metric.
Completed 40.15% of the slugs for this metric.
Completed 40.88% of the slugs for this metric.
Completed 41.61% of the slugs for this metric.
Completed 42.34% of the slugs for this metric.
Completed 43.07% of the slugs for this metric.
Completed 43.8% of the slugs for this metric.
Completed 44.53% of the slugs for this metric.
Completed 45.26% of the slugs for this metric.
api call is sti

In [133]:
# CLEAN THE MACRO DATA
# Collapses the per-token macro_df into one row per date (san_slug == 'macro'
# while it is being built), holding market-cap weighted averages, sums across
# tokens, and single-token series renamed to santiment_<token>_<metric>.
# NOTE: this cell drops columns from ts_df, so it cannot be re-run on its own
# output.

# Move over some data from the timeseries df to the macro df
temp_df = ts_df[['date', 'san_slug', 'stock_to_flow', 'traders_to_defi_flow', 'traders_defi_balance']]
ts_df = ts_df.drop(['stock_to_flow', 'traders_to_defi_flow', 'traders_defi_balance'], axis=1)
macro_df = macro_df.merge(temp_df,
                          on=['date', 'san_slug'],
                          how='outer',
                          validate='one_to_one')

# Form mcap weighted average variables
temp_df = macro_df[['date', 'san_slug', 'stock_to_flow', 'mvrv_usd_intraday', 'mcd_collat_ratio']]
temp_df = temp_df.dropna(how='all', subset=['stock_to_flow', 'mvrv_usd_intraday', 'mcd_collat_ratio'])
stf_tokens  = list(np.unique(temp_df[~temp_df.stock_to_flow.isnull()].san_slug.values))
mvrv_tokens = list(np.unique(temp_df[~temp_df.mvrv_usd_intraday.isnull()].san_slug.values))
mcd_tokens  = list(np.unique(temp_df[~temp_df.mcd_collat_ratio.isnull()].san_slug.values))
# Only tokens with at least one of the three averaged metrics need a market cap.
# (A stale reference to an undefined scd_tokens list was removed here.)
mcap_needed_tokens = list(np.unique(np.array(stf_tokens + mvrv_tokens + mcd_tokens)))
temp_mcap_df = ts_df[ts_df.san_slug.isin(mcap_needed_tokens)][['date', 'san_slug', "marketcap_usd"]]
temp_df = temp_df.merge(temp_mcap_df,
                        on=['date', 'san_slug'],
                        how='inner',
                        validate='one_to_one')
for col in ['stock_to_flow', 'mvrv_usd_intraday', 'mcd_collat_ratio']:
    mcap_avg_temp_df = temp_df[['date', 'san_slug', col, 'marketcap_usd']]
    # .copy() so the column assignments below act on an independent frame
    mcap_avg_temp_df = mcap_avg_temp_df.dropna().copy()
    # Weight = token market cap / summed market cap of the tokens that have
    # both a value for col and a market cap on that date
    mcap_avg_temp_df['total_mcap'] = mcap_avg_temp_df.groupby('date')['marketcap_usd'].transform('sum')
    mcap_avg_temp_df['mcap_fraction'] = mcap_avg_temp_df.marketcap_usd / mcap_avg_temp_df.total_mcap
    mcap_avg_temp_df['temp'] = mcap_avg_temp_df.mcap_fraction * mcap_avg_temp_df[col]
    mcap_avg_temp_df = mcap_avg_temp_df.groupby('date')[['temp']].sum()
    mcap_avg_temp_df['san_slug'] = 'macro'
    mcap_avg_temp_df = mcap_avg_temp_df.reset_index()
    mcap_avg_temp_df = mcap_avg_temp_df.rename(columns = {'temp': ('santiment_token_mcap_avg_'+col)})
    macro_df = macro_df.merge(mcap_avg_temp_df,
                              on=['date', 'san_slug'],
                              how='outer',
                              validate='one_to_one')

# Form columns of dollar sum across tokens
# NOTE(review): groupby(...).sum() yields 0 for dates where every token is
# missing; confirm that 0 rather than NaN is wanted for those dates.
sum_columns = ['cexes_to_dex_flow', 'exchanges_to_defi_flow', 'whale_to_defi_flow', 'dex_traders_to_defi_flow',
               'whale_defi_balance', 'traders_to_defi_flow', 'traders_defi_balance']
for col in sum_columns:
    temp_df = macro_df[['date', col]]
    temp_df = temp_df.groupby('date')[[col]].sum()
    temp_df = temp_df.rename(columns={col: 'santiment_token_sum_'+col})
    temp_df['san_slug'] = 'macro'
    temp_df = temp_df.reset_index()
    macro_df = macro_df.merge(temp_df,
                              on=['date', 'san_slug'],
                              how='outer',
                              validate='one_to_one')

# Form macro variables from column with just bitcoin and/or ethereum
columns = ['defi_total_value_locked_usd', 'nft_trade_volume_usd',
           'nft_trades_count', 'nft_retail_trade_volume_usd',
           'nft_whale_trade_volume_usd', 'nft_whale_trades_count',
           'percent_of_whale_stablecoin_total_supply',
           'average_fees_usd', 'fees_usd', 'eth2_roi', 'median_fees_usd',
           'miners_to_exchanges_flow', 'miners_exchange_balance',
           'traders_to_defi_flow', 'traders_defi_balance',
           'mvrv_usd_intraday', 'stock_to_flow']
for col in columns:
    temp_df = macro_df[['date', 'san_slug', col]]
    temp_df = temp_df.dropna()
    # By default every token that has data for the column gets its own series
    btc_eth_token = list(np.unique(temp_df.san_slug.values))
    # These columns have data for many tokens; only the ethereum series is kept
    if col in ['traders_defi_balance', 'traders_to_defi_flow',
               'mvrv_usd_intraday', 'stock_to_flow']:
        btc_eth_token = ['ethereum']
    for token in btc_eth_token:
        # .copy() so relabeling the slug does not write into a slice of temp_df
        temp_token_df = temp_df[temp_df.san_slug == token].copy()
        temp_token_df['san_slug'] = 'macro'
        temp_token_df = temp_token_df.rename(columns = {col: 'santiment_'+token+'_'+col})
        macro_df = macro_df.merge(temp_token_df,
                                  on=['date', 'san_slug'],
                                  how='outer',
                                  validate='one_to_one')

# Drop sum_columns and columns from macro_df
macro_df = macro_df.drop((sum_columns + columns), axis=1)
macro_df = macro_df.drop(['mcd_collat_ratio'], axis=1)

# Keep just the new rows
macro_df = macro_df[macro_df.san_slug == 'macro']
macro_df = macro_df.drop('san_slug', axis=1)

# Clean it up: sort first, then renumber, so the index follows the date order
macro_df = macro_df.sort_values(by='date')
macro_df = macro_df.reset_index(drop=True)

In [246]:
# SAVE DATA
# Write the three santiment dataframes as pickles under ../3-data/raw/.
# ts_df and macro_df are the panel and macro frames built earlier in the notebook.
# NOTE(review): cw_df is not built in this part of the notebook -- confirm it is
# defined by an earlier cell before running this one on a fresh kernel.
ts_df.to_pickle('../3-data/raw/santiment_panel.pkl')
macro_df.to_pickle('../3-data/raw/santiment_macro.pkl')
cw_df.to_pickle('../3-data/raw/santiment_cw.pkl')