In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
from dotenv import load_dotenv
import numpy as np

from tqdm import tqdm

from sqlalchemy import create_engine, Column, Integer, String, Double, DateTime, Boolean, BigInteger, Float, ARRAY, CHAR
from sqlalchemy.orm import sessionmaker, declarative_base

from typing import cast


load_dotenv()

from datetime import datetime, timezone
from preload_pool_cache import load_pool_from_blob

In [3]:
# Connection settings come from the process environment (populated by load_dotenv() above).
# Indexing os.environ directly means a missing variable raises KeyError right here.
postgres_uri_mp, postgres_uri_us, azure_storage_uri = (
    os.environ[variable]
    for variable in (
        "POSTGRESQL_URI_MP",
        "POSTGRESQL_URI_US",
        "AZURE_STORAGE_CONNECTION_STRING",
    )
)

## Get data

In [4]:
# Block range covered by the mev_boost table; the swap queries below are bounded by it.
block_range_sql = """
    SELECT
        MIN(block_number) AS min_block,
        MAX(block_number) AS max_block
    FROM
        mev_boost
    """
minmax_block = pd.read_sql_query(block_range_sql, postgres_uri_mp)

# The aggregate query yields a single row, so take the first entry of each column.
min_block = minmax_block["min_block"].iloc[0]
max_block = minmax_block["max_block"].iloc[0]

print(f"Min block: {min_block}, Max block: {max_block}")

Min block: 15537940, Max block: 17959956


In [6]:
# Distinct pool addresses that have swap counts inside the MEV-boost block range.
addresses_sql = f"""
    SELECT DISTINCT(address) FROM swap_counts
    WHERE block_number >= {min_block}
    AND block_number <= {max_block}
    ORDER BY address ASC
    """
addresses = pd.read_sql_query(addresses_sql, postgres_uri_us)

print(addresses.shape)
addresses.head()

(8131, 1)


Unnamed: 0,address
0,0xb6c945c5c3473f70968fe4de3222561e0d465a58
1,0x9d2713fa2f387ed1284a4176e7841253b4da2a71
2,0x3fae0f474145a1a771f36bd188d1cc7057a91b06
3,0x44f9469d0d5393d3a01a0d4fa14fe7713c1ad1f7
4,0xf6ed2390be39c783ae78893c91669eeb635d0429


In [5]:
def get_swaps_for_address(address, min_block, max_block):
    """Load every row of ``swaps`` for one address within an inclusive block range.

    Args:
        address: Value matched against ``swaps.address``.
        min_block: Lowest block number to include.
        max_block: Highest block number to include.

    Returns:
        A DataFrame holding all columns of the matching ``swaps`` rows.

    Note:
        The arguments are interpolated directly into the SQL text, so only
        pass values that come from a trusted source.
    """
    query = f"""
        SELECT * FROM swaps
        WHERE block_number >= {min_block}
        AND block_number <= {max_block}
        AND address = '{address}'
        """
    return pd.read_sql_query(query, postgres_uri_us)


In [6]:
# Collect (token, decimals) from both sides of every pool in token_info and
# keep only the tokens whose decimals are known.
decimals_sql = """
    WITH tok AS (
    SELECT token0 AS token, decimals0 AS decimals FROM token_info
    UNION
    SELECT token1 AS token, decimals1 AS decimals FROM token_info
    )
    SELECT * FROM tok
    WHERE tok.decimals IS NOT NULL;
    """
decimals_frame = pd.read_sql_query(decimals_sql, postgres_uri_us)

# token -> decimals lookup
decimals = decimals_frame.set_index("token")["decimals"].to_dict()

len(decimals)

9536

In [7]:
token_info_frame = pd.read_sql_query(
    """
    SELECT * FROM token_info
    """,
    postgres_uri_us,
)

# Column-oriented lookup: token_info[column][pool] -> value
pool_columns = ["token0", "token1", "decimals0", "decimals1"]
token_info = token_info_frame.set_index("pool")[pool_columns].to_dict()

## Calculate the metric for each block-pool pair

The only three p-norms worth focusing on are l_1, l_2, and the infinity-norm (i.e., max).

I want the following columns in the metric table:
- block number (bigint)
- pool address (string)
- number of transactions in the block-pool pair (int)
- From MEV-boost (bool) 
- MEV-boost amount (double)
- baseline price p_0 (double) # to check correlations
- realized order 
- realized l_1 (double)
- realized l_2 (double)
- realized l_infinity (double)
- Volume heuristic l_1 (double)
- Volume heuristic l_2 (double)
- Volume heuristic l_infinity (double)

... and add more columns for l_1, l_2, and l_infinity when new heuristics are added.

Plan:
- Find the number of pool-blocknumber pairs and split them into equal-sized chunks based on the number of cores
- Query to get a list of pool-blocknumber pairs for each core
- For each pool-blocknumber pair, query the transactions and calculate the metrics
- Save the metrics to the database

In [8]:
# Pull the swaps of one pool and keep the first row as a sample to experiment with.
sample_pool_address = "0x8ad599c3a0ff1de082011efddc58f1908eb6e6d8"
df = get_swaps_for_address(sample_pool_address, min_block, max_block)

swap = df.iloc[0]

print(swap.tx_hash)

swap

0x248c4c26f2a3c3b9025c6581a5b4f576ed1f3a78f857a8f4df81b0234dcd1dea


block_ts                                           2022-12-15 05:57:59
block_number                                                  16188153
tx_hash              0x248c4c26f2a3c3b9025c6581a5b4f576ed1f3a78f857...
log_index                                                            2
sender                      0x000000000035b5e5ad9019092c665357240f594e
recipient                   0x000000000035b5e5ad9019092c665357240f594e
amount0                                                     -128632716
amount1                                             100000000000000000
sqrtpricex96                        2205725684563943950859979373655521
liquidity                                         13312430621130245758
tick                                                            204694
address                     0x8ad599c3a0ff1de082011efddc58f1908eb6e6d8
to_address                  0x26ccc3a2052be5898d60683c7bb621047153bb19
from_address                0x000000000035b5e5ad9019092c665357240f594e
transa

In [10]:
# Ratio of the raw swap amounts, rescaled by the pool's decimals difference.
raw_amount_ratio = int(swap.amount0) / int(swap.amount1)
decimals_difference = token_info["decimals0"][swap.address] - token_info["decimals1"][swap.address]
raw_amount_ratio / 10**decimals_difference

-1286.32716

In [11]:
def get_mev_boost_values() -> dict[int, float]:
    """Fetch the MEV-boost value recorded for each block.

    Returns:
        A dict keyed by ``block_number`` whose values are the matching
        ``mevboost_value`` entries of the ``mev_boost`` table.
    """
    query = """
        SELECT block_number, mevboost_value
        FROM
            mev_boost
        """
    rows = pd.read_sql_query(query, postgres_uri_mp)
    return {
        block: value
        for block, value in zip(rows["block_number"], rows["mevboost_value"])
    }


In [12]:
def get_pool_block_pairs(limit, offset, min_block=15537940, max_block=17959956) -> pd.DataFrame:
    """Fetch one page of (pool address, block number) pairs from ``swap_counts``.

    Pairs are ordered by address and then block number, so consecutive pages
    (same ``limit``, increasing ``offset``) partition the table deterministically.

    Args:
        limit: Maximum number of pairs to return.
        offset: Number of pairs to skip before the first returned pair.
        min_block: Lowest block number to include. Defaults to the bound that
            was previously hard-coded in the query.
        max_block: Highest block number to include. Defaults to the bound that
            was previously hard-coded in the query.

    Returns:
        A DataFrame with the columns ``address`` and ``block_number``.
    """
    # int() makes sure only plain integers are interpolated into the SQL text
    # (this also normalises NumPy integers coming out of pandas).
    return pd.read_sql_query(
        f"""
        SELECT address, block_number FROM swap_counts
        WHERE block_number >= {int(min_block)} AND block_number <= {int(max_block)}
        ORDER BY address ASC, block_number ASC
        LIMIT {int(limit)} OFFSET {int(offset)}
        """,
        postgres_uri_us,
    )

In [13]:
# Load the block -> MEV-boost value lookup once; the processing loop consults it for every block.
mev_boost_values = get_mev_boost_values()

In [68]:
# Treat token0 as the input side when amount0 is positive, otherwise token1.
sample_amount0 = int(swap.amount0)
if sample_amount0 > 0:
    token_in = token_info["token0"][swap.address]
    input_amount = sample_amount0
else:
    token_in = token_info["token1"][swap.address]
    input_amount = int(swap.amount1)

token_in, input_amount

('0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2', 100000000000000000)

In [14]:
def get_price(sqrt_price, pool_addr):
    """Turn a square-root price into a decimal-adjusted price for a pool.

    Computes ``1 / sqrt_price**2`` and divides the result by
    ``10 ** (decimals0 - decimals1)``, where the decimals are read from
    ``token_info`` for ``pool_addr``.

    Args:
        sqrt_price: Square-root price to convert.
        pool_addr: Pool address used as the key into ``token_info``.

    Returns:
        The decimal-adjusted price.
    """
    inverse_squared = 1 / (sqrt_price**2)
    decimals_gap = token_info["decimals0"][pool_addr] - token_info["decimals1"][pool_addr]
    return inverse_squared / 10**decimals_gap

In [21]:
# Load the pool that the sample swap belongs to, with progress messages enabled.
pool = load_pool_from_blob(
    swap.address,
    postgres_uri_us,
    azure_storage_uri,
    "uniswap-v3-pool-cache",
    verbose=True,
    # NOTE(review): presumably cache entries older than this date are rebuilt — confirm.
    # get_pool() further down passes 2023-08-23 instead; verify the difference is intended.
    invalidate_before_date=datetime(2023, 8, 18, tzinfo=timezone.utc),
)

Loading pool from Azure blob storage cache


In [None]:
# Replay the sample swap against the loaded pool as of the swap's own block.
sample_swap_request = {
    "tokenIn": token_in,
    "input": input_amount,
    "as_of": swap.block_number,
    "gas": True,
    # "givenPrice": curr_price,
}
out, heur = pool.swapIn(sample_swap_request)
out, heur

In [61]:
# Decimal-adjusted price derived from heur.sqrtP_next for the sample swap's pool.
get_price(heur.sqrtP_next, swap.address)

1290.1974130644476

In [22]:
def get_pool(address, it):
    """Load a pool quietly, forwarding an existing progress bar to the loader.

    Args:
        address: Pool address to load.
        it: Progress bar handed to ``load_pool_from_blob`` as ``pbar``.

    Returns:
        Whatever ``load_pool_from_blob`` returns for ``address``.
    """
    cache_cutoff = datetime(2023, 8, 23, tzinfo=timezone.utc)
    return load_pool_from_blob(
        address,
        postgres_uri_us,
        azure_storage_uri,
        "uniswap-v3-pool-cache",
        verbose=False,
        invalidate_before_date=cache_cutoff,
        pbar=it,
    )

The only three p-norms worth focusing on are l_1, l_2, and the infinity-norm (i.e., max).

I want the following columns in the metric table:
- block number (bigint)
- pool address (string)
- number of transactions in the block-pool pair (int)
- From MEV-boost (bool) 
- MEV-boost amount (double)
- baseline price p_0 (double) # to check correlations
- realized order (ARRAY(CHAR(7))) # (CONCAT(transaction_index, _, log_index))
- realized prices (ARRAY(double))
- realized l_1 (double)
- realized l_2 (double)
- realized l_infinity (double)
- Volume heuristic order (ARRAY(CHAR(7))) # (CONCAT(transaction_index, _, log_index))
- Volume heuristic prices (ARRAY(double))
- Volume heuristic l_1 (double)
- Volume heuristic l_2 (double)
- Volume heuristic l_infinity (double)

In [16]:
from sqlalchemy import Boolean


# Engine and session factory for the database addressed by POSTGRESQL_URI_MP;
# the computed block metrics are written through these.
engine_mp = create_engine(postgres_uri_mp)

SessionLocalMP = sessionmaker(bind=engine_mp)

# NOTE(review): program_start is not read anywhere in the visible notebook — confirm it is still needed.
program_start = datetime.now()
# Declarative base class that the ORM model below inherits from.
Base = declarative_base()

class BlockMetrics(Base):
    """ORM model for the ``block_metrics`` table: one row per (block, pool) pair.

    The primary key is the combination of ``block_number`` and ``pool_address``.
    Each ordering that is measured (realized, volume heuristic) stores the swap
    order, the per-swap price values and their l_1, l_2 and l_infinity norms.
    """
    __tablename__ = "block_metrics"

    # Meta Data
    # NOTE(review): the planning notes list block number as bigint, but the
    # column is declared Integer — confirm which one is intended.
    block_number = Column(Integer, primary_key=True)
    pool_address = Column(String, primary_key=True)
    # Number of swaps found for this pool in this block.
    num_transactions = Column(Integer)
    # Decimal-adjusted pool price at the block, before any swap is replayed.
    baseline_price = Column(Double)

    # MEV Data
    # True when the block number is present in the mev_boost lookup.
    mev_boost = Column(Boolean)
    mev_boost_amount = Column(Double)

    # Realized Data
    # Order keys are built as f"{transaction_index:03}_{log_index:03}" by the
    # processing loop, which is 7 characters as long as both indices are below 1000.
    realized_order = Column(ARRAY(CHAR(7)))
    # Price after each swap minus baseline_price, in the same order as realized_order.
    realized_prices = Column(ARRAY(Double))
    realized_l1 = Column(Double)
    realized_l2 = Column(Double)
    realized_linf = Column(Double)

    # Volume Heuristic Data
    # Same layout as the realized columns, for the volume-heuristic ordering.
    volume_heur_order = Column(ARRAY(CHAR(7)))
    volume_heur_prices = Column(ARRAY(Double))
    volume_heur_l1 = Column(Double)
    volume_heur_l2 = Column(Double)
    volume_heur_linf = Column(Double)
    

# Create the tables of every model registered on Base (here: block_metrics);
# by default create_all leaves tables that already exist untouched.
Base.metadata.create_all(engine_mp)

In [17]:
def norm(prices, norm):
    """Compute the l_1, l_2 or l_infinity norm of a sequence of values.

    Args:
        prices: Array-like of numbers (NumPy array, list, tuple, ...).
        norm: Which norm to compute: ``1``, ``2`` or ``np.inf``.

    Returns:
        The requested norm. An empty input has norm 0 for every supported
        choice of ``norm``.

    Raises:
        ValueError: If ``norm`` is not one of ``1``, ``2`` or ``np.inf``.
    """
    # Coerce once so that every branch accepts plain lists; ``prices**2`` on a
    # list would raise TypeError while the other branches already accepted it.
    values = np.asarray(prices)

    if norm == 1:
        return np.sum(np.abs(values))
    elif norm == 2:
        return np.sqrt(np.sum(values**2))
    elif norm == np.inf:
        # np.max raises on an empty array; the norm of an empty vector is 0.
        if values.size == 0:
            return 0.0
        return np.max(np.abs(values))
    else:
        raise ValueError("Invalid norm")
    

In [18]:
def do_swap(swap, curr_price, pool):
    """Replay a single swap on ``pool`` starting from ``curr_price``.

    token0 is used as the input token when ``swap.amount0`` is positive,
    otherwise token1; the amount on the chosen side is the input size.

    Args:
        swap: Swap row providing ``address``, ``amount0``, ``amount1`` and
            ``block_number``.
        curr_price: Price handed to the pool as ``givenPrice``.
        pool: Pool object exposing ``swapIn``.

    Returns:
        The ``(out, heur)`` pair produced by ``pool.swapIn``.
    """
    amount0 = int(swap.amount0)
    if amount0 > 0:
        side, input_amount = "token0", amount0
    else:
        side, input_amount = "token1", int(swap.amount1)

    request = {
        "tokenIn": token_info[side][swap.address],
        "input": input_amount,
        "as_of": swap.block_number,
        "gas": True,
        "givenPrice": curr_price,
    }
    out, heur = pool.swapIn(request)

    return out, heur

In [23]:
def _simulate_swap(swap, curr_price, pool, pool_addr, baseline_price):
    """Replay one swap and return (price deviation, order key, next sqrt price).

    The deviation is the decimal-adjusted price after the swap minus
    ``baseline_price``. The order key is ``transaction_index`` and
    ``log_index``, each zero-padded to three digits and joined by ``_``.
    """
    _, heur = do_swap(swap, curr_price, pool)
    # get_price looks the decimals up by pool address, so it must receive the
    # address and not the swap row (a Series is unhashable and would raise).
    deviation = get_price(heur.sqrtP_next, pool_addr) - baseline_price
    order_key = f"{swap.transaction_index:03}_{swap.log_index:03}"
    return deviation, order_key, heur.sqrtP_next


def _replay_in_given_order(swaps, start_price, pool, pool_addr, baseline_price):
    """Replay ``swaps`` row by row in the order given; return (prices, ordering)."""
    prices = np.zeros(swaps.shape[0])
    ordering = []
    curr_price = start_price
    for i, (_, swap) in enumerate(swaps.iterrows()):
        prices[i], order_key, curr_price = _simulate_swap(
            swap, curr_price, pool, pool_addr, baseline_price
        )
        ordering.append(order_key)
    return prices, ordering


def _replay_volume_heuristic(swaps, start_price, pool, pool_addr, baseline_price):
    """Replay ``swaps`` in the volume-heuristic order; return (prices, ordering).

    Swaps with positive ``amount0`` ("buys") and swaps with positive
    ``amount1`` ("sells") are each consumed from the smallest amount upwards.
    While both sides are non-empty, a buy is taken when the current price is
    at or above the starting price and a sell otherwise; afterwards the
    remaining side is drained.
    """
    # pop() takes from the end, so sorting descending yields the smallest amount first.
    buys = [row for _, row in swaps[swaps.amount0 > 0].sort_values("amount0", ascending=False).iterrows()]
    sells = [row for _, row in swaps[swaps.amount1 > 0].sort_values("amount1", ascending=False).iterrows()]

    prices = np.zeros(swaps.shape[0])
    ordering = []
    curr_price = start_price
    i = 0

    while buys or sells:
        if buys and sells:
            # Core: both sides still have swaps, choose by comparing with the starting price.
            queue = buys if curr_price >= start_price else sells
        else:
            # Tail: only one side is left.
            queue = buys or sells

        swap = queue.pop()
        prices[i], order_key, curr_price = _simulate_swap(
            swap, curr_price, pool, pool_addr, baseline_price
        )
        ordering.append(order_key)
        i += 1

    return prices, ordering


def _record_run(metric, prefix, prices, ordering):
    """Store one ordering's results on ``metric``.

    ``prefix`` is ``"realized"`` or ``"volume_heur"`` and selects the
    ``<prefix>_prices``, ``<prefix>_order``, ``<prefix>_l1``, ``<prefix>_l2``
    and ``<prefix>_linf`` attributes. Values are converted to plain Python
    lists/floats so that no NumPy objects are handed to the database driver.
    """
    setattr(metric, f"{prefix}_prices", prices.tolist())
    setattr(metric, f"{prefix}_order", list(ordering))
    setattr(metric, f"{prefix}_l1", float(norm(prices, 1)))
    setattr(metric, f"{prefix}_l2", float(norm(prices, 2)))
    setattr(metric, f"{prefix}_linf", float(norm(prices, np.inf)))


pool_block_pairs = get_pool_block_pairs(1_000, 0)

# The context manager closes the progress bar even if the run is interrupted.
with tqdm(total=pool_block_pairs.shape[0]) as it:
    for pool_addr, pairs_for_pool in pool_block_pairs.groupby("address"):
        # Every group is a different pool, so load it at the start of the group.
        pool = get_pool(pool_addr, it)

        swaps_for_pool = get_swaps_for_address(
            pool_addr,
            pairs_for_pool.block_number.min(),
            pairs_for_pool.block_number.max(),
        )

        for block_number in pairs_for_pool.block_number.unique():
            # log_index breaks ties between several swaps of the same transaction,
            # so the realized order is fully determined.
            swaps = swaps_for_pool[swaps_for_pool.block_number == block_number].sort_values(
                ["transaction_index", "log_index"]
            )

            if swaps.shape[0] == 0:
                # Nothing to measure, but the pair still counts towards progress.
                it.update(1)
                continue

            start_price = pool.getPriceAt(block_number)
            baseline_price = get_price(start_price, pool_addr)

            swap_metric = BlockMetrics(
                block_number=int(block_number),
                pool_address=pool_addr,
                num_transactions=swaps.shape[0],
                mev_boost=block_number in mev_boost_values,
                mev_boost_amount=mev_boost_values.get(block_number, 0),
                baseline_price=baseline_price,
            )

            # Run the baseline (realized order) measurement
            realized_prices, realized_order = _replay_in_given_order(
                swaps, start_price, pool, pool_addr, baseline_price
            )
            _record_run(swap_metric, "realized", realized_prices, realized_order)

            # Run the volume heuristic measurement. With a single swap there is
            # only one possible order, so the realized results are reused.
            if swaps.shape[0] == 1:
                heur_prices, heur_order = realized_prices, realized_order
            else:
                heur_prices, heur_order = _replay_volume_heuristic(
                    swaps, start_price, pool, pool_addr, baseline_price
                )
            _record_run(swap_metric, "volume_heur", heur_prices, heur_order)

            # Persist every pair, including single-swap blocks; committing per
            # pair keeps finished work if the run is interrupted. The session
            # is closed when the with-block exits.
            with SessionLocalMP() as session:
                session.add(swap_metric)
                session.commit()

            it.update(1)


  0%|          | 0/1000 [00:00<?, ?it/s, Loading pool from database]                      

KeyboardInterrupt: 