In [1]:
import polars as pl

pl.Config.set_tbl_width_chars(100)

polars.config.Config

In [2]:
df = pl.read_parquet(
    "/data/scratch/ankile/surveillance-metric-2/data_original/swaps*/*.parquet"
)

In [3]:
f"{df.shape[0]:,}"

'41,692,163'

In [4]:
df.columns

['block_timestamp',
 'block_number',
 'transaction_hash',
 'log_index',
 'sender',
 'recipient',
 'amount0',
 'amount1',
 'sqrtPriceX96',
 'liquidity',
 'tick',
 'address',
 'from_address',
 'to_address',
 'transaction_index']

In [5]:
df[0].get_column("address").item()

'0x4b5ab61593a2401b1075b90c04cbcdd3f87ce011'

In [6]:
# Sort the df before storing to speed up subsequent reads
df = df.sort(by=["address", "block_number", "transaction_index", "log_index"])

In [7]:
# Write as a hive-partitioned dataset based on the address column
df.write_parquet(
    "/data/scratch/ankile/surveillance-metric-2/data_original/swaps",
    partition_by=["address"],
    compression="lz4",
)

In [8]:
!rm -rf /data/scratch/ankile/surveillance-metric-2/data_original/partitioned_swaps

/shells/zsh: /data/scratch/ankile/miniconda3/envs/rr/lib/libtinfo.so.6: no version information available (required by /shells/zsh)


In [16]:
del df

In [3]:
pool_addr = "0xCBCdF9626bC03E24f779434178A73a0B4bad62eD".lower()

In [7]:
# Check performance of reading a dataframe filtering on the address with and without partitioning
df = (
    pl.scan_parquet(
        "/data/scratch/ankile/surveillance-metric-2/data_original/swaps/",
        hive_partitioning=True,
    )
    .filter(pl.col("address") == pool_addr)
    # .sort(by=["block_number", "transaction_index", "log_index"])
    .collect()
    .to_pandas()
)

In [5]:
del df

In [6]:
df = (
    pl.scan_parquet(
        "/data/scratch/ankile/surveillance-metric-2/data_original/swaps*/*.parquet",
    )
    .filter(pl.col("address") == pool_addr)
    .sort(by=["block_number", "transaction_index", "log_index"])
    .collect()
    .to_pandas()
)

In [8]:
f"{df.shape[0]:,}"

'41,692,163'

In [5]:
# Measure size in memory
import sys

f"DF is {sys.getsizeof(df) / 1e6:.2f} MB"

'DF is 0.00 MB'

In [7]:
import os

os.environ["POLARS_MAX_THREADS"]

KeyError: 'POLARS_MAX_THREADS'