In [1]:
import os
# Cap OpenMP at a single thread. This runs in the first cell, before lightgbm
# is imported by the next one.
# NOTE(review): presumably so the setting is in place when the native library
# loads — confirm that is the intent.
os.environ["OMP_NUM_THREADS"] = "1"

In [2]:
import polars as pl
import lightgbm as lgb
import numpy as np

In [3]:
import logging
import sys
import psutil

# psutil handle created once and reused by the memory helpers below for every
# reading.
CURRENT_PROCESS = psutil.Process()

# NOTE(review): this logger is not used by any cell visible in this notebook;
# log_memory_consumption() reports via print() instead.
logger = logging.getLogger(__name__)

def log_memory_consumption():
    """Print the process's current and peak memory usage in GiB.

    The current figure is the ``rss`` field of ``CURRENT_PROCESS.memory_info()``
    converted from bytes. The peak figure comes from ``_get_peak_memory_gib()``,
    which returns ``None`` on platforms it does not recognise; in that case the
    peak is printed as ``n/a`` instead of raising.
    """
    current_memory = CURRENT_PROCESS.memory_info().rss / (2**30)
    peak_memory = _get_peak_memory_gib()
    # Applying the ":.2f" format spec to None raises TypeError, so build the
    # peak text separately and fall back to a placeholder when it is missing.
    peak_text = "n/a" if peak_memory is None else f"{peak_memory:.2f} GiB"
    print(
        f"Currently using {current_memory:.2f} GiB memory (max {peak_text})",
    )

def _get_peak_memory_gib() -> float | None:
    match sys.platform:
        case "win32":
            # Return value is bytes
            return CURRENT_PROCESS.memory_info().peak_wset / (2**30)
        case "linux":
            import resource  # NOTE: This package is not available on Windows

            # Return value is kilobytes
            return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / (2**20)
        case "darwin":
            import resource  # NOTE: This package is not available on Windows

            # Return value is bytes
            return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / (2**30)
        case _:
            return None


In [4]:
# Baseline reading, taken before any data has been created.
log_memory_consumption()

Currently using 0.17 GiB memory (max 0.17 GiB)


In [5]:
# Build the benchmark frame: 100 columns ("col_0" .. "col_99") of 1,000,000
# standard-normal samples each. A seeded generator is used so that a
# Restart & Run All recreates exactly the same data (and hence the same
# parquet file written by the following cell).
rng = np.random.default_rng(0)
df = pl.DataFrame({
    f"col_{i}": rng.standard_normal(1_000_000)
    for i in range(100)
})
log_memory_consumption()

Currently using 0.92 GiB memory (max 0.92 GiB)


In [6]:
# Write the frame to "df.parquet" (relative to the working directory) so the
# following cells can read it back, then log memory after the write.
df.write_parquet("df.parquet")
log_memory_consumption()

Currently using 0.94 GiB memory (max 0.95 GiB)


In [7]:
# Open the parquet file as a scan; the cells below call .collect() on `ldf`
# to materialise it. Memory is logged right after creating the scan.
ldf = pl.scan_parquet("df.parquet")
log_memory_consumption()

Currently using 0.95 GiB memory (max 0.95 GiB)


In [8]:
# Collect the parquet scan ten times, dropping each result straight away.
# Memory is logged before every iteration and once more after the loop.
for _ in range(10):
    log_memory_consumption()
    a = ldf.collect()
    del a
log_memory_consumption()

Currently using 0.95 GiB memory (max 0.95 GiB)
Currently using 1.77 GiB memory (max 2.50 GiB)
Currently using 1.80 GiB memory (max 2.53 GiB)
Currently using 1.82 GiB memory (max 2.55 GiB)
Currently using 1.83 GiB memory (max 2.55 GiB)
Currently using 1.83 GiB memory (max 2.56 GiB)
Currently using 1.83 GiB memory (max 2.56 GiB)
Currently using 1.83 GiB memory (max 2.56 GiB)
Currently using 1.85 GiB memory (max 2.57 GiB)
Currently using 1.85 GiB memory (max 2.58 GiB)
Currently using 1.86 GiB memory (max 2.58 GiB)


In [9]:
# Convert the already in-memory frame `df` to Arrow ten times (no parquet
# read involved), dropping each result. Memory is logged before every
# iteration and once more after the loop.
for _ in range(10):
    log_memory_consumption()
    a = df.to_arrow()
    del a
log_memory_consumption()

Currently using 1.86 GiB memory (max 2.58 GiB)
Currently using 1.86 GiB memory (max 2.58 GiB)
Currently using 1.86 GiB memory (max 2.58 GiB)
Currently using 1.86 GiB memory (max 2.58 GiB)
Currently using 1.86 GiB memory (max 2.58 GiB)
Currently using 1.86 GiB memory (max 2.58 GiB)
Currently using 1.86 GiB memory (max 2.58 GiB)
Currently using 1.86 GiB memory (max 2.58 GiB)
Currently using 1.86 GiB memory (max 2.58 GiB)
Currently using 1.86 GiB memory (max 2.58 GiB)
Currently using 1.86 GiB memory (max 2.58 GiB)


In [10]:
# Collect the parquet scan and convert the result to Arrow, ten times over,
# dropping each result. Memory is logged before every iteration and once more
# after the loop.
for _ in range(10):
    log_memory_consumption()
    a = ldf.collect().to_arrow()
    del a
log_memory_consumption()

Currently using 1.86 GiB memory (max 2.58 GiB)
Currently using 1.87 GiB memory (max 2.59 GiB)
Currently using 1.87 GiB memory (max 2.60 GiB)
Currently using 1.88 GiB memory (max 2.60 GiB)
Currently using 1.88 GiB memory (max 2.61 GiB)
Currently using 1.89 GiB memory (max 2.62 GiB)
Currently using 1.90 GiB memory (max 2.63 GiB)
Currently using 1.91 GiB memory (max 2.63 GiB)
Currently using 1.92 GiB memory (max 2.64 GiB)
Currently using 1.48 GiB memory (max 2.64 GiB)
Currently using 1.57 GiB memory (max 2.64 GiB)


In [11]:
# Collect the parquet scan and convert the result to a NumPy array, ten times
# over, dropping each result. Memory is logged before every iteration and once
# more after the loop.
for _ in range(10):
    log_memory_consumption()
    a = ldf.collect().to_numpy()
    del a
log_memory_consumption()

Currently using 1.56 GiB memory (max 2.64 GiB)
Currently using 1.46 GiB memory (max 2.95 GiB)
Currently using 1.47 GiB memory (max 2.96 GiB)
Currently using 1.47 GiB memory (max 2.96 GiB)
Currently using 1.48 GiB memory (max 2.97 GiB)
Currently using 1.49 GiB memory (max 2.98 GiB)
Currently using 1.50 GiB memory (max 2.99 GiB)
Currently using 1.50 GiB memory (max 2.99 GiB)
Currently using 1.50 GiB memory (max 2.99 GiB)
Currently using 1.50 GiB memory (max 2.99 GiB)
Currently using 1.51 GiB memory (max 3.00 GiB)


In [12]:
# Wrap the collected data (as a NumPy array) in a LightGBM Dataset ten times,
# dropping each one. Note that ds.construct() is NOT called in this cell; the
# next cell repeats the loop with it. Memory is logged before every iteration
# and once more after the loop.
for _ in range(10):
    log_memory_consumption()
    ds = lgb.Dataset(ldf.collect().to_numpy())
    del ds
log_memory_consumption()

Currently using 1.51 GiB memory (max 3.00 GiB)
Currently using 1.51 GiB memory (max 3.00 GiB)
Currently using 1.51 GiB memory (max 3.00 GiB)
Currently using 1.51 GiB memory (max 3.00 GiB)
Currently using 1.51 GiB memory (max 3.00 GiB)
Currently using 1.51 GiB memory (max 3.00 GiB)
Currently using 1.51 GiB memory (max 3.01 GiB)
Currently using 1.52 GiB memory (max 3.01 GiB)
Currently using 1.53 GiB memory (max 3.02 GiB)
Currently using 1.51 GiB memory (max 3.02 GiB)
Currently using 1.52 GiB memory (max 3.02 GiB)


In [13]:
# Same NumPy-backed Dataset loop, but this time ds.construct() is called
# before the Dataset is dropped. Memory is logged before every iteration and
# once more after the loop.
for _ in range(10):
    log_memory_consumption()
    ds = lgb.Dataset(ldf.collect().to_numpy())
    ds.construct()
    del ds
log_memory_consumption()

Currently using 1.52 GiB memory (max 3.02 GiB)
Currently using 2.01 GiB memory (max 3.50 GiB)
Currently using 2.23 GiB memory (max 3.72 GiB)
Currently using 2.06 GiB memory (max 3.77 GiB)
Currently using 2.05 GiB memory (max 3.77 GiB)
Currently using 2.01 GiB memory (max 3.77 GiB)
Currently using 2.10 GiB memory (max 3.77 GiB)
Currently using 2.12 GiB memory (max 3.77 GiB)
Currently using 1.71 GiB memory (max 3.77 GiB)
Currently using 1.77 GiB memory (max 3.77 GiB)
Currently using 1.62 GiB memory (max 3.77 GiB)


In [14]:
# Wrap the collected data (as an Arrow table) in a LightGBM Dataset ten times,
# dropping each one. ds.construct() is NOT called in this cell. Memory is
# logged before every iteration and once more after the loop.
for _ in range(10):
    log_memory_consumption()
    ds = lgb.Dataset(ldf.collect().to_arrow())
    del ds
log_memory_consumption()

Currently using 1.62 GiB memory (max 3.77 GiB)
Currently using 1.65 GiB memory (max 3.77 GiB)
Currently using 1.65 GiB memory (max 3.77 GiB)
Currently using 1.66 GiB memory (max 3.77 GiB)
Currently using 1.66 GiB memory (max 3.77 GiB)
Currently using 1.66 GiB memory (max 3.77 GiB)
Currently using 1.66 GiB memory (max 3.77 GiB)
Currently using 1.66 GiB memory (max 3.77 GiB)
Currently using 1.66 GiB memory (max 3.77 GiB)
Currently using 1.67 GiB memory (max 3.77 GiB)
Currently using 1.67 GiB memory (max 3.77 GiB)


In [8]:
# Arrow-backed Dataset loop with ds.construct() called before the Dataset is
# dropped. Memory is logged before every iteration and once more after the
# loop. In the recorded output, current memory climbs from 0.95 GiB to
# 6.97 GiB over the ten iterations instead of levelling off.
# NOTE(review): the execution count of this cell and its 0.95 GiB starting
# point suggest it was run in a fresh kernel right after the scan_parquet
# cell, not after the cell above — confirm before comparing the numbers.
for _ in range(10):
    log_memory_consumption()
    ds = lgb.Dataset(ldf.collect().to_arrow())
    ds.construct()
    del ds
log_memory_consumption()

Currently using 0.95 GiB memory (max 0.95 GiB)
Currently using 2.09 GiB memory (max 2.49 GiB)
Currently using 2.71 GiB memory (max 3.34 GiB)
Currently using 3.30 GiB memory (max 3.94 GiB)
Currently using 3.85 GiB memory (max 4.53 GiB)
Currently using 4.42 GiB memory (max 5.08 GiB)
Currently using 4.92 GiB memory (max 5.65 GiB)
Currently using 5.44 GiB memory (max 6.15 GiB)
Currently using 5.86 GiB memory (max 6.67 GiB)
Currently using 6.42 GiB memory (max 7.09 GiB)
Currently using 6.97 GiB memory (max 7.65 GiB)
