In [None]:
import polars as pl
# Not a dependency. This is another package I developed; it is used here to generate the demo data.
import polars_ds as pds 
import numpy as np
import pocache

import logging
# INFO level is enabled so log records emitted by the sessions are displayed.
logging.basicConfig(level=logging.INFO)

# Two cache sessions, one per mode used in this notebook. `ss_temp` is later
# paired with a parquet serializer/deserializer that work on a path; `ss_mem`
# is used as a bare decorator.
ss_temp = pocache.Session(mode="tempfile", verbose=True, secure=False)
ss_mem = pocache.Session(mode="mem", verbose=True, secure=False)

In [2]:
# Synthetic input for the caching demo: a frame built with size=300_000 plus
# two random numeric columns ("a", "b") and a random string column ("c")
# generated with min_size=1 and max_size=3.
df = pds.frame(size=300_000).with_columns(
    a=pds.random(),
    b=pds.random(),
    c=pds.random_str(min_size=1, max_size=3),
)
df.head()

row_num,a,b,c
i64,f64,f64,str
0,0.896763,0.603227,"""oX1"""
1,0.276445,0.766225,"""h3"""
2,0.089248,0.932558,"""L0"""
3,0.989181,0.814625,"""4Sx"""
4,0.305183,0.922261,"""Fj"""


In [3]:
@ss_mem.pocache
def expensive_df_transform1(df: pl.LazyFrame, threshold: float) -> pl.DataFrame:
    """Per-decile summary of the rows of `df` that pass a simple filter.

    Rows are kept when column "c" is exactly one character long and column
    "a" is strictly below `threshold`. Column "a" is then binned with
    `qcut(10, ...)` into buckets labelled "decile_001" .. "decile_010", and
    one row of statistics is produced per bucket.

    Parameters
    ----------
    df
        Frame with columns "a", "b" and "c". `.lazy()` is called on it before
        any other operation.
    threshold
        Exclusive upper bound applied to column "a".

    Returns
    -------
    pl.DataFrame
        Columns: deciles, cnt, a_min, a_max, d_std, b_sum, c_n_unique,
        sorted by "deciles".
    """
    import time

    # Artificial 2-second delay standing in for an expensive computation.
    time.sleep(2)

    decile_labels = [f"decile_{rank:03d}" for rank in range(1, 11)]
    is_single_char = pl.col("c").str.len_chars() == 1
    is_below_threshold = pl.col("a") < threshold

    selected = df.lazy().filter(is_single_char & is_below_threshold)
    enriched = selected.with_columns(
        deciles=pl.col("a").qcut(10, labels=decile_labels),
        d=(pl.col("a") + pl.col("b")) / 2,
    )
    summary = enriched.group_by("deciles").agg(
        cnt=pl.len(),
        a_min=pl.col("a").min(),
        a_max=pl.col("a").max(),
        d_std=pl.col("d").std(),
        b_sum=pl.col("b").sum(),
        c_n_unique=pl.col("c").n_unique(),
    )
    return summary.sort("deciles").collect()

In [4]:
@ss_temp.pocache(
    serializer=lambda df, path: df.write_parquet(path),
    deserializer=lambda path: pl.read_parquet(path),
)
def expensive_df_transform2(df: pl.LazyFrame, threshold: float) -> pl.DataFrame:
    """Per-decile summary of filtered rows; results round-trip through parquet.

    The decorator is given a serializer that writes the result to `path` as
    parquet and a deserializer that reads it back from `path`.

    The pipeline keeps rows whose "c" value has exactly one character and
    whose "a" value is strictly below `threshold`, bins "a" with
    `qcut(10, ...)` into "decile_001" .. "decile_010", and aggregates each
    bucket.

    Parameters
    ----------
    df
        Frame with columns "a", "b" and "c". `.lazy()` is called on it before
        any other operation.
    threshold
        Exclusive upper bound applied to column "a".

    Returns
    -------
    pl.DataFrame
        Columns: deciles, cnt, a_min, a_max, d_std, b_sum, c_n_unique,
        sorted by "deciles".
    """
    import time

    # Artificial 2-second delay standing in for an expensive computation.
    time.sleep(2)

    bucket_names = ["decile_" + str(n).zfill(3) for n in range(1, 11)]
    keep_row = (pl.col("c").str.len_chars() == 1) & (pl.col("a") < threshold)
    bucket_stats = [
        pl.len().alias("cnt"),
        pl.col("a").min().alias("a_min"),
        pl.col("a").max().alias("a_max"),
        pl.col("d").std().alias("d_std"),
        pl.col("b").sum().alias("b_sum"),
        pl.col("c").n_unique().alias("c_n_unique"),
    ]

    return (
        df.lazy()
        .filter(keep_row)
        .with_columns(
            deciles=pl.col("a").qcut(10, labels=bucket_names),
            d=(pl.col("a") + pl.col("b")) / 2,
        )
        .group_by("deciles")
        .agg(bucket_stats)
        .sort("deciles")
        .collect()
    )

In [5]:
# First call with these arguments: the recorded log shows the function body
# running (~2 s) and the result then being saved to the cache.
expensive_df_transform1(df, threshold=0.8)

INFO:pocache.session:Hashing took: 0.001933s. The function took: 2.007752s. Saving cache took: 0.000005s.


deciles,cnt,a_min,a_max,d_std,b_sum,c_n_unique
cat,u32,f64,f64,f64,f64,u32
"""decile_001""",7960,2.3e-05,0.081664,0.145218,4011.218072,62
"""decile_002""",7960,0.081673,0.161986,0.145002,3960.409791,62
"""decile_003""",7960,0.161988,0.241833,0.144214,3985.257775,62
"""decile_004""",7959,0.241838,0.320648,0.143484,4013.64743,62
"""decile_005""",7960,0.320665,0.400318,0.144888,3927.385731,62
"""decile_006""",7960,0.400329,0.481005,0.144418,3994.853296,62
"""decile_007""",7959,0.481005,0.559261,0.145066,3989.405837,62
"""decile_008""",7960,0.559267,0.639255,0.145566,3978.256048,62
"""decile_009""",7960,0.63926,0.720597,0.146212,3972.478518,62
"""decile_010""",7960,0.720599,0.799992,0.14501,3986.523202,62


In [6]:
# Same arguments again: the recorded log reports a cache hit, with only
# hashing and retrieval time instead of the ~2 s function run.
expensive_df_transform1(df, threshold = 0.8)

INFO:pocache.session:Cache hit. Hashing took: 0.001305s. Retrieval took: 0.000003s.


deciles,cnt,a_min,a_max,d_std,b_sum,c_n_unique
cat,u32,f64,f64,f64,f64,u32
"""decile_001""",7960,2.3e-05,0.081664,0.145218,4011.218072,62
"""decile_002""",7960,0.081673,0.161986,0.145002,3960.409791,62
"""decile_003""",7960,0.161988,0.241833,0.144214,3985.257775,62
"""decile_004""",7959,0.241838,0.320648,0.143484,4013.64743,62
"""decile_005""",7960,0.320665,0.400318,0.144888,3927.385731,62
"""decile_006""",7960,0.400329,0.481005,0.144418,3994.853296,62
"""decile_007""",7959,0.481005,0.559261,0.145066,3989.405837,62
"""decile_008""",7960,0.559267,0.639255,0.145566,3978.256048,62
"""decile_009""",7960,0.63926,0.720597,0.146212,3972.478518,62
"""decile_010""",7960,0.720599,0.799992,0.14501,3986.523202,62


In [7]:
# First call of the variant decorated via `ss_temp` with parquet
# serializer/deserializer: the recorded log shows the function running (~2 s)
# and the cache being saved.
expensive_df_transform2(df, threshold = 0.8)

INFO:pocache.session:Hashing took: 0.001282s. The function took: 2.006223s. Saving cache took: 0.003051s.


deciles,cnt,a_min,a_max,d_std,b_sum,c_n_unique
cat,u32,f64,f64,f64,f64,u32
"""decile_001""",7960,2.3e-05,0.081664,0.145218,4011.218072,62
"""decile_002""",7960,0.081673,0.161986,0.145002,3960.409791,62
"""decile_003""",7960,0.161988,0.241833,0.144214,3985.257775,62
"""decile_004""",7959,0.241838,0.320648,0.143484,4013.64743,62
"""decile_005""",7960,0.320665,0.400318,0.144888,3927.385731,62
"""decile_006""",7960,0.400329,0.481005,0.144418,3994.853296,62
"""decile_007""",7959,0.481005,0.559261,0.145066,3989.405837,62
"""decile_008""",7960,0.559267,0.639255,0.145566,3978.256048,62
"""decile_009""",7960,0.63926,0.720597,0.146212,3972.478518,62
"""decile_010""",7960,0.720599,0.799992,0.14501,3986.523202,62


In [8]:
# Same arguments again: the recorded log reports a cache hit; the result is
# presumably read back through the parquet deserializer (retrieval is slower
# than in the "mem" session but far below the ~2 s function run).
expensive_df_transform2(df, threshold = 0.8)

INFO:pocache.session:Cache hit. Hashing took: 0.001127s. Retrieval took: 0.001165s.


deciles,cnt,a_min,a_max,d_std,b_sum,c_n_unique
cat,u32,f64,f64,f64,f64,u32
"""decile_001""",7960,2.3e-05,0.081664,0.145218,4011.218072,62
"""decile_002""",7960,0.081673,0.161986,0.145002,3960.409791,62
"""decile_003""",7960,0.161988,0.241833,0.144214,3985.257775,62
"""decile_004""",7959,0.241838,0.320648,0.143484,4013.64743,62
"""decile_005""",7960,0.320665,0.400318,0.144888,3927.385731,62
"""decile_006""",7960,0.400329,0.481005,0.144418,3994.853296,62
"""decile_007""",7959,0.481005,0.559261,0.145066,3989.405837,62
"""decile_008""",7960,0.559267,0.639255,0.145566,3978.256048,62
"""decile_009""",7960,0.63926,0.720597,0.146212,3972.478518,62
"""decile_010""",7960,0.720599,0.799992,0.14501,3986.523202,62
