In [1]:
import polars as pl
# Not a dependency. This is another package I developed.
import polars_ds as pds 
import numpy as np
import pocache
import logging
# Show INFO-level records so the "INFO:pocache:..." timing / cache-hit lines appear in cell output.
logging.basicConfig(level=logging.INFO)


In [2]:
# Synthetic demo data: 300,000 rows with two random float columns (a, b)
# and a random string column c generated with min_size=1, max_size=3.
df = pds.frame(size=300_000).with_columns(
    a=pds.random(),
    b=pds.random(),
    c=pds.random_str(min_size=1, max_size=3),
)
df.head()

row_num,a,b,c
i64,f64,f64,str
0,0.656581,0.880789,"""Cng"""
1,0.064467,0.471656,"""fd"""
2,0.680488,0.665623,"""6l"""
3,0.333714,0.310059,"""NB"""
4,0.620768,0.032985,"""QGm"""


# Example 1 - Cache in mem or cache in tempfile

In [3]:
# Two cache sessions that differ only in `mode`: "tempfile" vs. "mem".
ss_temp = pocache.Session(
    mode="tempfile",
    verbose=True,
    secure=False,
)
ss_mem = pocache.Session(
    mode="mem",
    verbose=True,
    secure=False,
)

In [4]:
@ss_mem.pocache
def expensive_df_transform1(df: "pl.DataFrame | pl.LazyFrame", threshold: float) -> pl.DataFrame:
    """Simulate a slow aggregation; the call is wrapped by ``ss_mem.pocache``.

    Keeps rows where ``c`` is exactly one character long and ``a < threshold``,
    bins ``a`` into 10 quantile buckets, and returns per-bucket summary
    statistics.

    Parameters
    ----------
    df
        Frame with numeric columns ``a`` and ``b`` and a string column ``c``.
        Both eager and lazy frames are accepted because ``.lazy()`` is called
        on the input before anything else. The annotation is written as a
        string so it does not depend on runtime support for ``X | Y`` unions.
    threshold
        Exclusive upper bound applied to column ``a``.

    Returns
    -------
    pl.DataFrame
        One row per ``deciles`` group, sorted by ``deciles``, with columns
        ``cnt``, ``a_min``, ``a_max``, ``d_std``, ``b_sum`` and ``c_n_unique``.
    """
    import time

    # Stand-in for real work: makes the uncached call visibly slow.
    time.sleep(2)

    # "decile_001" ... "decile_010"
    decile_labels = [f"decile_{rank:03d}" for rank in range(1, 11)]

    filtered = df.lazy().filter(
        (pl.col("c").str.len_chars() == 1) & (pl.col("a") < threshold)
    )
    enriched = filtered.with_columns(
        deciles=pl.col("a").qcut(10, labels=decile_labels),
        d=(pl.col("a") + pl.col("b")) / 2,
    )
    summary = enriched.group_by("deciles").agg(
        pl.len().alias("cnt"),
        pl.col("a").min().alias("a_min"),
        pl.col("a").max().alias("a_max"),
        pl.col("d").std().alias("d_std"),
        pl.col("b").sum().alias("b_sum"),
        pl.col("c").n_unique().alias("c_n_unique"),
    )
    return summary.sort("deciles").collect()

In [5]:
@ss_temp.pocache(
    serializer=lambda df, path: df.write_parquet(path),
    deserializer=lambda path: pl.read_parquet(path),
)
def expensive_df_transform2(df: "pl.DataFrame | pl.LazyFrame", threshold: float) -> pl.DataFrame:
    """Simulate a slow aggregation; the call is wrapped by ``ss_temp.pocache``.

    The decorator is given a Parquet writer as ``serializer`` and a Parquet
    reader as ``deserializer``. The computation itself keeps rows where ``c``
    is exactly one character long and ``a < threshold``, bins ``a`` into 10
    quantile buckets, and returns per-bucket summary statistics.

    Parameters
    ----------
    df
        Frame with numeric columns ``a`` and ``b`` and a string column ``c``.
        Both eager and lazy frames are accepted because ``.lazy()`` is called
        on the input before anything else. The annotation is written as a
        string so it does not depend on runtime support for ``X | Y`` unions.
    threshold
        Exclusive upper bound applied to column ``a``.

    Returns
    -------
    pl.DataFrame
        One row per ``deciles`` group, sorted by ``deciles``, with columns
        ``cnt``, ``a_min``, ``a_max``, ``d_std``, ``b_sum`` and ``c_n_unique``.
    """
    import time

    # Stand-in for real work: makes the uncached call visibly slow.
    time.sleep(2)

    # "decile_001" ... "decile_010"
    decile_labels = [f"decile_{rank:03d}" for rank in range(1, 11)]

    filtered = df.lazy().filter(
        (pl.col("c").str.len_chars() == 1) & (pl.col("a") < threshold)
    )
    enriched = filtered.with_columns(
        deciles=pl.col("a").qcut(10, labels=decile_labels),
        d=(pl.col("a") + pl.col("b")) / 2,
    )
    summary = enriched.group_by("deciles").agg(
        pl.len().alias("cnt"),
        pl.col("a").min().alias("a_min"),
        pl.col("a").max().alias("a_max"),
        pl.col("d").std().alias("d_std"),
        pl.col("b").sum().alias("b_sum"),
        pl.col("c").n_unique().alias("c_n_unique"),
    )
    return summary.sort("deciles").collect()

In [6]:
# First call with these arguments: the log below reports the function's run time and a cache save.
expensive_df_transform1(df, threshold=0.8)

INFO:pocache:Hashing took: 0.002180s. The function took: 2.007324s. Saving cache took: 0.000007s.


deciles,cnt,a_min,a_max,d_std,b_sum,c_n_unique
cat,u32,f64,f64,f64,f64,u32
"""decile_001""",7979,9e-06,0.080298,0.144441,3983.194822,62
"""decile_002""",7979,0.080319,0.159742,0.144877,3983.747248,62
"""decile_003""",7979,0.159769,0.238678,0.144076,3989.527263,62
"""decile_004""",7979,0.238681,0.319048,0.145089,3986.290477,62
"""decile_005""",7979,0.319073,0.398476,0.143668,3986.178715,62
"""decile_006""",7979,0.398484,0.4789,0.144784,3968.961304,62
"""decile_007""",7979,0.478904,0.559243,0.144315,3965.739979,62
"""decile_008""",7979,0.559264,0.640237,0.144461,4024.7048,62
"""decile_009""",7979,0.640262,0.720348,0.145331,3955.49539,62
"""decile_010""",7979,0.720373,0.799999,0.143922,4039.763006,62


In [7]:
# Same arguments again: the log below reports "Cache hit" instead of a function run.
expensive_df_transform1(df, threshold = 0.8)

INFO:pocache:Cache hit. Hashing took: 0.001433s. Retrieval took: 0.000004s.


deciles,cnt,a_min,a_max,d_std,b_sum,c_n_unique
cat,u32,f64,f64,f64,f64,u32
"""decile_001""",7979,9e-06,0.080298,0.144441,3983.194822,62
"""decile_002""",7979,0.080319,0.159742,0.144877,3983.747248,62
"""decile_003""",7979,0.159769,0.238678,0.144076,3989.527263,62
"""decile_004""",7979,0.238681,0.319048,0.145089,3986.290477,62
"""decile_005""",7979,0.319073,0.398476,0.143668,3986.178715,62
"""decile_006""",7979,0.398484,0.4789,0.144784,3968.961304,62
"""decile_007""",7979,0.478904,0.559243,0.144315,3965.739979,62
"""decile_008""",7979,0.559264,0.640237,0.144461,4024.7048,62
"""decile_009""",7979,0.640262,0.720348,0.145331,3955.49539,62
"""decile_010""",7979,0.720373,0.799999,0.143922,4039.763006,62


In [8]:
# First call of the ss_temp-decorated variant: the log below reports the function's run time and a cache save.
expensive_df_transform2(df, threshold = 0.8)

INFO:pocache:Hashing took: 0.001238s. The function took: 2.006468s. Saving cache took: 0.003373s.


deciles,cnt,a_min,a_max,d_std,b_sum,c_n_unique
cat,u32,f64,f64,f64,f64,u32
"""decile_001""",7979,9e-06,0.080298,0.144441,3983.194822,62
"""decile_002""",7979,0.080319,0.159742,0.144877,3983.747248,62
"""decile_003""",7979,0.159769,0.238678,0.144076,3989.527263,62
"""decile_004""",7979,0.238681,0.319048,0.145089,3986.290477,62
"""decile_005""",7979,0.319073,0.398476,0.143668,3986.178715,62
"""decile_006""",7979,0.398484,0.4789,0.144784,3968.961304,62
"""decile_007""",7979,0.478904,0.559243,0.144315,3965.739979,62
"""decile_008""",7979,0.559264,0.640237,0.144461,4024.7048,62
"""decile_009""",7979,0.640262,0.720348,0.145331,3955.49539,62
"""decile_010""",7979,0.720373,0.799999,0.143922,4039.763006,62


In [9]:
# Same arguments again: the log below reports "Cache hit" and a retrieval time.
expensive_df_transform2(df, threshold = 0.8)

INFO:pocache:Cache hit. Hashing took: 0.001575s. Retrieval took: 0.000951s.


deciles,cnt,a_min,a_max,d_std,b_sum,c_n_unique
cat,u32,f64,f64,f64,f64,u32
"""decile_001""",7979,9e-06,0.080298,0.144441,3983.194822,62
"""decile_002""",7979,0.080319,0.159742,0.144877,3983.747248,62
"""decile_003""",7979,0.159769,0.238678,0.144076,3989.527263,62
"""decile_004""",7979,0.238681,0.319048,0.145089,3986.290477,62
"""decile_005""",7979,0.319073,0.398476,0.143668,3986.178715,62
"""decile_006""",7979,0.398484,0.4789,0.144784,3968.961304,62
"""decile_007""",7979,0.478904,0.559243,0.144315,3965.739979,62
"""decile_008""",7979,0.559264,0.640237,0.144461,4024.7048,62
"""decile_009""",7979,0.640262,0.720348,0.145331,3955.49539,62
"""decile_010""",7979,0.720373,0.799999,0.143922,4039.763006,62


In [10]:
# Drop both Example 1 session names before Example 2 starts.
del ss_temp, ss_mem

# Example 2 - Plotting

In [11]:
import altair as alt
import polars as pl
# Not a dependency. This is another package I developed.
import polars_ds as pds 
import pocache
import logging
# Show INFO-level records so the "INFO:pocache:..." lines appear in cell output.
# Note: basicConfig() does nothing if the root logger already has handlers
# (for example when the first cell of this notebook has already been run).
logging.basicConfig(level=logging.INFO)

# Session used by Example 2, created with mode="mem".
ss = pocache.Session(mode = "mem", verbose=True, secure=False)

In [12]:
# 5,000 random (a, b) points; `category` labels each row by whether a < 0.5.
df = (
    pds.frame(size=5_000)
    .with_columns(
        a=pds.random(),
        b=pds.random(),
    )
    .with_columns(
        category=pl.when(pl.col("a") < 0.5)
        .then(pl.lit("category_1"))
        .otherwise(pl.lit("category_2")),
    )
)
df.head()

row_num,a,b,category
i64,f64,f64,str
0,0.403823,0.171936,"""category_1"""
1,0.29717,0.095339,"""category_1"""
2,0.92852,0.808666,"""category_2"""
3,0.582293,0.950341,"""category_2"""
4,0.310831,0.613845,"""category_1"""


In [13]:
@ss.pocache
def complicated_plotting(df, tooltip: list) -> alt.Chart:
    """Build an interactive scatter plot of ``b`` against ``a``, coloured by ``category``.

    ``df`` is handed directly to ``alt.Chart`` and ``tooltip`` is forwarded
    unchanged to the chart's ``tooltip`` encoding. The two-second sleep is a
    placeholder for slow plotting work.
    """
    import time

    # Placeholder for expensive work.
    time.sleep(2)

    points = alt.Chart(df).mark_circle(size=60)
    encoded = points.encode(x="a", y="b", color="category", tooltip=tooltip)
    return encoded.interactive()

In [14]:
# First call with these arguments: the log below reports the function's run time and a cache save.
complicated_plotting(df, tooltip = ["row_num"])

INFO:pocache:Hashing took: 0.000893s. The function took: 2.007511s. Saving cache took: 0.000002s.


In [15]:
# Same arguments again: the log below reports "Cache hit".
complicated_plotting(df, tooltip = ["row_num"])

INFO:pocache:Cache hit. Hashing took: 0.000560s. Retrieval took: 0.000001s.
