In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import polars as pl
from datetime import datetime, timedelta

start_date = datetime(2016, 1, 1)
end_date = datetime(2016, 6, 1)
data_dir = "../data/favorita"

In [4]:
def remove_returns_data(lf: pl.LazyFrame) -> pl.LazyFrame:
    lf = lf.filter(pl.col("unit_sales").min().over("traj_id") >= 0)
    lf = lf.with_columns(open=pl.lit(1).cast(pl.Int8))
    return lf


def filter_dates(lf: pl.LazyFrame) -> pl.LazyFrame:
    # Filter dates to reduce storage space requirements
    if start_date is not None:
        lf = lf.filter(pl.col("date") >= start_date)
    if end_date is not None:
        lf = lf.filter(pl.col("date") <= end_date)
    return lf


df = (
    pl.scan_parquet(f"{data_dir}/train.parquet")
    .drop("id")
    .pipe(filter_dates)
    .with_columns([pl.col("onpromotion").map(lambda x: None if x is None else x == "True")])
    .with_columns(
        [
            pl.col("onpromotion").cast(pl.UInt8),
            pl.col("store_nbr").cast(pl.UInt8),
            pl.col("item_nbr").cast(pl.UInt32),
            pl.col("unit_sales").cast(pl.Float32),
        ]
    )
    .with_columns([pl.format("{}_{}", "store_nbr", "item_nbr").alias("traj_id")])
    .pipe(remove_returns_data)
    .sort("date", "traj_id")
    .collect(streaming=True)
    .shrink_to_fit(in_place=True)
    .rechunk()
    .upsample("date", every="1d", by="traj_id")
    .fill_null(strategy="forward")
    .with_columns(pl.col("unit_sales").log())
    .rename({"unit_sales": "log_sales"})
    .lazy()
)

In [5]:
df.collect()

[autoreload of polars.datatypes.classes failed: Traceback (most recent call last):
  File "/Users/artemsereda/miniconda3/envs/tft/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/Users/artemsereda/miniconda3/envs/tft/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 500, in superreload
    update_generic(old_obj, new_obj)
  File "/Users/artemsereda/miniconda3/envs/tft/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
    update(a, b)
  File "/Users/artemsereda/miniconda3/envs/tft/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 365, in update_class
    update_instances(old, new)
  File "/Users/artemsereda/miniconda3/envs/tft/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 323, in update_instances
    object.__setattr__(ref, "__class__", new)
TypeError: can't apply this __setattr__ to DataTypeClass o

date,store_nbr,item_nbr,log_sales,onpromotion,traj_id,open
date,u8,u32,f32,u8,str,i8
2016-01-02,6,1975562,0.693147,0,"""6_1975562""",1
2016-01-03,6,1975562,0.0,0,"""6_1975562""",1
2016-01-04,6,1975562,0.693147,0,"""6_1975562""",1
2016-01-05,6,1975562,0.693147,0,"""6_1975562""",1
2016-01-06,6,1975562,2.197225,0,"""6_1975562""",1
2016-01-07,6,1975562,0.0,0,"""6_1975562""",1
2016-01-08,6,1975562,0.0,0,"""6_1975562""",1
2016-01-09,6,1975562,0.693147,0,"""6_1975562""",1
2016-01-10,6,1975562,0.0,0,"""6_1975562""",1
2016-01-11,6,1975562,0.0,1,"""6_1975562""",1


In [7]:
df.describe()

AttributeError: 'LazyFrame' object has no attribute 'describe'