In [1]:
import os
from pathlib import Path

import duckdb
import polars as pl
import pyarrow
from deltalake import DeltaTable
from deltalake.writer import write_deltalake

In [2]:
# Location of the source CSV. Set the DELIVERY_ORDERS_CSV environment variable
# to point somewhere else; the default resolves under the current user's home
# directory instead of a machine-specific absolute path.
FILEPATH = os.environ.get(
    "DELIVERY_ORDERS_CSV",
    str(Path.home() / "Documents" / "Data" / "delivery_orders_march.csv"),
)
# Shared DuckDB connection used by the ad-hoc query cells in this notebook.
con = duckdb.connect()

In [9]:
def read_data(
    file_path: str,
    table_uri: str = "../ServerlessPolars/deltatable/",
) -> None:
    """Load a local CSV file with DuckDB and write it out as a Delta table.

    The CSV is read through DuckDB's ``read_csv_auto`` into an Arrow table,
    which then overwrites both the data and the schema of the Delta table
    located at ``table_uri``.

    Args:
        file_path (str): Local path of the CSV file to read.
        table_uri (str): Location of the Delta table to overwrite. Defaults
            to the table directory this function originally hard-coded.

    Returns:
        None. The function is called only for its side effect of writing
        the Delta table.
    """
    # Use a dedicated connection and close it deterministically; the previous
    # version opened a connection on every call and never released it.
    with duckdb.connect() as conn:
        # Bind the path as a query parameter rather than interpolating it into
        # the SQL text, so quotes or other special characters in the path
        # cannot break (or alter) the statement.
        arrow_table = conn.execute(
            "SELECT * FROM read_csv_auto(?)",
            [file_path],
        ).arrow()

        # Written while the connection is still open so the query result is
        # guaranteed to remain valid for the duration of the write.
        write_deltalake(
            data=arrow_table,
            table_or_uri=table_uri,
            mode="overwrite",
            overwrite_schema=True,
        )
    
    

In [8]:
read_data(file_path=FILEPATH)

# Open the table at its latest version. Pinning version=1 / version=0 here was
# fragile: on a freshly created table version 1 does not exist yet, and a
# pinned old version is not the data that read_data() has just written.
dt_delta = DeltaTable("../ServerlessPolars/deltatable/")

# Lazily scan the Delta table and downcast every column to the smallest dtype
# that can hold its values.
df = pl.scan_delta("../ServerlessPolars/deltatable/").select(
    pl.col("*").shrink_dtype()
)

In [14]:
# Materialise only the order-id column of the lazy frame for a quick look.
df.select("orderid").collect()

orderid
i64
2215676524
2219624609
2220979489
2221066352
2222478803
2222597288
2222738456
2224695304
2224704587
2225138267


In [71]:
# Re-read the CSV into an Arrow table. The path is bound as a query parameter
# instead of being interpolated into the SQL text, so special characters in
# the path cannot break the statement.
dt = con.execute(
    "SELECT * FROM read_csv_auto(?)",
    [FILEPATH],
).arrow()

# Append the rows to the existing Delta table (adds a new table version).
write_deltalake(
    data=dt,
    table_or_uri="../ServerlessPolars/deltatable",
    mode="append",
)

In [29]:
# Export the CSV to a local Parquet file via DuckDB's COPY statement.
# The path is embedded as a single-quoted SQL string literal (double quotes
# denote identifiers in SQL), with embedded single quotes doubled so the
# literal stays well-formed.
_csv_sql_literal = FILEPATH.replace("'", "''")
dt_parquet = con.execute(
    f"""
    COPY
    (
        SELECT *
        FROM read_csv_auto('{_csv_sql_literal}')
    )
    TO 'delivery_orders_march.parquet' (FORMAT 'PARQUET')
    """
)

In [72]:
# Re-open the Delta table without a version argument.
dt_delta = DeltaTable("../ServerlessPolars/deltatable/")

In [73]:
# List the data files that make up the currently loaded table version.
dt_delta.files()

['0-ba62b272-1535-47ae-864c-81675d7bef68-0.parquet']

In [23]:
# NOTE(review): this is a real (dry_run=False) vacuum with zero retention and
# the retention safety check disabled. It physically deletes files, so reading
# older table versions (time travel) will presumably fail afterwards — confirm
# this is intended before re-running.
dt_delta.vacuum(retention_hours=0, enforce_retention_duration=False, dry_run=False)

['3-4498b0ee-ddf3-4470-8bbf-b7950bbd061f-0.parquet']

In [74]:
# Show the Delta table's schema converted to a PyArrow schema.
dt_delta.schema().to_pyarrow()

orderid: int64
pick: int64
1st_deliver_attempt: double
2nd_deliver_attempt: double
buyeraddress: string
selleraddress: string

In [30]:
%%time
df: pl.DataFrame = pl.scan_delta("../ServerlessPolars/deltatable/").select(
		pl.col("*").shrink_dtype()
)

CPU times: user 6.61 ms, sys: 9.3 ms, total: 15.9 ms
Wall time: 39.9 ms


In [31]:
%%time
df_parquet = pl.scan_parquet("delivery_orders_march.parquet").select(
    pl.col("*").shrink_dtype()
)

CPU times: user 894 µs, sys: 1.6 ms, total: 2.5 ms
Wall time: 10.1 ms


In [33]:
%%time
df_csv = pl.scan_csv(f"{FILEPATH}").select(
    pl.col("*").shrink_dtype()
)

CPU times: user 2.07 ms, sys: 3.06 ms, total: 5.13 ms
Wall time: 17.1 ms
