In [1]:
import numpy as np
import polars as pl
import polars.selectors as cs
# import pandas as pd
# import duckdb


In [3]:
# con = duckdb.connect()
# FILEPATH = "/Users/sonle/Documents/Data/delivery_orders_march.csv"
# con.execute(
#     f"""
#     COPY 
#     (
#         SELECT * 
#         FROM read_csv_auto("{FILEPATH}")
#     )
#     TO 'delivery_orders_march.parquet' (FORMAT 'PARQUET')
#     

In [2]:
PARQUETPATH = "../notebook/delivery_orders_march.parquet"
# NOTE(review): machine-specific absolute path; consider making this relative
# to a configurable data directory.
DELTAPATH = "/Users/sonle/Documents/GitHub/ServerlessPolars/deltatable"


def lazyload_data(path: str = PARQUETPATH, is_delta: bool = False, version: int | None = None) -> pl.LazyFrame:
    """Lazily load the delivery-orders dataset.

    Reads the Delta Lake table at ``DELTAPATH`` when ``is_delta`` is True,
    otherwise scans the parquet file at ``path``. Nothing is materialised
    here; the caller decides when to ``collect``/``fetch``.

    Args:
        path (str, optional): parquet file path, used only when ``is_delta``
            is False. Defaults to PARQUETPATH.
        is_delta (bool, optional): read the Delta Lake table if True,
            otherwise read the parquet file. Defaults to False.
        version (int | None, optional): Delta table version passed through to
            ``pl.scan_delta``; used only when ``is_delta`` is True.
            Defaults to None.

    Returns:
        pl.LazyFrame: lazy frame over all columns of the selected source.
    """
    # The original condition was inverted (`if not is_delta` read the delta
    # table), which contradicted the documented meaning of the flag.
    if is_delta:
        return pl.scan_delta(DELTAPATH, version=version)
    return pl.scan_parquet(source=path)

In [3]:
# Load the dataset lazily using lazyload_data's default arguments.
# Note: no explicit `version` is passed here, so a specific table version is
# only selected if the call is changed to request one.
df = lazyload_data()
type(df)

polars.lazyframe.frame.LazyFrame

In [6]:
# Inspect the column names and dtypes of the lazy frame.
df.schema

{'orderid': Int64,
 'pick': Int64,
 '1st_deliver_attempt': Float64,
 '2nd_deliver_attempt': Float64,
 'buyeraddress': Utf8,
 'selleraddress': Utf8}

In [4]:
# Business calendar passed to np.busday_count: a seven-character weekmask
# (Monday first, so Monday-Saturday count and Sunday does not) plus holidays.
WORKDAYS = "1111110"
HOLIDAYS = [
    "2020-03-08",
    "2020-03-25",
    "2020-03-30",
    "2020-03-31",
]

# Durations expressed in seconds.
GMT8_OFFSET = 8 * 3600
DURATION_1DAY = 24 * 3600

# SLA lookup tables, one row and one column per entry of `locations`.
sla_matrix_1st_attempt = [
    [3, 5, 7, 7],
    [5, 5, 7, 7],
    [7, 7, 7, 7],
    [7, 7, 7, 7],
]

sla_matrix_2nd_attempt = [
    [3, 3, 3, 3],
    [3, 3, 3, 3],
    [3, 3, 3, 3],
    [3, 3, 3, 3],
]

locations = [name.lower() for name in ("Metro Manila", "Luzon", "Visayas", "Mindanao")]
location_to_index = dict(zip(locations, range(len(locations))))
print(location_to_index)

# Key each location by its trailing characters, using the length of the
# shortest location name so every key has the same width.
min_length = len(min(locations, key=len))
trunc_location_to_index = {
    name[-min_length:]: position for name, position in location_to_index.items()
}
print(trunc_location_to_index)

# Flatten the first-attempt matrix row by row into {flat_index: sla_days};
# .tolist() yields native Python ints so the dict is easy to use for mapping.
flat_sla_days = np.asarray(sla_matrix_1st_attempt).ravel().tolist()
map_to_dict = {flat_index: days for flat_index, days in enumerate(flat_sla_days)}
print(map_to_dict)

{'metro manila': 0, 'luzon': 1, 'visayas': 2, 'mindanao': 3}
{'anila': 0, 'luzon': 1, 'sayas': 2, 'danao': 3}
{0: 3, 1: 5, 2: 7, 3: 7, 4: 5, 5: 5, 6: 7, 7: 7, 8: 7, 9: 7, 10: 7, 11: 7, 12: 7, 13: 7, 14: 7, 15: 7}


In [10]:
def map_address(map: dict[str, int], suffix_length: int = 5) -> pl.Expr:
    """Build an expression that maps every string column to a location index.

    Each string column is lower-cased, its trailing ``suffix_length``
    characters are extracted, and that suffix is looked up in ``map``.
    No default is supplied to ``map_dict``, so the result for suffixes that
    are not keys of ``map`` is whatever ``map_dict`` produces without one.

    Args:
        map (dict[str, int]): truncated-location to index dictionary. Every
            key is expected to be exactly ``suffix_length`` characters long,
            otherwise it can never match. (The name shadows the builtin
            ``map`` but is kept for backward compatibility.)
        suffix_length (int, optional): number of trailing characters to
            compare against the keys of ``map``. Defaults to 5, the value
            that was previously hard-coded in the regex.

    Raises:
        ValueError: if ``suffix_length`` is negative.

    Returns:
        pl.Expr: expression over all string columns yielding the mapped index.
    """
    if suffix_length < 0:
        raise ValueError(f"suffix_length must be non-negative, got {suffix_length}")

    # Capture up to `suffix_length` characters anchored at the end of the string.
    suffix_pattern = rf"(.{{0,{suffix_length}}})$"
    return (
        cs.string()
        .str.to_lowercase()
        .str.extract(suffix_pattern)
        .map_dict(map)
    )
    


In [59]:
# Preview the address-to-index mapping on the first rows, reusing the
# map_address helper instead of repeating its expression inline.
(
    df
    .select(map_address(trunc_location_to_index))
    .fetch(n_rows=10)
)

buyeraddress,selleraddress
i64,i64
0,0
0,0
0,0
0,0
1,0
0,0
0,0
0,0
1,0
2,0


In [6]:
from typing import Iterable


def convert_time_date(column: str | Iterable[str]) -> pl.Expr:
    """Convert epoch-second timestamps to a whole-day number shifted by GMT8_OFFSET.

    The value is shifted by ``GMT8_OFFSET`` seconds, divided by
    ``DURATION_1DAY`` and cast to ``Int32``, which drops the fractional day.
    Nulls stay null.

    Args:
        column (str | Iterable[str]): name(s) of the column(s) holding the
            timestamps — presumably epoch seconds, given the second-based
            constants; verify against the data source.

    Returns:
        pl.Expr: Int32 expression with the day number of each timestamp.
    """
    # Native expression arithmetic replaces the former Python lambda passed to
    # `.map`, so the computation stays inside the polars engine.
    shifted_seconds = pl.col(column) + GMT8_OFFSET
    return (shifted_seconds / DURATION_1DAY).cast(pl.Int32)

In [7]:
def compute_working_days(df: pl.LazyFrame) -> tuple[np.ndarray, np.ndarray]:
    """Count business days between pick-up and each delivery attempt.

    Business days follow the ``WORKDAYS`` weekmask and exclude ``HOLIDAYS``.

    Args:
        df (pl.LazyFrame): frame with the columns ``pick``,
            ``1st_deliver_attempt`` and ``2nd_deliver_attempt``.

    Returns:
        tuple[np.ndarray, np.ndarray]: ``(num_days1, num_days2)`` as 1-D
        arrays. ``num_days1`` counts from pick-up to the first attempt and
        ``num_days2`` from the first to the second attempt.
    """
    # Materialise the three day-number columns with a single collect; the
    # previous version collected the lazy frame once per column.
    day_numbers = df.select([
        convert_time_date("pick"),
        convert_time_date("1st_deliver_attempt"),
        # A missing second attempt becomes day 0 (1970-01-01), so its count
        # relative to the first attempt is not a real duration.
        convert_time_date("2nd_deliver_attempt").fill_null(strategy="zero"),
    ]).collect()

    def as_dates(name: str) -> np.ndarray:
        # Integer day numbers are reinterpreted as days since the epoch.
        return day_numbers.get_column(name).to_numpy().astype("datetime64[D]")

    picked = as_dates("pick")
    first_attempt = as_dates("1st_deliver_attempt")
    second_attempt = as_dates("2nd_deliver_attempt")

    num_days1 = np.busday_count(picked, first_attempt, weekmask=WORKDAYS, holidays=HOLIDAYS).flatten()
    num_days2 = np.busday_count(first_attempt, second_attempt, weekmask=WORKDAYS, holidays=HOLIDAYS).flatten()
    return num_days1, num_days2

In [8]:
# Compute the business-day counts for both delivery legs and print the
# first-leg array as a quick sanity check.
num_days1, num_days2 = compute_working_days(df)
print(num_days1)

[3 2 2 ... 6 4 2]


In [39]:
# Re-check the column names and dtypes of the lazy frame.
df.schema

{'orderid': Int64,
 'pick': Int64,
 '1st_deliver_attempt': Float64,
 '2nd_deliver_attempt': Float64,
 'buyeraddress': Utf8,
 'selleraddress': Utf8}

In [None]:
# Preview the timestamp columns converted from epoch seconds to datetimes,
# next to the raw second-attempt value with missing entries replaced by 0.
(
    df
    .select([
        # The original referenced a non-existent column "2nd"; the alias
        # avoids clashing with the converted column of the same name below.
        pl.col("2nd_deliver_attempt").fill_null(0).alias("2nd_deliver_attempt_filled"),
        pl.from_epoch(pl.col(["pick", "1st_deliver_attempt", "2nd_deliver_attempt"]), time_unit="s"),
    ])
).fetch(n_rows=10)

In [11]:
# Share of late orders. An order is late when the first leg took more business
# days than its SLA, or the second leg took more than 3 business days (3 is
# every entry of sla_matrix_2nd_attempt).
# The whole pipeline stays lazy until the final collect: the former
# mid-pipeline collect()/lazy() round trip and the from_epoch conversion of
# columns that the group-by discards have been removed.
(
    df
    # Replace both address columns by their location index.
    .with_columns([
        map_address(trunc_location_to_index),
    ])
    .with_columns(
        # Flat index into the row-major first-attempt SLA matrix.
        (4 * pl.col("buyeraddress") + pl.col("selleraddress")).alias("sla").map_dict(map_to_dict),
        pl.Series(name="num_days1", values=num_days1),
        pl.Series(name="num_days2", values=num_days2),
    )
    .with_columns(
        pl.when((pl.col("num_days1") > pl.col("sla")) | (pl.col("num_days2") > 3))
        .then(pl.lit(1, pl.Int32))
        .otherwise(pl.lit(0, pl.Int32))
        .alias("is_late")
    )
    .groupby("is_late").agg(pl.count("is_late").alias("count_order"))
    .with_columns(
        (pl.col("count_order") / pl.col("count_order").sum()).alias("percent_slate")
    )
).collect()

is_late,count_order,percent_slate
i32,u32,f64
0,2413891,0.759966
1,762422,0.240034


In [None]:
# This code uses predicate conditions (when/then/otherwise) in polars to map each address to its given index

# (
#     df.with_columns(
#         [
#             pl.when(
#                 pl.col("buyeraddress").str.slice(-min_length).str.to_lowercase()
#                 == "anila"
#             )
#             .then(pl.lit(0))
#             .when(
#                 pl.col("buyeraddress").str.slice(-min_length).str.to_lowercase()
#                 == "luzon"
#             )
#             .then(pl.lit(1))
#             .when(
#                 pl.col("buyeraddress").str.slice(-min_length).str.to_lowercase()
#                 == "sayas"
#             )
#             .then(pl.lit(2))
#             .otherwise(pl.lit(3))
#             .alias("buyer_index")
#         ]
#     ).drop("buyeraddress")
# )

In [None]:
# (
#     df.select(
#         cs.string().str.to_lowercase().apply(lambda x: x[-min_length:])
#         # pl.col("buyeraddress").str.to_lowercase().str.slice(-min_length, length=test_length)
#         # cs.string().str.to_lowercase().str.split(' ').list.slice(-1).list.first()
#         # .map_dict(trunc_location_to_index, default=pl.lit(0))
        
#     ).fetch(n_rows=5)
    

# )
# # .str.slice(-min_length).str.to_lowercase()

In [32]:
# # Given a mapping list, we need a function to directly map them into our pl.Dataframe
# def replace_map(column: str, mapping: dict) -> pl.Expr:
#     """A function to map string following a mapping list

#     Args:
#             column (str): name of column
#             mapping (dict): a dictionary to map
#     Raises:

#             Exception: _description_
#             TypeError: _description_

#     Returns:
#             pl.Expr: _description_
#     """
#     if not mapping:
#         raise Exception("mapping can't be empty")
#     elif not isinstance(mapping, dict):
#         TypeError(f"maping must be of type dict, but is type: {type(mapping)}")
#     if not isinstance(column, str):
#         raise TypeError(f"column must be of type str, but is type: {type(column)}")

#     # initiate the expression with pl.when
#     branch = pl.when(pl.col(column) == list(mapping.keys())[0]).then(
#         list(mapping.values())[0]
#     )
#     # for every value add a when.then
#     for from_val, to_val in mapping.items():
#         branch = branch.when(pl.col(column) == from_val).then(to_val)

#     return branch.otherwise(list(mapping.values())[-1]).alias(column)