In [1]:
import polars as pl

In [2]:
# Joining
# -------
# `how` defaults to "inner"; the other strategies are
# left, right, full, cross, semi and anti.

# Two small frames whose keys overlap on B, C and D; the join cells use them.
df_left = pl.DataFrame(
    {
        "key": ["A", "B", "C", "D"],
        "value": [1, 2, 3, 4],
    }
)

df_right = pl.DataFrame(
    {"key": ["B", "C", "D", "E"], "value": [5, 6, 7, 8]}
)

In [3]:
# Inner join: only keys present in both frames survive (B, C, D).
# The right frame's clashing `value` column comes back as `value_right`.
df_left.join(
    other=df_right,
    on="key",
    how="inner",
)

key,value,value_right
str,i64,i64
"""B""",2,5
"""C""",3,6
"""D""",4,7


In [4]:
# Same join with `how` omitted: the result matches the explicit inner join,
# i.e. inner is the default strategy.
df_left.join(
    other=df_right,
    on="key",
)

key,value,value_right
str,i64,i64
"""B""",2,5
"""C""",3,6
"""D""",4,7


In [5]:
# Set up a lazy scan over the campaigns CSV; rows are only produced when
# `.collect()` is called.
campaigns = pl.scan_csv("data/campaigns.csv")

# Peek at the first row only.
campaigns.head(n=1).collect()

Campaign Name,Campaign Date,Product Type
str,str,str
"""Launch""","""2023-01-01 20:00:00""","""Electronics"""


In [6]:
# Column names and dtypes of the lazy frame. All three columns are still
# strings at this point, including "Campaign Date".
campaigns.collect_schema()

Schema([('Campaign Name', String),
        ('Campaign Date', String),
        ('Product Type', String)])

In [7]:
# Row count per campaign name: each of the five campaigns appears exactly once.
(
    campaigns
    .group_by("Campaign Name")
    .len()
    .collect()
)

Campaign Name,len
str,u32
"""Seasonal Sale""",1
"""Launch""",1
"""Clearance""",1
"""New Arrivals""",1
"""Discount""",1


In [8]:
# Distinct product types that have at least one campaign.
campaigns.select(
    pl.col("Product Type").unique()
).collect()

Product Type
str
"""Furniture"""
"""Books"""
"""Electronics"""
"""Clothing"""


In [9]:
# Set up a lazy scan over the transactions CSV and peek at the first row.
# A bare `collect_schema()` call in the middle of a cell is never rendered
# (only the last expression is), so it is dropped; the header of the frame
# below already lists each column's dtype. "Sale Date" is still a string here.
transactions = pl.scan_csv("data/transactions.csv")
transactions.head(1).collect()

Sale Date,Product Type,Quantity
str,str,i64
"""2023-01-01 02:00:00.000000000""","""Books""",7


In [10]:
# Parse "Sale Date" from text into a microsecond-precision datetime.
# `%.f` accepts the fractional seconds in the raw strings; the target unit is
# requested directly via `time_unit` instead of parsing and then casting.
# NOTE(review): this rebinds `transactions`, so re-running this cell without
# re-running the scan cell would presumably fail on the already-parsed column.
transactions = transactions.with_columns(
    pl.col("Sale Date").str.to_datetime(
        "%Y-%m-%d %H:%M:%S%.f",
        time_unit="us",
    )
)

In [11]:
# Reminder of the raw campaigns row: "Campaign Date" is still text here.
campaigns.head(n=1).collect()

Campaign Name,Campaign Date,Product Type
str,str,str
"""Launch""","""2023-01-01 20:00:00""","""Electronics"""


In [12]:
# Parse "Campaign Date" from text into a datetime so it can serve as the
# right-hand key of the as-of join.
campaigns = campaigns.with_columns(
    pl.col("Campaign Date").str.to_datetime(format="%Y-%m-%d %H:%M:%S"),
)

In [13]:
# Attach to every sale the most recent campaign for the same product type,
# provided that campaign started no more than 60 days before the sale.
# Both inputs are sorted on their time key before the as-of join.
sales_with_campaign_df = (
    transactions
    .sort("Sale Date")
    .join_asof(
        campaigns.sort("Campaign Date"),
        by="Product Type",
        left_on="Sale Date",
        right_on="Campaign Date",
        strategy="backward",
        tolerance="60d",
    )
    .collect()
)

sales_with_campaign_df

Sale Date,Product Type,Quantity,Campaign Name,Campaign Date
datetime[μs],str,i64,str,datetime[μs]
2023-01-01 01:26:12.558627,"""Electronics""",2,,
2023-01-01 02:00:00,"""Books""",7,,
2023-01-01 06:14:30.703535,"""Toys""",9,,
2023-01-01 06:52:25.117255,"""Clothing""",9,,
2023-01-01 07:44:50.234511,"""Books""",7,,
…,…,…,…,…
2023-12-31 15:45:29.296464,"""Clothing""",10,,
2023-12-31 18:15:09.765488,"""Toys""",4,,
2023-12-31 18:33:47.441372,"""Electronics""",7,,
2023-12-31 18:37:54.413720,"""Books""",6,,


In [14]:
# Mean quantity per (product type, campaign). Rows with a null campaign name
# hold the sales that were not matched to any campaign by the as-of join.
(
    sales_with_campaign_df
    .group_by(["Product Type", "Campaign Name"])
    .agg(pl.col("Quantity").mean())
    .sort(["Product Type", "Campaign Name"])
)

Product Type,Campaign Name,Quantity
str,str,f64
"""Books""",,5.527716
"""Clothing""",,5.433385
"""Clothing""","""New Arrivals""",8.200581
"""Electronics""",,5.486832
"""Electronics""","""Launch""",8.080775
"""Electronics""","""Seasonal Sale""",8.471406
"""Furniture""",,5.430222
"""Furniture""","""Discount""",8.191888
"""Toys""",,5.50318


In [15]:
# Which campaigns target Books? Only "Clearance", dated 2023-12-31 21:00.
(
    campaigns
    .filter(pl.col("Product Type") == "Books")
    .collect()
)

Campaign Name,Campaign Date,Product Type
str,datetime[μs],str
"""Clearance""",2023-12-31 21:00:00,"""Books"""


In [16]:
# Any Books sales after the Clearance campaign started? None, which is why
# Books never gets a campaign name in the as-of join result.
# The cut-off is built as a typed datetime literal rather than parsing a
# string with format inference at query time.
(
    transactions.filter(
        (pl.col("Product Type") == "Books")
        & (pl.col("Sale Date") > pl.datetime(2023, 12, 31, 21, 0, 0))
    ).collect()
)

Sale Date,Product Type,Quantity
datetime[μs],str,i64


## Vertical and horizontal concatenation

In [17]:
# Two frames with identical columns (id, value) for the concat examples.
df1 = pl.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})
df2 = pl.DataFrame({"id": [4, 5], "value": ["d", "e"]})

In [18]:
# Vertical concatenation: stack the rows of df2 under those of df1.
pl.concat(
    [df1, df2],
    how="vertical",
)

id,value
i64,str
1,"""a"""
2,"""b"""
3,"""c"""
4,"""d"""
5,"""e"""


In [None]:
# With `how` omitted the result is the same as the vertical case,
# i.e. vertical is the default strategy.
pl.concat([df1, df2])

id,value
i64,str
1,"""a"""
2,"""b"""
3,"""c"""
4,"""d"""
5,"""e"""


In [None]:
# Rebind df2 to a single-column, two-row frame for the horizontal example.
# Note this replaces the two-column df2 used by the vertical concat cells.
df2 = pl.DataFrame({"value2": ["x", "y"]})

In [None]:
# Horizontal concatenation: place the columns side by side. df2 has one row
# fewer than df1, and the missing cell is filled with null.
pl.concat(
    [df1, df2],
    how="horizontal",
)

id,value,value2
i64,str,str
1,"""a""","""x"""
2,"""b""","""y"""
3,"""c""",
