In [27]:
import polars as pl

In [28]:
# Eager pipeline: load the whole CSV, keep bookings whose arrival year is
# after 2015, then sum every column per hotel.
df = (
    pl.read_csv("../data/hotel_bookings.csv", ignore_errors=True)
    .filter(pl.col("arrival_date_year") > 2015)
    .groupby("hotel")
    .sum()
)
df

hotel,is_canceled_sum,lead_time_sum,arrival_date_year_sum,arrival_date_week_number_sum,arrival_date_day_of_month_sum,stays_in_weekend_nights_sum,stays_in_week_nights_sum,adults_sum,children_sum,babies_sum,is_repeated_guest_sum,previous_cancellations_sum,previous_bookings_not_canceled_sum,booking_changes_sum,days_in_waiting_list_sum,adr_sum,required_car_parking_spaces_sum,total_of_special_requests_sum
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64
"""Resort Hotel""",8984,3020393,64013115,757747,504238,37589,98323,58921,4310,418,1644,432,5551,9561,11568,3060930.0500000524,4382,19708
"""City Hotel""",27098,7258507,132373876,1614044,1037534,52707,146050,122453,6736,308,1525,2569,10115,12931,215408,7179108.929999768,1730,38929


In [29]:
# Lazy version of the per-hotel sums: build the query `q` from a scanned CSV
# and materialise it into `df` with `collect()`.
recent_bookings = pl.scan_csv(
    "../data/hotel_bookings.csv", ignore_errors=True
).filter(pl.col("arrival_date_year") > 2015)
q = recent_bookings.groupby("hotel").agg(pl.col("*").sum())
df = q.collect()

In [30]:
# Display the frame collected from the lazy query `q`.
df

hotel,is_canceled_sum,lead_time_sum,arrival_date_year_sum,arrival_date_week_number_sum,arrival_date_day_of_month_sum,stays_in_weekend_nights_sum,stays_in_week_nights_sum,adults_sum,children_sum,babies_sum,is_repeated_guest_sum,previous_cancellations_sum,previous_bookings_not_canceled_sum,booking_changes_sum,days_in_waiting_list_sum,adr_sum,required_car_parking_spaces_sum,total_of_special_requests_sum
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64
"""Resort Hotel""",8984,3020393,64013115,757747,504238,37589,98323,58921,4310,418,1644,432,5551,9561,11568,3060930.0500000524,4382,19708
"""City Hotel""",27098,7258507,132373876,1614044,1037534,52707,146050,122453,6736,308,1525,2569,10115,12931,215408,7179108.929999768,1730,38929


In [38]:
import numpy as np

# Seed NumPy's global RNG so the "random" column is the same on every run.
np.random.seed(12)

# Small demo frame used by the expression examples; "nrs" holds one missing
# value (None), which polars shows as null.
demo_columns = {
    "nrs": [1, 2, 3, None, 5],
    "names": ["foo", "ham", "spam", "foo", "foo"],
    "random": np.random.rand(5),
    "groups": ["A", "A", "B", "C", "A"],
}
df = pl.DataFrame(demo_columns)
print(df)

shape: (5, 4)
┌──────┬───────┬──────────────────────┬────────┐
│ nrs  ┆ names ┆ random               ┆ groups │
│ ---  ┆ ---   ┆ ---                  ┆ ---    │
│ i64  ┆ str   ┆ f64                  ┆ str    │
╞══════╪═══════╪══════════════════════╪════════╡
│ 1    ┆ foo   ┆ 0.15416284237967237  ┆ A      │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2    ┆ ham   ┆ 0.7400496965154048   ┆ A      │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 3    ┆ spam  ┆ 0.26331501518513467  ┆ B      │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ null ┆ foo   ┆ 0.5337393933802977   ┆ C      │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 5    ┆ foo   ┆ 0.014574962485419674 ┆ A      │
└──────┴───────┴──────────────────────┴────────┘


In [39]:
# Sum the "random" column two equivalent ways: the expression method
# (`pl.col(...).sum()`) and the top-level function (`pl.sum(...)`).
# The expressions go through `select`, matching the `df4` cell, instead of
# `df[[...]]` indexing, which newer polars releases no longer accept.
df2 = df.select(
    [
        pl.col("random").sum().alias("sum_method"),
        pl.sum("random").alias("sum_function"),
    ]
)
print(df2)

shape: (1, 2)
┌────────────────────┬────────────────────┐
│ sum_method         ┆ sum_function       │
│ ---                ┆ ---                │
│ f64                ┆ f64                │
╞════════════════════╪════════════════════╡
│ 1.7058419099459294 ┆ 1.7058419099459294 │
└────────────────────┴────────────────────┘


In [40]:
# Count the "random" column two equivalent ways: the expression method
# (`pl.col(...).count()`) and the top-level function (`pl.count(...)`).
# The expressions go through `select`, matching the `df4` cell, instead of
# `df[[...]]` indexing, which newer polars releases no longer accept.
df3 = df.select(
    [
        pl.col("random").count().alias("count_method"),
        pl.count("random").alias("count_function"),
    ]
)
print(df3)

shape: (1, 2)
┌──────────────┬────────────────┐
│ count_method ┆ count_function │
│ ---          ┆ ---            │
│ u32          ┆ u32            │
╞══════════════╪════════════════╡
│ 5            ┆ 5              │
└──────────────┴────────────────┘


In [41]:
# Replace "random" values above 0.5 with 0 and keep the rest unchanged.
zeroed_above_half = (
    pl.when(pl.col("random") > 0.5).then(0).otherwise(pl.col("random"))
)

# The whole conditional expression (not just the `otherwise` branch) is
# multiplied by the column-wide sum of "nrs".
df4 = df.select([(zeroed_above_half * pl.sum("nrs")).alias("binary_function")])
print(df4)

shape: (5, 1)
┌────────────────────┐
│ binary_function    │
│ ---                │
│ f64                │
╞════════════════════╡
│ 1.695791266176396  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0.0                │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2.8964651670364816 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0.0                │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0.1603245873396164 │
└────────────────────┘


In [35]:
# Window expressions: keep every original column and add per-"groups"
# aggregations of "names" evaluated with `.over("groups")`.
# The expressions go through `select`, matching the `df4` cell, instead of
# `df[[...]]` indexing, which newer polars releases no longer accept.
# NOTE(review): the saved output below this cell comes from an earlier run
# (different column aliases and name values) -- re-run to refresh it.
df5 = df.select(
    [
        pl.col("*"),
        # Collect the names of each group into a list.
        pl.col("names").list().over("groups").alias("names/groups"),
        # Presumably the unique names per group, going by the alias --
        # TODO confirm `.set()` exists in the installed polars version.
        pl.col("names").set().over("groups").alias("unique_names/groups"),
    ]
)
print(df5)

shape: (5, 6)
┌──────┬───────┬──────────────────────┬────────┬────────────────────────┬────────────────────────┐
│ nrs  ┆ names ┆ random               ┆ groups ┆ sum[names]/groups      ┆ names/groups           │
│ ---  ┆ ---   ┆ ---                  ┆ ---    ┆ ---                    ┆ ---                    │
│ i64  ┆ str   ┆ f64                  ┆ str    ┆ list [str]             ┆ list [str]             │
╞══════╪═══════╪══════════════════════╪════════╪════════════════════════╪════════════════════════╡
│ 1    ┆ foo   ┆ 0.15416284237967237  ┆ A      ┆ ["foo", "ham", "test"] ┆ ["foo", "ham", "test"] │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2    ┆ ham   ┆ 0.7400496965154048   ┆ A      ┆ ["foo", "ham", "test"] ┆ ["foo", "ham", "test"] │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3    ┆ spam  ┆ 0.26331501518513467  ┆ B      ┆ ["spam"]               ┆ ["spam"]             

In [36]:
from polars.polars import PyExpr

In [37]:
# Print the help text for PyExpr (imported from polars.polars) to inspect
# which operator methods it defines. Exploratory only; nothing is assigned.
help(PyExpr)

Help on class PyExpr in module builtins:

class PyExpr(object)
 |  Methods defined here:
 |  
 |  __add__(self, value, /)
 |      Return self+value.
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __floordiv__(self, value, /)
 |      Return self//value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __gt__(self, value, /)
 |      Return self>value.
 |  
 |  __le__(self, value, /)
 |      Return self<=value.
 |  
 |  __lt__(self, value, /)
 |      Return self<value.
 |  
 |  __mod__(self, value, /)
 |      Return self%value.
 |  
 |  __mul__(self, value, /)
 |      Return self*value.
 |  
 |  __ne__(self, value, /)
 |      Return self!=value.
 |  
 |  __radd__(self, value, /)
 |      Return value+self.
 |  
 |  __rfloordiv__(self, value, /)
 |      Return value//self.
 |  
 |  __rmod__(self, value, /)
 |      Return value%self.
 |  
 |  __rmul__(self, value, /)
 |      Return value*self.
 |  
 |  __rsub__(self, value, /)
 |      Return value-se