# Sources

- Polars Docs - https://docs.pola.rs/user-guide/getting-started/#select
- Tutorials - Matt Harrison: Getting Started with Polars - https://www.youtube.com/watch?v=CJ0f45evuME
    - 1:3:18 => converting `str` to `date` the polars way using `split.arr` to use an array operation on the column e.g. `df['time'].str.split('+').arr.get(0).str.strptime(pl.Datetime, '%Y-%m-%d %H:%M:%S')`
    - groupy_dynamic

- Matt Harrison - An Introduction to Polars | PyData NYC 2024 - https://www.youtube.com/watch?v=q3o2IdFQTOE
    - 53:16 - how to deal with missing values advice : find a domain expert, get to know why there are missing data points then think of a solution
        - drop row
        - drop column
        - add indicator variables
        - impute
    - 1:01:19: using `pipe()` in chaining methods you can use a lambda that prints the shape of a DataFrame combined with a `or` keyword that returns the DataFrame so that it is used in the next steps of the chain. Can't put a statement in the lambda so it's a smart short circuit as printing returns None so the `or` operator will pick the other value : `.pipe(lambda df: print(df.shape) or df)` => very useful in case of joins and merges. Can also put logging in here.

# Imports

In [1]:
import os
import polars as pl
import datetime as dt
from pl_utils import pl_info
from pl_utils import dtype_info

# Polars Documentation

## DataFrame Overview

In [2]:
df = pl.DataFrame(
    {
        "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        "birthdate": [
            dt.date(1997, 1, 10),
            dt.date(1985, 2, 15),
            dt.date(1983, 3, 22),
            dt.date(1981, 4, 30),
        ],
        "weight": [57.9, 72.5, 53.6, 83.1],  # (kg)
        "height": [1.56, 1.77, 1.65, 1.75],  # (m)
    }
)
df

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


In [3]:
df.columns

['name', 'birthdate', 'weight', 'height']

In [4]:
df.describe()

statistic,name,birthdate,weight,height
str,str,str,f64,f64
"""count""","""4""","""4""",4.0,4.0
"""null_count""","""0""","""0""",0.0,0.0
"""mean""",,"""1986-09-04 00:00:00""",66.775,1.6825
"""std""",,,13.560082,0.097082
"""min""","""Alice Archer""","""1981-04-30""",53.6,1.56
"""25%""",,"""1983-03-22""",57.9,1.65
"""50%""",,"""1985-02-15""",72.5,1.75
"""75%""",,"""1985-02-15""",72.5,1.75
"""max""","""Daniel Donovan""","""1997-01-10""",83.1,1.77


In [5]:
# custom function to mimic df.info from pandas
pl_info(df, show_memory=True, show_unique=True)

sys.version='3.12.11 (main, Sep  9 2025, 06:00:18) [GCC 14.2.0]'
pl.__version__='1.33.1'
_______________________
Polars DataFrame info — shape: (4, 4)
Estimated memory usage: 127.0B

Column      Dtype         Non-Null   Nulls  Unique
--------------------------------------------------
name        String               4       0       4
birthdate   Date                 4       0       4
weight      Float64              4       0       4
height      Float64              4       0       4

dtypes:
  String: 1
  Date: 1
  Float64: 2


In [6]:
display(df.head(3), df.tail(3), df.sample(2))

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65


name,birthdate,weight,height
str,date,f64,f64
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


name,birthdate,weight,height
str,date,f64,f64
"""Daniel Donovan""",1981-04-30,83.1,1.75
"""Chloe Cooper""",1983-03-22,53.6,1.65


## Expression

In [7]:
print(dir(pl.col("foo")))

['__abs__', '__add__', '__and__', '__annotations__', '__array_ufunc__', '__bool__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__floordiv__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__invert__', '__le__', '__lt__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmod__', '__rmul__', '__ror__', '__rpow__', '__rsub__', '__rtruediv__', '__rxor__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '__xor__', '_accessors', '_from_pyexpr', '_pyexpr', '_repr_html_', '_row_decode', '_row_encode', '_skip_batch_predicate', 'abs', 'add', 'agg_groups', 'alias', 'all', 'and_', 'any', 'append', 'approx_n_unique', 'arccos', 'arccosh', 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg_max', 'arg_mi

In [8]:
pl.col("weight") / (pl.col("height") ** 2)

## Contexts

`select`, `with_columns`, `filter`, `group_by`

In [9]:
df.select(pl.col("birthdate")).to_pandas()

Unnamed: 0,birthdate
0,1997-01-10
1,1985-02-15
2,1983-03-22
3,1981-04-30


In [10]:
result = df.select(
    pl.col("name"),
    pl.col("birthdate").dt.year().alias("birth_year"),
    (pl.col("weight") / (pl.col("height") ** 2)).alias("bmi"),
)
result

name,birth_year,bmi
str,i32,f64
"""Alice Archer""",1997,23.791913
"""Ben Brown""",1985,23.141498
"""Chloe Cooper""",1983,19.687787
"""Daniel Donovan""",1981,27.134694


In [11]:
result = df.select(
    pl.col("name"),
    (pl.col("weight", "height") * 0.95).round(2).name.suffix("-5%"),
)
result

name,weight-5%,height-5%
str,f64,f64
"""Alice Archer""",55.0,1.48
"""Ben Brown""",68.88,1.68
"""Chloe Cooper""",50.92,1.57
"""Daniel Donovan""",78.94,1.66


In [12]:
# with_columns adds columns to the dataframe instead of selecting them
result = df.with_columns(
    birth_year=pl.col("birthdate").dt.year(),
    bmi=pl.col("weight") / (pl.col("height") ** 2),
)
result

name,birthdate,weight,height,birth_year,bmi
str,date,f64,f64,i32,f64
"""Alice Archer""",1997-01-10,57.9,1.56,1997,23.791913
"""Ben Brown""",1985-02-15,72.5,1.77,1985,23.141498
"""Chloe Cooper""",1983-03-22,53.6,1.65,1983,19.687787
"""Daniel Donovan""",1981-04-30,83.1,1.75,1981,27.134694


In [13]:
result = df.filter(pl.col("birthdate").dt.year() < 1990)
result

name,birthdate,weight,height
str,date,f64,f64
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


In [14]:
df.filter(
    pl.col("birthdate").is_between(dt.date(1982, 12, 31), dt.date(1996, 1, 1)),
    pl.col("height") > 1.7,
)

name,birthdate,weight,height
str,date,f64,f64
"""Ben Brown""",1985-02-15,72.5,1.77


In [15]:
df.filter(
    ~pl.col("birthdate").is_between(dt.date(1982, 12, 31), dt.date(1996, 1, 1)),
)

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Daniel Donovan""",1981-04-30,83.1,1.75


In [16]:
df.group_by(
    (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
    maintain_order=True,
).len()

decade,len
i32,u32
1990,1
1980,3


In [17]:
df.group_by(
    (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
    maintain_order=True,
).agg(
    pl.len().alias("sample_size"),
    pl.col("weight").mean().round(2).alias("avg_weight"),
    pl.col("height").max().alias("tallest"),
)
result

name,birthdate,weight,height
str,date,f64,f64
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


In [18]:
(
    df.with_columns(
        (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
        pl.col("name").str.split(by=" ").list.first(),
    )
    .select(
        pl.all().exclude("birthdate"),
    )
    .group_by(
        pl.col("decade"),
        maintain_order=True,
    )
    .agg(
        pl.col("name"),
        pl.col("weight", "height").mean().round(2).name.prefix("avg_"),
    )
)

decade,name,avg_weight,avg_height
i32,list[str],f64,f64
1990,"[""Alice""]",57.9,1.56
1980,"[""Ben"", ""Chloe"", ""Daniel""]",69.73,1.72


In [19]:
df

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


In [20]:
df.with_columns(
    (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
    pl.col("name").str.split(by=" ").list.first(),
)

name,birthdate,weight,height,decade
str,date,f64,f64,i32
"""Alice""",1997-01-10,57.9,1.56,1990
"""Ben""",1985-02-15,72.5,1.77,1980
"""Chloe""",1983-03-22,53.6,1.65,1980
"""Daniel""",1981-04-30,83.1,1.75,1980


In [21]:
(
    df.with_columns(
        (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
        pl.col("name").str.split(by=" ").list.first(),
    ).select(
        pl.all().exclude("birthdate"),
    )
)

name,weight,height,decade
str,f64,f64,i32
"""Alice""",57.9,1.56,1990
"""Ben""",72.5,1.77,1980
"""Chloe""",53.6,1.65,1980
"""Daniel""",83.1,1.75,1980


In [22]:
(
    df.with_columns(
        (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
        pl.col("name").str.split(by=" ").list.first(),
    )
    .select(
        pl.all().exclude("birthdate"),
    )
    .group_by(
        pl.col("decade"),
        maintain_order=True,
    )
)

<polars.dataframe.group_by.GroupBy at 0xffff54940560>

In [23]:
(
    df.with_columns(
        (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
        pl.col("name").str.split(by=" ").list.first(),
    )
    .select(
        pl.all().exclude("birthdate"),
    )
    .group_by(
        pl.col("decade"),
        maintain_order=True,
    )
    .agg(
        pl.col("name"),  # agg on strings creates a list with the strings
        pl.col("weight", "height").mean().round(2).name.prefix("avg_"),
    )
)

decade,name,avg_weight,avg_height
i32,list[str],f64,f64
1990,"[""Alice""]",57.9,1.56
1980,"[""Ben"", ""Chloe"", ""Daniel""]",69.73,1.72


## Joining dataframes

In [24]:
df

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


In [25]:
df2 = pl.DataFrame(
    {
        "name": ["Ben Brown", "Daniel Donovan", "Alice Archer", "Chloe Cooper"],
        "parent": [True, False, False, False],
        "siblings": [1, 2, 3, 4],
    }
)
df2

name,parent,siblings
str,bool,i64
"""Ben Brown""",True,1
"""Daniel Donovan""",False,2
"""Alice Archer""",False,3
"""Chloe Cooper""",False,4


In [26]:
df.join(df2, on="name", how="left")

name,birthdate,weight,height,parent,siblings
str,date,f64,f64,bool,i64
"""Alice Archer""",1997-01-10,57.9,1.56,False,3
"""Ben Brown""",1985-02-15,72.5,1.77,True,1
"""Chloe Cooper""",1983-03-22,53.6,1.65,False,4
"""Daniel Donovan""",1981-04-30,83.1,1.75,False,2


## Concatenating dataframes

In [27]:
df3 = pl.DataFrame(
    {
        "name": ["Ethan Edwards", "Fiona Foster", "Grace Gibson", "Henry Harris"],
        "birthdate": [
            dt.date(1977, 5, 10),
            dt.date(1975, 6, 23),
            dt.date(1973, 7, 22),
            dt.date(1971, 8, 3),
        ],
        "weight": [67.9, 72.5, 57.6, 93.1],  # (kg)
        "height": [1.76, 1.6, 1.66, 1.8],  # (m)
    }
)

pl.concat([df, df3], how="vertical")

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75
"""Ethan Edwards""",1977-05-10,67.9,1.76
"""Fiona Foster""",1975-06-23,72.5,1.6
"""Grace Gibson""",1973-07-22,57.6,1.66
"""Henry Harris""",1971-08-03,93.1,1.8
