In [52]:
import sys
import polars as pl
import datetime as dt
from typing import Optional

In [53]:
# Display the Python version string of the kernel running this notebook.
sys.version

'3.12.11 (main, Sep  9 2025, 06:00:18) [GCC 14.2.0]'

In [3]:
# Four-person sample table used by the examples in this notebook.
df = pl.DataFrame(
    {
        "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        "birthdate": [
            dt.date.fromisoformat(iso_day)
            for iso_day in ("1997-01-10", "1985-02-15", "1983-03-22", "1981-04-30")
        ],
        "weight": [57.9, 72.5, 53.6, 83.1],  # kilograms
        "height": [1.56, 1.77, 1.65, 1.75],  # metres
    }
)
df

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


# Expression

In [4]:
# Body-mass-index formula written as a Polars expression (weight divided by
# height squared); it only references columns by name, no DataFrame is used here.
pl.col("weight") / (pl.col("height") ** 2)

In [61]:
def _bytes_to_human(n: int) -> str:
    # simple human-readable bytes
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if n < 1024.0:
            return f"{n:3.1f}{unit}"
        n /= 1024.0
    return f"{n:.1f}PB"

def pl_info(
    df: pl.DataFrame,
    show_memory: bool = True,
    show_unique: bool = True,
    col_width: Optional[int] = None,
) -> None:
    """
    Print a pandas-like ``DataFrame.info()`` summary for a Polars DataFrame.

    The report contains the frame's shape, optionally its estimated memory
    usage, one row per column (dtype, non-null count, null count and
    optionally the number of unique values) and a count of columns per dtype.

    Parameters
    ----------
    df : pl.DataFrame
        The Polars DataFrame.
    show_memory : bool
        Show an estimated memory usage (in human-readable form).
    show_unique : bool
        Compute and show the number of unique values per column (may be slower).
    col_width : Optional[int]
        Force column name column width; if None auto-sizes to longest column name.

    Returns
    -------
    None
        Everything is written to stdout with ``print``.
    """
    n_rows, n_cols = df.shape
    print(f"Polars DataFrame info — shape: ({n_rows}, {n_cols})")

    if show_memory:
        try:
            size_bytes = df.estimated_size()
            print(f"Estimated memory usage: {_bytes_to_human(size_bytes)}")
        except Exception:
            # Deliberate best-effort: the rest of the report is still printed
            # when the size estimate cannot be obtained.
            print("Estimated memory usage: (unavailable)")

    # Format every dtype once; the strings are reused for the column widths,
    # the per-column rows and the dtype summary.
    dtype_strs = {name: str(dtype) for name, dtype in df.schema.items()}

    max_name_len = max((len(name) for name in dtype_strs), default=4)
    name_col_w = col_width if col_width is not None else max(10, max_name_len + 2)
    # Keep the previous minimum of 12 characters, but widen the Dtype column
    # for longer dtype strings so the count columns stay aligned.
    dtype_col_w = max(12, max((len(s) for s in dtype_strs.values()), default=0))

    header = f"{'Column':{name_col_w}} {'Dtype':{dtype_col_w}} {'Non-Null':>9} {'Nulls':>7}"
    if show_unique:
        header += "  Unique"
    print("\n" + header)
    print("-" * len(header))

    for name, dtype_str in dtype_strs.items():
        column = df[name]
        nulls = column.null_count()
        non_null = n_rows - nulls
        line = f"{name:{name_col_w}} {dtype_str:{dtype_col_w}} {non_null:9d} {nulls:7d}"
        if show_unique:
            # n_unique can be somewhat expensive for large columns; if it
            # raises, the cell shows "N/A" instead of aborting the report.
            try:
                unique_label = str(int(column.n_unique()))
            except Exception:
                unique_label = "N/A"
            line += f"  {unique_label:>6}"
        print(line)

    # Final summary similar to pandas: number of columns per dtype, listed in
    # order of first appearance.
    print("\ndtypes:")
    dtype_counts: dict[str, int] = {}
    for dtype_str in dtype_strs.values():
        dtype_counts[dtype_str] = dtype_counts.get(dtype_str, 0) + 1
    for dtype_name, count in dtype_counts.items():
        print(f"  {dtype_name}: {count}")


# Try the helper on the sample frame with the memory and unique-count sections on.
pl_info(df, show_memory=True, show_unique=True)



Polars DataFrame info — shape: (4, 4)
Estimated memory usage: 127.0B

Column      Dtype         Non-Null   Nulls  Unique
--------------------------------------------------
name        String               4       0       4
birthdate   Date                 4       0       4
weight      Float64              4       0       4
height      Float64              4       0       4

dtypes:
  String: 1
  Date: 1
  Float64: 2


# Contexts

Expressions are evaluated inside a context. The contexts covered below are `select`, `with_columns`, `filter`, and `group_by`.

In [104]:
# NOTE: in the recorded run this call failed with
# "ModuleNotFoundError: No module named 'pyarrow'" — to_pandas() needed pyarrow,
# which was not installed in that environment.
df.select(pl.col("birthdate")).to_pandas()

ModuleNotFoundError: No module named 'pyarrow'

In [62]:
# select context: keep the name and derive two columns; each keyword becomes
# the name of the resulting column.
result = df.select(
    "name",
    birth_year=pl.col("birthdate").dt.year(),
    bmi=pl.col("weight") / pl.col("height") ** 2,
)
result

name,birth_year,bmi
str,i32,f64
"""Alice Archer""",1997,23.791913
"""Ben Brown""",1985,23.141498
"""Chloe Cooper""",1983,19.687787
"""Daniel Donovan""",1981,27.134694


In [6]:
# One expression fans out over both columns: scale by 0.95, round to two
# decimals and tag each output name with "-5%".
result = df.select(
    "name",
    pl.col("weight", "height").mul(0.95).round(2).name.suffix("-5%"),
)
result

name,weight-5%,height-5%
str,f64,f64
"""Alice Archer""",55.0,1.48
"""Ben Brown""",68.88,1.68
"""Chloe Cooper""",50.92,1.57
"""Daniel Donovan""",78.94,1.66


In [7]:
# Unlike select, with_columns keeps the existing columns and appends the new ones.
result = df.with_columns(
    pl.col("birthdate").dt.year().alias("birth_year"),
    (pl.col("weight") / pl.col("height") ** 2).alias("bmi"),
)
result

name,birthdate,weight,height,birth_year,bmi
str,date,f64,f64,i32,f64
"""Alice Archer""",1997-01-10,57.9,1.56,1997,23.791913
"""Ben Brown""",1985-02-15,72.5,1.77,1985,23.141498
"""Chloe Cooper""",1983-03-22,53.6,1.65,1983,19.687787
"""Daniel Donovan""",1981-04-30,83.1,1.75,1981,27.134694


In [8]:
# filter context: keep only the rows whose birth year is earlier than 1990.
result = df.filter(pl.col("birthdate").dt.year().lt(1990))
result

name,birthdate,weight,height
str,date,f64,f64
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


In [9]:
# Several predicates passed to filter must all hold for a row to be kept.
result = df.filter(
    pl.col("birthdate").is_between(
        dt.date(1982, 12, 31), dt.date(1996, 1, 1), closed="both"
    ),
    pl.col("height").gt(1.7),
)
result

name,birthdate,weight,height
str,date,f64,f64
"""Ben Brown""",1985-02-15,72.5,1.77


In [10]:
# Count people per decade of birth; the keyword names the computed group key.
result = df.group_by(
    decade=pl.col("birthdate").dt.year() // 10 * 10,
    maintain_order=True,
).len()
result

decade,len
i32,u32
1990,1
1980,3


In [11]:
# Per-decade summary: group size, mean weight (two decimals) and maximum height.
result = df.group_by(
    decade=pl.col("birthdate").dt.year() // 10 * 10,
    maintain_order=True,
).agg(
    sample_size=pl.len(),
    avg_weight=pl.col("weight").mean().round(2),
    tallest=pl.col("height").max(),
)
result

decade,sample_size,avg_weight,tallest
i32,u32,f64,f64
1990,1,57.9,1.56
1980,3,69.73,1.77


In [12]:
# Full pipeline: derive decade and first name, drop birthdate, then summarise
# each decade.
result = (
    df.with_columns(
        # decade = birth year with its last digit zeroed (e.g. 1997 -> 1990)
        (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
        # no alias, so this overwrites "name" with its first space-separated token
        pl.col("name").str.split(by=" ").list.first(),
    )
    .select(
        # every column except birthdate
        pl.all().exclude("birthdate"),
    )
    .group_by(
        pl.col("decade"),
        maintain_order=True,
    )
    .agg(
        # a bare column inside agg is collected into one list per group
        pl.col("name"),
        # mean of each column, rounded, output names prefixed with "avg_"
        pl.col("weight", "height").mean().round(2).name.prefix("avg_"),
    )
)
result

decade,name,avg_weight,avg_height
i32,list[str],f64,f64
1990,"[""Alice""]",57.9,1.56
1980,"[""Ben"", ""Chloe"", ""Daniel""]",69.73,1.72


In [13]:
# Show the base frame again; the recorded output matches the initial table,
# so the cells above left df itself unchanged.
df

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


In [14]:
# Pipeline step 1 in isolation: add a "decade" column and overwrite "name"
# with its first space-separated token (the expression has no alias, so it
# keeps the name of the column it came from).
df.with_columns(
    (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
    pl.col("name").str.split(by=" ").list.first(),
)

name,birthdate,weight,height,decade
str,date,f64,f64,i32
"""Alice""",1997-01-10,57.9,1.56,1990
"""Ben""",1985-02-15,72.5,1.77,1980
"""Chloe""",1983-03-22,53.6,1.65,1980
"""Daniel""",1981-04-30,83.1,1.75,1980


In [15]:
# Pipeline steps 1-2: same with_columns as before, then keep every column
# except birthdate.
(
    df.with_columns(
        (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
        pl.col("name").str.split(by=" ").list.first(),
    ).select(
        pl.all().exclude("birthdate"),
    )
)

name,weight,height,decade
str,f64,f64,i32
"""Alice""",57.9,1.56,1990
"""Ben""",72.5,1.77,1980
"""Chloe""",53.6,1.65,1980
"""Daniel""",83.1,1.75,1980


In [16]:
# Pipeline steps 1-3: stopping after group_by yields a GroupBy object (see the
# recorded repr), not a DataFrame; an aggregation is still needed.
(
    df.with_columns(
        (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
        pl.col("name").str.split(by=" ").list.first(),
    )
    .select(
        pl.all().exclude("birthdate"),
    )
    .group_by(
        pl.col("decade"),
        maintain_order=True,
    )
)

<polars.dataframe.group_by.GroupBy at 0xffff8b5c62a0>

In [17]:
# Pipeline steps 1-4: adding agg turns the grouped frame back into a DataFrame
# with one row per decade.
(
    df.with_columns(
        (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
        pl.col("name").str.split(by=" ").list.first(),
    )
    .select(
        pl.all().exclude("birthdate"),
    )
    .group_by(
        pl.col("decade"),
        maintain_order=True,
    )
    .agg(
        # collected into a list of names per decade
        pl.col("name"),
        # mean of each column, rounded, output names prefixed with "avg_"
        pl.col("weight", "height").mean().round(2).name.prefix("avg_"),
    )
)

decade,name,avg_weight,avg_height
i32,list[str],f64,f64
1990,"[""Alice""]",57.9,1.56
1980,"[""Ben"", ""Chloe"", ""Daniel""]",69.73,1.72


# Joining dataframes

In [18]:
# Base frame again; it is the left-hand side of the join further down.
df

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


In [19]:
# Second table keyed by the same four names as df, listed in a different row
# order, with two extra attributes per person.
df2 = pl.DataFrame(
    {
        "name": ["Ben Brown", "Daniel Donovan", "Alice Archer", "Chloe Cooper"],
        "parent": [True, False, False, False],
        "siblings": [1, 2, 3, 4],
    }
)
df2

name,parent,siblings
str,bool,i64
"""Ben Brown""",True,1
"""Daniel Donovan""",False,2
"""Alice Archer""",False,3
"""Chloe Cooper""",False,4


In [20]:
# Left join on "name": every row of df is kept, in df's row order (see the
# recorded output), and the parent/siblings values are attached by matching name.
df.join(df2, on="name", how="left")

name,birthdate,weight,height,parent,siblings
str,date,f64,f64,bool,i64
"""Alice Archer""",1997-01-10,57.9,1.56,False,3
"""Ben Brown""",1985-02-15,72.5,1.77,True,1
"""Chloe Cooper""",1983-03-22,53.6,1.65,False,4
"""Daniel Donovan""",1981-04-30,83.1,1.75,False,2


# Concatenating dataframes

In [21]:
# Four more people with the same columns as df.
df3 = pl.DataFrame(
    {
        "name": ["Ethan Edwards", "Fiona Foster", "Grace Gibson", "Henry Harris"],
        "birthdate": [
            dt.date.fromisoformat(iso_day)
            for iso_day in ("1977-05-10", "1975-06-23", "1973-07-22", "1971-08-03")
        ],
        "weight": [67.9, 72.5, 57.6, 93.1],  # kilograms
        "height": [1.76, 1.6, 1.66, 1.8],  # metres
    }
)

# Stack the rows of df3 underneath those of df.
pl.concat([df, df3], how="vertical")

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75
"""Ethan Edwards""",1977-05-10,67.9,1.76
"""Fiona Foster""",1975-06-23,72.5,1.6
"""Grace Gibson""",1973-07-22,57.6,1.66
"""Henry Harris""",1971-08-03,93.1,1.8
