**Polars Tutorial**

@ Author: Rui Zhu<br>
@ Time: 2024-11-16<br>
@ Web: https://docs.pola.rs

In [59]:
import os
from pathlib import Path

import numpy as np
import polars as pl

# Directory that holds the tutorial's input files (e.g. iris.csv).
# Set the POLARS_TUTORIAL_DATA environment variable before starting the kernel
# to point the notebook at another location; when it is unset, the original
# author's local path is used, so existing setups keep working unchanged.
dir_data = Path(
    os.environ.get("POLARS_TUTORIAL_DATA", "/Users/rui/Code/Astronote/33_Polars/data")
)

---
# Polars中的数据结构
- https://docs.pola.rs/user-guide/concepts/data-types-and-structures/

## Series
- [支持的数据类型](https://docs.pola.rs/user-guide/concepts/data-types-and-structures/#schema)

In [60]:
# Build a named Series from a plain Python list; no dtype is given, so it is inferred.
s = pl.Series(name="ints", values=[1, 2, 3, 4, 5])
print(s)

shape: (5,)
Series: 'ints' [i64]
[
	1
	2
	3
	4
	5
]


In [61]:
# Same values as before, but with an explicit unsigned 64-bit dtype.
s1 = pl.Series(name="ints", values=[1, 2, 3, 4, 5], dtype=pl.UInt64)
print(s1)

shape: (5,)
Series: 'ints' [u64]
[
	1
	2
	3
	4
	5
]


## DataFrame

In [62]:
from datetime import date

# Build a DataFrame from a dict: each key is a column name, each list is that column's values.
df = pl.DataFrame(
    data={
        "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        "birthdate": [date(1997, 1, 10), date(1985, 2, 15), date(1983, 3, 22), date(1981, 4, 30)],
        "weight": [57.9, 72.5, 53.6, 83.1],  # kg
        "height": [1.56, 1.77, 1.65, 1.75],  # m
    }
)

print(df)

shape: (4, 4)
┌────────────────┬────────────┬────────┬────────┐
│ name           ┆ birthdate  ┆ weight ┆ height │
│ ---            ┆ ---        ┆ ---    ┆ ---    │
│ str            ┆ date       ┆ f64    ┆ f64    │
╞════════════════╪════════════╪════════╪════════╡
│ Alice Archer   ┆ 1997-01-10 ┆ 57.9   ┆ 1.56   │
│ Ben Brown      ┆ 1985-02-15 ┆ 72.5   ┆ 1.77   │
│ Chloe Cooper   ┆ 1983-03-22 ┆ 53.6   ┆ 1.65   │
│ Daniel Donovan ┆ 1981-04-30 ┆ 83.1   ┆ 1.75   │
└────────────────┴────────────┴────────┴────────┘


## DataFrame的各种浏览操作

In [63]:
# Show the first two rows.
print(df.head(n=2))

shape: (2, 4)
┌──────────────┬────────────┬────────┬────────┐
│ name         ┆ birthdate  ┆ weight ┆ height │
│ ---          ┆ ---        ┆ ---    ┆ ---    │
│ str          ┆ date       ┆ f64    ┆ f64    │
╞══════════════╪════════════╪════════╪════════╡
│ Alice Archer ┆ 1997-01-10 ┆ 57.9   ┆ 1.56   │
│ Ben Brown    ┆ 1985-02-15 ┆ 72.5   ┆ 1.77   │
└──────────────┴────────────┴────────┴────────┘


In [64]:
# Show the last two rows.
print(df.tail(n=2))

shape: (2, 4)
┌────────────────┬────────────┬────────┬────────┐
│ name           ┆ birthdate  ┆ weight ┆ height │
│ ---            ┆ ---        ┆ ---    ┆ ---    │
│ str            ┆ date       ┆ f64    ┆ f64    │
╞════════════════╪════════════╪════════╪════════╡
│ Chloe Cooper   ┆ 1983-03-22 ┆ 53.6   ┆ 1.65   │
│ Daniel Donovan ┆ 1981-04-30 ┆ 83.1   ┆ 1.75   │
└────────────────┴────────────┴────────┴────────┘


In [65]:
print(df.glimpse(return_as_string=True))  # transposed preview: one line per column with its dtype and values

Rows: 4
Columns: 4
$ name       <str> 'Alice Archer', 'Ben Brown', 'Chloe Cooper', 'Daniel Donovan'
$ birthdate <date> 1997-01-10, 1985-02-15, 1983-03-22, 1981-04-30
$ weight     <f64> 57.9, 72.5, 53.6, 83.1
$ height     <f64> 1.56, 1.77, 1.65, 1.75



In [66]:
# Randomly sample 2 rows.
import random
# NOTE(review): this seeds Python's global `random` module rather than passing an
# explicit seed to Polars. Whether it pins the rows drawn below depends on how the
# installed Polars version derives its default seed -- TODO confirm, or pass
# `seed=` to `sample` to make the reproducibility explicit.
random.seed(42)

print(df.sample(2))

shape: (2, 4)
┌────────────────┬────────────┬────────┬────────┐
│ name           ┆ birthdate  ┆ weight ┆ height │
│ ---            ┆ ---        ┆ ---    ┆ ---    │
│ str            ┆ date       ┆ f64    ┆ f64    │
╞════════════════╪════════════╪════════╪════════╡
│ Daniel Donovan ┆ 1981-04-30 ┆ 83.1   ┆ 1.75   │
│ Chloe Cooper   ┆ 1983-03-22 ┆ 53.6   ┆ 1.65   │
└────────────────┴────────────┴────────┴────────┘


In [67]:
print(df.describe())  # summary statistics (count, nulls, mean, std, min, quantiles, ...) per column

shape: (9, 5)
┌────────────┬────────────────┬─────────────────────┬───────────┬──────────┐
│ statistic  ┆ name           ┆ birthdate           ┆ weight    ┆ height   │
│ ---        ┆ ---            ┆ ---                 ┆ ---       ┆ ---      │
│ str        ┆ str            ┆ str                 ┆ f64       ┆ f64      │
╞════════════╪════════════════╪═════════════════════╪═══════════╪══════════╡
│ count      ┆ 4              ┆ 4                   ┆ 4.0       ┆ 4.0      │
│ null_count ┆ 0              ┆ 0                   ┆ 0.0       ┆ 0.0      │
│ mean       ┆ null           ┆ 1986-09-04 00:00:00 ┆ 66.775    ┆ 1.6825   │
│ std        ┆ null           ┆ null                ┆ 13.560082 ┆ 0.097082 │
│ min        ┆ Alice Archer   ┆ 1981-04-30          ┆ 53.6      ┆ 1.56     │
│ 25%        ┆ null           ┆ 1983-03-22          ┆ 57.9      ┆ 1.65     │
│ 50%        ┆ null           ┆ 1985-02-15          ┆ 72.5      ┆ 1.75     │
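│ 75%        ┆ null           ┆ 1985-02-15          ┆ 72.5      ┆ 1.75     │
│ max        ┆ Daniel Donovan ┆ 1997-01-10          ┆ 83.1      ┆ 1.77     │
└────────────┴────────────────┴─────────────────────┴───────────┴──────────┘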

In [68]:
print(df.schema)  # mapping of column names to their dtypes

Schema({'name': String, 'birthdate': Date, 'weight': Float64, 'height': Float64})


## 指定DataFrame中的数据类型
- None表示不指定，自动推断数据类型

In [69]:
# With `schema`, every column has to be listed; a dtype of None leaves that
# column's type to be inferred from the data.
df = pl.DataFrame(
    data={"name": ["Alice", "Ben", "Chloe", "Daniel"], "age": [27, 39, 41, 43]},
    schema={"name": None, "age": pl.UInt8},
)

print(df)

shape: (4, 2)
┌────────┬─────┐
│ name   ┆ age │
│ ---    ┆ --- │
│ str    ┆ u8  │
╞════════╪═════╡
│ Alice  ┆ 27  │
│ Ben    ┆ 39  │
│ Chloe  ┆ 41  │
│ Daniel ┆ 43  │
└────────┴─────┘


In [70]:
# With `schema_overrides`, only the columns whose dtype should differ from the
# inferred one need to be listed.
df = pl.DataFrame(
    data={"name": ["Alice", "Ben", "Chloe", "Daniel"], "age": [27, 39, 41, 43]},
    schema_overrides={"age": pl.UInt8},
)

print(df)

shape: (4, 2)
┌────────┬─────┐
│ name   ┆ age │
│ ---    ┆ --- │
│ str    ┆ u8  │
╞════════╪═════╡
│ Alice  ┆ 27  │
│ Ben    ┆ 39  │
│ Chloe  ┆ 41  │
│ Daniel ┆ 43  │
└────────┴─────┘


---
# 表达式 (Expressions) 与 Contexts
- https://docs.pola.rs/user-guide/concepts/expressions-and-contexts/
- 表达式 (expression) 是Polars对数据变换的一种惰性描述, 创建表达式本身不会执行任何计算
- 只有把表达式放入某个context (如 `select`、`with_columns`、`filter`、`group_by`) 中, 才会真正执行计算

In [71]:
from datetime import date

# Re-create the four-person table that the expression examples below operate on.
df = pl.DataFrame(
    dict(
        name=["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        birthdate=[date(1997, 1, 10), date(1985, 2, 15), date(1983, 3, 22), date(1981, 4, 30)],
        weight=[57.9, 72.5, 53.6, 83.1],  # kg
        height=[1.56, 1.77, 1.65, 1.75],  # m
    )
)

print(df)

shape: (4, 4)
┌────────────────┬────────────┬────────┬────────┐
│ name           ┆ birthdate  ┆ weight ┆ height │
│ ---            ┆ ---        ┆ ---    ┆ ---    │
│ str            ┆ date       ┆ f64    ┆ f64    │
╞════════════════╪════════════╪════════╪════════╡
│ Alice Archer   ┆ 1997-01-10 ┆ 57.9   ┆ 1.56   │
│ Ben Brown      ┆ 1985-02-15 ┆ 72.5   ┆ 1.77   │
│ Chloe Cooper   ┆ 1983-03-22 ┆ 53.6   ┆ 1.65   │
│ Daniel Donovan ┆ 1981-04-30 ┆ 83.1   ┆ 1.75   │
└────────────────┴────────────┴────────┴────────┘


## 创建表达式

In [72]:
# `pl.col` is called without any DataFrame here: it only builds an expression
# object, whose type is displayed as the cell result.
type(pl.col("weight"))

polars.expr.expr.Expr

In [73]:
# Expression for the body-mass index: weight divided by height squared.
# Nothing is computed yet; printing shows the expression tree, not numbers.
bmi_expr = pl.col("weight") / pl.col("height").pow(2)
print(bmi_expr)

[(col("weight")) / (col("height").pow([dyn int: 2]))]


## select方法: 按表达式创建新表

In [74]:
# select() evaluates the given expressions and returns a table made of their results only.
result = df.select(
    bmi_expr.alias("bmi"),
    bmi_expr.mean().alias("avg_bmi"),
    pl.lit(25).alias("ideal_max_bmi"),  # a constant is broadcast to every row
    ((bmi_expr - bmi_expr.mean()) / bmi_expr.std()).alias("deviation"),
)
print(result)

shape: (4, 4)
┌───────────┬───────────┬───────────────┬───────────┐
│ bmi       ┆ avg_bmi   ┆ ideal_max_bmi ┆ deviation │
│ ---       ┆ ---       ┆ ---           ┆ ---       │
│ f64       ┆ f64       ┆ i32           ┆ f64       │
╞═══════════╪═══════════╪═══════════════╪═══════════╡
│ 23.791913 ┆ 23.438973 ┆ 25            ┆ 0.115645  │
│ 23.141498 ┆ 23.438973 ┆ 25            ┆ -0.097471 │
│ 19.687787 ┆ 23.438973 ┆ 25            ┆ -1.22912  │
│ 27.134694 ┆ 23.438973 ┆ 25            ┆ 1.210946  │
└───────────┴───────────┴───────────────┴───────────┘


## with_columns方法: 按表达式追加到新表
- 与select方法的区别: with_columns会保留原表的所有列, 并把新列追加在后面; 而select的结果只包含表达式指定的列

In [75]:
# with_columns() keeps the existing columns and appends the evaluated expressions.
result = df.with_columns(
    bmi_expr.alias("bmi"),
    bmi_expr.mean().alias("avg_bmi"),
    pl.lit(25).alias("ideal_max_bmi"),
)
print(result)

shape: (4, 7)
┌────────────────┬────────────┬────────┬────────┬───────────┬───────────┬───────────────┐
│ name           ┆ birthdate  ┆ weight ┆ height ┆ bmi       ┆ avg_bmi   ┆ ideal_max_bmi │
│ ---            ┆ ---        ┆ ---    ┆ ---    ┆ ---       ┆ ---       ┆ ---           │
│ str            ┆ date       ┆ f64    ┆ f64    ┆ f64       ┆ f64       ┆ i32           │
╞════════════════╪════════════╪════════╪════════╪═══════════╪═══════════╪═══════════════╡
│ Alice Archer   ┆ 1997-01-10 ┆ 57.9   ┆ 1.56   ┆ 23.791913 ┆ 23.438973 ┆ 25            │
│ Ben Brown      ┆ 1985-02-15 ┆ 72.5   ┆ 1.77   ┆ 23.141498 ┆ 23.438973 ┆ 25            │
│ Chloe Cooper   ┆ 1983-03-22 ┆ 53.6   ┆ 1.65   ┆ 19.687787 ┆ 23.438973 ┆ 25            │
│ Daniel Donovan ┆ 1981-04-30 ┆ 83.1   ┆ 1.75   ┆ 27.134694 ┆ 23.438973 ┆ 25            │
└────────────────┴────────────┴────────┴────────┴───────────┴───────────┴───────────────┘


In [76]:
# Append two columns to the original table, built from plain Python lists.
num_a = [1, 2, 3, 4]
num_b = [5, 6, 7, 8]

result = df.with_columns(
    pl.Series(name="num_a", values=num_a),
    pl.Series(name="num_b", values=num_b),
)
print(result)

shape: (4, 6)
┌────────────────┬────────────┬────────┬────────┬───────┬───────┐
│ name           ┆ birthdate  ┆ weight ┆ height ┆ num_a ┆ num_b │
│ ---            ┆ ---        ┆ ---    ┆ ---    ┆ ---   ┆ ---   │
│ str            ┆ date       ┆ f64    ┆ f64    ┆ i64   ┆ i64   │
╞════════════════╪════════════╪════════╪════════╪═══════╪═══════╡
│ Alice Archer   ┆ 1997-01-10 ┆ 57.9   ┆ 1.56   ┆ 1     ┆ 5     │
│ Ben Brown      ┆ 1985-02-15 ┆ 72.5   ┆ 1.77   ┆ 2     ┆ 6     │
│ Chloe Cooper   ┆ 1983-03-22 ┆ 53.6   ┆ 1.65   ┆ 3     ┆ 7     │
│ Daniel Donovan ┆ 1981-04-30 ┆ 83.1   ┆ 1.75   ┆ 4     ┆ 8     │
└────────────────┴────────────┴────────┴────────┴───────┴───────┘


## filter方法: 按条件筛选行

In [77]:
# Keep only the rows that satisfy both conditions: birthdate between the two
# bounds, and height above 1.7 m.
result = df.filter(
    pl.col("birthdate").is_between(
        lower_bound=date(1982, 12, 31),
        upper_bound=date(1996, 1, 1),
    )
    & (pl.col("height") > 1.7)
)
print(result)

shape: (1, 4)
┌───────────┬────────────┬────────┬────────┐
│ name      ┆ birthdate  ┆ weight ┆ height │
│ ---       ┆ ---        ┆ ---    ┆ ---    │
│ str       ┆ date       ┆ f64    ┆ f64    │
╞═══════════╪════════════╪════════╪════════╡
│ Ben Brown ┆ 1985-02-15 ┆ 72.5   ┆ 1.77   │
└───────────┴────────────┴────────┴────────┘


## group_by方法: 分组

In [78]:
# Bucket the rows by birth decade (birth year floored to a multiple of 10)
# and gather the names that fall into each bucket into a list.
result = df.group_by(
    decade=pl.col("birthdate").dt.year() // 10 * 10,
).agg(pl.col("name"))
print(result)

shape: (2, 2)
┌────────┬─────────────────────────────────┐
│ decade ┆ name                            │
│ ---    ┆ ---                             │
│ i32    ┆ list[str]                       │
╞════════╪═════════════════════════════════╡
│ 1980   ┆ ["Ben Brown", "Chloe Cooper", … │
│ 1990   ┆ ["Alice Archer"]                │
└────────┴─────────────────────────────────┘


## 表达式扩展

In [79]:
# Show the table again as a reference for the expression-expansion examples that follow.
print(df)

shape: (4, 4)
┌────────────────┬────────────┬────────┬────────┐
│ name           ┆ birthdate  ┆ weight ┆ height │
│ ---            ┆ ---        ┆ ---    ┆ ---    │
│ str            ┆ date       ┆ f64    ┆ f64    │
╞════════════════╪════════════╪════════╪════════╡
│ Alice Archer   ┆ 1997-01-10 ┆ 57.9   ┆ 1.56   │
│ Ben Brown      ┆ 1985-02-15 ┆ 72.5   ┆ 1.77   │
│ Chloe Cooper   ┆ 1983-03-22 ┆ 53.6   ┆ 1.65   │
│ Daniel Donovan ┆ 1981-04-30 ┆ 83.1   ┆ 1.75   │
└────────────────┴────────────┴────────┴────────┘


In [80]:
# A single expression expands to both columns; each result is renamed with an "avg_" prefix.
res = df.select(
    pl.col(["weight", "height"]).mean().name.prefix("avg_")
)
print(res)  # mean weight and mean height

shape: (1, 2)
┌────────────┬────────────┐
│ avg_weight ┆ avg_height │
│ ---        ┆ ---        │
│ f64        ┆ f64        │
╞════════════╪════════════╡
│ 66.775     ┆ 1.6825     │
└────────────┴────────────┘


In [81]:
# The same two averages written out explicitly, one aliased expression per column.
exp = [
    pl.mean("weight").alias("avg_weight"),
    pl.mean("height").alias("avg_height"),
]
res = df.select(exp)
print(res)

shape: (1, 2)
┌────────────┬────────────┐
│ avg_weight ┆ avg_height │
│ ---        ┆ ---        │
│ f64        ┆ f64        │
╞════════════╪════════════╡
│ 66.775     ┆ 1.6825     │
└────────────┴────────────┘


In [82]:
# Selecting by dtype expands to every Float64 column; each one is multiplied
# by 1.1 and gets "*1.1" appended to its name.
exp = pl.col(pl.Float64).mul(1.1).name.suffix("*1.1")
res = df.select(exp)
print(res)

shape: (4, 2)
┌────────────┬────────────┐
│ weight*1.1 ┆ height*1.1 │
│ ---        ┆ ---        │
│ f64        ┆ f64        │
╞════════════╪════════════╡
│ 63.69      ┆ 1.716      │
│ 79.75      ┆ 1.947      │
│ 58.96      ┆ 1.815      │
│ 91.41      ┆ 1.925      │
└────────────┴────────────┘


---
# Lazy API
- lazy模式通常比eager模式更快, 更节省内存: 查询在调用 `collect()` 之前不会执行, Polars可以先对整个查询做优化 (如谓词下推、投影下推), 只读取和计算真正需要的数据

In [83]:
%%time
# eager mode
df = pl.read_csv(dir_data / "iris.csv")
df_small = df.filter(
    pl.col("sepal_length") > 5
)
df_agg = df_small.group_by("species").agg(pl.col("sepal_width").mean())
print(df_agg)

shape: (3, 2)
┌────────────┬─────────────┐
│ species    ┆ sepal_width │
│ ---        ┆ ---         │
│ str        ┆ f64         │
╞════════════╪═════════════╡
│ Virginica  ┆ 2.983673    │
│ Versicolor ┆ 2.804255    │
│ Setosa     ┆ 3.713636    │
└────────────┴─────────────┘
CPU times: user 1.5 ms, sys: 3.05 ms, total: 4.55 ms
Wall time: 1.57 ms


In [84]:
%%time
# lazy mode
q = (
    pl.scan_csv(dir_data / "iris.csv")
    .filter(pl.col("sepal_length") > 5)
    .group_by("species")
    .agg(pl.col("sepal_width").mean())
)
df = q.collect()
print(df)

shape: (3, 2)
┌────────────┬─────────────┐
│ species    ┆ sepal_width │
│ ---        ┆ ---         │
│ str        ┆ f64         │
╞════════════╪═════════════╡
│ Setosa     ┆ 3.713636    │
│ Virginica  ┆ 2.983673    │
│ Versicolor ┆ 2.804255    │
└────────────┴─────────────┘
CPU times: user 1.07 ms, sys: 1.51 ms, total: 2.59 ms
Wall time: 1.2 ms
