In [1]:
!pip install polars
import polars as pl

Collecting polars
  Obtaining dependency information for polars from https://files.pythonhosted.org/packages/af/49/476e176a703f84b685121396c3f1eb01ec1418f18e1fa357ac99cc67924f/polars-0.20.13-cp38-abi3-macosx_10_12_x86_64.whl.metadata
  Downloading polars-0.20.13-cp38-abi3-macosx_10_12_x86_64.whl.metadata (15 kB)
Downloading polars-0.20.13-cp38-abi3-macosx_10_12_x86_64.whl (24.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.8/24.8 MB 8.3 MB/s eta 0:00:00
Installing collected packages: polars
Successfully installed polars-0.20.13


# Main Verbs
For a single dataset:

* Column selection: `select()` + `drop()`
* Creating or altering columns: `with_columns()`
* Subsetting rows: `filter()`
* Ordering rows: `sort()`
* Computing group-level summary metrics: `group_by()` + `agg()`

For multiple datasets:

* Merging on a shared key: `join(how = '*')`; can optionally check the join cardinality with `validate` and rename clashing columns with an appended `suffix`.
* Stacking datasets of the same structure: `concat()`
* Reshaping rows into columns (long to wide): `pivot()`

# Advanced Wrangling
`polars` dataframes will print the shape, column names and datatypes.

In [3]:
# build a small toy polars dataframe: three integer columns, four rows
df = pl.DataFrame(
    data={
        'a': [1, 1, 2, 2],
        'b': [3, 4, 5, 6],
        'c': [7, 8, 9, 0],
    }
)

# preview the first rows
df.head()

a,b,c
i64,i64,i64
1,3,7
1,4,8
2,5,9
2,6,0


## Horizontal Functions

In [8]:
# row-wise sum of b and c, attached as a new column
df.with_columns(
    pl.sum_horizontal(pl.col('b'), pl.col('c')).alias('b_plus_c')
)

a,b,c,b_plus_c
i64,i64,i64,i64
1,3,7,10
1,4,8,12
2,5,9,14
2,6,0,6


## Column Selectors

In [11]:
import polars.selectors as cs

In [12]:
# convert column a from integer to string; df is rebound to the converted frame
df = df.with_columns(a = pl.col('a').cast(pl.Utf8))

In [14]:
# pick columns whose name starts with 'b' OR whose datatype is string
(df
 .select(cs.starts_with('b') | cs.string())
)

b,a
i64,str
3,"""1"""
4,"""1"""
5,"""2"""
6,"""2"""


In [15]:
# ~ inverts a selector: keep every column that is NOT a string
(df
 .select(~cs.string())
)

b,c
i64,i64
3,7
4,8
5,9
6,0


## Using `with_columns`

In [16]:
# add one to every integer column; results are appended as new columns named <original>_plus1
df.with_columns(cs.integer().add(1).name.suffix("_plus1"))

a,b,c,b_plus1,c_plus1
str,i64,i64,i64,i64
"""1""",3,7,4,8
"""1""",4,8,5,9
"""2""",5,9,6,10
"""2""",6,0,7,1


In [17]:
# row-wise total across all integer columns, chosen with a selector
df.with_columns(
    pl.sum_horizontal(cs.integer()).alias('row_total')
)

a,b,c,row_total
str,i64,i64,i64
"""1""",3,7,10
"""1""",4,8,12
"""2""",5,9,14
"""2""",6,0,6


## Using `group_by` and `agg`

In [19]:
# column selectors can be passed as inputs where columns are accepted
# group by string columns, sum all integer columns
# maintain_order=True keeps groups in order of first appearance, so the
# displayed row order is reproducible across runs
df.group_by(cs.string(), maintain_order=True).agg(cs.integer().sum())

a,b,c
str,i64,i64
"""1""",7,15
"""2""",11,9


## Window Functions

In [20]:
# per-group minimum of b (groups defined by a), broadcast back onto every original row
df.with_columns(
    pl.col('b').min().over('a').alias('min_b')
)

a,b,c,min_b
str,i64,i64,i64
"""1""",3,7,3
"""1""",4,8,3
"""2""",5,9,5
"""2""",6,0,5


In [22]:
# count the odd values of b within each group of a:
# a when/then/otherwise flag (1 if b is odd, else 0) chained into a window sum
df.with_columns(
    n_b_odd = pl.when((pl.col('b') % 2) != 0)
                .then(1)
                .otherwise(0)
                .sum().over('a')
)

a,b,c,n_b_odd
str,i64,i64,i32
"""1""",3,7,1
"""1""",4,8,1
"""2""",5,9,1
"""2""",6,0,1


## List Columns & Nested Frames

In [24]:
# struct columns can be created with pl.struct(): the selected columns are
# packed into a single struct-typed column (one struct per row)
df.with_columns(list_col = pl.struct(cs.integer()))

a,b,c,list_col
str,i64,i64,struct[2]
"""1""",3,7,"{3,7}"
"""1""",4,8,"{4,8}"
"""2""",5,9,"{5,9}"
"""2""",6,0,"{6,0}"


In [25]:
# collect each group's integer columns into a list of structs: one miniature dataset per value of a
# (group order in the result is not guaranteed unless maintain_order=True is passed)
(df
 .group_by('a')
 .agg(pl.struct(cs.integer()).alias('list_col'))
)

a,list_col
str,list[struct[2]]
"""2""","[{5,9}, {6,0}]"
"""1""","[{3,7}, {4,8}]"


In [27]:
# group by the columns named in cols; pack every column NOT named in cols into a list of structs

cols = ['a']

df.group_by(cs.by_name(cols)).agg(
    pl.struct(~cs.by_name(cols)).alias('list_col')
)

a,list_col
str,list[struct[2]]
"""2""","[{5,9}, {6,0}]"
"""1""","[{3,7}, {4,8}]"


In [29]:
# unnesting: first build the nested frame (one list of structs per value of a)
df_nested = (df
 .group_by('a')
 .agg(pl.struct(cs.integer()).alias('list_col'))
)

# explode gives one row per struct; unnest spreads the struct fields back into columns
(df_nested
 .explode('list_col')
 .unnest('list_col')
)

a,b,c
str,i64,i64
"""2""",5,9
"""2""",6,0
"""1""",3,7
"""1""",4,8
