# The 3 Reasons Why I Have Permanently Switched From Pandas To Polars

In [10]:
from IPython.display import display, HTML
# Inject a CSS rule that sets the `.container` element to 95% width.
# NOTE(review): presumably meant to widen the notebook page so wide tables fit — confirm.
container_width_css = "<style>.container { width:95% !important; }</style>"
display(HTML(container_width_css))

## 1. The `.list` Namespace

### Pandas

In [1]:
import pandas as pd
# One row per family; every cell of "members" holds a Python list of names.
df = pd.DataFrame(
    dict(
        last_name=["Johnson", "Jackson", "Smithson"],
        members=[
            ["John", "Ron", "Con"],
            ["Jack", "Rack"],
            ["Smith", "Pith", "With", "Lith"],
        ],
        city_of_residence=["Boston", "New York City", "Dallas"],
    )
)
print(df)

  last_name                    members city_of_residence
0   Johnson           [John, Ron, Con]            Boston
1   Jackson               [Jack, Rack]     New York City
2  Smithson  [Smith, Pith, With, Lith]            Dallas


In [2]:
# The `.str` accessor indexes into each row's list element-wise, so position 0
# picks the first member of every family.
first_member = df["members"].str.get(0)
df["family_leader"] = first_member
print(df)

  last_name                    members city_of_residence family_leader
0   Johnson           [John, Ron, Con]            Boston          John
1   Jackson               [Jack, Rack]     New York City          Jack
2  Smithson  [Smith, Pith, With, Lith]            Dallas         Smith


### Polars

In [4]:
import polars as pl
# Build the frame and derive "family_leader" in a single expression chain.
df = (
    pl.DataFrame(
        dict(
            last_name=["Johnson", "Jackson", "Smithson"],
            members=[
                ["John", "Ron", "Con"],
                ["Jack", "Rack"],
                ["Smith", "Pith", "With", "Lith"],
            ],
            city_of_residence=["Boston", "New York City", "Dallas"],
        )
    )
    .with_columns(
        # The `.list` namespace operates on each row's list; get(0) takes its first item.
        [pl.col("members").list.get(0).alias("family_leader")]
    )
)
print(df)

shape: (3, 4)
┌───────────┬─────────────────────────────┬───────────────────┬───────────────┐
│ last_name ┆ members                     ┆ city_of_residence ┆ family_leader │
│ ---       ┆ ---                         ┆ ---               ┆ ---           │
│ str       ┆ list[str]                   ┆ str               ┆ str           │
╞═══════════╪═════════════════════════════╪═══════════════════╪═══════════════╡
│ Johnson   ┆ ["John", "Ron", "Con"]      ┆ Boston            ┆ John          │
│ Jackson   ┆ ["Jack", "Rack"]            ┆ New York City     ┆ Jack          │
│ Smithson  ┆ ["Smith", "Pith", … "Lith"] ┆ Dallas            ┆ Smith         │
└───────────┴─────────────────────────────┴───────────────────┴───────────────┘


### Bloated API

In [5]:
import pandas as pd

df = pd.DataFrame(dict(a=[1, 1, 1], b=[4, 5, 6]))

# Three different kinds of object, all accepted by the same `df[...]` syntax:
column_name_indexer = ["a"]  # list of labels -> selects columns
boolean_mask_indexer = df["b"].eq(5)  # boolean Series -> keeps matching rows
slice_indexer = slice(1, 3)  # slice -> selects a range of rows

for indexer in (column_name_indexer, boolean_mask_indexer, slice_indexer):
    print(df[indexer])

   a
0  1
1  1
2  1
   a  b
1  1  5
   a  b
1  1  5
2  1  6


## 2. `.scan_parquet()` and `.sink_parquet()`

In [6]:
import polars as pl
# The same two columns, once as an eager DataFrame and once as a LazyFrame.
eager_df = pl.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6]))
lazy_df = pl.LazyFrame(dict(a=[1, 2, 3], b=[4, 5, 6]))

## 3. Data-Oriented Programming

In [7]:
import pandas as pd
# Build the frame and attach the "decade" bucket in one chain.
df = pd.DataFrame(
    {
        "name": ["George", "Polly", "Golly", "Dolly"],
        "age": [3, 4, 13, 44],
    }
).assign(
    # True division gives floats; astype(int) truncates before scaling back up.
    decade=lambda frame: (frame["age"] / 10).astype(int) * 10
)

# Number of names falling into each decade bucket.
decade_counts = df.groupby("decade")["name"].count()
print(decade_counts)

decade
0     2
10    1
40    1
Name: name, dtype: int64


In [8]:
import polars as pl
# Bucket each age into its decade and count the names per bucket, written as a
# single expression chain with no intermediate frame.
decade_counts = (
    pl.DataFrame({
        "name": ["George", "Polly", "Golly", "Dolly"],
        "age": [3, 4, 13, 44]
    })
    .with_columns([
        # True division yields floats; the Int32 cast truncates them before
        # scaling back up to the start of the decade.
        ((pl.col("age") / 10).cast(pl.Int32) * 10).alias("decade")
    ])
    # `groupby` was renamed to `group_by` in Polars 0.19 and later removed,
    # so the old spelling raises AttributeError on current releases.
    .group_by("decade")
    .agg(
        pl.col("name").count().alias("count")
    )
    # Group order is not guaranteed by default; sort so the printed table is
    # reproducible.
    .sort("decade")
)
print(decade_counts)

shape: (3, 2)
┌────────┬───────┐
│ decade ┆ count │
│ ---    ┆ ---   │
│ i32    ┆ u32   │
╞════════╪═══════╡
│ 0      ┆ 2     │
│ 10     ┆ 1     │
│ 40     ┆ 1     │
└────────┴───────┘
