In [1]:
import polars as pl

working with text, temporal and ensted data types


string methods

In [2]:
corpus = pl.DataFrame(
    {
        "raw_text": [
            " Data Science is amazing",
            "Data_analysis > Data entry",
            " Python&Polars;Fast",
        ]
    }
)
corpus

raw_text
str
""" Data Science is amazing"""
"""Data_analysis > Data entry"""
""" Python&Polars;Fast"""


just me playing around 

In [3]:
(
    corpus.select(pl.col("raw_text").str.split(" ").explode().alias("token"))
    .filter(pl.col("token") != "")
    .select(pl.col("token").unique().alias("unique_token"))
    .with_row_index("id")
)

id,unique_token
u32,str
0,"""amazing"""
1,"""Data_analysis"""
2,"""Science"""
3,"""Data"""
4,"""is"""
5,"""entry"""
6,"""Python&Polars;Fast"""
7,""">"""


In [4]:
corpus = corpus.with_columns(
    processed_text=pl.col("raw_text")
    .str.strip_chars()
    .str.to_lowercase()
    .str.replace_all("_", "")
)
corpus

raw_text,processed_text
str,str
""" Data Science is amazing""","""data science is amazing"""
"""Data_analysis > Data entry""","""dataanalysis > data entry"""
""" Python&Polars;Fast""","""python&polars;fast"""


In [5]:
corpus.with_columns(
    first_5_chars=pl.col("processed_text").str.slice(0, 5),
    first_word=pl.col("processed_text").str.split("").list.get(0),
    second_word=pl.col("processed_text").str.split("").list.get(1),
)

raw_text,processed_text,first_5_chars,first_word,second_word
str,str,str,str,str
""" Data Science is amazing""","""data science is amazing""","""data ""","""d""","""a"""
"""Data_analysis > Data entry""","""dataanalysis > data entry""","""dataa""","""d""","""a"""
""" Python&Polars;Fast""","""python&polars;fast""","""pytho""","""p""","""y"""


In [6]:
corpus.with_columns(
    len_chars=pl.col("processed_text").str.len_chars(),
    len_bytes=pl.col("processed_text").str.len_bytes(),
    count_a=pl.col("processed_text").str.count_matches(
        "a",
    ),
)

raw_text,processed_text,len_chars,len_bytes,count_a
str,str,u32,u32,u32
""" Data Science is amazing""","""data science is amazing""",23,23,4
"""Data_analysis > Data entry""","""dataanalysis > data entry""",25,25,6
""" Python&Polars;Fast""","""python&polars;fast""",18,18,2


In [7]:
posts = pl.DataFrame(
    {"post": ["Loving #python and #polars!", "a boomer post without a hashtag"]}
)

In [8]:
hashtag_regex = r"#(\w+)"

posts.with_columns(hashtags=pl.col("post").str.extract_all(hashtag_regex))

post,hashtags
str,list[str]
"""Loving #python and #polars!""","[""#python"", ""#polars""]"
"""a boomer post without a hashta…",[]


categorical stuff

In [9]:
cats = pl.DataFrame(
    {"name": ["Persian cat", "Siamese cat", "Lynx", "Lynx"]},
    schema={"name": pl.Categorical},
)

cats.with_columns(name_physical=pl.col("name").to_physical())

name,name_physical
cat,u32
"""Persian cat""",0
"""Siamese cat""",1
"""Lynx""",2
"""Lynx""",2


In [10]:
more_cats = pl.DataFrame(
    {"name": ["Maine Coon Cat", "Lynx", "lynx", "Siamese Cat"]},
    schema={"name": pl.Categorical},
)

more_cats.with_columns(pl.col("name").alias("name_physical").to_physical())

name,name_physical
cat,u32
"""Maine Coon Cat""",0
"""Lynx""",1
"""lynx""",2
"""Siamese Cat""",3


In [11]:
cats.join(more_cats, on="name")

  cats.join(more_cats, on="name")


name
cat
"""Lynx"""
"""Lynx"""


In [12]:
bear_enum_dtype = pl.Enum(["Polar", "Panda", "Brown"])

bear_enum_series = pl.Series(
    ["Polar", "Panda", "Brown", "Polar"], dtype=bear_enum_dtype
)

bear_cat_series = pl.Series(
    ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical
)

In [13]:
bear_enum_series

"""Polar"""
"""Panda"""
"""Brown"""
"""Polar"""


In [14]:
bear_cat_series

"""Polar"""
"""Panda"""
"""Brown"""
"""Brown"""
"""Polar"""


In [15]:
data1 = {"name": ["kane", "bane", "shane"], "type": ["Polar", "Polar", "Brown"]}

data2 = {"avg_weight": [255, 128, 512], "type": ["Polar", "Panda", "Brown"]}

bear1 = pl.DataFrame(data1, schema={"name": pl.String, "type": bear_enum_dtype})
bear2 = pl.DataFrame(data2, schema={"avg_weight": pl.Int16, "type": bear_enum_dtype})

In [16]:
bear1

name,type
str,enum
"""kane""","""Polar"""
"""bane""","""Polar"""
"""shane""","""Brown"""


In [17]:
bear2

avg_weight,type
i16,enum
255,"""Polar"""
128,"""Panda"""
512,"""Brown"""


In [18]:
bear1.join(bear2, on="type")

name,type,avg_weight
str,enum,i16
"""kane""","""Polar""",255
"""bane""","""Polar""",255
"""shane""","""Brown""",512


time

In [19]:
pl.read_csv("data/all_stocks.csv", try_parse_dates=True)

symbol,date,open,high,low,close,adj close,volume
str,date,f64,f64,f64,f64,f64,i64
"""ASML""",1999-01-04,11.765625,12.28125,11.765625,12.140625,7.522523,1801867
"""ASML""",1999-01-05,11.859375,14.25,11.71875,13.96875,8.655257,8241600
"""ASML""",1999-01-06,14.25,17.601563,14.203125,16.875,10.456018,16400267
"""ASML""",1999-01-07,14.742188,17.8125,14.53125,16.851563,10.441495,17722133
"""ASML""",1999-01-08,16.078125,16.289063,15.023438,15.796875,9.787995,10696000
…,…,…,…,…,…,…,…
"""TSM""",2023-06-26,102.019997,103.040001,100.089996,100.110001,99.125954,8560000
"""TSM""",2023-06-27,101.150002,102.790001,100.019997,102.080002,101.076591,9732000
"""TSM""",2023-06-28,100.5,101.879997,100.220001,100.919998,99.927986,8160900
"""TSM""",2023-06-29,101.339996,101.519997,100.019997,100.639999,99.650742,7383900


In [None]:
dates = pl.DataFrame({"date_str": ["2023-12-31", "2024-02-29"]}).with_columns(
    date=pl.col("date_str").str.to_date("%Y-%m-%d")
)

In [21]:
dates

date_str,date
str,date
"""2023-12-31""",2023-12-31
"""2024-02-29""",2024-02-29


In [22]:
(dates.with_columns(formatted_date=pl.col("date").dt.to_string("%d-%m-%Y")))

date_str,date,formatted_date
str,date,str
"""2023-12-31""",2023-12-31,"""31-12-2023"""
"""2024-02-29""",2024-02-29,"""29-02-2024"""


In [None]:
pl.DataFrame(
    {
        "monday": pl.date_range(
            start=pl.date(2024, 10, 28),
            end=pl.date(2024, 12, 1),
            interval="1 w",
            eager=True,
        ),
    }
)

monday
date
2024-10-28
2024-11-04
2024-11-11
2024-11-18
2024-11-25


list in polars

In [None]:
bools = pl.DataFrame({"values": [[True, True], [False, False, True], [False]]})

In [29]:
bools.with_columns(
    all_true=pl.col("values").list.all(),
    any_true=pl.col("values").list.any(),
    last=pl.col("values").list.last(),
    first=pl.col("values").list.first(),
    # second=pl.col("values").list.get(1) throws error
)

values,all_true,any_true,last,first
list[bool],bool,bool,bool,bool
"[true, true]",True,True,True,True
"[false, false, true]",False,True,True,False
[false],False,False,False,False


In [None]:
groups = pl.DataFrame({"ages": [[12, 21], [30, 40, 50], [42, 69]]})

In [37]:
groups.with_columns(
    over_forty=pl.col("ages").list.eval(pl.element() > 40, parallel=True)
).with_columns(all_over_forty=pl.col("over_forty").list.all())

ages,over_forty,all_over_forty
list[i64],list[bool],bool
"[12, 21]","[false, false]",False
"[30, 40, 50]","[false, false, true]",False
"[42, 69]","[true, true]",True


that was cool - like purrr. so you slap em in a list and then zip right through em. nice.

In [38]:
groups.explode("ages")

ages
i64
12
21
30
40
50
42
69


In [39]:
groups.select(ages=pl.col("ages").list.explode())

ages
i64
12
21
30
40
50
42
69


arrays

In [None]:
events = pl.DataFrame(
    [
        pl.Series("locations", ["Paris", "Amsterdam", "Barcelona"], dtype=pl.String),
        pl.Series(
            "temperatures",
            [
                [23, 27, 21, 22, 24, 23, 22],
                [17, 19, 15, 22, 18, 20, 21],
                [30, 32, 28, 29, 34, 33, 31],
            ],
            dtype=pl.Array(pl.Int64, shape=7),
        ),
    ]
)

In [48]:
events

locations,temperatures
str,"array[i64, 7]"
"""Paris""","[23, 27, … 22]"
"""Amsterdam""","[17, 19, … 21]"
"""Barcelona""","[30, 32, … 31]"


In [49]:
events.with_columns(
    median=pl.col("temperatures").arr.median(),
    max=pl.col("temperatures").arr.max(),
    warmest_dow=pl.col("temperatures").arr.arg_max(),
)

locations,temperatures,median,max,warmest_dow
str,"array[i64, 7]",f64,i64,u32
"""Paris""","[23, 27, … 22]",23.0,27,1
"""Amsterdam""","[17, 19, … 21]",19.0,22,3
"""Barcelona""","[30, 32, … 31]",31.0,34,4


In [None]:
from datetime import date

orders = pl.DataFrame(
    {
        "customer_id": [2781, 6139, 5392],
        "order_details": [
            {"amount": 250.00, "date": date(2024, 1, 3), "items": 5},
            {"amount": 150.00, "date": date(2024, 1, 5), "items": 1},
            {"amount": 100.00, "date": date(2024, 1, 5), "items": 3},
        ],
    },
)

In [59]:
orders

customer_id,order_details
i64,struct[3]
2781,"{250.0,2024-01-03,5}"
6139,"{150.0,2024-01-05,1}"
5392,"{100.0,2024-01-05,3}"


In [60]:
orders.select(pl.col("order_details").struct.field("amount"))

amount
f64
250.0
150.0
100.0


In [None]:
order_details_df = orders.unnest("order_details")
order_details_df

customer_id,amount,date,items
i64,f64,date,i64
2781,250.0,2024-01-03,5
6139,150.0,2024-01-05,1
5392,100.0,2024-01-05,3


In [None]:
order_details_df.select(
    "amount",
    "date",
    "items",
    order_details=pl.struct(pl.col("amount"), pl.col("date"), pl.col("items")),
)

amount,date,items,order_details
f64,date,i64,struct[3]
250.0,2024-01-03,5,"{250.0,2024-01-03,5}"
150.0,2024-01-05,1,"{150.0,2024-01-05,1}"
100.0,2024-01-05,3,"{100.0,2024-01-05,3}"


In [None]:
basket = pl.DataFrame(
    {"fruit": ["cherry", "apple", "banana", "banana", "apple", "banana"]}
)

In [65]:
basket

fruit
str
"""cherry"""
"""apple"""
"""banana"""
"""banana"""
"""apple"""
"""banana"""


In [66]:
basket.select(pl.col("fruit").value_counts(sort=True).struct.unnest())

fruit,count
str,u32
"""banana""",3
"""apple""",2
"""cherry""",1
