In [None]:
import polars as pl
import pandas as pd
import numpy as np

from sklearn.datasets import load_iris

from datetime import datetime

# series

In [None]:
# Build a named Series with an explicit 64-bit integer dtype and print it.
s = pl.Series(
    name="series",
    values=[1, 2, 3, 4],
    dtype=pl.Int64,
)
print(s)

shape: (4,)
Series: 'series' [i64]
[
	1
	2
	3
	4
]


# dataframe

In [None]:
# Small demo frame with one integer, one datetime and one float column.
# The dates are the first five days of January 2022 (midnight).
df = pl.DataFrame(
    {
        "integer": list(range(1, 6)),
        "date": [datetime(2022, 1, day) for day in range(1, 6)],
        "float": [4.0, 5.0, 6.0, 7.0, 8.0],
    }
)

print(df)

shape: (5, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
│ 4       ┆ 2022-01-04 00:00:00 ┆ 7.0   │
│ 5       ┆ 2022-01-05 00:00:00 ┆ 8.0   │
└─────────┴─────────────────────┴───────┘


In [None]:
# Demo frame used by the select / filter / group_by examples.
# "nrs" and "names" each contain one missing value (None becomes null in polars).
# The "random" column is drawn from a seeded generator so that re-running the
# notebook from a fresh kernel reproduces the same values.
df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", None],
        "random": np.random.default_rng(42).random(5),
        "groups": ["A", "A", "B", "C", "B"],
    }
)
print(df)

shape: (5, 4)
┌──────┬───────┬──────────┬────────┐
│ nrs  ┆ names ┆ random   ┆ groups │
│ ---  ┆ ---   ┆ ---      ┆ ---    │
│ i64  ┆ str   ┆ f64      ┆ str    │
╞══════╪═══════╪══════════╪════════╡
│ 1    ┆ foo   ┆ 0.021216 ┆ A      │
│ 2    ┆ ham   ┆ 0.060035 ┆ A      │
│ 3    ┆ spam  ┆ 0.260194 ┆ B      │
│ null ┆ egg   ┆ 0.195607 ┆ C      │
│ 5    ┆ null  ┆ 0.474002 ┆ B      │
└──────┴───────┴──────────┴────────┘


# Contesti

## select

La funzione `with_columns` è simile a `select`: la differenza è che con `with_columns` le colonne originali rimangono e quelle nuove vengono aggiunte.


In [None]:
# Apply expressions to columns: each expression produces one output column.
sorted_names = pl.col("names").sort()
random_cumsum = pl.col("random").cum_sum().alias("random_cumsum")
df.select(sorted_names, random_cumsum)

names,random_cumsum
str,f64
,0.021216
"""egg""",0.081251
"""foo""",0.341445
"""ham""",0.537052
"""spam""",1.011054


## filter

In [None]:
# Keep only the rows whose "random" value is strictly below 0.5.
is_below_half = pl.col("random") < 0.5
df.filter(is_below_half)

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.021216,"""A"""
2.0,"""ham""",0.060035,"""A"""
3.0,"""spam""",0.260194,"""B"""
,"""egg""",0.195607,"""C"""
5.0,,0.474002,"""B"""


## groupby

In [None]:
# Aggregations evaluated once per distinct value of "groups".
nrs_sum = pl.sum("nrs")  # sum nrs by groups
member_count = pl.col("random").count().alias("count")  # count group members
# sum random where name != null; the output column is named "random_sum"
random_sum_named = (
    pl.col("random")
    .filter(pl.col("names").is_not_null())
    .sum()
    .name.suffix("_sum")
)
# names of each group, in reverse order, collected into a list
reversed_names = pl.col("names").reverse().alias("reversed names")

df.group_by("groups").agg(
    nrs_sum,
    member_count,
    random_sum_named,
    reversed_names,
)

groups,nrs,count,random_sum,reversed names
str,i64,u32,f64,list[str]
"""A""",3,2,0.081251,"[""ham"", ""foo""]"
"""B""",8,2,0.260194,"[null, ""spam""]"
"""C""",0,1,0.195607,"[""egg""]"


# Espressioni

In [None]:
# No DataFrame is involved here: chaining methods on pl.col only builds an
# expression object, whose type is displayed.
expr = pl.col("foo").sort().head(2)
type(expr)

polars.expr.expr.Expr

# Lazy vs Eager

In [None]:
# Load the Iris dataset and keep the pandas frame from the returned bunch.
iris_bunch = load_iris(as_frame=True)
df_iris = iris_bunch.frame

In [None]:
# Replace the column labels with snake_case names.
iris_column_names = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
    "target",
]
df_iris.columns = iris_column_names

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target'],
      dtype='object')

In [None]:
# Create the Iris CSV file read by the eager, lazy and streaming examples.

df_iris = load_iris(as_frame=True)["frame"]

# Rename the columns to snake_case; the last column is called "species".
df_iris.columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]

# index=False: do not write the pandas row index, which would otherwise end up
# in the CSV as an extra, unnamed first column.
df_iris.to_csv("iris.csv", index=False)


## Eager

In [None]:
# Eager pipeline: each statement below produces a DataFrame right away.
has_long_sepal = pl.col("sepal_length") > 5
mean_sepal_width = pl.col("sepal_width").mean()

# read the csv file
df = pl.read_csv("iris.csv")
# keep only the records with sepal_length > 5
df_small = df.filter(has_long_sepal)
# group by species and compute the mean sepal width of each species
df_agg = df_small.group_by("species").agg(mean_sepal_width)
print(df_agg)

shape: (3, 2)
┌─────────┬─────────────┐
│ species ┆ sepal_width │
│ ---     ┆ ---         │
│ i64     ┆ f64         │
╞═════════╪═════════════╡
│ 2       ┆ 2.983673    │
│ 0       ┆ 3.713636    │
│ 1       ┆ 2.804255    │
└─────────┴─────────────┘


## Lazy

In [None]:
# Lazy pipeline: scan_csv plus the chained calls only describe the query;
# the result is produced when collect() is called.
lazy_iris = pl.scan_csv("iris.csv")
has_long_sepal = pl.col("sepal_length") > 5
mean_sepal_width = pl.col("sepal_width").mean()

q = lazy_iris.filter(has_long_sepal).group_by("species").agg(mean_sepal_width)

df = q.collect()
print(df)

shape: (3, 2)
┌─────────┬─────────────┐
│ species ┆ sepal_width │
│ ---     ┆ ---         │
│ i64     ┆ f64         │
╞═════════╪═════════════╡
│ 0       ┆ 3.713636    │
│ 2       ┆ 2.983673    │
│ 1       ┆ 2.804255    │
└─────────┴─────────────┘


# Streaming API

In [None]:
# Same lazy query as in the previous section, collected with streaming=True.
lazy_iris = pl.scan_csv("iris.csv")
has_long_sepal = pl.col("sepal_length") > 5
mean_sepal_width = pl.col("sepal_width").mean()

q = lazy_iris.filter(has_long_sepal).group_by("species").agg(mean_sepal_width)

df = q.collect(streaming=True)
print(df)

shape: (3, 2)
┌─────────┬─────────────┐
│ species ┆ sepal_width │
│ ---     ┆ ---         │
│ i64     ┆ f64         │
╞═════════╪═════════════╡
│ 1       ┆ 2.804255    │
│ 0       ┆ 3.713636    │
│ 2       ┆ 2.983673    │
└─────────┴─────────────┘


In [None]:
# Print the query plan as text, requesting the streaming variant of the plan.
streaming_plan = q.explain(streaming=True)
print(streaming_plan)

--- STREAMING
AGGREGATE
	[col("sepal_width").mean()] BY [col("species")] FROM

    Csv SCAN iris.csv
    PROJECT 3/6 COLUMNS
    SELECTION: [(col("sepal_length")) > (5.0)]  --- END STREAMING

  DF []; PROJECT */0 COLUMNS; SELECTION: "None"


  print(q.explain(streaming=True))
