### Basic operations
Arithmetic, comparisons, and other general-purpose operations on Polars expressions.

In [2]:
import polars as pl 
import numpy as np 

# Seed NumPy's global RNG so the "random" column is identical on every run.
np.random.seed(42)

# Small demo frame reused by the following cells. Note that "nrs" contains a
# null entry, which shows up as null in the results computed from it.
df = pl.DataFrame(
    dict(
        nrs=[1, 2, 3, None, 5],
        names=["foo", "ham", "spam", "egg", "spam"],
        random=np.random.rand(5),
        groups=["A", "A", "B", "A", "B"],
    )
)
print(df)

shape: (5, 4)
┌──────┬───────┬──────────┬────────┐
│ nrs  ┆ names ┆ random   ┆ groups │
│ ---  ┆ ---   ┆ ---      ┆ ---    │
│ i64  ┆ str   ┆ f64      ┆ str    │
╞══════╪═══════╪══════════╪════════╡
│ 1    ┆ foo   ┆ 0.37454  ┆ A      │
│ 2    ┆ ham   ┆ 0.950714 ┆ A      │
│ 3    ┆ spam  ┆ 0.731994 ┆ B      │
│ null ┆ egg   ┆ 0.598658 ┆ A      │
│ 5    ┆ spam  ┆ 0.156019 ┆ B      │
└──────┴───────┴──────────┴────────┘


### Basic arithmetic


In [3]:
# Arithmetic written with Python's overloaded operators on Polars expressions.
# Column references are created once and reused in every expression.
nrs_col = pl.col("nrs")
random_col = pl.col("random")

result = df.select(
    (nrs_col + 5).alias("nrs+5"),
    (nrs_col - 5).alias("nrs-5"),
    (nrs_col * random_col).alias("nrs*random"),
    (nrs_col / random_col).alias("nrs/random"),
    (nrs_col**2).alias("nrs ** 2"),
    (nrs_col % 3).alias("nrs % 3"),
)
print(result)

shape: (5, 6)
┌───────┬───────┬────────────┬────────────┬──────────┬─────────┐
│ nrs+5 ┆ nrs-5 ┆ nrs*random ┆ nrs/random ┆ nrs ** 2 ┆ nrs % 3 │
│ ---   ┆ ---   ┆ ---        ┆ ---        ┆ ---      ┆ ---     │
│ i64   ┆ i64   ┆ f64        ┆ f64        ┆ i64      ┆ i64     │
╞═══════╪═══════╪════════════╪════════════╪══════════╪═════════╡
│ 6     ┆ -4    ┆ 0.37454    ┆ 2.669941   ┆ 1        ┆ 1       │
│ 7     ┆ -3    ┆ 1.901429   ┆ 2.103681   ┆ 4        ┆ 2       │
│ 8     ┆ -2    ┆ 2.195982   ┆ 4.098395   ┆ 9        ┆ 0       │
│ null  ┆ null  ┆ null       ┆ null       ┆ null     ┆ null    │
│ 10    ┆ 0     ┆ 0.780093   ┆ 32.047453  ┆ 25       ┆ 2       │
└───────┴───────┴────────────┴────────────┴──────────┴─────────┘


In [4]:
# The same arithmetic as above, spelled with the named expression methods
# (add, sub, mul, truediv, pow, mod) instead of the overloaded operators.
nrs_col = pl.col("nrs")
random_col = pl.col("random")

result_named_operators = df.select(
    nrs_col.add(5).alias("nrs + 5"),
    nrs_col.sub(5).alias("nrs - 5"),
    nrs_col.mul(random_col).alias("nrs * random"),
    nrs_col.truediv(random_col).alias("nrs / random"),
    nrs_col.pow(2).alias("nrs ** 2"),
    nrs_col.mod(3).alias("nrs % 3"),
)
print(result_named_operators)

shape: (5, 6)
┌─────────┬─────────┬──────────────┬──────────────┬──────────┬─────────┐
│ nrs + 5 ┆ nrs - 5 ┆ nrs * random ┆ nrs / random ┆ nrs ** 2 ┆ nrs % 3 │
│ ---     ┆ ---     ┆ ---          ┆ ---          ┆ ---      ┆ ---     │
│ i64     ┆ i64     ┆ f64          ┆ f64          ┆ i64      ┆ i64     │
╞═════════╪═════════╪══════════════╪══════════════╪══════════╪═════════╡
│ 6       ┆ -4      ┆ 0.37454      ┆ 2.669941     ┆ 1        ┆ 1       │
│ 7       ┆ -3      ┆ 1.901429     ┆ 2.103681     ┆ 4        ┆ 2       │
│ 8       ┆ -2      ┆ 2.195982     ┆ 4.098395     ┆ 9        ┆ 0       │
│ null    ┆ null    ┆ null         ┆ null         ┆ null     ┆ null    │
│ 10      ┆ 0       ┆ 0.780093     ┆ 32.047453    ┆ 25       ┆ 2       │
└─────────┴─────────┴──────────────┴──────────────┴──────────┴─────────┘


### Comparisons
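Comparisons can be written either with the overloaded operators (`>`, `>=`, `<`, `<=`, `!=`, `==`) or with the equivalent named functions (`.gt`, `.ge`, `.lt`, `.le`, `.ne`, `.eq`).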

In [5]:
# Comparisons via overloaded operators; each trailing comment names the
# equivalent expression method.
nrs_col = pl.col("nrs")
random_col = pl.col("random")

result = df.select(
    (nrs_col > 1).alias("nrs > 1"),  # .gt
    (nrs_col >= 3).alias("nrs >= 3"),  # .ge
    (random_col < 0.2).alias("random < .2"),  # .lt
    (random_col <= 0.5).alias("random <= .5"),  # .le
    (nrs_col != 1).alias("nrs != 1"),  # .ne
    (nrs_col == 1).alias("nrs == 1"),  # .eq
)
print(result)

shape: (5, 6)
┌─────────┬──────────┬─────────────┬──────────────┬──────────┬──────────┐
│ nrs > 1 ┆ nrs >= 3 ┆ random < .2 ┆ random <= .5 ┆ nrs != 1 ┆ nrs == 1 │
│ ---     ┆ ---      ┆ ---         ┆ ---          ┆ ---      ┆ ---      │
│ bool    ┆ bool     ┆ bool        ┆ bool         ┆ bool     ┆ bool     │
╞═════════╪══════════╪═════════════╪══════════════╪══════════╪══════════╡
│ false   ┆ false    ┆ false       ┆ true         ┆ false    ┆ true     │
│ true    ┆ false    ┆ false       ┆ false        ┆ true     ┆ false    │
│ true    ┆ true     ┆ false       ┆ false        ┆ true     ┆ false    │
│ null    ┆ null     ┆ false       ┆ false        ┆ null     ┆ null     │
│ true    ┆ true     ┆ true        ┆ true         ┆ true     ┆ false    │
└─────────┴──────────┴─────────────┴──────────────┴──────────┴──────────┘


### Boolean and bitwise operations


In [10]:
# Boolean logic with the overloaded operators: & (and), | (or), ~ (not).
# The individual predicates are named so both variants below can share them.
nrs_present = ~pl.col("nrs").is_null()
in_group_a = pl.col("groups") == "A"
in_group_b = pl.col("groups") == "B"
random_below_half = pl.col("random") < 0.5

result = df.select(
    (nrs_present & in_group_a).alias("number not null and group A"),
    (random_below_half | in_group_b).alias("random < 0.5 or group B"),
)

print(result)

# Same predicates, this time combined with the named functions
# `not_`, `and_`, and `or_`.
result2 = df.select(
    pl.col("nrs")
    .is_null()
    .not_()
    .and_(in_group_a)
    .alias("number not null and group A"),
    random_below_half.or_(in_group_b).alias("random < 0.5 or group B"),
)
# Check that both spellings yield the same frame.
print(result.equals(result2))

shape: (5, 2)
┌─────────────────────────────┬─────────────────────────┐
│ number not null and group A ┆ random < 0.5 or group B │
│ ---                         ┆ ---                     │
│ bool                        ┆ bool                    │
╞═════════════════════════════╪═════════════════════════╡
│ true                        ┆ true                    │
│ true                        ┆ false                   │
│ false                       ┆ true                    │
│ false                       ┆ false                   │
│ false                       ┆ true                    │
└─────────────────────────────┴─────────────────────────┘
True


In [11]:
# Bitwise operators on the integer column: the same &, |, ~ and ^ operators
# act bit by bit when applied to integers rather than Booleans.
nrs_col = pl.col("nrs")

result = df.select(
    nrs_col,
    (nrs_col & 6).alias("nrs & 6"),
    (nrs_col | 6).alias("nrs | 6"),
    (~nrs_col).alias("not nrs"),
    (nrs_col ^ 6).alias("nrs ^ 6"),
)

print(result)

shape: (5, 5)
┌──────┬─────────┬─────────┬─────────┬─────────┐
│ nrs  ┆ nrs & 6 ┆ nrs | 6 ┆ not nrs ┆ nrs ^ 6 │
│ ---  ┆ ---     ┆ ---     ┆ ---     ┆ ---     │
│ i64  ┆ i64     ┆ i64     ┆ i64     ┆ i64     │
╞══════╪═════════╪═════════╪═════════╪═════════╡
│ 1    ┆ 0       ┆ 7       ┆ -2      ┆ 7       │
│ 2    ┆ 2       ┆ 6       ┆ -3      ┆ 4       │
│ 3    ┆ 2       ┆ 7       ┆ -4      ┆ 5       │
│ null ┆ null    ┆ null    ┆ null    ┆ null    │
│ 5    ┆ 4       ┆ 7       ┆ -6      ┆ 3       │
└──────┴─────────┴─────────┴─────────┴─────────┘


### Counting unique values
- `n_unique`: counts the exact number of unique values; this can be slow for large datasets.
- `approx_n_unique`: for large datasets, returns an approximation of the number of unique values using the HyperLogLog++ algorithm.
- `value_counts`: gives more information, returning each unique value together with its count.



In [16]:
# 100_000 random integers drawn from [0, 100_000), so repeated values occur.
long_df = pl.DataFrame(
    {"numbers": np.random.randint(low=0, high=100_000, size=100_000)}
)
numbers_col = pl.col("numbers")

result = long_df.select(
    numbers_col.n_unique().alias("n_unique"),
    numbers_col.approx_n_unique().alias("approx_n_unique"),
    # value_counts yields one struct per unique value.
    numbers_col.value_counts().alias("value counts"),
)
print(result)

shape: (63_152, 3)
┌──────────┬─────────────────┬──────────────┐
│ n_unique ┆ approx_n_unique ┆ value counts │
│ ---      ┆ ---             ┆ ---          │
│ u32      ┆ u32             ┆ struct[2]    │
╞══════════╪═════════════════╪══════════════╡
│ 63152    ┆ 63721           ┆ {31517,2}    │
│ 63152    ┆ 63721           ┆ {90703,4}    │
│ 63152    ┆ 63721           ┆ {51102,1}    │
│ 63152    ┆ 63721           ┆ {9860,1}     │
│ 63152    ┆ 63721           ┆ {60129,1}    │
│ …        ┆ …               ┆ …            │
│ 63152    ┆ 63721           ┆ {96000,4}    │
│ 63152    ┆ 63721           ┆ {17546,2}    │
│ 63152    ┆ 63721           ┆ {79785,4}    │
│ 63152    ┆ 63721           ┆ {95857,1}    │
│ 63152    ┆ 63721           ┆ {84842,1}    │
└──────────┴─────────────────┴──────────────┘


### Getting a series of the unique values, or of their counts


In [17]:
# Unique names (kept in their original order) next to a column of counts
# for the unique values.
names_col = pl.col("names")

result = df.select(
    names_col.unique(maintain_order=True).alias("unique"),
    names_col.unique_counts().alias("unique_counts"),
)

print(result)

shape: (4, 2)
┌────────┬───────────────┐
│ unique ┆ unique_counts │
│ ---    ┆ ---           │
│ str    ┆ u32           │
╞════════╪═══════════════╡
│ foo    ┆ 1             │
│ ham    ┆ 1             │
│ spam   ┆ 2             │
│ egg    ┆ 1             │
└────────┴───────────────┘


### conditionals 
Ternary operator: `when`, `then`, `otherwise` (optional)

In [18]:
# One step of the Collatz rule expressed with when/then/otherwise:
# odd values map to 3 * n + 1, everything else to n // 2.
nrs_col = pl.col("nrs")
is_odd = nrs_col % 2 == 1

collatz_step = (
    pl.when(is_odd)
    .then(3 * nrs_col + 1)
    .otherwise(nrs_col // 2)
    .alias("Collatz")
)

result = df.select(nrs_col, collatz_step)
print(result)

shape: (5, 2)
┌──────┬─────────┐
│ nrs  ┆ Collatz │
│ ---  ┆ ---     │
│ i64  ┆ i64     │
╞══════╪═════════╡
│ 1    ┆ 4       │
│ 2    ┆ 1       │
│ 3    ┆ 10      │
│ null ┆ null    │
│ 5    ┆ 16      │
└──────┴─────────┘
