In [1]:
import polars as pl

In [2]:
grades_wide = pl.DataFrame(
    {
        "student": ["Jeroen", "Thijs", "Ritchie"],
        "math": [85, 78, 92],
        "science": [90, 82, 85],
        "history": [88, 80, 97],
    }
)

In [3]:
grades_wide

student,math,science,history
str,i64,i64,i64
"""Jeroen""",85,90,88
"""Thijs""",78,82,80
"""Ritchie""",92,85,97


In [None]:
grades_long = grades_wide.unpivot(
    index="student", variable_name="subject", value_name="grade")

In [15]:
grades_long

student,subject,grade
str,str,i64
"""Jeroen""","""math""",85
"""Thijs""","""math""",78
"""Ritchie""","""math""",92
"""Jeroen""","""science""",90
"""Thijs""","""science""",82
"""Ritchie""","""science""",85
"""Jeroen""","""history""",88
"""Thijs""","""history""",80
"""Ritchie""","""history""",97


In [None]:
grades_long.pivot(index="student",  on="subject", values="grade")

student,math,science,history
str,i64,i64,i64
"""Jeroen""",85,90,88
"""Thijs""",78,82,80
"""Ritchie""",92,85,97


In [None]:
# setup multi grades per person and subject
multiple_grades = (
    grades_long
    .with_columns(
        student=pl.when(pl.col("student") == "Ritchie")
        .then(pl.lit("Jeroen"))
        .otherwise(pl.col("student"))
    )
    # .filter(pl.col("student") != "Ritchie")
    # .sort(pl.col("student"), pl.col("subject"))
    # .collect()
)

multiple_grades = (
    pl.concat([multiple_grades, grades_long])
    .with_columns(
        student=pl.when(pl.col("student") == "Ritchie")
        .then(pl.lit("Thijs"))
        .otherwise(pl.col("student"))
    )
)

multiple_grades = pl.concat(
    [
        multiple_grades.with_columns(pl.col("grade").mul(1.23).alias("grade")),
        multiple_grades.with_columns(pl.col("grade").cast(pl.Float64)),
        multiple_grades.with_columns(pl.col("grade").mul(0.85).alias("grade")),
    ]
)

multiple_grades

student,subject,grade
str,str,f64
"""Jeroen""","""math""",104.55
"""Thijs""","""math""",95.94
"""Jeroen""","""math""",113.16
"""Jeroen""","""science""",110.7
"""Thijs""","""science""",100.86
…,…,…
"""Thijs""","""science""",69.7
"""Thijs""","""science""",72.25
"""Jeroen""","""history""",74.8
"""Thijs""","""history""",68.0


In [None]:
# pivot and calc avg
multiple_grades.pivot(
    index="student", on="subject", values="grade", aggregate_function="mean"
)

student,math,science,history
str,f64,f64,f64
"""Jeroen""",89.662222,90.688889,93.426667
"""Thijs""",84.871111,85.213333,87.951111


In [None]:
multiple_grades.pivot(
    index="student", on="subject", values="grade", aggregate_function="median"
)

student,math,science,history
str,f64,f64,f64
"""Jeroen""",85.0,90.0,88.0
"""Thijs""",78.2,82.0,82.45


In [None]:
multiple_grades.pivot(
    index="student", on="subject", values="grade", aggregate_function=pl.element().max() - pl.element().min()
)

student,math,science,history
str,f64,f64,f64
"""Jeroen""",40.91,38.45,44.51
"""Thijs""",46.86,34.85,51.31


In [None]:
# pivoting works for dataframe because you gotta know the cols and data
# hwoever there is a work around

lf = pl.LazyFrame(
    {
        "col1": ["a", "a", "a", "b", "b", "b"],
        "col2": ["x", "x", "x", "y", "y", "y"],
        "col3": [6, 7, 3, 2, 5, 7],
    }
)

index = pl.col("col1")
on = pl.col("col2")
values = pl.col("col3")
unique_column_values = ["x", "y"]


def aggregate_function(col): return col.tanh().mean()


lf.group_by(index).agg(
    aggregate_function(values.filter(on == value)).alias(value)
    for value in unique_column_values
).collect()

col1,x,y
str,f64,f64
"""b""",,0.987978
"""a""",0.998347,


In [44]:
multiple_grades

student,subject,grade
str,str,f64
"""Jeroen""","""math""",104.55
"""Thijs""","""math""",95.94
"""Jeroen""","""math""",113.16
"""Jeroen""","""science""",110.7
"""Thijs""","""science""",100.86
…,…,…
"""Thijs""","""science""",69.7
"""Thijs""","""science""",72.25
"""Jeroen""","""history""",74.8
"""Thijs""","""history""",68.0


In [None]:
# transpose
grades_wide = pl.DataFrame(
    {
        "student": ["Jeroen", "Thijs", "Ritchie"],
        "math": [85, 78, 92],
        "science": [90, 82, 85],
        "history": [88, 80, 97],
    }
)
grades_wide

student,math,science,history
str,i64,i64,i64
"""Jeroen""",85,90,88
"""Thijs""",78,82,80
"""Ritchie""",92,85,97


In [49]:
grades_wide.transpose()

column_0,column_1,column_2
str,str,str
"""Jeroen""","""Thijs""","""Ritchie"""
"""85""","""78""","""92"""
"""90""","""82""","""85"""
"""88""","""80""","""97"""


In [None]:
# originally they were using a big df and they grabbed these values programmatically
# but I did not want to type out the entire thing

report_columns = ["report_1", "report_2", "report_3"]

grades_wide.transpose(
    include_header=True,
    header_name="original_header",
    column_names=report_columns,
)

original_header,report_1,report_2,report_3
str,str,str,str
"""student""","""Jeroen""","""Thijs""","""Ritchie"""
"""math""","""85""","""78""","""92"""
"""science""","""90""","""82""","""85"""
"""history""","""88""","""80""","""97"""


In [53]:
# explosions!

grades_nested = pl.DataFrame(
    {
        "student": ["Jeroen", "Thijs", "Ritchie"],
        "math": [
            [85, 90, 88],
            [78, 82, 80],
            [92, 85, 97],
        ],
    }
)
grades_nested

student,math
str,list[i64]
"""Jeroen""","[85, 90, 88]"
"""Thijs""","[78, 82, 80]"
"""Ritchie""","[92, 85, 97]"


In [54]:
grades_nested.explode("math")

student,math
str,i64
"""Jeroen""",85
"""Jeroen""",90
"""Jeroen""",88
"""Thijs""",78
"""Thijs""",82
"""Thijs""",80
"""Ritchie""",92
"""Ritchie""",85
"""Ritchie""",97


In [None]:
# todo  check out partition