# 3. Data Science

In [None]:
import polars as pl

# Raise the string display length to 100 characters so long string values
# are not cut short when frames are rendered in the notebook.
pl.Config.set_fmt_str_lengths(100)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme to every matplotlib figure drawn below.
sns.set_theme()

In [None]:
from kedro.config import ConfigLoader
from kedro.io import DataCatalog

In [None]:
# Build the Kedro data catalog by hand (outside a Kedro session):
# 1. point a config loader at "conf" (presumably the project's configuration
#    directory, relative to the working directory -- confirm),
conf_loader = ConfigLoader("conf")
# 2. read the catalog definition matching "catalog.yml",
conf_catalog = conf_loader.get("catalog.yml")
# 3. instantiate the catalog described by that configuration.
catalog = DataCatalog.from_config(conf_catalog)

In [None]:
# Load the dataset registered in the catalog under this name; the cells
# below use it through the polars DataFrame API.
df = catalog.load("labour_force_survey_2018_report_filtered")
# Preview the first rows as the cell output.
df.head()

In [None]:
# Median and standard deviation of AGE_YEARS within each SEX group.
df.groupby("SEX").agg(
    [
        pl.median("AGE_YEARS").alias("median_age"),
        pl.std("AGE_YEARS").alias("std_age"),
    ]
)

In [None]:
# Number of rows per marital status, most frequent first. `sort=True` gives
# the ordering the former commented-out `.sort(...)` call was aiming for and
# makes the displayed order reproducible between runs.
df["MARITAL_STATUS"].value_counts(sort=True)

In [None]:
# Per-SEX averages. The first two keep their source column names; the
# MORE_THAN_ONE_JOB mean is scaled by 100 and relabelled as a percentage
# (presumably the column is a boolean / 0-1 flag -- confirm).
df.groupby("SEX").agg(
    [
        pl.mean("GROSS_INCOME_MONTH"),
        pl.mean("BELONG_TO_TRADE_UNION"),
        (pl.mean("MORE_THAN_ONE_JOB") * 100.0).alias("MORE_THAN_ONE_JOB_PCT (%)"),
    ]
)

In [None]:
# Mean monthly gross income by marital status (rows) and sex (columns).
# Rows without an income value are dropped before pivoting. The final step
# expects the pivot to yield columns named "Male" and "Female".
(
    df.drop_nulls(subset="GROSS_INCOME_MONTH")
    .pivot(
        values="GROSS_INCOME_MONTH",
        index="MARITAL_STATUS",
        columns="SEX",
        aggregate_function="mean",
    )
    # Keep the row label untouched and round every other column to 2 decimals.
    .select(
        [
            pl.col("MARITAL_STATUS"),
            pl.exclude("MARITAL_STATUS").round(2),
        ]
    )
    # Ratio of male to female mean income, rounded to 1 decimal.
    .with_columns(
        (pl.col("Male") / pl.col("Female")).round(1).alias("Diff (x)")
    )
)

In [None]:
import seaborn.objects as so

In [None]:
# Monthly gross income (x) per marital status (y): one dot per row, coloured
# by sex and sized by age. Dodge separates the sexes within each status and
# Jitter spreads overlapping dots.
(
    so.Plot(
        data=df,
        x="GROSS_INCOME_MONTH",
        y="MARITAL_STATUS",
        color="SEX",
        pointsize="AGE_YEARS",
    )
    .add(
        so.Dots(),
        so.Dodge(),
        so.Jitter(0.3),
    )
)