# 2. Data filtering

In [None]:
import polars as pl

from kedro.config import ConfigLoader
from kedro.io import DataCatalog

In [None]:
# Build the Kedro data catalog for this notebook:
# create a config loader pointed at "conf", fetch the "catalog.yml"
# configuration through it, and construct the DataCatalog from that config.
conf_loader = ConfigLoader("conf")
conf_catalog = conf_loader.get("catalog.yml")
catalog = DataCatalog.from_config(conf_catalog)

In [None]:
# Load the dataset registered in the catalog as
# "labour_force_survey_2018_report" and preview its first rows.
df_full = catalog.load("labour_force_survey_2018_report")
df_full.head()

In [None]:
# Columns to keep from the full report. Apart from GROSS_INCOME_MONTH, which
# is cast to a float further down, they are carried over unchanged.
interesting_columns = [
    "HOUSEHOLD_ID",
    "REGION",
    "PERSON_NAME",
    "RELATIONSHIP",
    "AGE_YEARS",
    "SEX",
    "MARITAL_STATUS",
    "BIRTH_PLACE_REGION",
    "CITIZENSHIP_MAJOR",
    "CITIZENSHIP_MINOR",
    "SCHOOLING_STATUS",
    "EMPLOYMENT_STATUS",
    "EMPLOYER_TYPE",
    "GROSS_INCOME_MONTH",
    "ENT_NUM_EMPLOYEES",
]
# Columns that are also kept, and later mapped from "Yes"/"No" answers to
# booleans in the casting step.
boolean_columns = [
    "DID_BUSINESS_YN",
    "DID_SMALL_BUSINESS_YN",
    "MORE_THAN_ONE_JOB",
    "BELONG_TO_TRADE_UNION",
]

In [None]:
# Keep only the columns of interest: the plain columns first, followed by
# the Yes/No columns, in the order the two lists define them.
selected_columns = [*interesting_columns, *boolean_columns]
df_filtered = df_full.select(selected_columns)
df_filtered.head()

In [None]:
# Income answers that carry no figure are replaced by nulls; every other
# value is passed through unchanged (via `default`) before the float cast.
income_non_answers = {"Refusal": None, "Dont know": None}
# Answers of the Yes/No columns and the values they are mapped to.
yes_no_to_bool = {"Yes": True, "No": False, "Dont know": None}

income = pl.col("GROSS_INCOME_MONTH")
income_as_float = (
    income.map_dict(income_non_answers, default=income)
    .cast(pl.Float64)
    # Turn NaN values into nulls.
    .fill_nan(None)
)
flags_as_bool = pl.col(boolean_columns).map_dict(yes_no_to_bool).cast(pl.Boolean)

# Replace the income column and the Yes/No columns with their typed versions.
df_cast = df_filtered.with_columns([income_as_float, flags_as_bool])
df_cast.head()

In [None]:
# Save the cast frame through the catalog under the
# "labour_force_survey_2018_report_filtered" dataset name.
catalog.save("labour_force_survey_2018_report_filtered", df_cast)