In [2]:
import polars as pl
import polars.selectors as cs

## Create `anzsic_2006.csv`

In [3]:
def read_anzsic_sheet(
    sheet_name: str,
    prefix: str,
    path: str = r"data/1292.0.55.002_anzsic 2006 - codes and titles.xls",
) -> "pl.DataFrame":
    """Read one sheet of the ANZSIC workbook into a code/title frame.

    The sheet's first column is dropped and only its last two columns are
    kept; they are renamed to ``{prefix}_code`` and ``{prefix}_title``.
    Nulls are forward-filled, rows that still contain a null are dropped,
    and rows repeating an earlier title are discarded (first occurrence
    wins). The result is sorted by code.

    Args:
        sheet_name: Name of the worksheet to read.
        prefix: Column-name prefix for this level (e.g. ``"group"``).
        path: Location of the Excel workbook. Defaults to the path that was
            previously hard-coded, so existing two-argument calls behave
            exactly as before.

    Returns:
        A frame with ``{prefix}_code`` and ``{prefix}_title``. Unless
        ``prefix`` is ``"division"``, it also has ``{prefix}_parent``: the
        code with its final character removed.
    """
    code_col = f"{prefix}_code"
    title_col = f"{prefix}_title"

    df = (
        pl.read_excel(path, sheet_name=sheet_name, drop_empty_rows=True)
        .drop(cs.by_index(0))
        .select(cs.by_index(-2, -1))
        .with_columns(pl.all().fill_null(strategy="forward"))
        .drop_nulls()
    )
    df.columns = [code_col, title_col]

    # Order matters here: keep="first" with maintain_order=True retains the
    # earliest row for each title.
    # NOTE(review): presumably the repeated titles come from forward-filling
    # over rows that belong to other hierarchy levels -- confirm against the
    # workbook layout.
    df = df.unique(title_col, keep="first", maintain_order=True).sort(code_col)

    if prefix != "division":
        # The parent key is the code minus its last character; it is used
        # later as the join key to the level above.
        df = df.with_columns(
            pl.col(code_col).str.replace(r".$", "").alias(f"{prefix}_parent")
        )
    return df


# Build one frame per hierarchy level that is read through read_anzsic_sheet,
# keyed by worksheet name; each value is the column prefix used for that level.
anzsic06 = {
    sheet: read_anzsic_sheet(sheet, level)
    for sheet, level in {
        "Divisions": "division",
        "Groups": "group",
        "Classes": "class",
    }.items()
}

anzsic06

{'Divisions': shape: (19, 2)
 ┌───────────────┬─────────────────────────────────┐
 │ division_code ┆ division_title                  │
 │ ---           ┆ ---                             │
 │ str           ┆ str                             │
 ╞═══════════════╪═════════════════════════════════╡
 │ A             ┆ Agriculture, Forestry and Fish… │
 │ B             ┆ Mining                          │
 │ C             ┆ Manufacturing                   │
 │ D             ┆ Electricity, Gas, Water and Wa… │
 │ E             ┆ Construction                    │
 │ …             ┆ …                               │
 │ O             ┆ Public Administration and Safe… │
 │ P             ┆ Education and Training          │
 │ Q             ┆ Health Care and Social Assista… │
 │ R             ┆ Arts and Recreation Services    │
 │ S             ┆ Other Services                  │
 └───────────────┴─────────────────────────────────┘,
 'Groups': shape: (214, 3)
 ┌────────────┬───────────────────────────

In [4]:
# Create Subdivisions
# This sheet is read separately from read_anzsic_sheet because three columns
# are kept after the first one is dropped (division code, subdivision code,
# subdivision title) instead of only the last two.
subdivisions_filled = (
    pl.read_excel(
        r"data/1292.0.55.002_anzsic 2006 - codes and titles.xls",
        sheet_name="Subdivisions",
        drop_empty_rows=True,
    )
    .drop(cs.by_index(0))
    # Propagate values downwards into null cells, then discard any row that
    # still has a null.
    .with_columns(pl.all().fill_null(strategy="forward"))
    .drop_nulls()
)

subdivisions_filled.columns = [
    "division_code",
    "subdivision_code",
    "subdivision_title",
]

# keep="first" with maintain_order=True makes the dedupe deterministic and
# matches read_anzsic_sheet. The default keep="any" gives no guarantee about
# which of the rows sharing a title survives, so a forward-filled duplicate
# could replace the genuine row.
anzsic06["Subdivisions"] = subdivisions_filled.unique(
    "subdivision_title", keep="first", maintain_order=True
).sort(["division_code", "subdivision_code"])

anzsic06["Subdivisions"]

division_code,subdivision_code,subdivision_title
str,str,str
"""A""","""01""","""Agriculture"""
"""A""","""02""","""Aquaculture"""
"""A""","""03""","""Forestry and Logging"""
"""A""","""04""","""Fishing, Hunting and Trapping"""
"""A""","""05""","""Agriculture, Forestry and Fish…"
…,…,…
"""R""","""91""","""Sports and Recreation Activiti…"
"""R""","""92""","""Gambling Activities"""
"""S""","""94""","""Repair and Maintenance"""
"""S""","""95""","""Personal and Other Services"""


In [5]:
# Walk down the hierarchy one level at a time, attaching each level to the
# one above it with a full join.
divisions_to_subdivisions = anzsic06["Divisions"].join(
    anzsic06["Subdivisions"], on="division_code", how="full"
)
divisions_to_groups = divisions_to_subdivisions.join(
    anzsic06["Groups"],
    left_on="subdivision_code",
    right_on="group_parent",
    how="full",
)
divisions_to_classes = divisions_to_groups.join(
    anzsic06["Classes"],
    left_on="group_code",
    right_on="class_parent",
    how="full",
)

# Remove the join-only columns: anything ending in "parent" or "_right".
anzsic06_combined = divisions_to_classes.drop(
    cs.ends_with("parent"), cs.ends_with("_right")
)

In [None]:
# Persist the combined hierarchy as CSV, quoting every field.
anzsic06_combined.write_csv(file="anzsic_2006.csv", quote_style="always")