# ABS Quarterly Estimated Resident Population 3101

## Python set-up

In [1]:
# system imports
import textwrap
import re

# analytic imports
import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose  # type: ignore
import readabs as ra
from readabs import recalibrate
from readabs import metacol as mc

In [2]:
# local imports
import decompose
from abs_helper import get_abs_data
from henderson import hma
from plotting import (
    abbreviate,
    calc_growth,
    finalise_plot,
    line_plot,
    plot_covid_recovery,
    plot_growth_finalise,
    seas_trend_plot,
    state_abbr,
    state_colors,
    plot_revisions,
)

In [3]:
# pandas display settings -- limits set high enough that frames, wide tables
# and long cell text are not truncated when displayed
pd.set_option("display.max_rows", 999999)
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_colwidth", 999)

# Display charts in this notebook (passed as show= to the plotting helpers)
SHOW = False

## Get data from ABS

In [4]:
# abs_dict maps table names to data frames, meta is the matching metadata
# frame, source is the text used in chart footers, and RECENT is the start
# point used for the "recent" version of each chart.
abs_dict, meta, source, RECENT = get_abs_data("3101.0")

# paired element-wise: (chart start, tag) -- None is used for the whole series
plot_times = (None, RECENT)
plot_tags = ("", "-recent")

Table 31010do001_202406 has no 'Index' sheet.
Table 31010do002_202406 has no 'Index' sheet.
Table 31010DO003_200106 has no 'Index' sheet.
Table Regional internal migration estimates, provisional has no 'Index' sheet.


In [5]:
# list of available tables, wrapped into lines for display
table_names = ", ".join(abs_dict.keys())
textwrap.wrap(table_names)

['310101, 3101016A, 3101016B, 310102, 310104, 3101051, 3101052, 3101053,',
 '3101054, 3101055, 3101056, 3101057, 3101058, 3101059']

## Data revisions

In [6]:
### Data revisions
def data_revisions() -> None:
    """Obtain and plot data revisions.

    For each national series of interest, read table 310101 as currently
    published and as published in the preceding releases, collect the same
    series from every release side by side (one column per release), and
    hand the result to plot_revisions(). For the Estimated Resident
    Population, a second chart of the period-on-period change is also drawn.
    """

    how_far_back = 6  # number of releases compared (the current one included)
    dataset = [
        "Estimated Resident Population",
        "Births ;  Australia ;",
        "Deaths ;  Australia ;",
        "Natural Increase ;  Australia ;",
        "Overseas Arrivals ;  Australia ;",
        "Overseas Departures ;  Australia ;",
        "Net Overseas Migration ;  Australia ;",
    ]
    stype = "Original"
    for series in dataset:
        # The data item descriptions are ";"-delimited, so trim from the
        # first ";" to get a clean title ("Births", not "Births ;  Australia ;").
        short_name = re.sub(r"\s*;.*$", "", series)

        repository = pd.DataFrame()
        history = None  # None on the first pass: no historic release requested
        for _i in range(how_far_back):
            # from current to historic data
            d, m = ra.read_abs_cat(
                "3101.0", single_excel_only="310101", history=history
            )
            selector = {series: mc.did, stype: mc.stype}
            t, s, u = ra.find_abs_id(m, selector, regex=False, verbose=False)
            last_period = d[t].index[-1]
            # single quotes inside the f-string keep this valid before Python 3.12
            date = f"ABS print for {last_period.strftime('%Y-%b')}"
            repository[date] = d[t][s]
            # step back one period from the end of this print to select the
            # release requested on the next pass
            history = (last_period - 1).strftime("%b-%Y").lower()

        plot_revisions(
            data=repository,
            units=u,
            title=f"Data revisions: {short_name}",
            rfooter=source,
            lfooter=f"Australia. {stype}. ",
            legend={"loc": "best", "fontsize": 9},
            show=SHOW,
        )

        if series == "Estimated Resident Population":
            # ERP is a level series, so also show revisions to its change
            plot_revisions(
                data=repository.diff(1),
                units=u,
                title=f"Data revisions: {short_name} Growth",
                rfooter=source,
                lfooter=f"Australia. {stype}. ",
                legend={"loc": "best", "fontsize": 9},
                show=SHOW,
            )


data_revisions()

## Plotting

### Key charts

In [7]:
### NOTE ### -- this code block is very slow -- about 3 minutes


def key_charts() -> None:
    """Plot the key national population-change series from table 310101.

    For each of births, deaths, natural increase, overseas arrivals,
    overseas departures and net overseas migration (Original series):

    1. plot the series as published;
    2. seasonally adjust it with the local ``decompose`` module, then plot
       the seasonally adjusted and trend series and a COVID-recovery chart
       of the seasonally adjusted series; and
    3. where every observation is positive, repeat the adjustment with
       statsmodels' ``seasonal_decompose`` using a multiplicative model.
    """

    table = "310101"
    series_type = "Original"
    data = abs_dict[table]

    chart_names = [
        "Births",
        "Deaths",
        "Natural Increase",  # births - deaths
        "Overseas Arrivals",
        "Overseas Departures",
        "Net Overseas Migration",
    ]

    discontinuities = {
        # last date in continuity ...
        "Births": [pd.Period("2020-Q4", freq="Q")],
        "Deaths": [],
        "Natural Increase": [pd.Period("2020-Q4", freq="Q")],
        "Overseas Arrivals": [pd.Period("2020-Q1", freq="Q")],
        "Overseas Departures": [pd.Period("2020-Q1", freq="Q")],
        "Net Overseas Migration": [pd.Period("2020-Q1", freq="Q")],
    }

    # whole series, and the last 30 periods (7.5 years if the index is quarterly)
    starts = None, data.index[-1] - 30
    for chart in chart_names:
        selector = {
            table: mc.table,
            series_type: mc.stype,
            chart: mc.did,
        }
        _table, series_id, units = ra.find_abs_id(meta, selector, verbose=False)
        series = data[series_id]
        series.name = chart
        series, units = ra.recalibrate(series, units)
        # print(f'End date check: {series.index[-1]}')

        # raw data plot
        common_plot_settings = {
            "title": chart,
            "y0": True,
            "ylabel": f"{units} / Quarter",
            "rfooter": source,
            "show": SHOW,
        }
        line_plot(
            series,
            lfooter=f"Australia. {series_type} series. ",
            **common_plot_settings,
        )

        # in-house seasonal decomp
        # ("starts" is added only now: the raw plot above is drawn without it)
        common_plot_settings["starts"] = starts
        decomposed = decompose.decompose(
            series.dropna(),
            constant_seasonal=True,
            arima_extend=True,
            discontinuity_list=discontinuities[chart],
            ignore_years=(2020, 2021),  # COVID
        )
        # display(decomposed)
        seas_trend_plot(
            decomposed[["Seasonally Adjusted", "Trend"]],
            tags=["sa-Mark" + t for t in plot_tags],
            lfooter="Australia. Seasonally adjusted using in-house methods. ",
            **common_plot_settings,
        )

        plot_covid_recovery(
            decomposed["Seasonally Adjusted"],
            tags="covid-recovery",
            lfooter="Australia. Seasonally adjusted series plotted. "
            + "Seasonally adjusted using in-house methods. ",
            # the recovery chart is called without the "starts" setting
            **{k: v for k, v in common_plot_settings.items() if k != "starts"},
        )

        # python's seasonal decomp
        # Only attempted when every value is > 0; a NaN compares False, so a
        # series with missing values is skipped as well.
        if series.gt(0).all():
            # "multiplicative" is the documented spelling of this option
            result = seasonal_decompose(series, model="multiplicative", period=4)
            df = pd.DataFrame(
                [series / result.seasonal, result.trend],
                index=["Seasonally Adjusted", "Trend"],
            ).T
            seas_trend_plot(
                df,
                tags=["sa-python" + t for t in plot_tags],
                lfooter="Australia. Seasonally adjusted using Python's "
                + "seasonal_decompose() from statsmodels.tsa.seasonal. ",
                **common_plot_settings,
            )


key_charts()

  return Index(index_like, name=name, copy=copy)
  return Index(index_like, name=name, copy=copy)
  return Index(index_like, name=name, copy=copy)
  return Index(index_like, name=name, copy=copy)
  return Index(index_like, name=name, copy=copy)
  return Index(index_like, name=name, copy=copy)


### Age related

#### Set-up parameters

In [8]:
# age-by-sex tables 3101051 through 3101059
tables = tuple(f"31010{suffix}" for suffix in range(51, 60))
groups = ("Female", "Male", "Persons")  # persons should be last

# the national total gets its own colour alongside the imported palette
state_colors["Australia"] = "grey"

# four line styles, cycled three times (12 entries) so every series has one
_base_styles = ["-", "-.", "--", ":"]
linestyle = {"style": _base_styles * 3}

#### Utility functions

In [9]:
def get_age_data(table: str, group: str) -> tuple[str, pd.DataFrame]:
    """Return the jurisdiction label and population-by-age data for a table.

    Args:
        table: name of the ABS table to read (a key of ``abs_dict``).
        group: text that must appear in the data item description of the
            wanted series (the callers pass "Female", "Male" or "Persons").

    Returns:
        A ``(state, data)`` tuple. ``state`` is the text after the last comma
        of the table description, abbreviated when it appears in
        ``state_abbr``. ``data`` has the table's index as rows and integer
        ages as columns ("100 and over" is labelled 100).
    """
    # metadata rows for this table whose description mentions the group
    in_table = meta[mc.table] == table
    mentions_group = meta[mc.did].str.contains(group)
    rows = meta[in_table & mentions_group]

    # the jurisdiction is the final comma-separated part of the table description
    jurisdiction = rows["Table Description"].iloc[0].split(",")[-1].strip()
    jurisdiction = (
        state_abbr[jurisdiction] if jurisdiction in state_abbr else jurisdiction
    )

    # pull the series, then relabel each column with its integer age, taken
    # from the second-last ";"-separated field of the description
    raw = abs_dict[table][rows[mc.id]]
    age_text = rows[mc.did].str.rsplit(";", n=2).str[-2]
    ages = age_text.str.replace("100 and over", "100").astype(int)
    by_age = pd.DataFrame(raw.to_numpy(), columns=ages, index=raw.index)
    return jurisdiction, by_age


# test
# get_age_data("3101051", "Persons")

In [10]:
def calculate_medians(data: pd.DataFrame) -> pd.Series:
    """Estimate the median age for each row of a population-by-age frame.

    Each column label is an age in whole years and is treated as the band
    [age, age + 1). The median falls in the first band whose cumulative
    population share exceeds one half; its position inside that band is
    imputed by linear interpolation between the cumulative share reached at
    the start of the band and the share reached at its end.

    Args:
        data: one row per period, one column per age (numeric labels in
            ascending order), values are population counts.

    Returns:
        A float Series indexed like ``data``. Rows whose cumulative share
        never exceeds one half (for example an all-zero row) give NaN.
    """
    HALF = 0.5

    if data.shape[1] == 0:
        # no age columns: nothing to take a median of
        return pd.Series(np.nan, index=data.index, dtype=float)

    ages = data.columns.to_numpy()

    # share of each row's population aged up to and including each column
    row_total = data.sum(axis=1)
    cumulative = data.div(row_total, axis=0).cumsum(axis=1).to_numpy(dtype=float)

    # column position of the band that contains the median
    above_half = cumulative > HALF
    median_pos = above_half.argmax(axis=1)
    rows = np.arange(cumulative.shape[0])

    # cumulative share at the end of the median band, and at its start
    # (the start share is zero when the median band is the first column)
    high = cumulative[rows, median_pos]
    low = np.where(
        median_pos > 0,
        cumulative[rows, np.maximum(median_pos - 1, 0)],
        0.0,
    )

    # calculate (imputed) fractional-year medians.
    with np.errstate(invalid="ignore", divide="ignore"):
        fractional_age = (HALF - low) / (high - low)
    fractional_median_age = ages[median_pos].astype(float) + fractional_age

    # argmax reports position 0 when no column qualifies; mark those rows NaN
    fractional_median_age[~above_half.any(axis=1)] = np.nan
    return pd.Series(fractional_median_age, index=data.index)

#### Age profiles

In [11]:
# second argument passed to hma() when smoothing each age profile
HMA_SMOOTHER = 5


def state_profiles():
    """Chart each jurisdiction's latest age distribution, once per group.

    For every group, the last row of each age table is converted to
    percentage shares of that row's total, smoothed with hma(), and the
    resulting profiles are drawn together on a single chart.
    """

    for group in groups:
        profiles = {}

        for table in tables:
            # data capture: keep only the latest period
            state, by_age = get_age_data(table, group)
            period = by_age.index[-1]
            latest = by_age.iloc[-1]

            # limited smoothing and store
            share = (latest / latest.sum()) * 100
            profiles[state] = hma(share, HMA_SMOOTHER)

        # plot age profile ...
        profiles_df = pd.DataFrame(profiles)
        ax = profiles_df.plot(
            lw=2,
            color=[state_colors[name] for name in profiles_df.columns],
            **linestyle,
        )
        finalise_plot(
            axes=ax,
            title=f"Population distribution by Age and Jurisdiction ({group})",
            ylabel="Kernel Density Estimate (%)",
            xlabel="Age in whole years",
            legend={"loc": "best", "fontsize": "x-small", "ncols": 3},
            tag=group,
            # period comes from the last table processed in the loop above
            lfooter=f"Australia. {period}",
            rfooter=f"Calculated from {source} {[int(i) for i in tables]}",
            show=SHOW,
        )


state_profiles()

#### Median Age by state

In [12]:
# one chart per group, each showing a median-age line for every table
for group in groups:
    # data capture
    state_medians = {}
    for table in tables:
        state, df = get_age_data(table, group)
        state_medians[state] = calculate_medians(df)
    data = pd.DataFrame(state_medians)

    # plot
    colors = [state_colors[x] for x in data.columns]
    line_plot(
        data,
        title=f"Median Population Age by Jurisdiction ({group})",
        ylabel="Years",
        xlabel=None,
        color=colors,
        width=2,
        legend={"loc": "best", "fontsize": "x-small", "ncols": 3},
        lfooter="Australia. ",
        rfooter=f"Calculated from {source} {[int(i) for i in tables]}",
        show=SHOW,
        **linestyle,
    )

#### Median Age by Gender

In [13]:
def age_gender_profiles() -> None:
    """Chart female and male median ages, one chart per age table.

    The first two entries of ``groups`` are plotted; the third ("Persons")
    is left out.
    """

    gender_colors = ["hotpink", "cornflowerblue"]
    genders = groups[0:2]  # assumes "Persons" is last group

    for table in tables:
        # data capture
        medians_by_gender = {}
        for gender in genders:
            state, by_age = get_age_data(table, gender)
            medians_by_gender[gender] = calculate_medians(by_age)

        # plot
        line_plot(
            pd.DataFrame(medians_by_gender),
            color=gender_colors,
            title=f"Median Population Age by Gender for {state}",
            ylabel="Years",
            rfooter=f"Calculated from {source} {table}",
            width=2,
            show=SHOW,
        )


age_gender_profiles()

### National and State Populations

In [14]:
table = "310104"
erp_df = abs_dict[table]
erp_phrase = "Estimated Resident Population ;  Persons ;"

# Jurisdiction names: take the data item descriptions in this table that
# contain the ERP phrase, then remove the phrase and the " ;" separators.
states = (
    meta.loc[
        (meta[mc.table] == table) & (meta[mc.did].str.contains(erp_phrase)),
        mc.did,
    ]
    .str.replace(erp_phrase, "")
    .str.replace(" ;", "")
    .str.strip()
    .to_list()
)

for state in states:
    print(state)
    # get relevant data series
    selector = {
        table: mc.table,
        erp_phrase: mc.did,
        f";  {state} ;": mc.did,  # Australia, South Australia, etc.
    }
    # named series_id (not id) so the builtin id() is not shadowed in the
    # notebook's global namespace
    _table, series_id, units = ra.find_abs_id(meta, selector, verbose=False)
    series = erp_df[series_id]
    series.name = "Estimated Resident Population"
    units = "Number Persons" if units == "Persons" else units
    series, units = recalibrate(series, units)

    # plot complete series
    title = f"Estimated Resident Population: {abbreviate(state)}"
    line_plot(
        series,
        title=title,
        ylabel=units,
        rfooter=f"{source} {table}",
        width=2,
        show=SHOW,
    )

    plot_covid_recovery(
        series,
        title=title,
        ylabel=units,
        tags="-covid",
        rfooter=f"{source} {table}",
        show=SHOW,
    )

    # The growth measures do not depend on the chart window, so they are
    # calculated once per state rather than once per window.
    growth_percent = calc_growth(series)
    growth_number = pd.DataFrame(
        [series.diff(4), series.diff(1)], index=["Annual", "Quarterly"]
    ).T
    growth_number, gunits = recalibrate(growth_number, units)

    for start, tag in zip(plot_times, plot_tags):
        # the whole-of-series growth charts are only drawn for Australia
        if state != "Australia" and start is None:
            continue

        plot_growth_finalise(
            *growth_percent,
            from_=start,
            tag=f"percent-{tag}",
            title=f"Growth in the {title}",
            rfooter=f"{source} {table}",
            annotate=9,
            show=SHOW,
        )

        plot_growth_finalise(
            growth_number["Annual"],
            growth_number["Quarterly"],
            from_=start,
            ylabel=gunits,
            title=f"Growth in the {title}",
            tag=f"numeric-{tag}",
            rfooter=f"{source} {table}",
            annotate=9,
            show=SHOW,
        )

New South Wales
Victoria
Queensland
South Australia
Western Australia
Tasmania
Northern Territory
Australian Capital Territory
Australia


## Finished

In [15]:
# watermark: record when the notebook was last run, the Python/IPython
# versions, machine details and the versions of the imported packages
%load_ext watermark
%watermark --machine -u -n -t -v -iv -w

Last updated: Fri Dec 13 2024 10:58:36

Python implementation: CPython
Python version       : 3.12.8
IPython version      : 8.30.0

Compiler    : Clang 18.1.8 
OS          : Darwin
Release     : 24.2.0
Machine     : arm64
Processor   : arm
CPU cores   : 14
Architecture: 64bit

statsmodels: 0.14.4
pandas     : 2.2.3
numpy      : 1.26.4
readabs    : 0.0.17
re         : 2.2.1

Watermark: 2.5.0



In [16]:
# last cell: its output shows that a top-to-bottom run reached the end
print("Finished")

Finished
