In [109]:
## make imports from pa_lib possible (parent directory of file's directory)
import sys
from pathlib import Path

file_dir = Path.cwd()
parent_dir = file_dir.parent
sys.path.append(str(parent_dir))

%load_ext autoreload
%autoreload

import pandas as pd
import numpy as np
from dataclasses import dataclass


from pa_lib.file import (
    project_dir,
    load_bin,
    load_pickle,
    load_xlsx,
    store_bin,
)
from pa_lib.data import (
    as_dtype,
    dtFactor,
    lookup,
    desc_col,
    chi2_expected,
    clean_up_categoricals,
)
from pa_lib.util import (
    cap_words,
    collect,
    value,
    normalize_rows,
    as_percent,
    flat_list,
    list_items,
)
from pa_lib.html import HR, html, H1, H2, H3

# display long columns completely, show more rows
pd.set_option("display.max_colwidth", 200)
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data sets

In [20]:
with project_dir("axinova"):
    ax_data = load_bin("ax_data.feather")
    spr_data = load_pickle("spr_data.pkl")
    time_codes = load_pickle("time_code_ratios.pkl")
    station_codes = load_pickle("station_code_ratios.pkl")
    global_codes = load_pickle("global_code_ratios.pkl")
    population_codes = load_pickle("population_ratios.pkl")
    ax_var_struct = load_bin("ax_var_struct.feather")

11:44:12 [INFO] Started loading binary file ...
11:44:12 [INFO] Reading from file C:\Users\kpf\data\axinova\ax_data.feather
11:44:12 [INFO] ... finished loading binary file in 0.31s (1.17s CPU)
11:44:12 [INFO] Started loading pickle file ...
11:44:12 [INFO] Reading from file C:\Users\kpf\data\axinova\spr_data.pkl
11:44:12 [INFO] ... finished loading pickle file in 0.01s (0.0s CPU)
11:44:12 [INFO] Started loading pickle file ...
11:44:12 [INFO] Reading from file C:\Users\kpf\data\axinova\time_code_ratios.pkl
11:44:12 [INFO] ... finished loading pickle file in 0.01s (0.02s CPU)
11:44:12 [INFO] Started loading pickle file ...
11:44:12 [INFO] Reading from file C:\Users\kpf\data\axinova\station_code_ratios.pkl
11:44:12 [INFO] ... finished loading pickle file in 0.0s (0.0s CPU)
11:44:12 [INFO] Started loading pickle file ...
11:44:12 [INFO] Reading from file C:\Users\kpf\data\axinova\global_code_ratios.pkl
11:44:12 [INFO] ... finished loading pickle file in 0.0s (0.0s CPU)
11:44:12 [INFO] St

## Select Axinova data by any column(s)

In [17]:
def _check_selection(data, selection, allowed_columns):
    allowed_values = {}
    for column in allowed_columns:
        allowed_values[column] = data[column].cat.categories
    if set(selection.keys()) - set(allowed_columns) != set():
        raise NameError(f"Illegal column name in selection: {selection.keys()}")
    clean_selection = {}
    for column in allowed_columns:
        if column in selection:
            col_values = flat_list(selection[column])
            if set(col_values) - set(allowed_values[column]) != set():
                raise ValueError(
                    f"Illegal value(s) in parameter {column}: {col_values}"
                )
            clean_selection[column] = col_values
        else:
            clean_selection[column] = None
    return clean_selection


def select_data(all_data, **selection):
    """
    Filter ax_data by different columns. Supports sequences of allowed values.
    """
    select_columns = (
        "DayOfWeek Station Variable Month TimeSlot Hour "
        + "Time TimeSlot_cat StationSprache Code".split()
    )
    selection = _check_selection(all_data, selection, allowed_columns=select_columns)
    row_mask = pd.Series([True] * all_data.shape[0])
    for col in select_columns:
        if selection[col] is not None:
            row_mask &= all_data[col].isin(selection[col])
    return all_data.loc[row_mask].pipe(clean_up_categoricals).reset_index(drop=True)

# Look up code ratios from Axinova data

In [4]:
RatioTable = pd.DataFrame


@dataclass
class Ratios:
    actual: RatioTable
    expected: RatioTable
    sd: RatioTable = None

In [5]:
def ax_population_ratios(variable: str, percent: bool = False) -> RatioTable:
    ratios = population_codes.query("Variable == @variable").pivot_table(
        values="Pop_Ratio", index="Variable", columns="Code"
    )
    return as_percent(ratios) if percent else ratios


def ax_global_ratios(variable: str, percent: bool = False) -> RatioTable:
    ratios = global_codes.query("Variable == @variable").pivot_table(
        values="Ratio", index="Variable", columns="Code"
    )
    return as_percent(ratios) if percent else ratios


def ax_station_ratios(variable: str, percent: bool = False) -> Ratios:
    actual_ratios = station_codes.query("Variable == @variable").pivot_table(
        values="Ratio", index="Station", columns="Code", fill_value=0
    )
    expected_ratios = ax_global_ratios(variable)
    if percent:
        result = Ratios(
            actual=as_percent(actual_ratios), expected=as_percent(expected_ratios)
        )
    else:
        result = Ratios(actual=actual_ratios, expected=expected_ratios)
    return result

In [6]:
def ax_ratios(
    variable: str,
    stations: str,
    reference: str = "all_stations",
    time_scale: str = "Hour",
    percent: bool = False,
) -> Ratios:
    subset = ax_data.loc[
        ax_data.Station.isin(flat_list(stations)) & (ax_data.Variable == variable)
    ]
    full_index = [
        (weekday, time)
        for weekday in ax_data["DayOfWeek"].cat.categories
        for time in ax_data[time_scale].cat.categories
    ]
    actual_counts = subset.pivot_table(
        values="Value",
        index=["DayOfWeek", time_scale],
        columns="Code",
        fill_value=0,
        aggfunc="sum",
    )
    actual_counts_sd_ratios = (
        (np.sqrt(actual_counts) / actual_counts)
        .fillna(0)
        .reindex(full_index, fill_value=0)
    )
    actual_ratios = normalize_rows(actual_counts).reindex(full_index, fill_value=0)

    if reference == "all_stations":
        expected_ratios = (
            time_codes[time_scale]
            .query("Variable == @variable")
            .pivot_table(
                values="Ratio",
                index=["DayOfWeek", time_scale],
                columns="Code",
                fill_value=0,
            )
        )
    elif reference == "station":
        expected_counts = chi2_expected(actual_counts)
        expected_ratios = normalize_rows(expected_counts).reindex(
            full_index, fill_value=0
        )
    else:
        raise ValueError(
            "Parameter 'reference' must be one of "
            + f"('station', 'all_stations'), was '{reference}'"
        )

    if percent:
        result = Ratios(
            actual=as_percent(actual_ratios),
            expected=as_percent(expected_ratios),
            sd=as_percent(actual_counts_sd_ratios),
        )
    else:
        result = Ratios(
            actual=actual_ratios, expected=expected_ratios, sd=actual_counts_sd_ratios
        )
    return result

# Look up SPR+ data split by variable

In [178]:
def spr_split(
    stations,
    variable,
    weekdays=None,
    reference="station",
    time_scale="Hour",
    type="abs",
    incl_totals=True,
    decimals=1,
):
    if type not in ["abs", "rel"]:
        raise ValueError(
            f"Parameter 'type' must be one of ('abs', 'rel'), was '{type}'"
        )
    if reference not in ["station", "all_stations"]:
        raise ValueError(
            "Parameter 'reference' must be one of "
            + f"('station', 'all_stations'), was '{reference}'"
        )
    if weekdays is None:
        weekdays = [
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
            "Sunday",
        ]

    spr_counts = (
        spr_data.loc[
            spr_data.Station.isin(flat_list(stations))
            & spr_data.DayOfWeek.isin(weekdays)
        ]
        .groupby(["DayOfWeek", time_scale], observed=True)[["Total"]]
        .agg("sum")
    )
    spr_count_sd_ratios = (np.sqrt(spr_counts) / spr_counts).fillna(0)

    ratios = ax_ratios(
        stations=stations,
        variable=variable,
        reference=reference,
        time_scale=time_scale,
        percent=False,
    )
    if type == "abs":
        code_ratios = ratios.actual
    elif type == "rel":
        code_ratios = ratios.actual - ratios.expected
    split_counts = code_ratios.mul(spr_counts.Total, axis="index").round(decimals)

    sd_ratios = np.sqrt(
        (ratios.sd ** 2).add((spr_count_sd_ratios ** 2).values, axis="index")
    )

    if incl_totals:
        split_counts.set_axis(
            split_counts.columns.to_list(), axis="columns", inplace=True
        )
        split_counts["Total"] = spr_counts.round(decimals)
        sd_ratios.set_axis(sd_ratios.columns.to_list(), axis="columns", inplace=True)
        sd_ratios["Total"] = spr_count_sd_ratios.values
    if decimals == 0:
        split_counts = split_counts.astype("int")

    return split_counts, sd_ratios

# Tests

## Station ratios

In [8]:
with value(ax_station_ratios(variable="g_220", percent=True)) as autobesitz:
    display(autobesitz.actual)
    display(autobesitz.expected)
    display(autobesitz.actual.sub(autobesitz.expected.values, axis="columns"))

Code,1 Auto,2+ Autos,Keines
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aarau,43,30,27
Basel SBB,44,20,36
Bellinzona,52,38,11
Bern,44,28,28
Biel/Bienne,50,25,25
Brig,52,39,9
Chur,46,37,17
Fribourg,31,56,13
Genève Aéroport,42,49,10
Genève Cornavin,52,18,30


Code,1 Auto,2+ Autos,Keines
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
g_220,45,29,26


Code,1 Auto,2+ Autos,Keines
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aarau,-2,1,1
Basel SBB,-1,-9,10
Bellinzona,7,9,-15
Bern,-1,-1,2
Biel/Bienne,5,-4,-1
Brig,7,10,-17
Chur,1,8,-9
Fribourg,-14,27,-13
Genève Aéroport,-3,20,-16
Genève Cornavin,7,-11,4


## Time ratios for one station

In [None]:
with value(
    ax_ratios(stations="Lausanne", variable="g_220", time_scale="TimeSlot")
) as lausanne_auto:
    display(lausanne_auto.actual)
    display(lausanne_auto.expected)
    display(lausanne_auto.actual - lausanne_auto.expected)

## SPR+ data for one station split by ratios

In [None]:
with value(("Bern", "g_220", "TimeSlot")) as (stat, var, scale):
    display(spr_split(stations=stat, variable=var, type="abs", time_scale=scale)[0])
    display(spr_split(stations=stat, variable=var, type="rel", time_scale=scale)[0])

# Case study: Find best stations / time slots for owner of multiple cars

## Population ratios

In [10]:
ax_population_ratios(variable="g_220", percent=True)

Code,1 Auto,2+ Autos,Keines
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
g_220,44,40,16


## Ratios over all stations

In [11]:
ax_global_ratios(variable="g_220", percent=True)

Code,1 Auto,2+ Autos,Keines
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
g_220,45,29,26


Multi-car owners are **underrepresented** at digital station panels (29% vs. 40%)

## Ratios per station

In [12]:
ax_station_ratios(variable="g_220", percent=True).actual

Code,1 Auto,2+ Autos,Keines
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aarau,43,30,27
Basel SBB,44,20,36
Bellinzona,52,38,11
Bern,44,28,28
Biel/Bienne,50,25,25
Brig,52,39,9
Chur,46,37,17
Fribourg,31,56,13
Genève Aéroport,42,49,10
Genève Cornavin,52,18,30


Best stations are **Fribourg** (56%), **Zürich Flughafen** (50%), and **Genève Aéroport** (49%).

This compares to 29% over all stations, and 40% in the base population.

## Time slots at one station: Zürich Flughafen

In [13]:
prozent = ax_ratios(
    variable="g_220", stations="Zürich Flughafen", time_scale="Hour", percent=True
).actual
personen = spr_split(
    stations="Zürich HB", variable="g_220", time_scale="Hour", type="abs", decimals=0
)[0]
time_analysis = prozent.merge(
    personen, left_index=True, right_index=True, suffixes=(" [%]", " [Pers.]")
)

time_analysis.loc[prozent["2+ Autos"] > 40]

Unnamed: 0_level_0,Unnamed: 1_level_0,1 Auto [%],2+ Autos [%],Keines [%],1 Auto [Pers.],2+ Autos [Pers.],Keines [Pers.],Total
DayOfWeek,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Monday,0,48,52,0,1123,1627,180,2930
Monday,1,0,100,0,802,0,0,802
Monday,4,18,82,0,0,0,712,712
Monday,5,29,60,11,1506,215,5141,6863
Monday,7,11,79,10,34559,14353,15151,64064
Monday,9,38,59,3,14221,10491,13664,38376
Monday,10,39,61,0,12515,10395,11864,34774
Monday,11,44,52,4,17402,10164,12503,40069
Monday,12,28,63,9,25315,11095,17344,53754
Monday,13,31,67,3,19727,11663,14282,45672


In [14]:
time_analysis.sort_values("2+ Autos [Pers.]", ascending=False).head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,1 Auto [%],2+ Autos [%],Keines [%],1 Auto [Pers.],2+ Autos [Pers.],Keines [Pers.],Total
DayOfWeek,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Wednesday,17,30,45,25,48907,27117,20611,96636
Tuesday,17,54,24,22,50686,26275,21909,98870
Thursday,16,42,52,6,32431,22583,14328,69342
Thursday,17,43,54,2,49443,22268,23538,95249
Wednesday,16,41,49,11,29403,21033,18917,69354
Friday,17,54,36,10,46106,20395,23423,89924
Monday,17,46,40,15,54606,20102,21380,96088
Tuesday,16,33,61,6,33664,20031,16344,70039
Sunday,16,32,40,28,33044,19763,22988,75796
Monday,16,47,50,3,32116,19505,18034,69656


## Time slots with CI at one station: Zürich Flughafen

In [15]:
(counts, sd_ratios) = spr_split(
    stations="Zürich Flughafen",
    variable="g_220",
    time_scale="Hour",
    type="abs",
    decimals=1,
)

counts.merge(
    as_percent(sd_ratios),
    left_index=True,
    right_index=True,
    suffixes=(" [Pers]", " [sd%]"),
)

Unnamed: 0_level_0,Unnamed: 1_level_0,1 Auto [Pers],2+ Autos [Pers],Keines [Pers],Total [Pers],1 Auto [sd%],2+ Autos [sd%],Keines [sd%],Total [sd%]
DayOfWeek,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Monday,0,11.3,12.3,0.0,23.6,73,70,21,21
Monday,1,0.0,0.0,0.0,0.0,0,48,0,0
Monday,2,0.0,0.0,0.0,0.0,0,0,0,0
Monday,3,0.0,0.0,0.0,0.0,0,0,0,0
Monday,4,0.0,0.0,0.0,0.0,95,44,0,0
Monday,5,17.1,35.5,6.8,59.5,36,26,54,13
Monday,6,85.7,72.7,75.0,233.4,34,37,36,7
Monday,7,56.8,414.8,55.1,526.6,52,20,53,4
Monday,8,289.5,137.2,67.0,493.8,27,40,56,5
Monday,9,124.5,191.7,10.3,326.4,29,23,98,6


## Analysis for ZRH Airport (ax_data only)

In [16]:
with value(
    ax_ratios(
        variable="g_220",
        stations="Zürich Flughafen - Airside",
        reference="station",
        time_scale="TimeSlot",
    )
) as ratios:
    display(H3("Actual ratios"))
    display(ratios.actual)
    display(HR(), H3("Confidence intervals (sd in %)"))
    display(as_percent(ratios.sd))

Unnamed: 0_level_0,Code,1 Auto,2+ Autos,Keines
DayOfWeek,TimeSlot,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Monday,Nacht,0.180987,0.552202,0.266811
Monday,Morgen-Rush,0.273142,0.434854,0.292004
Monday,Morgen,0.429608,0.413985,0.156407
Monday,Mittag,0.160982,0.609423,0.229595
Monday,Nachmittag,0.448211,0.480042,0.071747
Monday,Abend-Rush,0.371419,0.478662,0.149919
Monday,Abend,0.366501,0.421789,0.211709
Tuesday,Nacht,0.330247,0.669753,0.0
Tuesday,Morgen-Rush,0.505201,0.343728,0.151071
Tuesday,Morgen,0.473495,0.470133,0.056372


Unnamed: 0_level_0,Code,1 Auto,2+ Autos,Keines
DayOfWeek,TimeSlot,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Monday,Nacht,27,16,22
Monday,Morgen-Rush,15,12,15
Monday,Morgen,17,18,29
Monday,Mittag,26,13,22
Monday,Nachmittag,13,12,32
Monday,Abend-Rush,15,13,23
Monday,Abend,14,13,18
Tuesday,Nacht,25,17,0
Tuesday,Morgen-Rush,13,15,23
Tuesday,Morgen,21,21,60


# Target audience ratios

In [83]:
var_info = {}
for var, data in ax_var_struct.groupby("Variable"):
    var_info[var] = dict(
        Label=data["Variable_Label"].max(),
        Codes=data["Label"].to_list(),
        Order=list(range(len(data["Label_Nr"].to_list()))),
    )


def var_label(variable):
    return var_info[variable]["Label"]


def var_code_labels(variable):
    return var_info[variable]["Codes"]


def var_code_order(variable):
    return var_info[variable]["Order"]

In [176]:
def target_ratios(variable, code_index, station, weekdays=None, time_scale="Hour"):
    code_nr = flat_list(code_index)
    weekday_list = [
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    ]
    if weekdays is not None:
        if set(flat_list(weekdays)) - set(weekday_list) != set():
            raise ValueError(f"Illegal weekday in {weekdays}")
        weekday_list = flat_list(weekdays)

    try:
        code_labels = list_items(var_code_labels(variable), code_nr)
    except IndexError:
        raise ValueError(
            f"Illegal code index(es) in {code_nr}, allowed are {var_code_order(variable)}"
        ) from None

    def percent(num):
        return round(num * 100, 1)

    glob_ratio = percent(
        ax_global_ratios(variable)[code_labels].sum(axis="columns").values[0]
    )
    pop_ratio = percent(
        ax_population_ratios(variable)[code_labels].sum(axis="columns").values[0]
    )
    station_counts, sd_ratios = spr_split(
        stations=station,
        weekdays=weekday_list,
        variable=variable,
        time_scale=time_scale,
        incl_totals=True,
    )
    target_counts = station_counts[code_labels].sum(axis="columns")
    target_ratios = (
        normalize_rows(station_counts).mul(2)[code_labels].sum(axis="columns")
    )
    target_data = (
        pd.DataFrame({"Persons": target_counts, "Ratio": target_ratios})
        .eval("CI = sqrt(Persons)")
        .eval("CI_prc = 100 * CI / Persons")
    )

    display(HR())
    print(
        f"{var_label(variable)}{code_labels}, bhf = {glob_ratio}%, pop = {pop_ratio}%"
    )
    display(H3(station))
    display(target_data.head(24))

In [179]:
code_spec = {"g_220": (0, 1), "md_ek": [5], "md_hhverm": [5, 6]}
stations = ["Biel/Bienne"]
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]

for (var, code_idx) in code_spec.items():
    for station in flat_list(stations):
        target_ratios(
            variable=var, code_index=code_idx, station=station, weekdays="Tuesday", time_scale="Hour"
        )

ValueError: Unable to coerce to DataFrame, shape must be (168, 3): given (24, 1)