In [30]:
## make imports from pa_lib possible (parent directory of file's directory)
import sys
from pathlib import Path

file_dir = Path.cwd()
parent_dir = file_dir.parent
sys.path.append(str(parent_dir))

%load_ext autoreload
%autoreload

import pandas as pd
import numpy as np
from typing import NamedTuple

from pa_lib.file import (
    project_dir,
    load_bin,
    load_pickle,
    load_xlsx,
    store_bin,
)
from pa_lib.data import (
    as_dtype,
    dtFactor,
    lookup,
    desc_col,
    chi2_expected,
    clean_up_categoricals,
)
from pa_lib.util import cap_words, collect, value, normalize_rows, as_percent, flatten

# Show long cell contents in full and allow up to 200 rows / columns in outputs
for display_option in (
    "display.max_colwidth",
    "display.max_rows",
    "display.max_columns",
):
    pd.set_option(display_option, 200)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data sets

In [5]:
# Load all prepared data sets while the "axinova" project directory is active
# (the log output below shows the files being read from that directory).
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# from a trusted source.
with project_dir("axinova"):
    # Axinova data: used below by select_data() and ax_ratios()
    ax_data = load_bin("ax_data.feather")
    # SPR+ data: used below by spr_split()
    spr_data = load_pickle("spr_data.pkl")
    # Indexed by time scale name in ax_ratios() (time_codes[time_scale])
    time_codes = load_pickle("time_code_ratios.pkl")
    # Used by ax_station_ratios()
    station_codes = load_pickle("station_code_ratios.pkl")
    # Used by ax_global_ratios()
    global_codes = load_pickle("global_code_ratios.pkl")
    # Used by ax_population_ratios()
    population_codes = load_pickle("population_ratios.pkl")

10:34:29 [INFO] Started loading binary file ...
10:34:29 [INFO] Reading from file C:\Users\kpf\data\axinova\ax_data.feather
10:34:29 [INFO] ... finished loading binary file in 0.28s (0.97s CPU)
10:34:29 [INFO] Started loading pickle file ...
10:34:29 [INFO] Reading from file C:\Users\kpf\data\axinova\spr_data.pkl
10:34:29 [INFO] ... finished loading pickle file in 0.01s (0.02s CPU)
10:34:29 [INFO] Started loading pickle file ...
10:34:29 [INFO] Reading from file C:\Users\kpf\data\axinova\time_code_ratios.pkl
10:34:29 [INFO] ... finished loading pickle file in 0.01s (0.0s CPU)
10:34:29 [INFO] Started loading pickle file ...
10:34:29 [INFO] Reading from file C:\Users\kpf\data\axinova\station_code_ratios.pkl
10:34:29 [INFO] ... finished loading pickle file in 0.0s (0.02s CPU)
10:34:29 [INFO] Started loading pickle file ...
10:34:29 [INFO] Reading from file C:\Users\kpf\data\axinova\global_code_ratios.pkl
10:34:29 [INFO] ... finished loading pickle file in 0.0s (0.0s CPU)
10:34:29 [INFO] S

## Select Axinova data by any column(s)

In [28]:
def _check_selection(data, selection, allowed_columns):
    """Validate a column -> value(s) selection against the categories in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to select from. Every column that appears in ``selection`` must
        be categorical, because legal values are read from ``.cat.categories``.
    selection : dict
        Maps a column name to the requested value(s). Values are passed
        through ``flatten`` (pa_lib.util), so presumably a single value or a
        nested collection of values is accepted -- confirm against ``flatten``.
    allowed_columns : iterable of str
        Column names that may be used as selection criteria.

    Returns
    -------
    dict
        One entry per allowed column: the list of requested values, or
        ``None`` if the column was not part of the selection.

    Raises
    ------
    NameError
        If ``selection`` contains a column that is not in ``allowed_columns``.
    ValueError
        If a requested value is not among the column's categories.
    """
    allowed_columns = list(allowed_columns)
    allowed_names = set(allowed_columns)

    # Report only the offending names, not the complete selection
    unknown_columns = [name for name in selection if name not in allowed_names]
    if unknown_columns:
        raise NameError(f"Unknown column name in selection: {unknown_columns}")

    clean_selection = {}
    for column in allowed_columns:
        if column not in selection:
            clean_selection[column] = None
            continue
        col_values = list(flatten(selection[column]))
        # Categories are looked up only for columns that are actually selected,
        # so unused allowed columns need not be categorical (or even present)
        legal_values = set(data[column].cat.categories)
        illegal_values = [val for val in col_values if val not in legal_values]
        if illegal_values:
            raise ValueError(
                f"Illegal value(s) in parameter {column}: {illegal_values}"
            )
        clean_selection[column] = col_values
    return clean_selection


def select_data(all_data, **selection):
    """Return the rows of ``all_data`` that match every given selection criterion.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Table to filter (e.g. ``ax_data``).
    **selection
        Keyword per column (one of DayOfWeek, Station, Variable, Month,
        TimeSlot, Hour, Time, TimeSlot_cat, StationSprache) with the value(s)
        to keep, e.g. ``Station="Bern", Hour="01"``. Criteria on different
        columns are combined with AND; several values for one column with OR.

    Returns
    -------
    pandas.DataFrame
        Matching rows, passed through ``clean_up_categoricals`` and with a
        fresh 0..n-1 index.

    Raises
    ------
    NameError, ValueError
        Propagated from ``_check_selection`` for unknown columns / values.
    """
    select_columns = "DayOfWeek Station Variable Month TimeSlot Hour Time TimeSlot_cat StationSprache".split()
    selection = _check_selection(all_data, selection, allowed_columns=select_columns)
    # Positional boolean mask: a Series on a fresh RangeIndex would be aligned
    # by label and give wrong results when all_data has any other index
    row_mask = np.ones(len(all_data), dtype=bool)
    for col in select_columns:
        if selection[col] is not None:
            row_mask &= all_data[col].isin(selection[col]).values
    return all_data.loc[row_mask].pipe(clean_up_categoricals).reset_index(drop=True)

# Look up code ratios from Axinova data

In [6]:
# A ratio table is an ordinary DataFrame; the alias only makes signatures readable
RatioTable = pd.DataFrame


class Ratios(NamedTuple):
    """Pair of ratio tables: the observed ratios and the reference ratios."""

    actual: RatioTable
    expected: RatioTable

In [16]:
def ax_population_ratios(variable: str, percent: bool = False) -> RatioTable:
    """Look up the population ratios of one variable, one column per code.

    Rows of ``population_codes`` whose ``Variable`` equals ``variable`` are
    pivoted so that each ``Code`` becomes a column holding its ``Pop_Ratio``.
    With ``percent=True`` the table is passed through ``as_percent`` first.
    """
    matching_rows = population_codes.query("Variable == @variable")
    pop_table = matching_rows.pivot_table(
        values="Pop_Ratio", index="Variable", columns="Code"
    )
    if percent:
        return as_percent(pop_table)
    return pop_table


def ax_global_ratios(variable: str, percent: bool = False) -> RatioTable:
    """Look up the all-station ratios of one variable, one column per code.

    Rows of ``global_codes`` whose ``Variable`` equals ``variable`` are
    pivoted so that each ``Code`` becomes a column holding its ``Ratio``.
    With ``percent=True`` the table is passed through ``as_percent`` first.
    """
    matching_rows = global_codes.query("Variable == @variable")
    global_table = matching_rows.pivot_table(
        values="Ratio", index="Variable", columns="Code"
    )
    if percent:
        return as_percent(global_table)
    return global_table


def ax_station_ratios(variable: str, percent: bool = False) -> Ratios:
    """Return per-station ratios of one variable together with the global ratios.

    ``actual`` holds one row per station and one column per code (missing
    combinations filled with 0); ``expected`` is the result of
    ``ax_global_ratios`` for the same variable. With ``percent=True`` both
    tables are passed through ``as_percent``.
    """
    station_rows = station_codes.query("Variable == @variable")
    per_station = station_rows.pivot_table(
        values="Ratio", index="Station", columns="Code", fill_value=0
    )
    overall = ax_global_ratios(variable)
    if not percent:
        return Ratios(per_station, overall)
    return Ratios(as_percent(per_station), as_percent(overall))

In [8]:
def ax_ratios(
    variable: str,
    stations: str,
    reference: str = "all_stations",
    time_scale: str = "Hour",
    percent: bool = False,
) -> Ratios:
    """Return actual and expected code ratios per weekday and time slot.

    Parameters
    ----------
    variable
        Axinova variable to look at (matched against ``ax_data.Variable``).
    stations
        Station name(s); passed through ``flatten``, so presumably a single
        name or a collection of names is accepted -- confirm against
        ``pa_lib.util.flatten``.
    reference
        Where the expected ratios come from: ``"all_stations"`` reads them from
        ``time_codes[time_scale]``; ``"station"`` derives them from the selected
        stations' own counts via ``chi2_expected``.
    time_scale
        Name of the time column used as second index level (e.g. ``"Hour"``).
    percent
        If true, both tables are passed through ``as_percent``.

    Returns
    -------
    Ratios
        ``actual`` and ``expected`` tables indexed by (DayOfWeek, time_scale)
        with one column per code.

    Raises
    ------
    ValueError
        If ``reference`` is not one of the two supported values.
    """
    # Validate up front so a typo fails before any pivoting work is done
    if reference not in ("station", "all_stations"):
        raise ValueError(
            f"Parameter 'reference' must be one of ('station', 'all_stations'), was '{reference}'"
        )

    subset = ax_data.loc[
        ax_data.Station.isin(flatten(stations)) & (ax_data.Variable == variable)
    ]
    # Every weekday / time slot combination, so slots without data show up as 0
    full_index = [
        (weekday, time)
        for weekday in ax_data["DayOfWeek"].cat.categories
        for time in ax_data[time_scale].cat.categories
    ]
    # NOTE(review): pivot_table aggregates with its default aggfunc (mean);
    # confirm that averaging "Value" (rather than summing) is intended.
    actual_counts = subset.pivot_table(
        values="Value", index=["DayOfWeek", time_scale], columns="Code", fill_value=0,
    )
    actual_ratios = normalize_rows(actual_counts).reindex(full_index, fill_value=0)

    if reference == "all_stations":
        # NOTE(review): unlike the other tables this one is not reindexed to
        # full_index; confirm that time_codes already covers every slot.
        expected_ratios = (
            time_codes[time_scale]
            .query("Variable == @variable")
            .pivot_table(
                values="Ratio",
                index=["DayOfWeek", time_scale],
                columns="Code",
                fill_value=0,
            )
        )
    else:  # reference == "station"
        expected_counts = chi2_expected(actual_counts)
        expected_ratios = normalize_rows(expected_counts).reindex(
            full_index, fill_value=0
        )

    if percent:
        result = Ratios(as_percent(actual_ratios), as_percent(expected_ratios))
    else:
        result = Ratios(actual_ratios, expected_ratios)
    return result

# Look up SPR+ data split by variable

In [63]:
def spr_split(
    stations,
    variable,
    reference="station",
    time_scale="Hour",
    type="abs",
    incl_totals=True,
    decimals=1,
):
    """Split SPR+ totals per weekday / time slot by the Axinova code ratios.

    Parameters
    ----------
    stations
        Station name(s) to include (passed through ``flatten``).
    variable
        Axinova variable whose codes are used for the split.
    reference
        ``"station"`` or ``"all_stations"``; forwarded to ``ax_ratios``.
    time_scale
        Time column used for grouping, e.g. ``"Hour"``.
    type
        ``"abs"``: totals multiplied by the actual ratios.
        ``"rel"``: totals multiplied by (actual - expected) ratios, i.e. the
        surplus or deficit relative to the reference.
    incl_totals
        If true, append a ``"Total"`` column with the summed SPR+ totals.
    decimals
        Number of decimals to round to; ``0`` additionally converts the result
        to integer dtype.

    Returns
    -------
    pandas.DataFrame
        One row per (DayOfWeek, time_scale), one column per code, plus
        ``"Total"`` when requested.

    Raises
    ------
    ValueError
        If ``type`` or ``reference`` has an unsupported value.
    """
    if type not in ["abs", "rel"]:
        raise ValueError(
            f"Parameter 'type' must be one of ('abs', 'rel'), was '{type}'"
        )
    if reference not in ["station", "all_stations"]:
        raise ValueError(
            f"Parameter 'reference' must be one of ('station', 'all_stations'), was '{reference}'"
        )

    spr_counts = (
        spr_data.loc[spr_data.Station.isin(flatten(stations))]
        .groupby(["DayOfWeek", time_scale])[["Total"]]
        .agg("sum")
    )
    ratios = ax_ratios(
        stations=stations, variable=variable, reference=reference, time_scale=time_scale
    )
    if type == "abs":
        code_ratios = ratios.actual
    else:  # type == "rel", guaranteed by the validation above
        code_ratios = ratios.actual - ratios.expected
    result = code_ratios.mul(spr_counts.Total, axis="index").round(decimals)

    if incl_totals:
        # Replace the column index by plain labels before adding "Total"
        # (presumably it is categorical and would reject a new label).
        # Direct assignment replaces set_axis(..., inplace=True), whose
        # `inplace` argument is deprecated / removed in newer pandas versions.
        result.columns = result.columns.to_list()
        result["Total"] = spr_counts["Total"].round(decimals)

    if decimals == 0:
        result = result.astype("int")

    return result

# Tests

## Station ratios

In [10]:
# Compare per-station ratios for variable "g_220" (codes: 1 Auto / 2+ Autos /
# Keines) with the ratios over all stations, in percent.
# NOTE(review): `value` comes from pa_lib.util; presumably it just binds its
# argument to the name after `as` for the duration of the block -- confirm.
with value(ax_station_ratios(variable="g_220", percent=True)) as autobesitz:
    display(autobesitz.actual)
    display(autobesitz.expected)
    # Pass the raw array so the expected row is subtracted from every station
    # row by position instead of being aligned on the differing row labels
    display(autobesitz.actual.sub(autobesitz.expected.values, axis="columns"))

Code,1 Auto,2+ Autos,Keines
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aarau,43,30,27
Basel SBB,44,20,36
Bellinzona,52,38,10
Bern,43,28,28
Biel/Bienne,50,25,25
Brig,51,40,9
Chur,48,35,17
Fribourg,31,56,14
Genève Aéroport,41,49,10
Genève Cornavin,53,18,29


Code,1 Auto,2+ Autos,Keines
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
g_220,45,29,26


Code,1 Auto,2+ Autos,Keines
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aarau,-2,1,1
Basel SBB,-1,-9,10
Bellinzona,7,9,-16
Bern,-2,-1,2
Biel/Bienne,5,-4,-1
Brig,6,11,-17
Chur,3,6,-9
Fribourg,-14,27,-12
Genève Aéroport,-4,20,-16
Genève Cornavin,8,-11,3


## Time ratios for one station

In [11]:
# Hourly ratios for a single station, using the defaults of ax_ratios
# (reference="all_stations", time_scale="Hour", plain ratios instead of percent)
with value(ax_ratios(stations="Lausanne", variable="g_220")) as lausanne_auto:
    display(lausanne_auto.actual)
    display(lausanne_auto.expected)
    # Positive values: code is more frequent at this station than in the reference
    display(lausanne_auto.actual - lausanne_auto.expected)

Unnamed: 0_level_0,Code,1 Auto,2+ Autos,Keines
DayOfWeek,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Monday,00,0.333711,0.384791,0.281498
Monday,01,1.000000,0.000000,0.000000
Monday,02,0.000000,0.000000,0.000000
Monday,03,0.409561,0.000000,0.590439
Monday,04,0.380052,0.000000,0.619948
...,...,...,...,...
Sunday,19,0.296197,0.419960,0.283843
Sunday,20,0.361548,0.423531,0.214922
Sunday,21,0.417628,0.315654,0.266718
Sunday,22,0.279258,0.384970,0.335771


Unnamed: 0_level_0,Code,1 Auto,2+ Autos,Keines
DayOfWeek,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Monday,00,0.379786,0.316787,0.303428
Monday,01,0.505276,0.251623,0.243101
Monday,02,0.374364,0.060254,0.565383
Monday,03,0.486027,0.395825,0.118148
Monday,04,0.285555,0.409515,0.304931
...,...,...,...,...
Sunday,19,0.384024,0.266989,0.348987
Sunday,20,0.379769,0.332839,0.287393
Sunday,21,0.369698,0.340051,0.290251
Sunday,22,0.459069,0.309407,0.231524


Unnamed: 0_level_0,Code,1 Auto,2+ Autos,Keines
DayOfWeek,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Monday,00,-0.046075,0.068004,-0.021930
Monday,01,0.494724,-0.251623,-0.243101
Monday,02,-0.374364,-0.060254,-0.565383
Monday,03,-0.076466,-0.395825,0.472291
Monday,04,0.094497,-0.409515,0.315017
...,...,...,...,...
Sunday,19,-0.087828,0.152971,-0.065144
Sunday,20,-0.018221,0.090692,-0.072471
Sunday,21,0.047930,-0.024398,-0.023533
Sunday,22,-0.179810,0.075563,0.104247


## SPR+ data for one station split by ratios

In [12]:
# Show both flavours of spr_split for the same station / variable / time scale:
# "abs" multiplies totals by the actual ratios, "rel" by (actual - expected)
with value(("Bern", "g_220", "Hour")) as (stat, var, scale):
    display(spr_split(stations=stat, variable=var, type="abs", time_scale=scale))
    display(spr_split(stations=stat, variable=var, type="rel", time_scale=scale))

Unnamed: 0_level_0,Code,1 Auto,2+ Autos,Keines
DayOfWeek,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Monday,00,904,534,606
Monday,01,0,0,0
Monday,02,0,0,0
Monday,03,0,0,0
Monday,04,0,0,0
...,...,...,...,...
Sunday,19,17954,9953,11323
Sunday,20,11530,7955,10888
Sunday,21,8026,7698,6904
Sunday,22,6384,3777,4581


Unnamed: 0_level_0,Code,1 Auto,2+ Autos,Keines
DayOfWeek,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Monday,00,74,-77,3
Monday,01,0,0,0
Monday,02,0,0,0
Monday,03,0,0,0
Monday,04,0,0,0
...,...,...,...,...
Sunday,19,2027,-1775,-252
Sunday,20,-801,-1126,1927
Sunday,21,-1161,933,228
Sunday,22,399,-630,231


# Case study: Find best stations / time slots for owners of multiple cars

## Population ratios

In [17]:
# Baseline: code shares of variable "g_220" in the population table, in percent
ax_population_ratios(variable="g_220", percent=True)

Code,1 Auto,2+ Autos,Keines
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
g_220,44,40,16


## Ratios over all stations

In [13]:
# Same variable, but ratios taken from the all-station table, in percent
ax_global_ratios(variable="g_220", percent=True)

Code,1 Auto,2+ Autos,Keines
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
g_220,45,29,26


Multi-car owners are **underrepresented** at digital station panels (29% vs. 40%)

## Ratios per station

In [15]:
# Only the per-station table (.actual) is of interest here
ax_station_ratios(variable="g_220", percent=True).actual

Code,1 Auto,2+ Autos,Keines
Station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aarau,43,30,27
Basel SBB,44,20,36
Bellinzona,52,38,10
Bern,43,28,28
Biel/Bienne,50,25,25
Brig,51,40,9
Chur,48,35,17
Fribourg,31,56,14
Genève Aéroport,41,49,10
Genève Cornavin,53,18,29


Best stations are **Fribourg** (56%), **Zürich Flughafen** (50%), and **Genève Aéroport** (49%).

This compares to 29% over all stations, and 40% in the base population.

## Time slots at one station: Zürich Flughafen

In [57]:
# Hourly ratios in percent for Zürich Flughafen; only the observed table is kept
prozent = ax_ratios(
    variable="g_220", stations="Zürich Flughafen", time_scale="Hour", percent=True
).actual

# Keep time slots where "2+ Autos" exceeds 40 percent. The threshold matches the
# base-population share shown above; presumably it was chosen for that reason.
prozent.loc[prozent["2+ Autos"] > 40].round(1)

Unnamed: 0_level_0,Code,1 Auto,2+ Autos,Keines
DayOfWeek,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Monday,0,48,52,0
Monday,1,0,100,0
Monday,4,39,61,0
Monday,5,29,43,28
Monday,7,23,55,22
Monday,10,43,57,0
Monday,11,30,50,21
Monday,12,27,48,24
Monday,13,31,46,24
Monday,14,21,58,20


In [64]:
# Absolute person counts per code; decimals=0 makes spr_split return integers
personen = spr_split(stations="Zürich Flughafen", variable="g_220", type="abs", decimals=0)

In [33]:
# Inspect the raw Axinova rows behind the Monday 01h slot, which showed
# 100 percent "2+ Autos" in the table above
select_data(
    ax_data, Station="Zürich Flughafen", DayOfWeek="Monday", Hour="01", Variable="g_220"
).loc[:, ["Month", "Time", "Value"]]

Unnamed: 0,Month,Time,Value
0,7,01:00 - 01:15,1.44976
1,7,01:15 - 01:30,1.44976
2,7,01:30 - 01:45,1.44976


In [68]:
# Join percentages and person counts on their shared (DayOfWeek, Hour) index;
# the suffixes tell apart the code columns that exist in both tables
prozent.merge(personen, left_index=True, right_index=True, suffixes=(" [%]", " [Pers.]"))

Unnamed: 0_level_0,Unnamed: 1_level_0,1 Auto [%],2+ Autos [%],Keines [%],1 Auto [Pers.],2+ Autos [Pers.],Keines [Pers.],Total
DayOfWeek,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Monday,0,48,52,0,11,12,0,24
Monday,1,0,100,0,0,0,0,0
Monday,2,0,0,0,0,0,0,0
Monday,3,0,0,0,0,0,0,0
Monday,4,39,61,0,0,0,0,0
Monday,5,29,43,28,18,26,16,59
Monday,6,35,30,36,81,69,83,233
Monday,7,23,55,22,119,291,116,527
Monday,8,39,33,27,194,165,135,494
Monday,9,37,36,27,119,118,89,326


In [72]:
# Peek at the first raw SPR+ rows to see the available columns
spr_data.head(30)

Unnamed: 0,WT,Flaeche_ID,Anbieter,Spr_Flaeche_ID,Time,Total,Nummer,PF,Position-Nr.,Gebiet,Gebietsbez.,Gebiet Code,Adresszusatz,STM,EN-Nr.,NG-Nr.,Hour,ShortTime,DayOfWeek,Station
0,Dienstag,680759,1,1680759,00:00:00,34.656119,680759.0,Rail eBoard,DA01,26101.0,Zürich 1,ZH,"Gleis 41, Linien 5,6,7,9,12, Sektor B","Bahnhof, WaitingZone, Beem",416692.0,212748.0,0,00:00,Tuesday,Zürich HB
1,Donnerstag,680759,1,1680759,00:00:00,47.076751,680759.0,Rail eBoard,DA01,26101.0,Zürich 1,ZH,"Gleis 41, Linien 5,6,7,9,12, Sektor B","Bahnhof, WaitingZone, Beem",416692.0,212748.0,0,00:00,Thursday,Zürich HB
2,Freitag,680759,1,1680759,00:00:00,59.387391,680759.0,Rail eBoard,DA01,26101.0,Zürich 1,ZH,"Gleis 41, Linien 5,6,7,9,12, Sektor B","Bahnhof, WaitingZone, Beem",416692.0,212748.0,0,00:00,Friday,Zürich HB
3,Mittwoch,680759,1,1680759,00:00:00,41.019025,680759.0,Rail eBoard,DA01,26101.0,Zürich 1,ZH,"Gleis 41, Linien 5,6,7,9,12, Sektor B","Bahnhof, WaitingZone, Beem",416692.0,212748.0,0,00:00,Wednesday,Zürich HB
4,Montag,680759,1,1680759,00:00:00,41.005048,680759.0,Rail eBoard,DA01,26101.0,Zürich 1,ZH,"Gleis 41, Linien 5,6,7,9,12, Sektor B","Bahnhof, WaitingZone, Beem",416692.0,212748.0,0,00:00,Monday,Zürich HB
5,Samstag,680759,1,1680759,00:00:00,156.054594,680759.0,Rail eBoard,DA01,26101.0,Zürich 1,ZH,"Gleis 41, Linien 5,6,7,9,12, Sektor B","Bahnhof, WaitingZone, Beem",416692.0,212748.0,0,00:00,Saturday,Zürich HB
6,Sonntag,680759,1,1680759,00:00:00,203.845905,680759.0,Rail eBoard,DA01,26101.0,Zürich 1,ZH,"Gleis 41, Linien 5,6,7,9,12, Sektor B","Bahnhof, WaitingZone, Beem",416692.0,212748.0,0,00:00,Sunday,Zürich HB
7,Dienstag,680760,1,1680760,00:00:00,23.251568,680760.0,Rail ePanel,DD01,26101.0,Zürich 1,ZH,Halle Kopfperron,Bahnhof,417223.0,212986.0,0,00:00,Tuesday,Zürich HB
8,Donnerstag,680760,1,1680760,00:00:00,33.302476,680760.0,Rail ePanel,DD01,26101.0,Zürich 1,ZH,Halle Kopfperron,Bahnhof,417223.0,212986.0,0,00:00,Thursday,Zürich HB
9,Freitag,680760,1,1680760,00:00:00,42.149386,680760.0,Rail ePanel,DD01,26101.0,Zürich 1,ZH,Halle Kopfperron,Bahnhof,417223.0,212986.0,0,00:00,Friday,Zürich HB
