In [57]:
## Make imports from pa_lib possible: add the parent of the current working directory to sys.path
import sys
from pathlib import Path

# Resolve the parent of the current working directory; pa_lib is expected to
# live there (this relies on the kernel being started in the notebook's folder).
file_dir = Path.cwd()
parent_dir = file_dir.parent
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))

%load_ext autoreload
%autoreload

import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

from pa_lib.file import (
    project_dir,
    load_bin,
    load_pickle,
    load_xlsx,
    store_bin,
)
from pa_lib.data import as_dtype, dtFactor, lookup, desc_col
from pa_lib.util import cap_words, collect, value, normalize_rows

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data sets

In [35]:
# Load both input data sets while the "axinova" project directory is active
# (per the log output, file names are resolved inside that project's data folder).
# NOTE(review): load_pickle presumably unpickles the file; unpickling can execute
# arbitrary code, so only use it on trusted, locally produced files.
with project_dir("axinova"):
    ax_data = load_bin("ax_data.feather")
    spr_data = load_pickle("spr_data.pkl")

15:24:08 [INFO] Started loading binary file ...
15:24:08 [INFO] Reading from file C:\Users\kpf\data\axinova\ax_data.feather
15:24:08 [INFO] ... finished loading binary file in 0.66s (1.12s CPU)
15:24:08 [INFO] Started loading pickle file ...
15:24:08 [INFO] Reading from file C:\Users\kpf\data\axinova\spr_data.pkl
15:24:08 [INFO] ... finished loading pickle file in 0.03s (0.0s CPU)


## Make weekday fields compatible

In [36]:
# German weekday names found in spr_data.WT -> English names (the form used by
# ax_data's DayOfWeek column). The insertion order is the calendar order and is
# reused below as the category order.
weekday_de_to_en = {
    "Montag": "Monday",
    "Dienstag": "Tuesday",
    "Mittwoch": "Wednesday",
    "Donnerstag": "Thursday",
    "Freitag": "Friday",
    "Samstag": "Saturday",
    "Sonntag": "Sunday",
}
weekday_order = list(weekday_de_to_en.values())

# Translate first, then impose the Monday..Sunday ordering on the categories.
weekday_en = spr_data.WT.map(weekday_de_to_en)
spr_data["DayOfWeek"] = weekday_en.cat.reorder_categories(
    weekday_order, ordered=True
)

# Functions to look up ax-ratios for a given site

In [126]:
%%time
# Keys identifying one time slot of one variable; "Code" is added for the totals.
slot_keys = ["Variable", "DayOfWeek", "Hour"]

# Total "Value" per (Variable, DayOfWeek, Hour, Code). "Station" is not a
# grouping key, so the totals are taken over all stations in ax_data.
global_codes_by_hour = ax_data.groupby(
    slot_keys + ["Code"], observed=True, as_index=False
)["Value"].agg("sum")

# Share of each Code within its (Variable, DayOfWeek, Hour) slot.
# transform("sum") broadcasts the slot total back to every row, so the ratio is
# a single vectorised division instead of a Python lambda called once per group.
slot_totals = global_codes_by_hour.groupby(slot_keys, observed=True)[
    "Value"
].transform("sum")
global_codes_by_hour["Ratio"] = global_codes_by_hour["Value"] / slot_totals

Wall time: 7.49 s


In [125]:
# Preview the first rows of the per-slot code totals and their ratios.
global_codes_by_hour.head(20)

Unnamed: 0,Variable,DayOfWeek,Hour,Code,Value,Ratio
0,g_220,Monday,0,1 Auto,52.527501,0.379786
1,g_220,Monday,0,2+ Autos,43.814252,0.316787
2,g_220,Monday,0,Keines,41.966582,0.303428
3,g_220,Monday,1,1 Auto,13.416857,0.505276
4,g_220,Monday,1,2+ Autos,6.68148,0.251623
5,g_220,Monday,1,Keines,6.455201,0.243101
6,g_220,Monday,2,1 Auto,5.733443,0.374364
7,g_220,Monday,2,2+ Autos,0.922794,0.060254
8,g_220,Monday,2,Keines,8.658931,0.565383
9,g_220,Monday,3,1 Auto,5.37179,0.486027


In [131]:
# Global ratios for the variable "md_ek" only, reshaped to one row per
# (DayOfWeek, Hour) and one column per Code; absent combinations show as 0.
md_ek_rows = global_codes_by_hour.query("Variable == 'md_ek'")
md_ek_rows.pivot_table(
    index=["DayOfWeek", "Hour"],
    columns="Code",
    values="Ratio",
    fill_value=0,
)

Unnamed: 0_level_0,Code,Keine Angabe,Mehr als 12'000 CHF,Weniger als 3'000 CHF,Zwischen 3'000 und 4'500 CHF,Zwischen 4'501 und 6'000 CHF,Zwischen 6'001 und 9'000 CHF,Zwischen 9'001 und 12'000 CHF
DayOfWeek,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Monday,00,0.051526,0.276001,0.042410,0.030586,0.203998,0.245040,0.150440
Monday,01,0.300081,0.136359,0.000000,0.019975,0.023928,0.225946,0.293712
Monday,02,0.047858,0.060254,0.000000,0.511991,0.000000,0.243272,0.136625
Monday,03,0.000000,0.559733,0.000000,0.143968,0.118148,0.083492,0.094659
Monday,04,0.018113,0.370464,0.000000,0.070763,0.043050,0.460400,0.037211
...,...,...,...,...,...,...,...,...
Sunday,19,0.124176,0.186511,0.075698,0.065671,0.116306,0.205838,0.225800
Sunday,20,0.164018,0.190455,0.061113,0.079926,0.093518,0.177203,0.233766
Sunday,21,0.159188,0.208332,0.049730,0.101262,0.071455,0.162842,0.247190
Sunday,22,0.120314,0.183764,0.049885,0.126304,0.124704,0.178004,0.217025


In [132]:
def ax_var(station, variable, by):
    """Return the row-normalised Code distribution for one station and variable.

    Filters the notebook-level ``ax_data`` frame to the rows matching
    ``station`` and ``variable``, sums ``Value`` for every (DayOfWeek, ``by``)
    row and ``Code`` column (combinations without data count as 0), and passes
    that table through ``normalize_rows``.

    Parameters
    ----------
    station
        Value compared against the ``Station`` column, e.g. ``"Zürich HB"``.
    variable
        Value compared against the ``Variable`` column, e.g. ``"md_ek"``.
    by
        Name of the ``ax_data`` column used as the second index level next to
        ``DayOfWeek``, e.g. ``"Hour"``.

    Returns
    -------
    Whatever ``normalize_rows`` returns for the pivot table; presumably the
    same table with each row scaled to sum to 1 (helper lives in pa_lib.util,
    confirm there).
    """
    # Plain string on purpose: `@station` / `@variable` are resolved by
    # DataFrame.query from the local scope, so no f-string is needed.
    subset = ax_data.query("Station == @station and Variable == @variable")
    code_totals = subset.pivot_table(
        values="Value",
        index=["DayOfWeek", by],
        columns="Code",
        aggfunc="sum",
        fill_value=0,
    )
    # TODO(review): an earlier draft intended to also derive an expected ratio
    # via `global_ratio(variable, by)`; that helper is not defined in the
    # visible notebook, so only the actual ratios are returned.
    return normalize_rows(code_totals)

In [133]:
# Example: Code distribution of variable "md_ek" at station "Zürich HB",
# per DayOfWeek and Hour.
ax_var(station="Zürich HB", variable="md_ek", by="Hour")

Unnamed: 0_level_0,Code,Keine Angabe,Mehr als 12'000 CHF,Weniger als 3'000 CHF,Zwischen 3'000 und 4'500 CHF,Zwischen 4'501 und 6'000 CHF,Zwischen 6'001 und 9'000 CHF,Zwischen 9'001 und 12'000 CHF
DayOfWeek,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Monday,00,0.000000,0.289273,0.071409,0.065591,0.188280,0.327460,0.057987
Monday,01,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000
Monday,02,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000
Monday,03,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000
Monday,04,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...
Sunday,19,0.082792,0.187649,0.078412,0.076645,0.134988,0.171838,0.267676
Sunday,20,0.138020,0.224268,0.077441,0.096626,0.142862,0.161085,0.159699
Sunday,21,0.174071,0.223953,0.045447,0.091135,0.083664,0.169344,0.212386
Sunday,22,0.081881,0.107983,0.056550,0.159811,0.140871,0.242183,0.210719


In [48]:
# NOTE(review): `tmp` is not defined in any visible cell, and this cell's
# execution count (48) is lower than the cells above it. It looks like leftover
# scratch work; unless `tmp` is defined elsewhere, this raises NameError on a
# fresh kernel. Confirm and remove, or define `tmp` explicitly.
tmp.columns

Index(['Time', 'Variable', 'Code', 'Value', 'Year', 'Month', 'logValue',
       'VarDesc', 'TimeSlot', 'Hour', 'is_weekend', 'is_day', 'is_rush',
       'is_day_no_rush', 'TimeSlot_cat', 'StationSprache'],
      dtype='object')