# Imports & Settings

In [1]:
# make imports from pa_lib possible (parent directory of file's directory)
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path.
file_dir = Path.cwd()
parent_dir = file_dir.parent
# Guard the append so that re-running this cell does not add duplicate entries.
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))

In [3]:
%load_ext autoreload
%autoreload
%matplotlib inline

import pandas as pd
import numpy as np
import qgrid
from datetime import datetime as dtt

from pa_lib.file import data_files, load_bin, store_bin, load_csv, write_xlsx, load_xlsx
# Data helpers from pa_lib (each name listed once).
from pa_lib.data import (
    calc_col_partitioned,
    clean_up_categoricals,
    unfactorize,
    flatten,
    replace_col,
    cond_col,
    desc_col,
    as_dtype,
    flatten_multi_index_cols,
)
from pa_lib.util import obj_size, cap_words, normalize_rows, clear_row_max
from pa_lib.log import time_log, info
from pa_lib.vis import dive

# Widen pandas' display limits: show long cell values in full,
# and show more rows and columns before truncating.
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 100
pd.options.display.max_columns = 200

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load EK data

In [324]:
# List the data files whose names match '*ek*' (the output shows size and mtime per file).
data_files('*ek*')

Unnamed: 0_level_0,size,mtime
name,Unnamed: 1_level_1,Unnamed: 2_level_1
bd_by_week.feather,24.1 MB,10.07.19 15:56:05
bd_cluster_ek_info.feather,4.1 MB,15.08.19 15:46:00
bd_ek_minmax.feather,1.9 MB,10.07.19 15:34:53
bd_long_by_week.feather,23.3 MB,10.07.19 10:25:41
ek_region.feather,2.1 MB,22.08.19 09:45:06
pv_by_week.feather,11.1 MB,22.05.19 18:01:14


In [325]:
# Load the customer ("EK") info table from its feather file.
ek_info = load_bin('bd_cluster_ek_info.feather')

2019-08-22 15:15:51 [INFO] Reading from file C:\Users\kpf\data\bd_cluster_ek_info.feather
  labels, = index.labels
2019-08-22 15:15:51 [INFO] Finished loading binary file in 0.02s (0.02s CPU)


# Prepare PLZ mapping data

In [326]:
# List the available Excel files (the output shows size and mtime per file).
data_files('*.xlsx')

Unnamed: 0_level_0,size,mtime
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Raumgliederungen.xlsx,112.0 KB,20.08.19 14:57:46
do-t-09.02-gwr-37.xlsx,414.7 KB,20.08.19 17:00:54
ppi_data.xlsx,74.5 MB,18.07.19 17:13:43
ppi_kamp.xlsx,122.3 KB,31.07.19 15:26:30
ppi_kamp_by_std_publ.xlsx,10.1 KB,18.07.19 17:14:16
ppi_kamp_hilo.xlsx,150.3 KB,18.07.19 17:15:20
ppi_kamp_hilo_by_std_publ.xlsx,18.7 KB,18.07.19 17:15:24
sbbnutz_columns_2014.xlsx,16.0 KB,03.07.19 16:42:07
sbbnutz_data_2014.xlsx,1.6 MB,03.07.19 16:42:07
test.xlsx,9.9 KB,11.07.19 16:57:20


In [327]:
# Read the "PLZ4" sheet and give its columns shorter names.
plz_column_names = {
    "PLZ4": "PLZ",
    "%_IN_GDE": "PRC",
    "KTKZ": "KANTON",
    "GDENAMK": "NAME",
}
plz = load_xlsx("do-t-09.02-gwr-37.xlsx", sheet_name="PLZ4").rename(
    columns=plz_column_names
)

2019-08-22 15:15:54 [INFO] Reading from file C:\Users\kpf\data\do-t-09.02-gwr-37.xlsx
2019-08-22 15:15:54 [INFO] Finished loading xlsx file in 0.77s (0.77s CPU)


In [328]:
# Preview the first 10 rows of the PLZ table (columns PLZ, PRC, KANTON, GDENR, NAME).
plz.head(10)

Unnamed: 0,PLZ,PRC,KANTON,GDENR,NAME
0,8914,100.0,ZH,1,Aeugst am Albis
1,8909,14.34,ZH,2,Affoltern am Albis
2,8910,85.66,ZH,2,Affoltern am Albis
3,8906,100.0,ZH,3,Bonstetten
4,6340,0.77,ZH,4,Hausen am Albis
5,8915,72.06,ZH,4,Hausen am Albis
6,8925,27.17,ZH,4,Hausen am Albis
7,8908,100.0,ZH,5,Hedingen
8,8926,100.0,ZH,6,Kappel am Albis
9,8934,100.0,ZH,7,Knonau


### PLZ are not uniquely mapped to towns, and not even to cantons!

In [329]:
# The 10 most frequent PLZ values; a count above 1 means the PLZ occurs in several rows.
plz.PLZ.value_counts().head(10)

1148    8
3053    7
9107    7
2345    7
6110    7
8500    6
3800    6
9427    6
2720    6
2523    6
Name: PLZ, dtype: int64

In [330]:
def collect(s, sep=","):
    """Join the distinct non-null values of ``s`` into one string.

    Values keep their order of first appearance, are converted with ``str``
    and are separated by ``sep``.
    """
    distinct_values = s.dropna().unique()
    return sep.join(str(value) for value in distinct_values)


# Per PLZ: the number of distinct cantons and their codes, largest counts first.
kanton_by_plz = plz.groupby("PLZ").agg({"KANTON": ["nunique", collect]})
kanton_by_plz.sort_values(("KANTON", "nunique"), ascending=False).head(10)

Unnamed: 0_level_0,KANTON,KANTON
Unnamed: 0_level_1,nunique,collect
PLZ,Unnamed: 1_level_2,Unnamed: 2_level_2
9107,3,"AR,AI,SG"
2814,3,"SO,BL,JU"
6390,3,"UR,OW,NW"
2827,3,"BE,SO,JU"
9428,3,"AR,AI,SG"
9427,3,"AR,AI,SG"
6010,3,"LU,OW,NW"
9450,3,"AR,AI,SG"
3254,2,"BE,SO"
9100,2,"AR,SG"


### Find best matches for town and canton by maximising PRC per PLZ

In [331]:
def main_features(df):
    """Return GDENR, NAME and KANTON of the row of ``df`` with the largest PRC."""
    best_row_label = df.PRC.idxmax()
    return df.loc[best_row_label, ["GDENR", "NAME", "KANTON"]]

# One row per PLZ: GDENR, NAME and KANTON of the entry with the highest PRC for that PLZ.
plz_unique = plz.groupby("PLZ").apply(main_features)

# Load data on economic regions

In [332]:
# Load the "Daten" sheet and clean it up in a single chain.
reg = (
    load_xlsx("Raumgliederungen.xlsx", sheet_name="Daten", skiprows=0, header=1)
    # drop unused: the row labelled 0 and the two number columns
    .drop(0, axis="index")
    .drop(["Bezirks-nummer", "Kantons-nummer"], axis="columns")
    # rename columns
    .rename(
        columns={
            "BFS Gde-nummer": "GDENR",
            "Gemeindename": "NAME",
            "Kanton": "KANTON",
            "Bezirksname": "BEZIRK",
            "Arbeitsmarktgrossregionen 2018": "GROSSREGION_ID",
            "Arbeitsmarktregionen 2018": "REGION_ID",
        }
    )
    # fix data types
    .astype({"GDENR": "int64", "GROSSREGION_ID": "int64", "REGION_ID": "int64"})
)

2019-08-22 15:16:02 [INFO] Reading from file C:\Users\kpf\data\Raumgliederungen.xlsx
2019-08-22 15:16:02 [INFO] Finished loading xlsx file in 0.19s (0.19s CPU)


In [333]:
# Preview the cleaned municipality table.
reg.head()

Unnamed: 0,GDENR,NAME,KANTON,BEZIRK,GROSSREGION_ID,REGION_ID
1,1,Aeugst am Albis,ZH,Affoltern,12,12031
2,2,Affoltern am Albis,ZH,Affoltern,12,12031
3,3,Bonstetten,ZH,Affoltern,12,12034
4,4,Hausen am Albis,ZH,Affoltern,11,11060
5,5,Hedingen,ZH,Affoltern,12,12031


### Grossregionen

In [334]:
# Lookup table for the "Grossregionen": the sheet's first column becomes the index
# (index_col=0), and the second spreadsheet row is used as the header (header=1).
reg_grossreg = load_xlsx(
    "Raumgliederungen.xlsx",
    sheet_name="CH1+CL_GBAE2018+1.0",
    skiprows=0,
    header=1,
    index_col=0,
)

2019-08-22 15:16:04 [INFO] Reading from file C:\Users\kpf\data\Raumgliederungen.xlsx
2019-08-22 15:16:04 [INFO] Finished loading xlsx file in 0.17s (0.17s CPU)


In [335]:
# Preview the Grossregionen lookup (index "Code", column "Label").
reg_grossreg.head()

Unnamed: 0_level_0,Label
Code,Unnamed: 1_level_1
1,Region Genf
2,Region Lausanne
3,Region Neuenburg
4,Region Freiburg
5,Region Biel–Jura


### Regionen

In [336]:
# Lookup table for the "Regionen": the sheet's first column becomes the index
# (index_col=0), and the second spreadsheet row is used as the header (header=1).
reg_reg = load_xlsx(
    "Raumgliederungen.xlsx",
    sheet_name="CH1+CL_BAE2018+1.0",
    skiprows=0,
    header=1,
    index_col=0,
)

2019-08-22 15:16:06 [INFO] Reading from file C:\Users\kpf\data\Raumgliederungen.xlsx
2019-08-22 15:16:06 [INFO] Finished loading xlsx file in 0.17s (0.17s CPU)


In [337]:
# Preview the Regionen lookup (index "Code", column "Label").
reg_reg.head()

Unnamed: 0_level_0,Label
Code,Unnamed: 1_level_1
1011,Vernier–Lancy
1012,Genève
1013,Le Grand-Saconnex
1014,Nyon
1015,Thônex–Chêne-Bougeries


### Merge Grossregionen and Regionen

In [338]:
# Look up the label for every municipality's region / Grossregion id.
region_labels = reg_reg.loc[reg.REGION_ID].values
grossregion_labels = reg_grossreg.loc[reg.GROSSREGION_ID].values

# Replace the id columns with the labels and index the table by GDENR.
# Note: this rebinds `reg` and drops the id columns it reads from.
reg = (
    reg.assign(REGION=region_labels, GROSSREGION=grossregion_labels)
    .drop(columns=["GROSSREGION_ID", "REGION_ID"])
    .set_index("GDENR")
)

In [339]:
# Preview the municipality table with region labels, indexed by GDENR.
reg.head()

Unnamed: 0_level_0,NAME,KANTON,BEZIRK,REGION,GROSSREGION
GDENR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Aeugst am Albis,ZH,Affoltern,Dietikon–Schlieren,Region Zürich
2,Affoltern am Albis,ZH,Affoltern,Dietikon–Schlieren,Region Zürich
3,Bonstetten,ZH,Affoltern,Horgen–Wädenswil,Region Zürich
4,Hausen am Albis,ZH,Affoltern,Zug,Zentralschweiz
5,Hedingen,ZH,Affoltern,Dietikon–Schlieren,Region Zürich


# Add region info to customers

### Find a GDENR for each Swiss customer

In [340]:
# Boolean mask over ek_info rows: True where the customer's country is "SCHWEIZ".
swiss_customer = ek_info.EK_Land == "SCHWEIZ"

#### First, match by PLZ

In [341]:
# Customer PLZ as int64; values that cannot be parsed as numbers become -1
# (presumably never a key of plz_unique, so those rows get no match).
ek_plz_numeric = (
    pd.to_numeric(ek_info.EK_Plz, errors="coerce").fillna(-1).astype("int64")
)
ek_info["GDENR_PLZ"] = plz_unique.reindex(ek_plz_numeric).GDENR.values

#### Then, match by full town name

In [342]:
# For every town name, take the first GDENR listed in the PLZ table.
gdenr_by_name = plz.groupby("NAME")[["GDENR"]].first()

# Look up each customer's town name; names not in the table yield NaN.
ek_info["GDENR_NAME"] = gdenr_by_name.reindex(ek_info.EK_Ort).values

#### Then, match by the first word of the town name (e.g. for *Laufen (BL)*)

In [343]:
def firstnames(s):
    """Return the part of each string in ``s`` before its first space.

    Missing values are treated as empty strings.
    """
    filled = s.fillna("")
    # Each element becomes a (head, separator, tail) tuple; keep the head.
    parts = filled.str.partition(expand=False)
    return parts.map(lambda pieces: pieces[0])


# For every first word of a town name, take the first GDENR listed in the PLZ table.
plz_with_firstname = plz.assign(FIRSTNAME=firstnames(plz.NAME))
gdenr_by_firstname = plz_with_firstname.groupby("FIRSTNAME")[["GDENR"]].first()

# Look up the first word of each customer's town name.
ek_firstname = firstnames(ek_info.EK_Ort)
ek_info["GDENR_FIRSTNAME"] = gdenr_by_firstname.reindex(ek_firstname).values

#### Finally, match some special cases by remapping them to the closest town, then match by full name

In [344]:
# Customer town names that have no entry of their own, mapped to the town to use instead.
special_town_remap = {
    "Schönbühl Einkaufszentrum": "Urtenen-Schönbühl",
    "Emmenbrücke 1": "Emmen",
    "Glattzentrum b. Wallisellen": "Wallisellen",
    "Zürich-Flughafen": "Kloten",
    "Büsingen": "Schaffhausen",
    "Serfontana": "Chiasso",
    "Triesen": "Sevelen",
    "Campione d'Italia": "Bissone",
}

# Apply the remapping, then match by full town name as before.
remapped_ort = ek_info.EK_Ort.replace(special_town_remap)
ek_info["GDENR_SPECIAL"] = gdenr_by_name.reindex(remapped_ort).values

#### Merge all matches by successive fallback

In [345]:
# Take the first available match per customer, trying the candidate columns in this order.
fallback_columns = ["GDENR_PLZ", "GDENR_NAME", "GDENR_FIRSTNAME", "GDENR_SPECIAL"]
gdenr = ek_info[fallback_columns[0]]
for fallback_column in fallback_columns[1:]:
    gdenr = gdenr.fillna(ek_info[fallback_column])
ek_info["GDENR"] = gdenr

# Delete matches of non-Swiss customers (e.g. by PLZ or first name)
ek_info.loc[~swiss_customer, "GDENR"] = None

# Remove the per-strategy candidate columns now that they are merged.
ek_info.drop(fallback_columns, axis="columns", inplace=True)

#### Sanity check: this should be empty!

In [346]:
# Swiss customers that still have no GDENR after all matching steps.
unmatched_swiss = ek_info.GDENR.isnull() & swiss_customer
not_matched = ek_info.loc[unmatched_swiss, ["EK_Plz", "EK_Ort"]]

not_matched

Unnamed: 0,EK_Plz,EK_Ort


### Join region info on GDENR

In [347]:
# Left-join the region attributes onto each customer via GDENR, then drop the join key.
customer_gdenr = ek_info[["Endkunde_NR", "GDENR"]]
ek_region = (
    customer_gdenr.merge(reg, how="left", on="GDENR")
    .drop(columns="GDENR")
    .reset_index(drop=True)
)

ek_region.head(20)

Unnamed: 0,Endkunde_NR,NAME,KANTON,BEZIRK,REGION,GROSSREGION
0,100034,Winterthur,ZH,Winterthur,Winterthur,Region Zürich
1,100039,Uster,ZH,Uster,Uster–Dübendorf,Region Zürich
2,100061,Winterthur,ZH,Winterthur,Winterthur,Region Zürich
3,100064,Uster,ZH,Uster,Uster–Dübendorf,Region Zürich
4,100066,Uster,ZH,Uster,Uster–Dübendorf,Region Zürich
5,100083,Volketswil,ZH,Uster,Uster–Dübendorf,Region Zürich
6,100092,Zürich,ZH,Zürich,Zürich,Region Zürich
7,100095,Wetzikon (ZH),ZH,Hinwil,Wetzikon (ZH),Region Zürich
8,100097,Dübendorf,ZH,Uster,Uster–Dübendorf,Region Zürich
9,100098,Wallisellen,ZH,Bülach,Uster–Dübendorf,Region Zürich


#### Sanity check: this should be empty!

In [348]:
# Non-Swiss customers that nevertheless received region info (KANTON is not null).
# NOTE(review): the two boolean Series are combined by index label, so this presumably
# relies on ek_info having the same 0..n-1 index that ek_region got from reset_index — confirm.
ek_info.loc[~swiss_customer & ek_region.KANTON.notnull()]

Unnamed: 0,Endkunde_NR,Endkunde,EK_Aktiv,EK_Land,EK_Plz,EK_Ort,Agentur,EK_BG,EK_BG_ID,Auftrag_BG_ID,Auftrag_BG_Anz,Kamp_Erfass_Jahr_min,Kamp_Erfass_Jahr_max,GDENR


# Write out result

In [349]:
# Persist the customer-to-region table as 'ek_region.feather'.
store_bin(ek_region, 'ek_region.feather')

2019-08-22 15:16:55 [INFO] Writing to file C:\Users\kpf\data\ek_region.feather
2019-08-22 15:16:55 [INFO] Written 2.1 MB
2019-08-22 15:16:55 [INFO] Finished storing binary file in 0.01s (0.02s CPU)
