# Net Zahlen Reservationen/Offerten (Vertrag = Ja) basierend auf erfassdatum

In [101]:

import numpy as np
import pandas as pd

#######################
## Datenaufbereitung ##
#######################


# Make imports from pa_lib possible: the parent of the current working
# directory is appended to sys.path. This relies on the kernel's working
# directory being the notebook's own directory.
import sys
from pathlib import Path

file_dir = Path.cwd()
parent_dir = file_dir.parent
sys.path.append(str(parent_dir))


from IPython.display import display
# Lift pandas' display limits so frames are rendered without truncation.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)
del _display_option

## Libraries & Settings ##
from pa_lib.file import load_bin
from pa_lib.util import cap_words
from pa_lib.log import time_log, info

import datetime as dt
from dateutil.relativedelta import relativedelta

from pa_lib.data import (
    clean_up_categoricals,
    unfactorize,
)

In [2]:
def load_booking_data():
    """Load the booking data and keep only rows with a positive net amount.

    Reads ``vkprog\\bd_data.feather`` via ``load_bin``, renames every column
    with ``cap_words(name, sep="_")``, drops rows whose ``Netto`` is not
    greater than zero and passes the result through
    ``clean_up_categoricals``.

    Returns:
        The filtered booking frame.
    """
    bookings = load_bin("vkprog\\bd_data.feather")
    bookings = bookings.rename(columns=lambda name: cap_words(name, sep="_"))
    has_positive_net = bookings.Netto > 0
    return clean_up_categoricals(bookings.loc[has_positive_net])


In [3]:
# Booking rows with a positive Netto value; used by the exploratory cells below.
raw_data_bookings = load_booking_data()

2019-10-24 09:05:53 [INFO] Started loading binary file
2019-10-24 09:05:53 [INFO] Reading from file C:\Users\stc\data\vkprog\bd_data.feather
2019-10-24 09:05:53 [INFO] Finished loading binary file in 0.18s (0.89s CPU)


In [4]:
# Largest number of rows sharing one (KV_NR, Agps_NR) pair: count the
# non-null Endkunde_NR per pair and show only the biggest group.
(
    raw_data_bookings[["KV_NR", "Agps_NR", "Endkunde_NR"]]
    .groupby(["KV_NR", "Agps_NR"])
    .count()
    .sort_values("Endkunde_NR", ascending=False)
    .head(1)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Endkunde_NR
KV_NR,Agps_NR,Unnamed: 2_level_1
484261,857753,1


In [5]:
# Five random bookings of one customer. No random_state is passed, so the
# displayed rows change on every run.
raw_data_bookings.query("Endkunde_NR == 473515").sample(5)

Unnamed: 0,Endkunde_NR,Endkunde,EK_Abc,EK_Boni,EK_Plz,EK_Ort,EK_Land,EK_HB_Apg_Kurzz,EK_Kam_Betreut,EK_Aktiv,Agentur,AG_Hauptbetreuer,Verkaufsberater,Endkunde_Branchengruppe_ID,Endkunde_Branchengruppe,Endkunde_Nbranchengruppe_ID,Endkunde_Nbranchengruppe,Endkunde_Branchenkat_ID,Endkunde_Branchenkat,Endkunde_Nbranchenkat_ID,Endkunde_Nbranchenkat,Auftrag_Branchengruppe_ID,Auftrag_Branchengruppe,Auftrag_Nbranchengruppe_ID,Auftrag_Nbranchengruppe,Auftrag_Branchenkat_ID,Auftrag_Branchenkat,Auftrag_Nbranchenkat_ID,Auftrag_Nbranchenkat,Agps_NR,Segment,KV_NR,KV_Typ,Kampagnen_Status,Kampagne_Erfassungsdatum,Kampagne_Beginn,Auftragsart,Res_Dat,Annullation_Datum,Aush_Von,Dauer,Vertrag,Brutto,Netto,Agglo,PF,Kamp_Beginn_Jahr,Kamp_Beginn_KW,Kamp_Beginn_KW_2,Kamp_Beginn_KW_4,Kamp_Erfass_Jahr,Kamp_Erfass_KW,Kamp_Erfass_KW_2,Kamp_Erfass_KW_4
1384164,473515,Publifutura Affichage Italia s.r.l.,B,gut,22038,Tavernerio (CO),ITALIA,TRA,0,1,Publifutura s.r.l.,TRA,TRA,720,WG - Dienstleistung,,,15,WB - Dienstleistung,,,705,WG - Lose / Lotterien,,,12,WB - Finanzwirtschaft / Versicherung,,,2514180,APG|SGA,912799,KPG,4,2017-06-16,2017-07-03,Promotion,2017-06-16,NaT,2017-07-03,14,Nein,454,227,"91362,93787,93851,A0261,A0351,A1061,A2701,A500...","City ePanel,F12,F200,F200 Traffic,F200L,F24,F4...",2017,27,27,25,2017,24,23,21
852485,473515,Publifutura Affichage Italia s.r.l.,B,gut,22038,Tavernerio (CO),ITALIA,TRA,0,1,Publifutura Srl,ROS,ROS,720,WG - Dienstleistung,,,15,WB - Dienstleistung,,,405,WG - Reisen / Hotels / Sanatorien,,,10,WB - Freizeit / Touristik,,,1054646,APG|SGA,542436,KPG,4,2008-10-13,2009-02-02,Promotion,2008-11-07,NaT,2009-02-02,14,Nein,1107,554,"91362,93787,93851,A0261,A0351,A1061,A2701,A500...","City ePanel,F12,F200,F200 Traffic,F200L,F24,F4...",2009,6,5,5,2008,42,41,41
595716,473515,Publifutura Affichage Italia s.r.l.,B,gut,22038,Tavernerio (CO),ITALIA,TRA,0,1,,,TRA,720,WG - Dienstleistung,,,15,WB - Dienstleistung,,,720,WG - Dienstleistung,,,15,WB - Dienstleistung,,,2496276,APG|SGA,908625,KPG,4,2017-05-08,2017-05-22,Kommerziell,2017-05-11,NaT,2017-05-29,7,Nein,279,223,"91362,93787,93851,A0261,A0351,A1061,A2701,A500...","City ePanel,F12,F200,F200 Traffic,F200L,F24,F4...",2017,21,21,21,2017,19,19,17
1023353,473515,Publifutura Affichage Italia s.r.l.,B,gut,22038,Tavernerio (CO),ITALIA,TRA,0,1,Publifutura s.r.l.,TRA,TRA,720,WG - Dienstleistung,,,15,WB - Dienstleistung,,,400,WG - Automarkt,,,13,WB - Verkehr,,,2744203,APG|SGA,958346,KPG,4,2018-07-04,2018-07-23,Promotion,2018-07-04,NaT,2018-07-23,14,Nein,1965,1179,"91362,93787,93851,A0261,A0351,A1061,A2701,A500...","City ePanel,F12,F200,F200 Traffic,F200L,F24,F4...",2018,30,29,29,2018,27,27,25
1140895,473515,Publifutura Affichage Italia s.r.l.,B,gut,22038,Tavernerio (CO),ITALIA,TRA,0,1,Publifutura Affichage Italia s.r.l.,TRA,TRA,720,WG - Dienstleistung,,,15,WB - Dienstleistung,,,901,WG - Handel / Grossverteiler,901.0,WG - Handel / Grossverteiler,14,WB - Handel,14.0,WB - Handel,2960486,APG|SGA,997939,KPG,4,2019-05-13,2019-06-03,Kommerziell,2019-05-13,NaT,2019-06-03,14,Nein,1578,1136,"91362,93787,93851,A0261,A0351,A1061,A2701,A500...","City ePanel,F12,F200,F200 Traffic,F200L,F24,F4...",2019,23,23,21,2019,20,19,17


# PLZ STUFF

In [6]:
from pa_lib.data import desc_col

In [14]:
# Customer table; the cells below read Endkunde_NR, PLZ, GEMEINDE, KANTON and
# EK_Land from it.
ek_info = load_bin("vkprog\\ek_info.feather")

2019-10-24 09:07:48 [INFO] Started loading binary file
2019-10-24 09:07:48 [INFO] Reading from file C:\Users\stc\data\vkprog\ek_info.feather
2019-10-24 09:07:48 [INFO] Finished loading binary file in 0.03s (0.05s CPU)


In [486]:
## Contains the sales areas (Verkaufsgebiete)

# Load the PLZ table and cast PLZ to int64 in one step. Assigning through
# `plz_data.loc[:, "PLZ"] = ...` writes into the existing column on newer
# pandas versions and may leave its dtype unchanged; `astype` with a mapping
# always returns a frame with the requested dtype. It also keeps this cell
# idempotent, because the loaded frame is never mutated in place.
# NOTE(review): presumably PLZ must be an integer so it can be joined against
# the customers' PLZ column — confirm against ek_info.
plz_data = load_bin("vkprog\\plz_data.feather").astype({"PLZ": "int64"})

2019-10-25 11:18:48 [INFO] Started loading binary file
2019-10-25 11:18:48 [INFO] Reading from file C:\Users\stc\data\vkprog\plz_data.feather
2019-10-25 11:18:48 [INFO] Finished loading binary file in 0.0s (0.0s CPU)


In [487]:
## Base table: one row per entry of ek_info, reduced to the columns that the
## sales-area matching below works with.
col_list = ["Endkunde_NR", "PLZ", "GEMEINDE", "KANTON", "EK_Land"]

cust_current = ek_info[col_list]

In [523]:
## International customers get mapped to MAT, INTERNATIONAL

# Find the international customers: every row whose country is not SCHWEIZ.
row_select = cust_current.loc[:, "EK_Land"] != "SCHWEIZ"

# Map international customers to (MAT, INTERNATIONAL). `assign` returns a new
# frame, so nothing is written into a slice of cust_current (the original
# `.loc[...] = value` on the slice triggers SettingWithCopyWarning).
cust_matched_international = cust_current.loc[row_select, :].assign(
    VERKAUFS_GEBIETS_CODE="INTERNATIONAL",
    VB_VKGEB="MAT",
)

# matched, cleanup
col_list = ["Endkunde_NR", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]
cust_matched = cust_matched_international.loc[:, col_list].copy()

# unmatched, cleanup: every customer whose number is not among the matched
# ones. The mask is built directly on cust_current, so it shares its index;
# a mask taken from a merged frame carries a fresh 0..n-1 index and only
# lines up by accident.
row_select = ~cust_current.loc[:, "Endkunde_NR"].isin(
    cust_matched_international.loc[:, "Endkunde_NR"]
)
cust_unmatched = cust_current.loc[row_select, :]

print("cust_matched_international:", cust_matched.shape)
print("            cust_unmatched:", cust_unmatched.shape)
print("              cust_current:", cust_current.shape)

cust_matched_international: (1178, 3)
            cust_unmatched: (31059, 5)
              cust_current: (32237, 5)


In [524]:
def plz_data_olap_counter(column):
    """Count the rows of ``plz_data`` per distinct value of ``column``.

    Args:
        column: Name of the ``plz_data`` column to group by.

    Returns:
        A frame with the grouping column and a ``CNT`` column holding the
        number of non-null ``VERKAUFS_GEBIETS_CODE`` entries in each group.
    """
    per_group = plz_data.groupby(column)["VERKAUFS_GEBIETS_CODE"].count()
    return per_group.rename("CNT").reset_index()

In [526]:
def report_success():
    """Print the current matching progress.

    Reads the module-level frames ``cust_matched``, ``cust_unmatched`` and
    ``cust_current`` and prints their shapes, followed by the number of
    matched rows divided by the number of rows in ``cust_current``.
    """
    progress = (
        ("cust_matched:  ", cust_matched),
        ("cust_unmatched:", cust_unmatched),
        ("cust_current:  ", cust_current),
    )
    for label, frame in progress:
        print(label, frame.shape)
    print("Matched-Perc:  ", cust_matched.shape[0] / cust_current.shape[0])

In [527]:
def collect(s, sep=","):
    """Join the distinct non-null values of a Series into one string.

    Args:
        s: Series whose values are collected.
        sep: Separator placed between the values.

    Returns:
        The unique non-null values, in order of first appearance, converted
        with ``str`` and joined by ``sep``.
    """
    distinct_values = s.dropna().unique()
    return sep.join(str(value) for value in distinct_values)

- Only Swiss customers are left to match now.
- Next step: find the ``FRAKTION`` values that are unique in ``plz_data``, but first check whether this is worthwhile.

In [525]:
# One count table per candidate key: how many plz_data rows share each
# FRAKTION, PLZ and ORT value (column CNT).
fraktion_cnt = plz_data_olap_counter("FRAKTION")
plz_cnt      = plz_data_olap_counter("PLZ")
ort_cnt      = plz_data_olap_counter("ORT")

In [558]:
# Per-table maxima, kept for reference:
#max(fraktion_cnt.loc[:,"CNT"])
#max(plz_cnt.loc[:,"CNT"])
#max(ort_cnt.loc[:,"CNT"])

# NOTE(review): the three CNT columns come from separate count tables, each
# with its own fresh row index from reset_index(), so `+` pairs rows by row
# label and not by any shared key — the sum mixes unrelated FRAKTION / PLZ /
# ORT entries. Confirm this is the intended check; the per-table maxima above
# may be what was meant.
max(fraktion_cnt.loc[:,"CNT"] + plz_cnt.loc[:,"CNT"] + ort_cnt.loc[:,"CNT"])

35.0

In [546]:
# Only keys that occur exactly `unique_by` times in plz_data are used here.
unique_by = 1

# NOTE: this cell rebinds cust_matched and cust_unmatched, so it is not
# idempotent — re-run the international-customer cell before re-running it.

## Match by FRAKTION
# Sales area per FRAKTION name that occurs `unique_by` times in plz_data.
plz_unique_fraktion = (
    plz_data.merge(fraktion_cnt, on="FRAKTION", how="left")
    .query(f"CNT == {unique_by}")  # only unique ones
    .groupby("FRAKTION")
    .agg({"VERKAUFS_GEBIETS_CODE": collect, "VB_VKGEB": collect})
    .reset_index()
)

# Left-join the unmatched customers on their GEMEINDE name.
cust_container_fraktion = cust_unmatched.merge(
    plz_unique_fraktion[["FRAKTION", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]],
    left_on="GEMEINDE",
    right_on="FRAKTION",
    how="left",
)

row_select = cust_container_fraktion["VERKAUFS_GEBIETS_CODE"].isna()  # unmatched rows!
cust_matched_fraktion = cust_container_fraktion.loc[
    ~row_select, ["Endkunde_NR", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]
]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
cust_matched = pd.concat([cust_matched, cust_matched_fraktion]).drop_duplicates()
cust_unmatched = cust_container_fraktion.loc[
    row_select, ["Endkunde_NR", "PLZ", "GEMEINDE", "KANTON", "EK_Land"]
]

print(f"FRAKTION, unique by {unique_by}")
report_success()

## Match by PLZ
# Sales area per PLZ that occurs `unique_by` times in plz_data.
plz_unique_plz = (
    plz_data.merge(plz_cnt, on="PLZ", how="left")
    .query(f"CNT == {unique_by}")  # only unique ones
    .groupby("PLZ")
    .agg({"VERKAUFS_GEBIETS_CODE": collect, "VB_VKGEB": collect})
    .reset_index()
)

cust_container_plz = cust_unmatched.merge(
    plz_unique_plz[["PLZ", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]],
    on="PLZ",
    how="left",
)

row_select = cust_container_plz["VERKAUFS_GEBIETS_CODE"].isna()  # unmatched rows!
cust_matched_plz = cust_container_plz.loc[
    ~row_select, ["Endkunde_NR", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]
]

cust_matched = pd.concat([cust_matched, cust_matched_plz]).drop_duplicates()
cust_unmatched = cust_container_plz.loc[
    row_select, ["Endkunde_NR", "PLZ", "GEMEINDE", "KANTON", "EK_Land"]
]

print(f"PLZ, unique by {unique_by}")
report_success()

## Match by ORT
# Sales area per ORT name that occurs `unique_by` times in plz_data.
plz_unique_ort = (
    plz_data.merge(ort_cnt, on="ORT", how="left")
    .query(f"CNT == {unique_by}")  # only unique ones
    .groupby("ORT")
    .agg({"VERKAUFS_GEBIETS_CODE": collect, "VB_VKGEB": collect})
    .reset_index()
)

cust_container_ort = cust_unmatched.merge(
    plz_unique_ort[["ORT", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]],
    left_on="GEMEINDE",
    right_on="ORT",
    how="left",
)

row_select = cust_container_ort["VERKAUFS_GEBIETS_CODE"].isna()  # unmatched rows!
cust_matched_ort = cust_container_ort.loc[
    ~row_select, ["Endkunde_NR", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]
]

# Fix: append the ORT matches. The original appended cust_matched_plz a second
# time, so customers matched by ORT left cust_unmatched without ever reaching
# cust_matched.
cust_matched = pd.concat([cust_matched, cust_matched_ort]).drop_duplicates()
cust_unmatched = cust_container_ort.loc[
    row_select, ["Endkunde_NR", "PLZ", "GEMEINDE", "KANTON", "EK_Land"]
]

print(f"ORT, unique by {unique_by}")
report_success()

FRAKTION, unique by 1
cust_matched:   (24748, 3)
cust_unmatched: (7489, 5)
cust_current:   (32237, 5)
Matched-Perc:   0.7676893011136272
PLZ, unique by 1
cust_matched:   (24748, 3)
cust_unmatched: (7489, 5)
cust_current:   (32237, 5)
Matched-Perc:   0.7676893011136272
ORT, unique by 1
cust_matched:   (24748, 3)
cust_unmatched: (7489, 5)
cust_current:   (32237, 5)
Matched-Perc:   0.7676893011136272


In [529]:
# Spot check: five random customers that are still unmatched (unseeded, so
# the rows differ between runs).
cust_unmatched.sample(5)

Unnamed: 0,Endkunde_NR,PLZ,GEMEINDE,KANTON,EK_Land
3546,543855,8127,Forch,ZH,SCHWEIZ
4189,564917,3011,Bern,BE,SCHWEIZ
773,132389,1201,Genève,GE,SCHWEIZ
6842,641628,8200,Schaffhausen,SH,SCHWEIZ
5097,592758,9000,St. Gallen,SG,SCHWEIZ


In [530]:
# Spot check: five random customers with an assigned sales area (unseeded, so
# the rows differ between runs).
cust_matched.sample(5)

Unnamed: 0,Endkunde_NR,VERKAUFS_GEBIETS_CODE,VB_VKGEB
5593,473069,V-Z02,
13633,568620,V-Z02,
18578,572145,V-O04,REM
8301,509017,V-M05,JAN
11404,545338,V-W05,PIE


- Matching with ``Ort`` added no further matches in the output above.
- We now repeat all the previous steps with a non-unique search, i.e. for keys with ``COUNT == 2``.
- We start with ``Fraktion``.

In [391]:
# Sales areas for FRAKTION names that occur exactly twice in plz_data; the
# distinct codes of both rows are joined into one string by `collect`.
plz_2nique_fraktion = (
    plz_data.merge(fraktion_cnt, on="FRAKTION", how="left")
    .query("CNT == 2")
    .groupby("FRAKTION")
    .agg({"VERKAUFS_GEBIETS_CODE": collect, "VB_VKGEB": collect})
    .reset_index()
)

# Left-join the still-unmatched customers on their GEMEINDE name.
cust_container_2nique_fraktion = cust_unmatched.merge(
    plz_2nique_fraktion[["FRAKTION", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]],
    left_on="GEMEINDE",
    right_on="FRAKTION",
    how="left",
)

# True where the join found no sales area.
row_select = cust_container_2nique_fraktion["VERKAUFS_GEBIETS_CODE"].isna()

# NOTE(review): these matches are not added to cust_matched in this cell —
# confirm that this happens elsewhere.
cust_matched_2nique_fraktion = cust_container_2nique_fraktion.loc[
    ~row_select, ["Endkunde_NR", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]
]
cust_unmatched = cust_container_2nique_fraktion.loc[
    row_select, ["Endkunde_NR", "PLZ", "GEMEINDE", "KANTON", "EK_Land"]
]

In [392]:
# (rows, columns) of the customers matched via FRAKTION with CNT == 2.
cust_matched_2nique_fraktion.shape

(74, 3)

In [393]:
# (rows, columns) of the customers that are still unmatched.
cust_unmatched.shape

(7415, 5)

- Continue with ``PLZ``

In [395]:
# Sales areas for PLZ values that occur exactly twice in plz_data; the
# distinct codes of both rows are joined into one string by `collect`.
plz_2nique_plz = (
    plz_data.merge(plz_cnt, on="PLZ", how="left")
    .query("CNT == 2")
    .groupby("PLZ")
    .agg({"VERKAUFS_GEBIETS_CODE": collect, "VB_VKGEB": collect})
    .reset_index()
)

# Left-join the still-unmatched customers on their PLZ.
cust_container_2nique_plz = cust_unmatched.merge(
    plz_2nique_plz[["PLZ", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]],
    on="PLZ",
    how="left",
)

# True where the join found no sales area.
row_select = cust_container_2nique_plz["VERKAUFS_GEBIETS_CODE"].isna()

# NOTE(review): these matches are not added to cust_matched in this cell —
# confirm that this happens elsewhere.
cust_matched_2nique_plz = cust_container_2nique_plz.loc[
    ~row_select, ["Endkunde_NR", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]
]
cust_unmatched = cust_container_2nique_plz.loc[
    row_select, ["Endkunde_NR", "PLZ", "GEMEINDE", "KANTON", "EK_Land"]
]

In [396]:
# Customers still unmatched after the PLZ step, next to the full customer base.
print(cust_unmatched.shape)
print(cust_current.shape)

(4391, 5)
(32237, 5)


In [397]:
# Share of all customers that is still unmatched, computed from the frames
# instead of from numbers copied out of the previous output, so it stays
# correct when the data or the earlier steps change.
cust_unmatched.shape[0] / cust_current.shape[0]

0.13620994509414647

- Continue with ``Ort``, N = 2

In [398]:
# Sales areas for ORT names that occur exactly twice in plz_data; the
# distinct codes of both rows are joined into one string by `collect`.
plz_2nique_ort = (
    plz_data.merge(ort_cnt, on="ORT", how="left")
    .query("CNT == 2")
    .groupby("ORT")
    .agg({"VERKAUFS_GEBIETS_CODE": collect, "VB_VKGEB": collect})
    .reset_index()
)

# Left-join the still-unmatched customers on their GEMEINDE name.
cust_container_2nique_ort = cust_unmatched.merge(
    plz_2nique_ort[["ORT", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]],
    left_on="GEMEINDE",
    right_on="ORT",
    how="left",
)

# True where the join found no sales area.
row_select = cust_container_2nique_ort["VERKAUFS_GEBIETS_CODE"].isna()

# NOTE(review): these matches are not added to cust_matched in this cell —
# confirm that this happens elsewhere.
cust_matched_2nique_ort = cust_container_2nique_ort.loc[
    ~row_select, ["Endkunde_NR", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]
]
cust_unmatched = cust_container_2nique_ort.loc[
    row_select, ["Endkunde_NR", "PLZ", "GEMEINDE", "KANTON", "EK_Land"]
]

In [399]:
# Shape of this step's matches, shape of the customers still unmatched, and
# the share of all customers that remains unmatched.
print(cust_matched_2nique_ort.shape)
print(cust_unmatched.shape)
print(cust_unmatched.shape[0] / cust_current.shape[0])

(77, 3)
(4314, 5)
0.1338213853646431


- Repeat with ``FRAKTION`` N=3

In [400]:
# Sales areas for FRAKTION names that occur exactly three times in plz_data;
# the distinct codes of those rows are joined into one string by `collect`.
plz_3nique_fraktion = (
    plz_data.merge(fraktion_cnt, on="FRAKTION", how="left")
    .query("CNT == 3")
    .groupby("FRAKTION")
    .agg({"VERKAUFS_GEBIETS_CODE": collect, "VB_VKGEB": collect})
    .reset_index()
)

# Left-join the still-unmatched customers on their GEMEINDE name.
cust_container_3nique_fraktion = cust_unmatched.merge(
    plz_3nique_fraktion[["FRAKTION", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]],
    left_on="GEMEINDE",
    right_on="FRAKTION",
    how="left",
)

# True where the join found no sales area.
row_select = cust_container_3nique_fraktion["VERKAUFS_GEBIETS_CODE"].isna()

# NOTE(review): these matches are not added to cust_matched in this cell —
# confirm that this happens elsewhere.
cust_matched_3nique_fraktion = cust_container_3nique_fraktion.loc[
    ~row_select, ["Endkunde_NR", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]
]
cust_unmatched = cust_container_3nique_fraktion.loc[
    row_select, ["Endkunde_NR", "PLZ", "GEMEINDE", "KANTON", "EK_Land"]
]

In [401]:
# Shape of this step's matches, shape of the customers still unmatched, and
# the share of all customers that remains unmatched.
print(cust_matched_3nique_fraktion.shape)
print(cust_unmatched.shape)
print(cust_unmatched.shape[0] / cust_current.shape[0])

(0, 3)
(4314, 5)
0.1338213853646431


In [402]:
# Sales areas for PLZ values that occur exactly three times in plz_data; the
# distinct codes of those rows are joined into one string by `collect`.
plz_3nique_plz = (
    plz_data.merge(plz_cnt, on="PLZ", how="left")
    .query("CNT == 3")
    .groupby("PLZ")
    .agg({"VERKAUFS_GEBIETS_CODE": collect, "VB_VKGEB": collect})
    .reset_index()
)

# Left-join the still-unmatched customers on their PLZ.
cust_container_3nique_plz = cust_unmatched.merge(
    plz_3nique_plz[["PLZ", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]],
    on="PLZ",
    how="left",
)

# True where the join found no sales area.
row_select = cust_container_3nique_plz["VERKAUFS_GEBIETS_CODE"].isna()

# NOTE(review): these matches are not added to cust_matched in this cell —
# confirm that this happens elsewhere.
cust_matched_3nique_plz = cust_container_3nique_plz.loc[
    ~row_select, ["Endkunde_NR", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]
]
cust_unmatched = cust_container_3nique_plz.loc[
    row_select, ["Endkunde_NR", "PLZ", "GEMEINDE", "KANTON", "EK_Land"]
]

In [403]:
# Shape of this step's matches, shape of the customers still unmatched, and
# the share of all customers that remains unmatched.
print(cust_matched_3nique_plz.shape)
print(cust_unmatched.shape)
print(cust_unmatched.shape[0] / cust_current.shape[0])

(2100, 3)
(2214, 5)
0.06867884728727859


In [405]:
#cust_matched_3nique_plz

In [406]:
# Sales areas for ORT names that occur exactly three times in plz_data; the
# distinct codes of those rows are joined into one string by `collect`.
plz_3nique_ort = (
    plz_data.merge(ort_cnt, on="ORT", how="left")
    .query("CNT == 3")
    .groupby("ORT")
    .agg({"VERKAUFS_GEBIETS_CODE": collect, "VB_VKGEB": collect})
    .reset_index()
)

# Left-join the still-unmatched customers on their GEMEINDE name.
cust_container_3nique_ort = cust_unmatched.merge(
    plz_3nique_ort[["ORT", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]],
    left_on="GEMEINDE",
    right_on="ORT",
    how="left",
)

# True where the join found no sales area.
row_select = cust_container_3nique_ort["VERKAUFS_GEBIETS_CODE"].isna()

# NOTE(review): these matches are not added to cust_matched in this cell —
# confirm that this happens elsewhere.
cust_matched_3nique_ort = cust_container_3nique_ort.loc[
    ~row_select, ["Endkunde_NR", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]
]
cust_unmatched = cust_container_3nique_ort.loc[
    row_select, ["Endkunde_NR", "PLZ", "GEMEINDE", "KANTON", "EK_Land"]
]

In [408]:
# Shape of this step's matches, shape of the customers still unmatched, and
# the share of all customers that remains unmatched.
print(cust_matched_3nique_ort.shape)
print(cust_unmatched.shape)
print(cust_unmatched.shape[0] / cust_current.shape[0])

(116, 3)
(2098, 5)
0.06508049756490988


In [412]:
# Sales areas for FRAKTION names that occur exactly four times in plz_data;
# the distinct codes of those rows are joined into one string by `collect`.
plz_4nique_fraktion = (
    plz_data.merge(fraktion_cnt, on="FRAKTION", how="left")
    .query("CNT == 4")
    .groupby("FRAKTION")
    .agg({"VERKAUFS_GEBIETS_CODE": collect, "VB_VKGEB": collect})
    .reset_index()
)

# Left-join the still-unmatched customers on their GEMEINDE name.
cust_container_4nique_fraktion = cust_unmatched.merge(
    plz_4nique_fraktion[["FRAKTION", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]],
    left_on="GEMEINDE",
    right_on="FRAKTION",
    how="left",
)

# True where the join found no sales area.
row_select = cust_container_4nique_fraktion["VERKAUFS_GEBIETS_CODE"].isna()

# NOTE(review): these matches are not added to cust_matched in this cell —
# confirm that this happens elsewhere.
cust_matched_4nique_fraktion = cust_container_4nique_fraktion.loc[
    ~row_select, ["Endkunde_NR", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]
]
cust_unmatched = cust_container_4nique_fraktion.loc[
    row_select, ["Endkunde_NR", "PLZ", "GEMEINDE", "KANTON", "EK_Land"]
]

In [413]:
# Shape of this step's matches, shape of the customers still unmatched, and
# the share of all customers that remains unmatched.
print(cust_matched_4nique_fraktion.shape)
print(cust_unmatched.shape)
print(cust_unmatched.shape[0] / cust_current.shape[0])

(0, 3)
(2098, 5)
0.06508049756490988


In [414]:
# Sales areas for PLZ values that occur exactly four times in plz_data; the
# distinct codes of those rows are joined into one string by `collect`.
plz_4nique_plz = (
    plz_data.merge(plz_cnt, on="PLZ", how="left")
    .query("CNT == 4")
    .groupby("PLZ")
    .agg({"VERKAUFS_GEBIETS_CODE": collect, "VB_VKGEB": collect})
    .reset_index()
)

# Left-join the still-unmatched customers on their PLZ.
cust_container_4nique_plz = cust_unmatched.merge(
    plz_4nique_plz[["PLZ", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]],
    on="PLZ",
    how="left",
)

# True where the join found no sales area.
row_select = cust_container_4nique_plz["VERKAUFS_GEBIETS_CODE"].isna()

# NOTE(review): these matches are not added to cust_matched in this cell —
# confirm that this happens elsewhere.
cust_matched_4nique_plz = cust_container_4nique_plz.loc[
    ~row_select, ["Endkunde_NR", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]
]
cust_unmatched = cust_container_4nique_plz.loc[
    row_select, ["Endkunde_NR", "PLZ", "GEMEINDE", "KANTON", "EK_Land"]
]

In [416]:
# Shape of this step's matches, shape of the customers still unmatched, and
# the share of all customers that remains unmatched.
print(cust_matched_4nique_plz.shape)
print(cust_unmatched.shape)
print(cust_unmatched.shape[0] / cust_current.shape[0])

(925, 3)
(1173, 5)
0.036386760554642184


In [419]:
# Sales areas for ORT names that occur exactly four times in plz_data; the
# distinct codes of those rows are joined into one string by `collect`.
plz_4nique_ort = (
    plz_data.merge(ort_cnt, on="ORT", how="left")
    .query("CNT == 4")
    .groupby("ORT")
    .agg({"VERKAUFS_GEBIETS_CODE": collect, "VB_VKGEB": collect})
    .reset_index()
)

# Left-join the still-unmatched customers on their GEMEINDE name.
cust_container_4nique_ort = cust_unmatched.merge(
    plz_4nique_ort[["ORT", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]],
    left_on="GEMEINDE",
    right_on="ORT",
    how="left",
)

# True where the join found no sales area.
row_select = cust_container_4nique_ort["VERKAUFS_GEBIETS_CODE"].isna()

# NOTE(review): these matches are not added to cust_matched in this cell —
# confirm that this happens elsewhere.
cust_matched_4nique_ort = cust_container_4nique_ort.loc[
    ~row_select, ["Endkunde_NR", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]
]
cust_unmatched = cust_container_4nique_ort.loc[
    row_select, ["Endkunde_NR", "PLZ", "GEMEINDE", "KANTON", "EK_Land"]
]

In [420]:
# Shape of this step's matches, shape of the customers still unmatched, and
# the share of all customers that remains unmatched.
print(cust_matched_4nique_ort.shape)
print(cust_unmatched.shape)
print(cust_unmatched.shape[0] / cust_current.shape[0])

(24, 3)
(1149, 5)
0.03564227440518659


In [422]:
# Sales areas for FRAKTION names that occur exactly five times in plz_data;
# the distinct codes of those rows are joined into one string by `collect`.
plz_5nique_fraktion = (
    plz_data.merge(fraktion_cnt, on="FRAKTION", how="left")
    .query("CNT == 5")
    .groupby("FRAKTION")
    .agg({"VERKAUFS_GEBIETS_CODE": collect, "VB_VKGEB": collect})
    .reset_index()
)

# Left-join the still-unmatched customers on their GEMEINDE name.
cust_container_5nique_fraktion = cust_unmatched.merge(
    plz_5nique_fraktion[["FRAKTION", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]],
    left_on="GEMEINDE",
    right_on="FRAKTION",
    how="left",
)

# True where the join found no sales area.
row_select = cust_container_5nique_fraktion["VERKAUFS_GEBIETS_CODE"].isna()

# NOTE(review): these matches are not added to cust_matched in this cell —
# confirm that this happens elsewhere.
cust_matched_5nique_fraktion = cust_container_5nique_fraktion.loc[
    ~row_select, ["Endkunde_NR", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]
]
cust_unmatched = cust_container_5nique_fraktion.loc[
    row_select, ["Endkunde_NR", "PLZ", "GEMEINDE", "KANTON", "EK_Land"]
]

In [423]:
# Shape of this step's matches, shape of the customers still unmatched, and
# the share of all customers that remains unmatched.
print(cust_matched_5nique_fraktion.shape)
print(cust_unmatched.shape)
print(cust_unmatched.shape[0] / cust_current.shape[0])

(0, 3)
(1149, 5)
0.03564227440518659


In [427]:
# Sales areas for every PLZ that occurs more than four times in plz_data
# (despite the "5nique" name the filter is CNT > 4, not CNT == 5); the
# distinct codes of those rows are joined into one string by `collect`.
plz_5nique_plz = (
    plz_data.merge(plz_cnt, on="PLZ", how="left")
    .query("CNT > 4")
    .groupby("PLZ")
    .agg({"VERKAUFS_GEBIETS_CODE": collect, "VB_VKGEB": collect})
    .reset_index()
)

# Left-join the still-unmatched customers on their PLZ.
cust_container_5nique_plz = cust_unmatched.merge(
    plz_5nique_plz[["PLZ", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]],
    on="PLZ",
    how="left",
)

# True where the join found no sales area.
row_select = cust_container_5nique_plz["VERKAUFS_GEBIETS_CODE"].isna()

# NOTE(review): these matches are not added to cust_matched in this cell —
# confirm that this happens elsewhere.
cust_matched_5nique_plz = cust_container_5nique_plz.loc[
    ~row_select, ["Endkunde_NR", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]
]
cust_unmatched = cust_container_5nique_plz.loc[
    row_select, ["Endkunde_NR", "PLZ", "GEMEINDE", "KANTON", "EK_Land"]
]

In [429]:
# Shape of this step's matches, shape of the customers still unmatched, and
# the share of all customers that remains unmatched.
print(cust_matched_5nique_plz.shape)
print(cust_unmatched.shape)
print(cust_unmatched.shape[0] / cust_current.shape[0])

(431, 3)
(206, 5)
0.006390172782827186


In [467]:
# Sales areas for every ORT name in plz_data, regardless of how often it
# occurs (no CNT filter is applied in this last step); the distinct codes of
# all rows sharing a name are joined into one string by `collect`.
plz_rest_ort = (
    plz_data.merge(ort_cnt, on="ORT", how="left")
    .groupby("ORT")
    .agg({"VERKAUFS_GEBIETS_CODE": collect, "VB_VKGEB": collect})
    .reset_index()
)

# Left-join the still-unmatched customers on their GEMEINDE name.
cust_container = cust_unmatched.merge(
    plz_rest_ort[["ORT", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]],
    left_on="GEMEINDE",
    right_on="ORT",
    how="left",
)

# True where the join found no sales area.
row_select = cust_container["VERKAUFS_GEBIETS_CODE"].isna()

# NOTE(review): these matches are not added to cust_matched in this cell —
# confirm that this happens elsewhere.
cust_matched_rest_ort = cust_container.loc[
    ~row_select, ["Endkunde_NR", "VERKAUFS_GEBIETS_CODE", "VB_VKGEB"]
]
cust_unmatched = cust_container.loc[
    row_select, ["Endkunde_NR", "PLZ", "GEMEINDE", "KANTON", "EK_Land"]
]

In [470]:
# Remaining unmatched customers, ordered by PLZ for manual inspection.
cust_unmatched.sort_values("PLZ")

Unnamed: 0,Endkunde_NR,PLZ,GEMEINDE,KANTON,EK_Land
199,653534,1200,Genève 8,GE,SCHWEIZ
124,552314,1200,Genève 1,GE,SCHWEIZ
45,141486,2004,Neuchâtel 4,NE,SCHWEIZ
5,111513,2007,Neuchâtel 7,NE,SCHWEIZ
87,508720,2009,Neuchâtel 9,NE,SCHWEIZ
204,658567,3235,Erlach,BE,SCHWEIZ
179,621498,3235,Erlach,BE,SCHWEIZ
178,621314,3235,Erlach,BE,SCHWEIZ
46,146376,3235,Erlach,BE,SCHWEIZ
161,596506,6061,Sarnen 1,OW,SCHWEIZ


In [472]:
# Distinct PLZ values among the customers that are still unmatched.
set(cust_unmatched.loc[:,"PLZ"])

{1200,
 2004,
 2007,
 2009,
 3235,
 6061,
 6317,
 6906,
 6907,
 6908,
 6976,
 7133,
 7201,
 8621}

In [478]:
# Earlier ad-hoc lookups, kept for reference:
#row_select = (plz_data.loc[:,"ORT"] == "Zürich") & (plz_data.loc[:,"PLZ"] == 7001)
#row_select = (plz_data.loc[:,"PLZ"] == 7001)
# Show the plz_data rows whose PLZ belongs to a still-unmatched customer.
row_select = plz_data.loc[:,"PLZ"].isin(set(cust_unmatched.loc[:,"PLZ"]))
plz_data.loc[row_select,:]

Unnamed: 0,PLZ,FRAKTION,ORT,VERKAUFS_GEBIETS_CODE,VB_VKGEB
