# Libraries & Settings

In [21]:
%load_ext autoreload
%autoreload

import pandas as pd
import numpy as np
import qgrid
#import beakerx as bx
from datetime import datetime as dtt

from pa_lib.file import data_files, load_bin, store_bin, store_excel
from pa_lib.data import (calc_col_partitioned, clean_up_categoricals, flatten, 
                         replace_col, cond_col, desc_col, unfactorize)
from pa_lib.util import obj_size, cap_words
from pa_lib.log  import time_log, info
from pa_lib.types import dtFactor
from pa_lib.vis import dive

# Show long cell contents in full (up to 200 characters) and up to 200 rows per frame
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 200

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [22]:
# Read the raw data and rename every column with cap_words(name, sep='_')
bd_raw = load_bin('bd_data.feather').rename(
    columns=lambda name: cap_words(name, sep='_')
)
# Keep only rows whose Netto is not negative, then pass them through clean_up_categoricals
bd = clean_up_categoricals(bd_raw.loc[bd_raw['Netto'] >= 0])

2019-08-07 12:31:32 [INFO] Reading from file /home/pa/data/bd_data.feather
  labels, = index.labels
2019-08-07 12:31:32 [INFO] Finished loading binary file in 0.21s (0.62s CPU)


In [23]:
# Column overview of bd; the rendered table lists dtype, null counts and unique counts per column
desc_col(bd)

Unnamed: 0,DTYPE,NULLS,UNIQUE
Endkunde_NR,category,0/1537638,41944
Endkunde,category,0/1537638,39314
EK_Abc,category,5386/1532252,8
EK_Boni,category,2408/1535230,3
EK_Plz,category,477/1537161,3474
EK_Ort,category,476/1537162,3509
EK_Land,category,476/1537162,60
EK_HB_Apg_Kurzz,category,67769/1469869,99
EK_Aktiv,category,0/1537638,2
Agentur,category,775808/761830,4638


# Prepare Endkunden Information

In [24]:
def last_notna(s):
    """Return the last non-null value of a Series, in its current row order.

    Parameters
    ----------
    s : pandas.Series
        Values to inspect; may be empty or contain only nulls.

    Returns
    -------
    scalar
        The final element of ``s`` that is not null, or ``np.nan`` when
        ``s`` has no non-null element.
    """
    valid = s.dropna()
    if valid.empty:
        # np.nan instead of the np.NaN alias, which was removed in NumPy 2.0
        return np.nan
    return valid.iat[-1]


def collect(s, sep=", "):
    """Join the distinct non-null values of ``s`` into one string.

    Each distinct value is converted with ``str`` and the pieces are
    concatenated with ``sep`` between them. A Series without any
    non-null value yields the empty string.
    """
    distinct_values = s.dropna().unique()
    return sep.join(str(value) for value in distinct_values)


# Collapse bd to one row per customer number (Endkunde_NR).
# Rows are sorted by Kampagne_Erfassungsdatum within each customer first, so
# last_notna returns the non-null value from the latest-dated row.
# This is slow: the logged run took 268 s.
with time_log("preparing EK_INFO"):
    ek_info = (
        bd.sort_values(["Endkunde_NR", "Kampagne_Erfassungsdatum"])
        .astype({"Endkunde_NR": "int64", "Kamp_Erfass_Jahr": "int16"})
        .groupby("Endkunde_NR")
        .agg(
            {
                "Endkunde": last_notna,
                "EK_Aktiv": last_notna,
                "EK_Land": last_notna,
                "EK_Plz": last_notna,
                "EK_Ort": last_notna,
                "Agentur": last_notna,
                "Endkunde_Branchengruppe": last_notna,
                "Endkunde_Branchengruppe_ID": last_notna,
                # two aggregations -> two output columns (list of IDs, count of IDs)
                "Auftrag_Branchengruppe_ID": [collect, "nunique"],
                # first and last year in which the customer appears
                "Kamp_Erfass_Jahr": ["min", "max"],
            }
        )
    )

# Give the aggregated columns flat, short names (positional, in the order of
# the aggregations above). Plain assignment is used instead of
# set_axis(..., inplace=True) because the `inplace` argument was removed from
# DataFrame.set_axis in pandas 2.0; a length mismatch still raises ValueError.
ek_info.columns = [
    "Endkunde",
    "EK_Aktiv",
    "EK_Land",
    "EK_Plz",
    "EK_Ort",
    "Agentur",
    "EK_BG",
    "EK_BG_ID",
    "Auftrag_BG_ID",
    "Auftrag_BG_Anz",
    "Kamp_Erfass_Jahr_min",
    "Kamp_Erfass_Jahr_max",
]

2019-08-07 12:36:14 [INFO] Finished preparing EK_INFO in 268.3s (272.13s CPU)


In [34]:
# Persist the per-customer table ek_info as a binary file named bd_cluster_ek_info.feather
store_bin(ek_info, 'bd_cluster_ek_info.feather')

2019-08-07 15:22:48 [INFO] Writing to file /home/pa/data/bd_cluster_ek_info.feather
2019-08-07 15:22:48 [INFO] Written 5.4 MB
2019-08-07 15:22:48 [INFO] Finished storing binary file in 0.06s (0.07s CPU)


### How many customers started or ended in which year?

In [32]:
# Customers counted by first year (rows) against last year (columns), with totals
pd.crosstab(
    index=ek_info["Kamp_Erfass_Jahr_min"],
    columns=ek_info["Kamp_Erfass_Jahr_max"],
    margins=True,
)

Kamp_Erfass_Jahr_max,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,All
Kamp_Erfass_Jahr_min,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2007,0,4,0,0,1,0,1,3,0,0,2,4,15
2008,365,201,139,106,101,134,133,127,157,160,375,829,2827
2009,0,2208,662,433,345,385,322,364,336,399,743,1441,7638
2010,0,0,1692,406,233,182,188,161,168,170,336,454,3990
2011,0,0,0,1816,325,214,174,223,177,153,283,445,3810
2012,0,0,0,0,1636,325,217,190,224,157,255,328,3332
2013,0,0,0,0,0,1662,376,213,196,199,274,337,3257
2014,0,0,0,0,0,0,1684,371,227,198,318,384,3182
2015,0,0,0,0,0,0,0,1795,360,254,337,402,3148
2016,0,0,0,0,0,0,0,0,2017,438,402,449,3306


### Distribution of Auftragsart vs. Vertrag

In [33]:
# Row counts per Auftragsart/Vertrag combination and Kamp_Erfass_Jahr, with totals
display(
    pd.crosstab(
        index=[bd["Auftragsart"], bd["Vertrag"]],
        columns=bd["Kamp_Erfass_Jahr"],
        margins=True,
    )
)
display('Netto-Umsatz')
# Same breakdown, but summing Netto instead of counting rows
display(
    bd.pivot_table(
        values='Netto',
        index=['Auftragsart', 'Vertrag'],
        columns='Kamp_Erfass_Jahr',
        aggfunc='sum',
        fill_value=0,
        margins=True,
    )
)

Unnamed: 0_level_0,Kamp_Erfass_Jahr,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,All
Auftragsart,Vertrag,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Aushangauftrag Partner,Nein,0,90,2074,1698,1815,2337,2786,1520,33,16,19,23,15,12426
Eigenwerbung APG,Nein,0,0,294,69,124,177,41,721,1256,1380,1168,423,168,5821
Freespace,Nein,0,0,0,0,0,0,0,0,0,0,2786,4069,1885,8740
Goodwill,Nein,0,10,103,104,123,180,163,134,192,134,150,158,71,1522
Karitativ,Nein,0,200,438,695,403,704,524,726,897,1301,1354,1541,823,9606
Kommerziell,Nein,0,14150,71746,78234,84716,78514,77696,91767,108510,126063,125936,126379,82562,1066273
Logistik für Dritte,Nein,0,0,96,525,1434,1416,1292,1530,1900,1866,2045,1772,962,14838
Politisch,Nein,0,1041,3880,4385,8983,5950,5655,6170,10853,9185,6759,7973,7746,78580
Promotion,Nein,0,191,3895,2870,3123,3301,5370,2590,3584,4673,5462,7906,1175,44140
Sponsoring,Nein,0,67,642,493,774,715,682,396,415,326,366,902,203,5981


'Netto-Umsatz'

Unnamed: 0_level_0,Kamp_Erfass_Jahr,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,All
Auftragsart,Vertrag,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Aushangauftrag Partner,Nein,0,30444,849600,791206,739514,1026394,1000669,439344,110412,92214,117954,150966,81126,5429843
Eigenwerbung APG,Nein,0,0,3897655,0,4670,1048532,0,360,133390,126540,1222454,72427,0,6506028
Freespace,Nein,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Goodwill,Nein,0,0,6543,7939,4953,18668,2994,4762,4191,1732,17772,16741,9174,95469
Karitativ,Nein,0,724644,1960077,1628917,1866012,2668615,2253943,2318474,2578634,3459117,2775404,3962032,2657962,28853831
Kommerziell,Nein,0,33806952,148051046,162393013,168628594,175031787,158519973,182560236,193519765,204222602,197001081,191750167,165496589,1980981805
Logistik für Dritte,Nein,0,0,0,0,4600,184578,31589,132394,479850,420927,418245,522661,266499,2461343
Politisch,Nein,0,2040430,6938949,6463858,18562173,11082670,9945225,10458567,16977360,11414298,6379246,8489070,11226187,119978033
Promotion,Nein,0,1083041,5375644,4376195,3393104,3505266,5451654,3185928,3787542,4991362,4881049,9304271,2263649,51598705
Sponsoring,Nein,0,348746,1097029,1087749,2340803,2588788,3282531,3450304,3740079,3906123,3671259,3432667,704138,29650216
