# Libraries & Settings

In [80]:
%load_ext autoreload
%autoreload

import pandas as pd
import numpy as np
import qgrid
from datetime import datetime as dtt

from pa_lib.file import data_files, load_bin, store_bin, load_csv, write_xlsx, load_xlsx
from pa_lib.data import (
    calc_col_partitioned,
    clean_up_categoricals,
    flatten,
    replace_col,
    cond_col,
    desc_col,
    unfactorize,
    as_dtype,
    flatten_multi_index_cols,
)
from pa_lib.util import obj_size, cap_words
from pa_lib.log import time_log, info
from pa_lib.vis import dive

# Show long cell contents in full and render up to 200 rows / 200 columns
# before pandas starts truncating the display.
for _display_option in ("display.max_colwidth", "display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 200)


def qshow(df, fit_width=False):
    """Return an interactive qgrid widget for ``df``.

    ``fit_width`` is handed to qgrid as the ``forceFitColumns`` grid option;
    ``fullWidthRows`` is always switched off.
    """
    grid_options = dict(forceFitColumns=fit_width, fullWidthRows=False)
    return qgrid.show_grid(df, grid_options=grid_options)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [2]:
# Read the raw booking data and pass every column name through cap_words
# (presumably capitalises each "_"-separated part -- confirm in pa_lib.util).
bd_raw = load_bin("bd_data.feather").rename(
    columns=lambda name: cap_words(name, sep="_")
)

# Keep only bookings with a non-negative Netto amount, then let
# clean_up_categoricals tidy the categorical columns of the filtered frame.
has_non_negative_netto = bd_raw.Netto >= 0
bd = clean_up_categoricals(bd_raw.loc[has_non_negative_netto])

2019-08-08 14:00:11 [INFO] Reading from file C:\Users\kpf\data\bd_data.feather
  labels, = index.labels
2019-08-08 14:00:11 [INFO] Finished loading binary file in 0.18s (0.53s CPU)


In [3]:
# Per-column overview of the booking data (dtype, null counts, unique values).
desc_col(bd)

Unnamed: 0,DTYPE,NULLS,UNIQUE
Endkunde_NR,category,0/1526093,41740
Endkunde,category,0/1526093,39122
EK_Abc,category,5389/1520704,8
EK_Boni,category,2408/1523685,3
EK_Plz,category,475/1525618,3468
EK_Ort,category,474/1525619,3504
EK_Land,category,474/1525619,60
EK_HB_Apg_Kurzz,category,67234/1458859,95
EK_Aktiv,category,0/1526093,2
Agentur,category,770186/755907,4606


# Prepare Endkunden Information

In [4]:
def last_notna(s):
    """Return the last non-missing value of Series ``s``.

    The position is taken in the Series' current row order, so callers decide
    what "last" means by sorting beforehand. Returns ``np.nan`` when ``s`` is
    empty or holds only missing values.
    """
    valid = s.dropna()
    if valid.empty:
        # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0
        return np.nan
    return valid.iat[-1]


def collect(s, sep=", "):
    """Join the distinct non-missing values of ``s`` into one string.

    Values appear in order of first occurrence, are converted with ``str``
    and are separated by ``sep``. An all-missing Series yields ``""``.
    """
    distinct_values = s.dropna().unique()
    return sep.join(str(value) for value in distinct_values)


# One row per customer (Endkunde_NR): the most recent non-missing master data,
# the order-level Branchengruppe IDs (joined into a string plus their distinct
# count) and the first / last year in which a campaign was recorded.
# Rows are sorted by Kampagne_Erfassungsdatum within each customer so that
# last_notna picks the most recently recorded value.
# this takes around 150 seconds
with time_log("preparing EK_INFO"):
    ek_info = (
        bd.sort_values(["Endkunde_NR", "Kampagne_Erfassungsdatum"])
        .astype({"Endkunde_NR": "int64", "Kamp_Erfass_Jahr": "int16"})
        .groupby("Endkunde_NR")
        .agg(
            {
                "Endkunde": last_notna,
                "EK_Aktiv": last_notna,
                "EK_Land": last_notna,
                "EK_Plz": last_notna,
                "EK_Ort": last_notna,
                "Agentur": last_notna,
                "Endkunde_Branchengruppe": last_notna,
                "Endkunde_Branchengruppe_ID": last_notna,
                "Auftrag_Branchengruppe_ID": [collect, "nunique"],
                "Kamp_Erfass_Jahr": ["min", "max"],
            }
        )
    )

# Replace the two-level (column, aggregation) header produced by .agg() with
# flat names, in the order the aggregations are declared above.
# Plain assignment is used instead of set_axis(..., inplace=True): the
# `inplace` argument was deprecated in pandas 1.5 and removed in pandas 2.0.
ek_info.columns = "Endkunde EK_Aktiv EK_Land EK_Plz EK_Ort Agentur EK_BG EK_BG_ID Auftrag_BG_ID Auftrag_BG_Anz Kamp_Erfass_Jahr_min Kamp_Erfass_Jahr_max".split()

2019-08-08 14:02:46 [INFO] Finished preparing EK_INFO in 147.77s (150.3s CPU)


### How many customers started or ended in which year?

In [5]:
# Rows: year of a customer's first recorded campaign; columns: year of the last
# one. margins=True adds the "All" totals.
first_year = ek_info["Kamp_Erfass_Jahr_min"]
last_year = ek_info["Kamp_Erfass_Jahr_max"]
pd.crosstab(index=first_year, columns=last_year, margins=True)

Kamp_Erfass_Jahr_max,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,All
Kamp_Erfass_Jahr_min,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2007,0,4,0,0,1,0,1,3,0,0,2,4,15
2008,365,201,139,106,101,134,133,127,159,163,405,794,2827
2009,0,2210,662,433,345,385,324,368,338,406,812,1355,7638
2010,0,0,1693,406,233,183,189,164,171,175,358,418,3990
2011,0,0,0,1822,325,214,175,224,178,156,309,407,3810
2012,0,0,0,0,1638,326,217,191,224,159,266,311,3332
2013,0,0,0,0,0,1663,376,215,196,200,292,315,3257
2014,0,0,0,0,0,0,1685,374,229,198,353,343,3182
2015,0,0,0,0,0,0,0,1799,362,256,366,365,3148
2016,0,0,0,0,0,0,0,0,2021,441,426,418,3306


### Store and reload result

In [6]:
# Persist the customer table so the ~150 s aggregation need not be repeated.
store_bin(ek_info, "bd_cluster_ek_info.feather")

2019-08-08 14:02:55 [INFO] Writing to file C:\Users\kpf\data\bd_cluster_ek_info.feather
2019-08-08 14:02:55 [INFO] Written 5.4 MB
2019-08-08 14:02:55 [INFO] Finished storing binary file in 0.04s (0.03s CPU)


In [7]:
# Reload the stored customer table.
# NOTE(review): later cells select "Endkunde_NR" as a regular column, so the
# store/load round trip presumably turns the groupby index into a column --
# confirm against pa_lib.file.store_bin / load_bin.
ek_info = load_bin('bd_cluster_ek_info.feather')

2019-08-08 14:02:56 [INFO] Reading from file C:\Users\kpf\data\bd_cluster_ek_info.feather
2019-08-08 14:02:56 [INFO] Finished loading binary file in 0.02s (0.02s CPU)


# Distribution of Auftragsart vs. Vertrag

In [8]:
# Number of booking rows per Auftragsart / Vertrag combination and recording year.
booking_counts = pd.crosstab(
    index=[bd["Auftragsart"], bd["Vertrag"]],
    columns=bd["Kamp_Erfass_Jahr"],
    margins=True,
)
display(booking_counts)

display("Netto-Umsatz")

# Same layout, but summing Netto instead of counting rows.
netto_by_year = bd.pivot_table(
    values="Netto",
    index=["Auftragsart", "Vertrag"],
    columns="Kamp_Erfass_Jahr",
    aggfunc="sum",
    fill_value=0,
    margins=True,
)
display(netto_by_year)

Unnamed: 0_level_0,Kamp_Erfass_Jahr,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,All
Auftragsart,Vertrag,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Aushangauftrag Partner,Nein,0,90,2074,1698,1815,2337,2786,1520,33,16,19,23,14,12425
Eigenwerbung APG,Nein,0,0,294,69,124,177,41,721,1256,1380,1168,423,138,5791
Freespace,Nein,0,0,0,0,0,0,0,0,0,0,2786,4069,1704,8559
Goodwill,Nein,0,10,103,104,123,180,163,134,192,134,150,158,61,1512
Karitativ,Nein,0,200,438,695,403,704,524,726,897,1301,1354,1541,746,9529
Kommerziell,Nein,0,14150,71746,78234,84716,78514,77696,91767,108510,126063,125936,126373,74202,1057907
Logistik für Dritte,Nein,0,0,96,525,1434,1416,1292,1530,1900,1866,2045,1772,810,14686
Politisch,Nein,0,1041,3880,4385,8983,5950,5655,6170,10853,9185,6759,7973,6969,77803
Promotion,Nein,0,191,3895,2870,3123,3301,5370,2590,3584,4673,5462,7906,988,43953
Sponsoring,Nein,0,67,642,493,774,715,682,396,415,326,366,902,188,5966


'Netto-Umsatz'

Unnamed: 0_level_0,Kamp_Erfass_Jahr,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,All
Auftragsart,Vertrag,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Aushangauftrag Partner,Nein,0,30444,849600,791206,739514,1026394,1000669,439344,110412,92214,117954,150966,72198,5420915
Eigenwerbung APG,Nein,0,0,3897655,0,4670,1048532,0,360,133390,126540,1222454,72427,0,6506028
Freespace,Nein,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Goodwill,Nein,0,0,6543,7939,4953,18668,2994,4762,4191,1732,17772,16741,5925,92220
Karitativ,Nein,0,724644,1960077,1628917,1866012,2668615,2253943,2318474,2578634,3459117,2775404,3962032,2632140,28828009
Kommerziell,Nein,0,33806952,148051046,162393013,168628594,175031787,158519973,182560236,193519765,204222602,197001081,191949492,145654263,1961338804
Logistik für Dritte,Nein,0,0,0,0,4600,184578,31589,132394,479850,420927,418245,522661,239121,2433965
Politisch,Nein,0,2040430,6938949,6463858,18562173,11082670,9945225,10458567,16977360,11414298,6379246,8489070,10458705,119210551
Promotion,Nein,0,1083041,5375644,4376195,3393104,3505266,5451654,3185928,3787542,4991362,4881049,9304271,1960374,51295430
Sponsoring,Nein,0,348746,1097029,1087749,2340803,2588788,3282531,3450304,3740079,3906123,3671259,3432667,438138,29384216


# Restrict bookings to interesting customers (current & long-term)

### List of current long-time customers

* "Current" means that their last booking was not more than two years back. 
* "Long-time" means that their first booking was recorded more than two years before the start of that two-year window (first booking year < current year - 4).

In [30]:
# Earliest year that still counts as "current": two years before this year.
limit_year = pd.Timestamp.today().year - 2

# Last recorded campaign in limit_year or later.
has_recent_booking = ek_info.Kamp_Erfass_Jahr_max >= limit_year
# First recorded campaign more than two years before limit_year.
started_long_ago = ek_info.Kamp_Erfass_Jahr_min < limit_year - 2

# Customer numbers that satisfy both conditions.
ek_nr_current = ek_info.loc[has_recent_booking & started_long_ago, "Endkunde_NR"]

In [31]:
# Bookings of the selected customers only; clean_up_categoricals is applied to
# the filtered frame.
is_selected_customer = bd.Endkunde_NR.isin(ek_nr_current)
bd_current = clean_up_categoricals(bd.loc[is_selected_customer])

# Aggregate bookings per customer, year, and KW_2 / KW_4 period

Both by Reservation and Aushang.

In [32]:
def sum_calc(df, col_year, col_week):
    """Sum ``Netto`` per customer, year and period.

    Parameters
    ----------
    df : DataFrame with the columns ``Endkunde_NR``, ``Netto``, ``col_year``
        and ``col_week``.
    col_year, col_week : names of the year and period columns to group by.

    Returns
    -------
    DataFrame with the flat columns
    ``Endkunde_NR, <col_year>, <col_week>, Netto_Sum``; only key combinations
    that occur in ``df`` are present (``observed=True``).
    """
    group_cols = ["Endkunde_NR", col_year, col_week]
    sums = (
        df.loc[:, group_cols + ["Netto"]]
        # presumably converts categorical columns to plain dtypes -- see pa_lib.data
        .pipe(unfactorize)
        .groupby(group_cols, observed=True, as_index=False)
        .agg({"Netto": ["sum"]})
    )
    # .agg() with a list of functions yields a two-level header; flatten it.
    # Plain assignment replaces set_axis(..., inplace=False), whose `inplace`
    # argument was removed in pandas 2.0.
    sums.columns = group_cols + ["Netto_Sum"]
    return sums


def aggregate_bookings(df, period):
    """Aggregate ``Netto`` per customer, year and ``period``, by two date views.

    "Reservation" sums are grouped by ``Kamp_Erfass_Jahr`` /
    ``Kamp_Erfass_<period>``, "Aushang" sums by ``Kamp_Beginn_Jahr`` /
    ``Kamp_Beginn_<period>``. Both are outer-merged on customer, year and
    period, so a key present in only one view gets 0 for the other.

    Returns a DataFrame with the columns ``Endkunde_NR``, ``Jahr`` (int16),
    ``<period>`` (int8), ``Netto_Sum_Res`` and ``Netto_Sum_Aus``, sorted by
    ``Jahr``, ``Endkunde_NR``, ``<period>`` with a fresh integer index.
    """
    res_year, res_period = "Kamp_Erfass_Jahr", f"Kamp_Erfass_{period}"
    aus_year, aus_period = "Kamp_Beginn_Jahr", f"Kamp_Beginn_{period}"

    info(f"Period: {period}")
    info("Calculate Reservation...")
    by_reservation = sum_calc(df, res_year, res_period)
    info("Calculate Aushang...")
    by_aushang = sum_calc(df, aus_year, aus_period)

    info("Merge Results...")
    merged = by_reservation.merge(
        right=by_aushang,
        left_on=["Endkunde_NR", res_year, res_period],
        right_on=["Endkunde_NR", aus_year, aus_period],
        how="outer",
        suffixes=("_Res", "_Aus"),
    )
    merged = merged.rename(columns={res_year: "Jahr", res_period: period})

    # Rows that exist only on the Aushang side have no Reservation keys:
    # take year / period from the Aushang columns and set missing sums to 0.
    fill_values = {
        "Jahr": merged[aus_year],
        period: merged[aus_period],
        "Netto_Sum_Res": 0,
        "Netto_Sum_Aus": 0,
    }
    filled = merged.fillna(fill_values)

    # The Aushang key columns are redundant once the gaps are filled.
    slim = filled.drop(columns=[aus_year, aus_period])
    typed = slim.astype({"Jahr": "int16"}).astype({period: "int8"})
    ordered = typed.sort_values(["Jahr", "Endkunde_NR", period])
    return ordered.reset_index(drop=True)

In [58]:
# Aggregate the bookings of the selected customers once per period column suffix
# (KW_2 and KW_4).
bd_aggr_2w = aggregate_bookings(bd_current, 'KW_2')
bd_aggr_4w = aggregate_bookings(bd_current, 'KW_4')

2019-08-08 15:40:07 [INFO] Period: KW_2
2019-08-08 15:40:07 [INFO] Calculate Reservation...
2019-08-08 15:40:08 [INFO] Calculate Aushang...
2019-08-08 15:40:08 [INFO] Merge Results...
2019-08-08 15:40:09 [INFO] Period: KW_4
2019-08-08 15:40:09 [INFO] Calculate Reservation...
2019-08-08 15:40:09 [INFO] Calculate Aushang...
2019-08-08 15:40:09 [INFO] Merge Results...


In [59]:
# Quick look at the first rows of the KW_2 aggregation.
bd_aggr_2w.head(10)

Unnamed: 0,Endkunde_NR,Jahr,KW_2,Netto_Sum_Res,Netto_Sum_Aus
0,103092,2007,45,5310.0,0.0
1,115554,2007,25,0.0,0.0
2,116266,2007,23,5700.0,0.0
3,164877,2007,13,0.0,0.0
4,164877,2007,19,0.0,0.0
5,177225,2007,45,2365.0,0.0
6,494878,2007,47,5900.0,0.0
7,100098,2008,39,16755.0,0.0
8,100143,2008,39,15000.0,0.0
9,100143,2008,49,2560.0,0.0


### Calculate sum curve per customer * year, over periods

In [44]:
def make_year_grp_sumcurve(df, year_col, grp_col, data_col, prefix=''):
    """Add yearly share and cumulative-share columns for ``data_col``.

    Within each (``year_col``, ``grp_col``) partition four columns are added,
    each prefixed with ``prefix``:

    * ``sumJahr`` -- partition sum of ``data_col``
    * ``cumJahr`` -- cumulative sum of ``data_col``
    * ``prcJahr`` -- ``data_col`` as a percentage of ``sumJahr``
    * ``crvJahr`` -- ``cumJahr`` as a percentage of ``sumJahr`` (sum curve)

    Percentages are integers: 0.5 is added before the cast to int, and
    missing results (e.g. 0 / 0 for a zero partition sum) become 0.
    NOTE(review): the cumulative sum presumably follows the row order of
    ``df`` -- callers should pass the frame sorted by period; confirm in
    calc_col_partitioned.
    """
    total_col = f'{prefix}sumJahr'
    running_col = f'{prefix}cumJahr'

    # build new columns with sum/cumsum per year/grp
    df = calc_col_partitioned(df, total_col, fun='sum', on=data_col, part_by=[year_col, grp_col])
    df = calc_col_partitioned(df, running_col, fun='cumsum', on=data_col, part_by=[year_col, grp_col])

    # share of each row, then share of the running total, both relative to the partition sum
    derived = (
        (f'{prefix}prcJahr', data_col),
        (f'{prefix}crvJahr', running_col),
    )
    for target_col, numerator_col in derived:
        df = df.eval(f'{target_col} = ({numerator_col} / {total_col}) * 100 + 0.5')
        df = df.fillna({target_col: 0})
        df = df.astype({target_col: 'int'})
    return df

In [62]:
def _with_sum_curves(df_aggr, period_col):
    """Add the Res_* and Aus_* sum-curve columns and sort by customer, year, period.

    Runs make_year_grp_sumcurve once for ``Netto_Sum_Res`` (prefix ``Res_``)
    and once for ``Netto_Sum_Aus`` (prefix ``Aus_``), partitioned by ``Jahr``
    and ``Endkunde_NR``, then sorts by ``Endkunde_NR``, ``Jahr``,
    ``period_col`` and resets the index.
    """
    return (
        df_aggr.pipe(
            make_year_grp_sumcurve,
            year_col="Jahr",
            grp_col="Endkunde_NR",
            data_col="Netto_Sum_Res",
            prefix="Res_",
        )
        .pipe(
            make_year_grp_sumcurve,
            year_col="Jahr",
            grp_col="Endkunde_NR",
            data_col="Netto_Sum_Aus",
            prefix="Aus_",
        )
        .sort_values(["Endkunde_NR", "Jahr", period_col])
        .reset_index(drop=True)
    )


# Same pipeline for both period granularities; only the sort column differs.
bd_aggr_2w = _with_sum_curves(bd_aggr_2w, "KW_2")
bd_aggr_4w = _with_sum_curves(bd_aggr_4w, "KW_4")

### Store and reload results

In [67]:
# Persist both aggregations (with their sum-curve columns) for later sessions.
store_bin(bd_aggr_2w, 'bd_cluster_aggr_2w.feather')
store_bin(bd_aggr_4w, 'bd_cluster_aggr_4w.feather')

2019-08-08 16:12:20 [INFO] Writing to file C:\Users\kpf\data\bd_cluster_aggr_2w.feather
2019-08-08 16:12:21 [INFO] Written 19.1 MB
2019-08-08 16:12:21 [INFO] Finished storing binary file in 0.02s (0.11s CPU)
2019-08-08 16:12:21 [INFO] Writing to file C:\Users\kpf\data\bd_cluster_aggr_4w.feather
2019-08-08 16:12:21 [INFO] Written 15.9 MB
2019-08-08 16:12:21 [INFO] Finished storing binary file in 0.02s (0.19s CPU)


In [72]:
# Reload the stored aggregations; the clustering below can start from here.
bd_aggr_2w = load_bin('bd_cluster_aggr_2w.feather')
bd_aggr_4w = load_bin('bd_cluster_aggr_4w.feather')

2019-08-08 16:18:24 [INFO] Reading from file C:\Users\kpf\data\bd_cluster_aggr_2w.feather
2019-08-08 16:18:24 [INFO] Finished loading binary file in 0.01s (0.0s CPU)
2019-08-08 16:18:24 [INFO] Reading from file C:\Users\kpf\data\bd_cluster_aggr_4w.feather
2019-08-08 16:18:24 [INFO] Finished loading binary file in 0.01s (0.16s CPU)


# Finally, some clustering

### Drop incomplete years

We keep the ten most recent complete years; the current year is still incomplete and is therefore excluded.

In [92]:
# The ten calendar years before the current one (the current year is excluded).
current_year = pd.Timestamp.today().year
valid_years = list(range(current_year - 10, current_year))
valid_years

[2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]

### Pivot prc values by period

In [94]:
# One row per (customer, year), one column per KW_2 period, holding the
# Reservation percentage of that period; combinations without a row become 0.
in_valid_years = bd_aggr_2w.Jahr.isin(valid_years)
bd_res_prc_2w_data = bd_aggr_2w.loc[in_valid_years].pivot_table(
    values="Res_prcJahr",
    index=["Endkunde_NR", "Jahr"],
    columns="KW_2",
    aggfunc="sum",
    fill_value=0,
)

bd_res_prc_2w_data.head(12)

Unnamed: 0_level_0,KW_2,1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51
Endkunde_NR,Jahr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
100034,2009,0,0,0,33,2,0,0,0,6,0,0,0,0,0,0,0,0,0,0,14,0,6,40,0,0,0
100034,2010,0,0,0,0,31,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,57,0,0,0,0
100034,2011,0,0,0,0,40,0,0,0,0,0,0,0,0,0,0,14,0,0,0,4,2,41,0,0,0,0
100034,2012,0,0,0,0,29,0,0,0,0,0,0,0,0,0,8,9,0,0,0,0,0,51,2,0,0,0
100034,2013,0,0,52,2,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,32,0,0,0,0
100034,2014,0,40,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,58,0,0,0,0
100034,2015,0,0,44,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,51,5,0,0,0
100034,2016,0,57,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,39,4,0,0,0,0
100034,2017,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26,47,0,0,0,0
100034,2018,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,74,0,0,0,0


### Aggregate over years by customer

In [175]:
# Average the yearly percentage profiles per customer. Only (customer, year)
# rows present in the pivot enter the mean, i.e. years without any row for a
# customer are not counted as zero-years.
ek_2w_prc_mean = (
    bd_res_prc_2w_data.reset_index()
    .drop("Jahr", axis="columns")
    .groupby("Endkunde_NR")
    .agg("mean")
)

# Long format: one row per (customer, period) with the mean percentage.
# Plain column assignment replaces set_axis(..., inplace=False), whose
# `inplace` argument was removed in pandas 2.0.
ek_2w_prc_mean_stack = ek_2w_prc_mean.stack().reset_index()
ek_2w_prc_mean_stack.columns = ["Endkunde_NR", "KW_2", "prc_mean"]

display(ek_2w_prc_mean.head(10))
display(ek_2w_prc_mean_stack.head(28))

KW_2,1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51
Endkunde_NR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
100034,0.0,9.7,15.2,3.5,10.2,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,2.0,3.7,0.0,0.0,0.0,1.8,6.7,42.1,4.7,0.0,0.0,0.0
100039,2.3,5.3,0.0,0.0,4.2,8.8,11.2,8.5,5.5,0.0,0.0,3.1,0.0,0.0,0.0,8.9,0.0,16.8,0.0,0.0,0.0,0.0,12.7,2.6,2.2,8.0
100064,0.0,0.0,0.0,0.0,11.111111,0.0,83.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0
100083,4.111111,0.0,0.0,11.111111,0.0,0.222222,0.0,0.0,0.0,0.0,0.0,0.0,11.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.666667
100095,0.0,0.0,1.8,22.3,9.6,2.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.9,31.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.571429,0.0,5.142857,0.0,2.285714,8.285714,0.0,0.0,18.571429,5.428571,2.142857,10.571429,14.285714,0.0
100098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
100104,0.0,0.0,0.0,0.0,2.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,80.0,3.7,3.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100109,0.0,0.0,4.9,0.0,2.6,13.5,0.0,0.0,65.1,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,Endkunde_NR,KW_2,prc_mean
0,100034,1,0.0
1,100034,3,9.7
2,100034,5,15.2
3,100034,7,3.5
4,100034,9,10.2
5,100034,11,0.0
6,100034,13,0.0
7,100034,15,0.0
8,100034,17,0.6
9,100034,19,0.0


### Several clustering methods

Prepare data

In [176]:
# Feature matrix: one row per customer, one column per KW_2 period.
# Only columns that also exist in the pivot table are used, so label columns
# attached to ek_2w_prc_mean afterwards (e.g. the cluster assignment) cannot
# leak into the features when this cell is re-run.
X_columns = ek_2w_prc_mean.columns[
    ek_2w_prc_mean.columns.isin(bd_res_prc_2w_data.columns)
]

X = ek_2w_prc_mean[X_columns].to_numpy()

#### OPTICS

In [128]:
from sklearn.cluster import OPTICS

# OPTICS clustering of the customer profiles in X.
# NOTE(review): the float min_cluster_size is presumably read as a fraction of
# the number of samples -- confirm in the scikit-learn docs.
clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05, n_jobs=-1)

# NOTE(review): no output is stored for this cell and `clust` is not used in
# the visible part of the notebook -- confirm whether this fit was completed.
clust.fit(X)

#### MiniBatchKMeans

In [190]:
from sklearn.cluster import MiniBatchKMeans

# Mini-batch k-means with 20 clusters; random_state is fixed so repeated runs
# use the same seed. verbose=True prints the per-init and per-iteration
# inertia log.
kmeans = MiniBatchKMeans(n_clusters=20,
                         random_state=0,
                         batch_size=100,
                         max_iter=10,
                         reassignment_ratio=0.5,
                         n_init=10,
                         verbose=True)

# Fit on X and keep one cluster label per row of X (i.e. per customer).
kminibatch_labels = kmeans.fit_predict(X)

Init 1/10 with method: k-means++
Inertia for init 1/10: 285694.909399
Init 2/10 with method: k-means++
Inertia for init 2/10: 284596.898128
Init 3/10 with method: k-means++
Inertia for init 3/10: 297419.144985
Init 4/10 with method: k-means++
Inertia for init 4/10: 291112.433895
Init 5/10 with method: k-means++
Inertia for init 5/10: 310689.948625
Init 6/10 with method: k-means++
Inertia for init 6/10: 282953.336385
Init 7/10 with method: k-means++
Inertia for init 7/10: 299501.674071
Init 8/10 with method: k-means++
Inertia for init 8/10: 275468.003470
Init 9/10 with method: k-means++
Inertia for init 9/10: 282128.239356
Init 10/10 with method: k-means++
Inertia for init 10/10: 301448.636461
Minibatch iteration 1/820: mean batch inertia: 984.753505, ewa inertia: 984.753505 
Minibatch iteration 2/820: mean batch inertia: 1106.335547, ewa inertia: 987.719282 
Minibatch iteration 3/820: mean batch inertia: 1039.662907, ewa inertia: 988.986354 
Minibatch iteration 4/820: mean batch inerti

In [191]:
# Cluster centres as a table: one row per cluster, one column per KW_2 period.
pd.DataFrame(data=kmeans.cluster_centers_, columns=X_columns)

KW_2,1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51
0,1.299648,1.500277,1.524791,0.737129,1.669305,0.553064,1.179083,0.49745,1.361388,0.529503,0.850568,0.348839,0.425273,0.524004,1.139101,0.478475,0.402966,0.87484,0.379478,0.452711,1.422033,1.374092,2.433153,48.077992,16.327478,1.200874
1,24.347924,3.598095,3.551669,2.456705,2.909829,2.144973,1.552277,1.165211,1.333323,0.780752,1.357147,0.741281,1.119521,0.803149,1.054837,0.944392,1.22676,0.643688,1.23483,1.4438,1.639495,1.986628,0.991508,1.96453,2.747023,16.303574
2,1.255457,3.548057,2.498286,3.596122,2.727108,2.769637,2.358115,2.153873,1.761761,1.030073,1.838298,2.467986,1.902853,2.143736,1.651459,1.541966,2.27766,0.813715,1.311392,1.982884,2.575928,2.446475,0.867092,2.281272,3.077749,1.720683
3,0.241724,0.161411,0.399816,0.333284,0.149193,0.071369,0.400864,0.143292,0.312586,0.325489,0.158645,0.275934,0.573029,0.554772,0.38059,0.703366,0.566271,0.957054,0.615155,84.256832,1.360558,0.642857,1.381038,1.124778,0.488266,0.775953
4,1.012559,0.367101,1.880602,1.470648,1.73324,0.553447,1.188767,0.970196,1.12825,0.935435,1.759695,2.923042,35.169784,24.926203,3.449686,2.005364,1.614082,1.196989,0.905145,1.050101,0.414246,0.503259,0.688729,0.529376,1.118929,1.043643
5,1.130247,0.546306,0.913809,1.679858,1.59936,1.779058,1.746861,1.772363,66.13884,3.069727,1.24409,1.078862,0.716592,1.306127,0.786968,2.009524,0.585375,1.488941,0.363957,1.583185,0.566961,0.477,0.897224,0.460905,0.64864,1.831413
6,1.17901,3.32067,64.803629,1.989981,4.080699,1.772537,1.509723,0.435071,1.004633,0.354614,0.869798,0.358402,1.421291,0.572919,0.369303,0.390718,0.546387,2.548343,0.13845,0.953592,0.280452,1.75177,0.519209,1.021111,0.493202,0.489755
7,0.841304,1.374639,1.856664,2.125184,5.714759,3.124689,5.500556,36.157869,4.336269,2.24905,2.078362,2.517193,2.073742,1.455027,1.770925,1.076609,0.954809,0.920189,1.06667,0.836497,3.626093,1.464987,1.11575,1.196383,1.198084,1.008095
8,0.706283,0.319954,1.715815,0.724275,1.157569,0.68204,1.891598,0.604674,1.370838,1.867857,51.632969,11.013509,3.065554,3.502334,1.08423,1.225849,0.484413,1.409025,0.338165,1.506086,0.645224,0.994698,0.653771,0.913053,0.387236,0.617878
9,1.063642,1.34531,2.616974,2.067458,3.106118,2.614763,5.173224,8.594926,9.577555,16.866616,5.966816,6.917127,3.294477,2.471077,2.165178,2.345403,1.545928,0.939723,1.286799,1.038089,1.864386,1.601608,1.465631,1.604554,1.006641,0.770124


In [192]:
# Attach each customer's MiniBatchKMeans label as a new column
# (this modifies ek_2w_prc_mean in place).
ek_2w_prc_mean['kmMiniBatch'] = kminibatch_labels

In [193]:
# Number of customers per MiniBatchKMeans cluster, largest first.
ek_2w_prc_mean.kmMiniBatch.value_counts()

2     2401
9      518
16     515
17     514
13     466
7      414
1      341
18     327
11     285
4      262
6      241
14     228
0      225
8      221
19     218
15     215
12     213
10     200
5      198
3      196
Name: kmMiniBatch, dtype: int64

#### Normal KMeans

In [180]:
from sklearn.cluster import KMeans

# Standard k-means with 10 clusters (the MiniBatchKMeans run above uses 20)
# and a fixed seed.
# NOTE(review): KMeans' n_jobs argument was deprecated in scikit-learn 0.23
# and removed in 1.0 -- it has to be dropped when upgrading scikit-learn.
nkmeans = KMeans(n_clusters=10, random_state=0, verbose=1, n_jobs=-1)

# Fit on X and keep one cluster label per row of X (i.e. per customer).
nkmeans_labels = nkmeans.fit_predict(X)

In [181]:
# Cluster centres as a table: one row per cluster, one column per KW_2 period.
pd.DataFrame(data=nkmeans.cluster_centers_, columns=X_columns)

KW_2,1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51
0,1.384876,1.383198,1.909419,1.220893,1.048416,0.697487,1.454145,1.521305,2.075336,0.84124,1.460403,1.427292,1.068835,1.241309,0.807523,0.851708,0.763334,1.880378,1.309339,2.637939,4.288203,44.695057,4.086978,3.193244,2.305337,1.302549
1,1.464954,2.503118,62.86959,2.001959,2.777056,2.004229,1.36375,0.622658,1.110566,0.79392,1.458455,0.411078,1.473202,0.632896,0.579413,0.692267,0.426733,2.397439,0.457502,1.238642,0.751824,1.301764,0.741247,1.173396,0.733522,0.650734
2,0.980375,0.609724,1.483448,0.867405,1.009632,0.82013,1.194321,0.512208,0.620803,0.964725,2.058188,0.547139,1.491535,1.029451,1.78731,1.296997,1.644412,67.978245,1.886246,2.081067,1.909811,1.321607,0.973104,1.408983,0.619403,0.5349
3,1.620127,1.826232,1.648266,1.352334,2.470911,1.180459,1.806552,4.120407,1.977082,1.349445,1.079727,1.318649,1.038273,0.945777,1.085814,1.077583,1.074137,1.751651,2.07835,2.405289,21.954118,4.199449,4.779056,13.428791,5.725371,1.624611
4,0.946585,0.425738,0.875149,1.215136,1.687991,1.523496,1.700225,2.746138,66.789568,3.349725,1.866365,1.494402,0.69921,0.902017,0.419756,1.15818,0.52283,1.442664,0.660325,1.628539,0.561432,0.639133,0.671587,0.643957,0.552534,1.306504
5,1.186253,1.659172,2.851643,3.506391,53.853446,2.132235,2.925212,2.309148,2.318989,1.149741,1.616734,0.898254,1.789481,1.308294,1.291515,1.400757,1.143922,1.132247,1.209326,1.061299,0.86488,1.11792,1.10364,0.95979,1.085331,0.915721
6,2.970771,2.598919,2.604416,2.954397,2.717911,2.526163,2.608002,4.815297,2.841375,2.885546,4.727349,3.231199,3.973848,3.463475,2.564056,1.551736,2.086733,2.414491,2.455939,2.651452,1.553047,2.015163,2.612053,1.771356,2.351526,2.569045
7,1.07626,1.033635,2.355411,1.474445,3.078818,2.141638,55.570071,3.77062,3.002002,1.378108,1.97812,0.974083,1.151527,1.815583,1.046749,1.264744,0.945201,1.272093,0.729006,1.284228,0.88195,0.931196,0.872264,0.75564,0.334751,0.878791
8,1.186115,1.241962,1.341059,1.70588,1.572935,0.785121,1.375763,1.796886,1.777078,1.615735,1.531878,1.613199,1.987374,2.12849,4.87619,46.964018,2.526923,3.046869,1.948509,2.300875,1.320239,1.72613,1.082594,0.95304,0.966661,0.873928
9,0.824082,0.472524,1.017689,0.575726,1.224581,0.639794,1.264732,0.756655,0.943427,0.38252,0.586236,0.488357,0.796719,0.950066,0.491638,0.914101,0.610198,1.753848,2.099156,69.277467,2.651253,1.673523,1.881123,1.59038,0.575253,0.913796


In [183]:
# Number of customers per KMeans cluster, largest first.
pd.Series(nkmeans_labels).value_counts()

6    4824
3    1086
0     374
9     323
8     301
5     285
2     271
1     265
7     264
4     205
dtype: int64