In [None]:
3+4

# Load modules

In [None]:
# make imports from pa_lib possible (parent directory of file's directory)
import sys
from pathlib import Path

file_dir = Path.cwd()
parent_dir = file_dir.parent
sys.path.append(str(parent_dir))

%load_ext autoreload
%autoreload

import pandas as pd
import numpy as np

from collections import namedtuple

from pa_lib.file import (
    store_bin,
    project_dir,
    data_files,
    load_csv,
    load_bin,
    write_xlsx,
    load_txt
)

from pa_lib.data import (
    as_dtype,
    dtFactor,
    desc_col,
    lookup,
    clean_up_categoricals
)

from pa_lib.util import (
    collect,
    value,
    flatten,
    normalize_rows,
    list_items
)

from pa_lib.log import time_log
from pa_lib.types import Record

# display long columns completely, show more rows
pd.set_option("display.max_colwidth", 300)
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)

In [None]:
file_dir

In [None]:
from pa_lib.util import iso_to_datetime

In [None]:
from pa_lib.data  import calc_col_partitioned

import seaborn as sns
import matplotlib.pyplot as plt

sns.set()

# Load data files

In [None]:
# Load the three MediaFocus exports. "Jahr" is cast to int64 for the two files that carry it.
jahr_dtype = {"Jahr": "int64"}

with project_dir("MediaFocus"):
    # Show which feather files exist in the project folder
    display(data_files("*.feather"))

    # Branchen per calendar week (KW) over the last 10 years
    br_kw = load_bin("branchen_kw_10y.feather").astype(jahr_dtype)

    # Branchen and Produktgruppen per calendar week over the last 10 years
    br_pg_kw = load_bin("branchen_pg_kw_10y.feather").astype(jahr_dtype)

    # Branchen and Werbungtreibende (similar to Endkunde); this file has NO year column
    br_wbt_kw = load_bin("branchen_wbt_kw.feather")

In [None]:
# Quick overview of the three loaded frames: name, shape and five random rows each.
separator = 80 * "#"

for frame_name, frame in (
    ("br_kw", br_kw),
    ("br_pg_kw", br_pg_kw),
    ("br_wbt_kw", br_wbt_kw),
):
    print(separator + "\n")
    print(frame_name)
    # Fix: the original called display() without an argument for br_wbt_kw,
    # so that frame's shape was never shown.
    display(frame.shape)
    display(frame.sample(5))

print(separator)

# Data Prep

In [None]:
## Yearly Brutto: sum of all calendar-week columns of a row
# Fix: pick the week columns by their "KW_" prefix instead of by position.
# `list(br_kw.columns)[2:]` also picks up "Brutto_Jahr" / "Brutto_Jahr_Relativ"
# once this cell has run, so re-running it inflated the sum.
kw_columns = [col for col in br_kw.columns if col.startswith("KW_")]
br_kw.loc[:, "Brutto_Jahr"] = br_kw.loc[:, kw_columns].sum(axis="columns")

## Relative yearly Brutto: Brutto_Jahr divided by its maximum
# presumably `fun` is applied separately to each "Branche" partition —
# confirm against pa_lib.data.calc_col_partitioned
br_kw = calc_col_partitioned(
    df=br_kw,
    col="Brutto_Jahr_Relativ",  # name of the new column
    fun=lambda s: (s / s.max()),
    on="Brutto_Jahr",
    part_by="Branche",
)

## Spot-check the new columns
display(
    br_kw
        .loc[:, ["Jahr", "Branche", "Brutto_Jahr", "Brutto_Jahr_Relativ"]]
        .sample(5)
)


## Melting `br_kw`

In [None]:
################################################################################
## Reshape br_kw from wide (one column per calendar week) to long (one row per week)

week_columns = [name for name in br_kw.columns if name.startswith("KW_")]

long_df = pd.melt(
    br_kw,
    id_vars=["Branche", "Jahr"],
    value_vars=week_columns,
    var_name="KW",
    value_name="Brutto",
)

################################################################################
## Datum column, built from the year and the week number

# Characters 3..4 of a label such as "KW_07" hold the week number
week_numbers = long_df.loc[:, "KW"].map(lambda label: int(label[3:5]))

# NOTE(review): every year carries a KW_53 column; how iso_to_datetime treats
# week 53 in a 52-week year is not visible here — confirm.
long_df.loc[:, "Datum"] = long_df.loc[:, "Jahr"].combine(
    week_numbers,
    func=lambda year, kw: iso_to_datetime(year=year, kw=kw, day=1),
)

br_kw_datum = long_df.sort_values(["Branche", "Datum"]).pipe(clean_up_categoricals)

################################################################################
## Numeric time axis for lmplot: year + week/53 (a float, not an integer)

week_fraction = br_kw_datum.KW.map(lambda label: int(label[3:5])) / 53
br_kw_datum.loc[:, "Datum_Relativ"] = br_kw_datum.Jahr + week_fraction

################################################################################
## Brutto divided by its maximum, partitioned by Branche

br_kw_datum = calc_col_partitioned(
    df=br_kw_datum,
    col="Brutto_Relativ",
    fun=lambda values: (values / values.max()),
    on="Brutto",
    part_by="Branche",
)

################################################################################
## Show the first rows of the result

display(br_kw_datum.head())

## Melting `br_pg_kw`

In [None]:
################################################################################
## Reshape br_pg_kw from wide (one column per calendar week) to long (one row per week)

week_columns = [name for name in br_pg_kw.columns if name.startswith("KW_")]

long_df = pd.melt(
    br_pg_kw,
    id_vars=["Branche", "Produktgruppe", "Jahr"],
    value_vars=week_columns,
    var_name="KW",
    value_name="Brutto",
)

################################################################################
## Datum column, built from the year and the week number

# Characters 3..4 of a label such as "KW_07" hold the week number
week_numbers = long_df.loc[:, "KW"].map(lambda label: int(label[3:5]))

# NOTE(review): every year carries a KW_53 column; how iso_to_datetime treats
# week 53 in a 52-week year is not visible here — confirm.
long_df.loc[:, "Datum"] = long_df.loc[:, "Jahr"].combine(
    week_numbers,
    func=lambda year, kw: iso_to_datetime(year=year, kw=kw, day=1),
)

br_pg_kw_datum = (
    long_df
        .sort_values(["Branche", "Produktgruppe", "Datum"])
        .pipe(clean_up_categoricals)
)

################################################################################
## Numeric time axis for lmplot: year + week/53 (a float, not an integer)

week_fraction = br_pg_kw_datum.KW.map(lambda label: int(label[3:5])) / 53
br_pg_kw_datum.loc[:, "Datum_Relativ"] = br_pg_kw_datum.Jahr + week_fraction

################################################################################
## Brutto divided by its maximum, partitioned by Branche + Produktgruppe

br_pg_kw_datum = calc_col_partitioned(
    df=br_pg_kw_datum,
    col="Brutto_Relativ",
    fun=lambda values: (values / values.max()),
    on="Brutto",
    part_by=["Branche", "Produktgruppe"],
)

################################################################################
## Show the first rows of the result

display(br_pg_kw_datum.head())

# Find patterns

## `br_kw`

In [None]:
branchen_liste = list(set(br_kw.loc[:,"Branche"]))
#branchen_liste

In [None]:
## Relative yearly Brutto per Branche, one facet each, with a linear regression fit.
## 2019 is left out because that year is not complete yet.

complete_years = (br_kw.loc[:, "Jahr"] < 2019)

g = sns.lmplot(
    x        = "Jahr",
    y        = "Brutto_Jahr_Relativ",
    col      = "Branche",
    hue      = "Branche",  # redundant with `col`, only colours the facets
    data     = br_kw.loc[complete_years, :],
    col_wrap = 3,          # facets per row
    height   = 5,
    order    = 1,          # polynomial order of the regression
).set(xlim=(2008.5, 2019), ylim=(-0.1, 1.1))

for ax in g.axes.flat:
    for label in ax.get_xticklabels():
        label.set_rotation(45)

# Fix: create the target folder first; savefig raises FileNotFoundError
# when plots/br_lmplot does not exist yet (e.g. on a fresh checkout).
plot_dir = file_dir / 'plots' / 'br_lmplot'
plot_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(plot_dir / 'lmplot_branche_jahr.png')

## Cleanup
del complete_years


***
> 1. Not enough data points. Granularity is too low.
***

## `br_kw_datum`

In [None]:
# Weekly relative Brutto per Branche with a LOWESS smoother, one facet per Branche.
# Only weeks before calendar week 30 of 2019 are plotted.
before_this_date = (br_kw_datum.Datum < iso_to_datetime(year=2019, kw=30, day=1))

g = sns.lmplot(
    x        = "Datum_Relativ",
    y        = "Brutto_Relativ",
    col      = "Branche",
    hue      = "Branche",  # redundant with `col`, only colours the facets
    data     = br_kw_datum.loc[before_this_date, :],
    col_wrap = 3,          # facets per row
    height   = 10,
    lowess   = True,
).set(
    # Axis limits come from the full frame, not only from the plotted subset
    xlim=(br_kw_datum.Datum_Relativ.min(), br_kw_datum.Datum_Relativ.max()),
    ylim=(br_kw_datum.Brutto_Relativ.min()-0.01, br_kw_datum.Brutto_Relativ.max()+0.01)
)

for ax in g.axes.flat:
    for label in ax.get_xticklabels():
        label.set_rotation(45)

# Fix: create the target folder first; savefig raises FileNotFoundError
# when plots/br_lmplot does not exist yet.
plot_dir = file_dir / 'plots' / 'br_lmplot'
plot_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(plot_dir / 'lmplot_branche_datum.png')

***
> 1. Regression is not good enough for fitting
2. Periodic effects (yearly, seasonal, holidays) are not taken into account. The plots look messy.
***

### `fbprophet` Fourier Transformation

- https://facebook.github.io/prophet/docs/quick_start.html
- https://towardsdatascience.com/time-series-analysis-in-python-an-introduction-70d5a5b1d52a

In [None]:
from fbprophet import (
    Prophet)

from fbprophet.plot import (
    add_changepoints_to_plot)

import holidays
CH_holidays = holidays.CH()

In [None]:
def build_prophet_branche(branche_xyz, latest_date=None, periods=365):
    """Fit a Prophet model on the Brutto series of one Branche and forecast ahead.

    Reads the notebook-level frame ``br_kw_datum`` (columns "Branche", "Datum",
    "Brutto") and uses Swiss holidays from the ``holidays`` package.

    Parameters
    ----------
    branche_xyz
        Value of the "Branche" column whose rows are modelled.
    latest_date
        Exclusive upper bound on "Datum" for the training rows. Defaults to
        ``iso_to_datetime(year=2019, kw=30, day=1)``, the cutoff that was
        previously hard-coded.
    periods
        Number of daily steps appended after the training data (default 365).

    Returns
    -------
    tuple
        ``(branche_xyz, m, forecast)``: the input label, the fitted Prophet
        model and the frame returned by ``m.predict``.
    """
    if latest_date is None:
        latest_date = iso_to_datetime(year=2019, kw=30, day=1)

    print(f"Branche: {branche_xyz}")

    ############################################################################
    ## Training data: rows of this Branche before the cutoff

    select_branche = (br_kw_datum.Branche == branche_xyz)
    before_maxdate = (br_kw_datum.Datum < latest_date)

    df = (
        br_kw_datum
            .loc[select_branche & before_maxdate, ["Datum", "Brutto"]]
            .rename(columns={"Datum": "ds", "Brutto": "y"})  # Prophet requires "ds" and "y"
    )

    print(f"df.shape: {df.shape}")

    ############################################################################
    ## Swiss holidays as a (ds, holiday) frame

    # Cover at least 2005..2020 as before; extend when the forecast reaches further
    horizon_end = pd.Timestamp(latest_date) + pd.Timedelta(days=periods)
    holiday_years = list(range(2005, max(2020, horizon_end.year) + 1))

    ch_holidays = pd.DataFrame(
        holidays.CH(years=holiday_years).items(),
        columns=["ds", "holiday"],
    )

    ############################################################################
    ## Model: no weekly or daily seasonality, holidays as above

    m = Prophet(
        weekly_seasonality = False,
        daily_seasonality  = False,
        holidays           = ch_holidays,
    )
    m.fit(df)

    ############################################################################
    ## Prediction frame: history plus `periods` daily steps

    future = m.make_future_dataframe(
        periods         = periods,
        freq            = 'D',
        include_history = True,
    )

    print(f"future.shape: {future.shape}")

    forecast = m.predict(future)

    return (branche_xyz, m, forecast)

In [None]:
def plot_prophet(branche_xyz, m, forecast):
    """Plot and save the forecast and its components for one Branche.

    Writes ``<branche>_ts.png`` and ``<branche>_pltcmp.png`` below
    ``file_dir / 'plots' / 'br_prophet'`` and shows both figures.

    Parameters
    ----------
    branche_xyz
        Branche label; used as the plot title and, with "/" removed, as the file stem.
    m
        Model object providing ``plot`` and ``plot_components``.
    forecast
        Forecast frame passed on to the plotting calls.
    """
    # Fix: make sure the output folder exists; savefig fails otherwise
    plot_dir = file_dir / 'plots' / 'br_prophet'
    plot_dir.mkdir(parents=True, exist_ok=True)

    # "/" in a label would be read as a path separator, so it is removed
    file_stem = branche_xyz.replace("/", "")

    ############################################################################
    ## Known past plus predicted future

    print(f"{branche_xyz}: Plot TimeSeries")
    fig1 = m.plot(
        forecast,
        xlabel = "Datum",
        ylabel = "Brutto",
    )

    plt.title(branche_xyz, loc="right",)
    fig1.set_size_inches(100,10)

    # Mark the changepoints on the forecast axes
    add_changepoints_to_plot(fig1.gca(), m, forecast)

    # Save through the figure object instead of relying on pyplot's current figure
    fig1.savefig(plot_dir / (file_stem + '_ts.png'))
    plt.show()

    ############################################################################
    ## Forecast components as returned by plot_components

    print("\n")
    print(f"{branche_xyz}: Plot Components")
    fig2 = m.plot_components(forecast)

    fig2.set_size_inches(10,15)
    fig2.savefig(plot_dir / (file_stem + '_pltcmp.png'))
    plt.show()

    print("\n"+80*"#")

In [None]:
import warnings

In [None]:
%%time
# Fit and plot one Prophet model per Branche.

# Fix: order of first appearance instead of list(set(...)), whose order can
# differ between kernel runs and made the "No:" numbering unstable.
branchen_liste = list(br_kw_datum.loc[:,"Branche"].unique())

# Fix: suppress warnings only inside this block. The previous filters are
# restored on exit, even when a fit raises; the original left "ignore" active then.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    for branche_no, branche in enumerate(branchen_liste, start=1):

        print(f"No: {branche_no}")

        (branche_temp, m_temp, forecast_temp) = build_prophet_branche(branche_xyz=branche)

        plot_prophet(
            branche_xyz = branche_temp,
            m           = m_temp,
            forecast    = forecast_temp,
        )

### Conclusions

***
1. Interesting Branches to follow up on:
 1. Detailhandel
 2. Finanzen
 3. Pharma + Gesundheit
 4. Verkehrsbetriebe
2. Second priority to follow up on:
 1. Tabakwaren
***

In [None]:
branchen_liste

## `br_pg_kw` (Branche + Produktgruppe)

In [75]:
# Distinct (Branche, Produktgruppe) pairs found in br_pg_kw
pair_columns = ["Branche", "Produktgruppe"]
lookup_br_pg = br_pg_kw[pair_columns].drop_duplicates()

display(lookup_br_pg)
print(f"lookup_br_pg.shape: {lookup_br_pg.shape}")

Unnamed: 0,Branche,Produktgruppe
0,"BAUEN, INDUSTRIE, EINRICHTUNG","BAUEN + INDUSTRIE IMAGE, PS"
1,"BAUEN, INDUSTRIE, EINRICHTUNG",BAUSTOFFE
2,"BAUEN, INDUSTRIE, EINRICHTUNG",EINRICHTUNG
3,"BAUEN, INDUSTRIE, EINRICHTUNG",GARTEN
4,"BAUEN, INDUSTRIE, EINRICHTUNG",HAUSAUSSENAUSBAU
5,"BAUEN, INDUSTRIE, EINRICHTUNG",HAUSINNENAUSBAU
6,"BAUEN, INDUSTRIE, EINRICHTUNG",HAUSROHBAU
7,"BAUEN, INDUSTRIE, EINRICHTUNG",MASCHINEN
8,DETAILHANDEL,EINKAUFSZENTREN
9,DETAILHANDEL,EINZELHANDEL


lookup_br_pg.shape: (120, 2)


In [76]:
# Number of Produktgruppe entries listed under each Branche
pg_per_branche = lookup_br_pg.groupby("Branche").agg({"Produktgruppe": "count"})
display(pg_per_branche)

Unnamed: 0_level_0,Produktgruppe
Branche,Unnamed: 1_level_1
"BAUEN, INDUSTRIE, EINRICHTUNG",8
DETAILHANDEL,6
DIENSTLEISTUNG,4
DIGITAL + HAUSHALT,5
ENERGIE,3
FAHRZEUGE,9
FINANZEN,6
"FREIZEIT, GASTRONOMIE, TOURISMUS",2
GETRAENKE,7
INITIATIVEN + KAMPAGNEN,3


In [None]:
br_pg_kw_datum.sample(5)

### Create folder for plots: Prophet, Produktgruppen 

In [None]:
from os import mkdir

# One sub-folder per Branche below plots/pg_prophet.
folder_name = list(set(br_pg_kw.Branche))

for folder in folder_name:
    # Fix: os.mkdir raised FileExistsError when the cell was re-run and
    # FileNotFoundError when plots/pg_prophet was missing; Path.mkdir with
    # parents/exist_ok handles both cases.
    (file_dir / "plots" / "pg_prophet" / folder).mkdir(parents=True, exist_ok=True)


### lmplot

In [None]:
# One lmplot grid per Branche: a LOWESS-smoothed Brutto_Relativ curve for each
# of its Produktgruppen, restricted to weeks before calendar week 30 of 2019.
before_this_date = (br_pg_kw_datum.Datum < iso_to_datetime(year=2019,kw=30,day=1))
branchen_br_pg = list(set(br_pg_kw_datum.Branche))

# Fix: create the target folder first; savefig raises FileNotFoundError otherwise
plot_dir = file_dir / "plots" / "pg_lmplot"
plot_dir.mkdir(parents=True, exist_ok=True)

# The axis limits are identical for every Branche, so compute them once
x_limits = (br_pg_kw_datum.Datum_Relativ.min(), br_pg_kw_datum.Datum_Relativ.max())
y_limits = (br_pg_kw_datum.Brutto_Relativ.min()-0.01, br_pg_kw_datum.Brutto_Relativ.max()+0.01)

for branche_xyz in branchen_br_pg:
    print(80*"#")
    print(f"{branche_xyz}")
    branchen_select = (br_pg_kw_datum.Branche == branche_xyz)

    g = sns.lmplot(
        x        = "Datum_Relativ",
        y        = "Brutto_Relativ",
        col      = "Produktgruppe",
        hue      = "Produktgruppe",
        data     = br_pg_kw_datum.loc[(before_this_date & branchen_select),:].pipe(clean_up_categoricals),
        col_wrap = 3,  # facets per row
        height   = 10,
        lowess   = True,
    ).set(xlim=x_limits, ylim=y_limits)

    for ax in g.axes.flat:
        for label in ax.get_xticklabels():
            label.set_rotation(45)

    # "/" in a label would be read as a path separator, so it is removed
    file_name = f"lmplot_{branche_xyz}_br_pg_datum.png".replace("/","")
    plt.savefig(plot_dir / file_name)
    plt.show()

del branchen_br_pg
del before_this_date

### `fbprophet` Fourier Transformation

In [None]:
len(list(set(br_pg_kw_datum.Produktgruppe)))

In [None]:
def build_prophet_br_pg(pg_xyz, branche_xyz=None, latest_date=None, periods=365):
    """Fit a Prophet model on the Brutto series of one Produktgruppe and forecast ahead.

    Reads the notebook-level frame ``br_pg_kw_datum`` (columns "Branche",
    "Produktgruppe", "Datum", "Brutto") and uses Swiss holidays from the
    ``holidays`` package.

    Parameters
    ----------
    pg_xyz
        Value of the "Produktgruppe" column whose rows are modelled.
    branche_xyz
        Optional value of the "Branche" column. When given, only rows of that
        Branche are used, which keeps series apart if the same Produktgruppe
        label occurs under more than one Branche. ``None`` (default) keeps the
        previous behaviour of filtering on Produktgruppe alone.
    latest_date
        Exclusive upper bound on "Datum" for the training rows. Defaults to
        ``iso_to_datetime(year=2019, kw=30, day=1)``, the cutoff that was
        previously hard-coded.
    periods
        Number of daily steps appended after the training data (default 365).

    Returns
    -------
    tuple
        ``(pg_xyz, m, forecast)``: the input label, the fitted Prophet model
        and the frame returned by ``m.predict``.
    """
    if latest_date is None:
        latest_date = iso_to_datetime(year=2019, kw=30, day=1)

    print(f"Produktgruppe: {pg_xyz}")

    ############################################################################
    ## Training data: rows of this Produktgruppe before the cutoff

    row_select = (
        (br_pg_kw_datum.Produktgruppe == pg_xyz)
        & (br_pg_kw_datum.Datum < latest_date)
    )
    if branche_xyz is not None:
        row_select = row_select & (br_pg_kw_datum.Branche == branche_xyz)

    df = (
        br_pg_kw_datum
            .loc[row_select, ["Datum", "Brutto"]]
            .rename(columns={"Datum": "ds", "Brutto": "y"})  # Prophet requires "ds" and "y"
    )

    print(f"df.shape: {df.shape}")

    ############################################################################
    ## Swiss holidays as a (ds, holiday) frame

    # Cover at least 2005..2020 as before; extend when the forecast reaches further
    horizon_end = pd.Timestamp(latest_date) + pd.Timedelta(days=periods)
    holiday_years = list(range(2005, max(2020, horizon_end.year) + 1))

    ch_holidays = pd.DataFrame(
        holidays.CH(years=holiday_years).items(),
        columns=["ds", "holiday"],
    )

    ############################################################################
    ## Model: no weekly or daily seasonality, holidays as above

    m = Prophet(
        weekly_seasonality = False,
        daily_seasonality  = False,
        holidays           = ch_holidays,
    )
    m.fit(df)

    ############################################################################
    ## Prediction frame: history plus `periods` daily steps

    future = m.make_future_dataframe(
        periods         = periods,
        freq            = 'D',
        include_history = True,
    )

    print(f"future.shape: {future.shape}")

    forecast = m.predict(future)

    return (pg_xyz, m, forecast)

In [None]:
def plot_prophet_pg(pg_xyz, branche_xyz, m, forecast):
    """Plot and save the forecast and its components for one Produktgruppe.

    Writes ``<pg>_ts.png`` and ``<pg>_pltcmp.png`` below
    ``file_dir / 'plots' / 'pg_prophet' / <branche_xyz>`` and shows both figures.

    Parameters
    ----------
    pg_xyz
        Produktgruppe label; used as the plot title and, with "/" removed,
        as the file stem.
    branche_xyz
        Branche label; names the sub-folder the files are written to.
    m
        Model object providing ``plot`` and ``plot_components``.
    forecast
        Forecast frame passed on to the plotting calls.
    """
    # Fix: create the Branche sub-folder on demand so saving does not depend on
    # a separate folder-creation step having been run before.
    plot_dir = file_dir / 'plots' / 'pg_prophet' / branche_xyz
    plot_dir.mkdir(parents=True, exist_ok=True)

    # "/" in a label would be read as a path separator, so it is removed
    file_stem = pg_xyz.replace("/", "")

    ############################################################################
    ## Known past plus predicted future

    print(f"{pg_xyz}: Plot TimeSeries")
    fig1 = m.plot(
        forecast,
        xlabel = "Datum",
        ylabel = "Brutto",
    )

    plt.title(pg_xyz, loc="right",)
    fig1.set_size_inches(100,10)

    # Mark the changepoints on the forecast axes
    add_changepoints_to_plot(fig1.gca(), m, forecast)

    # Save through the figure object instead of relying on pyplot's current figure
    fig1.savefig(plot_dir / (file_stem + '_ts.png'))
    plt.show()

    ############################################################################
    ## Forecast components as returned by plot_components

    print("\n")
    print(f"{pg_xyz}: Plot Components")
    fig2 = m.plot_components(forecast)

    fig2.set_size_inches(10,15)
    fig2.savefig(plot_dir / (file_stem + '_pltcmp.png'))
    plt.show()

    print("\n"+80*"#")

In [None]:
import warnings

In [None]:
%%time
# Fit and plot one Prophet model per (Branche, Produktgruppe) pair of lookup_br_pg.
# NOTE(review): build_prophet_br_pg is called with the Produktgruppe only; if a
# Produktgruppe label exists under more than one Branche, their rows are modelled
# together — verify that the labels are unique.

# Fix: suppress warnings only inside this block. The previous filters are
# restored on exit, even when a fit raises; the original left "ignore" active then.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    for branche_xyz, pg_xyz in zip(lookup_br_pg.loc[:,"Branche"], lookup_br_pg.loc[:,"Produktgruppe"]):

        # The returned label equals pg_xyz, so the loop variable is not reassigned
        (_, m_temp, forecast_temp) = build_prophet_br_pg(pg_xyz=pg_xyz)

        plot_prophet_pg(
            branche_xyz = branche_xyz,
            pg_xyz      = pg_xyz,
            m           = m_temp,
            forecast    = forecast_temp,
        )

### Conclusions (Trends)

***
Produktgruppen mit steigenden Werbeausgaben:

1. __BAUEN, INDUSTRIE, EINRICHTUNG__
    1. MASCHINEN
    
    
2. __DETAILHANDEL__
    1. GROSSVERTEILER
    2. WARENHAEUSER
    
    
3. __DIGITAL + HAUSHALT__
    1. DIGITAL + HAUSHALT
    
    
4. __FAHRZEUGE__
    1. FAHRZEUG- KAROSSERIEBAU
    2. NUTZFAHRZEUGE (GEBRAUCHT)
    3. FAHRZEUGE (SONSTIGE)
    
    
5. __FINANZEN__
    1. FINANZEN IMAGE
    2. VERSICHERUNGEN
    3. VORSORGEPRODUKTE
    
    
6. __FREIZEIT, GASTRONOMIE, TOURISMUS__
    1. FREIZEIT
    
    
7. __GETRAENKE__
    1. BIER
    2. KAFFEE, TEE, KAKAO
    3. SPIRITUOSEN
    
    
8. __KOSMETIK + KOERPERPFLEGE__
    1. MUNDPFLEGE
    
    
9. __MEDIEN__
    1. MEDIEN IMAGE


10. __MODE + SPORT__
    1. ACCESSOIRES


11. __NAHRUNGSMITTEL__
    1. BAECKEREI
    2. NAHRUNGSMITTEL IMAGE
    3. SCHOKOLADE + SUESSWAREN
    

12. __PERSOEHNLICHER BEDARF__
    1. OPTIK + AKUSTIK
    

13. __PHARMA + GESUNDHEIT__
    1. OTC PRAEPARATE
    

14. __REINIGEN__
    1. REINIGEN IMAGE
    2. WASCHMITTEL + TEXTILPFLEGE
    

15. __TELEKOMMUNIKATION__
    1. TELEKOM GERAETE


16. __VERKEHRSBETRIEBE__
    1. KURIERE + POSTDIENSTE
    2. LUFTFAHRT
    3. STRASSEN- SCHIENENVERKEHR
    
  

***

# Connecting MediaFocus with APG

## `bd_data`

In [83]:
# Fix: build the relative path with pathlib instead of a hard-coded "\\",
# which only resolves on Windows. On Windows the resulting string is unchanged.
bd_data = load_bin(str(Path("vkprog") / "bd_data.feather"))

10:17:51 [INFO] Started loading binary file ...
10:17:51 [INFO] Reading from file C:\Users\stc\data\vkprog\bd_data.feather
10:17:51 [INFO] ... finished loading binary file in 0.16s (0.75s CPU)


In [191]:
# All columns that hold some kind of category/branch label (ID columns excluded)
branch_columns = [
    col for col in bd_data.columns if ("BRANCH" in col and "_ID" not in col)
]

# One (ENDKUNDE_NR, ENDKUNDE, APG_Branche) frame per branch column.
# Fix: the original grew the result with DataFrame.append inside the loop, which
# re-copies the accumulated frame on every pass and no longer exists in pandas 2.0.
# The frames are now collected in a list and concatenated once.
branch_frames = [
    bd_data
        .loc[:, ["ENDKUNDE_NR", "ENDKUNDE", kategorie]]
        .drop_duplicates()
        .rename(columns={kategorie: "APG_Branche"})
    for kategorie in branch_columns
]

apg_ek_br_df = (
    pd.concat(branch_frames, sort=True)
        .loc[:, ["ENDKUNDE_NR", "ENDKUNDE", "APG_Branche"]]
        # Drop customers without a label in the respective branch column
        .dropna(subset=["APG_Branche"])
        # Cut off the first five characters of each label
        # (presumably a fixed-width code prefix — confirm against the data)
        .assign(APG_Branche=lambda frame: frame.APG_Branche.map(lambda label: label[5:]))
        # The same customer/label pair can come from several branch columns
        .drop_duplicates()
)

# Cleanup:
del (branch_frames, branch_columns)

In [192]:
apg_ek_br_df.sample(20)

Unnamed: 0,ENDKUNDE_NR,ENDKUNDE,APG_Branche
359108,475125,VIDEO 2000 SA,Dienstleistung
138006,533812,Dubner Moderne Sàrl,Dienstleistung
44003,482079,the comm gmbh,Dienstleistung
882443,607394,BG Rossi Lyss,Dienstleistung
142075,110125,Jacky Sports,Veranstaltungen
1495865,168208,LUVINA,Handel / Grossverteiler
1345806,505045,Media Concept Schweiz AG,Handel
64146,101844,AXA Leben AG,Dienstleistung
728560,596040,Lussona GmbH,Bekleidung / persönlicher Bedarf
794836,644586,Bike World by SportXX,Freizeit / Touristik


## Matching with Levenshtein-Distance: Produktgruppe => APG Branchen

https://de.wikipedia.org/wiki/Levenshtein-Distanz

In [174]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [219]:
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=1, extract=None):
    """Attach fuzzy string matches from ``df_2[key2]`` to a copy of ``df_1``.

    For every value in ``df_1[key1]`` the ``limit`` best candidates from
    ``df_2[key2]`` are looked up; those whose score reaches ``threshold`` are
    joined with ", " and stored in a new column "matches" (empty string when
    no candidate qualifies).

    Parameters
    ----------
    df_1 : left table; it is NOT modified, a copy is returned
    df_2 : right table providing the candidate strings
    key1 : key column of the left table
    key2 : key column of the right table
    threshold : minimum score a candidate needs to be kept. With fuzzywuzzy this
        is a similarity score from 0 to 100 (higher is closer), not a raw
        Levenshtein distance.
    limit : number of candidates requested per value, best first
    extract : callable ``(query, choices, limit=...)`` returning
        ``(choice, score)`` tuples. Defaults to ``process.extract`` from
        fuzzywuzzy; a drop-in replacement can be passed instead.

    Returns
    -------
    A copy of ``df_1`` with the additional column "matches".
    """
    if extract is None:
        extract = process.extract

    choices = df_2.loc[:, key2].tolist()

    def best_matches(value):
        # Keep only the names of the candidates that reach the threshold
        candidates = extract(value, choices, limit=limit)
        return ', '.join(name_score[0] for name_score in candidates if name_score[1] >= threshold)

    # Fix: work on a copy. The original wrote "matches" into the caller's frame,
    # which silently changed the input table as a side effect.
    result = df_1.copy()
    result['matches'] = result.loc[:, key1].apply(best_matches)

    return result

In [239]:
# Distinct MediaFocus (Branche, Produktgruppe) pairs, renamed with an MF_ prefix;
# the Produktgruppe is cast to str for the string matching
mf_column_names = {"Branche": "MF_Branche", "Produktgruppe": "MF_Produktgruppe"}

mf_br_pg = (
    br_pg_kw[["Branche", "Produktgruppe"]]
        .drop_duplicates()
        .rename(columns=mf_column_names)
        .astype({"MF_Produktgruppe": str})
)

# Distinct APG branch labels as a one-column frame
apg_br = apg_ek_br_df.loc[:, "APG_Branche"].drop_duplicates().to_frame()

In [245]:
# Match every MediaFocus Produktgruppe against the APG branch labels
# (best candidate only, minimum score 70)
matched = fuzzy_merge(
    df_1=mf_br_pg,
    df_2=apg_br,
    key1="MF_Produktgruppe",
    key2="APG_Branche",
    threshold=70,
    limit=1,
)

mf2apg_match = (
    matched
        .rename(columns={"matches": "APG_Branche"})
        .astype({"MF_Produktgruppe": "category"})
)

In [247]:
mf2apg_match.sample(20)

Unnamed: 0,MF_Branche,MF_Produktgruppe,APG_Branche
80,NAHRUNGSMITTEL,NAEHRMITTEL,Nahrungsmittel
104,TELEKOMMUNIKATION,FIXNET,
114,VERKEHRSBETRIEBE,LUFTFAHRT,
43,GETRAENKE,BIER,Bier
5,"BAUEN, INDUSTRIE, EINRICHTUNG",HAUSINNENAUSBAU,
67,MEDIEN,FILM + PAY-TV,TV / Radio / Film
106,TELEKOMMUNIKATION,MOBILE,
73,MODE + SPORT,SPORT,Freizeit / Sport
92,PHARMA + GESUNDHEIT,OTC PRAEPARATE,
62,KOSMETIK + KOERPERPFLEGE,KOSMETIK,Dekorative Kosmetik


***
1. `mf2apg_match` ist für das Matching: trendige MF_Produktgruppen zu APG Endkunden
***

# Scribbles

## `br_wbt_kw` is weird

In [78]:
display(br_wbt_kw.head())
print("\n")
display(br_wbt_kw.shape)

Unnamed: 0,Branche,Werbungtreibender,KW_01,KW_02,KW_03,KW_04,KW_05,KW_06,KW_07,KW_08,KW_09,KW_10,KW_11,KW_12,KW_13,KW_14,KW_15,KW_16,KW_17,KW_18,KW_19,KW_20,KW_21,KW_22,KW_23,KW_24,KW_25,KW_26,KW_27,KW_28,KW_29,KW_30,KW_31,KW_32,KW_33,KW_34,KW_35,KW_36,KW_37,KW_38,KW_39,KW_40,KW_41,KW_42,KW_43,KW_44,KW_45,KW_46,KW_47,KW_48,KW_49,KW_50,KW_51,KW_52,KW_53
0,"BAUEN, INDUSTRIE, EINRICHTUNG",16ART,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,960,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"BAUEN, INDUSTRIE, EINRICHTUNG",1A BRANDS HANDELS GMBH,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1280,1344,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1400,0,0,0,0,0,0,0,0
2,"BAUEN, INDUSTRIE, EINRICHTUNG",1A GSM AG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15000,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"BAUEN, INDUSTRIE, EINRICHTUNG",1A HUNKELER FENSTER AG,0,0,0,9562,0,0,2749,0,0,0,0,0,0,7613,0,5362,14384,7613,0,0,0,0,0,10245,0,10245,0,19923,7613,9904,0,17858,0,12836,6815,10020,0,0,0,9904,0,0,10724,0,0,10020,2749,46609,1490,45288,35148,0,0
4,"BAUEN, INDUSTRIE, EINRICHTUNG",1A HUNKELER HOLZBAU AG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10245,0,0,0,9923,0,0,0,0,0,13879,17516,0,9904,7613,10245,7702,17516,15577,0,0,0,9562,0,0,0,0,0,10020,0,0,0,7875,52695,13688,21625,0,0






(35471, 55)

In [79]:
# Rows per (Branche, Werbungtreibender) pair, largest counts first;
# a count above 1 means the pair occurs on several rows
rows_per_advertiser = (
    br_wbt_kw
        .groupby(["Branche", "Werbungtreibender"])
        .agg({"KW_01": "count"})
        .sort_values("KW_01", ascending=False)
)

display(rows_per_advertiser.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,KW_01
Branche,Werbungtreibender,Unnamed: 2_level_1
FAHRZEUGE,AMAG AUTOMOBIL- + MOTOREN AG,28
DETAILHANDEL,CENTRE COMMERCIAL,4
FAHRZEUGE,AUTO-TRACHSLER AG,3
DETAILHANDEL,EKZ NEUMARKT,3
"BAUEN, INDUSTRIE, EINRICHTUNG",MENZ AG,2


In [77]:
# All rows of Werbungtreibender "CENTRE COMMERCIAL" within Branche "DETAILHANDEL"
is_detailhandel = br_wbt_kw.Branche.isin(["DETAILHANDEL"])
is_centre_commercial = br_wbt_kw.Werbungtreibender.isin(["CENTRE COMMERCIAL"])

display(br_wbt_kw.loc[is_detailhandel & is_centre_commercial, :])

Unnamed: 0,Branche,Werbungtreibender,KW_01,KW_02,KW_03,KW_04,KW_05,KW_06,KW_07,KW_08,KW_09,KW_10,KW_11,KW_12,KW_13,KW_14,KW_15,KW_16,KW_17,KW_18,KW_19,KW_20,KW_21,KW_22,KW_23,KW_24,KW_25,KW_26,KW_27,KW_28,KW_29,KW_30,KW_31,KW_32,KW_33,KW_34,KW_35,KW_36,KW_37,KW_38,KW_39,KW_40,KW_41,KW_42,KW_43,KW_44,KW_45,KW_46,KW_47,KW_48,KW_49,KW_50,KW_51,KW_52,KW_53
8075,DETAILHANDEL,CENTRE COMMERCIAL,0,0,0,0,0,0,0,0,0,0,0,0,0,7413,0,0,0,0,0,0,0,19285,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15387,0,0,0,21068,3115,0,0,0,8239,0,0,0,0
8076,DETAILHANDEL,CENTRE COMMERCIAL,0,0,0,0,0,0,0,0,28559,0,0,0,19455,0,0,0,13627,0,0,0,13024,27152,1740,0,0,8910,0,0,0,0,0,0,0,0,46636,0,0,0,0,0,0,0,0,41670,3480,0,0,0,43600,0,0,0,0
8077,DETAILHANDEL,CENTRE COMMERCIAL,0,0,0,0,231,231,4015,4015,1700,0,0,5850,850,0,0,0,0,0,11546,9216,0,0,1990,0,0,0,0,0,0,0,0,0,0,0,3154,8954,21712,13470,6422,0,0,0,0,0,0,0,0,5646,0,12338,25879,2400,0
8078,DETAILHANDEL,CENTRE COMMERCIAL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3516,0,4080,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
