# Data-Prep: CRM Data

# Load libs

In [1]:

# make imports from pa_lib possible (parent directory of file's directory)
import sys
from pathlib import Path

file_dir = Path.cwd()
parent_dir = file_dir.parent
sys.path.append(str(parent_dir))

import pandas as pd
pd.options.display.max_columns = None


## Libraries & Settings ##
from pa_lib.file import load_bin
from pa_lib.util import cap_words
from pa_lib.log import time_log, info

import datetime as dt
from dateutil.relativedelta import relativedelta

from pa_lib.data import (
    clean_up_categoricals,
    unfactorize,
)

from pa_lib.data import desc_col

In [2]:
from pa_lib.data import boxplot_histogram
import numpy as np


# Load CRM data

In [None]:
def load_crm_data():
    """Load the CRM feather file and return it with renamed, typed columns.

    Every column name is passed through ``cap_words(name, sep="_")`` and the
    ``Year`` and ``KW_2`` columns are cast to ``int64``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after renaming and casting.
    """

    def _normalise(name):
        # Column-name normalisation is delegated entirely to pa_lib's cap_words.
        return cap_words(name, sep="_")

    crm_frame = load_bin("vkprog\\crm_data_vkprog.feather")
    crm_frame = crm_frame.rename(mapper=_normalise, axis="columns")

    column_types = {"Year": "int64", "KW_2": "int64"}
    return crm_frame.astype(column_types)


In [None]:
# Load the CRM contacts once; the cells below read this notebook-level frame.
raw_crm_data = load_crm_data()

In [None]:
#raw_crm_data.eval("YYYYKW_2 = Year * 100 + KW_2", inplace=True)

# View raw crm data

In [None]:
# Number of CRM records per channel ("Kanal") and year; empty cells shown as 0.
display(
    raw_crm_data.pivot_table(
        values=["Endkunde_NR"],
        index=["Kanal"],
        columns=["Year"],
        aggfunc="count",
        fill_value=0,
    )
)

# Define Groups

In [None]:
from functools import reduce

# Every distinct channel value that occurs in the raw CRM data.
all_kanal = set(raw_crm_data["Kanal"])

# Group name -> set of raw "Kanal" values that belong to the group.
# Insertion order is kept as-is because later cells iterate over this dict.
kanal_grps = {
    "Besprechung": {"Besprechung"},
    "Besuch": {"Besuch"},
    "Brief_Dankeskarte": {"Brief", "Dankeskarte"},
    "E-Mail": {"E-Mail"},
    "Event_Veranstaltung": {"Event", "Veranstaltung"},
    "Telefon": {"Telefon"},
}

# Stuff all the rest into "Anderes":
kanal_grps["Anderes"] = all_kanal.difference(*kanal_grps.values())

# Yearly aggregation per ``Kanal`` group element

In [None]:
def contacts_grouped_yrly(date_view, kanal_grps, year_span, crm_data=None, how="inner"):
    """Count CRM contacts per customer, channel group and relative year.

    For every group in ``kanal_grps`` and every ``rel_year`` in
    ``range(year_span)``, the rows whose ``Datum`` lies in the half-open window
    ``[date_view - (rel_year + 1) years, date_view - rel_year years)`` and whose
    ``Kanal`` belongs to the group are counted per ``Endkunde_NR``. Each count
    ends up in a column named ``RY_{rel_year}_Anz_{group_name}``.

    Parameters
    ----------
    date_view : datetime.datetime
        Reference date; only contacts strictly before it are counted.
    kanal_grps : dict
        Maps a group name to the collection of ``Kanal`` values in that group.
    year_span : int
        Number of one-year windows to look back (must be >= 1).
    crm_data : pandas.DataFrame, optional
        Frame with the columns ``Endkunde_NR``, ``Kanal`` and ``Datum``.
        Defaults to the notebook-level ``raw_crm_data``.
    how : str, default "inner"
        Join type used to combine the per-group/per-year count frames.
        ``"inner"`` (the original behaviour) keeps only customers that occur
        in every single count frame. With any other join type, counts that are
        missing after the join are filled with 0.

    Returns
    -------
    pandas.DataFrame
        One row per customer: ``Endkunde_NR`` followed by the count columns,
        ordered by group (dict order) and then by relative year.

    Raises
    ------
    ValueError
        If ``kanal_grps`` is empty or ``year_span`` is smaller than 1.
    """
    if crm_data is None:
        # Backward compatible default: fall back to the notebook-level frame.
        crm_data = raw_crm_data
    if not kanal_grps:
        raise ValueError("kanal_grps must contain at least one group")
    if year_span < 1:
        raise ValueError("year_span must be >= 1")

    def yrl_kanal_contacts(group_name, rel_year):
        """Contact counts per customer for one group and one yearly window."""
        window_end = date_view - relativedelta(years=rel_year)
        window_start = date_view - relativedelta(years=rel_year + 1)
        in_window = (
            crm_data["Kanal"].isin(kanal_grps[group_name])
            & (crm_data["Datum"] < window_end)
            & (crm_data["Datum"] >= window_start)
        )
        return (
            crm_data.loc[in_window]
            .groupby("Endkunde_NR")["Kanal"]
            .count()
            .rename(f"RY_{rel_year}_Anz_{group_name}")
            .reset_index()
        )

    # Start from the first count frame instead of a separately queried seed:
    # the former seed (second group, year 0) is merged in the loop anyway, so
    # it never changed the inner-join result but broke with a single group.
    container_df = None
    for name in kanal_grps:
        for rel_year in range(year_span):
            counts = yrl_kanal_contacts(name, rel_year)
            if container_df is None:
                container_df = counts
            else:
                container_df = pd.merge(container_df, counts, on="Endkunde_NR", how=how)

    if how != "inner":
        # Non-inner joins leave NaN where a customer had no contact in a window.
        count_cols = [col for col in container_df.columns if col != "Endkunde_NR"]
        container_df = container_df.fillna({col: 0 for col in count_cols}).astype(
            {col: "int64" for col in count_cols}
        )
    return container_df

## Delta(view_date, last_contact)

In [None]:
def delta_contact(date_view, kanal_grps, crm_data=None):
    """Days between ``date_view`` and each customer's latest contact per group.

    Only contacts with ``Datum`` strictly before ``date_view`` are considered.
    For every customer and channel group the most recent contact date is
    taken and converted into a (fractional) number of days before
    ``date_view``.

    Unlike the previous version, this function works on a local copy and no
    longer writes a ``Kanal_Grps`` column into the source frame, so labels
    from an earlier call with different groups cannot leak into the result.
    Group names are used verbatim in the output column names.

    Parameters
    ----------
    date_view : datetime.datetime
        Reference date.
    kanal_grps : dict
        Maps a group name to the collection of ``Kanal`` values in that group.
        If a ``Kanal`` value is listed in several groups, the last group in
        dict order wins.
    crm_data : pandas.DataFrame, optional
        Frame with the columns ``Endkunde_NR``, ``Kanal`` and ``Datum``.
        Defaults to the notebook-level ``raw_crm_data``.

    Returns
    -------
    pandas.DataFrame
        ``Endkunde_NR``, one ``Letzter_Kontakt_Delta_<group>`` column per group
        that has at least one contact (NaN where a customer has none), and
        ``Letzter_Kontakt_Delta_global`` holding the row-wise minimum.
    """
    if crm_data is None:
        # Backward compatible default: fall back to the notebook-level frame.
        crm_data = raw_crm_data

    # Local working copy restricted to contacts before the reference date.
    contacts = crm_data.loc[
        crm_data["Datum"] < date_view, ["Endkunde_NR", "Kanal", "Datum"]
    ].assign(Kanal_Grps=None)
    for name, kanale in kanal_grps.items():
        contacts.loc[contacts["Kanal"].isin(kanale), "Kanal_Grps"] = name

    # Latest contact per customer and group (rows without a group are dropped
    # by groupby because their key is missing).
    last_contact = (
        contacts.groupby(["Endkunde_NR", "Kanal_Grps"])
        .agg({"Datum": "max"})
        .reset_index()
    )
    last_contact["delta_days"] = (
        (date_view - last_contact["Datum"]).dt.total_seconds() / 86400
    )  # delta in days

    # A scalar `values` keeps the pivot columns single-level, so the names can
    # be built directly instead of parsing stringified tuples.
    flatten_df = (
        last_contact.pivot_table(
            index="Endkunde_NR",
            columns="Kanal_Grps",
            values="delta_days",
            aggfunc="min",
        )
        .add_prefix("Letzter_Kontakt_Delta_")
        .rename_axis(columns=None)
        .reset_index()
    )

    flatten_df["Letzter_Kontakt_Delta_global"] = flatten_df.drop(
        columns="Endkunde_NR"
    ).min(axis=1, skipna=True)

    # TODO (carried over from the original): Letzte_Buchung_Delta

    return flatten_df

# Wrapper function that does everything in one go

In [None]:
def crm_train_scoring(day, month, year_score, year_train, year_span):
    """Build the CRM feature frames for the training and the scoring date.

    Both reference dates share ``day`` and ``month`` and differ only in the
    year (``year_train`` vs. ``year_score``). For each date the last-contact
    deltas and the yearly contact counts are computed with the notebook-level
    ``kanal_grps`` and inner-joined on ``Endkunde_NR``.

    Returns
    -------
    tuple
        ``(crm_train_df, crm_score_df)``.
    """

    def crm_prep(date_view, year_span):
        # Same call order as before: deltas first, then the yearly counts.
        recency_df = delta_contact(date_view=date_view, kanal_grps=kanal_grps)
        yearly_df = contacts_grouped_yrly(
            date_view=date_view, kanal_grps=kanal_grps, year_span=year_span
        )
        return pd.merge(left=yearly_df, right=recency_df, how="inner", on="Endkunde_NR")

    # NOTE (from the original author): only works for odd calendar weeks!!!
    # The reason is not visible in this notebook.
    score_date = dt.datetime(year=year_score, month=month, day=day)
    train_date = dt.datetime(year=year_train, month=month, day=day)

    train_features, score_features = (
        crm_prep(date_view=snapshot, year_span=year_span)
        for snapshot in (train_date, score_date)
    )
    return (train_features, score_features)
    
    

# Testing Wrapper-Function

In [None]:
# Build both feature frames: training as of 2018-09-23, scoring as of 2019-09-23,
# each looking back over four one-year windows.
crm_train_df, crm_score_df = crm_train_scoring(day=23,
                                               month=9,
                                               year_score=2019,
                                               year_train=2018,
                                               year_span=4)

In [None]:
# Quick look at the first rows of the training feature frame.
crm_train_df.head()

In [None]:
def lazy_bxplt_hstgrm(delta_contacts_df):
    """Print each column name and plot its strictly positive values.

    The first column is skipped; every remaining column is passed, restricted
    to values ``> 0``, to ``boxplot_histogram``.
    """
    feature_columns = delta_contacts_df.columns[1:]
    for name in feature_columns:
        print(name)
        values = delta_contacts_df[name]
        boxplot_histogram(values[values > 0])

In [None]:
# Column overview of the training features via pa_lib's desc_col helper.
desc_col(crm_train_df)

In [None]:
# Plot the positive values of every feature column (all columns except the first).
lazy_bxplt_hstgrm(crm_train_df)

In [None]:
# Summary statistics for the columns from position 28 onwards.
# NOTE(review): the offset 28 is hard-coded — presumably tied to the number of
# groups and year_span used above; confirm if either changes.
crm_train_df.iloc[:,28:].describe()

In [None]:
# Scratch calculation (≈ 5.98); presumably 2181 days expressed in years — TODO confirm.
2181/365