# Load Modules

In [None]:
# Make pa_lib importable: it is expected in the parent of the current working
# directory, so that parent is put on the module search path.
import sys
from pathlib import Path

file_dir = Path.cwd()
parent_dir = file_dir.parent

print("file_dir: -> ", file_dir)
print(parent_dir)

sys.path.append(str(parent_dir))

In [None]:
%load_ext autoreload
%autoreload
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pa_lib.data import clean_up_categoricals, calc_col_partitioned, desc_col

from pa_lib.vis import boxplot_histogram

from pa_lib.file import project_dir, load_bin, load_csv, load_xlsx, store_xlsx
from pa_lib.util import cap_words
import os, fnmatch

# Lazy Recursive Job Dependency Request:
from pa_lib.job import request_job


# Show up to 3000 rows when a DataFrame is displayed. The limit used to be derived
# from raw_feedback, which is only created further down, so this cell raised a
# NameError on a fresh kernel (Restart & Run All).
pd.set_option("display.max_rows", 3000)
# Allow long cell contents (e.g. free-text comments) to be shown up to 500 characters.
pd.set_option("display.max_colwidth", 500)

from matplotlib.pyplot import show
import altair as alt

# Switch off Altair's maximum-rows check so charts can be built from the complete
# feedback table (see the Altair documentation on data transformers).
alt.data_transformers.disable_max_rows()

Reference (listing the files of a directory in Python): https://stackabuse.com/python-list-files-in-a-directory/

# Load Data

## Load Feedback

In [None]:
def load_feedback(sales_folder):
    """Collect the feedback of all lead-list workbooks into one DataFrame.

    Every sub-directory of ``sales_folder`` whose name sorts at or after
    ``"2019_07_15"`` is scanned for workbooks matching ``EK_LIST*.xlsx``.
    Each workbook is read with ``load_xlsx``, reduced to the five feedback
    columns (renamed to short names) and tagged with its folder and file name.

    Parameters
    ----------
    sales_folder : pathlib.Path or str
        Directory that contains one sub-directory per sales forecast.

    Returns
    -------
    pandas.DataFrame
        Columns ``Endkunde_NR``, ``Endkunde``, ``Prob_1``, ``Feedback``,
        ``Kommentar``, ``folder``, ``file_name`` and ``Kuerzel``.
        Rows without ``Endkunde_NR`` are dropped. A missing ``Feedback`` becomes
        ``"untouched"``, or ``"nur kommentiert"`` when a comment was entered.
        Rows are sorted by ``folder`` (descending) and ``file_name`` (ascending).
        The index is not reset, so row labels repeat across workbooks.
        If no workbook is found, an empty frame with these columns is returned.

    Raises
    ------
    KeyError
        If a workbook lacks one of the expected columns.
    """
    sales_folder = Path(sales_folder)
    pattern = "EK_LIST*.xlsx"

    columns_in_xlsx = [
        "Gepard-Nr. Endkunde",
        "Endkunde",
        "Chance",
        "Feedback - bitte auswählen",
        "falls nicht hilfreich, bitte hier einen kurzen Kommentar angeben - entweder pro Zeile oder für die Gesamt-Liste",
    ]
    rename_columns = ["Endkunde_NR", "Endkunde", "Prob_1", "Feedback", "Kommentar"]

    # Collect the per-workbook frames in a list and concatenate once at the end:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
    frames = []
    for kw_folder in sorted(os.listdir(sales_folder)):
        # Plain string comparison against the cut-off name (begin of automatisation).
        if kw_folder < "2019_07_15":
            continue
        # Stray files next to the forecast folders must not be listed as directories.
        if not (sales_folder / kw_folder).is_dir():
            continue

        for file in sorted(os.listdir(sales_folder / kw_folder)):
            if not fnmatch.fnmatch(file, pattern):
                continue

            # Work on an explicit copy so the column assignments below never
            # touch a view of the loaded sheet.
            vkber_df = (
                load_xlsx(sales_folder / kw_folder / file)
                .loc[:, columns_in_xlsx]
                .copy()
            )
            vkber_df.columns = rename_columns
            vkber_df["folder"] = kw_folder
            vkber_df["file_name"] = file
            frames.append(vkber_df)

    if not frames:
        # Nothing to load: hand back an empty frame with the usual schema
        # instead of failing on the missing "file_name" column.
        return pd.DataFrame(columns=rename_columns + ["folder", "file_name", "Kuerzel"])

    container_df = pd.concat(frames, ignore_index=False)

    # Characters 19-21 of the file name are taken as the sales rep's short code.
    # NOTE(review): this assumes names shaped like EK_LIST_YYYY_MM_DD_XXX.xlsx -- confirm.
    container_df["Kuerzel"] = container_df["file_name"].str[19:22]

    # Only rows with a customer number are kept; rows without any feedback or
    # comment stay in the result (they are reported as "untouched").
    feedback_exists = container_df["Endkunde_NR"].notna()

    container_df["Feedback"] = container_df["Feedback"].fillna("untouched")

    # A comment without a selected feedback value counts as "nur kommentiert".
    only_commented = container_df["Kommentar"].notna() & container_df["Feedback"].isin(
        {"untouched"}
    )
    container_df.loc[only_commented, "Feedback"] = "nur kommentiert"

    return container_df.loc[feedback_exists, :].sort_values(
        ["folder", "file_name"], ascending=[False, True]
    )

In [None]:
# Raw string literal: the backslashes of the Windows share path must be taken
# literally instead of being parsed as (invalid) escape sequences.
sales_folder = Path(r"P:\Service\Kennzahlen\Verkauf\PredictiveAnalytics")
print(sales_folder)

In [None]:
%%capture
# Read every feedback workbook below sales_folder into one DataFrame. The %%capture
# magic at the top of this cell hides whatever output the loading produces.
raw_feedback = load_feedback(sales_folder=sales_folder)

In [None]:
# Spot-check five random rows (no random_state is set, so the sample differs per run).
display(raw_feedback.sample(5))

## Load VB Information

In [None]:
# Request the job vkber_prepare.py before its output file is read.
# NOTE(review): the exact meaning of current="Today" is defined by
# pa_lib.job.request_job -- presumably "re-run unless already produced today"; confirm.
request_job(job_name="vkber_prepare.py", current="Today")  # output: vkber_data.csv

# Read the job's CSV output inside the "vkprog" project directory context.
with project_dir("vkprog"):
    vb_list = load_csv("vkber_data.csv", sep=",", encoding="UTF-8")
    
# Preview the first ten rows, then the per-column description from pa_lib's desc_col.
display(vb_list.head(10))
display(desc_col(vb_list))

# Data Prep: `kuerzel_overview` = `raw_feedback` + `vb_list`

In [None]:
def lazy_replacer(string, to_replace, replaced_with):
    """Return ``string`` with every item of ``to_replace`` swapped for ``replaced_with``.

    The items are substituted one after another in iteration order, so a later
    item is searched in the text that earlier substitutions already produced.
    """
    cleaned = string
    for fragment in to_replace:
        cleaned = cleaned.replace(fragment, replaced_with)
    return cleaned

########################################################################################
## Aggregate Data: Counts per Feedback kind
# One row per (Kuerzel, Feedback, folder); "Anzahl" is the number of leads.

kuerzel_bearbeitung = (
    pd.pivot_table(
        raw_feedback,
        values="Endkunde_NR",
        index=["Kuerzel", "Feedback", "folder"],
        aggfunc="count",
    )
    .reset_index()
    .rename(columns={"Endkunde_NR": "Anzahl"})
    .sort_values(["folder", "Kuerzel", "Feedback"])
)


########################################################################################
## Compute "Total" given leads per VB
# Same count without the Feedback split, stored under the pseudo feedback "Total".

kuerzel_total_leads = (
    pd.pivot_table(
        raw_feedback, values="Endkunde_NR", index=["folder", "Kuerzel"], aggfunc="count"
    )
    .reset_index()
    .rename(columns={"Endkunde_NR": "Anzahl"})
)

kuerzel_total_leads.loc[:, "Feedback"] = "Total"

kuerzel_bearbeitung = pd.concat(
    [kuerzel_bearbeitung, kuerzel_total_leads], sort=False
).sort_values(["folder", "Kuerzel", "Feedback"])

########################################################################################
## Flatten data:
# One row per (folder, Kuerzel), one column per Feedback kind (plus "Total").

kuerzel_overview = kuerzel_bearbeitung.pivot_table(
    index=["folder", "Kuerzel"],
    columns=["Feedback"],
    values=["Anzahl"],
    aggfunc="sum",
    fill_value=0,
).reset_index(inplace=False)

# The pivot leaves two-level column labels: ("folder", ""), ("Kuerzel", "") and
# ("Anzahl", <Feedback>). Pick the meaningful level directly; rewriting the string
# form of the tuples would mangle feedback labels that contain quotes, brackets,
# commas or the word "Anzahl".
kuerzel_overview.columns = [
    feedback_label if top_label == "Anzahl" else top_label
    for top_label, feedback_label in kuerzel_overview.columns
]

########################################################################################
## Compute "touched"
# touched = Total - untouched; if no lead at all is untouched the column does not
# exist, in which case every lead counts as touched.

kuerzel_overview.loc[:, "touched"] = kuerzel_overview["Total"] - kuerzel_overview.get(
    "untouched", 0
)

########################################################################################
## Add vb_list
# Left join keeps every feedback row, even for a Kuerzel that vb_list does not know.

kuerzel_overview = (
    pd.merge(
        kuerzel_overview, vb_list, how="left", left_on="Kuerzel", right_on="KURZZEICHEN"
    )
    .drop(columns=["MA_ID", "KURZZEICHEN"])
    .sort_values("FUNKTION")
)

In [None]:
# Spot-check five random rows of the aggregated overview (sample differs per run).
display(kuerzel_overview.sample(5))

## Summary: file description

***
> 1. `raw_feedback`: Raw data of all excels
2. `kuerzel_overview`: Aggregated feedback, one line per VB per VKPROG

``Total`` = ``hilfreich`` + ``nicht bearbeitet`` + ``nicht hilfreich`` + ``nur kommentiert`` + ``untouched``

``Total`` = ``untouched`` + ``touched``

***

# Excel-Deployment

In [None]:
# Columns of the raw feedback that go into the report (file_name is left out).
feedback_columns = [
    "Endkunde_NR",
    "Endkunde",
    "Prob_1",
    "Feedback",
    "Kommentar",
    "folder",
    "Kuerzel",
]

# One workbook with two sheets: the aggregated overview and the raw feedback rows.
with pd.ExcelWriter("feedback_report_02.xlsx") as writer:
    kuerzel_overview.to_excel(writer, sheet_name="Overview", index=False)
    raw_feedback.loc[:, feedback_columns].to_excel(
        writer, sheet_name="Feedback", index=False
    )

print(f"Check file in folder: {file_dir}")

In [None]:
# Additional export of the overview through the pa_lib helper store_xlsx.
# NOTE(review): "test.xlsx" looks like a leftover trial export and the target
# directory is decided inside pa_lib -- confirm this cell is still needed.
store_xlsx(kuerzel_overview, "test.xlsx",index=False)

# Remarks

***
> 1. Seit der Umstellung auf Random Forest `2019_11_18` ist die  Anzahl der `hilfreich` grösser als `nicht hilfreich`
2. In den einzelnen Listen sind politische Gruppierungen vorhanden. Nicht erwünscht.
3. Einige der Endkunden sind aufgelöst (zb. Insolvenz)
4. Einige Kunden buchen ausschliesslich über Agentur
5. Kunden sind eigentlich bei anderem Verkaufsberater
6. Einige Kunden haben für einmalige Events bei uns in der Vergangenheit gebucht. Keine Wiederholungen geplant (zb. Jubiläen)
***

# Feedback Count

In [None]:
# Five random feedback rows as a reminder of the table layout (sample differs per run).
raw_feedback.sample(5)

# Counts: Deployed Leads

In [None]:
# Number of leads (non-null Endkunde_NR) per forecast folder; only the last five
# folders are shown.
raw_feedback.groupby("folder")[["Endkunde_NR"]].count().tail(5)

In [None]:
# Bar chart: number of leads per sales forecast (one bar per folder).
sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(x="folder", data=raw_feedback, alpha=0.6, ax=ax)

ax.set_title("Total: Anzahl Kunden pro Verkaufsprognosen")
# Slanted, right-aligned labels keep the folder names readable.
for tick_label in ax.get_xticklabels():
    tick_label.set(rotation=30, ha="right")
ax.set_xlabel("Verkaufsprognosen")
ax.set_ylabel("Anzahl")

## Count Distribution by Feedback

In [None]:
# Grouped bar chart: leads per sales forecast, split by feedback category.
sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(15, 8))
sns.countplot(x="folder", hue="Feedback", data=raw_feedback, alpha=0.6, ax=ax)
ax.legend(loc="best")

ax.set_title("Anzahl Feedbacks pro Verkaufsprognose")
for tick_label in ax.get_xticklabels():
    tick_label.set(rotation=30, ha="right")
ax.set_xlabel("Verkaufsprognosen")
ax.set_ylabel("Anzahl")

In [None]:
# Share of each feedback category per sales forecast (normalized stacked bars).
# "count()" tallies the rows per folder and feedback directly, so raw_feedback needs
# no helper column of ones and is left unmodified by this cell. Each axis is encoded
# with its matching channel class (alt.X for x, alt.Y for y).
alt.Chart(raw_feedback).mark_bar(opacity=0.6).encode(
    x=alt.X("folder", title="Verkaufsprognose"),
    y=alt.Y("count()", stack="normalize", title="Anteil"),
    color="Feedback",
    order=alt.Order("Feedback", sort="ascending"),
).properties(height=500, width=500)


In [None]:
# The four most recent forecast folders (folder names sort chronologically as text).
folders_newest_first = sorted(set(raw_feedback.folder), reverse=True)
last_vkrpogs = set(folders_newest_first[:4])

display(last_vkrpogs)

In [None]:
# One panel per Kuerzel (three panels per row): feedback counts for the most recent
# forecasts, all panels sharing the y ticks 0..20.
yticks = range(0, 21)
recent_feedback = raw_feedback.loc[raw_feedback.folder.isin(last_vkrpogs), :]

g = sns.catplot(
    data=recent_feedback,
    kind="count",
    x="folder",
    hue="Feedback",
    col="Kuerzel",
    col_wrap=3,
    height=8,
    aspect=1,
    alpha=0.6,
)

g.set(yticks=yticks)

# plt.xticks(rotation=30, ha="right")

## Report: Bearbeitungsgrad pro VB

In [None]:
# Horizontal grouped bars: feedback counts per Kuerzel for the forecast 2019_12_30.
row_select = raw_feedback.folder.isin({"2019_12_30"})

sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(10, 20))
sns.countplot(
    y="Kuerzel",
    hue="Feedback",
    data=raw_feedback.loc[row_select, :],
    alpha=0.5,
    ax=ax,
)
ax.legend(loc="best")

ax.set_title("Anzahl Feedbacks pro Verkaufsprognose")
for tick_label in ax.get_xticklabels():
    tick_label.set(rotation=30, ha="right")
ax.set_ylabel("Kuerzel")
ax.set_xlabel("Anzahl")

## Remarks

***
>1. Vorsicht: Die Verkaufsberater erhalten jeweils eine unterschiedliche Anzahl Leads.
***

# Feedback so far: helpful? not helpful?

In [None]:
# Feedback counts for two selected forecasts, newest folder first.
vkprog_set = {"2019_12_30", "2019_12_16"}
in_selected_vkprogs = raw_feedback.folder.isin(vkprog_set)
(
    raw_feedback.loc[in_selected_vkprogs, :]
    .groupby(["folder", "Feedback"])[["Endkunde"]]
    .count()
    .sort_values("folder", ascending=False)
)

# Scribbles (IGNORE)

In [None]:
# Bar chart: total number of leads per Kuerzel, bars ordered from most to fewest leads.
sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(25, 10))
kuerzel_by_lead_count = raw_feedback.loc[:, "Kuerzel"].value_counts().index
sns.countplot(
    x="Kuerzel",
    data=raw_feedback,
    alpha=0.6,
    order=kuerzel_by_lead_count,
    ax=ax,
)

ax.set_title("Total: Anzahl Leads pro Gruppe")
for tick_label in ax.get_xticklabels():
    tick_label.set(rotation=30, ha="right")
ax.set_xlabel("Kuerzel")
ax.set_ylabel("Anzahl")

In [None]:
# Heatmap of the number of leads per Kuerzel (rows) and forecast folder (columns).
heatmap_data = pd.pivot_table(
    raw_feedback,
    index      = "folder",
    columns    = "Kuerzel",
    values     = "Endkunde_NR",
    aggfunc    = np.count_nonzero,
    fill_value = 0
    ).T

plt.figure(figsize=(12,15))

ax = sns.heatmap(
    data       = heatmap_data,
    linewidths = 0.5,
    annot      = True,
    annot_kws={"size": 15},
    )

ax.set_yticklabels(ax.get_yticklabels(), rotation=0, horizontalalignment='right')

# Set the y limits explicitly to the full row range (first row on top). Shifting the
# current limits by half a cell is only right on matplotlib 3.1.1, which cut the outer
# rows in half; on any other version that shift adds empty margins instead.
ax.set_ylim(heatmap_data.shape[0], 0)

plt.title("Heatmap: Anzahl Kunden pro Verkaufsprognosen & Kuerzel")
plt.xlabel("Verkaufsprognosen")
plt.ylabel("Kuerzel")