In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell runs, so edits to the local `src` package take effect without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

# IMPORTS

In [1]:
import os
from dotenv import load_dotenv

# data
import pandas as pd
import numpy as np

# other
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# src
from src.general.io import to_pickle, from_pickle
from src.features.nlp import lemmatize, tokenize, filter_text_column, filter_rows_with_string, generate_ngrams

# PATHS & NAMES

In [5]:
# Read the project's .env file so the *_DIR settings used below can be looked
# up through os.getenv.
load_dotenv()

# Raw input files keyed by a short alias. Every path is built as
# ".." + <value of RAW_DATA_DIR> + "/" + <file name>.
RAW_DATA_PATHS = {
    alias: f"..{os.getenv('RAW_DATA_DIR')}/{raw_file}"
    for alias, raw_file in (
        ("feb", "event_stg_user_input_web_cw6.txt"),
        ("mar", "event_stg_user_input_web_cw10.txt"),
        ("apr", "event_stg_user_input_web_cw15.txt"),
        ("may", "event_stg_user_input_web_cw19.txt"),
        ("feedback", "current_user_feedback_text.xlsx"),
    )
}


# Externally supplied inputs keyed by alias. Paths are built as
# ".." + <value of EXTERNAL_DATA_DIR> + "/" + <file name>.
EXT_DATA_PATHS = {
    alias: f"..{os.getenv('EXTERNAL_DATA_DIR')}/{ext_file}"
    for alias, ext_file in (("rasa", "rasa_train_data.pkl"),)
}


# Intermediate pickles keyed by alias. Paths are built as
# ".." + <value of INTERIM_DATA_DIR> + "/" + <file name>. The file names keep
# their existing "embeding(s)" spelling.
INTERIM_DATA_PATHS = {
    alias: f"..{os.getenv('INTERIM_DATA_DIR')}/{interim_file}"
    for alias, interim_file in (
        ("rasa_docs", "rasa_docs.pkl"),
        ("rasa_emb", "rasa_embedings.pkl"),
    )
}

# For every raw data set register three more targets, keyed "<alias>_doc",
# "<alias>_emb" and "<alias>_tok".
for alias in RAW_DATA_PATHS:
    INTERIM_DATA_PATHS.update({
        f"{alias}_doc": f"..{os.getenv('INTERIM_DATA_DIR')}/{alias}_docs.pkl",
        f"{alias}_emb": f"..{os.getenv('INTERIM_DATA_DIR')}/{alias}_embeding.pkl",
        f"{alias}_tok": f"..{os.getenv('INTERIM_DATA_DIR')}/{alias}_tokens.pkl",
    })


# One HTML report target per raw data set, named "eda_<alias>.html" and placed
# under the directory given by REPORT_DIR.
EDA_REPORT_PATHS = {
    alias: f"..{os.getenv('REPORT_DIR')}/eda_{alias}.html"
    for alias in RAW_DATA_PATHS
}


# Location handed to SentenceTransformer: the "sbert" entry under MODELS_DIR.
SBERT_PATH = f"..{os.getenv('MODELS_DIR')}/sbert"


# --- Feedback data -----------------------------------------------------------
# Columns selected from the feedback workbook.
KEEP_COLS_FEEDBACK = [
    "session_id",
    "created_dt",
    "channel",
    "feedback_text_list",
    "dialog_engine",
    "topic_list",
    "topic_count",
]

# --- User input --------------------------------------------------------------
# Presumably the columns of interest in the user-input exports -- TODO confirm.
KEEP_COLS_USERINPUT = [
    "session_id",
    "event_id",
    "created_dt",
    "user_input",
    "top_intent_1",
    "content_version",
]

# Column holding the text a user typed, and the column used to group rows
# into one conversation.
TEXT_COL_USERINPUT = "user_input"
ID_COL_USERINPUT = "session_id"

# Presumably the topic count for a later topic-modelling step -- TODO confirm.
N_TOPICS = 20

In [None]:
# Load the sentence-embedding model stored under SBERT_PATH.
embeding_model = SentenceTransformer(SBERT_PATH)

# DATA

## Feedback data

In [4]:
# Load the raw user-feedback workbook (.xlsx) into a DataFrame.
feedback = pd.read_excel(RAW_DATA_PATHS["feedback"])

In [25]:
# Narrow the feedback frame to the configured columns, then persist the
# free-text feedback column as a plain Python list.
df = feedback[KEEP_COLS_FEEDBACK]
feedback_docs = df["feedback_text_list"].tolist()
to_pickle(feedback_docs, INTERIM_DATA_PATHS["feedback_doc"])

True

## Rasa NLU training examples

In [26]:
# Load the pickled Rasa training data (indexed by column name below, so
# presumably a DataFrame -- TODO confirm).
rasa_data = from_pickle(EXT_DATA_PATHS["rasa"])

# Example texts as a plain list.
example_column = rasa_data["example"]
rasa_docs = example_column.tolist()
to_pickle(rasa_docs, INTERIM_DATA_PATHS["rasa_docs"])

# Stack the per-row embeddings into a single 2-D array.
embedding_rows = rasa_data["embeding"].to_numpy()
rasa_embedings = np.vstack(embedding_rows)
to_pickle(rasa_embedings, INTERIM_DATA_PATHS["rasa_emb"])

True

## User input examples

In [10]:
# Build and cache per-month document and token lists from the raw user-input
# exports. For every month two pickles are written:
#   INTERIM_DATA_PATHS["<month>_doc"] - one space-joined string per session
#   INTERIM_DATA_PATHS["<month>_tok"] - output of generate_ngrams(..., n=4) on
#                                       the tokenized, lemmatized documents
months_to_process = ["feb", "mar", "apr", "may"]

# Hand tqdm the list itself. A bare integer as first positional argument is
# taken as the `iterable`, which leaves the bar without a total (no percentage
# or ETA). Iterating the bar also advances and closes it automatically.
pbar = tqdm(months_to_process)

for month in pbar:
    pbar.set_description(f"{month=}")

    # read the pipe-separated export for this month
    df = pd.read_csv(RAW_DATA_PATHS[month], sep='|', low_memory=False)

    # clean up: per the original note, filter_text_column drops numeric inputs
    # and special characters, and filter_rows_with_string keeps only rows whose
    # top_intent_1 matches "tks" (helper semantics not visible here -- confirm)
    cleant = filter_text_column(df, TEXT_COL_USERINPUT)
    cleant = filter_rows_with_string(cleant, "tks", "top_intent_1")

    # collapse every conversation (session) into one space-joined string
    grouped = cleant.groupby(ID_COL_USERINPUT)[TEXT_COL_USERINPUT].apply(' '.join).reset_index()

    docs = grouped[TEXT_COL_USERINPUT].to_list()
    to_pickle(docs, INTERIM_DATA_PATHS[f"{month}_doc"])

    # NOTE: sentence embeddings for the monthly documents are not computed in
    # this cell, so the "<month>_emb" targets are left unwritten here.

    lemmatized_docs = lemmatize(docs=docs, model_name="de_core_news_lg")
    tokens = tokenize(docs=lemmatized_docs)
    tokens = generate_ngrams(tokens, n=4)
    to_pickle(tokens, INTERIM_DATA_PATHS[f"{month}_tok"])

month='may': : 4it [02:01, 30.38s/it]
