In [13]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell execution, so edits to the project's `src` modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# IMPORTS

In [14]:
import os
from dotenv import load_dotenv

# data
import pandas as pd

# gensim
import gensim
from gensim.corpora.dictionary import Dictionary

# spacy
from spacy.lang.de.stop_words import STOP_WORDS
STOP_WORDS = list(STOP_WORDS)

# scikit
from sklearn.feature_extraction.text import TfidfVectorizer # ,CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
# from sklearn.datasets import fetch_20newsgroups

# topicwizard
import topicwizard
from topicwizard.pipeline import make_topic_pipeline
# from topicwizard.compatibility import gensim_pipeline

# pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models

# src
from src.general.io import to_pickle, from_pickle
from src.general.io import disable_warnings
from src.models.custom_lda import CustomLDA

In [15]:
# Project helper imported from src.general.io.
# NOTE(review): presumably silences library warnings for the rest of the
# session — confirm against its implementation.
disable_warnings()

# PATHS & NAMES

In [16]:
# Load the variables declared in the project's .env file into the environment
# so the os.getenv calls below can see them.
load_dotenv()

# Directory settings come from the environment. Each value is prefixed with
# ".." exactly as before, so the resulting path strings are unchanged.
_RAW_DIR = f"..{os.getenv('RAW_DATA_DIR')}"
_EXT_DIR = f"..{os.getenv('EXTERNAL_DATA_DIR')}"
_INTERIM_DIR = f"..{os.getenv('INTERIM_DATA_DIR')}"
_REPORT_DIR = f"..{os.getenv('REPORT_DIR')}"

# Raw inputs: short name -> file path. Written as one mapping so a name can
# never drift out of step with its file (the previous version zipped two
# parallel lists).
RAW_DATA_PATHS = {
    "feb": f"{_RAW_DIR}/event_stg_user_input_web_cw6.txt",
    "mar": f"{_RAW_DIR}/event_stg_user_input_web_cw10.txt",
    "apr": f"{_RAW_DIR}/event_stg_user_input_web_cw15.txt",
    "may": f"{_RAW_DIR}/event_stg_user_input_web_cw19.txt",
    "feedback": f"{_RAW_DIR}/current_user_feedback_text.xlsx",
}

# External inputs.
EXT_DATA_PATHS = {
    "rasa": f"{_EXT_DIR}/rasa_train_data.pkl",
}

# Interim artefacts: the two rasa files plus a docs / embedding / tokens pickle
# for every raw data set.
INTERIM_DATA_PATHS = {
    "rasa_docs": f"{_INTERIM_DIR}/rasa_docs.pkl",
    "rasa_emb": f"{_INTERIM_DIR}/rasa_embedings.pkl",
}
for short_name in RAW_DATA_PATHS:
    INTERIM_DATA_PATHS[f"{short_name}_doc"] = f"{_INTERIM_DIR}/{short_name}_docs.pkl"
    INTERIM_DATA_PATHS[f"{short_name}_emb"] = f"{_INTERIM_DIR}/{short_name}_embeding.pkl"
    INTERIM_DATA_PATHS[f"{short_name}_tok"] = f"{_INTERIM_DIR}/{short_name}_tokens.pkl"

# One EDA report per raw data set.
EDA_REPORT_PATHS = {
    short_name: f"{_REPORT_DIR}/eda_{short_name}.html"
    for short_name in RAW_DATA_PATHS
}

TEXT_COL_USERINPUT = "user_input"
ID_COL_USERINPUT = "session_id"

N_TOPICS = 20
MONTHS = ["feb", "mar", "apr", "may"]

# Seed for the stochastic models. Later cells pass `random_state=SEED`, but no
# cell defined it, which raises NameError on a fresh kernel. Read it from the
# SEED environment variable, falling back to 42 when the variable is unset.
SEED = int(os.getenv("SEED", "42"))

In [17]:
# RAWFILE_FOLDER = "../data/raw"
# INTERIM_FOLDER = "../data/interim"
# EXTERNAL_FOLDER = "../data/external"
# REPORT_FOLDER = "../reports"
# MODELS_FOLDER = "../models"

# # processed
# FEEDBACK_PROCESSED_FILEPATH = f"{INTERIM_FOLDER}/feedback_docs.pkl"
# RASA_DOCS_FILEPATH = f"{INTERIM_FOLDER}/rasa_docs.pkl"
# RASA_EMBEDINGS_FILEPATH = f"{INTERIM_FOLDER}/rasa_embedings.pkl"
# RASA_FILEPATH = f"{EXTERNAL_FOLDER}/rasa_train_data.pkl"

# FEB_DOCS_FILEPATH = f"{INTERIM_FOLDER}/february_docs.pkl"
# MAR_DOCS_FILEPATH = f"{INTERIM_FOLDER}/march_docs.pkl"
# APR_DOCS_FILEPATH = f"{INTERIM_FOLDER}/april_docs.pkl"
# MAY_DOCS_FILEPATH = f"{INTERIM_FOLDER}/may_docs.pkl"

# FEB_TOKENS_FILEPATH = f"{INTERIM_FOLDER}/february_tokens.pkl"
# MAR_TOKENS_FILEPATH = f"{INTERIM_FOLDER}/march_tokens.pkl"
# APR_TOKENS_FILEPATH = f"{INTERIM_FOLDER}/april_tokens.pkl"
# MAY_TOKENS_FILEPATH = f"{INTERIM_FOLDER}/may_tokens.pkl"

# FEB_EMBEDING_FILEPATH = f"{INTERIM_FOLDER}/february_embeding.pkl"
# MAR_EMBEDING_FILEPATH = f"{INTERIM_FOLDER}/march_embeding.pkl"
# APR_EMBEDING_FILEPATH = f"{INTERIM_FOLDER}/april_embeding.pkl"
# MAY_EMBEDING_FILEPATH = f"{INTERIM_FOLDER}/may_embeding.pkl"

# CUSTOM_LDA_PATH = f"{MODELS_FOLDER}/lda.pkl"

# SEED = 42
# N_TOPICS = 20

# DATA

In [18]:
# Load the per-month pickles: tokens into `token_data`, documents into
# `docs_data`, both keyed by the month's short name.
token_data, docs_data = {}, {}
for month in MONTHS:
    for suffix, target in (("tok", token_data), ("doc", docs_data)):
        target[month] = from_pickle(INTERIM_DATA_PATHS[f"{month}_{suffix}"])

# Topic modelling

## gensim

### LDA

In [19]:
# # Fit LDA model
# gensim_model = gensim.models.ldamodel.LdaModel(
#     corpus = corpus,      # Document-Term Matrix
#     id2word = id2word,    # Map word IDs to words
#     num_topics = N_TOPICS,      # Number of latent topics to extract
#     random_state = SEED,
#     passes = 100,         # N° of passes through the corpus during training
#     )

In [20]:
# # Visualize with pyLDAvis
# pyLDAvis.enable_notebook()
# visualization = pyLDAvis.gensim_models.prepare(gensim_model, corpus, id2word, mds = "mmds", R = 21)
# visualization

### Custom LDA

In [21]:
# Build the project's LDA wrapper on the February tokens, seeded from the SEED
# environment variable.
lda_seed = int(os.getenv('SEED'))
custom_lda = CustomLDA(token_data["feb"], seed=lda_seed)

#### hyper-parameters optimisation

In [22]:
# Flip to True to re-run the hyper-parameter grid search and overwrite the
# pickled model; while False this cell does nothing.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    custom_lda.grid_search_hyperparameters()
    # Prefix the env-provided directory with "..", as every other path in this
    # notebook does. Without it the model was written to a different location
    # than the rest of the project artefacts.
    to_pickle(custom_lda, f"..{os.getenv('MODELS_DIR')}/lda.pkl")

    print(f"{custom_lda.best_num_topics=}, {custom_lda.best_alpha=}, {custom_lda.best_beta=}")
    # A bare expression inside an `if` body is not rendered by Jupyter, so the
    # results table has to be displayed explicitly.
    display(custom_lda.df_grid_results.head())

#### 50 topics

##### February

In [24]:
# When fit() is called without arguments, the best parameters (as estimated by
# the grid search) are used.
custom_lda.fit()
# Show the visualisation for the "train" corpus.
custom_lda.visualize("train")

##### March

In [26]:
# Add the March tokens to the fitted model under the "mar" label, then
# visualise that label.
corpus_label = "mar"
custom_lda.add_corpus(token_data[corpus_label], corpus_label)
custom_lda.visualize(corpus_label)

In [None]:
# # new corpus
# corpus_mar = [custom_lda.id2word.doc2bow(doc) for doc in tokens_mar]
# predictions_mar = custom_lda.model[corpus_mar]

# # visual
# pyLDAvis.enable_notebook()
# visualization = pyLDAvis.gensim_models.prepare(custom_lda.model, corpus_mar, custom_lda.id2word, mds="mmds", R=21)
# visualization

##### April

In [27]:
# Add the April tokens to the fitted model under the "apr" label, then
# visualise that label.
corpus_label = "apr"
custom_lda.add_corpus(token_data[corpus_label], corpus_label)
custom_lda.visualize(corpus_label)

In [26]:
# # new corpus
# corpus_apr = [custom_lda.id2word.doc2bow(doc) for doc in tokens_apr]
# predictions_apr = custom_lda.model[corpus_apr]

# # visual
# pyLDAvis.enable_notebook()
# visualization = pyLDAvis.gensim_models.prepare(custom_lda.model, corpus_apr, custom_lda.id2word, mds="mmds", R=21)
# visualization

##### May

In [28]:
# Add the May tokens to the fitted model under the "may" label, then
# visualise that label.
corpus_label = "may"
custom_lda.add_corpus(token_data[corpus_label], corpus_label)
custom_lda.visualize(corpus_label)

In [29]:
# # new corpus
# corpus_may = [custom_lda.id2word.doc2bow(doc) for doc in tokens_may]
# predictions_may = custom_lda.model[corpus_may]

# # visual
# pyLDAvis.enable_notebook()
# visualization = pyLDAvis.gensim_models.prepare(custom_lda.model, corpus_may, custom_lda.id2word, mds="mmds", R=21)
# visualization 

### NMF

In [11]:
# Build the gensim inputs inside this cell so it runs on a fresh kernel instead
# of relying on `corpus` / `id2word` / `SEED` left over from an earlier session.
# Assumes token_data["feb"] is a list of token lists (the same input CustomLDA
# is trained on) — TODO confirm.
nmf_tokens = token_data["feb"]
id2word = Dictionary(nmf_tokens)                        # Map word IDs to words
corpus = [id2word.doc2bow(doc) for doc in nmf_tokens]   # Bag-of-words per document

# Fit NMF model
nmf_model = gensim.models.Nmf(
    corpus=corpus,                         # Document-Term Matrix
    id2word=id2word,                       # Map word IDs to words
    num_topics=N_TOPICS,                   # Number of latent topics to extract
    random_state=int(os.getenv('SEED')),   # Same seed source as the CustomLDA cell
    passes=100,                            # Number of passes through the corpus during training
)

# Get the topics sorted by sparsity
pd.DataFrame(nmf_model.show_topics(), columns=["topic", "keywords"])

Unnamed: 0,topic,keywords
0,17,"0.365*""der"" + 0.015*""nicht"" + 0.013*""beheben"" ..."
1,18,"0.119*""techniker"" + 0.109*""zu"" + 0.066*""kommen..."
2,6,"0.289*""ein"" + 0.040*""techniker"" + 0.026*""haben..."
3,19,"0.314*""mein"" + 0.043*""nicht"" + 0.019*""auf"" + 0..."
4,11,"0.146*""nicht"" + 0.125*""da"" + 0.125*""warum"" + 0..."
5,14,"0.373*""nicht"" + 0.311*""funktionieren"" + 0.022*..."
6,3,"0.364*""festnetz"" + 0.056*""mein"" + 0.027*""mit"" ..."
7,16,"0.332*""gehen"" + 0.314*""nicht"" + 0.015*""telefon..."
8,1,"0.419*""wlan"" + 0.054*""verbindung"" + 0.015*""anz..."
9,7,"0.489*""telefon"" + 0.012*""beispielort"" + 0.012*..."


## Sklearn

In [12]:
# TF-IDF vectoriser shared by both sklearn topic pipelines. STOP_WORDS is the
# spaCy German stop-word set, converted to a list in the imports cell.
vectorizer = TfidfVectorizer(stop_words=STOP_WORDS)

In [13]:
# Read the seed from the environment (as the CustomLDA cell does) rather than
# relying on a `SEED` global that may not exist on a fresh kernel.
sklearn_seed = int(os.getenv('SEED'))

# Both decompositions extract N_TOPICS components and share the same seed.
model_nmf = NMF(n_components=N_TOPICS, random_state=sklearn_seed)
model_lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=sklearn_seed)

### NMF

In [14]:
# Bind `docs` here so the cell does not depend on a variable left over from an
# earlier kernel session.
# NOTE(review): uses the February documents, matching the month CustomLDA is
# trained on, and assumes they are raw text strings — confirm both.
docs = docs_data["feb"]

# Vectorise, fit the NMF topic model, then open the topicwizard visualisation.
topic_pipeline = make_topic_pipeline(vectorizer, model_nmf)
topic_pipeline.fit(docs)
topicwizard.visualize(docs, model=topic_pipeline)

Preprocessing
Inferring topical content for documents.


### LDA

In [15]:
# Bind `docs` here so the cell does not depend on a variable left over from an
# earlier kernel session.
# NOTE(review): uses the February documents, matching the month CustomLDA is
# trained on, and assumes they are raw text strings — confirm both.
docs = docs_data["feb"]

# Vectorise, fit the LDA topic model, then open the topicwizard visualisation.
topic_pipeline = make_topic_pipeline(vectorizer, model_lda)
topic_pipeline.fit(docs)
topicwizard.visualize(docs, model=topic_pipeline)

Preprocessing
Inferring topical content for documents.
