In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell execution, so edits to the project's own `src`
# package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# IMPORTS

In [2]:
import os
from dotenv import load_dotenv

# data
import pandas as pd

# gensim
import gensim
from gensim.corpora.dictionary import Dictionary

# spacy
from spacy.lang.de.stop_words import STOP_WORDS
STOP_WORDS = list(STOP_WORDS)

# scikit
from sklearn.feature_extraction.text import TfidfVectorizer # ,CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
# from sklearn.datasets import fetch_20newsgroups

# topicwizard
import topicwizard
from topicwizard.pipeline import make_topic_pipeline
# from topicwizard.compatibility import gensim_pipeline

# pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models

# src
from src.general.io import to_pickle, from_pickle
from src.general.io import disable_warnings
from src.models.custom_lda import CustomLDA

In [3]:
# Project helper imported from src.general.io; presumably silences library
# warnings for the rest of the session — confirm against its implementation.
disable_warnings()

# PATHS & NAMES

In [4]:
# Pull the project settings (data / report / model directories, SEED) from the
# .env file into the process environment.
load_dotenv()

# Each *_DIR value is appended directly to "..", so it is expected to carry its
# own leading slash (e.g. "/data/raw") — TODO confirm against the .env file.
# An unset variable yields the literal text "None" in the path, exactly as the
# per-path os.getenv() calls did before.
_raw_dir = f"..{os.getenv('RAW_DATA_DIR')}"
_external_dir = f"..{os.getenv('EXTERNAL_DATA_DIR')}"
_interim_dir = f"..{os.getenv('INTERIM_DATA_DIR')}"
_report_dir = f"..{os.getenv('REPORT_DIR')}"

# Raw input files, keyed by the short name used throughout the notebook.
RAW_DATA_PATHS = {
    "feb": f"{_raw_dir}/event_stg_user_input_web_cw6.txt",
    "mar": f"{_raw_dir}/event_stg_user_input_web_cw10.txt",
    "apr": f"{_raw_dir}/event_stg_user_input_web_cw15.txt",
    "may": f"{_raw_dir}/event_stg_user_input_web_cw19.txt",
    "feedback": f"{_raw_dir}/current_user_feedback_text.xlsx",
}

# Externally provided data.
EXT_DATA_PATHS = {"rasa": f"{_external_dir}/rasa_train_data.pkl"}

# Intermediate artefacts: the two rasa files first, then a docs / embedding /
# tokens pickle for every raw data set.
INTERIM_DATA_PATHS = {
    "rasa_docs": f"{_interim_dir}/rasa_docs.pkl",
    "rasa_emb": f"{_interim_dir}/rasa_embedings.pkl",
}
for short_name in RAW_DATA_PATHS:
    INTERIM_DATA_PATHS[f"{short_name}_doc"] = f"{_interim_dir}/{short_name}_docs.pkl"
    INTERIM_DATA_PATHS[f"{short_name}_emb"] = f"{_interim_dir}/{short_name}_embeding.pkl"
    INTERIM_DATA_PATHS[f"{short_name}_tok"] = f"{_interim_dir}/{short_name}_tokens.pkl"

# One EDA HTML report per raw data set.
EDA_REPORT_PATHS = {
    short_name: f"{_report_dir}/eda_{short_name}.html" for short_name in RAW_DATA_PATHS
}

TEXT_COL_USERINPUT = "user_input"
ID_COL_USERINPUT = "session_id"

N_TOPICS = 20
MONTHS = ["feb", "mar", "apr", "may"]

# Random seed for the gensim / sklearn model cells, which reference SEED but
# never assign it. Read from the same environment variable the CustomLDA cells
# use; os.environ[...] raises a KeyError naming the variable if it is missing.
SEED = int(os.environ["SEED"])

# DATA

In [6]:
# Load the pickled interim artefacts (tokens and docs) for every month.
token_data = {}
docs_data = {}
for month in MONTHS:
    token_data[month] = from_pickle(INTERIM_DATA_PATHS[f"{month}_tok"])
    docs_data[month] = from_pickle(INTERIM_DATA_PATHS[f"{month}_doc"])

# The gensim cells below read `corpus` / `id2word` and the sklearn cells read
# `docs`, but no cell assigned them, so a fresh "Restart & Run All" failed with
# NameError. They are derived here from the month CustomLDA is trained on.
# NOTE(review): assumes token_data[month] is a list of token lists and
# docs_data[month] an iterable of text strings — TODO confirm against the
# preprocessing notebook, and confirm "feb" is the month originally used.
TRAIN_MONTH = "feb"
docs = docs_data[TRAIN_MONTH]
id2word = Dictionary(token_data[TRAIN_MONTH])  # token <-> integer id mapping
corpus = [id2word.doc2bow(tokens) for tokens in token_data[TRAIN_MONTH]]  # bag-of-words per document

# Topic modelling

## gensim

### LDA

In [7]:
# # Fit LDA model
# gensim_model = gensim.models.ldamodel.LdaModel(
#     corpus = corpus,      # Document-Term Matrix
#     id2word = id2word,    # Map word IDs to words
#     num_topics = N_TOPICS,      # Number of latent topics to extract
#     random_state = SEED,
#     passes = 100,         # N° of passes through the corpus during training
#     )

In [8]:
# # Visualize with pyLDAvis
# pyLDAvis.enable_notebook()
# visualization = pyLDAvis.gensim_models.prepare(gensim_model, corpus, id2word, mds = "mmds", R = 21)
# visualization

### Custom LDA

#### Hyper-parameter optimisation

In [11]:
# The grid-search result is cached as a pickle so the notebook can be re-run
# without repeating the search. Set RERUN_GRID_SEARCH to True to force a fresh
# search; the search also runs when no cached file exists yet.
LDA_MODEL_PATH = f"../{os.getenv('MODELS_DIR')}/lda.pkl"
RERUN_GRID_SEARCH = False

if RERUN_GRID_SEARCH or not os.path.exists(LDA_MODEL_PATH):
    custom_lda = CustomLDA(token_data["feb"], seed=int(os.getenv('SEED')))
    custom_lda.grid_search_hyperparameters()
    to_pickle(custom_lda, LDA_MODEL_PATH)
else:
    # NOTE: unpickling can execute arbitrary code — only load caches that this
    # project wrote itself.
    custom_lda = from_pickle(LDA_MODEL_PATH)

print(f"{custom_lda.best_num_topics=}, {custom_lda.best_alpha=}, {custom_lda.best_beta=}")
custom_lda.df_grid_results.head()

custom_lda.best_num_topics=50, custom_lda.best_alpha=0.61, custom_lda.best_beta=0.61


Unnamed: 0,Topics,Alpha,Beta,Coherence
0,10,0.01,0.01,0.372782
1,10,0.01,0.31,0.424901
2,10,0.01,0.61,0.471481
3,10,0.01,0.91,0.487
4,10,0.01,symmetric,0.390388


In [16]:
# Rebind `custom_lda` to a newly constructed model built from the February
# tokens; the object bound by the grid-search cell is no longer referenced.
custom_lda = CustomLDA(
    token_data["feb"],
    seed=int(os.getenv("SEED")),
)

#### Best params

##### February

In [17]:
# Fit with explicit values; they match the best_num_topics / best_alpha /
# best_beta printed by the grid-search cell (50, 0.61, 0.61) — presumably
# passed in that order, TODO confirm against CustomLDA.fit.
# When no parameters are provided, the best ones estimated by grid search are used.
custom_lda.fit(50, 0.61, 0.61)
custom_lda.visualize("train")

##### March

In [18]:
# Hand the March tokens to the model under the label "mar", then visualize
# that labelled corpus (presumably against the topics fitted above — confirm
# in CustomLDA).
custom_lda.add_corpus(token_data["mar"], "mar")
custom_lda.visualize("mar")

##### April

In [20]:
# Hand the April tokens to the model under the label "apr", then visualize
# that labelled corpus (presumably against the topics fitted above — confirm
# in CustomLDA).
custom_lda.add_corpus(token_data["apr"], "apr")
custom_lda.visualize("apr")

##### May

In [22]:
# Hand the May tokens to the model under the label "may", then visualize
# that labelled corpus (presumably against the topics fitted above — confirm
# in CustomLDA).
custom_lda.add_corpus(token_data["may"], "may")
custom_lda.visualize("may")

### NMF

In [11]:
# Train gensim's NMF topic model on the bag-of-words corpus.
nmf_model = gensim.models.Nmf(
    corpus=corpus,          # document-term matrix
    id2word=id2word,        # word id -> word mapping
    num_topics=N_TOPICS,    # number of latent topics to extract
    random_state=SEED,
    passes=100,             # passes through the corpus during training
)

# Tabulate the (topic id, keyword string) pairs returned by show_topics().
nmf_topics = nmf_model.show_topics()
pd.DataFrame(nmf_topics, columns=["topic", "keywords"])

Unnamed: 0,topic,keywords
0,17,"0.365*""der"" + 0.015*""nicht"" + 0.013*""beheben"" ..."
1,18,"0.119*""techniker"" + 0.109*""zu"" + 0.066*""kommen..."
2,6,"0.289*""ein"" + 0.040*""techniker"" + 0.026*""haben..."
3,19,"0.314*""mein"" + 0.043*""nicht"" + 0.019*""auf"" + 0..."
4,11,"0.146*""nicht"" + 0.125*""da"" + 0.125*""warum"" + 0..."
5,14,"0.373*""nicht"" + 0.311*""funktionieren"" + 0.022*..."
6,3,"0.364*""festnetz"" + 0.056*""mein"" + 0.027*""mit"" ..."
7,16,"0.332*""gehen"" + 0.314*""nicht"" + 0.015*""telefon..."
8,1,"0.419*""wlan"" + 0.054*""verbindung"" + 0.015*""anz..."
9,7,"0.489*""telefon"" + 0.012*""beispielort"" + 0.012*..."


## Sklearn

In [12]:
# TF-IDF features for the sklearn topic models. STOP_WORDS is spaCy's German
# stop-word collection, converted to a list in the imports cell.
vectorizer = TfidfVectorizer(stop_words=STOP_WORDS)

In [13]:
# Two sklearn decompositions configured alike: N_TOPICS components each and
# the same random_state (SEED).
model_nmf = NMF(n_components=N_TOPICS, random_state=SEED)
model_lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=SEED)

### NMF

In [14]:
# Combine the TF-IDF vectorizer and the sklearn NMF model into one topicwizard
# pipeline, fit it on `docs`, then pass documents and fitted pipeline to
# topicwizard.visualize.
topic_pipeline = make_topic_pipeline(vectorizer, model_nmf)
topic_pipeline.fit(docs)
topicwizard.visualize(docs, model=topic_pipeline)

Preprocessing
Inferring topical content for documents.


### LDA

In [15]:
# Same steps as the NMF cell, with the sklearn LDA model. `topic_pipeline` is
# rebound here, and the same `vectorizer` object is reused and fitted again.
topic_pipeline = make_topic_pipeline(vectorizer, model_lda)
topic_pipeline.fit(docs)
topicwizard.visualize(docs, model=topic_pipeline)

Preprocessing
Inferring topical content for documents.
