In [7]:
import os
from pprint import pprint
from typing import Dict, Any, List, Tuple

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import preprocess_string
import pymongo
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt

# %matplotlib inline

# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

# Cached spaCy pipeline; stays None until spacy_nlp() loads the model on first use.
nlp = None


def split_in_sentences(text: str) -> List[str]:
    """Split ``text`` into sentences with the shared spaCy pipeline.

    Returns one string per sentence reported by spaCy, with leading and
    trailing whitespace removed.
    """
    sentences: List[str] = []
    for sentence in spacy_nlp(text).sents:
        sentences.append(str(sentence).strip())
    return sentences


def remove_stopwords(word: str) -> Tuple[str, List[Any]]:
    """Drop stop words and non-content words from a piece of text.

    Despite its name, ``word`` is the whole text to filter (``tm_test``
    passes complete sentences). The text is cleaned of a few noise
    substrings, parsed with the shared spaCy pipeline, and only tokens that
    are not stop words and whose part of speech is NOUN, ADJ, VERB or ADV
    are kept.

    Args:
        word: The text to filter.

    Returns:
        A 2-tuple ``(text, tokens)``: the surviving tokens joined by single
        spaces, and the surviving spaCy token objects themselves.
    """
    # Replace brackets, slashes and the literal "II" with blanks before
    # tagging. NOTE(review): "II" is presumably a corpus-specific artefact
    # (e.g. a Roman numeral) - confirm it should be removed everywhere.
    for noise in ("(", ")", "/", "II"):
        word = word.replace(noise, " ")

    allowed_postags = ('NOUN', 'ADJ', 'VERB', 'ADV')
    # Use a distinct loop variable so the ``word`` parameter is not shadowed.
    tokens = [
        token
        for token in spacy_nlp(word)
        if not token.is_stop and token.pos_ in allowed_postags
    ]
    return " ".join(str(token) for token in tokens), tokens


def spacy_nlp(x: str):
    """Run the lazily loaded German spaCy pipeline on ``x``.

    On the first call the ``de_core_news_md`` model is loaded, its ``ner``
    and ``attribute_ruler`` components are disabled, a ``sentencizer`` is
    appended, and the result is cached in the module-level ``nlp``. Later
    calls reuse the cached pipeline.

    Args:
        x: The text to process.

    Returns:
        Whatever the pipeline returns for ``x`` (a spaCy ``Doc``).
    """
    global nlp
    if nlp is None:
        # Configure a local object first and publish it only when complete,
        # so an error while adjusting the pipeline cannot leave a
        # half-configured model cached for every later call.
        pipeline = spacy.load("de_core_news_md")
        pipeline.disable_pipe("ner")
        pipeline.disable_pipe("attribute_ruler")
        pipeline.add_pipe('sentencizer')
        nlp = pipeline

    return nlp(x)


  and should_run_async(code)


In [8]:
# def make_bigrams(texts):
#     return [bigram_mod[doc] for doc in texts]

# def make_trigrams(texts):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized texts, keeping only tokens with an allowed POS tag.

    Each element of ``texts`` is a sequence of words; it is joined with
    single spaces, parsed with the shared spaCy pipeline, and reduced to the
    lemmas of the tokens whose ``pos_`` is in ``allowed_postags``.

    Returns a list with one list of lemmas per input text.
    Tag reference: https://spacy.io/api/annotation
    """
    def lemmas_of(words):
        parsed = spacy_nlp(" ".join(words))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [lemmas_of(words) for words in texts]

def tm_test(docs: List[Dict[str, Any]], num_topics: int = 20):
    """Fit an LDA topic model on the ``"text"`` field of ``docs`` and report it.

    Pipeline:
      1. Flatten newlines, split every text into sentences and filter each
         sentence with ``remove_stopwords``.
      2. Tokenize the filtered sentence with gensim's ``preprocess_string``
         (its default filter chain).
      3. Merge frequent word pairs into bigram tokens with ``Phrases``.
      4. Build the dictionary / bag-of-words corpus and train ``LdaModel``.
      5. Print the log-perplexity bound and the topics, then prepare the
         pyLDAvis visualisation.

    Args:
        docs: Documents to model; each must provide a string under the key
            ``"text"``.
        num_topics: Number of LDA topics to fit (default 20, the value that
            was previously hard-coded).

    Returns:
        The prepared pyLDAvis visualisation. Make the call the last
        expression of a notebook cell (or pass the result to a display
        function) to render it.

    Raises:
        ValueError: If no tokens survive preprocessing, so there is nothing
            to model.
    """
    data_words = []
    for doc in docs:
        txt = doc["text"].replace("\n", " ")
        for sentence in split_in_sentences(txt):
            filtered, _ = remove_stopwords(sentence)
            tokens = preprocess_string(filtered)
            if tokens:
                data_words.append(list(tokens))

    if not data_words:
        # Fail early with a clear message instead of an error from deep
        # inside gensim when the corpus turns out to be empty.
        raise ValueError("tm_test: no tokens left after preprocessing; nothing to model")

    # Learn frequent word pairs and merge them into single "a_b" tokens.
    # (The trigram model that was trained here before was never used.)
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    texts = [bigram_mod[words] for words in data_words]

    id2word = corpora.Dictionary(texts)
    corpus = [id2word.doc2bow(text) for text in texts]

    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics,
                                                random_state=100,  # fixed seed for reproducible topics
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)

    # log_perplexity returns the per-word log-likelihood bound, not the
    # perplexity itself: values closer to zero mean a better fit.
    print('\nPerplexity: ', lda_model.log_perplexity(corpus))

    # Optional coherence score (slow), kept for reference:
    # coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
    # coherence_lda = coherence_model_lda.get_coherence()
    # print('\nCoherence Score: ', coherence_lda)

    pprint(lda_model.print_topics())
    pyLDAvis.enable_notebook()
    # A bare ``vis`` expression inside a function displays nothing, so hand
    # the prepared visualisation back to the caller instead.
    return pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)




  and should_run_async(code)


In [9]:
def extractDocs(path: str = "C:\\Data\\test\\topics\\fenster"):
    """Load the sample documents stored under ``path`` and topic-model them.

    Documents are read from the ``samples`` collection of the ``kibardoc``
    MongoDB database, selected by their ``path`` field, and passed to
    ``tm_test``.

    The connection string is taken from the ``MONGO_CONNECTION`` environment
    variable so that no credentials live in the notebook. The credential
    that used to be embedded here has been exposed and should be rotated.

    Args:
        path: Value of the ``path`` field to select. Defaults to the
            "fenster" topic folder; other folders (e.g. "baumfällung",
            "werbung") can be passed instead of editing the function.

    Returns:
        Whatever ``tm_test`` returns for the selected documents.

    Raises:
        RuntimeError: If ``MONGO_CONNECTION`` is not set.
    """
    uri = os.getenv("MONGO_CONNECTION")
    if not uri:
        raise RuntimeError(
            "MONGO_CONNECTION is not set; export the MongoDB connection URI "
            "instead of hard-coding credentials in the notebook"
        )

    # MongoClient is a context manager: the connection is closed as soon as
    # the documents have been fetched, even if the query raises.
    with pymongo.MongoClient(uri) as client:
        samples = client["kibardoc"]["samples"]
        texts = list(samples.find({"path": path}))

    return tm_test(texts)

# Entry point of this cell: fetch the sample documents from MongoDB and run tm_test on them.
extractDocs()

  and should_run_async(code)



Perplexity:  -24.91640243005216
[(0,
  '0.030*"ausgetauscht" + 0.022*"nehmen" + 0.021*"thema" + 0.019*"weg" + '
  '0.016*"substanz" + 0.010*"krefeld" + 0.000*"holzfenst" + 0.000*"finden" + '
  '0.000*"gangbaren" + 0.000*"fassadengestaltung"'),
 (1,
  '0.692*"fenster" + 0.059*"historisch" + 0.025*"erhalten" + 0.013*"lassen" + '
  '0.011*"einbruchschutz" + 0.007*"stand" + 0.006*"wartung" + '
  '0.002*"gleichzeitig" + 0.002*"handwerklich" + 0.000*"verlust"'),
 (2,
  '0.326*"fensterbau" + 0.068*"wert" + 0.056*"fenstern" + 0.025*"kosten" + '
  '0.016*"schäden" + 0.015*"denkmalgerecht" + 0.012*"energetisch" + '
  '0.006*"historischen_fenstern" + 0.003*"erkennen" + 0.000*"rung"'),
 (3,
  '0.009*"beitrag" + 0.007*"hierzu" + 0.002*"interess" + 0.001*"mal" + '
  '0.000*"fest" + 0.000*"beispielsweis" + 0.000*"regen" + 0.000*"widerspruch" '
  '+ 0.000*"direkten" + 0.000*"notwendigkeit"'),
 (4,
  '0.000*"preisbeispielen" + 0.000*"abm" + 0.000*"dekorationsprofil" + '
  '0.000*"standardmäßig" + 0.00