In [26]:
import pandas as pd
import gensim
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import Nmf, LdaModel
import plotly.graph_objs as go
import plotly.io as pio
pio.renderers.default='notebook'
from bertopic import BERTopic
from umap import UMAP
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [27]:
comments = pd.read_csv('comments_clean.csv')

# BERTopic

In [28]:
docs = comments.clean_body.to_list()

In [29]:
umap_model = UMAP(n_neighbors=100, 
                  n_components=10, 
                  metric ='cosine', 
                  random_state=100)

In [30]:
model = BERTopic(language="russian", verbose = True, nr_topics=100)
#model.reduce_topics(docs, nr_topics=100)

In [31]:
topics, probs = model.fit_transform(docs)

Batches:   0%|          | 0/7068 [00:00<?, ?it/s]

2022-12-26 19:28:40,773 - BERTopic - Transformed documents to Embeddings
2022-12-26 19:30:48,189 - BERTopic - Reduced dimensionality
2022-12-26 19:31:03,701 - BERTopic - Clustered reduced embeddings
2022-12-26 19:32:31,487 - BERTopic - Reduced number of topics from 1541 to 101


In [32]:
topic_info = model.get_topic_info()

In [50]:
model.get_topic(2)

[('краска', 0.05429293804646796),
 ('цвет', 0.03557368048768363),
 ('красить', 0.03013561179194114),
 ('слой', 0.027287003461947897),
 ('банка', 0.02121166858311329),
 ('белый', 0.01830594242537827),
 ('покрасить', 0.01763762311075046),
 ('ложиться', 0.013466149834927908),
 ('банк', 0.013057215773057084),
 ('покраска', 0.01186323870923308)]

In [42]:
model.save("berttopic_100")


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



In [47]:
model.reduce_topics(docs, nr_topics=85)

2022-12-26 19:49:08,878 - BERTopic - Reduced number of topics from 91 to 86


<bertopic._bertopic.BERTopic at 0x2afc941f0>

In [48]:
model.visualize_hierarchy()

In [49]:
model.visualize_barchart(top_n_topics=12)

In [9]:
model.visualize_topics(top_n_topics=100)

In [35]:

documents = pd.DataFrame({"Document": docs,
                          "ID": range(len(docs)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v')
coherence = coherence_model.get_coherence()

nmpi_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_npmi')
nmpi = nmpi_model.get_coherence()

u_mass_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='u_mass')
u_mass = u_mass_model.get_coherence()

print("BERTOPIC model")
print ("coherence: {}".format(coherence))
print("nmpi: {}".format(nmpi))
print("mass: {}".format(u_mass))

BERTOPIC model
coherence: 0.652033346626809
nmpi: 0.14749986082419472
mass: -0.4063863378463766


# NMF

In [51]:
X_token = []

for index in range(len(docs)):
    X_token.append(docs[index].split())

dictionary = corpora.Dictionary(X_token)
corpus = [dictionary.doc2bow(text) for text in X_token]

#Model
nmf = Nmf(corpus, num_topics=100, id2word=dictionary)
nmf_topics = nmf.print_topics(num_words=10)

for i,topic in enumerate(nmf_topics):
    print("\nTopic #{}:".format(i))
    print(topic)
    print("-"*70)


Topic #0:
(40, '0.052*"провод" + 0.018*"розетка" + 0.014*"отверстие" + 0.012*"корпус" + 0.011*"светильник" + 0.011*"инструмент" + 0.009*"кабель" + 0.008*"нужный" + 0.007*"подключать" + 0.007*"материал"')
----------------------------------------------------------------------

Topic #1:
(63, '0.081*"приобретать" + 0.068*"окно" + 0.043*"крепление" + 0.020*"пластиковый" + 0.020*"дом" + 0.017*"сетка" + 0.014*"комплект" + 0.008*"штора" + 0.006*"собирать" + 0.006*"сборка"')
----------------------------------------------------------------------

Topic #2:
(65, '0.108*"знать" + 0.054*"панель" + 0.038*"клей" + 0.026*"купить" + 0.017*"обычный" + 0.014*"монтаж" + 0.010*"кабель" + 0.007*"пвх" + 0.006*"плинтус" + 0.006*"общий"')
----------------------------------------------------------------------

Topic #3:
(95, '0.153*"рука" + 0.015*"держать" + 0.012*"упаковка" + 0.010*"инструмент" + 0.010*"лежать" + 0.008*"легко" + 0.008*"материал" + 0.007*"расти" + 0.005*"кривой" + 0.005*"диск"')
-------------

# LDA

In [52]:
lda = LdaModel(corpus, num_topics=100 ,  id2word=dictionary)
lda_topics = lda.print_topics(num_words=10)

for i,topic in enumerate(lda_topics):
    print("\nTopic #{}:".format(i))
    print(topic)
    print("-"*70)


Topic #0:
(76, '0.156*"дизайн" + 0.137*"интерьер" + 0.100*"дорогой" + 0.081*"низкий" + 0.075*"качество" + 0.061*"деталь" + 0.057*"цена" + 0.041*"пять" + 0.031*"бренд" + 0.031*"изначально"')
----------------------------------------------------------------------

Topic #1:
(30, '0.131*"полный" + 0.117*"заказ" + 0.086*"радиатор" + 0.062*"вскрывать" + 0.052*"входить" + 0.048*"освещать" + 0.035*"шуруп" + 0.027*"смотря" + 0.026*"горячий" + 0.026*"разочарование"')
----------------------------------------------------------------------

Topic #2:
(77, '0.224*"упаковка" + 0.133*"ужасный" + 0.094*"претензия" + 0.094*"качество" + 0.077*"друг" + 0.047*"достойный" + 0.040*"компактный" + 0.035*"сверху" + 0.031*"аналог" + 0.029*"иметься"')
----------------------------------------------------------------------

Topic #3:
(3, '0.179*"лампочка" + 0.132*"минута" + 0.069*"уходить" + 0.060*"просвечивать" + 0.055*"холодный" + 0.046*"снова" + 0.039*"расход" + 0.038*"начинать" + 0.024*"тк" + 0.024*"время"')
-

# Quality

In [53]:
#quality fuctions
def coherence_f(model, texts, dictionary):
    coherence = CoherenceModel(model=model, texts=texts,
        dictionary=dictionary, coherence='c_v')
    return coherence.get_coherence()

def nmpi_f(model, texts, dictionary):
    nmpi = CoherenceModel(model=model, texts=texts,
        dictionary=dictionary, coherence='c_npmi')
    return nmpi.get_coherence()

def mass_f(model, texts, dictionary):   
    mass = CoherenceModel(model=model, texts=texts,
        dictionary=dictionary, coherence='u_mass')
    return mass.get_coherence()

In [54]:
print("NMF model")
print ("coherence: {}".format(coherence_f(model=nmf, texts=X_token, dictionary=dictionary)))
print("nmpi: {}".format(nmpi_f(model=nmf, texts=X_token, dictionary=dictionary)))
print("mass: {}".format(mass_f(model=nmf, texts=X_token, dictionary=dictionary)))

NMF model
coherence: 0.5959593812120916
nmpi: 0.04190082875729775
mass: -3.982464334606963


In [55]:
print("LDA model")
print ("coherence: {}".format(coherence_f(model=lda, texts=X_token, dictionary=dictionary)))
print("nmpi: {}".format(nmpi_f(model=lda, texts=X_token, dictionary=dictionary)))
print("mass: {}".format(mass_f(model=lda, texts=X_token, dictionary=dictionary)))


LDA model
coherence: 0.37572598764985365
nmpi: -0.10185296511322636
mass: -6.562132526283091
