<a href="https://colab.research.google.com/github/alya-atm/Reviews-summarization-for-product-description/blob/main/topic_modeling_BigArtm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install bigartm10

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import artm 
import pandas as pd
import glob
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from os.path import join
from collections import Counter
from gensim.models import  CoherenceModel
from gensim import corpora

In [3]:
# Colab only: mount Google Drive at /content/drive/ so the project folder used
# below is reachable. force_remount=True is passed, presumably to refresh a
# stale mount when the cell is re-run — confirm against the Colab docs.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True) 

Mounted at /content/drive/


In [4]:
# Project folder on the mounted Drive; the Vowpal Wabbit file ('data_comments')
# and the BigARTM batches folder ('batches') are created under it further down.
path = '/content/drive/My Drive/review_summarization/'

In [5]:
# Load the cleaned reviews from the project folder. The location is derived from
# `path` instead of repeating the absolute Drive path, so changing `path` moves
# every input and output of the notebook together.
comments = pd.read_csv(join(path, 'comments_clean.csv'))

In [20]:
# Column of cleaned review texts; it is split into tokens later on to build the
# gensim dictionary and the reference texts for the coherence metrics.
X = comments['clean_body']

In [85]:
# Number of topics requested from the ARTM model, plus one label per topic
# ('topic_0' ... 'topic_74') that is passed to the model as its topic names.
number_topics = 75
topic_names = [f'topic_{topic_id}' for topic_id in range(number_topics)]

Prepare data for model, batch vectorization

In [62]:
# Cleaned review texts as a plain Python list, one entry per document; each
# entry is split on whitespace in the next cell to count its tokens.
texts = comments['clean_body'].tolist()

In [63]:
# For every document, count its whitespace-separated tokens and render each
# distinct token as a "token:count" string (in order of first occurrence).
# These strings form one line per document of the Vowpal Wabbit file written below.
token_frequencies = [
    [f'{token}:{count}' for token, count in Counter(text.split()).items()]
    for text in texts
]


In [65]:
# Write the collection as one line per document: "doc<N> token:count token:count ...".
# The encoding is set explicitly because the tokens are Cyrillic; without it the
# platform default encoding is used, which is not guaranteed to be UTF-8.
# The loop runs over token_frequencies itself, so the document count always
# matches the data actually written.
with open(join(path, 'data_comments'), 'w', encoding='utf-8') as file:
    for n, doc_tokens in enumerate(token_frequencies):
        file.write(f'doc{n} {" ".join(doc_tokens)}\n')

In [67]:
# Build a BigARTM BatchVectorizer from the Vowpal Wabbit file written above;
# 'batches' under the project folder is given as the target folder for the batches.
# batch_vectorizer.dictionary is reused below for the perplexity score and for
# initialising the model.
batch_vectorizer = artm.BatchVectorizer(data_path=join(path, 'data_comments'),
                                            data_format='vowpal_wabbit',
                                            target_folder=join(path, 'batches'))

Model

In [87]:
# ARTM topic model with `number_topics` named topics. It starts with a perplexity
# score (computed against the collection dictionary) and one Theta smooth/sparse
# regularizer with tau=-0.55. cache_theta=True is passed — presumably so Theta is
# kept for the Theta-based scores added later; confirm against the BigARTM docs.
# NOTE(review): the next cell adds another SmoothSparseThetaRegularizer
# ('sparse_theta_reg', tau=-1), so two Theta regularizers end up registered on
# this model — confirm that stacking them is intended.
model_artm = artm.ARTM( num_topics=number_topics, 
                  topic_names=topic_names, 
              
                  scores=[artm.PerplexityScore(name='PerplexityScore',dictionary=batch_vectorizer.dictionary)],
                  regularizers=[artm.SmoothSparseThetaRegularizer(name='SparseTheta',tau=-0.55)],
                  cache_theta = True)

In [88]:
# Extra quality scores tracked while the model is fitted.
model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore', num_tokens=10))
model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
# Fixed: the score registered as 'SparsityPhiScore' used to be built with
# artm.SparsityThetaScore, so Theta sparsity was tracked twice and Phi sparsity
# was never measured. It now uses the Phi sparsity score class.
model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
model_artm.scores.add(artm.TopicKernelScore(name='TopicKernelScore',
                                            probability_mass_threshold=0.5))

# Additional regularizers: Phi and Theta smooth/sparse terms with negative tau,
# and a Phi decorrelator with tau=10.
# NOTE(review): the model was already constructed with a SmoothSparseThetaRegularizer
# ('SparseTheta', tau=-0.55); 'sparse_theta_reg' is registered in addition to it —
# confirm that having both is intended.
model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_reg', tau=-0.25))
model_artm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_reg', tau=-1))
model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorellator_phi_reg', tau=10))

In [91]:
# Initialise the model from the collection dictionary, set num_document_passes
# to 1, then train offline with 25 passes over the whole collection.
# NOTE(review): no seed is set anywhere in the notebook, so repeated runs may
# give different topics — confirm whether reproducibility matters here.
model_artm.initialize(dictionary=batch_vectorizer.dictionary)
model_artm.num_document_passes = 1
model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=25)

In [90]:
# Show the latest top-token list recorded by 'TopTokensScore' for every topic,
# each followed by a 70-character dashed separator.
for topic_index, topic_name in enumerate(model_artm.topic_names):
    topic_tokens = model_artm.score_tracker['TopTokensScore'].last_tokens[topic_name]
    print("\nTopic #{}:".format(topic_index))
    print(topic_tokens)
    print("-"*70)


Topic #0:
['затирка', 'шов', 'цвет', 'плитка', 'очень', 'получаться', 'затирать', 'белый', 'лезвие', 'весь']
----------------------------------------------------------------------

Topic #1:
['плита', 'утеплитель', 'материал', 'стена', 'очень', 'дом', 'вата', 'весь', 'мм', 'кнауф']
----------------------------------------------------------------------

Topic #2:
['запах', 'средство', 'весь', 'перчатка', 'очень', 'ожидание', 'вонять', 'превосходить', 'пахнуть', 'купить']
----------------------------------------------------------------------

Topic #3:
['колесо', 'нож', 'рубанок', 'тачка', 'купить', 'весь', 'очень', 'кг', 'ограничитель', 'строгать']
----------------------------------------------------------------------

Topic #4:
['очень', 'купить', 'stanley', 'справа', 'слева', 'поливать', 'вместительный', 'цена', 'салфетка', 'леруа']
----------------------------------------------------------------------

Topic #5:
['стеллаж', 'полка', 'собирать', 'уголок', 'стойка', 'сборка', 'комплек

In [95]:
# Top tokens of every topic, in model topic order: the list-of-token-lists
# that is passed as `topics` to the coherence functions below.
topics2 = [
    model_artm.score_tracker['TopTokensScore'].last_tokens[name]
    for name in model_artm.topic_names
]

In [102]:

def _coherence_score(topics, texts, dictionary, measure):
    """Build a gensim CoherenceModel for ``measure`` and return its coherence value."""
    model = CoherenceModel(topics=topics, texts=texts,
                           dictionary=dictionary, coherence=measure)
    return model.get_coherence()


def coherence(topics, texts, dictionary):
    """Return the 'c_v' coherence of ``topics``.

    Args:
        topics: list of topics, each a list of top tokens.
        texts: tokenised reference documents (list of token lists).
        dictionary: gensim ``Dictionary`` built from ``texts``.

    Returns:
        The value returned by ``CoherenceModel.get_coherence()`` for 'c_v'.
    """
    return _coherence_score(topics, texts, dictionary, 'c_v')


def nmpi(topics, texts, dictionary):
    """Return the 'c_npmi' coherence of ``topics``.

    The function name is kept as ``nmpi`` for existing callers even though the
    measure requested from gensim is spelled 'c_npmi'.

    Args:
        topics: list of topics, each a list of top tokens.
        texts: tokenised reference documents (list of token lists).
        dictionary: gensim ``Dictionary`` built from ``texts``.

    Returns:
        The value returned by ``CoherenceModel.get_coherence()`` for 'c_npmi'.
    """
    return _coherence_score(topics, texts, dictionary, 'c_npmi')


def mass(topics, texts, dictionary):
    """Return the 'u_mass' coherence of ``topics``.

    Args:
        topics: list of topics, each a list of top tokens.
        texts: tokenised reference documents (list of token lists).
        dictionary: gensim ``Dictionary`` built from ``texts``.

    Returns:
        The value returned by ``CoherenceModel.get_coherence()`` for 'u_mass'.
    """
    return _coherence_score(topics, texts, dictionary, 'u_mass')

In [103]:
# Split every cleaned review into whitespace-separated tokens. The Series values
# are iterated directly: the previous X[index] lookup over range(len(X)) is
# label-based and raises KeyError as soon as the index is not the default
# 0..n-1 range (for example after rows have been filtered out).
X_token = [text.split() for text in X]

# gensim token <-> id mapping built from the tokenised reviews; it is passed to
# the coherence functions together with X_token.
dictionary = corpora.Dictionary(X_token)



In [105]:
# Report the three gensim coherence measures for the BigARTM topics, each
# computed from the topics' top tokens against the tokenised reviews.
print("BigArtm model")
print(f"coherence: {coherence(topics2, X_token, dictionary)}")
print(f"nmpi: {nmpi(topics2, X_token, dictionary)}")
print(f"mass: {mass(topics2, X_token, dictionary)}")

BigArtm model
coherence: 0.5939930908593856
nmpi: 0.057380060177706284
mass: -3.4436995630965868
