In [29]:
from pprint import pprint

import gensim.corpora as corpora
import nltk
import pandas as pd
import plotly.graph_objects as go
import pyLDAvis
import pyLDAvis.gensim_models
from gensim.models import CoherenceModel, LdaMulticore
from tqdm.notebook import tqdm

In [41]:
# Load the pre-built DataFrames from pickle files (paths are relative to the
# working directory the notebook is run from).
# NOTE(review): unpickling can execute arbitrary code, so these files must come
# from a trusted source.
# NOTE(review): `articles` is not referenced by any cell visible in this notebook
# section — confirm it is still needed.
articles = pd.read_pickle('dataframes/pro_articles.pkl')
# `comments` is read by later cells through its 'tokens' and 'sentiment' columns.
comments = pd.read_pickle('dataframes/analyzed_comments.pkl')

In [3]:
# Number of top n-grams kept for the charts below.
n = 20

# Stream n-grams straight into FreqDist instead of first materialising every
# bigram/trigram of the whole corpus in an intermediate list.
bigram_freq = nltk.FreqDist(
    bigram for tokens in comments['tokens'] for bigram in nltk.bigrams(tokens)
)
trigram_freq = nltk.FreqDist(
    trigram for tokens in comments['tokens'] for trigram in nltk.trigrams(tokens)
)

most_common_bigrams = bigram_freq.most_common(n)
most_common_trigrams = trigram_freq.most_common(n)

# Split the (ngram, count) pairs into parallel label / frequency lists.
# Comprehensions are used rather than zip(*pairs) so that an empty corpus yields
# two empty lists instead of raising on unpacking.
# The trailing tabs are part of the label text (presumably padding between the
# tick label and the axis — verify against the rendered chart).
bigrams = [' - '.join(gram) + '\t\t' for gram, _ in most_common_bigrams]
bigrams_frequencies = [count for _, count in most_common_bigrams]

trigrams = [' - '.join(gram) + '\t\t' for gram, _ in most_common_trigrams]
trigrams_frequencies = [count for _, count in most_common_trigrams]

In [39]:
# Horizontal bar chart of the most frequent bigrams; layout and grid styling are
# applied through chained update calls, each of which returns the figure.
fig_bigrams = (
    go.Figure(
        data=go.Bar(
            x=bigrams_frequencies,
            y=bigrams,
            orientation='h',
            marker_color='cyan',
        )
    )
    .update_layout(
        title_text=f'Топ-{n} наиболее частых биграмм',
        xaxis_title='Частота',
        yaxis_title='Биграммы',
        template='plotly_dark',
        plot_bgcolor='rgba(255,255,255,1)',
        paper_bgcolor='rgba(240,240,240,1)',
        font={'family': "Arial, sans-serif", 'size': 11, 'color': "black"},
    )
    .update_xaxes(showgrid=True, gridwidth=1, gridcolor='DimGray')
    .update_yaxes(showgrid=True, gridwidth=1, gridcolor='DimGray')
)

fig_bigrams.show()

In [37]:
# Horizontal bar chart of the most frequent trigrams; layout and grid styling are
# applied through chained update calls, each of which returns the figure.
fig_trigrams = (
    go.Figure(
        data=go.Bar(
            x=trigrams_frequencies,
            y=trigrams,
            orientation='h',
            marker_color='cyan',
        )
    )
    .update_layout(
        title_text=f'Топ-{n} наиболее частых триграмм',
        xaxis_title='Частота',
        yaxis_title='Триграммы',
        template='plotly_dark',
        plot_bgcolor='rgba(255,255,255,1)',
        paper_bgcolor='rgba(240,240,240,1)',
        font={'family': "Arial, sans-serif", 'size': 11, 'color': "black"},
    )
    .update_xaxes(showgrid=True, gridwidth=1, gridcolor='DimGray')
    .update_yaxes(showgrid=True, gridwidth=1, gridcolor='DimGray')
)

fig_trigrams.show()

In [6]:
# Display names indexed by the value stored in the 'sentiment' column.
# NOTE(review): the numeric encoding of 'sentiment' is not visible here — the
# tuple order assumes it lines up with these labels; confirm upstream.
labels = ('Нейтральные', 'Позитивные', 'Негативные')
sentiment_names = comments['sentiment'].apply(lambda code: labels[code])
sentiment_counts = sentiment_names.value_counts()

In [36]:
# Label of the largest sentiment group, looked up once instead of on every
# iteration of the `pull` comprehension below.
dominant_sentiment = sentiment_counts.idxmax()

# Pie chart of sentiment shares; the slice of the largest group is pulled out.
fig = go.Figure(
    go.Pie(
        labels=sentiment_counts.index,
        values=sentiment_counts.values,
        pull=[0.1 if label == dominant_sentiment else 0 for label in sentiment_counts.index]
    )
)

fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(
    title_text='Распределение настроений в комментариях',
    plot_bgcolor='rgba(255,255,255,1)',
    paper_bgcolor='rgba(240,240,240,1)',
    font=dict(
        family="Arial, sans-serif",
        size=14,
        color="black"
    ),
    template='plotly_dark',
)

fig.show()

In [8]:
# Token lists, one entry per comment; the cell displays how many there are.
comments_arr = comments['tokens']
len(comments_arr)

18431

In [9]:
# Build the gensim inputs: plain list of token lists, token<->id dictionary,
# and the bag-of-words representation of every document.
documents = comments_arr.tolist()
id2word = corpora.Dictionary(documents=documents)
corpus = list(map(id2word.doc2bow, documents))

In [10]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3, random_state=42):
    """
    Train one LDA model per candidate number of topics and score each with c_v coherence.

    The candidate topic counts are ``range(start, limit, step)``.

    Parameters:
    ----------
    dictionary : Gensim dictionary (passed to the models as ``id2word``)
    corpus : Gensim corpus
    texts : List of input texts (used by the coherence model)
    limit : Exclusive upper bound on the number of topics
    start : First number of topics to try
    step : Increment between consecutive numbers of topics
    random_state : Seed forwarded to every LDA model so repeated sweeps are
        comparable; pass None to train without a fixed seed. Multicore training
        may still show small run-to-run differences.
    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    model_list = []
    coherence_values = []
    for num_topics in tqdm(range(start, limit, step), desc="Calculating Coherence Values"):
        model = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=random_state,
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [11]:
# Sweep topic counts 2, 8, ..., 44 (limit=50 is exclusive) and keep both the
# trained models and their coherence scores.
model_list, coherence_values = compute_coherence_values(
    id2word,
    corpus,
    documents,
    limit=50,
    start=2,
    step=6,
)

Calculating Coherence Values:   0%|          | 0/8 [00:00<?, ?it/s]

In [13]:
# Read the topic counts back from the trained models rather than repeating the
# sweep's start/limit/step literals, so the x-axis cannot drift out of sync
# with `model_list` / `coherence_values` if the sweep is changed.
num_topics = [model.num_topics for model in model_list]

In [21]:
# Line chart of coherence score against the number of topics.
fig = go.Figure(
    data=go.Scatter(
        x=num_topics,
        y=coherence_values,
        mode='lines+markers',
        name='Coherence Value',
    )
).update_layout(
    title='Зависимость Coherence Value от количества тем',
    xaxis_title='Количество тем',
    yaxis_title='Coherence Value',
    plot_bgcolor='rgba(255,255,255,1)',
    paper_bgcolor='rgba(240,240,240,1)',
    font={'family': "Arial, sans-serif", 'size': 14, 'color': "black"},
)

fig.show()

In [22]:
# Print each candidate topic count next to its coherence score (4 decimals).
for topic_count, score in zip(num_topics, coherence_values):
    print("Num Topics =", topic_count, " has Coherence Value of", round(score, 4))

Num Topics = 2  has Coherence Value of 0.544
Num Topics = 8  has Coherence Value of 0.5187
Num Topics = 14  has Coherence Value of 0.4878
Num Topics = 20  has Coherence Value of 0.4885
Num Topics = 26  has Coherence Value of 0.4991
Num Topics = 32  has Coherence Value of 0.4922
Num Topics = 38  has Coherence Value of 0.4881
Num Topics = 44  has Coherence Value of 0.4946


In [25]:
# Index 1 is the second model of the sweep (8 topics with start=2, step=6).
# NOTE(review): this is a manual choice, not the arg-max of the coherence scores.
optimal_model = model_list[1]
model_topics = optimal_model.show_topics(formatted=False)
top_words_per_topic = optimal_model.print_topics(num_words=10)
pprint(top_words_per_topic)

[(0,
  '0.009*"статья" + 0.008*"код" + 0.005*"проект" + 0.004*"проблема" + '
  '0.004*"приложение" + 0.004*"пример" + 0.004*"написать" + '
  '0.004*"использовать" + 0.003*"разработка" + 0.003*"автор"'),
 (1,
  '0.006*"статья" + 0.006*"код" + 0.004*"сайт" + 0.004*"например" + 0.003*"1" '
  '+ 0.003*"проблема" + 0.003*"использовать" + 0.003*"простой" + '
  '0.003*"пример" + 0.003*"разработчик"'),
 (2,
  '0.004*"http" + 0.004*"статья" + 0.004*"сайт" + 0.004*"комментарий" + '
  '0.004*"использовать" + 0.003*"код" + 0.003*"com" + 0.003*"проблема" + '
  '0.003*"решение" + 0.003*"любой"'),
 (3,
  '0.008*"статья" + 0.006*"использовать" + 0.004*"0" + 0.004*"решение" + '
  '0.003*"проблема" + 0.003*"язык" + 0.003*"например" + 0.003*"1" + '
  '0.003*"пример" + 0.003*"приложение"'),
 (4,
  '0.006*"использовать" + 0.006*"сайт" + 0.004*"проблема" + 0.004*"код" + '
  '0.004*"статья" + 0.004*"задача" + 0.003*"например" + 0.003*"проект" + '
  '0.003*"тема" + 0.003*"приложение"'),
 (5,
  '0.005*"тип" + 

In [35]:
# Prepare the interactive pyLDAvis view for the chosen model and write it to a
# standalone HTML file next to the notebook.
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim_models.prepare(
    topic_model=optimal_model,
    corpus=corpus,
    dictionary=id2word,
)
pyLDAvis.save_html(LDAvis_prepared, 'ldavis.html')