In [1]:
import pandas as pd 
import numpy as np

import os

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
pd.set_option('display.width', 1000)

In [2]:
# Root directory of the downloaded VK data; get_vk_group_posts_with_topics reads
# each group's CSV from a sub-folder of this directory.
FOLDER = './vk_groups'
# The values are the group identifiers passed to get_vk_group_posts_with_topics
# (they form both the sub-folder name and the file-name prefix).
# The keys appear to be human-readable community titles — TODO confirm.
group_names = {'–ú–æ–π –≥–æ—Ä–æ–¥ –ü–µ—Ä–º—å': 'vikiperm', '59.RU': 'news59ru', 'BusinessNews': 'gazetabc'}

In [3]:
def get_vk_group_posts_with_topics(group_name, folder=FOLDER):
    """Load the topic-annotated posts of one VK group from its CSV file.

    The file is looked up at
    ``{folder}/{group_name}/{group_name}_posts_with_topic.csv`` and read with
    its first column as the index.

    Args:
        group_name: Group identifier; used as the sub-folder name and as the
            file-name prefix.
        folder: Root directory that contains one sub-folder per group.

    Returns:
        A ``pandas.DataFrame`` with the file contents, or ``None`` when the
        file does not exist (a message naming the missing file is printed).
    """
    # Build the path once so the existence check and the read cannot diverge.
    path = f'{folder}/{group_name}/{group_name}_posts_with_topic.csv'
    if not os.path.isfile(path):
        # f-string so the real group name is shown; the extension matches the
        # .csv file that is actually looked up.
        print(f'–§–∞–π–ª–∞ {group_name}_posts_with_topic.csv –Ω–µ —Å—É—â–µ—Å—Ç–≤—É–µ—Ç')
        return None
    return pd.read_csv(path, index_col=0)

In [4]:
FOLDER_ANALYTICS = './report'
def save_diagram(fig, fig_name, folder=FOLDER_ANALYTICS):
    """Save a figure as ``{fig_name}.html`` and ``{fig_name}.png`` in ``folder``.

    Args:
        fig: Object exposing ``write_html(path)`` and
            ``write_image(path, width=..., height=...)`` (the notebook passes
            plotly figures).
        fig_name: File name without extension.
        folder: Target directory; created (with missing parents) if absent.

    Returns:
        None. The two files are written as a side effect.
    """
    # Create the directory that was actually requested. Creating the default
    # FOLDER_ANALYTICS here would leave a custom `folder` missing and make the
    # writes below fail.
    os.makedirs(folder, exist_ok=True)

    fig.write_html(f'{folder}/{fig_name}.html')
    # Same export size whether or not the folder already existed; the image
    # scale is left at the library default.
    fig.write_image(f'{folder}/{fig_name}.png', width=1600, height=720)

## Загрузка данных

In [5]:
# Read the posts of every configured group, then stack the per-group frames
# into one frame with a fresh 0..n-1 index.
list_df_posts = [
    get_vk_group_posts_with_topics(name) for name in group_names.values()
]
df_posts = pd.concat(list_df_posts, ignore_index=True)

In [6]:
# Structural check of the combined frame: column names, dtypes and non-null counts.
df_posts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330 entries, 0 to 329
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         330 non-null    int64  
 1   text         330 non-null    object 
 2   topic        330 non-null    object 
 3   topic_proba  330 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 10.4+ KB


Для того чтобы снизить искажение аналитики тем постов, которые предсказала нейросеть, <br>
отфильтруем посты с низкой вероятностью прогноза темы.

In [7]:
# Keep only the posts whose topic probability is strictly above the threshold,
# then renumber the remaining rows from zero.
limit_proba = 0.6
filtr = df_posts['topic_proba'].gt(limit_proba)
df_posts = df_posts[filtr].reset_index(drop=True)

In [8]:
# (rows, columns) left after the probability filter.
df_posts.shape

(215, 4)

In [9]:
# Eyeball ten random rows. No random_state is passed, so the selection is
# expected to differ between runs.
df_posts.sample(n=10)

Unnamed: 0,date,text,topic,topic_proba
152,1684256460,–í [club135161380|–ó–∞–∫–æ–Ω–æ–¥–∞—Ç–µ–ª—å–Ω–æ–º –°–æ–±—Ä–∞–Ω–∏—è –ü–µ—Ä–º...,–≠–∫–æ–Ω–æ–º–∏–∫–∞,0.613511
46,1684296060,–í –ü–µ—Ä–º–∏ –ø—Ä–æ–π–¥—ë—Ç –Ω–æ—á–Ω–æ–π –±–ª–∞–≥–æ—Ç–≤–æ—Ä–∏—Ç–µ–ª—å–Ω—ã–π –∑–∞–±–µ–≥...,–°–ø–æ—Ä—Ç,0.623267
197,1683613500,–í –ü–µ—Ä–º—Å–∫–æ–º –∫—Ä–∞–µ –≤—ã–±–∏—Ä–∞—é—Ç –ø–æ–¥—Ä—è–¥—á–∏–∫–æ–≤ –¥–ª—è —Ä–µ–º–æ–Ω...,–ì–æ—Ä–æ–¥,0.805498
90,1684511415,–í —Ü–µ–Ω—Ç—Ä–µ –ü–µ—Ä–º–∏ –∑–∞–≥–æ—Ä–µ–ª–æ—Å—å –∑–∞–±—Ä–æ—à–µ–Ω–Ω–æ–µ –∑–¥–∞–Ω–∏–µ. ...,–ö—É–ª—å—Ç—É—Ä–∞,0.643033
95,1684504803,"–¢–µ–ø–ª—ã–π –≤–æ–∑–¥—É—Ö –æ–±–¥—É–≤–∞–µ—Ç –≤–æ–ª–æ—Å—ã, –æ—Ç –≤–æ–¥—ã –≤–µ–µ—Ç —Å–≤...",–ì–æ—Ä–æ–¥,0.643358
167,1684054804,–ö –≤–æ–∑–≤–µ–¥–µ–Ω–∏—é —É–Ω–∏–≤–µ—Ä—Å–∞–ª—å–Ω–æ–π —Å–ø–æ—Ä—Ç–∏–≤–Ω–æ–π –∞—Ä–µ–Ω—ã –≤ ...,–ì–æ—Ä–æ–¥,0.630398
67,1684590720,–ó–∞ –ø–æ—Å–ª–µ–¥–Ω–∏–µ —Å—É—Ç–∫–∏ –≤ –ü–µ—Ä–º—Å–∫–æ–º –∫—Ä–∞–µ –∑–∞—Ä–µ–≥–∏—Å—Ç—Ä–∏—Ä...,–ì–æ—Ä–æ–¥,0.716685
91,1684508820,"–£–∂–µ —Å –ø–æ–Ω–µ–¥–µ–ª—å–Ω–∏–∫–∞, 22 –º–∞—è, —Å–ª–µ–¥—É—é—â–∞—è –ø–æ –∫–æ–ª—å—Ü...",–ì–æ—Ä–æ–¥,0.938504
186,1683725220,–í —Ä–∞—Å–ø–∏—Å–∞–Ω–∏–∏ –∞—ç—Ä–æ–ø–æ—Ä—Ç–∞ –ë–æ–ª—å—à–æ–µ –°–∞–≤–∏–Ω–æ –ø–æ—è–≤–∏–ª—Å—è...,–ì–æ—Ä–æ–¥,0.965539
31,1684406760,"–°–æ–ª–Ω–µ—á–Ω–∞—è –Ω–∞–±–µ—Ä–µ–∂–Ω–∞—è, –º–∞–π –≤ –ü–µ—Ä–º–∏ üå∏\n\n–§–æ—Ç–æ [i...",–ì–æ—Ä–æ–¥,0.964992


In [10]:
# Sorting posts by date — step 1: interpret the integer 'date' column as Unix
# timestamps in seconds and keep only the calendar day as a 'YYYY-MM-DD' string.
# NOTE(review): the column is overwritten in place; re-running this cell on the
# already-converted column is not expected to work — verify before re-executing.
df_posts['date'] = pd.to_datetime(df_posts['date'], unit='s').dt.strftime('%Y-%m-%d')

In [11]:
# Parse the 'YYYY-MM-DD' strings into real datetimes (midnight of each day).
# pd.to_datetime replaces astype('datetime64[D]'): day resolution is not a
# supported datetime64 unit in pandas >= 2.0, where that cast raises. The cell
# is also safe to re-run, since an already-datetime column passes through unchanged.
df_posts['date'] = pd.to_datetime(df_posts['date'], format='%Y-%m-%d')

In [12]:
# Newest posts first, with the rows renumbered from zero.
df_posts = (
    df_posts
    .sort_values(by='date', ascending=False)
    .reset_index(drop=True)
)

In [13]:
# Print the first ten rows (the most recent posts after the descending sort)
# with text columns widened to 100 characters; option_context limits the
# display setting to this block.
with pd.option_context('display.max_colwidth', 100):
    print(df_posts.head(10))

        date                                                                                                 text              topic  topic_proba
0 2023-05-21  –î–æ—Å—Ç–∞–≤–∏–º –∑–∞ 77 –º–∏–Ω—É—Ç –∏–ª–∏ —Å–µ—Ç —Ä–æ–ª–ª–æ–≤ - –±–µ—Å–ø–ª–∞—Ç–Ω–æ! üç±\n\n–ö—Ä—É—Ç–∞—è –∞–∫—Ü–∏—è –≤ –¢–æ–º–º–∏ –§–∏—à - –¥–æ—Å—Ç–∞–≤–∏–º –≤–∞—à –∑–∞...    –ù–∞—É–∫–∞ –∏ —Ç–µ—Ö–Ω–∏–∫–∞     0.642851
1 2023-05-21  –î–æ–∫—É–º–µ–Ω—Ç–∞—Ü–∏—é –¥–ª—è –∏–∑—ä—è—Ç–∏—è –≤ –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω—É—é —Å–æ–±—Å—Ç–≤–µ–Ω–Ω–æ—Å—Ç—å –Ω–µ–¥–≤–∏–∂–∏–º–æ—Å—Ç–∏ —Å —Ü–µ–ª—å—é —Å—Ç—Ä–æ–∏—Ç–µ–ª—å—Å—Ç–≤–∞ –≤—Ç–æ—Ä...              –ì–æ—Ä–æ–¥     0.690082
2 2023-05-21  –í–æ–ø—Ä–æ—Å –æ —Ç–æ–º, –∫—É–¥–∞ –ø–æ–µ—Ö–∞—Ç—å –≤ –æ—Ç–ø—É—Å–∫, –≤—Ç–æ—Ä–æ–µ –ª–µ—Ç–æ –ø–æ–¥—Ä—è–¥ –∑–∞–Ω–∏–º–∞–µ—Ç –≥–æ–ª–æ–≤—ã –Ω–∞—à–∏—Ö —Å–æ–æ—Ç–µ—á–µ—Å—Ç–≤–µ–Ω–Ω–∏–∫–æ–≤....          –≠–∫–æ–Ω–æ–º–∏–∫–∞     0.685514
3 2023-05-21  –ú—ã –ø—Ä–æ–¥–æ–ª–∂–∞–µ–º –∑–∞–¥–∞–≤–∞—Ç—å –Ω–µ–ª–æ–≤–∫–∏–µ –≤–æ–ø—Ä–æ—Å—ã –≤—Ä–∞—á–∞–º. –í —ç—Ç–æ–º –≤—ã–ø—É—Å–∫–µ –º—ã —Ä–∞–∑–±–∏—Ä–∞–µ

In [14]:
# Distinct topic labels that remain after the probability filter.
df_posts['topic'].unique()

array(['–ù–∞—É–∫–∞ –∏ —Ç–µ—Ö–Ω–∏–∫–∞', '–ì–æ—Ä–æ–¥', '–≠–∫–æ–Ω–æ–º–∏–∫–∞', '–°–∏–ª–æ–≤—ã–µ —Å—Ç—Ä—É–∫—Ç—É—Ä—ã',
       '–ö—É–ª—å—Ç—É—Ä–∞', '–°–ø–æ—Ä—Ç'], dtype=object)

In [15]:
# Non-null value count of every column per calendar day, i.e. posts per day.
df_posts.groupby(by='date').count()

Unnamed: 0_level_0,text,topic,topic_proba
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-05-07,5,5,5
2023-05-08,7,7,7
2023-05-09,7,7,7
2023-05-10,9,9,9
2023-05-11,4,4,4
2023-05-12,4,4,4
2023-05-13,6,6,6
2023-05-14,5,5,5
2023-05-15,10,10,10
2023-05-16,20,20,20


In [16]:
# The probability column is no longer needed once the filter has been applied.
# errors='ignore' keeps the cell re-runnable: dropping a column that is already
# gone would otherwise raise KeyError.
df_posts = df_posts.drop(columns=['topic_proba'], errors='ignore')

## Формирование отчета

В качестве отчета построим столбчатые диаграммы, которые будут отражать <br>
количество постов в день в течение выбранного промежутка времени.

Построим диаграмму количества постов в день.

In [22]:
# Select the posts that fall inside the reporting interval (both ends inclusive).
start_date = '2023-05-07'
# The string is handed to pandas as-is; presumably it is parsed as the current
# timestamp when compared with the datetime column — confirm.
end_time = 'today'

filtr = df_posts['date'].between(start_date, end_time)
df_posts_in_interval = df_posts[filtr]


In [23]:
# Collapse the interval to one row per date; every other column then holds the
# number of non-null values for that day.
# NOTE(review): the result overwrites its own input, so re-running this cell on
# its own would count the already-aggregated rows — re-run the previous cell first.
df_posts_in_interval = df_posts_in_interval.groupby(by=['date'], as_index=False).count()

In [24]:
# X axis: day/month labels; Y axis: the per-day count stored in the 'topic'
# column by the preceding aggregation.
dates = df_posts_in_interval['date'].dt.strftime('%d/%m')
topics_count = df_posts_in_interval['topic']

daily_bar = go.Bar(x=dates, y=topics_count, marker_color='rgb(26, 118, 255)')
fig = go.Figure()
fig.add_trace(daily_bar)

# Fonts, axis captions and chart title.
layout_options = dict(
    font=dict(family='Courier New, monospace', size=20),
    xaxis_title='–î–∞—Ç–∞',
    yaxis_title='–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø–æ—Å—Ç–æ–≤',
    title=dict(text="–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø–æ—Å—Ç–æ–≤ –≤ –µ–∂–µ—Å—É—Ç–æ—á–Ω–æ–º —Ä–∞–∑—Ä–µ–∑–µ", font=dict(size=26), automargin=True, yref='paper'),
)
fig.update_layout(**layout_options)

# Persist the chart via save_diagram, then render it inline.
save_diagram(fig, '–æ–±—â–µ–µ_–∫–æ–ª–∏—á–µ—Å—Ç–≤–æ_–ø–æ—Å—Ç–æ–≤_–≤_–¥–µ–Ω—å')
fig.show()

Построим диаграмму распределения тем постов в ежесуточном разрезе.

In [25]:
# Select the posts that fall inside the reporting interval.
start_date = '2023-05-07'
end_time = 'today'

filtr = (df_posts['date'] >= start_date) & (df_posts['date'] <= end_time)

# Count posts per (date, topic) and pivot so that each row is a topic, each
# column is a date and each value is the number of posts (NaN where a topic had
# no posts that day).
# Grouping and pivoting use the real datetime values, so the columns come out in
# chronological order; grouping on 'dd/mm' strings would sort them as text and
# mis-order any interval that spans more than one month. Chaining from the
# filtered frame also avoids assigning into a slice of df_posts, which triggers
# SettingWithCopyWarning.
df_topic_slice = (
    df_posts.loc[filtr]
    .groupby(by=['date', 'topic'], as_index=False)
    .count()
    .pivot(index='topic', columns='date', values='text')
)

# Turn the date columns into 'dd/mm' labels only after they have been ordered.
df_topic_slice.columns = pd.DatetimeIndex(df_topic_slice.columns).strftime('%d/%m')
df_topic_slice = df_topic_slice.reset_index()
df_topic_slice.columns.name = None
df_topic_slice

Unnamed: 0,topic,07/05,08/05,09/05,10/05,11/05,12/05,13/05,14/05,15/05,16/05,17/05,18/05,19/05,20/05,21/05
0,–ì–æ—Ä–æ–¥,4.0,5.0,5.0,6.0,3.0,2.0,3.0,3.0,9.0,9.0,11.0,15.0,13.0,11.0,3.0
1,–ö—É–ª—å—Ç—É—Ä–∞,1.0,,2.0,1.0,,2.0,1.0,2.0,,4.0,4.0,5.0,12.0,8.0,3.0
2,–ù–∞—É–∫–∞ –∏ —Ç–µ—Ö–Ω–∏–∫–∞,,,,,,,1.0,,,,,5.0,2.0,5.0,3.0
3,–°–∏–ª–æ–≤—ã–µ —Å—Ç—Ä—É–∫—Ç—É—Ä—ã,,1.0,,,1.0,,,,,2.0,,3.0,5.0,6.0,3.0
4,–°–ø–æ—Ä—Ç,,,,,,,,,,2.0,1.0,,3.0,1.0,
5,–≠–∫–æ–Ω–æ–º–∏–∫–∞,,1.0,,2.0,,,1.0,,1.0,3.0,3.0,5.0,3.0,2.0,3.0


In [26]:
# The first column holds the topic names; the remaining columns are the dates.
dates = df_topic_slice.columns[1:]
topics = df_topic_slice['topic']

# One bar trace per topic, taking that topic's row of per-date counts.
data = [
    go.Bar(name=topic, x=dates, y=df_topic_slice.iloc[row, 1:])
    for row, topic in enumerate(topics)
]

fig = go.Figure(data=data)
# barmode='group' draws the topics of one date side by side.
fig.update_layout(
    barmode='group',
    font=dict(family='Courier New, monospace', size=20),
    xaxis_title='–î–∞—Ç–∞',
    yaxis_title='–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø–æ—Å—Ç–æ–≤',
    legend_title='–¢–µ–º—ã –ø–æ—Å—Ç–æ–≤',
    title=dict(text="–¢–µ–º—ã –ø–æ—Å—Ç–æ–≤ –≤ –µ–∂–µ—Å—É—Ç–æ—á–Ω–æ–º —Ä–∞–∑—Ä–µ–∑–µ", font=dict(size=26), automargin=True, yref='paper'),
)
save_diagram(fig, '—Ç–µ–º—ã_–ø–æ—Å—Ç–æ–≤_–≤_–µ–∂–µ—Å—É—Ç–æ—á–Ω–æ–º_—Ä–∞–∑—Ä–µ–∑–µ_–≤–∞—Ä–∏–∞–Ω—Ç_1')
fig.show()

In [27]:
# The first column holds the topic names; the remaining columns are the dates.
dates = df_topic_slice.columns[1:]
topics = df_topic_slice['topic']

# One bar trace per topic, taking that topic's row of per-date counts.
data = [
    go.Bar(name=topic_name, x=dates, y=df_topic_slice.iloc[position, 1:])
    for position, topic_name in enumerate(topics)
]

fig = go.Figure(data=data)
# barmode='relative' stacks the topic bars of one date on top of each other.
stacked_layout = dict(
    barmode='relative',
    font=dict(family='Courier New, monospace', size=20),
    xaxis_title='–î–∞—Ç–∞',
    yaxis_title='–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø–æ—Å—Ç–æ–≤',
    legend_title='–¢–µ–º—ã –ø–æ—Å—Ç–æ–≤',
    title=dict(text="–¢–µ–º—ã –ø–æ—Å—Ç–æ–≤ –≤ –µ–∂–µ—Å—É—Ç–æ—á–Ω–æ–º —Ä–∞–∑—Ä–µ–∑–µ", font=dict(size=26), automargin=True, yref='paper'),
)
fig.update_layout(**stacked_layout)

save_diagram(fig, '—Ç–µ–º—ã_–ø–æ—Å—Ç–æ–≤_–≤_–µ–∂–µ—Å—É—Ç–æ—á–Ω–æ–º_—Ä–∞–∑—Ä–µ–∑–µ_–≤–∞—Ä–∏–∞–Ω—Ç_2')
fig.show()

В данной диаграмме высота каждого столбца равна общему количеству постов за день. <br>
По цвету выполнено разбиение каждого столбца на темы, исходя из количества постов, которые им соответствуют.