In [105]:
# %%script true

import re
import pandas as pd
import numpy as np
import plotly.express as px # pip install plotly
import matplotlib.pyplot as plt # pip install matplotlib
import plotly.io as pio
pio.renderers.default = 'browser'
%matplotlib inline
import textwrap

pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

df = pd.read_csv("../dataset/_compiled/Clustered.csv")
df_topics = pd.read_csv("../dataset/_compiled/Keywords.csv")

In [106]:
df['LDA Topic'] = df['LDA Topic'].astype(str)
df['Union'] = df['Union'].astype(str)
df['Date'] = pd.to_datetime(df['Epoch'], unit='s').dt.to_period('M')
df = df.sort_values(by='Timestamp')

### Nutshell Plot

In [79]:
%%script true

# Plot submissions as colored points
design_book=[
              '#ffb000', 
              '#785ef0', 
              '#dc267f', 
              '#fe6100', 
              '#648fff', 
              '#bcbd21', 
              '#00cc96', 
              '#a65628', 
              '#dede00',
              '#984ea3'
            ]

df["LDA Topic"] = df["LDA Topic"].astype(str)
df.sort_values('LDA Topic', key=lambda x: pd.to_numeric(x, errors='coerce'), inplace=True)

fig = px.scatter(df, x='X', y='Y', color='LDA Topic', 
                 title='Topic Clustering using LDA and t-SNE',
                 hover_name='Hovertext',
                 size='Engagements',
                 color_discrete_sequence=design_book,
                 hover_data={'X':False, 'Y':False, 'LDA Topic':False, 'Hovertext':False, 'Breakdown':True})

fig.add_annotation(
    x=0,
    y=-0.2*(1/10)-0.15,
    text="Top 10 Most Frequent Keywords per Topic",
    showarrow=False,
    xref='paper',
    yref='paper',
    align='left',
    font=dict(color='white', family='Arial', size=16, weight='normal')
  )

for i, keyword in enumerate(df_topics['Keywords']):
  fig.add_annotation(
    x=0,
    y=-0.2*(i/5)-0.25,
    text="Topic %d: %s"%(i+1, keyword.replace(' ', ', ')),
    showarrow=False,
    xref='paper',
    yref='paper',
    align='left',
    font=dict(color=fig.data[i].marker['color'], family='Arial', size=16, weight='normal')
  )

fig.update_traces(mode='markers', 
                  opacity=1,
                  marker=dict(
                    sizemode='area',
                    sizeref=2.*max(df['Engagements'])/(110**2), 
                    line_color='white',
                    line_width=1),
                  )

fig.update_layout(height=1680,
                  width=1680,
                  xaxis=dict(
                    range=[-60, 60],
                    showgrid=True,
                    gridwidth=2,
                    title='',
                    color='gray',
                  ),
                  yaxis=dict(
                    range=[-75, 75],
                    showgrid=True,
                    gridwidth=2,
                    title='',
                    color='gray',
                  ),  
                  margin=dict(b=360),
                  title=dict(font=dict(color='white', family='Roboto', size=24, weight='bold')),
                  showlegend=False, 
                  paper_bgcolor='#1b181c',
                  plot_bgcolor='#1b181c',
                )

fig.update_xaxes(showline=False, 
                 showticklabels=False,
                 zeroline=False,
                 linewidth=2, 
                 linecolor='#232024', 
                 gridcolor='#232024', 
                 zerolinecolor='#232024',
                 title_font_color="white")

fig.update_yaxes(showline=False,
                 showticklabels=False,
                 zeroline=False,
                 linewidth=2, 
                 linecolor='#232024', 
                 gridcolor='#232024', 
                 zerolinecolor='#232024',
                 title_font_color="#FFFFFF")
                
# !pip install nbformat
# restart kernel
pio.show(fig)

### R1 Plot

In [114]:
# LDA Labeled

total_count = df.groupby('Date').nunique()
total_count = total_count['Hovertext'].cumsum()

cumulative_count = df.groupby(['Date','LDA Topic']).nunique()
cumulative_count = cumulative_count.pivot_table('Hovertext', 'Date', 'LDA Topic').fillna(0).cumsum()
rel_freq = cumulative_count.div(total_count, axis=0)

cumulative_count = cumulative_count.stack(0).reset_index()
cumulative_count.columns = ['Date', 'LDA Topic', 'Frequency']
cumulative_count['Date'] = [x.strftime('%b %Y') for x in cumulative_count['Date']]

rel_freq = rel_freq.stack(0).reset_index()
rel_freq.columns = ['Date', 'LDA Topic', 'Relative Frequency']
rel_freq['Date'] = [x.strftime('%b %Y') for x in rel_freq['Date']]

monthly_count = df.groupby(['Date','LDA Topic']).size()
monthly_count = monthly_count.reset_index()
monthly_count.columns = ['Date', 'LDA Topic', 'Frequency']
monthly_count['Date'] = [x.strftime('%b %Y') for x in monthly_count['Date']]

print(rel_freq)

         Date LDA Topic  Relative Frequency
0    Feb 2022         1            0.000000
1    Feb 2022         2            1.000000
2    Feb 2022         3            0.000000
3    Feb 2022         4            0.000000
4    Feb 2022         5            0.000000
5    Mar 2022         1            0.000000
6    Mar 2022         2            0.500000
7    Mar 2022         3            0.250000
8    Mar 2022         4            0.250000
9    Mar 2022         5            0.000000
10   Apr 2022         1            0.000000
11   Apr 2022         2            0.444444
12   Apr 2022         3            0.222222
13   Apr 2022         4            0.333333
14   Apr 2022         5            0.000000
15   May 2022         1            0.058824
16   May 2022         2            0.411765
17   May 2022         3            0.176471
18   May 2022         4            0.235294
19   May 2022         5            0.117647
20   Jun 2022         1            0.049180
21   Jun 2022         2         

In [89]:
# Manually Labeled
df['Union'] = df['Union'].astype(int)

cumulative_count_manual = df.groupby(['Date','Union']).nunique()
cumulative_count_manual = cumulative_count_manual.pivot_table('Hovertext', 'Date', 'Union').fillna(0).cumsum()
cumulative_count_manual = cumulative_count_manual.stack(0).reset_index()
cumulative_count_manual.columns = ['Date', 'Union', 'Frequency']
cumulative_count_manual['Date'] = [x.strftime('%b %Y') for x in cumulative_count_manual['Date']]

rel_freq_manual = cumulative_count_manual.div(total_count, axis=0)
rel_freq_manual = rel_freq_manual.stack(0).reset_index()
rel_freq_manual.columns = ['Date', 'Union', 'Relative Frequency']
rel_freq_manual['Date'] = [x.strftime('%b %Y') for x in rel_freq_manual['Date']]

monthly_count_manual = df.groupby(['Date','Union']).size()
monthly_count_manual = monthly_count_manual.reset_index()
monthly_count_manual.columns = ['Date', 'Union', 'Frequency']
monthly_count_manual['Date'] = [x.strftime('%b %Y') for x in monthly_count_manual['Date']]

         Date  Union  Frequency
0    Feb 2022      1        0.0
1    Feb 2022      2        1.0
2    Feb 2022      3        0.0
3    Feb 2022      4        0.0
4    Feb 2022      5        0.0
5    Mar 2022      1        0.0
6    Mar 2022      2        2.0
7    Mar 2022      3        1.0
8    Mar 2022      4        1.0
9    Mar 2022      5        0.0
10   Apr 2022      1        0.0
11   Apr 2022      2        4.0
12   Apr 2022      3        2.0
13   Apr 2022      4        3.0
14   Apr 2022      5        0.0
15   May 2022      1        2.0
16   May 2022      2       14.0
17   May 2022      3        6.0
18   May 2022      4        8.0
19   May 2022      5        4.0
20   Jun 2022      1        3.0
21   Jun 2022      2       24.0
22   Jun 2022      3       10.0
23   Jun 2022      4       16.0
24   Jun 2022      5        8.0
25   Jul 2022      1        4.0
26   Jul 2022      2       30.0
27   Jul 2022      3       13.0
28   Jul 2022      4       20.0
29   Jul 2022      5       11.0
30   Aug

In [91]:
fig = px.line(rel_freq, x='Date', y='Relative Engagements', color='Union',
              title='Relative Frequency vs Time',
              color_discrete_sequence=['#00cc96', '#984ea3'],
              labels={'Relative_Frequency': 'Relative Frequency', 'Date': 'Date'}
             )

fig.update_xaxes(nticks=10)

fig.update_layout(height=1080,
                  width=1920,
                  xaxis=dict(
                    gridwidth=2,
                    title='',
                    color='gray',
                  ),
                  yaxis=dict(
                    gridwidth=2,
                    title='',
                    color='gray',
                  ),
                  title=dict(font=dict(color='white', family='Roboto', size=24)),
                  legend=dict(title="Topic", font=dict(color='white', family='Roboto', size=16)),
                  paper_bgcolor='#1a181c',
                  plot_bgcolor='#1a181c',
                 )

fig.update_xaxes(showline=True,
                 linewidth=2,
                 linecolor='#232024',
                 gridcolor='#232024',
                 zerolinecolor='#232024',
                 title_font_color="white")

fig.update_yaxes(showline=True,
                 linewidth=2,
                 linecolor='#232024',
                 gridcolor='#232024',
                 zerolinecolor='#232024',
                 title_font_color="#FFFFFF")

# Show the plot
fig.show()

### R2 Plot

In [112]:
# Manually Labeled

total_engage = df.groupby(['Date'])['Engagements'].sum()
# total_engage['Date'] = [x.strftime('%b %Y') for x in total_engage['Date']]

cumulative_engage_manual = pd.DataFrame(df.groupby(['Date','Union'])['Engagements'].sum())
cumulative_engage_manual = cumulative_engage_manual.pivot_table('Engagements', 'Date', 'Union').fillna(0).cumsum()
rel_engage_manual = cumulative_engage_manual.div(total_engage, axis=0)

cumulative_engage_manual = cumulative_engage_manual.stack(0).reset_index()
cumulative_engage_manual.columns = ['Date', 'Union', 'Engagements']
cumulative_engage_manual['Date'] = [x.strftime('%b %Y') for x in cumulative_engage_manual['Date']]


monthly_engage_manual = df.groupby(['Date','Union'])['Engagements'].sum()
monthly_engage_manual = monthly_engage_manual.reset_index()
monthly_engage_manual.columns = ['Date', 'Union', 'Engagements']
monthly_engage_manual['Date'] = [x.strftime('%b %Y') for x in monthly_engage_manual['Date']]

rel_engage_manual = rel_engage_manual.stack(0).reset_index()
rel_engage_manual.columns = ['Date', 'Union', 'Relative Engagements']
rel_engage_manual['Date'] = [x.strftime('%b %Y') for x in rel_engage_manual['Date']]
print(rel_engage_manual)

        Date Union  Relative Engagements
0   Feb 2022     0              1.000000
1   Feb 2022     1              0.000000
2   Mar 2022     0              0.324503
3   Mar 2022     1              0.749290
4   Apr 2022     0              1.093903
5   Apr 2022     1              0.216827
6   May 2022     0              2.265537
7   May 2022     1              0.360739
8   Jun 2022     0              3.290846
9   Jun 2022     1              0.377089
10  Jul 2022     0              7.013292
11  Jul 2022     1              0.689046
12  Aug 2022     0              3.576146
13  Aug 2022     1              0.302475
14  Sep 2022     0              5.476720
15  Sep 2022     1              0.378647
16  Oct 2022     0              6.837200
17  Oct 2022     1              0.502141
18  Nov 2022     0              5.741359
19  Nov 2022     1              0.423495
20  Dec 2022     0              6.594595
21  Dec 2022     1              0.414933
22  Jan 2023     0             15.365834
23  Jan 2023    

In [115]:
# LDA Labeled
cumulative_engage = df.groupby(['Date','LDA Topic']).nunique()
cumulative_engage = cumulative_engage.pivot_table('Hovertext', 'Date', 'LDA Topic').fillna(0).cumsum()
rel_engage = cumulative_engage.div(total_engage, axis=0)

cumulative_engage = cumulative_engage.stack(0).reset_index()
cumulative_engage.columns = ['Date', 'LDA Topic', 'Frequency']
cumulative_engage['Date'] = [x.strftime('%b %Y') for x in cumulative_engage['Date']]

rel_engage = rel_engage.stack(0).reset_index()
rel_engage.columns = ['Date', 'LDA Topic', 'Relative Engagements']
rel_engage['Date'] = [x.strftime('%b %Y') for x in rel_engage['Date']]

monthly_engage = df.groupby(['Date','LDA Topic']).size()
monthly_engage = monthly_engage.reset_index()
monthly_engage.columns = ['Date', 'LDA Topic', 'Frequency']
monthly_engage['Date'] = [x.strftime('%b %Y') for x in monthly_engage['Date']]

In [117]:
fig = px.line(rel_engage, x='Date', y='Relative Engagements', color='LDA Topic',
              title='Relative Frequency vs Time',
              color_discrete_sequence=['#00cc96', '#984ea3'],
              labels={'Relative_Frequency': 'Relative Frequency', 'Date': 'Date'}
             )

fig.update_xaxes(nticks=10)

fig.update_layout(height=1080*0.5,
                  width=1920*0.5,
                  xaxis=dict(
                    gridwidth=2,
                    title='',
                    color='gray',
                  ),
                  yaxis=dict(
                    gridwidth=2,
                    title='',
                    color='gray',
                  ),
                  title=dict(font=dict(color='white', family='Roboto', size=24)),
                  legend=dict(title="Topic", font=dict(color='white', family='Roboto', size=16)),
                  paper_bgcolor='#1a181c',
                  plot_bgcolor='#1a181c',
                 )

fig.update_xaxes(showline=True,
                 linewidth=2,
                 linecolor='#232024',
                 gridcolor='#232024',
                 zerolinecolor='#232024',
                 title_font_color="white")

fig.update_yaxes(showline=True,
                 linewidth=2,
                 linecolor='#232024',
                 gridcolor='#232024',
                 zerolinecolor='#232024',
                 title_font_color="#FFFFFF")

# Show the plot
fig.show()