### Import dependencies and set configuration

In [312]:
import re
import pandas as pd 
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import spacy    
import spacy_transformers
nlp = spacy.load('en_core_web_trf')
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn import decomposition
import numpy as np 
from sklearn import metrics
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import plotly.express as px # pip install plotly
import matplotlib.pyplot as plt # pip install matplotlib
%matplotlib inline
import plotly.io as pio
pio.renderers.default = 'browser'
import textwrap

# Lift pandas' display truncation so every column, every row and the full
# content of each cell is rendered (None = no limit; 1000 / 1000 / 199 are
# the capped alternatives noted by the original author).
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

# Read the dataset that the rest of the notebook works on.
df = pd.read_csv("../dataset/_compiled/Labelled.csv")

### Tokenization, TF-IDF Vectorization, LDA Topic Modelling

In [313]:
def remove_punctuation(cell):
    """Return `cell` with every character removed that is neither a regex
    word character (letters, digits, underscore) nor whitespace."""
    not_word_or_space = re.compile(r"[^\w\s]")
    return not_word_or_space.sub("", cell)

# Lazily-built stop-word set shared by all my_tokenizer calls. Re-running this
# cell resets it to None, so edits to stop_words.txt are picked up again.
_STOP_WORDS_CACHE = None


def _load_stop_words():
    """Build (once) and return the combined, punctuation-stripped stop-word set.

    Sources: NLTK's English list, spaCy's default stop words and the
    space-separated entries of the local ``stop_words.txt`` file.
    """
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        # `with` guarantees the file handle is closed after reading.
        with open("stop_words.txt", "r") as stop_file:
            # NOTE(review): split(" ") only splits on single spaces, so entries
            # separated by newlines stay glued together -- confirm file format.
            custom_words = stop_file.read().split(" ")
        _STOP_WORDS_CACHE = frozenset(
            remove_punctuation(word)
            for word in (*stopwords.words('english'),
                         *nlp.Defaults.stop_words,
                         *custom_words)
        )
    return _STOP_WORDS_CACHE


def my_tokenizer(text):
    """Split `text` into 1-gram tokens, dropping short tokens and stop words.

    Parameters
    ----------
    text : str
        Document to tokenize (passed to NLTK's ``word_tokenize``).

    Returns
    -------
    list of str
        Tokens longer than two characters that are not in the stop-word set.

    Notes
    -----
    The stop-word set used to be rebuilt -- including re-reading
    ``stop_words.txt`` without closing it -- for every single token. It is now
    built once and cached, which yields the same tokens at a fraction of the cost.
    """
    stop_words = _load_stop_words()
    # keep 1-gram tokens longer than two characters that are not stop words
    filtered_tokens = [token for token in word_tokenize(text)
                       if len(token) > 2 and token not in stop_words]
    # lemmatization is currently disabled:
    # lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    return filtered_tokens

In [314]:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html

# --- TF-IDF document-term matrix ---------------------------------------------
# The custom tokenizer replaces the default regex tokenization, hence
# token_pattern=None; norm=None leaves the TF-IDF weights un-normalised.
tf_vectorizer = TfidfVectorizer(
    tokenizer=my_tokenizer,
    token_pattern=None,
    min_df=3,
    max_df=0.7,
    max_features=15000,
    use_idf=True,
    norm=None,
)
tf_vectors = tf_vectorizer.fit_transform(df["Translated"])

# --- LDA topic model -----------------------------------------------------------
n_topics = 5
lda = decomposition.LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    learning_offset=45,
    max_iter=25,
    n_jobs=1,
    random_state=420,
)
W = lda.fit_transform(tf_vectors)  # one row of topic weights per document
H = lda.components_                # one row of term weights per topic

# --- Top 15 keywords of each topic ---------------------------------------------
num_words = 15
vocab = np.array(tf_vectorizer.get_feature_names_out())


def top_words(t):
    """Return the `num_words` vocabulary terms with the largest weights in `t`,
    highest weight first."""
    heaviest_first = np.argsort(t)[::-1][:num_words]
    return [vocab[term_idx] for term_idx in heaviest_first]


topic_words = [top_words(term_weights) for term_weights in H]
topics = [' '.join(words) for words in topic_words]

df_topics = pd.DataFrame(topics, columns=['Keywords'])
df_topics['Topic ID'] = range(1, len(topics) + 1)
df_topics

Unnamed: 0,Keywords,Topic ID
0,leave company health pay sick salary absent money medical contract time mental vacation hospital resign,1
1,job company interview human offer resource manager time resignation applied resign hiring salary application process,2
2,people team company good boss time job management coworkers bad companies post business experience toxic,3
3,labor pay department company employees employment employer clearance meeting coe overtime human file resource need,4
4,job hours time salary working company home shift office overtime increase pay schedule experience good,5


In [315]:
# Assign a dominant topic to each submission
topicid = ["Topic " + str(topic_no) for topic_no in range(1, lda.n_components + 1)]
postid = ["Submission " + str(post_no) for post_no in range(1, len(df["Translated"]) + 1)]

# Document-topic weights, rounded to two decimals, one labelled row per submission
df_topics_lda = pd.DataFrame(np.round(W, 2), columns=topicid, index=postid)

# 1-based position of the largest (rounded) weight in each row
significanttopic = df_topics_lda.values.argmax(axis=1) + 1
df_topics_lda['dominant_topic'] = significanttopic


def _format_breakdown(row):
    """Render the row's non-zero topic weights as 'Topic k: w' lines,
    largest weight first."""
    ranked_topics = sorted(topicid, key=lambda name: row[name], reverse=True)
    return '\n'.join(f'{name}: {row[name]}' for name in ranked_topics if row[name] > 0)


df_topics_lda['breakdown'] = df_topics_lda.apply(_format_breakdown, axis=1)
df_topics_lda.head(5)

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,dominant_topic,breakdown
Submission 1,0.0,0.0,0.35,0.65,0.0,4,Topic 4: 0.65\nTopic 3: 0.35
Submission 2,0.0,0.71,0.2,0.06,0.03,2,Topic 2: 0.71\nTopic 3: 0.2\nTopic 4: 0.06\nTopic 5: 0.03
Submission 3,0.99,0.0,0.0,0.0,0.0,1,Topic 1: 0.99
Submission 4,0.74,0.0,0.0,0.0,0.26,1,Topic 1: 0.74\nTopic 5: 0.26
Submission 5,0.32,0.0,0.0,0.15,0.52,5,Topic 5: 0.52\nTopic 1: 0.32\nTopic 4: 0.15


### t-SNE Clustering and Dimensionality Reduction

In [316]:
# https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
# Reduce the per-submission topic weights (first n_topics columns) to 2-D with t-SNE
topic_weight_columns = df_topics_lda.iloc[:, :n_topics]
tsne = TSNE(n_components=2, perplexity=150, random_state=690)
tsne_result = tsne.fit_transform(topic_weight_columns)

# Create a new dataframe with t-SNE coordinates and cluster labels
def split_text(text, max_length):
  """Wrap `text` to lines of at most `max_length` characters (words longer
  than that are kept whole) and join the lines with '<br>'."""
  wrapper = textwrap.TextWrapper(width=max_length, break_long_words=False)
  wrapped_lines = wrapper.wrap(text)
  return "<br>".join(wrapped_lines)

# Assemble the plotting frame: t-SNE coordinates plus per-submission metadata.
# reset_index() swaps the "Submission N" labels for a positional index.
lda_positional = df_topics_lda.reset_index()

df_topics_cluster = pd.DataFrame({
    'X': tsne_result[:, 0],
    'Y': tsne_result[:, 1],
    'Submission': df["Translated"],
    'Engagement': df["Engagements"],
    'Timestamp': df['Epoch'],
    'Cluster': lda_positional['dominant_topic'].astype(str),  # topics via LDA
    # 'Cluster': cluster_labels,                              # clusters via K-means
    'Breakdown': lda_positional['breakdown'],
})

# Both text columns are shown in the Plotly hover box below, so line breaks
# are expressed as <br>.
df_topics_cluster['Submission'] = df_topics_cluster['Submission'].apply(split_text, args=(40,))
df_topics_cluster['Breakdown'] = df_topics_cluster['Breakdown'].str.replace('\n','<br>')

# The ten highest-engagement submissions of every cluster
df_top_10 = (
    df_topics_cluster
    .sort_values('Engagement', ascending=False)
    .groupby('Cluster')
    .head(10)
)

### Nutshell Plot

In [317]:
# Order the rows by numeric cluster id before plotting the submissions as
# coloured points (presumably so the traces are created in topic order).
df_topics_cluster.sort_values(
    by='Cluster',
    key=lambda labels: pd.to_numeric(labels, errors='coerce'),
    inplace=True,
)

# Colour palettes (hex strings)
CB_color_cycle = [
    '#377eb8',
    '#ff7f00',
    '#4daf4a',
    '#f781bf',
    '#a65628',
    '#984ea3',
    '#999999',
    '#785ef0',
    '#dede00',
]

Design_Book = [
    '#ffb000', '#785ef0', '#dc267f', '#fe6100', '#57c4ff',
    '#00cc96', '#bcbd21', '#a65628', '#dede00', '#984ea3',
]

# Pin one palette colour per topic id and hand the mapping to Plotly Express.
# The keyword lines below reuse the same mapping instead of reading
# fig.data[i] by position, so markers and keyword text stay in agreement even
# when a topic has no submissions (and therefore no trace).
topic_colors = {
    str(topic_no): Design_Book[(topic_no - 1) % len(Design_Book)]
    for topic_no in range(1, len(df_topics['Keywords']) + 1)
}

fig = px.scatter(df_topics_cluster, x='X', y='Y', color='Cluster',
                 title='Topic Clustering using LDA and t-SNE',
                 hover_name='Submission',
                 size='Engagement',
                 color_discrete_map=topic_colors,
                 color_discrete_sequence=Design_Book,  # fallback for unmapped values
                 hover_data={'X':False, 'Y':False, 'Cluster':False, 'Submission':False, 'Breakdown':True})

# Header of the keyword list drawn below the plot (paper coordinates, so
# negative y places it in the bottom margin). The keyword count is derived
# from df_topics rather than hard-coded, so the header matches what is listed.
keywords_per_topic = max((len(keyword.split()) for keyword in df_topics['Keywords']), default=0)

fig.add_annotation(
    x=0,
    y=-0.2*(1/10)-0.15,
    text="Top %d Most Frequent Keywords per Topic" % keywords_per_topic,
    showarrow=False,
    xref='paper',
    yref='paper',
    align='left',
    font=dict(color='white', family='Arial', size=16, weight='normal')
)

# One line of keywords per topic, drawn in that topic's marker colour
for i, keyword in enumerate(df_topics['Keywords']):
    fig.add_annotation(
        x=0,
        y=-0.2*(i/5)-0.25,
        text="Topic %d: %s"%(i+1, keyword.replace(' ', ', ')),
        showarrow=False,
        xref='paper',
        yref='paper',
        align='left',
        font=dict(color=topic_colors[str(i+1)], family='Arial', size=16, weight='normal')
    )

SIZE_MULTIPLIER = 1

BACKGROUND_COLOR = '#1a181c'
GRID_COLOR = '#232024'

# Bubble sizing: marker area is proportional to the size value; sizeref follows
# the 2 * max(size) / (max marker px) ** 2 pattern with 125 * SIZE_MULTIPLIER px.
bubble_marker = dict(
    sizemode='area',
    sizeref=2.*max(df['Engagements'])/((125*(SIZE_MULTIPLIER))**2),
    line_color=BACKGROUND_COLOR,
    line_width=2,
)
fig.update_traces(mode='markers', opacity=1, marker=bubble_marker)

# Canvas: dark background, untitled gray axes, no legend, and a tall bottom
# margin that leaves room for the keyword annotations.
fig.update_layout(
    height=1080*SIZE_MULTIPLIER,
    width=1480*SIZE_MULTIPLIER,
    xaxis=dict(gridwidth=2, title='', color='gray'),
    yaxis=dict(gridwidth=2, title='', color='gray'),
    margin=dict(b=360*SIZE_MULTIPLIER),
    title=dict(font=dict(color='white', family='Roboto', size=24, weight='bold')),
    showlegend=False,
    paper_bgcolor=BACKGROUND_COLOR,
    plot_bgcolor=BACKGROUND_COLOR,
)

# Axis lines, grid and zero line share one colour on both axes
axis_lines = dict(
    showline=True,
    linewidth=2,
    linecolor=GRID_COLOR,
    gridcolor=GRID_COLOR,
    zerolinecolor=GRID_COLOR,
)
fig.update_xaxes(title_font_color="white", **axis_lines)
fig.update_yaxes(title_font_color="#FFFFFF", **axis_lines)

# Original author's note: `!pip install nbformat`, then restart the kernel
pio.show(fig)

### RQ 1

In [318]:
# df_topics_cluster['Date'] = pd.to_datetime(df_topics_cluster['Timestamp'], unit='s').dt.to_period('M')
# df_topics_cluster = df_topics_cluster.sort_values(by='Timestamp')

# total_count = df_topics_cluster.groupby('Date').nunique()
# total_count = total_count['Submission'].cumsum()
# cumulative_count = df_topics_cluster.groupby(['Date','Cluster']).nunique()
# cumulative_count = cumulative_count.pivot_table('Submission', 'Date', 'Cluster').fillna(0).cumsum()

# rel_freq = cumulative_count.div(total_count, axis=0)
# rel_freq = rel_freq.stack(0).reset_index()
# rel_freq.columns = ['Date', 'Cluster', 'Relative Frequency']
# rel_freq['Date'] = [x.strftime('%b %Y') for x in rel_freq['Date']]

# monthly_count = df_topics_cluster.groupby(['Date','Cluster']).size()
# monthly_count = monthly_count.reset_index()
# monthly_count.columns = ['Date', 'Cluster', 'Frequency']
# monthly_count['Date'] = [x.strftime('%b %Y') for x in monthly_count['Date']]

# cumulative_count = cumulative_count.stack(0).reset_index()
# cumulative_count.columns = ['Date', 'Cluster', 'Frequency']
# cumulative_count['Date'] = [x.strftime('%b %Y') for x in cumulative_count['Date']]

In [319]:
# fig = px.area(monthly_count, x='Date', y='Frequency', color='Cluster',
#               title='Relative Frequency vs Time',
#               color_discrete_sequence=Design_Book,
#               labels={'Relative_Frequency': 'Relative Frequency', 'Date': 'Date'}
#             )

# fig.update_xaxes(nticks=7)

# fig.update_layout(height=1080,
#                   width=2160,
#                   xaxis=dict(
#                     gridwidth=2,
#                     title='',
#                     color='gray',
#                   ),
#                   yaxis=dict(
#                     gridwidth=2,
#                     title='',
#                     color='gray',
#                   ),
#                   title=dict(font=dict(color='white', family='Roboto', size=24, weight='bold')),
#                   legend=dict(title="Topic", font=dict(color='white', family='Roboto', size=16, weight='normal')),
#                   paper_bgcolor='#1a181c',
#                   plot_bgcolor='#1a181c',
#                 )

# fig.update_xaxes(showline=True, 
#                  linewidth=2, 
#                  linecolor='#232024', 
#                  gridcolor='#232024', 
#                  zerolinecolor='#232024',
#                  title_font_color="white")

# fig.update_yaxes(showline=True,
#                  linewidth=2, 
#                  linecolor='#232024', 
#                  gridcolor='#232024', 
#                  zerolinecolor='#232024',
#                  title_font_color="#FFFFFF")

# # Show the plot
# # fig.show()

### RQ 2