### Tokenizing + Vectorization

In [1]:
import re

# pip install pandas
import pandas as pd 

# pip install nltk
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Only referenced from a commented-out line inside my_tokenizer, so it is
# currently unused at runtime.
lemmatizer = WordNetLemmatizer()
import spacy    
# Never referenced by name in this notebook; presumably imported for the side
# effect of registering the transformer components that the model loaded
# below depends on — TODO confirm.
import spacy_transformers
# The only use of this pipeline visible in the notebook is
# nlp.Defaults.stop_words (the stop-word list consumed by my_tokenizer).
nlp = spacy.load('en_core_web_trf')

# pip install scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn import decomposition

# pip install numpy
import numpy as np 

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
# Lift pandas' display limits (columns, rows, cell width) so frames are
# rendered without truncation.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Source table for the whole notebook; later cells read its "Translated",
# "Engagements" and "Epoch" columns.
df = pd.read_csv("../dataset/_compiled/Compiled.csv")

In [3]:
def remove_punctuation(cell):
    """Return ``cell`` with every character that is neither a word character
    (letters, digits, underscore) nor whitespace removed."""
    non_word_or_space = re.compile(r"[^\w\s]")
    return non_word_or_space.sub("", cell)

# Stop-word set used by my_tokenizer: the NLTK English list, spaCy's default
# list and a few hand-picked filler words, each passed through
# remove_punctuation (so e.g. "you're" is stored as "youre").
# Built once here instead of once per token inside the tokenizer.
_STOP_WORDS = frozenset(
    remove_punctuation(word)
    for word in (
        *stopwords.words('english'),
        *nlp.Defaults.stop_words,
        "im", "were", "youre", "thats", "theres", "ive", "still",
        'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
        'nine', 'ten', 'per', 'like', 'said', 'even', 'want', 'really',
    )
)


def my_tokenizer(text):
    """Split ``text`` into 1-gram tokens for the TF-IDF vectorizer.

    Parameters
    ----------
    text : str
        Document text to tokenize.

    Returns
    -------
    list of str
        Tokens from ``word_tokenize`` that are longer than two characters
        and are not in ``_STOP_WORDS``, in their original order.
    """
    # Lemmatization is intentionally disabled; to re-enable it, map
    # lemmatizer.lemmatize over the returned tokens.
    return [
        token
        for token in word_tokenize(text)
        if len(token) > 2 and token not in _STOP_WORDS
    ]

In [4]:
# Build a TF-IDF feature matrix from the "Translated" column.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# (in min_df / max_df, "df" stands for document frequency)

tf_vectorizer = TfidfVectorizer(
    tokenizer=my_tokenizer,
    token_pattern=None,   # a custom tokenizer is supplied instead of a regex pattern
    min_df=4,
    max_df=0.7,
    max_features=15000,
    use_idf=True,
    norm=None,
)
tf_vectors = tf_vectorizer.fit_transform(df["Translated"])

In [5]:
# Fit an LDA topic model on the TF-IDF matrix.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html

n_topics = 7
lda = decomposition.LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    learning_offset=10,
    max_iter=30,
    n_jobs=1,
    random_state=42,
)
# W: one row per document, one column per topic (used below to pick each
# submission's dominant topic).
W = lda.fit_transform(tf_vectors)
# H: one row per topic, one column per vocabulary term (used below to list
# each topic's top keywords).
H = lda.components_

In [6]:
# List the highest-weighted vocabulary terms of every topic
num_words = 15
vocab = np.array(tf_vectorizer.get_feature_names_out())


def top_words(t):
    """Return the ``num_words`` vocabulary terms with the largest weights in
    the topic row ``t``, heaviest first."""
    heaviest_first = np.argsort(t)[::-1][:num_words]
    return [vocab[i] for i in heaviest_first]


topic_words = [top_words(topic_row) for topic_row in H]
topics = [' '.join(words) for words in topic_words]

# One row per topic: its space-separated keywords plus a 1-based topic id.
df_topics = pd.DataFrame(topics, columns=['Keywords'])
df_topics['Topic ID'] = range(1, len(topics) + 1)
df_topics

Unnamed: 0,Keywords,Topic ID
0,leave resignation contract days pay company sick letter vacation overtime day absent month resign year,1
1,business outsourcing process work people company balance corporate companies life job time graduate years christmas,2
2,interview job offer company position applied human resource role hiring application experience current salary asked,3
3,work team people management know boss hard job good think system company managers sales feel,4
4,human resource department employment pay resigned friend boss told labor company clearance coe final previous,5
5,work salary hours time office pay increase home day job working minimum money training health,6
6,work company know manager people time red job colleague task post tasks boss toxic use,7


In [7]:
# Assign a dominant topic to each submission
topicid = ["Topic " + str(i + 1) for i in range(lda.n_components)]
tweetid = ["Submission " + str(i + 1) for i in range(len(df["Translated"]))]

# Display copy of the document-topic weights, rounded to two decimals.
df_topics_lda = pd.DataFrame(np.round(W, 2), columns=topicid, index=tweetid)

# Pick the dominant topic from the *unrounded* weights: after rounding, two
# nearly equal topics can tie (e.g. 0.334 and 0.331 both become 0.33) and
# argmax would then return the lower column index regardless of which topic
# actually weighs more.
significanttopic = np.argmax(W, axis=1) + 1


def _topic_breakdown(row):
    """Format one submission's non-zero (rounded) topic weights as
    ``"Topic k: w"`` lines joined by newlines, heaviest topic first."""
    ranked = sorted(topicid, key=lambda col: row[col], reverse=True)
    return '\n'.join(f'{col}: {row[col]}' for col in ranked if row[col] > 0)


df_topics_lda['dominant_topic'] = significanttopic
# Only the topic columns are passed in, so 'dominant_topic' never needs to be
# filtered out of the breakdown.
df_topics_lda['breakdown'] = df_topics_lda[topicid].apply(_topic_breakdown, axis=1)
df_topics_lda.head(5)

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,dominant_topic,breakdown
Submission 1,0.0,0.32,0.0,0.0,0.67,0.0,0.0,5,Topic 5: 0.67\nTopic 2: 0.32
Submission 2,0.0,0.0,0.58,0.0,0.42,0.0,0.0,3,Topic 3: 0.58\nTopic 5: 0.42
Submission 3,0.99,0.0,0.0,0.0,0.0,0.0,0.0,1,Topic 1: 0.99
Submission 4,0.08,0.61,0.24,0.0,0.0,0.07,0.0,2,Topic 2: 0.61\nTopic 3: 0.24\nTopic 1: 0.08\nTopic 6: 0.07
Submission 5,0.39,0.0,0.0,0.5,0.1,0.0,0.0,4,Topic 4: 0.5\nTopic 1: 0.39\nTopic 5: 0.1


In [8]:
# Visualize topics
from sklearn import metrics
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import plotly.express as px # pip install plotly
import matplotlib.pyplot as plt # pip install matplotlib
%matplotlib inline
import plotly.io as pio
pio.renderers.default = 'browser'

# Project each submission's topic weights down to two dimensions with t-SNE.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
topic_weights = df_topics_lda.iloc[:, :n_topics]  # the first n_topics columns are the per-topic weights
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(topic_weights)

# Alternative grouping via K-means (currently disabled).
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
# kmeans = KMeans(n_clusters=n_topics, n_init=10, random_state=42)
# cluster_labels = kmeans.fit_predict(df_topics_lda.iloc[:,:n_topics])

In [9]:
# Create a new dataframe with t-SNE coordinates and cluster labels
import textwrap

def split_text(text, max_length):
    """Wrap ``text`` to at most ``max_length`` characters per line without
    splitting words, and join the resulting lines with ``<br>`` tags."""
    wrapper = textwrap.TextWrapper(width=max_length, break_long_words=False)
    return "<br>".join(wrapper.wrap(text))

# One row per submission: t-SNE coordinates, hover text, engagement,
# timestamp, LDA dominant topic (as a string label) and topic breakdown.
# (To colour by K-means instead, use `cluster_labels` for 'Cluster'.)
df_topics_cluster = pd.DataFrame({
    'X': tsne_result[:, 0],
    'Y': tsne_result[:, 1],
    'Submission': df["Translated"],
    'Engagement': df["Engagements"],
    'Timestamp': df['Epoch'],
    'Cluster': df_topics_lda['dominant_topic'].reset_index(drop=True).astype(str),
    'Breakdown': df_topics_lda['breakdown'].reset_index(drop=True),
}).assign(
    # Wrap long submissions to 40 characters per line for the hover label.
    Submission=lambda frame: frame['Submission'].apply(lambda text: split_text(text, 40)),
    # Turn newlines into <br> line-break tags for the hover label.
    Breakdown=lambda frame: frame['Breakdown'].str.replace('\n', '<br>'),
)

In [10]:
# Per cluster, keep the ten rows with the highest engagement.
df_top_10 = (
    df_topics_cluster
    .sort_values(by='Engagement', ascending=False)
    .groupby('Cluster')
    .head(10)
)

In [11]:
# Plot submissions as colored points, sized by engagement, with each topic's
# keywords listed underneath the chart.

# Scales the figure size, bottom margin and marker size together.
SIZE_MULTIPLIER = 1

# Order rows by numeric topic id (the 'Cluster' labels are strings, so a plain
# sort would be lexicographic). Reassigned rather than sorted in place.
df_topics_cluster = df_topics_cluster.sort_values(
    'Cluster', key=lambda x: pd.to_numeric(x, errors='coerce'))

# Alternative palette (not used by the figure below).
CB_color_cycle = ['#377eb8', '#ff7f00', '#4daf4a',
                  '#f781bf', '#a65628', '#984ea3',
                  '#999999', '#785ef0', '#dede00']

# Palette applied to the clusters.
Design_Book = [
    '#ffb000',
    '#785ef0',
    '#dc267f',
    '#fe6100',
    '#00cc96',
    '#57c4ff',
    '#bcbd21',
    '#a65628',
    '#dede00',
    '#984ea3',
]

fig = px.scatter(df_topics_cluster, x='X', y='Y', color='Cluster',
                 title='Topic Clustering using LDA and t-SNE',
                 hover_name='Submission',
                 size='Engagement',
                 color_discrete_sequence=Design_Book,
                 hover_data={'X': False, 'Y': False, 'Cluster': False, 'Submission': False, 'Breakdown': True})

# Styling shared by every annotation placed below the plot area
# (positions are in paper coordinates, so negative y is under the axes).
annotation_style = dict(
    showarrow=False,
    xref='paper',
    yref='paper',
    align='left',
    font=dict(color='white', family='Arial', size=16, weight='normal'),
)

# Heading for the keyword list. The count comes from num_words so it always
# matches the number of keywords actually listed per topic.
fig.add_annotation(
    x=0,
    y=-0.2*(1/10)-0.15,
    text="Top %d Most Frequent Keywords per Topic" % num_words,
    **annotation_style,
)

# One line of keywords per topic, stacked downwards.
for i, keyword in enumerate(df_topics['Keywords']):
    fig.add_annotation(
        x=0,
        y=-0.2*(i/5)-0.25,
        text="Topic %d: %s" % (i+1, keyword.replace(' ', ', ')),
        **annotation_style,
    )

fig.update_traces(mode='markers',
                  opacity=1,
                  marker=dict(
                      sizemode='area',
                      # Series.max() skips missing values, unlike the builtin max().
                      sizeref=2.*df['Engagements'].max()/((125*(SIZE_MULTIPLIER))**2),
                      line_color='#1a181c',
                      line_width=2),
                  )

# Identical layout settings for both axes.
axis_layout = dict(gridwidth=2, title='', color='gray')

fig.update_layout(height=1080*SIZE_MULTIPLIER,
                  width=1480*SIZE_MULTIPLIER,
                  xaxis=dict(axis_layout),
                  yaxis=dict(axis_layout),
                  margin=dict(b=360*SIZE_MULTIPLIER),  # room for the keyword annotations
                  title=dict(font=dict(color='white', family='Roboto', size=24, weight='bold')),
                  showlegend=False,
                  paper_bgcolor='#1a181c',
                  plot_bgcolor='#1a181c',
                  )

# Identical line/grid styling for both axes.
axis_lines = dict(showline=True,
                  linewidth=2,
                  linecolor='#232024',
                  gridcolor='#232024',
                  zerolinecolor='#232024',
                  title_font_color="white")
fig.update_xaxes(**axis_lines)
fig.update_yaxes(**axis_lines)


# !pip install nbformat
# restart kernel
pio.show(fig)

In [12]:
# # Plot submissions as colored points
# df_top_10.sort_values('Cluster', key=lambda x: pd.to_numeric(x, errors='coerce'), inplace=True)
# CB_color_cycle = ['#377eb8', '#ff7f00', '#4daf4a',
#                   '#f781bf', '#a65628', '#984ea3',
#                   '#999999', '#785ef0', '#dede00']

# Design_Book=[
#               '#ffb000', 
#               '#785ef0', 
#               '#dc267f', 
#               '#fe6100', 
#               '#00cc96', 
#               '#648fff', 
#               '#bcbd21', 
#               '#a65628', 
#               '#dede00',
#               '#984ea3',
#             ]

# fig = px.scatter(df_top_10, x='X', y='Y', color='Cluster', 
#                  title='Topic Clustering using LDA and t-SNE',
#                  hover_name='Submission',
#                  size='Engagement',
#                  color_discrete_sequence=Design_Book,
#                  hover_data={'X':False, 'Y':False, 'Cluster':False, 'Submission':False, 'Breakdown':True})

# fig.add_annotation(
#     x=0,
#     y=-0.2*(1/10)-0.15,
#     text="Top 10 Most Frequent Keywords per Topic",
#     showarrow=False,
#     xref='paper',
#     yref='paper',
#     align='left',
#     font=dict(color='white', family='Arial', size=16, weight='normal')
#   )

# for i, keyword in enumerate(df_topics['Keywords']):
#   fig.add_annotation(
#     x=0,
#     y=-0.2*(i/5)-0.25,
#     text="Topic %d: %s"%(i+1, keyword.replace(' ', ', ')),
#     showarrow=False,
#     xref='paper',
#     yref='paper',
#     align='left',
#     font=dict(color=fig.data[i].marker['color'], family='Arial', size=16, weight='normal')
#   )

# SIZE_MULTIPLIER = 1

# fig.update_traces(mode='markers', 
#                   opacity=1,
#                   marker=dict(
#                     sizemode='area',
#                     sizeref=2.*max(df['Engagements'])/((125*(SIZE_MULTIPLIER))**2), 
#                     line_color='#1a181c',
#                     line_width=2),
#                   )

# fig.update_layout(height=1080*SIZE_MULTIPLIER,
#                   width=1680*SIZE_MULTIPLIER,
#                   xaxis=dict(
#                     gridwidth=2,
#                     title='',
#                     color='gray',
#                   ),
#                   yaxis=dict(
#                     gridwidth=2,
#                     title='',
#                     color='gray',
#                   ),
#                   margin=dict(b=360*SIZE_MULTIPLIER, r=100*SIZE_MULTIPLIER),
#                   title=dict(font=dict(color='white', family='Roboto', size=24, weight='bold')),
#                   legend=dict(title="Topic", font=dict(color='white', family='Roboto', size=16, weight='normal')),
#                   paper_bgcolor='#1a181c',
#                   plot_bgcolor='#1a181c',
#                 )

# fig.update_xaxes(showline=True, 
#                  linewidth=2, 
#                  linecolor='#232024', 
#                  gridcolor='#232024', 
#                  zerolinecolor='#232024',
#                  title_font_color="white")

# fig.update_yaxes(showline=True,
#                  linewidth=2, 
#                  linecolor='#232024', 
#                  gridcolor='#232024', 
#                  zerolinecolor='#232024',
#                  title_font_color="#FFFFFF")
                

# # !pip install nbformat
# # restart kernel
# # fig.show()

In [13]:
# df_topics_cluster['Date'] = pd.to_datetime(df_topics_cluster['Timestamp'], unit='s').dt.to_period('M')
# df_topics_cluster = df_topics_cluster.sort_values(by='Timestamp')

# # Cumulative count
# df_topics_cluster['CumulativeCount'] = df_topics_cluster.groupby('Cluster').cumcount() + 1

# # Total cumulative count by YearMonth
# df_topics_cluster['CumulativeTotal'] = df_topics_cluster.groupby('Date').cumcount() + 1

# # Cumulative frequency calculation
# cumulative_freq_df = df_topics_cluster.groupby(['Date', 'Cluster']).size().groupby(level=0).cumsum().reset_index(name='CumulativeCount')

# total_cumulative_freq = cumulative_freq_df.groupby('Date')['CumulativeCount'].transform('sum')
# cumulative_freq_df['CumulativeRelativeFrequency'] = cumulative_freq_df['CumulativeCount'] / total_cumulative_freq

# cumulative_freq_df['Date'] = cumulative_freq_df['Date'].dt.to_timestamp().dt.date

# fig = px.line(cumulative_freq_df, x='Date', y='CumulativeCount', color='Cluster',
#               title='Relative Frequency vs Time',
#               color_discrete_sequence=Design_Book,
#               labels={'Relative_Frequency': 'Relative Frequency', 'Date': 'Date'}
#             )

# fig.update_layout(height=1080,
#                   width=2160,
#                   xaxis=dict(
#                     gridwidth=2,
#                     title='',
#                     color='gray',
#                   ),
#                   yaxis=dict(
#                     gridwidth=2,
#                     title='',
#                     color='gray',
#                   ),
#                   title=dict(font=dict(color='white', family='Roboto', size=24, weight='bold')),
#                   legend=dict(title="Topic", font=dict(color='white', family='Roboto', size=16, weight='normal')),
#                   paper_bgcolor='#1a181c',
#                   plot_bgcolor='#1a181c',
#                 )

# fig.update_xaxes(showline=True, 
#                  linewidth=2, 
#                  linecolor='#232024', 
#                  gridcolor='#232024', 
#                  zerolinecolor='#232024',
#                  title_font_color="white")

# fig.update_yaxes(showline=True,
#                  linewidth=2, 
#                  linecolor='#232024', 
#                  gridcolor='#232024', 
#                  zerolinecolor='#232024',
#                  title_font_color="#FFFFFF")

# # Show the plot
# # fig.show()