### Tokenizing + Vectorization

In [53]:
import re

# pip install pandas
import pandas as pd 

# pip install nltk
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# pip install scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn import decomposition

# pip install numpy
import numpy as np 

In [54]:
# Lift pandas' display limits: no cap on shown columns or rows, no truncation of cell text.
pd.set_option(
    "display.max_columns", None,
    "display.max_rows", None,
    "display.max_colwidth", None,
)

# Load the compiled dataset; later cells read its "Translated" and "Engagements" columns.
df = pd.read_csv("../dataset/_compiled/Compiled2.csv")

In [55]:
def remove_punctuation(cell):
    """Delete every character that is neither a word character nor whitespace.

    Args:
        cell: Text to clean (a ``str``).

    Returns:
        The text with the matched characters removed. Nothing is inserted in
        their place, so "you're" becomes "youre".
    """
    non_word_or_space = re.compile(r"[^\w\s]")
    return non_word_or_space.sub("", cell)

# Informal contractions (written without apostrophes) filtered in addition to NLTK's English list.
_EXTRA_STOP_WORDS = ("im", "were", "youre", "thats", "theres")

# Stop-word set shared by every tokenizer call; built on first use.
# Re-running this cell resets it, so edits to the list above take effect.
_stop_word_cache = None


def _stop_word_set():
    """Return the punctuation-stripped stop-word set, building it only once."""
    global _stop_word_cache
    if _stop_word_cache is None:
        _stop_word_cache = frozenset(
            remove_punctuation(word)
            for word in (*stopwords.words('english'), *_EXTRA_STOP_WORDS)
        )
    return _stop_word_cache


def my_tokenizer(text):
    """Split `text` into 1-gram tokens, dropping one-character tokens and stop words.

    Stop words are NLTK's English list plus a few apostrophe-free contractions,
    each passed through `remove_punctuation` before comparison. The comparison
    is exact (case-sensitive). Tokens are returned as produced by
    `word_tokenize`; the module-level `lemmatizer` is not applied here.

    Args:
        text: The document to tokenize.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # Fetch the set once per call instead of rebuilding it for every token.
    stop_words = _stop_word_set()
    return [
        token
        for token in word_tokenize(text)
        if len(token) > 1 and token not in stop_words
    ]

In [56]:
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# Turn every entry of df["Translated"] into a row of TF-IDF term weights.
# min_df / max_df bound a term's document frequency (how many documents contain it).
# NOTE(review): tf_vectors is fed to LatentDirichletAllocation below, which is usually
# fitted on raw term counts - confirm that IDF-weighted input is intentional.
tf_vectorizer = TfidfVectorizer(
    tokenizer=my_tokenizer,
    token_pattern=None,  # not used when a custom tokenizer is supplied
    min_df=6,
    max_df=0.80,
    max_features=5500,
    use_idf=True,
    norm=None,
)
tf_vectors = tf_vectorizer.fit_transform(df["Translated"])

In [57]:
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
# Fit a 10-topic LDA model on the TF-IDF matrix.
n_topics = 10

lda = decomposition.LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    learning_offset=35,
    max_iter=35,
    n_jobs=1,
    random_state=42,
)

# W: one row per document (its topic weights); H: one row per topic (its term weights).
W = lda.fit_transform(tf_vectors)
H = lda.components_

In [58]:
# List the `num_words` highest-weighted vocabulary terms of every LDA topic.
num_words = 10
vocab = np.array(tf_vectorizer.get_feature_names_out())


def top_words(t):
    """Return the `num_words` terms with the largest weights in topic row `t`, heaviest first."""
    heaviest_first = np.argsort(t)[::-1][:num_words]
    return [vocab[i] for i in heaviest_first]


topic_words = [top_words(row) for row in H]
topics = [' '.join(words) for words in topic_words]

# One row per topic: space-separated keywords plus a 1-based topic id.
df_topics = pd.DataFrame({
    'Keywords': topics,
    'Topic ID': range(1, len(topics) + 1),
})
df_topics

Unnamed: 0,Keywords,Topic ID
0,job time work manager company supervisor since also family task,1
1,leave resignation pay days last employer company employment labor department,2
2,training company salary allowance job increase per human offer position,3
3,work really like even want team boss feel cant know,4
4,mental health bond company store sales multiple know product friend,5
5,work overtime hours pay shift working day time pm company,6
6,said boss team school human told post leader sub officer,7
7,interview job offer human said resource company applied hiring application,8
8,like work even us go time home hours get said,9
9,companies business philippines outsourcing job people work company salary workers,10


In [59]:
# Label every submission with its (rounded) LDA topic weights and its dominant topic.
topicid = [f"Topic {k}" for k in range(1, lda.n_components + 1)]
tweetid = [f"Submission {k}" for k in range(1, len(df["Translated"]) + 1)]

df_topics_lda = pd.DataFrame(np.round(W, 2), columns=topicid, index=tweetid)

# argmax is 0-based while topic numbers are 1-based, hence the +1.
significanttopic = df_topics_lda.values.argmax(axis=1) + 1
df_topics_lda['dominant_topic'] = significanttopic


def _format_breakdown(row):
    """Describe one submission's topic mix.

    Returns one "Topic k: weight" line per topic whose rounded weight is
    positive, largest weight first, joined with newlines.
    """
    ranked = sorted(topicid, key=lambda name: row[name], reverse=True)
    return '\n'.join(f'{name}: {row[name]}' for name in ranked if row[name] > 0)


df_topics_lda['breakdown'] = df_topics_lda.apply(_format_breakdown, axis=1)
df_topics_lda.head(5)

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,dominant_topic,breakdown
Submission 1,0.0,0.65,0.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,Topic 2: 0.65\nTopic 3: 0.35
Submission 2,0.0,0.0,0.73,0.0,0.0,0.0,0.0,0.27,0.0,0.0,3,Topic 3: 0.73\nTopic 8: 0.27
Submission 3,0.0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,2,Topic 2: 0.96\nTopic 10: 0.03
Submission 4,0.0,0.0,0.1,0.0,0.0,0.34,0.56,0.0,0.0,0.0,7,Topic 7: 0.56\nTopic 6: 0.34\nTopic 3: 0.1
Submission 5,0.0,0.3,0.0,0.69,0.0,0.0,0.0,0.0,0.0,0.0,4,Topic 4: 0.69\nTopic 2: 0.3


In [60]:
# Visualize topics
from sklearn import metrics
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import plotly.express as px # pip install plotly
import matplotlib.pyplot as plt # pip install matplotlib
%matplotlib inline

# Both models below are fitted on the first n_topics columns of df_topics_lda,
# i.e. the per-submission topic weights.
topic_shares = df_topics_lda.iloc[:, :n_topics]

# Project the topic weights onto two dimensions with t-SNE.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(topic_shares)

# Group the same rows into n_topics K-means clusters.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
kmeans = KMeans(n_clusters=n_topics, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(topic_shares)

In [61]:
# Create a new dataframe with t-SNE coordinates and cluster labels
import textwrap

def split_text(text, max_length):
    """Wrap `text` for HTML display.

    Args:
        text: The string to wrap.
        max_length: Target line width in characters. Words longer than this
            are kept whole rather than split.

    Returns:
        The wrapped lines joined with "<br>".
    """
    wrapper = textwrap.TextWrapper(width=max_length, break_long_words=False)
    return "<br>".join(wrapper.wrap(text))

# Gather the per-submission plotting data: t-SNE coordinates, hover text,
# engagement (used as marker size) and the dominant LDA topic.
lda_rows = df_topics_lda.reset_index(drop=True)  # positional index instead of "Submission k" labels

df_topics_cluster = pd.DataFrame({
    'X': tsne_result[:, 0],
    'Y': tsne_result[:, 1],
    'Submission': df["Translated"],
    'Engagement': df["Engagements"],
    # Dominant LDA topic as a string; use cluster_labels here to colour by K-means cluster instead.
    'Cluster': lda_rows['dominant_topic'].astype(str),
    'Breakdown': lda_rows['breakdown'],
}).assign(
    # Hover text is rendered as HTML: wrap submissions at 40 characters and
    # turn the breakdown's newlines into <br> tags.
    Submission=lambda frame: frame['Submission'].apply(lambda text: split_text(text, 40)),
    Breakdown=lambda frame: frame['Breakdown'].str.replace('\n', '<br>'),
)

In [98]:
# Plot submissions as colored points, one color per dominant LDA topic.
# 'Cluster' holds strings, so sort by numeric value to get 1, 2, ..., 10
# rather than the string order 1, 10, 2. Rebinding (instead of inplace=True)
# keeps the cell free of in-place mutation.
df_topics_cluster = df_topics_cluster.sort_values(
    'Cluster', key=lambda x: pd.to_numeric(x, errors='coerce'))

fig = px.scatter(df_topics_cluster, x='X', y='Y', color='Cluster',
                 title='Topic Clustering using LDA and t-SNE',
                 hover_name='Submission',
                 size='Engagement',
                 hover_data={'X': False, 'Y': False, 'Cluster': False,
                             'Submission': False, 'Breakdown': True})

# Map each trace (named after its 'Cluster' value) to its marker color.
# Looking colors up by name rather than by position keeps the keyword captions
# correctly colored - and avoids an IndexError - when some topic is never the
# dominant one and therefore has no trace.
trace_colors = {trace.name: trace.marker.color for trace in fig.data}

# Write each topic's keywords below the plot, colored like that topic's points.
for i, keyword in enumerate(df_topics['Keywords']):
    topic_label = str(i + 1)
    fig.add_annotation(
        x=0,
        y=-0.2*(i/5)-0.08,  # paper coordinates: stack one caption line per topic under the axes
        text="Topic %d: %s" % (i+1, keyword.replace(' ', ', ')),
        showarrow=False,
        xref='paper',
        yref='paper',
        align='left',
        font=dict(color=trace_colors.get(topic_label, 'white'),
                  family='Roboto', size=16, weight='normal')
    )

# Dark theme; the enlarged bottom margin leaves room for the keyword captions.
fig.update_layout(height=800,
                  xaxis_title='', yaxis_title='',
                  margin=dict(b=200),
                  title=dict(font=dict(color='white', family='Roboto', size=24, weight='bold')),
                  legend=dict(title="Topic", font=dict(color='white', family='Roboto', size=16, weight='normal')),
                  paper_bgcolor='#1a181c',
                  plot_bgcolor='#1a181c',
                  )

fig.update_xaxes(showline=True,
                 linewidth=2,
                 linecolor='#232024',
                 gridcolor='#232024',
                 zerolinecolor='#232024',
                 title_font_color="white")
fig.update_yaxes(showline=True,
                 linewidth=2,
                 linecolor='#232024',
                 gridcolor='#232024',
                 zerolinecolor='#232024',
                 title_font_color="#FFFFFF")

# If fig.show() fails for lack of nbformat, install it in the kernel's
# environment (%pip install nbformat) and restart the kernel.
fig.show()