### Tokenizing + Vectorization

In [1]:
import string
import pandas as pd # pip install pandas
import nltk # pip install nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from sklearn.feature_extraction.text import TfidfVectorizer # pip install scikit-learn
from sklearn import decomposition
import numpy as np # pip install numpy

In [2]:
# Lift the pandas display limits (None = unlimited) so wide frames and long
# text cells are rendered in full; finite caps such as 1000 / 1000 / 199 can
# be used instead if the rendered output becomes unwieldy.
for _display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

# Load the compiled dataset; later cells read its "Translated" column.
df = pd.read_csv("../dataset/_compiled/Compiled.csv")

In [3]:
def remove_punctuation(cell):
    """Return `cell` with every character listed in `string.punctuation` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return cell.translate(deletion_table)

# Informal contractions (already punctuation-free) filtered in addition to
# NLTK's English stop-word list.
_EXTRA_STOP_WORDS = ("im", "were", "youre", "thats", "theres")
_stop_word_cache = None


def _stop_word_set():
    """Return the punctuation-stripped stop-word set, building it on first use only."""
    global _stop_word_cache
    if _stop_word_cache is None:
        _stop_word_cache = {
            remove_punctuation(word)
            for word in [*stopwords.words('english'), *_EXTRA_STOP_WORDS]
        }
    return _stop_word_cache


def tokenizer(text):
    """Split `text` into lemmatized 1-gram tokens.

    Steps, in order:
      1. tokenize with NLTK's `word_tokenize`, keeping tokens longer than 3 characters;
      2. drop tokens found in the stop-word set (NLTK English stop words plus a few
         contractions, all with punctuation stripped);
      3. lemmatize the surviving tokens with the module-level `lemmatizer`.

    The stop-word set is built once and reused, instead of being rebuilt for
    every single token as before.

    Returns:
        list of lemmatized token strings.
    """
    stop_words = _stop_word_set()
    # 1-gram tokens of more than 3 characters
    word_tokens = [x for x in word_tokenize(text) if len(x) > 3]
    # remove stop words
    filtered_tokens = [word for word in word_tokens if word not in stop_words]
    # lemmatize the tokens
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    return lemmatized_tokens

In [4]:
# Vectorize the "Translated" column into a matrix of TF-IDF features.
# API reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# ("df" in the parameter names below stands for "document frequency".)
tf_vectorizer = TfidfVectorizer(
    tokenizer=tokenizer,    # custom tokenizer defined in this notebook
    token_pattern=None,     # the regex token pattern is not used with a custom tokenizer
    stop_words="english",
    max_df=0.75,
    max_features=10000,
    use_idf=True,
    norm=None,
)
tf_vectors = tf_vectorizer.fit_transform(df["Translated"])



In [5]:
# Fit a Latent Dirichlet Allocation topic model on the TF-IDF matrix.
# API reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
n_topics = 10
lda = decomposition.LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    learning_offset=60,
    max_iter=30,
    n_jobs=1,
    random_state=42,
)
# W: per-document topic weights (later framed with one column per topic).
W = lda.fit_transform(tf_vectors)
# H: per-topic term weights (later iterated topic by topic, indexed by vocabulary).
H = lda.components_

In [6]:
# Show the `num_words` highest-weighted words for each LDA topic
num_words = 10
vocab = np.array(tf_vectorizer.get_feature_names_out())


def top_words(t):
    """Return the `num_words` vocabulary terms with the largest weights in `t`, highest first.

    `t` is one row of `H` (the term weights of a single topic).
    """
    # argsort is ascending; the reversed slice takes the last `num_words`
    # indices in descending-weight order.
    return [vocab[i] for i in np.argsort(t)[:-num_words-1:-1]]


topic_words = [top_words(t) for t in H]
topics = [', '.join(t) for t in topic_words]
df_topics = pd.DataFrame({'Keywords': topics,
                          'Topic ID': range(1, len(topics) + 1)})
df_topics

Unnamed: 0,Keywords,Topic ID
0,"employee, philippine, sagility, cial, market, department, australian, worker, stock, aspect",1
1,"information, technology, work, company, like, bos, want, time, project, manager",2
2,"information, technology, month, resignation, day, immediately, hospital, render, leave, resign",3
3,"post, applicant, recruiter, reference, technology, comment, information, reddit, quitting, multiple",4
4,"interview, information, technology, company, said, offer, human, resource, time, hour",5
5,"work, party, company, people, christmas, technology, information, friend, really, like",6
6,"salary, philippine, information, technology, experience, increase, year, company, position, know",7
7,"said, employee, vacation, information, labor, payment, terminated, technology, school, work",8
8,"vacation, food, hour, leaf, worklife, balance, work, information, technology, people",9
9,"organization, mainte, monthly, health, comptia, professor, program, course, information, technology",10


In [7]:
# Assign a dominant topic to each submission
topicid = ["Topic " + str(i+1) for i in range(lda.n_components)]
# Row labels, one per submission (variable name kept for compatibility).
tweetid = ["Submission " + str(i+1) for i in range(len(df["Translated"]))]

# Topic weights rounded to two decimals for display.
df_topics_lda = pd.DataFrame(np.round(W,2), columns=topicid, index=tweetid)

# Pick the dominant topic from the UNROUNDED weights: after rounding, distinct
# weights can collapse into a tie, and argmax would then return the first
# column rather than the true maximum. Topic ids are 1-based.
significanttopic = np.argmax(W, axis=1)+1


def _format_breakdown(row):
    """Format one row of rounded topic weights as 'Topic k: w' lines.

    Only topics with a weight > 0 are listed, ordered by descending weight;
    equal weights keep their column order.
    """
    ranked = sorted(topicid, key=lambda col: row[col], reverse=True)
    return '\n'.join(f'{col}: {row[col]}' for col in ranked if row[col] > 0)


df_topics_lda['dominant_topic'] = significanttopic
# Build the breakdown from the topic columns only, so no non-topic column has
# to be sorted and then filtered out again.
df_topics_lda['breakdown'] = df_topics_lda[topicid].apply(_format_breakdown, axis=1)
df_topics_lda.head(10)

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,dominant_topic,breakdown
Submission 1,0.0,0.54,0.0,0.0,0.0,0.0,0.0,0.45,0.0,0.0,2,Topic 2: 0.54\nTopic 8: 0.45
Submission 2,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.75,0.0,0.0,8,Topic 8: 0.75\nTopic 2: 0.25
Submission 3,0.0,0.0,0.05,0.19,0.39,0.06,0.0,0.3,0.0,0.0,5,Topic 5: 0.39\nTopic 8: 0.3\nTopic 4: 0.19\nTopic 6: 0.06\nTopic 3: 0.05
Submission 4,0.0,0.0,0.0,0.0,0.7,0.0,0.15,0.0,0.15,0.0,5,Topic 5: 0.7\nTopic 7: 0.15\nTopic 9: 0.15
Submission 5,0.0,0.92,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,Topic 2: 0.92\nTopic 3: 0.07
Submission 6,0.0,0.1,0.18,0.0,0.11,0.0,0.0,0.0,0.0,0.62,10,Topic 10: 0.62\nTopic 3: 0.18\nTopic 5: 0.11\nTopic 2: 0.1
Submission 7,0.0,0.35,0.0,0.0,0.1,0.03,0.0,0.52,0.0,0.0,8,Topic 8: 0.52\nTopic 2: 0.35\nTopic 5: 0.1\nTopic 6: 0.03
Submission 8,0.0,0.37,0.0,0.62,0.0,0.0,0.0,0.0,0.0,0.0,4,Topic 4: 0.62\nTopic 2: 0.37
Submission 9,0.88,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,Topic 1: 0.88\nTopic 3: 0.1
Submission 10,0.0,0.75,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,Topic 2: 0.75\nTopic 3: 0.25


In [8]:
# Visualize topics
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import plotly.express as px # pip install plotly

# Derive the topic count from the fitted model instead of repeating a literal,
# so this cell stays in sync with the LDA cell. The first `n_topics` columns
# of df_topics_lda are the per-topic weights; the columns after them
# (dominant_topic, breakdown) are not features and must not be included.
n_topics = lda.n_components
topic_weights = df_topics_lda.iloc[:, :n_topics]

# Apply t-SNE for dimensionality reduction (2-D coordinates for plotting)
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(topic_weights)

# Apply K-means clustering, one cluster per topic
kmeans = KMeans(n_clusters=n_topics, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(topic_weights)

In [9]:
# Create a new dataframe with t-SNE coordinates and cluster labels
import textwrap

def split_text(text, max_length):
  """Wrap `text` to lines of at most `max_length` characters, joined with '<br>'.

  Words longer than `max_length` are kept whole rather than being broken.
  """
  wrapper = textwrap.TextWrapper(width=max_length, break_long_words=False)
  wrapped_lines = wrapper.wrap(text)
  return "<br>".join(wrapped_lines)

# Swap the "Submission N" row labels for a positional index once and reuse it
# for both columns taken from the LDA frame.
lda_by_position = df_topics_lda.reset_index()

df_topics_cluster = pd.DataFrame({
    'X': tsne_result[:, 0],
    'Y': tsne_result[:, 1],
    # Wrap long submissions at 40 characters, with '<br>' as the line break.
    'Submission': df["Translated"].apply(lambda x: split_text(x, 40)),
    # Color by the LDA dominant topic; to color by the K-means clusters
    # instead, use `cluster_labels` here.
    'Cluster': lda_by_position['dominant_topic'].astype(str),
    # Same '<br>' line-break convention as the Submission column.
    'Breakdown': lda_by_position['breakdown'].str.replace('\n','<br>'),
})

df_topics_cluster.head(10)

Unnamed: 0,X,Y,Submission,Cluster,Breakdown
0,-16.487108,18.768225,have all the leave credits really been<br>earned now in my previous company<br>because they are all given at the<br>beginning of the year you can take leave<br>days even in january edit i dont get the<br>down votes on my comments im asking<br>because do not know if these are dumb<br>questions you can just let information<br>technology go,2,Topic 2: 0.54<br>Topic 8: 0.45
1,-22.146912,24.107485,this company didnt pass the probationary<br>period haha ​​if we can pass or fail<br>during probation is the company also<br>able to the leadership holds meetings<br>but the issue is not addressed<br>insufficient resources they expect a<br>developer to work on to projects im a pm<br>myself but im afraid that i might get<br>infected by the behavior of the people<br>here without regards to the developers,8,Topic 8: 0.75<br>Topic 2: 0.25
2,-22.046141,-4.459234,i hated the job i wished for with each<br>passing day i was more and more anxious<br>to go to work first job and i hated this<br>role happy and all but since then<br>information technology has taken a long<br>time and i am being called even on<br>vacation leave sick leave holiday off<br>the clock for urgent issues and out of<br>message status in teams im slowly<br>starting to hate my workplace look im<br>out of office right i expect you to<br>answer stupid i noticed that the smile<br>had disappeared when i entered i was<br>always frowning and i became distant<br>from my manager to the point that i<br>ranted about my manager to my friends i<br>also know that my salary is below the<br>minimum that the company can offer for<br>the position so im watching your<br>linkedin job posting the salary range is<br>a bit of an issue and someone will<br>message me putangina whats up again you<br>want information technology is always<br>the response of my mind applying for<br>other jobs since february i hope to get<br>information technology right away so i<br>can render and leave this punyeta place<br>im full of them hahaha,5,Topic 5: 0.39<br>Topic 8: 0.3<br>Topic 4: 0.19<br>Topic 6: 0.06<br>Topic 3: 0.05
3,-12.231421,-11.624436,but there are human resources that are<br>still rushing i applied to this job and<br>the human resources called me after a<br>day asking stuff like where do you live<br>then when i said i am willing to<br>relocate he said where are you moving do<br>you have friends here i of course<br>answered etc despite the weird questions<br>while doing background research i saw<br>that this company is composed of about<br>persons so i also messaged right away<br>about my salary expectations so we dont<br>waste our time with the proper interview<br>i asked nicely etc then he didnt reply<br>so i assumed out of their budget then a<br>while ago human resources called again<br>saying that this is the first time im<br>like this i dont want to ask him clearly<br>if its within the budget and then he<br>cant answer yes directly hes saying a<br>lot about information technology thats<br>why theres an interview theres no first<br>its my time to encounter something like<br>this usually because the human resources<br>that i messaged regarding the<br>expectation are gracious so that we dont<br>waste any more time mother,5,Topic 5: 0.7<br>Topic 7: 0.15<br>Topic 9: 0.15
4,30.410311,-1.973994,share some of your kupal techniques<br>while working from home so i have do the<br>needfulindian who is a senior web<br>developer and he likes to give orders<br>even though we have a direct boss its<br>just a simple code to make you look like<br>an engot so when he chats available to<br>talks i say that i have a meeting in<br>minutes and then i set a meeting<br>appointment just for myself to show my<br>status in the teams that are in a call<br>when i want to be more realistic i just<br>share the screen in the meeting where i<br>am the only participant,2,Topic 2: 0.92<br>Topic 3: 0.07
5,-1.329584,5.244882,im really praying for my current<br>companys downfall i just passed my<br>resignation letter last week and the<br>managers that i havent spoken to<br>suddenly message me that they are sad to<br>hear about information technology and<br>maybe i could just transfer teams that<br>is more aligned to my personal goals i<br>personally didnt take offense at<br>information technology since its a<br>harmless way of winning back an employee<br>a lot of my colleagues also passed their<br>resignation letters stating about the<br>personap goals are no longer aligned<br>then there is a survey suddenly about<br>information technology like one of the<br>questions is what are your personal<br>goals and how does this company help you<br>of course spicy na usana to our gc team<br>due to what transpired also i am<br>currently suffering from a fever and<br>asked for a sick leave today handbook<br>states that i cannot use leaves during<br>day render unless information technology<br>is a sick leave i asked my manager about<br>information technology because usually a<br>day leave doesnt need a med cert and now<br>he said that they need a med cert<br>because mondays and fridays are now<br>critical work days based on our<br>guidelines critical working days are<br>announced a month then suddenly there is<br>a function like this i had to hurry up<br>to go to the nearest doctor but buti the<br>csr of the health mainte ce organization<br>said just use their affiliated app which<br>i did im so pissed just to experience i<br>during my render and information<br>technology feels deliver because its<br>really invented like i even mentioned<br>that its not in the guideline i even<br>took a sick leave before on a friday but<br>there was no warning like this so no med<br>cert needed anyways thanks for reading<br>this rant of mine the company is in<br>mandaluyong near robinsons galleria if<br>you have any kind of test interview or<br>anything ill keep the name in private<br>but if 
you are curious just go to the<br>area on google maps on the other side of<br>edsa,10,Topic 10: 0.62<br>Topic 3: 0.18<br>Topic 5: 0.11<br>Topic 2: 0.1
6,-19.263718,19.851551,i dont want to go in its a draining low<br>salary plus the boss is somehow<br>enforcing the law in the office we got a<br>screenshot last monday of our tokwa boss<br>because when we look around my coworkers<br>and i are just talking random talks and<br>asking what we should do on a particular<br>scenario that is related to our work<br>sometimes because we really need to talk<br>since we have emails and scenarios where<br>there is only one recipient and we are<br>aware that you are not allowed to turn<br>around yes you are not allowed to just<br>look at your monitor from the beginning<br>of you shift until you go home stiff<br>neck is bad the only weird thing is that<br>the team leads who only ask how to<br>access files etc are then screenshotted<br>and told that nothing is being done so<br>how can we collab our ideas if we are<br>not allowed to talk and turn to each<br>other this is also why we are not<br>allowed to go home on time when you pm<br>you home you must clock out they dont<br>want a honda go even though you are done<br>doing information technology like why<br>dont you pay us overtime after all the<br>emails that are not answered since the<br>agent is off they say information<br>technology should be answered so what<br>will the agent answer even though he has<br>a day off is information technology our<br>fault that the issues are not resolved<br>because you are slow to come up with a<br>solution so the email keeps coming back<br>the number of people who have left this<br>company is mostly our colleagues who are<br>absent without leave because i cant<br>match my increase yet since the tenured<br>people said information technology<br>depends on tokwas trip and then its your<br>leave or even sick leave with valid<br>proof like labs and doctors letter<br>whether approved or not you will be<br>reduced in your key performance index so<br>even if you die because of your illness<br>you have to come in hahaha and finally<br>they installed an 
intercom speaker in<br>the whole office because they say there<br>will be dancing hahaha there was a<br>screenshot last monday of those who<br>didnt dance why didnt they move its just<br>funny that i put up with such a setup<br>when we can get out of here,8,Topic 8: 0.52<br>Topic 2: 0.35<br>Topic 5: 0.1<br>Topic 6: 0.03
7,-3.591256,-43.286163,why do i get this feeling of quitting my<br>job whenever i make a mistake at work<br>laughing out loud am i the only one i<br>take care of multiple projects and<br>because more and more i cant handle the<br>tasks im not even bad at my work but<br>because of the workload i stumbled im so<br>sad this just happened a few mins ago<br>fml i cant shake this feeling,4,Topic 4: 0.62<br>Topic 2: 0.37
8,39.851849,-24.674173,pesos per hour a month to perform<br>various administrative tasks,1,Topic 1: 0.88<br>Topic 3: 0.1
9,16.375126,-4.173429,boss offers the remaining of the th<br>month and calls them desperate hello<br>everyone i just want to rant about my<br>company some might know which company is<br>this if you know you better leave<br>immediately i will get to the bottom of<br>information technology immediately<br>because of poor management of the bosses<br>our salaries are always on cut off the<br>rest is still taking days before we can<br>give th month the others are still not<br>given then we found out that the<br>government mandated benefits almost a<br>year delayed deducting them but they<br>didnt reremit then one day a colleague<br>of mine was offered half of his th month<br>so long as he didnt have to report to<br>the department of labor and employment<br>why was he the only one the boss said he<br>was the only one who looked desperate<br>for money which is so degrading and its<br>insulting that the wife of her boss who<br>is the president doesnt even know<br>anything about the company,2,Topic 2: 0.75<br>Topic 3: 0.25


In [10]:
# Plot submissions as colored points.
# 'Cluster' holds topic ids as strings; sort them numerically so the traces
# and legend run 1, 2, ..., 10 rather than in string order ("1", "10", "2").
df_topics_cluster = df_topics_cluster.sort_values(
    'Cluster', key=lambda x: pd.to_numeric(x, errors='coerce'))

fig = px.scatter(df_topics_cluster, x='X', y='Y', color='Cluster',
                 title='Topic Clustering using LDA and t-SNE',
                 hover_name='Submission',
                 hover_data={'X':False, 'Y':False, 'Cluster':False, 'Submission':False, 'Breakdown':True})

# Look marker colors up by trace name (the 'Cluster' value) instead of by
# position: a topic that is never dominant has no trace, which would make
# fig.data[i] either raise IndexError or color the wrong label.
trace_colors = {trace.name: trace.marker.color for trace in fig.data}

# List each topic's keywords below the plot, colored like its points.
for i, keyword in enumerate(df_topics['Keywords']):
  fig.add_annotation(
    x=0,
    y=-0.2*(i/5)-0.08,  # stack the labels downward in the bottom margin
    # Keywords are already ', '-separated; use them as-is (replacing spaces
    # with ', ' produced doubled commas).
    text="Topic %d: %s"%(i+1, keyword),
    showarrow=False,
    xref='paper',
    yref='paper',
    align='left',
    # Fall back to white (the legend font color) for topics without a trace.
    font=dict(color=trace_colors.get(str(i+1), 'white'))
  )

fig.update_layout(height=710,
                  xaxis_title='', yaxis_title='',
                  margin=dict(b=200),
                  paper_bgcolor='#2c3e50',
                  title=dict(font=dict(color='white')),
                  legend=dict(title="Topic", font=dict(color='white')))
# NOTE: if fig.show() fails to render, install nbformat (pip install nbformat)
# and restart the kernel.
fig.show()