### Tokenizing + Vectorization

In [48]:
import re

# pip install pandas
import pandas as pd 

# pip install nltk
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# pip install scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn import decomposition

# pip install numpy
import numpy as np 

In [49]:
# Lift pandas' display truncation entirely: None means "no limit" for each option.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

# Compiled submissions; later cells read the "Translated" text column from this frame.
df = pd.read_csv("../dataset/_compiled/Compiled.csv")

In [50]:
def remove_punctuation(cell):
    """Return `cell` with every character that is neither a word character
    (letters, digits, underscore) nor whitespace removed."""
    non_word_or_space = re.compile(r"[^\w\s]")
    return non_word_or_space.sub("", cell)

# Stop-word lookup table, built once when this cell runs rather than once per token.
# Each NLTK English stop word (plus a few extra contractions written without
# apostrophes) is passed through remove_punctuation before being stored, so the
# comparison in my_tokenizer is against the punctuation-free spelling.
# Requires the NLTK "stopwords" corpus to be available when the cell is executed.
_STOP_WORDS = frozenset(
    remove_punctuation(word)
    for word in [*stopwords.words('english'), "im", "were", "youre", "thats", "theres"]
)

def my_tokenizer(text):
    """Tokenize `text` for the TF-IDF vectorizer.

    Steps:
      1. split into 1-gram tokens with NLTK's word_tokenize, keeping only tokens
         longer than one character;
      2. drop tokens found in the stop-word set;
      3. lemmatize the remaining tokens with the module-level WordNet lemmatizer.

    Returns the list of lemmatized tokens, in their original order.
    """
    # 1-gram tokens of more than a character
    word_tokens = [x for x in word_tokenize(text) if len(x) > 1]
    # remove stop words (set membership test; the set is not rebuilt per token)
    filtered_tokens = [word for word in word_tokens if word not in _STOP_WORDS]
    # lemmatize the tokens
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    return lemmatized_tokens

In [51]:
# Turn the translated submissions into a document-term matrix of TF-IDF features.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# ("df" in min_df / max_df stands for document frequency.)

tf_vectorizer = TfidfVectorizer(
    tokenizer=my_tokenizer,
    token_pattern=None,   # unused because a custom tokenizer is supplied
    min_df=6,             # integer: document-count threshold
    max_df=0.60,          # float: proportion-of-documents threshold
    max_features=10000,
    use_idf=True,
    norm=None,            # keep raw TF-IDF weights, no row normalisation
)
documents = df["Translated"]
tf_vectors = tf_vectorizer.fit_transform(documents)

In [52]:
# Fit a Latent Dirichlet Allocation topic model on the TF-IDF matrix.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html

n_topics = 5
lda = decomposition.LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    learning_offset=30,
    max_iter=30,
    n_jobs=1,
    random_state=42,  # fixed seed so reruns give the same topics
)

# W: one row per document, one column per topic (output of fit_transform).
W = lda.fit_transform(tf_vectors)
# H: one row per topic, one column per vocabulary term (lda.components_).
H = lda.components_

In [53]:
# Show the `num_words` highest-weighted vocabulary terms for each LDA topic
# (one row of H per topic).
num_words = 10
vocab = np.array(tf_vectorizer.get_feature_names_out())

def top_words(t):
    """Return the `num_words` vocabulary terms with the largest weights in `t`,
    highest weight first. `t` is one topic's row of term weights."""
    # argsort is ascending, so reverse it and keep the leading entries.
    strongest_first = np.argsort(t)[::-1][:num_words]
    return [vocab[i] for i in strongest_first]

topic_words = [top_words(t) for t in H]
topics = [' '.join(t) for t in topic_words]
df_topics = pd.DataFrame(topics, columns=['Keywords'])
# Topic IDs are 1-based to match the "Topic N" labels used elsewhere.
df_topics['Topic ID'] = range(1, len(topics) + 1)
df_topics

Unnamed: 0,Keywords,Topic ID
0,pay day month contract company labor employment last employer resignation,1
1,team manager leave work even supervisor company bos day one,2
2,interview job company resource offer human said applied time asked,3
3,job company salary work working year philippine experience people pay,4
4,work like even want really one people time also office,5


In [54]:
# Assign a dominant topic to each submission and build a readable weight breakdown.
topicid = ["Topic " + str(i+1) for i in range(lda.n_components)]
tweetid = ["Submission " + str(i+1) for i in range(len(df["Translated"]))]

# Topic weights are rounded to 2 decimals for display only.
df_topics_lda = pd.DataFrame(np.round(W, 2), columns=topicid, index=tweetid)

# Pick the dominant topic from the unrounded weights: after rounding, two topics
# that differ by less than 0.01 become a tie and argmax would silently return the
# lower-numbered one. The +1 converts the 0-based column position to a topic number.
significanttopic = np.argmax(W, axis=1) + 1

df_topics_lda['dominant_topic'] = significanttopic

def _format_breakdown(row):
    """Return "Topic N: weight" lines for every topic with a positive (rounded)
    weight in `row`, ordered from largest to smallest weight."""
    # Rank only the topic columns; 'dominant_topic' is not a weight.
    ranked = sorted(topicid, key=lambda col: row[col], reverse=True)
    return '\n'.join(f'{col}: {row[col]}' for col in ranked if row[col] > 0)

df_topics_lda['breakdown'] = df_topics_lda.apply(_format_breakdown, axis=1)
df_topics_lda.head(5)

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,dominant_topic,breakdown
Submission 1,0.71,0.0,0.28,0.0,0.0,1,Topic 1: 0.71\nTopic 3: 0.28
Submission 2,0.0,0.02,0.98,0.0,0.0,3,Topic 3: 0.98\nTopic 2: 0.02
Submission 3,0.72,0.0,0.0,0.0,0.28,1,Topic 1: 0.72\nTopic 5: 0.28
Submission 4,0.0,0.21,0.12,0.67,0.0,4,Topic 4: 0.67\nTopic 2: 0.21\nTopic 3: 0.12
Submission 5,0.32,0.27,0.0,0.0,0.41,5,Topic 5: 0.41\nTopic 1: 0.32\nTopic 2: 0.27


In [55]:
# Visualize topics
from sklearn import metrics
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import plotly.express as px # pip install plotly
import matplotlib.pyplot as plt # pip install matplotlib
%matplotlib inline

# Both models below are fed the same input: the first n_topics columns of
# df_topics_lda, i.e. the per-topic weight columns.
topic_weights = df_topics_lda.iloc[:, :n_topics]

# Project the topic weights down to 2-D with t-SNE for plotting.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(topic_weights)

# Group the submissions into n_topics clusters with K-means.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
kmeans = KMeans(n_clusters=n_topics, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(topic_weights)

In [56]:
# Create a new dataframe with t-SNE coordinates and cluster labels
import textwrap

def split_text(text, max_length):
    """Wrap `text` to lines of at most `max_length` characters and join them with
    HTML "<br>" line breaks. Words longer than `max_length` are kept whole."""
    wrapper = textwrap.TextWrapper(width=max_length, break_long_words=False)
    wrapped_lines = wrapper.wrap(text)
    return "<br>".join(wrapped_lines)

# reset_index() swaps the "Submission N" labels for a positional index.
lda_by_position = df_topics_lda.reset_index()

df_topics_cluster = pd.DataFrame({
    'X': tsne_result[:, 0],
    'Y': tsne_result[:, 1],
    'Submission': df["Translated"],
    # Colour by LDA dominant topic (as a string, so it is treated as a category).
    # To colour by K-means cluster instead, use `cluster_labels` here.
    'Cluster': lda_by_position['dominant_topic'].astype(str),
    'Breakdown': lda_by_position['breakdown'],
}).assign(
    # Hover text is rendered as HTML: wrap long submissions at 40 characters and
    # turn the breakdown's newlines into <br> tags.
    Submission=lambda frame: frame['Submission'].apply(lambda text: split_text(text, 40)),
    Breakdown=lambda frame: frame['Breakdown'].str.replace('\n','<br>'),
)

Unnamed: 0,X,Y,Submission,Cluster,Breakdown
0,-12.832397,-31.309139,esena help i filed a request for<br>assistance at esena re last pay that<br>took months and coe last friday i have<br>the ref number of the request for<br>assistance case that same day suddenly<br>the human resources messaged that the<br>check of my last pay is okay and can i<br>have already picked him up my question<br>is how to cancel the request for<br>assistance since i already got my last<br>pay and coe thank you,1,Topic 1: 0.71<br>Topic 3: 0.28
1,6.597381,-48.546185,dodged a bad company and human resources<br>i guess ive applied for one of the<br>subsidiaries of aboitiz and id really<br>like to share how horrible the<br>application process or how the human<br>resources handled it i guess was bale i<br>just graduated from my electrical<br>engineering degree this january and ive<br>started to apply to a few companies<br>including this subsidiary from aboitiz<br>when compared to companies ive also done<br>interviews with like pldt jg summit and<br>smc my experience with aboitiz was very<br>disappointing i had my expectations for<br>aboitiz since it was known to be one of<br>the top companies and employers in the<br>philippines so maybe the way human<br>resources communicate is formal and<br>classy ​​right its not like im just<br>communicating with someone in a lowend<br>company who is deeply struggling in<br>handling communications the problem here<br>was that i was the one who was<br>understanding and patient even though<br>they were the ones who had lapses in<br>handling my application three months<br>have passed and theyve contacted me<br>saying that my application was already<br>being considered but it was placed on<br>hold due to a problem with the vacancy<br>of the position with that i just had to<br>withdraw and say that i wasnt already<br>comfortable with the process and how<br>poorly they were handling my application<br>i withdrew because maybe my application<br>was a hassle on their end besides maybe<br>human resources is really just waiting<br>for me to break towards their<br>treatmentattitude human resources<br>replied to my withdrawal i cant put<br>exact phrases but heres the gist human<br>resources said the usual stuff first<br>like were deeply sorry about the<br>inconveniences made on your end<br>blablabla sorry to see you go then<br>according to the end they said i was too<br>arty raw for complaining and that i<br>should man up more to understand<br>reallife problems that are 
being<br>encountered in their office i shouldnt<br>be complaining too because i am not an<br>employee to ask for something on their<br>end what i was just asking for was<br>respect heres one one out of a lot<br>inconvenience problem that happened they<br>asked me to attend an f f interview in<br>their office the two hour travel going<br>to their office didnt matter because i<br>was enjoying my stay in our province to<br>avoid the city heat however when i<br>arrived then waited for two hours in the<br>lobby nobody in the division division or<br>department where the people i met<br>remembered that i had a scheduled<br>interview the human resources<br>responsible for this was called to have<br>a meeting in another office in a<br>different city so nobody was left in<br>their office where i was to<br>arrangeconduct the interview so<br>according to the ending was that i was<br>asked to go home instead forgave them<br>for that because it is understandable<br>that they might have miscommunications<br>in the office in that case the hr still<br>rated me,3,Topic 3: 0.98<br>Topic 2: 0.02
2,-14.781674,-18.974924,sick leave to be tortured im under an<br>agency i filed a sick leave for some<br>reason not because im sick prior to that<br>i asked permission from my client days<br>ahead i also provided med cert then this<br>company we still need a<br>screenshotdocumentation that i asked<br>permission from the client so my med<br>cert is irrelevant there is no<br>acknowledgment from the client is it<br>really this difficult to file a sick<br>leave in the philippines,1,Topic 1: 0.72<br>Topic 5: 0.28
3,17.405867,-4.633053,help a nervous young adult out are the<br>hours a week too much will i still have<br>a worklife balance is starting pay as<br>probationary for months ok but if i<br>become overworked its super lacking haha<br>​​for people who have worked this role<br>how are you kindly let me know if this<br>role is good or what haha ​​​ well thank<br>you very much to those who will answer<br>sorry for the trouble and the questions<br>because i dont have parents adult in<br>life so no one can ask for guidance tips<br>and encouragement haha ​​thank you again,4,Topic 4: 0.67<br>Topic 2: 0.21<br>Topic 3: 0.12
4,-12.253451,19.044353,absent without leave how are you i know<br>this maybe off but im really curious<br>about the former government employees<br>here who have been lost how are you how<br>are you having a hard time getting<br>clearance i really want to leave here as<br>soon as possible since its super toxic<br>and ill never really be in government<br>again,5,Topic 5: 0.41<br>Topic 1: 0.32<br>Topic 2: 0.27
5,-8.502748,32.92289,its not my fault that i will<br>overcompensate because there are only a<br>few of us here in summary is the title<br>long story told is its annoying to come<br>in for days when im also absent and im<br>basically asking for a suspension im a<br>warehouse staff and mostly i did<br>preparation of the requested boxes last<br>wednesday im sorry that my area is clean<br>there are only a few caster boards of<br>boxes the boxes are placed in front of<br>my area when i came back on thursday<br>morning it was as if there was a storm<br>and the boxes there were exploded<br>especially in the front so it will be<br>over until sometime the head afternoon<br>came and i was met with continuous<br>requests for boxes so they piled up in<br>front including the boxes that were<br>already there the front was full i was<br>arranging the boxes when i thought i<br>heard someone call me i went out but i<br>thought the other sound of my name was<br>called so i went back inside to get<br>ready again i was called oh i see and i<br>was a little annoyed that i didnt come<br>right away i was told not to go back too<br>much and finish something first before<br>going somewhere as for other things what<br>i do is prepare the box and take it out<br>of the box g the pile will increase i<br>said that because my work has increased<br>because of the night shifts dont blame<br>me our leader reasoned that there are<br>few people even though i am from the<br>other shift and they are always bored if<br>there is nothing to do thats why the<br>boxes are piling up i just dont speak<br>anymore when im angry i speed up my<br>actions and block out all the things so<br>i cant talk i decided to go out instead<br>of being caught up with oh i see that im<br>leaving and im going to go out that i<br>seemed to be asking for an explanation<br>as to why and it was obvious that i was<br>annoyed so i said that you should sweat<br>and do laundry then i turned around to<br>go to the biometrics and they 
wanted me<br>to make a presentation and i was told<br>that they had even talked to me to go<br>back to my area and i painted the<br>sleeping place i saw hidden in the boxes<br>in front of my area i didnt come in on<br>friday and maybe talk to me about team<br>building this saturday but im not with<br>minove that week so these days im absent<br>im a little out of the myself because it<br>was the same when i was in my<br>stepfathers small construction company<br>and when i was doing my thesis i was so<br>tired i would overcompensate for people<br>who didnt have anything and then i would<br>be the one to blame for our rate and it<br>becomes with ot a little bit like that<br>but i dont like the movement because im<br>used to it who is always working in our<br>house because if you dont get out of<br>here thats all i can hear from my<br>stepfather and i keep looking like a<br>hero at work but now i feel guilty even<br>though i broke even once i want to quit<br>if only i wasnt tight with money now i<br>started as an oncall in september but i<br>signed a contract with them in december<br>i was given the uniform last tuesday ive<br>been waiting for months and im going to<br>get my coe and im going to start,5,Topic 5: 0.81<br>Topic 1: 0.19
6,-20.92993,4.280958,moonlighting job hello guys any advice<br>please job medical field twice a week<br>duty human resources per duty allowed<br>month vl not stressful but super<br>brainfog job new job human resources<br>field solo human resources compressed<br>days working am pm lots of training<br>shouldered by the management hr away<br>from home and based on working there for<br>days the chief executive officer and<br>management are still very welcoming<br>humble and humanitarian but im not sure<br>if i can handle less than a hundred<br>employees and im the only human<br>resources im sure its going to be<br>stressful better salary compare to job<br>so i started working as a human<br>resources generalist on the job<br>moonlighting is not allowed on the job<br>but it is allowed on the job but now i<br>am thinking about whether i should<br>resign from the job immediately because<br>that is my plan itry work for a month at<br>the job and see if i like it so if i<br>dont return to the job we are allowed to<br>leave the job for a month so there is no<br>problem if i dont go to work for a month<br>and can extend should i take the is it a<br>risk that the job will see it in ssspag<br>ibig and other government stuff that<br>there are two who fall in my sss,2,Topic 2: 0.47<br>Topic 4: 0.25<br>Topic 3: 0.18<br>Topic 1: 0.1
7,-37.615143,4.428259,youll be fine,2,Topic 2: 0.86<br>Topic 3: 0.04<br>Topic 1: 0.03<br>Topic 4: 0.03<br>Topic 5: 0.03
8,-8.0945,-17.758425,how can you tell when your boss is<br>abusing you hi im just asking its my<br>first time to get a job after graduating<br>i chose this job because its close and i<br>only have experience is it correct when<br>i interviewed is it okay to ask for a<br>copy of the health certificate i<br>benifits and after that we didnt have a<br>contract signing but my boss said i<br>should start with basic pay php and then<br>ive had a month of php my rate is still<br>php i want to talk to him about this and<br>these are your coworkers who have been<br>working for almost yrs they say that<br>their health benifits are not paid and<br>they dont receive anything from th month<br>pay so it makes me think that money is<br>coming in to them like they are building<br>a house and have cars but why is that<br>the same since when i asked them about<br>benifts they got the cutoff for this job<br>every week the rest is gone and they<br>dont have a contract signing yet are we<br>going to be abused or are we just<br>overthinking my work is data entry for a<br>uniform sewing machine and my colleagues<br>guys please slap the truth thank you,1,Topic 1: 0.39<br>Topic 4: 0.28<br>Topic 5: 0.24<br>Topic 3: 0.09
9,-27.769974,13.306633,always getting picked on in the office<br>title says it all hahaha im new but im<br>always picked on by the bosses and the<br>staff and im the only one who is always<br>being scolded and blamed for everything<br>without even giving me a chance to<br>explain and then i proceed to slander in<br>the office of my teammates lol i cried a<br>few times because its always like this<br>that makes me look incompetent because<br>its my teammates who teach me wrong as<br>if its sabotage i want to resign<br>effective immediately,2,Topic 2: 0.57<br>Topic 5: 0.29<br>Topic 1: 0.13


In [57]:
# Plot submissions as colored points.
# Sort by numeric cluster id first so the traces (and legend entries) are
# created in topic order; reassign instead of sorting in place.
df_topics_cluster = df_topics_cluster.sort_values(
    'Cluster', key=lambda x: pd.to_numeric(x, errors='coerce'))

fig = px.scatter(df_topics_cluster, x='X', y='Y', color='Cluster', 
                 title='Topic Clustering using LDA and t-SNE',
                 hover_name='Submission',
                 hover_data={'X':False, 'Y':False, 'Cluster':False, 'Submission':False, 'Breakdown':True})

# There is one trace per cluster value that actually occurs in the data, so
# fig.data[i] is not guaranteed to be topic i+1 (or to exist at all) when some
# topic is dominant for no submission. Look colours up by trace name instead.
trace_colors = {trace.name: trace.marker.color for trace in fig.data}

# Keyword legend under the plot: one line per topic, in that topic's marker colour.
for i, keyword in enumerate(df_topics['Keywords']):
  fig.add_annotation(
    x=0,
    y=-0.2*(i/5)-0.08,  # paper coordinates; each line sits 0.04 below the previous one
    text="Topic %d: %s"%(i+1, keyword.replace(' ', ', ')),
    showarrow=False,
    xref='paper',
    yref='paper',
    align='left',
    # Fall back to white (readable on the dark background) for topics with no trace.
    font=dict(color=trace_colors.get(str(i+1), 'white'))
  )

fig.update_layout(height=710,
                  xaxis_title='', yaxis_title='',
                  margin=dict(b=200),  # bottom margin leaves room for the keyword lines
                  paper_bgcolor='#2c3e50',
                  title=dict(font=dict(color='white')),
                  legend=dict(title="Topic", font=dict(color='white')))

# If the figure fails to render in the notebook: `pip install nbformat`,
# then restart the kernel.
fig.show()