In [7]:
import pandas as pd
from bertopic import BERTopic
from umap import UMAP
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import HDBSCAN
from stopwordsiso import stopwords

import warnings
warnings.simplefilter("ignore")

# Glassdoor

In [18]:
path = "../data/glassdoor_reviews_clean.csv"
df = pd.read_csv(path, on_bad_lines="skip")
pros = df[['review_pros', 'len_review_pros']].dropna().query('len_review_pros <= 40').sample(50000, random_state=123)
cons = df[['review_cons', 'len_review_cons']].dropna().query('len_review_cons <= 40').sample(50000, random_state=123)

In [19]:
pros_docs = pros['review_pros'].tolist()
cons_docs = cons['review_cons'].tolist()

embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device="cuda")
pros_embeddings = embedding_model.encode(pros_docs, show_progress_bar=True, convert_to_numpy=True)
cons_embeddings = embedding_model.encode(cons_docs, show_progress_bar=True, convert_to_numpy=True)

Batches: 100%|██████████| 1563/1563 [00:07<00:00, 203.59it/s]
Batches: 100%|██████████| 1563/1563 [00:07<00:00, 211.39it/s]


## Pros

In [22]:
pros_model = BERTopic(verbose=True,
                      min_topic_size=100, 
                      calculate_probabilities=True,
                      embedding_model=embedding_model,
                      vectorizer_model=CountVectorizer(stop_words=list(stopwords('en'))),
                      umap_model=UMAP(random_state=123))

topics_p, probs_p = pros_model.fit_transform(pros_docs, pros_embeddings)

topics_p = pros_model.reduce_outliers(pros_docs, topics=topics_p, probabilities=probs_p, strategy="embeddings")
pros_model.update_topics(pros_docs, topics=topics_p)

2026-02-10 14:25:35,041 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-10 14:26:11,745 - BERTopic - Dimensionality - Completed ✓
2026-02-10 14:26:11,748 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-10 14:26:32,493 - BERTopic - Cluster - Completed ✓
2026-02-10 14:26:32,503 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-02-10 14:26:32,909 - BERTopic - Representation - Completed ✓


In [23]:
for row in pros_model.get_topic_info().iterrows():
    print(f"Topic: {row[1]['Name']}, N= {row[1]['Count']}")

Topic: 0_pay_benefits_good_salary, N= 6403
Topic: 1_environment_atmosphere_friendly_fun, N= 2670
Topic: 2_free_food_lunch_coffee, N= 1718
Topic: 3_balance_life_work_good, N= 1600
Topic: 4_culture_great_company_and, N= 1592
Topic: 5_team_teams_great_and, N= 1883
Topic: 6_hours_flexible_schedule_time, N= 1368
Topic: 7_training_opportunities_career_growth, N= 1734
Topic: 8_none_nothing_think_any, N= 828
Topic: 9_management_clients_managers_manager, N= 1366
Topic: 10_coworkers_workers_co_with, N= 1683
Topic: 11_pros_no_pro_there, N= 675
Topic: 12_technology_projects_technologies_to, N= 1328
Topic: 13_students_campus_faculty_teachers, N= 930
Topic: 14_place_location_work_to, N= 1196
Topic: 15_discount_discounts_employee_on, N= 719
Topic: 16_home_from_work_working, N= 599
Topic: 17_patients_patient_care_hospital, N= 679
Topic: 18_learning_learn_experience_lot, N= 1086
Topic: 19_schedule_flexible_scheduling_your, N= 885
Topic: 20_company_for_great_to, N= 1568
Topic: 21_pto_unlimited_benefits_

## Cons

In [25]:
cons_model = BERTopic(verbose=True,
                      min_topic_size=170, 
                      calculate_probabilities=True,
                      embedding_model=embedding_model)

topics_c, probs_c = cons_model.fit_transform(cons_docs, cons_embeddings)

topics_c = cons_model.reduce_outliers(cons_docs, topics=topics_c, probabilities=probs_c, strategy="embeddings")
cons_model.update_topics(cons_docs, topics=topics_c)

2026-02-10 14:28:59,086 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-10 14:29:25,532 - BERTopic - Dimensionality - Completed ✓
2026-02-10 14:29:25,536 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-10 14:29:34,780 - BERTopic - Cluster - Completed ✓
2026-02-10 14:29:34,790 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-02-10 14:29:35,192 - BERTopic - Representation - Completed ✓


In [27]:
for row in cons_model.get_topic_info().iterrows():
    print(f"Topic: {row[1]['Name']}, N= {row[1]['Count']}")

Topic: 0_pay_low_salary_is, N= 5535
Topic: 1_none_nothing_say_think, N= 4055
Topic: 2_cons_no_any_con, N= 3151
Topic: 3_work_can_stressful_be, N= 3037
Topic: 4_management_managers_employees_and, N= 3085
Topic: 5_balance_life_work_no, N= 1265
Topic: 6_hours_long_time_schedule, N= 2415
Topic: 7_benefits_insurance_health_no, N= 1168
Topic: 8_company_the_to_growing, N= 2233
Topic: 9_parking_commute_location_traffic, N= 1066
Topic: 10_customers_rude_customer_service, N= 1452
Topic: 11_training_learn_to_you, N= 983
Topic: 12_culture_toxic_environment_and, N= 1066
Topic: 13_career_advancement_opportunities_growth, N= 1192
Topic: 14_technology_processes_systems_slow, N= 1224
Topic: 15_promotion_promotions_promoted_get, N= 668
Topic: 16_shifts_shift_night_hours, N= 667
Topic: 17_students_school_teachers_the, N= 680
Topic: 18_communication_team_teams_departments, N= 1079
Topic: 19_poor_management_lack_pay, N= 2162
Topic: 20_leadership_senior_leaders_lack, N= 974
Topic: 21_hot_cold_season_weather

# Indeed Reviews

In [43]:
path_indeed = "../data/indeed_reviews_clean.csv"
df_indeed = pd.read_csv(path_indeed, on_bad_lines="skip")
reviews_indeed = df_indeed[['main_review', 'len_main_review']].dropna().query('len_main_review <= 150').sample(100000, random_state=42)

In [44]:
docs_indeed = reviews_indeed['main_review'].tolist()
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')
embeddings_indeed = embedding_model.encode(docs_indeed, show_progress_bar=True, convert_to_numpy=True)

Batches: 100%|██████████| 3125/3125 [00:28<00:00, 109.85it/s]


In [45]:
indeed_model = BERTopic(verbose=True,
                        calculate_probabilities=False,
                        embedding_model=embedding_model,
                        vectorizer_model=CountVectorizer(stop_words=list(stopwords('en'))),
                        umap_model=UMAP(random_state=123, n_jobs=16), 
                        hdbscan_model = HDBSCAN(min_cluster_size=90, n_jobs=16))

topics_indeed, probs_indeed = indeed_model.fit_transform(docs_indeed, embeddings_indeed)

topics_indeed = indeed_model.reduce_outliers(docs_indeed, topics=topics_indeed, probabilities=probs_indeed, strategy="embeddings")
indeed_model.update_topics(docs_indeed, topics=topics_indeed)

2026-02-10 14:51:12,694 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-10 14:52:43,984 - BERTopic - Dimensionality - Completed ✓
2026-02-10 14:52:43,988 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-10 14:53:41,578 - BERTopic - Cluster - Completed ✓
2026-02-10 14:53:41,597 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-02-10 14:53:44,070 - BERTopic - Representation - Completed ✓


In [46]:
for row in indeed_model.get_topic_info().iterrows():
    print(f"Topic: {row[1]['Name']}, N= {row[1]['Count']}")

Topic: 0_to_and_the_work, N= 60241
Topic: 1_army_military_the_in, N= 6104
Topic: 2_patients_hospital_patient_care, N= 4571
Topic: 3_what_company_at_like, N= 1710
Topic: 4_food_restaurant_the_and, N= 2228
Topic: 5_children_students_kids_school, N= 1606
Topic: 6_guard_national_army_the, N= 998
Topic: 7_air_force_usaf_the, N= 822
Topic: 8_navy_the_in_of, N= 821
Topic: 9_call_calls_center_phone, N= 1277
Topic: 10_walmart_mart_wal_store, N= 734
Topic: 11_bank_banking_financial_and, N= 908
Topic: 12_driver_drivers_you_driving, N= 968
Topic: 13_hotel_guests_hilton_guest, N= 724
Topic: 14_amazon_apple_is_the, N= 603
Topic: 15_de_la_que_trabajo, N= 407
Topic: 16_ups_you_is_to, N= 382
Topic: 17_marine_corps_marines_the, N= 412
Topic: 18_pharmacy_pharmacist_prescriptions_pharmacists, N= 494
Topic: 19_dealership_car_mercedes_cars, N= 609
Topic: 20_mcdonald_mcdonalds_mcdonaldâ_at, N= 425
Topic: 21_target_robert_to_they, N= 288
Topic: 22_disney_walt_cast_disneyland, N= 320
Topic: 23_cvs_to_the_at, N

# Schedule sample

In [47]:
d_pro = pd.DataFrame({"doc": pros_docs, "type": (["p"]*50000), "topic": topics_p, 
                      "schedule": [1 if t in [6, 19, 21, 29, 31, 41, 47] else 0 for t in topics_p]})
d_con = pd.DataFrame({"doc": cons_docs, "type": (["c"]*50000), "topic": topics_c, 
                      "schedule": [1 if t in [5, 6, 28, 29, 30] else 0 for t in topics_c]})
d = pd.concat([d_pro, d_con]).reset_index().drop(columns="index")

In [50]:
d.to_csv("../data/labelled_docs.csv")