In [6]:
import pandas as pd
from bertopic import BERTopic
from umap import UMAP
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from stopwordsiso import stopwords

import warnings
warnings.simplefilter("ignore")

In [3]:
path = "../data/glassdoor_reviews_clean.csv"
df = pd.read_csv(path, on_bad_lines="skip")
pros = df[['review_pros', 'len_review_pros']].dropna().query('len_review_pros <= 40').sample(50000, random_state=123)
cons = df[['review_cons', 'len_review_cons']].dropna().query('len_review_cons <= 40').sample(50000, random_state=123)

In [6]:
pros_docs = pros['review_pros'].tolist()
cons_docs = cons['review_cons'].tolist()

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
pros_embeddings = embedding_model.encode(pros_docs, show_progress_bar=True, convert_to_numpy=True)
cons_embeddings = embedding_model.encode(cons_docs, show_progress_bar=True, convert_to_numpy=True)

Batches: 100%|██████████| 1563/1563 [00:08<00:00, 185.38it/s]
Batches: 100%|██████████| 1563/1563 [00:06<00:00, 236.31it/s]


## Pros

In [None]:
pros_model = BERTopic(verbose=True,
                      min_topic_size=100, 
                      calculate_probabilities=True,
                      embedding_model=embedding_model,
                      vectorizer_model=CountVectorizer(stop_words=list(stopwords('en'))),
                      umap_model=UMAP(random_state=123)),

topics_p, probs_p = pros_model.fit_transform(pros_docs, pros_embeddings)

topics_p = pros_model.reduce_outliers(pros_docs, topics=topics_p, probabilities=probs_p, strategy="embeddings")
pros_model.update_topics(pros_docs, topics=topics_p)

In [36]:
for row in pros_model.get_topic_info().iterrows():
    print(f"Topic: {row[1]['Name']}, N= {row[1]['Count']}")

Topic: 0_culture_work_great_and, N= 2468
Topic: 1_flexible_schedule_hours_scheduling, N= 2544
Topic: 2_team_teams_great_teamwork, N= 2008
Topic: 3_environment_atmosphere_friendly_fun, N= 2060
Topic: 4_free_food_lunch_coffee, N= 1597
Topic: 5_balance_life_work_good, N= 1629
Topic: 6_place_work_to_great, N= 1833
Topic: 7_coworkers_workers_co_nice, N= 1639
Topic: 8_pay_decent_salary_good, N= 1924
Topic: 9_students_campus_teachers_faculty, N= 994
Topic: 10_none_nothing_think_this, N= 787
Topic: 11_discount_discounts_employee_dress, N= 823
Topic: 12_pros_pro_no_there, N= 669
Topic: 13_technology_projects_technologies_tech, N= 1107
Topic: 14_patients_patient_care_hospital, N= 681
Topic: 15_management_managers_manager_is, N= 1123
Topic: 16_easy_job_you_it, N= 976
Topic: 17_home_from_work_working, N= 585
Topic: 18_staff_friendly_colleagues_very, N= 751
Topic: 19_company_for_great_to, N= 1926
Topic: 20_customers_customer_service_store, N= 672
Topic: 21_pto_unlimited_benefits_generous, N= 432
To

## Cons

In [None]:
cons_model = BERTopic(verbose=True,
                      min_topic_size=170, 
                      calculate_probabilities=True,
                      embedding_model=embedding_model)

topics_c, probs_c = cons_model.fit_transform(cons_docs, cons_embeddings)

topics_c = cons_model.reduce_outliers(cons_docs, topics=topics_c, probabilities=probs_c, strategy="embeddings")
cons_model.update_topics(cons_docs, topics=topics_c)

In [41]:
for row in cons_model.get_topic_info().iterrows():
    print(f"Topic: {row[1]['Name']}, N= {row[1]['Count']}")

Topic: 0_pay_low_salary_is, N= 5655
Topic: 1_none_nothing_say_think, N= 4150
Topic: 2_work_can_stressful_be, N= 3186
Topic: 3_cons_any_no_there, N= 1678
Topic: 4_management_managers_employees_and, N= 2880
Topic: 5_balance_life_work_no, N= 1267
Topic: 6_hours_long_time_schedule, N= 2388
Topic: 7_cons_con_no_this, N= 1183
Topic: 8_customers_rude_customer_store, N= 1648
Topic: 9_company_the_growing_to, N= 2263
Topic: 10_benefits_insurance_health_no, N= 1202
Topic: 11_training_to_you_learn, N= 1054
Topic: 12_technology_processes_systems_slow, N= 1177
Topic: 13_career_advancement_opportunities_growth, N= 1219
Topic: 14_poor_management_lack_pay, N= 2088
Topic: 15_shifts_shift_night_hours, N= 679
Topic: 16_promotion_promotions_promoted_get, N= 688
Topic: 17_students_school_teachers_the, N= 690
Topic: 18_leadership_senior_leaders_lack, N= 1007
Topic: 19_culture_and_diversity_the, N= 632
Topic: 20_patient_patients_hospital_care, N= 600
Topic: 21_politics_bureaucracy_political_too, N= 986
Topic:

# Indeed Reviews

In [None]:
path_indeed = "../data/indeed_reviews_clean.csv"
df_indeed = pd.read_csv(path_indeed, on_bad_lines="skip")
reviews_indeed = df_indeed[['main_review', 'len_main_review']].dropna().query('len_main_review <= 40').sample(50000, random_state=123)

In [None]:
docs_indeed = reviews_indeed['main_review'].tolist()

embeddings_indeed = embedding_model.encode(docs_indeed, show_progress_bar=True, convert_to_numpy=True)

In [None]:
indeed_model = BERTopic(verbose=True,
                        min_topic_size=100,
                        calculate_probabilities=True,
                        embedding_model=embedding_model,
                        vectorizer_model=CountVectorizer(stop_words=list(stopwords('en'))),
                        umap_model=UMAP(random_state=123))

topics_indeed, probs_indeed = indeed_model.fit_transform(docs_indeed, embeddings_indeed)

topics_indeed = indeed_model.reduce_outliers(docs_indeed, topics=topics_indeed, probabilities=probs_indeed, strategy="embeddings")
indeed_model.update_topics(docs_indeed, topics=topics_indeed)

In [None]:
for row in indeed_model.get_topic_info().iterrows():
    print(f"Topic: {row[1]['Name']}, N= {row[1]['Count']}")