In [None]:
import pandas as pd
from bertopic import BERTopic
from umap import UMAP
from sklearn.cluster import HDBSCAN
from sentence_transformers import SentenceTransformer
# from gensim.utils import simple_preprocess
# from gensim.corpora import Dictionary
# from gensim.models.coherencemodel import CoherenceModel
import numpy as np

In [47]:
def topic_size_gini(topic_info):
    """
    Gini coefficient of the document counts of all non-outlier topics.

    Parameters
    ----------
    topic_info : DataFrame with a "Topic" column (where -1 marks the outlier
        topic) and a "Count" column holding the size of each topic.

    Returns
    -------
    The Gini coefficient of the topic sizes, or NaN when there is no
    non-outlier topic at all.
    """
    is_real_topic = topic_info["Topic"] != -1
    counts = topic_info.loc[is_real_topic, "Count"].values

    n_topics = len(counts)
    if n_topics == 0:
        return np.nan

    # Running totals of the ascending sizes (an unnormalised Lorenz curve);
    # the last entry is the total number of documents in real topics.
    running_totals = np.cumsum(np.sort(counts))
    total = running_totals[-1]

    return (n_topics + 1 - 2 * np.sum(running_totals) / total) / n_topics

In [None]:
def setup_model(min_cluster_size=50,
                n_neighbors=15,
                n_components=5,
                min_dist=0.1,
                random_state=123):
    """
    Build an unfitted BERTopic model from the given hyper-parameters.

    Parameters
    ----------
    min_cluster_size : forwarded to HDBSCAN (clustering step).
    n_neighbors, n_components, min_dist, random_state : forwarded to UMAP
        (dimensionality-reduction step).

    Both sub-models are configured with the cosine metric. The returned
    BERTopic instance has probability calculation and verbose logging
    switched off.
    """
    # Dimensionality reduction applied before clustering.
    reducer = UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        min_dist=min_dist,
        metric="cosine",
        random_state=random_state,
    )

    # Clustering of the reduced embeddings.
    clusterer = HDBSCAN(min_cluster_size=min_cluster_size, metric="cosine")

    return BERTopic(
        umap_model=reducer,
        hdbscan_model=clusterer,
        calculate_probabilities=False,
        verbose=False,
    )

def topic_diversity(topic_model, top_k=10):
    """
    Calculate the topic diversity of a BERTopic model. Higher is better.

    Diversity is the share of unique words among the top ``top_k`` words of
    every non-outlier topic (topic id -1 is skipped).

    Parameters
    ----------
    topic_model : object whose ``get_topics()`` returns a mapping from topic
        id to a list of ``(word, score)`` pairs.
    top_k : number of leading words taken from each topic.

    Returns
    -------
    A float in (0, 1], or NaN when there are no words to compare (for example
    when every document was assigned to the outlier topic).
    """
    words = [
        word
        for topic_id, topic_words in topic_model.get_topics().items()
        if topic_id != -1
        for word, _ in topic_words[:top_k]
    ]

    # Without any non-outlier topic the ratio is undefined; return NaN instead
    # of raising ZeroDivisionError so a parameter sweep can keep running.
    if not words:
        return np.nan

    return len(set(words)) / len(words)

def topic_coherence(topic_model, tokenized_docs, top_k=10):
    """
    Calculate the topic coherence of a BERTopic model. Higher is better.

    Parameters
    ----------
    topic_model : object whose ``get_topics()`` returns a mapping from topic
        id to a list of ``(word, score)`` pairs.
    tokenized_docs : the tokenised corpus, passed both to gensim's
        ``Dictionary`` and as ``texts`` to ``CoherenceModel``.
    top_k : number of leading words taken from each topic.

    Returns
    -------
    The value of ``CoherenceModel.get_coherence()`` using the "c_v" measure,
    computed over all topics except the outlier topic (-1).

    Raises
    ------
    ImportError if gensim is not installed.
    """
    # The module-level gensim imports in this notebook are commented out, so
    # Dictionary / CoherenceModel would otherwise be undefined here (NameError).
    # Importing locally keeps the rest of the notebook runnable without gensim.
    from gensim.corpora import Dictionary
    from gensim.models.coherencemodel import CoherenceModel

    topics = topic_model.get_topics()

    # Top-k words per topic, leaving out the outlier topic.
    topic_words = [[w for w, _ in words[:top_k]] for t_id, words in topics.items() if t_id != -1]

    dictionary = Dictionary(tokenized_docs)

    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokenized_docs,
                                     dictionary=dictionary,
                                     coherence="c_v")

    return coherence_model.get_coherence()


def fit_diagnose(topic_model, docs, embeddings, tokenized_docs):
    """
    Fit ``topic_model`` on ``docs`` with precomputed ``embeddings`` and
    summarise the fit.

    Returns a dict with:
      - "n_topics": number of topics other than the outlier topic (-1)
      - "outlier_rate": outlier-topic count divided by ``len(docs)``
        (0.0 when there is no outlier topic)
      - "topic_coherence_cv": result of ``topic_coherence``
      - "topic_diversity": result of ``topic_diversity``
    """
    # Only the fitting side effect is needed; the returned values are unused.
    _assignments, _ = topic_model.fit_transform(docs, embeddings)
    topic_freq = topic_model.get_topic_freq()

    is_outlier = topic_freq["Topic"] == -1
    outlier_counts = topic_freq.loc[is_outlier, "Count"]
    if len(outlier_counts):
        outlier_rate = outlier_counts.iloc[0] / len(docs)
    else:
        outlier_rate = 0.0

    n_topics = len(topic_freq.loc[~is_outlier])
    coherence = topic_coherence(topic_model, tokenized_docs)
    diversity = topic_diversity(topic_model)

    return {
        "n_topics": n_topics,
        "outlier_rate": outlier_rate,
        "topic_coherence_cv": coherence,
        "topic_diversity": diversity,
    }

In [10]:
# Read the reviews CSV (malformed lines are skipped rather than raising) and
# keep a random sample of 50,000 rows; random_state fixes the sample.
path = "../data/glassdoor_reviews_clean.csv"
df = pd.read_csv(path, on_bad_lines="skip").sample(50000, random_state=123)
# Show one row of the sample as a quick check of the columns.
df.head(1)

Unnamed: 0,reviewID,date,rating_overall,rating_business_outlook,rating_worklife_balance,rating_culture_values,rating_diversity_inclusion,rating_leadership,rating_recommend_friend,rating_career_opportunity,...,review_advice,count_helpful,count_nothelpful,language,companyID,companyname,len_review_pros,len_review_cons,len_review_summary,len_review_advice
5503668,36962338,2020-10-09,4,,0.0,0,0,0.0,,0.0,...,,0.0,0.0,eng,307500.0,Kohl's,14.0,9.0,1.0,


In [15]:
# Documents to model: the non-null "review_pros" texts of the sampled reviews.
pros_docs = df['review_pros'].dropna().tolist()
# device=None lets SentenceTransformer pick the device itself (a GPU when one
# is available, otherwise CPU), so this cell also runs on machines without
# CUDA instead of failing on a hard-coded 'cuda'.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=None)
# Embeddings are computed once here and reused for every model fit below.
embeddings = embedding_model.encode(pros_docs, show_progress_bar=True, convert_to_numpy=True)

Batches: 100%|██████████| 1563/1563 [00:16<00:00, 97.15it/s] 


In [None]:
# Sweep over UMAP neighbourhood size and BERTopic minimum topic size. For each
# combination, fit a model on the precomputed embeddings, keep its topic-info
# table in `dfs`, and print the topic count, outlier rate and size Gini.
dfs = []

for neighbors in [50, 100, 150, 200]:
    for size in [50, 100, 200]:
        # NOTE(review): only n_neighbors and random_state are set here, so the
        # remaining UMAP settings are library defaults and differ from the
        # ones setup_model() uses — confirm this is intended.
        umap_model = UMAP(n_neighbors=neighbors, random_state=123)

        model = BERTopic(
            min_topic_size=size,
            umap_model=umap_model
        )
        model.fit(pros_docs, embeddings)

        topic_info = model.get_topic_info()
        dfs.append(topic_info)

        # The outlier row (Topic == -1) is absent when every document was
        # assigned to a topic; treat that as a rate of 0.0 instead of letting
        # .iloc[0] raise IndexError.
        outlier_counts = topic_info.loc[topic_info.Topic == -1, "Count"]
        outlier_rate = outlier_counts.iloc[0] / len(pros_docs) if len(outlier_counts) else 0.0

        # Count real topics directly rather than assuming exactly one outlier
        # row is present (shape[0] - 1 is off by one when there is none).
        n_topics = int((topic_info.Topic != -1).sum())

        gini = topic_size_gini(topic_info)

        print(f"UMAP neighbors: {neighbors}, Min topic size: {size}")
        print(f"Num_topics: {n_topics}, "f"Outlier %: {outlier_rate:.3f}, "f"Gini: {gini:.3f}")
        print("---------------------")