In [None]:
!pip install bertopic
!pip install sentence_transformers
!pip install hdbscan
!pip install umap-learn

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB

In [None]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance
import numpy as np

# ------------------ Data Loading ------------------
# Read the conversations table from the working directory.
# NOTE(review): presumably the synthetic automotive conversations dataset -- confirm the file's origin.
df = pd.read_csv("df.csv")

# One document per row of the "merged" column; rows where it is missing are dropped.
documents = (
    df["merged"]
    .dropna()
    .tolist()
)

In [None]:
# ------------------ Custom Vectorizer ------------------
# Build the vectorizer used for the topic representations: generic English stop
# words AND the domain-specific filler below are removed, and both unigrams and
# bigrams are counted.
# Imported here so this cell is self-contained; it can be moved to the notebook's import cell.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Domain-specific words that carry no topical signal in these conversations.
# Stop words are matched against single tokens (before n-grams are built), so
# every entry must be one word: the phrase "heavy traffic" is listed as its two
# component words, which also removes the bigram.
custom_stop_words = [
    "you", "your", "to", "im", "our", "it", "can", "the", "and", "a", "on",
    "salesman", "customer", "example", "platforms", "multiple", "that", "of",
    "me", "in", "with", "for", "this", "about", "we", "any", "new",
    "have", "is", "how", "more", "what", "specific", "provide",
    "agent", "feel", "especially", "checked", "service", "appointment",
    "sorry", "hear", "soon", "commute", "heavy", "traffic", "tell", "noticed",
]

# Merge with scikit-learn's built-in English list. Passing only "english" would
# silently discard the domain list above. CountVectorizer expects a list (not a
# set), and sorting keeps the value deterministic across runs.
stop_words = sorted(ENGLISH_STOP_WORDS.union(custom_stop_words))
vectorizer_model = CountVectorizer(stop_words=stop_words, ngram_range=(1, 2))

In [None]:
# ------------------ Create & Fit BERTopic Model ------------------
# Representation model handed to BERTopic below (Maximal Marginal Relevance, diversity=0.6).
mmr = MaximalMarginalRelevance(diversity=0.6)

# Assemble the topic model around the custom vectorizer and the MMR representation.
topic_model = BERTopic(
    language="english",
    top_n_words=10,
    min_topic_size=35,
    vectorizer_model=vectorizer_model,
    representation_model=mmr,
    # `probs` is indexed per document by the outlier-redistribution step later on.
    calculate_probabilities=True,
    verbose=True,
)

# NOTE(review): no random seed is fixed in this cell; if any stage of the
# pipeline is stochastic, topics may differ between runs -- confirm.
# Fit on the conversations and keep the per-document topic ids and probabilities.
topics, probs = topic_model.fit_transform(documents)

2025-02-19 16:03:01,675 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

2025-02-19 16:04:15,786 - BERTopic - Embedding - Completed ✓
2025-02-19 16:04:15,789 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-02-19 16:04:20,478 - BERTopic - Dimensionality - Completed ✓
2025-02-19 16:04:20,481 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-02-19 16:04:20,599 - BERTopic - Cluster - Completed ✓
2025-02-19 16:04:20,608 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-02-19 16:04:22,183 - BERTopic - Representation - Completed ✓


In [None]:
# ------------------ Topic Information ------------------
# Fetch the per-topic summary table from the fitted model and display it.
# In the recorded output the table has the columns Topic, Count, Name,
# Representation and Representative_Docs; topic -1 is the outlier bucket.
topic_info = topic_model.get_topic_info()
topic_info


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,107,-1_agent_especially_brakes_customer noticed,"[agent, especially, brakes, customer noticed, ...",[Customer: I'm having trouble with my brakes. ...
1,0,608,0_agent_especially_checked customer_feel quite,"[agent, especially, checked customer, feel qui...",[Customer: I'm having trouble with my brakes. ...
2,1,74,1_gear_brakes_noticed_customer,"[gear, brakes, noticed, customer, having troub...",[Customer: I'm having trouble with my brakes. ...
3,2,69,2_agent_gear_sorry hear_soon possible,"[agent, gear, sorry hear, soon possible, espec...",[Customer: I'm having trouble with my gear. La...
4,3,60,3_gear_brakes_noticed_customer,"[gear, brakes, noticed, customer, having troub...",[Customer: I'm having trouble with my gear. La...
5,4,44,4_mobile_agent_brakes_emergency,"[mobile, agent, brakes, emergency, heavy traff...",[Customer: I'm having trouble with my mobile. ...
6,5,38,5_noticed_agent_clutch_daily,"[noticed, agent, clutch, daily, especially hea...",[Customer: I'm having trouble with my gear. La...


In [None]:
# @title Fine-tuning the model: reassigning outlier documents


In [None]:
# Reassign outlier documents (topic -1) to their most probable topic.
# An outlier whose probability row sums to zero has no candidate topic and
# stays at -1; every non-outlier document keeps its original topic.
redistributed_topics = [
    int(np.argmax(doc_probs)) if topic == -1 and np.sum(doc_probs) != 0 else topic
    for topic, doc_probs in zip(topics, probs)
]

# Pair each document with its (possibly reassigned) topic and persist the table.
doc_topic_df = pd.DataFrame({"document": documents, "topic": redistributed_topics})
doc_topic_df.to_csv("document_topics_redistributed.csv", index=False)

# Document count per topic after redistribution, most frequent topic first.
topic_counts = (
    doc_topic_df["topic"]
    .value_counts()
    .rename_axis("Topic")
    .reset_index(name="Count")
)

In [None]:
# Display the per-topic document counts computed from the redistributed assignments.
topic_counts

Unnamed: 0,Topic,Count
0,0,641
1,3,94
2,2,88
3,1,75
4,4,64
5,5,38


In [None]:
# Label every document with the name and keyword representation of its
# (reassigned) topic. The left join keeps all documents, even if a topic id
# has no matching row in `topic_info`.
document_topics_redistributed = doc_topic_df.merge(
    topic_info,
    how="left",
    left_on="topic",
    right_on="Topic",
)[["document", "topic", "Name", "Representation"]]