In [1]:
import os
import json
from bertopic import BERTopic

from umap import UMAP
from sklearn.cluster import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

In [2]:
superseg_path = '/home/tim/repos/dynamic-topic-modeling/datasets/superseg/segmentation_file_train.json'

In [3]:
with open(superseg_path, 'r') as f:
    data = json.load(f)

In [4]:
dialogues = data['dial_data']['superseg-v2']
dialogues[0]

{'dial_id': '0c0dd5a4a1dfb23135eec6b77bca2fd5',
 'turns': [{'da': 'query_condition',
   'role': 'user',
   'turn_id': 1,
   'utterance': "I'm looking for information regarding benefits planning, can you help me?",
   'topic_id': 0,
   'segmentation_label': 0},
  {'da': 'query_condition',
   'role': 'agent',
   'turn_id': 2,
   'utterance': 'Are you planning for your own future?',
   'topic_id': 0,
   'segmentation_label': 0},
  {'da': 'response_positive',
   'role': 'user',
   'turn_id': 3,
   'utterance': 'Yes I am',
   'topic_id': 0,
   'segmentation_label': 0},
  {'da': 'respond_solution',
   'role': 'agent',
   'turn_id': 4,
   'utterance': 'You need to think about what your family would do if you died now. Social Security can help your family, provided you have earned enough Social Security credits through your work',
   'topic_id': 0,
   'segmentation_label': 0},
  {'da': 'query_solution',
   'role': 'user',
   'turn_id': 5,
   'utterance': 'What are Social Security credits?',
  

In [5]:
dialogs_corpus = []

for dialog in dialogues:
    dialog_segemnts = []
    current_topic_id = 0
    current_topic_utters = ''
    for utter in dialog['turns']:
        if utter['topic_id'] == current_topic_id:
            current_topic_utters = current_topic_utters + ' ' + utter['utterance']
        else:
            current_topic_id = utter['topic_id']
            dialog_segemnts.append(current_topic_utters.strip())
            current_topic_utters = utter['utterance']
    dialogs_corpus.extend(dialog_segemnts)

In [14]:
dialogs_corpus[3]

'Can you help me with planing the benefits for my survivors? Are you planning for the future of your survivors? Not right now. But I want to know if my family would get any help from Social Security after my passing. yes, Social Security can help your family if you have earned enough credits through your work.'

In [6]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='eom')
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer()
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,          # Step 1 - Extract embeddings
    umap_model=umap_model,                    # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
)

In [7]:
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(dialogs_corpus)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [9]:
topic_model.get_document_info(dialogs_corpus)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,I'm looking for information regarding benefits...,528,528_planning_future_die_enough,"[planning, future, die, enough, earned, surviv...",[I want to know about planning Social Security...,planning - future - die - enough - earned - su...,1.000000,False
1,What happens if the amount of spouse's benefit...,91,91_remarries_she_exhusband_widower,"[remarries, she, exhusband, widower, surviving...",[I want to know about the benefits plan for su...,remarries - she - exhusband - widower - surviv...,0.733269,False
2,What happens if I don't have enough credits bu...,158,158_deceaseds_255_onetime_living,"[deceaseds, 255, onetime, living, death, decea...",[is there a death benefit? A one-time payment...,deceaseds - 255 - onetime - living - death - d...,0.075124,False
3,Can you help me with planing the benefits for ...,528,528_planning_future_die_enough,"[planning, future, die, enough, earned, surviv...",[I want to know about planning Social Security...,planning - future - die - enough - earned - su...,0.800237,False
4,"And what about my widow, what benefits would s...",187,187_widow_widower_widows_she,"[widow, widower, widows, she, full, age, 60, r...",[What happens to the benefit if you are a wido...,widow - widower - widows - she - full - age - ...,0.930628,False
...,...,...,...,...,...,...,...,...
23462,"By the way, who can I contact to give me infor...",-1,-1_benefit_that_it_the,"[benefit, that, it, the, if, what, is, call, h...","[Hi, good afternoon. Look, i'm interested on s...",benefit - that - it - the - if - what - is - c...,0.000000,False
23463,What if I've fallen behind on one or more loan...,-1,-1_benefit_that_it_the,"[benefit, that, it, the, if, what, is, call, h...","[Hi, good afternoon. Look, i'm interested on s...",benefit - that - it - the - if - what - is - c...,0.000000,False
23464,I have another question regarding the Military...,-1,-1_benefit_that_it_the,"[benefit, that, it, the, if, what, is, call, h...","[Hi, good afternoon. Look, i'm interested on s...",benefit - that - it - the - if - what - is - c...,0.000000,False
23465,something else I want to ask about FAFSA. What...,264,264_submissions_high_lower_fafsa,"[submissions, high, lower, fafsa, low, complet...",[Is the number that is displayed lower than yo...,submissions - high - lower - fafsa - low - com...,0.276179,False
