In [1]:
import pandas as pd

from umap import UMAP  #from umap.umap_ import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

import ast

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# #pip install umap
# !pip install umap-learn
# !pip install hdbscan 
# !pip install sentence-transformers 
# !pip install scikit-learn 
# !pip install bertopic 
# !pip install gensim
# !pip install notebook matplotlib plotly


In [3]:
# Read the preprocessed data
data = pd.read_csv('a:/df_cleaned.csv')

# Display the first few rows of the dataframe
data.head()

# Convert the dataframe to a list of strings
docs = data['tokens'].tolist()
docs

["['dhr', 'aj', 'dingemans', 'huisarts', 'streetnaam', 'city', 'datum', 'Kenmerk', 'patientid', 'bsn', 'bsn', 'betreffen', 'mevrouw', 'initials', 'lastname', 'geb', 'birthdate', 'streetnaam', 'zip', 'city', 'tel', 'phonenumber', 'geacht', 'collega', 'bovengenoemde', 'patiënte', 'opnemen', 'afdeling', 'maag', 'darm', 'leverziekt', 'verband', 'melaena', 'rectaal', 'bloedverlie', 'voorgeschiedenis', 'diep', 'veneuaz', 'trombose', 'longembolie', 'cholecystectomie', 'diverticulitis', 'atriumfibrilleren', 'spontaan', 'conversie', 'sinusritme', 'melena', 'waarvoor', 'verklaring', 'vinden', 'verband', 'stabiel', 'hb', 'overleg', 'patiënt', 'expectatief', 'beleid', 'vermoeidheid', 'sinusbradycardie', 'waarvoor', 'stop', 'metoprolol', 'tambocor', 'anamnees', 'vanmiddag', 'fors', 'Helderrood', 'bloedverlie', 'stolsel', 'vermengen', 'ontlasting', 'zwart', 'kleur', 'dag', 'zeuren', 'pijn', 'bovenbuik', 'maagpijn', 'waarvoor', 'stoppen', 'koffie', 'drinken', 'vet', 'eten', 'dag', 'ontlasting', 'inta

In [4]:
data['text'] = data['tokens'].apply(lambda tokens: ''.join(tokens))    
print(data['text'].head())

data['tokens'] = data['tokens'].apply(ast.literal_eval)

texts = data['text'].tolist() 

remove_list = ['mg', 'x', 'per', 'dag', 'samenvatting', 'beleid', 'conclusie', 'mmolL', 'waarvoor', 'goed', 'wel', 'beloop', 
            'voorgeschiedenis', 'opdrachten', 'gehad', 'aanvullend', 'bekende', 'voltooid', 'mogelijk', 'gezien', 'city', 'bsn', 
            'nodig', 'firstname', 'streetname', 'lastname', 'postcode', 'anamnese',
            'dd', 'stuk', 'ivm', 'rechts', 'links', 'dr', 'sinds', 'huisarts', 'datum', 'dagen', 'min', 'extra', 'weken', 'algemeen', 
            'patiënte', 'overige','linker', 'week', 'accepteren', 'maanden', 'waarschijnlijk', 'reden', 'uur', 'verdenking', 'ontslag', 
            'stop', 'tijd', 'patiënt', 'onderzoek']

data['tokens'] = data['tokens'].apply(lambda tokens: [word for word in tokens if word not in remove_list])

data['text'] = data['tokens'].apply(lambda tokens: ' '.join(tokens))   
docs = data['text'].tolist() 

0    ['dhr', 'aj', 'dingemans', 'huisarts', 'street...
1    ['samenvatting', 'rectaal', 'bloedverlie', 'ob...
2    ['coloscopie', 'betreffen', 'mw', 'initials', ...
3    ['gastroscopie', 'betreffen', 'mw', 'initials'...
4    ['samenvatting', 'rectaal', 'bloedverlie', 'ee...
Name: text, dtype: object


In [5]:
# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# # embedding_model = SentenceTransformer("all-Mpnet-base-v2")
# embeddings = embedding_model.encode(docs, show_progress_bar=True)


embedding_model1 = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# embedding_model2 = SentenceTransformer("distiluse-base-multilingual-cased-v2")
# embedding_model3 = SentenceTransformer("KPN/bert-base-dutch-cased")
# embedding_model4 = SentenceTransformer("GroNLP/bert-base-dutch-cased")

embedding_model = embedding_model1
embeddings = embedding_model.encode(docs, show_progress_bar=True)


Batches: 100%|██████████| 300/300 [05:44<00:00,  1.15s/it]


In [6]:
import numpy as np
# Save the embeddings to a file
np.save('embeddings_model1.npy', embeddings)
# # Load the embeddings from the file
embeddings = np.load('embeddings_model1.npy')

In [7]:
def create_model(embeddings, docs, n_topics, n_words):
    
    print(f"Number of topics: {n_topics}")
    print(f"Number of words: {n_words}")
    
    # min_cluster_size = 10
    # Rule of thumb: min_cluster_size ≈ len(docs) / (n_topics * 2)
    min_cluster_size = 10 #len(docs) // (n_topics * 10)
    print(f"Min cluster size: {min_cluster_size}")
    n_neighbors = 15
    min_dist = 0.0
    min_df = 1
    ngram_range = (1, 1)
    umap_metric = 'cosine'
    hdbscan_metric = 'euclidean'
    n_components = 5

    from nltk.corpus import stopwords
    dutch_stopwords = stopwords.words('dutch')
    

    # Step 2 - Reduce dimensionality
    umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=min_dist, metric=umap_metric, random_state=42)

    # Step 3 - Cluster reduced embeddings
    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, metric=hdbscan_metric, cluster_selection_method='eom', prediction_data=True)

    # Step 4 - Tokenize topics
    # vectorizer_model = CountVectorizer(stop_words="english",  min_df=min_df, ngram_range=ngram_range)
    vectorizer_model = CountVectorizer(stop_words=dutch_stopwords,  min_df=min_df, ngram_range=ngram_range)

    # Step 5 - Create topic representation
    ctfidf_model = ClassTfidfTransformer()

    # Step 6 - (Optional) Fine-tune topic representations with 
    # a `bertopic.representation` model
    representation_model = KeyBERTInspired()

    # All steps together
    topic_model = BERTopic(
    embedding_model=embedding_model,            # Step 1 - Extract embeddings
    umap_model=umap_model,                      # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,                # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,          # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic representations
    
    # Hyperparameters
    top_n_words= n_words,
    verbose=True
    )    
    
    # Train model
    topic_model.fit_transform(docs, embeddings)
    
    return topic_model

In [8]:
def get_list_of_topics(topic_model):
    topics = topic_model.get_topics()
    # print(f"Type of topics: {type(topics)[0]}")
    # print(f"amount of topics: {len(topics.keys())}")
    
    
    topics_words = []
    len_topics = len(topics.keys())   

    # print('type of topics:', type(topics))
    # print('topics len:', len_topics)
    
    for i in range(len_topics):
        i -= 1
        topic = topic_model.get_topic(i)
        # print('type of topic:', type(topic))
        # print(f"Topic {i}: {topic}")
        topic_words = []

        
        for j in range(len(topic)):        
            word = topic[j][0]
            # print(f"Word: {word}")
            topic_words.append(word)
            # print(f"Topic Words: {topic_words}")
        topics_words.append(topic_words)
    print(f"Topics Words: {topics_words}")
    
    return topics_words

# topic_model = create_model(embeddings, docs, 10, 10)

# test = get_list_of_topics(topic_model)

In [9]:
import ast

def coherence_for_BERTopic(topic_model, docs):
    
    list_of_topics = get_list_of_topics(topic_model)
    

    # # Step 3: Calculate Coherence Score using Gensim
    # # Prepare texts as lists of words for coherence model
    # Convert each document to a list if it's a string representation of a list
    texts = [ast.literal_eval(doc) if isinstance(doc, str) else doc for doc in docs]
    # print('texts:', texts)

    
    # # Create a dictionary and corpus
    # dictionary = Dictionary(texts)
    # corpus = [dictionary.doc2bow(text) for text in texts]
    
    # Create a dictionary and corpus
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    # print('dictionary:', dictionary)


    # Choose a coherence measure: 'u_mass', 'c_v', 'c_uci', 'c_npmi'
    coherence_model = CoherenceModel(topics=list_of_topics, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    
    print(f"Coherence Score: {coherence_score}")

    return coherence_score

In [10]:
from itertools import product
import pandas as pd

def analyze_topic_variations(topic_sizes, word_counts, embeddings, docs): #, 
                            #  min_cluster_sizes, n_neighbors_values, min_dist_values, 
                            #  min_df_values, ngram_ranges, umap_metrics, hdbscan_metrics, 
                            #  n_components_values):
    """
    Analyze combinations of topic sizes, word counts, and model parameters, 
    returning their coherence scores.
    """
    results = []
    
    # Create all combinations of topic sizes, word counts, and model parameters
    combinations = list(product(topic_sizes, word_counts))
                                # , min_cluster_sizes, 
                                # n_neighbors_values, min_dist_values, min_df_values, 
                                # ngram_ranges, umap_metrics, hdbscan_metrics, n_components_values))
                                
    print("Combinations:", combinations)
    
    # for (n_topics, n_words, min_cluster_size, n_neighbors, min_dist, min_df, ngram_range, 
    #      umap_metric, hdbscan_metric, n_components) in combinations:
    for (n_topics, n_words) in combinations:
        print(f"\nAnalyzing model with target {n_topics} topics, {n_words} words per topic...") 
        
        # print(f"\nAnalyzing model with target {n_topics} topics, {n_words} words per topic, "
        #       f"min_cluster_size={min_cluster_size}, n_neighbors={n_neighbors}, min_dist={min_dist}, "
        #       f"min_df={min_df}, ngram_range={ngram_range}, umap_metric={umap_metric}, "
        #       f"hdbscan_metric={hdbscan_metric}, n_components={n_components}...")
        
        # Create and configure the model with the current parameters
        # model = create_model(embeddings, docs, n_topics, n_words, min_cluster_size, 
        #                      n_neighbors, min_dist, min_df, ngram_range, umap_metric, 
        #                      hdbscan_metric, n_components)
        
        model = create_model(embeddings, docs, n_topics, n_words)
        
        # Calculate coherence score
        coherence = coherence_for_BERTopic(model, docs)
        
        # Get topic information
        topic_info = model.get_topic_info()
        
        # Get actual number of topics
        actual_topics = len(topic_info)
        
        # Store results
        results.append({
            'Target Topics': n_topics,
            'Actual Topics': actual_topics,
            'Words per Topic': n_words,
            'Coherence Score': coherence,
            # 'min_cluster_size': min_cluster_size,
            # 'n_neighbors': n_neighbors,
            # 'min_dist': min_dist,
            # 'min_df': min_df,
            # 'ngram_range': ngram_range,
            # 'umap_metric': umap_metric,
            # 'hdbscan_metric': hdbscan_metric,
            # 'n_components': n_components,
            'Topic Info': topic_info
        })
        
    # Create DataFrame and sort by coherence score
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('Coherence Score', ascending=False)
    
    return results_df

# Define your parameter ranges including UMAP n_components values
topic_sizes = [5, 10, 15]
word_counts = [5, 10, 15]
# min_cluster_sizes = [50] #[50, 30, 10]
# n_neighbors_values = [15] #[15, 10, 5]
# min_dist_values = [0.0] # [0.0, 0.2, 0.4]
# min_df_values = [1] #[2, 1, 3]
# ngram_ranges = [(1,2)] #[(1, 1), (1, 2), (2, 2)]
# umap_metrics = ['cosine'] #['cosine', 'euclidean']
# hdbscan_metrics = ['euclidean'] #['euclidean', 'manhattan']
# n_components_values = [5] # [2, 5, 10] 

# Run analysis for different combinations
results_df = analyze_topic_variations(topic_sizes, word_counts, embeddings, docs)  # , 
#                                       min_cluster_sizes, n_neighbors_values, 
#                                       min_dist_values, min_df_values, ngram_ranges, 
#                                       umap_metrics, hdbscan_metrics, n_components_values)

# Display results with formatted output
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

print("\nResults Summary (sorted by coherence score):")
print(results_df.to_string(index=False))

# Print best performing configuration
best_config = results_df.iloc[0]
print("\nBest performing configuration:")
print(f"Number of topics: {best_config['Target Topics']}")
print(f"Words per topic: {best_config['Words per Topic']}")
print(f"Coherence score: {best_config['Coherence Score']:.4f}")
# print(f"min_cluster_size: {best_config['min_cluster_size']}")
# print(f"n_neighbors: {best_config['n_neighbors']}")
# print(f"min_dist: {best_config['min_dist']}")
# print(f"min_df: {best_config['min_df']}")
# print(f"ngram_range: {best_config['ngram_range']}")
# print(f"umap_metric: {best_config['umap_metric']}")
# print(f"hdbscan_metric: {best_config['hdbscan_metric']}")
# print(f"n_components: {best_config['n_components']}")

Combinations: [(5, 5), (5, 10), (5, 15), (10, 5), (10, 10), (10, 15), (15, 5), (15, 10), (15, 15)]

Analyzing model with target 5 topics, 5 words per topic...
Number of topics: 5
Number of words: 5
Min cluster size: 10


2025-04-16 14:37:59,938 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-16 14:38:45,187 - BERTopic - Dimensionality - Completed ✓
2025-04-16 14:38:45,189 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-16 14:38:45,639 - BERTopic - Cluster - Completed ✓
2025-04-16 14:38:45,641 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-16 14:39:54,526 - BERTopic - Representation - Completed ✓


Topics Words: [['divertikelbloeding', 'medisch', 'buikpijn', 'bloedverlie', 'bloed', 'laboratorium', 'hypertensie', 'gastroscopie', 'anamnees', 'dossier'], ['patientid', 'drs', 'behandelbeperking', 'leverziekt', 'medicatie', 'druppel', 'phonenumber', 'ziek', 'gl', 'weledelgeleer'], ['poliklinisch', 'colectomie', 'ileoanaal', 'vaatchirurg', 'patiente', 'vervolgconsult', 'pouchdysfunctie', 'laboratorium', 'buikpijn', 'consult'], ['ziekenhuis', 'patientkenmerk', 'antibiotica', 'anamnees', 'leeftijd', 'infectie', 'allergie', 'medicatie', 'medication', 'trombosediensten'], ['memo', 'sturen', 'mevr', 'verstuuren', 'dicteren', 'gebeld', 'opgesturen', 'blen', 'ehv', 'graag'], ['nacht', 'avond', 'slapen', 'ochtend', 'diabete', 'middag', 'basisdosering', 'scorelijzen', 'lunch', ''], ['endoscopist', 'endoscopie', 'endoscopisch', 'endoclot', 'colonoscopie', 'antibiotisch', 'poliklinisch', 'gastroscopie', 'mdlverpleegkundig', 'specialisten'], ['patientid', 'phonenumber', 'bsnnummer', 'dr', 'streetn

SyntaxError: invalid syntax (<unknown>, line 1)

In [None]:
results_df_short = results_df.drop(columns=['Topic Info'])
results_df_short

Unnamed: 0,Target Topics,Actual Topics,Words per Topic,Coherence Score
7,15,35,10,0.631122
4,10,23,10,0.629263
8,15,38,15,0.61517
6,15,36,5,0.599712
5,10,25,15,0.588024
3,10,23,5,0.586049
0,5,9,5,0.480149
2,5,9,15,0.471706
1,5,9,10,0.47077


In [None]:
results_df['Topic Info'][1]['Representation']

0               [type, samenvatting, recidief, opdracht, intern, aanwijzing, patiënt, dr, objectgegeven, waarvoor]
1                        [recidief, verwijzing, komen, iom, komst, melena, samenvatting, klacht, stoppen, rectaal]
2                                      [streetnaam, phonenumber, city, initials, tel, lastname, hb, geb, dag, zip]
3          [medewerker, laboratorium, assistent, opdracht, lab, internist, samenvatting, cp, afgewerken, controle]
4    [type, consult, samenvatting, poliklinisch, opdracht, decursus, secundair, medicatie, geneeskun, ifirstnamem]
5                      [gebeld, memo, mevr, gebruiken, verstuuren, mevrouw, samenvatting, meneer, ivm, neerleggen]
6      [decursus, dissemineren, type, infectie, samenvatting, verklaring, lab, postrenaal, nierfunctie, prerenaal]
7             [verwijzer, verwijsred, type, samenvatting, opdracht, bespreking, naam, opmerking, waarvoor, klacht]
8              [laboratorium, opdracht, samenvatting, lab, regelen, ingesproken,

In [None]:
results_df['Topic Info'][1]['Representation']

0               [type, samenvatting, recidief, opdracht, intern, aanwijzing, patiënt, dr, objectgegeven, waarvoor]
1                        [recidief, verwijzing, komen, iom, komst, melena, samenvatting, klacht, stoppen, rectaal]
2                                      [streetnaam, phonenumber, city, initials, tel, lastname, hb, geb, dag, zip]
3          [medewerker, laboratorium, assistent, opdracht, lab, internist, samenvatting, cp, afgewerken, controle]
4    [type, consult, samenvatting, poliklinisch, opdracht, decursus, secundair, medicatie, geneeskun, ifirstnamem]
5                      [gebeld, memo, mevr, gebruiken, verstuuren, mevrouw, samenvatting, meneer, ivm, neerleggen]
6      [decursus, dissemineren, type, infectie, samenvatting, verklaring, lab, postrenaal, nierfunctie, prerenaal]
7             [verwijzer, verwijsred, type, samenvatting, opdracht, bespreking, naam, opmerking, waarvoor, klacht]
8              [laboratorium, opdracht, samenvatting, lab, regelen, ingesproken,

In [None]:
remove_words = {'mg', 'x', 'per', 'dag', 'samenvatting', 'beleid', 'conclusie', 'mmolL', 'waarvoor', 'goed', 'wel', 'beloop', 
                'voorgeschiedenis', 'opdrachten', 'gehad', 'aanvullend', 'bekende', 'voltooid', 'mogelijk', 'gezien', 'city', 'bsn', 
                'nodig', 'firstname', 'streetname', 'lastname', 'postcode', 'anamnese',
                'dd', 'stuk', 'ivm', 'rechts', 'links', 'dr', 'sinds', 'huisarts', 'datum', 'dagen', 'min', 'extra', 'weken', 'algemeen', 
                'patiënte', 'overige','linker', 'week', 'accepteren', 'maanden', 'waarschijnlijk', 'reden', 'uur', 'verdenking', 'ontslag', 
                'stop', 'tijd', 'patiënt', 'onderzoek'}

filtered_docs = [word for word in docs if word.lower() not in remove_words]

filtered_docs

["['dhr', 'aj', 'dingemans', 'huisarts', 'streetnaam', 'city', 'datum', 'Kenmerk', 'patientid', 'bsn', 'bsn', 'betreffen', 'mevrouw', 'initials', 'lastname', 'geb', 'birthdate', 'streetnaam', 'zip', 'city', 'tel', 'phonenumber', 'geacht', 'collega', 'bovengenoemde', 'patiënte', 'opnemen', 'afdeling', 'maag', 'darm', 'leverziekt', 'verband', 'melaena', 'rectaal', 'bloedverlie', 'voorgeschiedenis', 'diep', 'veneuaz', 'trombose', 'longembolie', 'cholecystectomie', 'diverticulitis', 'atriumfibrilleren', 'spontaan', 'conversie', 'sinusritme', 'melena', 'waarvoor', 'verklaring', 'vinden', 'verband', 'stabiel', 'hb', 'overleg', 'patiënt', 'expectatief', 'beleid', 'vermoeidheid', 'sinusbradycardie', 'waarvoor', 'stop', 'metoprolol', 'tambocor', 'anamnees', 'vanmiddag', 'fors', 'Helderrood', 'bloedverlie', 'stolsel', 'vermengen', 'ontlasting', 'zwart', 'kleur', 'dag', 'zeuren', 'pijn', 'bovenbuik', 'maagpijn', 'waarvoor', 'stoppen', 'koffie', 'drinken', 'vet', 'eten', 'dag', 'ontlasting', 'inta