In [2]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

from sklearn.feature_extraction.text import CountVectorizer
import plotly.graph_objects as go
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import ast

from nltk.corpus import stopwords
dutch_stopwords = stopwords.words('dutch')

import re
import datetime

In [3]:
def create_bertopic_model(n_topics, min_topic_size=50, ngram_range=(1, 1)):
    print(f"Creating BERTopic model with {n_topics} topics and min_topic_size={min_topic_size}...")
    embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=min_topic_size, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    vectorizer_model = CountVectorizer(stop_words=dutch_stopwords, ngram_range=ngram_range)

    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        nr_topics=n_topics,
        verbose=True
    )
    return topic_model

In [4]:

def calculate_coherence_score(texts, topic_words, ngram_range=(1, 1)):
    # Zorg dat tokens correct zijn
    texts = [ast.literal_eval(doc) if isinstance(doc, str) else doc for doc in texts]

    # Maak n-gram strings van tokens
    joined_texts = [" ".join(doc) for doc in texts]
    vectorizer = CountVectorizer(ngram_range=ngram_range).fit(joined_texts)

    # Maak per document lijst met gegenereerde n-grams
    ngram_texts = [list(ngrams) for ngrams in vectorizer.inverse_transform(vectorizer.transform(joined_texts))]

    # Maak Gensim dictionary en coherence model
    dictionary = Dictionary(ngram_texts)
    coherence_model = CoherenceModel(
        topics=topic_words,
        texts=ngram_texts,
        dictionary=dictionary,
        coherence='c_v'
    )
    return coherence_model.get_coherence()

In [5]:
def analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[20], n_words_list=[5], 
                                  ngram_range=(1, 2), min_topic_size_list=[10], calculate_coherence=True):
    results = {}
    summary_data = []
    all_doc_tags = {}
    data_df = data.copy()

    for min_size in min_topic_size_list:
        for n_topics in n_topics_list:
            for n_words in n_words_list:
                # print(f"\nAnalyzing with {n_topics} topics, {n_words} words, ngram_range={ngram_range}")
                print(f"\nAnalyzing with min_topic_size={min_size}, n_topics={n_topics}, {n_words} words, ngram_range={ngram_range}")


                topic_model = create_bertopic_model(n_topics, ngram_range=ngram_range, min_topic_size=min_size)
                topics, probs = topic_model.fit_transform(texts, embeddings)

                topic_info = topic_model.get_topic_info()

                # Verwerk keywords per topic
                topic_words = {}
                for topic in set(topics):
                    if topic != -1:
                        words = topic_model.get_topic(topic)[:n_words]
                        topic_words[topic] = [word for word, _ in words]

                topic_word_list = list(topic_words.values())

                # Calculate coherence          
                if calculate_coherence == True:
                    # coherence = calculate_coherence_score(data['tokens'].tolist(), topic_word_list)
                    print("Calculating coherence score...")
                    coherence = calculate_coherence_score(data['tokens'].tolist(), topic_word_list, ngram_range=ngram_range)
                    print(f"Coherence Score: {coherence:.4f}")
                else:
                    print("Skipping coherence calculation...")
                    coherence = None
                    print("Coherence Score (C_v):", coherence)
                        

                # Per document: top topics (≥ threshold)
                doc_tags = []
                threshold = 0.1
                for i, topic in enumerate(topics):
                    if probs is None or isinstance(probs[i], float):
                        # fallback als probs None of 1D zijn
                        tags = [f"Topic {topic}"]
                    else:
                        tags = [f"Topic {j}" for j, p in enumerate(probs[i]) if p > threshold]
                    doc_tags.append(tags if tags else ["Unclear"])

                # Per document: keywords van toegekende topic (voor extra inzicht)
                topic_keywords_per_doc = [
                    topic_words.get(t, []) if t in topic_words else [] for t in topics
                ]

                # Update dataframe
                data_df[f'topic_{n_topics}_{n_words}_{min_size}'] = topics
                data_df[f'tags_{n_topics}_{n_words}_{min_size}'] = doc_tags
                data_df[f'keywords_{n_topics}_{n_words}_{min_size}'] = topic_keywords_per_doc


                results[(n_topics, n_words, ngram_range, min_size)] = {
                    'model': topic_model,
                    'topics': topics,
                    'probs': probs,
                    'topic_words': topic_words,
                    'coherence': coherence,
                    'topic_info': topic_info
                }


                summary_data.append({
                    'min_topic_size': min_size,
                    'n_topics': n_topics,
                    'n_words': n_words,
                    'ngram_range': str(ngram_range),
                    'coherence': coherence
                })


    grid_summary = pd.DataFrame(summary_data).sort_values(by='coherence', ascending=False)
    return results, grid_summary, topic_model, data_df


In [6]:
def print_results_summary(results, texts, save_to_file=True, filename=None):
    import datetime

    output_lines = []

    # for (n_topics, n_words), result in results.items():
    for key, result in results.items():
        print('key:', key)
        if isinstance(key, tuple) and len(key) == 4:
            n_topics, n_words, ngram_range, min_size = key

        elif isinstance(key, tuple) and len(key) == 3:
            n_topics, n_words, min_size = key
            ngram_range = "(1, 1)"
            # min_size = 'Not given'
        else:
            n_topics, n_words = key
            ngram_range = "(1, 1)"
            min_size = 'Not given'

        # output_lines.append(f"\n=== Results for {n_topics} topics with {n_words} words ===")
        output_lines.append(f"\n=== Results for {n_topics} topics with {n_words} words, (ngram_range={ngram_range}) and min_topic_size:{min_size} ===")
        # output_lines.append(f"Coherence Score: {result['coherence']:.4f}")
        coherence = result.get('coherence')
        if coherence is not None:
            output_lines.append(f"Coherence Score: {coherence:.4f}")
        else:
            output_lines.append("Coherence Score: Not available")

        
        output_lines.append("\nTopics and their key words:")
        topic_info = result['topic_info']
        for _, row in topic_info.iterrows():
            topic_num = row['Topic']
            if topic_num != -1:
                words = row['Representation']
                output_lines.append(f"Topic {topic_num}: {', '.join(words)}")

        topic_sizes = topic_info[topic_info['Topic'] != -1]['Count'].tolist()
        output_lines.append("\nTopic sizes: " + str(topic_sizes))
        output_lines.append("\n" + "="*50)

    full_output = "\n".join(output_lines)

    # Print to console
    print(full_output)

    # Optionally save to file
    if save_to_file:
        if not filename:
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"topic_results_summary_{timestamp}.txt"
        
        with open(filename, "w", encoding="utf-8") as f:
            f.write(full_output)
        print(f"\n📁 Summary saved to: {filename}")

In [7]:
def fix_character_tokens(char_list):
    text = ''.join(char_list)
    tokens = text.split()  # crude but often works
    return tokens

def flatten_nested_char_lists(nested_list):
    return [''.join(token) for token in nested_list if isinstance(token, list)]
    # return [token for token in nested_list if token != []]




In [8]:
import ast

# Load your data
data = pd.read_csv('a:/df_cleaned.csv')

# Parse the stringified lists into real Python lists
data['tokens'] = data['tokens'].apply(lambda x: ast.literal_eval(x))


remove_list = [
        # Units, measurements, filler symbols
        'mg', 'mmoll', 'mmolL', 'x', 'per', 'dag', 'min', 'uur', 'ml', 'eenhed',

        # Admin & structure
        'samenvatting', 'memo', 'beleid', 'conclusie', 'aanvullend', 'afgewerken', 
        'opdracht', 'opdrachten', 'rapportage', 'diversen', 'contact', 'afspraak', 
        'tijd', 'tijdsduur', 'datum', 'poli', 'recept', 'gefaxt', 'bellen', 
        'akkoord', 'nodig', 'bekende', 'bekend', 'memo', 'scorelijzen', 'naslag',

        # Identifying or privacy-sensitive terms
        'bsn', 'city', 'postcode', 'firstname', 'lastname', 'streetname', 'phonenumber', 'voicemail', 
        'mw', 'dhr', 'mevrouw', 'meneer', 'zoon', 'mevr', 'mvr', 'dhr'
        'inge', 'valkenburg', 'peter', 'miriam', 'debby', 'eliane',

        # Clinical history / often uninformative by itself
        'anamnese', 'anamnees', 'voorgeschiedenis', 'huisarts', 
        'hoofdbehandelaar',

        # Temporal or ambiguous
        'sinds', 'dagen', 'weken', 'maanden', 'week', 'avond', 'nacht', 'ochtend', 'middag', 
        'extra', 'stop', 'gehad', 'gezien', 'zien', 'dd', 
        'ivm', 'links', 'rechts', 'linker', 'dr', 'overige', 'algemeen', 'patiënt', 'patiënte',

        # Admin/communication
        'verzoek', 'mail', 'mailen', 'verstuuren', 'brief', 'uitinen', 'ak', 'bespreeklijst', 
        'wijzigingopmerking',
        'voicemail', 'telefonisch', 'mobiel', 'ingesproek', 'aanleiding', 'telefoon', 'email', 'bereiken', 'svp', 'contactpersoon'
        'terugbellen', 'gemaild', 'insproken', 'voicemail',

        # Unclear / possibly noise
        'eenhed', 'aangeeft', 'scorelijzen', 'inten', 'intn', 'vb', 'sub', 

        # extra         
        'regelen', 'opmerking', 'bespreeklijst', 'sehperiode', 'bedrijf', 
        'ivb', 'mtps', 'cp', 'pat', 'huisadres', 'gg', 
        'medewerker', 'medewerk', 'laboratorium', 'apotheek', 'maand', 'tc', 
        'wonen', 'gezondheidsinstelling', 'leven', 'varken', 'soms', 'jaar', 'mgdag', 'lateraal',
        'bespreking', 'wondfoto', 'cze',         
        
        'vrijdag', 'maandag', 'donderdag', 'woensdag', 'zaterdag', 'zondag',      
        'juli', 'augustus', 'september', 'oktober', 'november', 'december', 'januari', 'februari', 'maart', 'april', 'mei', 'jun',
        
        
        # 'voltooid', 'verdenking', 'waarvoor', 'reden', 'waarschijnlijk', 'mogelijk', 'stuk', 'basisdosering', 'probleem', 'probleemlijst', 'actie',
        # 'nee', 'ja',  'arts',  'radiologie', 'internist', 'evaluatie', 'intake', 'controle',


        ]

data['tokens'] = data['tokens'].apply(lambda tokens: [word for word in tokens if word not in remove_list])

# Remove double words that are next to each other
for i in range(len(data)):
    tokens = data['tokens'].loc[i]
    # print("tokens list length:", len(tokens))
    new_tokens = []
    for j in range(len(tokens) - 1):
        if tokens[j] == tokens[j + 1]:
            print("Duplicate token found:", tokens[j], 'and', tokens[j + 1])
        else:
            new_tokens.append(tokens[j])
    # Append the last token since it won't be checked in the loop
    if tokens:
        new_tokens.append(tokens[-1])
    data['tokens'].loc[i] = new_tokens
    # print("New tokens list length:", len(new_tokens))

# Remove empty documents
data = data[data['tokens'].apply(len) > 0]

# Convert token lists to strings for BERTopic
data['text'] = data['tokens'].apply(lambda tokens: ' '.join(tokens))
texts = data['text'].tolist()

# Debug prints
print("First 3 token lists:", data['tokens'].head(3).tolist())
print("First 3 texts:", data['text'].head(3).tolist())
print("Total documents after filtering:", len(texts))


Duplicate token found: bloedgroep and bloedgroep
Duplicate token found: coloscopie and coloscopie
Duplicate token found: acenocoumarol and acenocoumarol
Duplicate token found: Fentanyl and Fentanyl
Duplicate token found: bloed and bloed
Duplicate token found: coloscopie and coloscopie
Duplicate token found: visite and visite
Duplicate token found: visite and visite
Duplicate token found: duodenum and duodenum
Duplicate token found: seh and seh
Duplicate token found: visite and visite
Duplicate token found: visite and visite
Duplicate token found: pijnscore and pijnscore
Duplicate token found: gibloeding and gibloeding
Duplicate token found: triage and triage
Duplicate token found: intern and intern
Duplicate token found: collumcaar and collumcaar
Duplicate token found: pijnscore and pijnscore
Duplicate token found: triage and triage
Duplicate token found: def and def
Duplicate token found: vocht and vocht
Duplicate token found: collumcaar and collumcaar
Duplicate token found: pijnscore

In [9]:
# Load embeddings

### model7 is from before removing duplicates
### model8 is with removal of duplicates based on words that are next to each other

embeddings = np.load('embeddings_model8.npy')
# embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# embeddings = embedding_model.encode(texts, show_progress_bar=True)
# np.save('embeddings_model8.npy', embeddings)

# print("embeddings:", embeddings)
print("Embeddings shape:", embeddings.shape)

Embeddings shape: (9493, 384)


In [10]:
results_2to3, grid_summary_2to3, topic_model_2to3, data_df_2to3 = analyze_topics_with_sentiment(texts, 
                                                                                                embeddings, 
                                                                                                data, 
                                                                                                n_topics_list=[20], 
                                                                                                n_words_list=[5], 
                                                                                                min_topic_size_list=[100],
                                                                                                ngram_range=(2, 3))



Analyzing with min_topic_size=100, n_topics=20, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 20 topics and min_topic_size=100...


2025-05-07 01:59:37,727 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-07 02:00:15,871 - BERTopic - Dimensionality - Completed ✓
2025-05-07 02:00:15,871 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-07 02:00:16,532 - BERTopic - Cluster - Completed ✓
2025-05-07 02:00:16,532 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-07 02:00:21,471 - BERTopic - Representation - Completed ✓
2025-05-07 02:00:21,487 - BERTopic - Topic reduction - Reducing number of topics
2025-05-07 02:00:21,487 - BERTopic - Topic reduction - Number of topics (20) is equal or higher than the clustered topics(17).
2025-05-07 02:00:21,487 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-07 02:00:27,728 - BERTopic - Representation - Completed ✓


Calculating coherence score...
Coherence Score: 0.7473


In [None]:
# results_2to3, grid_summary_2to3, topic_model_2to3, data_df_2to3 = analyze_topics_with_sentiment(texts, 
#                                                                                                 embeddings, 
#                                                                                                 data, 
#                                                                                                 n_topics_list=[20], 
#                                                                                                 n_words_list=[5], 
#                                                                                                 min_topic_size_list=[30, 50, 75, 100, 150],
#                                                                                                 ngram_range=(2, 3))



Analyzing with min_topic_size=30, n_topics=20, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 20 topics and min_topic_size=30...


2025-04-29 21:43:25,331 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 21:44:02,292 - BERTopic - Dimensionality - Completed ✓
2025-04-29 21:44:02,294 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-29 21:44:02,682 - BERTopic - Cluster - Completed ✓
2025-04-29 21:44:02,682 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-29 21:44:07,630 - BERTopic - Representation - Completed ✓
2025-04-29 21:44:07,646 - BERTopic - Topic reduction - Reducing number of topics
2025-04-29 21:44:07,686 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-29 21:44:12,831 - BERTopic - Representation - Completed ✓
2025-04-29 21:44:12,847 - BERTopic - Topic reduction - Reduced number of topics from 74 to 20


Calculating coherence score...
Coherence Score: 0.7345

Analyzing with min_topic_size=50, n_topics=20, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 20 topics and min_topic_size=50...


2025-04-29 21:45:30,106 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 21:45:48,312 - BERTopic - Dimensionality - Completed ✓
2025-04-29 21:45:48,312 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-29 21:45:48,785 - BERTopic - Cluster - Completed ✓
2025-04-29 21:45:48,785 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-29 21:45:54,283 - BERTopic - Representation - Completed ✓
2025-04-29 21:45:54,297 - BERTopic - Topic reduction - Reducing number of topics
2025-04-29 21:45:54,337 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-29 21:45:59,751 - BERTopic - Representation - Completed ✓
2025-04-29 21:45:59,774 - BERTopic - Topic reduction - Reduced number of topics from 47 to 20


Calculating coherence score...
Coherence Score: 0.6687

Analyzing with min_topic_size=75, n_topics=20, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 20 topics and min_topic_size=75...


2025-04-29 21:47:17,531 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 21:47:35,545 - BERTopic - Dimensionality - Completed ✓
2025-04-29 21:47:35,545 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-29 21:47:36,097 - BERTopic - Cluster - Completed ✓
2025-04-29 21:47:36,097 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-29 21:47:41,747 - BERTopic - Representation - Completed ✓
2025-04-29 21:47:41,777 - BERTopic - Topic reduction - Reducing number of topics
2025-04-29 21:47:41,816 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-29 21:47:47,514 - BERTopic - Representation - Completed ✓
2025-04-29 21:47:47,537 - BERTopic - Topic reduction - Reduced number of topics from 28 to 20


Calculating coherence score...
Coherence Score: 0.6510

Analyzing with min_topic_size=100, n_topics=20, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 20 topics and min_topic_size=100...


2025-04-29 21:49:05,150 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 21:49:22,799 - BERTopic - Dimensionality - Completed ✓
2025-04-29 21:49:22,799 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-29 21:49:23,449 - BERTopic - Cluster - Completed ✓
2025-04-29 21:49:23,449 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-29 21:49:29,132 - BERTopic - Representation - Completed ✓
2025-04-29 21:49:29,149 - BERTopic - Topic reduction - Reducing number of topics
2025-04-29 21:49:29,149 - BERTopic - Topic reduction - Number of topics (20) is equal or higher than the clustered topics(17).
2025-04-29 21:49:29,149 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-29 21:49:35,799 - BERTopic - Representation - Completed ✓


Calculating coherence score...
Coherence Score: 0.7473

Analyzing with min_topic_size=150, n_topics=20, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 20 topics and min_topic_size=150...


2025-04-29 21:50:54,935 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 21:51:12,801 - BERTopic - Dimensionality - Completed ✓
2025-04-29 21:51:12,801 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-29 21:51:13,617 - BERTopic - Cluster - Completed ✓
2025-04-29 21:51:13,617 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-29 21:51:19,134 - BERTopic - Representation - Completed ✓
2025-04-29 21:51:19,151 - BERTopic - Topic reduction - Reducing number of topics
2025-04-29 21:51:19,151 - BERTopic - Topic reduction - Number of topics (20) is equal or higher than the clustered topics(6).
2025-04-29 21:51:19,151 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-29 21:51:24,934 - BERTopic - Representation - Completed ✓


Calculating coherence score...
Coherence Score: 0.5530


In [None]:
# Print detailed results summary
print_results_summary(results_2to3, texts)

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"topic_model_results_ngram2_3_{timestamp}.csv"
grid_summary_2to3.to_csv(filename, index=False)
print(f"📁 Saved topic modeling summary to '{filename}'")

key: (20, 5, (2, 3), 30)
key: (20, 5, (2, 3), 50)
key: (20, 5, (2, 3), 75)
key: (20, 5, (2, 3), 100)
key: (20, 5, (2, 3), 150)

=== Results for 20 topics with 5 words, (ngram_range=(2, 3)) and min_topic_size:30 ===
Coherence Score: 0.7345

Topics and their key words:
Topic 0: oraal stuk, tablet oraal, tablet oraal stuk, aanvullen onderzoek, lichamelijk onderzoek, rectaal bloedverlie, streetnaam zip, mcv fl, geacht collega, reden komst
Topic 1: progressie cll, opname exacerbatie copd, opname exacerbatie, exacerbatie copd, copd gold, volgen venetoclax, lymfatisch leukemie, chronisch lymfatisch, chronisch lymfatisch leukemie, cll waarvoor
Topic 2: int inge, internist controle, novomix ophogen, ophogen beloop, novomix ophogen beloop, controle int, tp controle, internist tp, controle int inge, zakken novomix
Topic 3: tevoren lab, diana lab, afronding onderzoek, lab diana, rontg echo, lab urine, tevoren lab oz, onderzoek rontg, lab formulier, afronding onderzoek rontg
Topic 4: diabete lunch,

In [None]:
results_1to3, grid_summary_1to3, topic_model_1to3, data_df_1to3 = analyze_topics_with_sentiment(texts, 
                                                                                                embeddings, 
                                                                                                data, 
                                                                                                n_topics_list=[20], 
                                                                                                n_words_list=[5], 
                                                                                                min_topic_size_list=[30, 50, 75, 100, 150],
                                                                                                ngram_range=(1, 3))


# Print detailed results summary
print_results_summary(results_1to3, texts)

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"topic_model_results_ngram2_3_{timestamp}.csv"
grid_summary_1to3.to_csv(filename, index=False)
print(f"📁 Saved topic modeling summary to '{filename}'")



Analyzing with min_topic_size=30, n_topics=20, 5 words, ngram_range=(1, 3)
Creating BERTopic model with 20 topics and min_topic_size=30...


2025-04-29 21:52:43,363 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 21:53:01,619 - BERTopic - Dimensionality - Completed ✓
2025-04-29 21:53:01,635 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-29 21:53:02,019 - BERTopic - Cluster - Completed ✓
2025-04-29 21:53:02,019 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-29 21:53:08,397 - BERTopic - Representation - Completed ✓
2025-04-29 21:53:08,418 - BERTopic - Topic reduction - Reducing number of topics
2025-04-29 21:53:08,463 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-29 21:53:14,497 - BERTopic - Representation - Completed ✓
2025-04-29 21:53:14,502 - BERTopic - Topic reduction - Reduced number of topics from 74 to 20


Calculating coherence score...
Coherence Score: 0.6111

Analyzing with min_topic_size=50, n_topics=20, 5 words, ngram_range=(1, 3)
Creating BERTopic model with 20 topics and min_topic_size=50...


2025-04-29 21:54:42,521 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 21:55:00,254 - BERTopic - Dimensionality - Completed ✓
2025-04-29 21:55:00,254 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-29 21:55:00,721 - BERTopic - Cluster - Completed ✓
2025-04-29 21:55:00,721 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-29 21:55:07,104 - BERTopic - Representation - Completed ✓
2025-04-29 21:55:07,121 - BERTopic - Topic reduction - Reducing number of topics
2025-04-29 21:55:07,154 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-29 21:55:13,421 - BERTopic - Representation - Completed ✓
2025-04-29 21:55:13,455 - BERTopic - Topic reduction - Reduced number of topics from 47 to 20


Calculating coherence score...
Coherence Score: 0.6330

Analyzing with min_topic_size=75, n_topics=20, 5 words, ngram_range=(1, 3)
Creating BERTopic model with 20 topics and min_topic_size=75...


2025-04-29 21:56:42,015 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 21:56:59,790 - BERTopic - Dimensionality - Completed ✓
2025-04-29 21:56:59,790 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-29 21:57:00,350 - BERTopic - Cluster - Completed ✓
2025-04-29 21:57:00,366 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-29 21:57:06,939 - BERTopic - Representation - Completed ✓
2025-04-29 21:57:06,971 - BERTopic - Topic reduction - Reducing number of topics
2025-04-29 21:57:07,002 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-29 21:57:13,489 - BERTopic - Representation - Completed ✓
2025-04-29 21:57:13,504 - BERTopic - Topic reduction - Reduced number of topics from 28 to 20


Calculating coherence score...
Coherence Score: 0.6022

Analyzing with min_topic_size=100, n_topics=20, 5 words, ngram_range=(1, 3)
Creating BERTopic model with 20 topics and min_topic_size=100...


2025-04-29 21:58:43,523 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 21:59:01,407 - BERTopic - Dimensionality - Completed ✓
2025-04-29 21:59:01,407 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-29 21:59:02,056 - BERTopic - Cluster - Completed ✓
2025-04-29 21:59:02,057 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-29 21:59:08,440 - BERTopic - Representation - Completed ✓
2025-04-29 21:59:08,458 - BERTopic - Topic reduction - Reducing number of topics
2025-04-29 21:59:08,458 - BERTopic - Topic reduction - Number of topics (20) is equal or higher than the clustered topics(17).
2025-04-29 21:59:08,458 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-29 21:59:15,691 - BERTopic - Representation - Completed ✓


Calculating coherence score...
Coherence Score: 0.6475

Analyzing with min_topic_size=150, n_topics=20, 5 words, ngram_range=(1, 3)
Creating BERTopic model with 20 topics and min_topic_size=150...


2025-04-29 22:00:45,111 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 22:01:02,843 - BERTopic - Dimensionality - Completed ✓
2025-04-29 22:01:02,843 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-29 22:01:03,658 - BERTopic - Cluster - Completed ✓
2025-04-29 22:01:03,658 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-29 22:01:09,835 - BERTopic - Representation - Completed ✓
2025-04-29 22:01:09,843 - BERTopic - Topic reduction - Reducing number of topics
2025-04-29 22:01:09,843 - BERTopic - Topic reduction - Number of topics (20) is equal or higher than the clustered topics(6).
2025-04-29 22:01:09,843 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-29 22:01:16,209 - BERTopic - Representation - Completed ✓


Calculating coherence score...
Coherence Score: 0.6086
key: (20, 5, (1, 3), 30)
key: (20, 5, (1, 3), 50)
key: (20, 5, (1, 3), 75)
key: (20, 5, (1, 3), 100)
key: (20, 5, (1, 3), 150)

=== Results for 20 topics with 5 words, (ngram_range=(1, 3)) and min_topic_size:30 ===
Coherence Score: 0.6111

Topics and their key words:
Topic 0: oraal, stuk, oraal stuk, tablet, onderzoek, anemie, hb, goed, opname, mmoll
Topic 1: obinutuzumab, cll, progressie, copd, volgen, progressie cll, venetoclax, beiderzijds, waarvoor, opname exacerbatie copd
Topic 2: internist, beloop, int inge, tp, internist controle, inge, balie, ha, accepteren, int
Topic 3: lab, diana, echo, blank, tevoren, controle, labbon, tevoren lab, opsturen, rontg
Topic 4: diabete lunch, lunch, diabete lunch slapen, lunch slapen, basisdosering diabete lunch, basisdosering diabete, slapen, basisdosering, lunch slapen basisdosering, slapen basisdosering diabete
Topic 5: diabete vpk, vpk, diabete, vervolgconsult diabete vpk, vervolgconsult 

In [None]:
results_1to2, grid_summary_1to2, topic_model_1to2, data_df_1to2 = analyze_topics_with_sentiment(texts, 
                                                                                                embeddings, 
                                                                                                data, 
                                                                                                n_topics_list=[20], 
                                                                                                n_words_list=[5], 
                                                                                                min_topic_size_list=[30, 50, 75, 100, 150],
                                                                                                ngram_range=(1, 2))


# Print detailed results summary
print_results_summary(results_1to2, texts)

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"topic_model_results_ngram2_3_{timestamp}.csv"
grid_summary_1to2.to_csv(filename, index=False)
print(f"📁 Saved topic modeling summary to '{filename}'")


Analyzing with min_topic_size=30, n_topics=20, 5 words, ngram_range=(1, 2)
Creating BERTopic model with 20 topics and min_topic_size=30...


2025-04-29 22:02:43,861 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 22:03:01,665 - BERTopic - Dimensionality - Completed ✓
2025-04-29 22:03:01,665 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-29 22:03:02,058 - BERTopic - Cluster - Completed ✓
2025-04-29 22:03:02,058 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-29 22:03:05,431 - BERTopic - Representation - Completed ✓
2025-04-29 22:03:05,431 - BERTopic - Topic reduction - Reducing number of topics
2025-04-29 22:03:05,478 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-29 22:03:08,715 - BERTopic - Representation - Completed ✓
2025-04-29 22:03:08,731 - BERTopic - Topic reduction - Reduced number of topics from 74 to 20


Calculating coherence score...
Coherence Score: 0.6888

Analyzing with min_topic_size=50, n_topics=20, 5 words, ngram_range=(1, 2)
Creating BERTopic model with 20 topics and min_topic_size=50...


2025-04-29 22:04:00,413 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 22:04:18,140 - BERTopic - Dimensionality - Completed ✓
2025-04-29 22:04:18,140 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-29 22:04:18,615 - BERTopic - Cluster - Completed ✓
2025-04-29 22:04:18,615 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-29 22:04:21,856 - BERTopic - Representation - Completed ✓
2025-04-29 22:04:21,872 - BERTopic - Topic reduction - Reducing number of topics
2025-04-29 22:04:21,903 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-29 22:04:25,325 - BERTopic - Representation - Completed ✓
2025-04-29 22:04:25,325 - BERTopic - Topic reduction - Reduced number of topics from 47 to 20


Calculating coherence score...
Coherence Score: 0.6918

Analyzing with min_topic_size=75, n_topics=20, 5 words, ngram_range=(1, 2)
Creating BERTopic model with 20 topics and min_topic_size=75...


2025-04-29 22:05:14,292 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 22:05:32,088 - BERTopic - Dimensionality - Completed ✓
2025-04-29 22:05:32,088 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-29 22:05:32,646 - BERTopic - Cluster - Completed ✓
2025-04-29 22:05:32,646 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-29 22:05:35,980 - BERTopic - Representation - Completed ✓
2025-04-29 22:05:35,995 - BERTopic - Topic reduction - Reducing number of topics
2025-04-29 22:05:36,011 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-29 22:05:39,433 - BERTopic - Representation - Completed ✓
2025-04-29 22:05:39,435 - BERTopic - Topic reduction - Reduced number of topics from 28 to 20


Calculating coherence score...
Coherence Score: 0.6525

Analyzing with min_topic_size=100, n_topics=20, 5 words, ngram_range=(1, 2)
Creating BERTopic model with 20 topics and min_topic_size=100...


2025-04-29 22:06:28,297 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 22:06:46,126 - BERTopic - Dimensionality - Completed ✓
2025-04-29 22:06:46,128 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-29 22:06:46,781 - BERTopic - Cluster - Completed ✓
2025-04-29 22:06:46,781 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-29 22:06:50,081 - BERTopic - Representation - Completed ✓
2025-04-29 22:06:50,093 - BERTopic - Topic reduction - Reducing number of topics
2025-04-29 22:06:50,093 - BERTopic - Topic reduction - Number of topics (20) is equal or higher than the clustered topics(17).
2025-04-29 22:06:50,093 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-29 22:06:53,781 - BERTopic - Representation - Completed ✓


Calculating coherence score...
Coherence Score: 0.6939

Analyzing with min_topic_size=150, n_topics=20, 5 words, ngram_range=(1, 2)
Creating BERTopic model with 20 topics and min_topic_size=150...


2025-04-29 22:07:42,366 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 22:08:00,348 - BERTopic - Dimensionality - Completed ✓
2025-04-29 22:08:00,348 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-29 22:08:01,165 - BERTopic - Cluster - Completed ✓
2025-04-29 22:08:01,165 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-29 22:08:04,349 - BERTopic - Representation - Completed ✓
2025-04-29 22:08:04,359 - BERTopic - Topic reduction - Reducing number of topics
2025-04-29 22:08:04,359 - BERTopic - Topic reduction - Number of topics (20) is equal or higher than the clustered topics(6).
2025-04-29 22:08:04,359 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-29 22:08:07,682 - BERTopic - Representation - Completed ✓


Calculating coherence score...
Coherence Score: 0.6517
key: (20, 5, (1, 2), 30)
key: (20, 5, (1, 2), 50)
key: (20, 5, (1, 2), 75)
key: (20, 5, (1, 2), 100)
key: (20, 5, (1, 2), 150)

=== Results for 20 topics with 5 words, (ngram_range=(1, 2)) and min_topic_size:30 ===
Coherence Score: 0.6888

Topics and their key words:
Topic 0: oraal, stuk, oraal stuk, tablet, onderzoek, anemie, hb, goed, opname, mmoll
Topic 1: obinutuzumab, cll, progressie, copd, progressie cll, volgen, venetoclax, beiderzijds, waarvoor, opname exacerbatie
Topic 2: internist, beloop, int inge, tp, internist controle, inge, balie, ha, accepteren, wb
Topic 3: lab, diana, echo, blank, tevoren, controle, labbon, tevoren lab, opsturen, rontg
Topic 4: diabete lunch, lunch, lunch slapen, basisdosering diabete, basisdosering, slapen, slapen basisdosering, diabete, slapen diabete, lunch diabete
Topic 5: diabete vpk, vpk, diabete, vervolgconsult diabete, insulin schema, doel, insulin, schema, vervolgconsult, doel verbeteren
T

In [11]:
data_df_2to3.head()

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text,topic_20_5_100,tags_20_5_100,keywords_20_5_100
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",2020-11-26 15:06:00,2020-11-26 15:06:00,dhr aj dingemans huisarts streetnaam city datu...,"[aj, dingemans, streetnaam, Kenmerk, patientid...",aj dingemans streetnaam Kenmerk patientid betr...,10,[Topic 10],"[oraal stuk, tablet oraal, tablet oraal stuk, ..."
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies obv diver...,2020-11-26 09:53:00,2020-11-26 09:53:00,samenvatting rectaal bloedverlie obvn divertik...,"[rectaal, bloedverlie, obvn, divertikelbloedin...",rectaal bloedverlie obvn divertikelbloeding ac...,15,[Topic 15],"[rectaal bloedverlie, aanvullen onderzoek, ove..."
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,2020-11-25 14:13:00,2020-11-25 14:13:00,coloscopie betreffen mw initials lastname adre...,"[coloscopie, betreffen, initials, adresgegeven...",coloscopie betreffen initials adresgegevens st...,-1,[Topic -1],[]
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,2020-11-25 13:48:00,2020-11-25 13:48:00,gastroscopie betreffen mw initials lastname ad...,"[gastroscopie, betreffen, initials, adresgegev...",gastroscopie betreffen initials adresgegevens ...,-1,[Topic -1],[]
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies ; eenmali...,2020-11-25 08:47:00,2020-11-25 08:47:00,samenvatting rectaal bloedverlie eenmalig hd h...,"[rectaal, bloedverlie, eenmalig, hd, hbstabiel...",rectaal bloedverlie eenmalig hd hbstabiel inr ...,15,[Topic 15],"[rectaal bloedverlie, aanvullen onderzoek, ove..."


In [12]:
# for topic_id in topic_model_2to3.get_topics():
#     print(f"Topic {topic_id}: {[w for w, _ in topic_model_2to3.get_topic(topic_id)[:5]]}")

# topic_df = pd.DataFrame(topic_model_2to3.get_topic_info())
# display(topic_df)

In [13]:
topic_model_2to3  # BERTopic model
results_2to3[(20, 5, (2, 3), 100)]['probs']  # document-topic probabilities
# data_df_2to3.head()  # includes topic assignments and tags


array([0.89051627, 1.        , 0.        , ..., 0.        , 1.        ,
       0.94835778])

In [14]:
# def generate_event_labels(topic_words_dict, label_style="title"):
#     label_dict = {}
#     for topic_id, words in topic_words_dict.items():
#         if not words: continue
#         label = " / ".join(words[:3])  # pick top n-grams
#         if label_style == "title":
#             label = label.replace("_", " ").title()
#         label_dict[topic_id] = label
#     return label_dict

# # Apply
# event_labels = generate_event_labels(results_2to3[(20, 5, (2, 3))]['topic_words'])
# event_labels

In [15]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd

# # Extract probabilities and topic labels
# probs = results_2to3[(20, 5, (2, 3))]['probs']
# topics = results_2to3[(20, 5, (2, 3))]['topics']
# labels = generate_event_labels(results_2to3[(20, 5, (2, 3))]['topic_words'])

# # Build a DataFrame (optional: filter to a subset of docs for legibility)
# df_probs = pd.DataFrame(probs).iloc[:50]  # first 50 docs
# df_probs.columns = [labels.get(i, f"Topic {i}") for i in df_probs.columns]

# # Heatmap
# plt.figure(figsize=(14, 8))
# sns.heatmap(df_probs, cmap="YlGnBu", cbar_kws={'label': 'Topic Relevance'}, linewidths=0.1)
# plt.title("Document vs Topic Heatmap (Top 50 reports)")
# plt.xlabel("Topics (Events)")
# plt.ylabel("Documents")
# plt.tight_layout()
# plt.show()


In [16]:
# pip install seaborn

In [17]:
# summary = data_df_2to3[['pseudo_id', f'topic_20_5', f'tags_20_5', f'keywords_20_5']]
# summary['event_label'] = summary[f'topic_20_5'].map(event_labels)
# summary


In [18]:
# threshold = 0.15  # or 0.2 based on how strict you want to be
# all_doc_tags = []

# for prob_dist in probs:
#     if isinstance(prob_dist, (list, np.ndarray)):
#         tags = [i for i, p in enumerate(prob_dist) if p > threshold]
#     else:
#         tags = []  # fallback if probs is just a float
#     tag_names = [topic_labels.get(t, f"Topic {t}") for t in tags]
#     all_doc_tags.append(tag_names)

# data_df_2to3['event_tags'] = all_doc_tags
# data_df_2to3.head()


In [None]:
results_2to3, grid_summary_2to3, topic_model_2to3, data_df_2to3 = analyze_topics_with_sentiment(texts, 
                                                                                                embeddings, 
                                                                                                data, 
                                                                                                n_topics_list=[20], 
                                                                                                n_words_list=[5], 
                                                                                                min_topic_size_list=[100],
                                                                                                ngram_range=(2, 3))



Analyzing with min_topic_size=100, n_topics=20, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 20 topics and min_topic_size=100...


2025-04-29 22:21:43,765 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 22:22:01,850 - BERTopic - Dimensionality - Completed ✓
2025-04-29 22:22:01,853 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-29 22:22:02,494 - BERTopic - Cluster - Completed ✓
2025-04-29 22:22:02,494 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-29 22:22:08,353 - BERTopic - Representation - Completed ✓
2025-04-29 22:22:08,361 - BERTopic - Topic reduction - Reducing number of topics
2025-04-29 22:22:08,361 - BERTopic - Topic reduction - Number of topics (20) is equal or higher than the clustered topics(17).
2025-04-29 22:22:08,361 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-29 22:22:14,528 - BERTopic - Representation - Completed ✓


Calculating coherence score...
Coherence Score: 0.7473


In [19]:
topic_info = topic_model_2to3.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4444,-1_lichamelijk onderzoek_aanvullen onderzoek_i...,"[lichamelijk onderzoek, aanvullen onderzoek, i...",[vervolg consult type vervolgconsult verkort i...
1,0,135,0_basisdosering diabete lunch_basisdosering di...,"[basisdosering diabete lunch, basisdosering di...","[diabete lunch slapen basisdosering, diabete l..."
2,1,103,1_graad ii_ductaal lobulair carcinoom_invasief...,"[graad ii, ductaal lobulair carcinoom, invasie...",[depressiviteit knieoperatie appendectomie rec...
3,2,109,2_progressie cll_opname exacerbatie copd_opnam...,"[progressie cll, opname exacerbatie copd, opna...",[carpaal tunnel syndroom mammacarcinoom profyl...
4,3,108,3_diabetisch nefropathie_zegelringcarcinoom si...,"[diabetisch nefropathie, zegelringcarcinoom si...",[diabete mellitus type hypertensie nierfunctie...
5,4,237,4_indicatie aanvraag_asa klasse_antibiotisch p...,"[indicatie aanvraag, asa klasse, antibiotisch ...",[aanvraag Endoscopie aanvraag aanvrager Mdlart...
6,5,441,5_diabete vpk_diabete mellitus_oraal stuk_diab...,"[diabete vpk, diabete mellitus, oraal stuk, di...",[vervolg consult diabete consult jaarcontrole ...
7,6,313,6_algeheel malaise_int knoppen_lichamelijk ond...,"[algeheel malaise, int knoppen, lichamelijk on...",[vervolgconsult ouderengeneeskun type vervolgc...
8,7,785,7_beloop verpleegkundig_aantal controle_int in...,"[beloop verpleegkundig, aantal controle, int i...",[initials ad Kenmerk patientid betreffen heer ...
9,8,106,8_acute pancreatitis_chronisch pancreatitis_kn...,"[acute pancreatitis, chronisch pancreatitis, k...",[decursus reden opnaam acute pancreatitis spec...


In [20]:
topic_info = topic_model_2to3.get_topic_info()
topic_labels = {}

for topic_id, row in topic_info.iterrows():
    if row['Topic'] != -1:  # -1 = outliers
        words = row['Representation']
        # print("words:", words)
        label = "__".join(words[:10])  # use top 3 words
        # print("label:", label)
        # label = label.replace(" ", " ").title()
        # print("label_nospaces:", label)
        topic_labels[row['Topic']] = label

topic_labels

{0: 'basisdosering diabete lunch__basisdosering diabete__diabete lunch__lunch slapen basisdosering__slapen basisdosering diabete__slapen basisdosering__diabete lunch slapen__lunch slapen__lunch basisdosering diabete__lunch basisdosering',
 1: 'graad ii__ductaal lobulair carcinoom__invasief ductaal lobulair__lobulair carcinoom__invasief ductaal__ductaal lobulair__hormonaal therapie__hemolytisch anemie__ii graad__ii pr',
 2: 'progressie cll__opname exacerbatie copd__opname exacerbatie__exacerbatie copd__volgen venetoclax__copd gold__cll waarvoor__chronisch lymfatisch__chronisch lymfatisch leukemie__lymfatisch leukemie',
 3: 'diabetisch nefropathie__zegelringcarcinoom sigmoid__oogheelkunen diabetisch retinopathie__start dialyse overvulling__hypertensie nierfunctieverlie__oogheelkunen diabetisch__acute start__acute start dialyse__overvulling klacht__verslechtering nierfunctie predialyse',
 4: 'indicatie aanvraag__asa klasse__antibiotisch profylaxe__profylaxe nvt__antibiotisch profylaxe nvt

In [None]:
topic_info = topic_model_2to3.get_topic_info()
topic_labels = {}

for topic_id, row in topic_info.iterrows():
    if row['Topic'] != -1:  # -1 = outliers
        words = row['Representation']
        # print("words:", words)
        label = "__".join(words[:10])  # use top 3 words
        # print("label:", label)
        label = label.replace(" ", "_").title()
        # print("label_nospaces:", label)
        topic_labels[row['Topic']] = label

topic_labels


{0: 'Basisdosering_Diabete_Lunch__Basisdosering_Diabete__Diabete_Lunch__Lunch_Slapen_Basisdosering__Slapen_Basisdosering_Diabete__Slapen_Basisdosering__Diabete_Lunch_Slapen__Lunch_Slapen__Lunch_Basisdosering_Diabete__Lunch_Basisdosering',
 1: 'Graad_Ii__Ductaal_Lobulair_Carcinoom__Invasief_Ductaal_Lobulair__Lobulair_Carcinoom__Invasief_Ductaal__Ductaal_Lobulair__Hormonaal_Therapie__Hemolytisch_Anemie__Ii_Graad__Ii_Pr',
 2: 'Progressie_Cll__Opname_Exacerbatie_Copd__Opname_Exacerbatie__Exacerbatie_Copd__Volgen_Venetoclax__Copd_Gold__Cll_Waarvoor__Chronisch_Lymfatisch__Chronisch_Lymfatisch_Leukemie__Lymfatisch_Leukemie',
 3: 'Diabetisch_Nefropathie__Zegelringcarcinoom_Sigmoid__Oogheelkunen_Diabetisch_Retinopathie__Start_Dialyse_Overvulling__Hypertensie_Nierfunctieverlie__Oogheelkunen_Diabetisch__Acute_Start__Acute_Start_Dialyse__Overvulling_Klacht__Verslechtering_Nierfunctie_Predialyse',
 4: 'Indicatie_Aanvraag__Asa_Klasse__Antibiotisch_Profylaxe__Profylaxe_Nvt__Antibiotisch_Profylaxe_Nvt

In [None]:
# # Transform if you haven't yet
# topics, probs = topic_model_2to3.transform(texts)

# # Map topics to labels
# data_df_2to3['event_tag'] = [topic_labels.get(t, 'Outlier') for t in topics]

probs = results_2to3[(20, 5, (2, 3), 100)]['probs']
topics = results_2to3[(20, 5, (2, 3), 100)]['topics']

In [None]:
doc_labels = []
for topic in topics:
    label = topic_labels.get(topic, "Unknown")  # Default to "Unknown" if no match
    doc_labels.append(label)

doc_labels



['Oraal_Stuk__Tablet_Oraal__Tablet_Oraal_Stuk__Kenmerk_Patientid__Kenmerk_Patientid_Betreffen__Patientid_Betreffen__Geb_Birthdate__Geacht_Collega__Initials_Geb_Birthdate__Initials_Geb',
 'Rectaal_Bloedverlie__Aanvullen_Onderzoek__Overig_Actie__Hb_Controle__Ferriprief_Anemie__Lichamelijk_Onderzoek__Bloedverlie_Anum__Hd_Stabiel__Beloop_Vpk__Hemoglobine_Mmoll',
 'Unknown',
 'Unknown',
 'Rectaal_Bloedverlie__Aanvullen_Onderzoek__Overig_Actie__Hb_Controle__Ferriprief_Anemie__Lichamelijk_Onderzoek__Bloedverlie_Anum__Hd_Stabiel__Beloop_Vpk__Hemoglobine_Mmoll',
 'Rectaal_Bloedverlie__Aanvullen_Onderzoek__Overig_Actie__Hb_Controle__Ferriprief_Anemie__Lichamelijk_Onderzoek__Bloedverlie_Anum__Hd_Stabiel__Beloop_Vpk__Hemoglobine_Mmoll',
 'Oraal_Stuk__Tablet_Oraal__Tablet_Oraal_Stuk__Kenmerk_Patientid__Kenmerk_Patientid_Betreffen__Patientid_Betreffen__Geb_Birthdate__Geacht_Collega__Initials_Geb_Birthdate__Initials_Geb',
 'Rectaal_Bloedverlie__Aanvullen_Onderzoek__Overig_Actie__Hb_Controle__Ferripri

In [None]:
data_df_2to3["event_tag"] = doc_labels


In [None]:
# Transform if you haven't yet
# topics, probs = topic_model_2to3.transform(texts)

# Map topics to labels
data_df_2to3['event_tag'] = [topic_labels.get(t, 'Outlier') for t in topics]


In [None]:
data_df_2to3.head(5)

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text,topic_20_5_100,tags_20_5_100,keywords_20_5_100,event_tag
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",2020-11-26 15:06:00,2020-11-26 15:06:00,dhr aj dingemans huisarts streetnaam city datu...,"[aj, dingemans, streetnaam, Kenmerk, patientid...",aj dingemans streetnaam Kenmerk patientid betr...,10,[Topic 10],"[oraal stuk, tablet oraal, tablet oraal stuk, ...",Oraal_Stuk__Tablet_Oraal__Tablet_Oraal_Stuk__K...
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies obv diver...,2020-11-26 09:53:00,2020-11-26 09:53:00,samenvatting rectaal bloedverlie obvn divertik...,"[rectaal, bloedverlie, obvn, divertikelbloedin...",rectaal bloedverlie obvn divertikelbloeding ac...,15,[Topic 15],"[rectaal bloedverlie, aanvullen onderzoek, ove...",Rectaal_Bloedverlie__Aanvullen_Onderzoek__Over...
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,2020-11-25 14:13:00,2020-11-25 14:13:00,coloscopie betreffen mw initials lastname adre...,"[coloscopie, betreffen, initials, adresgegeven...",coloscopie betreffen initials adresgegevens st...,-1,[Topic -1],[],Outlier
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,2020-11-25 13:48:00,2020-11-25 13:48:00,gastroscopie betreffen mw initials lastname ad...,"[gastroscopie, betreffen, initials, adresgegev...",gastroscopie betreffen initials adresgegevens ...,-1,[Topic -1],[],Outlier
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies ; eenmali...,2020-11-25 08:47:00,2020-11-25 08:47:00,samenvatting rectaal bloedverlie eenmalig hd h...,"[rectaal, bloedverlie, eenmalig, hd, hbstabiel...",rectaal bloedverlie eenmalig hd hbstabiel inr ...,15,[Topic 15],"[rectaal bloedverlie, aanvullen onderzoek, ove...",Rectaal_Bloedverlie__Aanvullen_Onderzoek__Over...


In [None]:
# threshold = 0.15  # or tune it to 0.2 if you want stricter

# doc_tags = []
# for prob in probs:
#     tags = []
#     if isinstance(prob, (list, np.ndarray)):
#         tags = [topic_labels.get(i, f"Topic_{i}") for i, p in enumerate(prob) if p > threshold]
#     doc_tags.append(tags if tags else ['Unclear'])

# data_df_2to3['event_tags'] = doc_tags


In [None]:
# # Convert lists to tuples to make them hashable
# unique_event_tags = data_df_2to3['event_tags'].apply(tuple).unique()
# unique_event_tags

In [None]:
event_log = data_df_2to3[['pseudo_id', 'date', 'event_tag', 'verslagen_report_tags', 'verslagen_report_start_date', 'tags_20_5_100']].copy()

event_log = event_log.rename(columns={
    'tags_20_5_100': 'topic_tags'
})

event_log

Unnamed: 0,pseudo_id,date,event_tag,verslagen_report_tags,verslagen_report_start_date,topic_tags
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-26 15:06:00,Oraal_Stuk__Tablet_Oraal__Tablet_Oraal_Stuk__K...,Klinische Brief,2020-11-26 15:06:00,[Topic 10]
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-26 09:53:00,Rectaal_Bloedverlie__Aanvullen_Onderzoek__Over...,"Consult, Kliniek: vervolgconsult",2020-11-26 09:53:00,[Topic 15]
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-25 14:13:00,Outlier,Poliklinische Brief,2020-11-25 14:13:00,[Topic -1]
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-25 13:48:00,Outlier,Poliklinische Brief,2020-11-25 13:48:00,[Topic -1]
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-25 08:47:00,Rectaal_Bloedverlie__Aanvullen_Onderzoek__Over...,"Consult, Kliniek: vervolgconsult",2020-11-25 08:47:00,[Topic 15]
...,...,...,...,...,...,...
9572,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,2015-03-20 08:13:00,Outlier,Consult,2015-03-20 08:13:00,[Topic -1]
9573,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,2015-01-14 15:39:00,Rectaal_Bloedverlie__Aanvullen_Onderzoek__Over...,"Consult, Kliniek: vervolgconsult",2015-01-14 15:39:00,[Topic 15]
9574,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,2014-12-21 09:31:00,Outlier,"Consult, SEH",2014-12-21 09:31:00,[Topic -1]
9575,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,2010-11-10 21:03:00,Medisch_Dossier__Naam_Functie__Dossier_Vk_Sput...,"Consult, SEH",2010-11-10 21:03:00,[Topic 11]


In [None]:
event_log.to_csv("event_log_test.csv", index=False)


In [None]:
data_df_2to3[['pseudo_id', 'tags_20_5']]  
data_df_2to3[data_df_2to3['tags_20_5'].apply(lambda tags: len(tags) == 2)]


Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text,topic_20_5,tags_20_5,keywords_20_5,event_tags,event_tag


In [None]:
data_df_2to3.head(5)

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text,topic_20_5,tags_20_5,keywords_20_5,event_tags,event_tag
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",2020-11-26 15:06:00,2020-11-26 15:06:00,dhr aj dingemans huisarts streetnaam city datu...,"[aj, dingemans, streetnaam, Kenmerk, patientid...",aj dingemans streetnaam Kenmerk patientid betr...,0,[Topic 0],"[oraal stuk, tablet oraal, tablet oraal stuk, ...",[Unclear],oraal stuk__tablet oraal__tablet oraal stuk
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies obv diver...,2020-11-26 09:53:00,2020-11-26 09:53:00,samenvatting rectaal bloedverlie obvn divertik...,"[rectaal, bloedverlie, obvn, divertikelbloedin...",rectaal bloedverlie obvn divertikelbloeding ac...,0,[Topic 0],"[oraal stuk, tablet oraal, tablet oraal stuk, ...",[Unclear],oraal stuk__tablet oraal__tablet oraal stuk
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,2020-11-25 14:13:00,2020-11-25 14:13:00,coloscopie betreffen mw initials lastname adre...,"[coloscopie, betreffen, initials, adresgegeven...",coloscopie betreffen initials adresgegevens st...,0,[Topic 0],"[oraal stuk, tablet oraal, tablet oraal stuk, ...",[Unclear],oraal stuk__tablet oraal__tablet oraal stuk
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,2020-11-25 13:48:00,2020-11-25 13:48:00,gastroscopie betreffen mw initials lastname ad...,"[gastroscopie, betreffen, initials, adresgegev...",gastroscopie betreffen initials adresgegevens ...,0,[Topic 0],"[oraal stuk, tablet oraal, tablet oraal stuk, ...",[Unclear],oraal stuk__tablet oraal__tablet oraal stuk
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies ; eenmali...,2020-11-25 08:47:00,2020-11-25 08:47:00,samenvatting rectaal bloedverlie eenmalig hd h...,"[rectaal, bloedverlie, eenmalig, hd, hbstabiel...",rectaal bloedverlie eenmalig hd hbstabiel inr ...,0,[Topic 0],"[oraal stuk, tablet oraal, tablet oraal stuk, ...",[Unclear],oraal stuk__tablet oraal__tablet oraal stuk
