In [1]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

from sklearn.feature_extraction.text import CountVectorizer
import plotly.graph_objects as go
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import ast

from nltk.corpus import stopwords
dutch_stopwords = stopwords.words('dutch')

import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# def create_bertopic_model(n_topics, min_topic_size=10):
#     embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
#     umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
#     hdbscan_model = HDBSCAN(min_cluster_size=min_topic_size, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
#     # vectorizer_model = CountVectorizer(stop_words=dutch_stopwords)
#     vectorizer_model = CountVectorizer(ngram_range=(1,2), stop_words=dutch_stopwords)
#     # vectorizer_model = CountVectorizer(stop_words='english')
    
#     topic_model = BERTopic(
#         embedding_model=embedding_model,
#         umap_model=umap_model,
#         hdbscan_model=hdbscan_model,
#         vectorizer_model=vectorizer_model,
#         nr_topics=n_topics,
#         verbose=True,
#         calculate_probabilities=True
#     )
    
#     return topic_model

def create_bertopic_model(n_topics, min_topic_size=10, ngram_range=(1, 1)):
    embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=min_topic_size, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    vectorizer_model = CountVectorizer(stop_words=dutch_stopwords, ngram_range=ngram_range)

    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        nr_topics=n_topics,
        verbose=True
    )
    return topic_model



In [None]:
# def calculate_coherence_score(texts, topic_words):
#     # Convert each document to a list if it's a string representation of a list
#     texts = [ast.literal_eval(doc) if isinstance(doc, str) else doc for doc in texts]
    
#     dictionary = Dictionary(texts)

#     coherence_model = CoherenceModel(
#         topics=topic_words,  
#         texts=texts,
#         dictionary=dictionary,
#         coherence='c_v'
#     )

#     return coherence_model.get_coherence()


def calculate_coherence_score(texts, topic_words, ngram_range=(1, 1)):
    # Zorg dat tokens correct zijn
    texts = [ast.literal_eval(doc) if isinstance(doc, str) else doc for doc in texts]

    # Maak n-gram strings van tokens
    joined_texts = [" ".join(doc) for doc in texts]
    vectorizer = CountVectorizer(ngram_range=ngram_range).fit(joined_texts)

    # Maak per document lijst met gegenereerde n-grams
    ngram_texts = [list(ngrams) for ngrams in vectorizer.inverse_transform(vectorizer.transform(joined_texts))]

    # Maak Gensim dictionary en coherence model
    dictionary = Dictionary(ngram_texts)
    coherence_model = CoherenceModel(
        topics=topic_words,
        texts=ngram_texts,
        dictionary=dictionary,
        coherence='c_v'
    )
    return coherence_model.get_coherence()



In [4]:
# import plotly.graph_objects as go
# import pandas as pd

# def analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[30], n_words_list=[5]):
#     results = {}
#     summary_data = []

#     coherence_fig = go.Figure()
    
#     for n_topics in n_topics_list:
#         coherence_scores = []
        
#         for n_words in n_words_list:
#             print(f"\n Analyzing with {n_topics} topics and top {n_words} words...")

#             # Create and fit the BERTopic model
#             topic_model = create_bertopic_model(n_topics)
#             # topics, probs = topic_model.fit_transform(texts, embeddings, calculate_probabilities=True)
#             # Fit model
#             topics = topic_model.fit(texts, embeddings)
#             # Get topic probabilities for each doc
#             topics, probs = topic_model.transform(texts)



#             topic_info = topic_model.get_topic_info()


#             # Build readable labels
#             topic_labels = {}
#             for topic_id, words in topic_words.items():
#                 label = "_".join(words[:5])  # Or however many keywords you want
#                 topic_labels[topic_id] = label

#             # Apply it to tags
#             doc_topic_labels = []
#             threshold = 0.1
#             for prob in probs:
#                 if prob is None:
#                     doc_topic_labels.append(["Outlier"])
#                 else:
#                     tags = [topic_labels[i] for i, p in enumerate(prob) if p > threshold]
#                     doc_topic_labels.append(tags if tags else ["Unclear"])


#             # Extract keywords per topic
#             topic_word_list = []
#             topic_words = {}

#             for topic in range(len(set(topics)) - 1):
#                 words = topic_model.get_topic(topic)[:n_words]
#                 topic_word_list.append([word for word, _ in words])
#                 topic_words[topic] = [word for word, _ in words]

#             # Calculate coherence
#             coherence = calculate_coherence_score(data['tokens'].tolist(), topic_word_list)
#             print(f" Coherence Score (C_v): {coherence:.4f}")
#             coherence_scores.append(coherence)

#             # Save results
#             results[(n_topics, n_words)] = {
#                 'model': topic_model,
#                 'topics': topics,
#                 'topic_info': topic_info,
#                 'topic_words': topic_words,
#                 'coherence': coherence
#             }

#             # Save interactive visualizations
#             topic_model.visualize_topics().write_html(f'topic_visualization_{n_topics}_{n_words}.html')
#             topic_model.visualize_heatmap().write_html(f'topic_heatmap_{n_topics}_{n_words}.html')

#             # Append summary data
#             summary_data.append({
#                 'n_topics': n_topics,
#                 'n_words': n_words,
#                 'coherence': coherence
#             })

#         # Add to coherence plot
#         coherence_fig.add_trace(go.Scatter(
#             x=n_words_list,
#             y=coherence_scores,
#             mode='lines+markers',
#             name=f'{n_topics} topics'
#         ))

#     coherence_fig.update_layout(
#         title='Coherence Scores across Different Configurations',
#         xaxis_title='Number of Words per Topic',
#         yaxis_title='Coherence Score (C_v)',
#         showlegend=True
#     )
#     coherence_fig.write_html('coherence_scores.html')

#     results[(n_topics, n_words)]['doc_tags'] = doc_topic_tags

#     # Create and print summary
#     grid_summary = pd.DataFrame(summary_data).sort_values(by='coherence', ascending=False)
#     print("\n Top Configurations by Coherence:")
#     print(grid_summary.head(3).to_string(index=False))

#     data[f'topic_tags_{n_topics}_{n_words}'] = doc_topic_tags
#     data[f'topic_main_{n_topics}_{n_words}'] = topics


#     return results, grid_summary, topic_model, data


In [5]:
# def analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[30], n_words_list=[5], ngram_range=(1, 1)):
#     import plotly.graph_objects as go
#     import pandas as pd
#     from tqdm import tqdm

#     results = {}
#     summary_data = []
#     all_doc_tags = {}  # Store tags per (n_topics, n_words) combination

#     coherence_fig = go.Figure()

#     for n_topics in n_topics_list:
#         coherence_scores = []

#         for n_words in n_words_list:
#             print(f"\nAnalyzing with {n_topics} topics and top {n_words} words...")

#             # Step 1: Create & fit model
#             topic_model = create_bertopic_model(n_topics, ngram_range=ngram_range)
#             topic_model.fit(texts, embeddings)
#             topics, probs = topic_model.transform(texts)
#             print(f"Topics: {topics}")


#             topic_info = topic_model.get_topic_info()

#             # Step 2: Extract keywords per topic
#             topic_word_list = []
#             topic_words = {}

#             for topic in range(len(set(topics)) - 1):
#                 words = topic_model.get_topic(topic)[:n_words]
#                 word_list = [word for word, _ in words]
#                 topic_word_list.append(word_list)
#                 print('topic_words:', word_list)
#                 topic_words[topic] = word_list

#             # Step 3: Calculate coherence
#             coherence = calculate_coherence_score(data['tokens'].tolist(), topic_word_list)
#             print(f"Coherence Score (C_v): {coherence:.4f}")
#             coherence_scores.append(coherence)

#             # Step 4: Assign per-document tags based on topic probabilities
#             threshold = 0.1
#             doc_tags = []
#             for prob in probs:
#                 print(f"Probabilities: {prob}")
#                 if prob is None:
#                     doc_tags.append(["Outlier"])
#                     print("Outlier detected")
#                 else:
#                     tags = [f"Topic {i}" for i, p in enumerate(prob) if p > threshold]
#                     doc_tags.append(tags if tags else ["Unclear"])
#                     print(f"Tags: {tags}")

#             all_doc_tags[(n_topics, n_words)] = doc_tags

#             # Step 5: Save results
#             results[(n_topics, n_words)] = {
#                 'model': topic_model,
#                 'topics': topics,
#                 'probs': probs,
#                 'topic_info': topic_info,
#                 'topic_words': topic_words,
#                 'coherence': coherence,
#                 'doc_tags': doc_tags
#             }

#             topic_model.visualize_topics().write_html(f'topic_visualization_{n_topics}_{n_words}.html')
#             topic_model.visualize_heatmap().write_html(f'topic_heatmap_{n_topics}_{n_words}.html')

#             summary_data.append({
#                 'n_topics': n_topics,
#                 'n_words': n_words,
#                 'coherence': coherence
#             })

#         # Plot coherence line
#         coherence_fig.add_trace(go.Scatter(
#             x=n_words_list,
#             y=coherence_scores,
#             mode='lines+markers',
#             name=f'{n_topics} topics'
#         ))

#     coherence_fig.update_layout(
#         title='Coherence Scores across Topic Configs',
#         xaxis_title='Top Words per Topic',
#         yaxis_title='Coherence Score (C_v)',
#         showlegend=True
#     )
#     coherence_fig.write_html('coherence_scores.html')

#     grid_summary = pd.DataFrame(summary_data).sort_values(by='coherence', ascending=False)
#     print("\n🏆 Top Configurations by Coherence:")
#     print(grid_summary.head(3).to_string(index=False))

#     # Final output: tags for highest-coherence config
#     best_config = grid_summary.iloc[0][['n_topics', 'n_words']].astype(int).tolist()
#     best_tags = all_doc_tags[tuple(best_config)]
#     data_with_tags = data.copy()
#     data_with_tags['event_tags'] = best_tags

#     return results, grid_summary, topic_model, data_with_tags


In [6]:
def analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[20], n_words_list=[5], ngram_range=(1, 2), calculate_coherence=True):
    results = {}
    summary_data = []
    all_doc_tags = {}
    data_df = data.copy()

    for n_topics in n_topics_list:
        for n_words in n_words_list:
            print(f"\nAnalyzing with {n_topics} topics, {n_words} words, ngram_range={ngram_range}")

            topic_model = create_bertopic_model(n_topics, ngram_range=ngram_range)
            topics, probs = topic_model.fit_transform(texts, embeddings)
            topic_info = topic_model.get_topic_info()

            # Verwerk keywords per topic
            topic_words = {}
            for topic in set(topics):
                if topic != -1:
                    words = topic_model.get_topic(topic)[:n_words]
                    topic_words[topic] = [word for word, _ in words]

            # Coherence berekenen
            topic_word_list = list(topic_words.values())
            if calculate_coherence == True:
                # coherence = calculate_coherence_score(data['tokens'].tolist(), topic_word_list)
                print("Calculating coherence score...")
                coherence = calculate_coherence_score(data['tokens'].tolist(), topic_word_list, ngram_range=ngram_range)
                print(f"Coherence Score: {coherence:.4f}")
            else:
                print("Skipping coherence calculation...")
                coherence = None
                print("Coherence Score (C_v):", coherence)
                       

            # Per document: top topics (≥ threshold)
            doc_tags = []
            threshold = 0.1
            for i, topic in enumerate(topics):
                if probs is None or isinstance(probs[i], float):
                    # fallback als probs None of 1D zijn
                    tags = [f"Topic {topic}"]
                else:
                    tags = [f"Topic {j}" for j, p in enumerate(probs[i]) if p > threshold]
                doc_tags.append(tags if tags else ["Unclear"])

            # Per document: keywords van toegekende topic (voor extra inzicht)
            topic_keywords_per_doc = [
                topic_words.get(t, []) if t in topic_words else [] for t in topics
            ]

            # Update dataframe
            data_df[f'topic_{n_topics}_{n_words}'] = topics
            data_df[f'tags_{n_topics}_{n_words}'] = doc_tags
            data_df[f'keywords_{n_topics}_{n_words}'] = topic_keywords_per_doc

            # # Opslaan
            # results[(n_topics, n_words)] = {
            #     'model': topic_model,
            #     'topics': topics,
            #     'probs': probs,
            #     'topic_words': topic_words,
            #     'coherence': coherence
            # }

            results[(n_topics, n_words, ngram_range)] = {
                'model': topic_model,
                'topics': topics,
                'probs': probs,
                'topic_words': topic_words,
                'coherence': coherence,
                'topic_info': topic_info
            }


            summary_data.append({
                'n_topics': n_topics,
                'n_words': n_words,
                'ngram_range': str(ngram_range),
                'coherence': coherence
            })


    grid_summary = pd.DataFrame(summary_data).sort_values(by='coherence', ascending=False)
    return results, grid_summary, topic_model, data_df


In [7]:
def print_results_summary(results, texts, save_to_file=True, filename=None):
    import datetime

    output_lines = []

    # for (n_topics, n_words), result in results.items():
    for key, result in results.items():
        print('key:', key)
        if isinstance(key, tuple) and len(key) == 3:
            n_topics, n_words, ngram_range = key
        else:
            n_topics, n_words = key
            ngram_range = "(1, 1)"

        # output_lines.append(f"\n=== Results for {n_topics} topics with {n_words} words ===")
        output_lines.append(f"\n=== Results for {n_topics} topics with {n_words} words (ngram_range={ngram_range}) ===")
        # output_lines.append(f"Coherence Score: {result['coherence']:.4f}")
        coherence = result.get('coherence')
        if coherence is not None:
            output_lines.append(f"Coherence Score: {coherence:.4f}")
        else:
            output_lines.append("Coherence Score: Not available")

        
        output_lines.append("\nTopics and their key words:")
        topic_info = result['topic_info']
        for _, row in topic_info.iterrows():
            topic_num = row['Topic']
            if topic_num != -1:
                words = row['Representation']
                output_lines.append(f"Topic {topic_num}: {', '.join(words)}")

        topic_sizes = topic_info[topic_info['Topic'] != -1]['Count'].tolist()
        output_lines.append("\nTopic sizes: " + str(topic_sizes))
        output_lines.append("\n" + "="*50)

    full_output = "\n".join(output_lines)

    # Print to console
    print(full_output)

    # Optionally save to file
    if save_to_file:
        if not filename:
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"topic_results_summary_{timestamp}.txt"
        
        with open(filename, "w", encoding="utf-8") as f:
            f.write(full_output)
        print(f"\n📁 Summary saved to: {filename}")


In [8]:
def fix_character_tokens(char_list):
    text = ''.join(char_list)
    tokens = text.split()  # crude but often works
    return tokens

def flatten_nested_char_lists(nested_list):
    return [''.join(token) for token in nested_list if isinstance(token, list)]
    # return [token for token in nested_list if token != []]




In [None]:
import ast

# Load your data
data = pd.read_csv('a:/df_cleaned.csv')

# Parse the stringified lists into real Python lists
data['tokens'] = data['tokens'].apply(lambda x: ast.literal_eval(x))


remove_list = [
        # Units, measurements, filler symbols
        'mg', 'mmoll', 'mmolL', 'x', 'per', 'dag', 'min', 'uur', 'ml', 'eenhed',

        # Admin & structure
        'samenvatting', 'memo', 'beleid', 'conclusie', 'aanvullend', 'afgewerken', 
        'opdracht', 'opdrachten', 'rapportage', 'diversen', 'contact', 'afspraak', 
        'tijd', 'tijdsduur', 'datum', 'poli', 'recept', 'gefaxt', 'bellen', 
        'akkoord', 'nodig', 'bekende', 'bekend', 'memo', 'scorelijzen', 'naslag',

        # Identifying or privacy-sensitive terms
        'bsn', 'city', 'postcode', 'firstname', 'lastname', 'streetname', 'phonenumber', 'voicemail', 
        'mw', 'dhr', 'mevrouw', 'meneer', 'zoon', 'mevr', 'mvr', 'dhr'
        'inge', 'valkenburg', 'peter', 'miriam', 'debby', 'eliane',

        # Clinical history / often uninformative by itself
        'anamnese', 'anamnees', 'voorgeschiedenis', 'huisarts', 
        'hoofdbehandelaar',

        # Temporal or ambiguous
        'sinds', 'dagen', 'weken', 'maanden', 'week', 'avond', 'nacht', 'ochtend', 'middag', 
        'extra', 'stop', 'gehad', 'gezien', 'zien', 'dd', 
        'ivm', 'links', 'rechts', 'linker', 'dr', 'overige', 'algemeen', 'patiënt', 'patiënte',

        # Admin/communication
        'verzoek', 'mail', 'mailen', 'verstuuren', 'brief', 'uitinen', 'ak', 'bespreeklijst', 
        'wijzigingopmerking',
        'voicemail', 'telefonisch', 'mobiel', 'ingesproek', 'aanleiding', 'telefoon', 'email', 'bereiken', 'svp', 'contactpersoon'
        'terugbellen', 'gemaild', 'insproken', 'voicemail',

        # Unclear / possibly noise
        'eenhed', 'aangeeft', 'scorelijzen', 'inten', 'intn', 'vb', 'sub', 

        # extra         
        'regelen', 'opmerking', 'bespreeklijst', 'sehperiode', 'bedrijf', 
        'ivb', 'mtps', 'cp', 'pat', 'huisadres', 'gg', 
        'medewerker', 'medewerk', 'laboratorium', 'apotheek', 'maand', 'tc', 
        'wonen', 'gezondheidsinstelling', 'leven', 'varken', 'soms', 'jaar', 'mgdag', 'lateraal',
        'bespreking', 'wondfoto', 'cze',         
        
        'vrijdag', 'maandag', 'donderdag', 'woensdag', 'zaterdag', 'zondag',      
        'juli', 'augustus', 'september', 'oktober', 'november', 'december', 'januari', 'februari', 'maart', 'april', 'mei', 'jun',
        
        
        # 'voltooid', 'verdenking', 'waarvoor', 'reden', 'waarschijnlijk', 'mogelijk', 'stuk', 'basisdosering', 'probleem', 'probleemlijst', 'actie',
        # 'nee', 'ja',  'arts',  'radiologie', 'internist', 'evaluatie', 'intake', 'controle',


        ]

data['tokens'] = data['tokens'].apply(lambda tokens: [word for word in tokens if word not in remove_list])

# Remove double words that are next to each other
for i in range(len(data)):
    tokens = data['tokens'].loc[i]
    print("tokens list length:", len(tokens))
    new_tokens = []
    for j in range(len(tokens) - 1):
        if tokens[j] == tokens[j + 1]:
            print("Duplicate token found:", tokens[j], 'and', tokens[j + 1])
        else:
            new_tokens.append(tokens[j])
    # Append the last token since it won't be checked in the loop
    if tokens:
        new_tokens.append(tokens[-1])
    data['tokens'].loc[i] = new_tokens
    print("New tokens list length:", len(new_tokens))

# Remove empty documents
data = data[data['tokens'].apply(len) > 0]

# Convert token lists to strings for BERTopic
data['text'] = data['tokens'].apply(lambda tokens: ' '.join(tokens))
texts = data['text'].tolist()

# Debug prints
print("First 3 token lists:", data['tokens'].head(3).tolist())
print("First 3 texts:", data['text'].head(3).tolist())
print("Total documents after filtering:", len(texts))


tokens list length: 357
Duplicate token found: bloedgroep and bloedgroep
Duplicate token found: coloscopie and coloscopie
Duplicate token found: acenocoumarol and acenocoumarol
New tokens list length: 354
tokens list length: 89
New tokens list length: 89
tokens list length: 127
Duplicate token found: Fentanyl and Fentanyl
New tokens list length: 126
tokens list length: 71
Duplicate token found: bloed and bloed
New tokens list length: 70
tokens list length: 118
Duplicate token found: coloscopie and coloscopie
New tokens list length: 117
tokens list length: 36
New tokens list length: 36
tokens list length: 305
New tokens list length: 305
tokens list length: 275
New tokens list length: 275
tokens list length: 60
Duplicate token found: visite and visite
New tokens list length: 59
tokens list length: 34
Duplicate token found: visite and visite
New tokens list length: 33
tokens list length: 64
Duplicate token found: duodenum and duodenum
New tokens list length: 63
tokens list length: 27
New 

In [10]:
# len(data['tokens'].loc[0]) - 1

In [11]:
# def dedup(text):
#     return re.sub(r'\b(\w+)(\s+\1)+\b', r'\1', text)

# texts = [dedup(t) for t in texts]
# texts 

In [12]:
# Load embeddings

### model7 is from before removing duplicates
### model8 is with removal of duplicates based on words that are next to each other

embeddings = np.load('embeddings_model8.npy')
# embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# embeddings = embedding_model.encode(texts, show_progress_bar=True)
# np.save('embeddings_model8.npy', embeddings)

# print("embeddings:", embeddings)
print("Embeddings shape:", embeddings.shape)



Embeddings shape: (9493, 384)


In [13]:
# Analyze with different numbers of topics and words
# results, grid_summary, topic_model, data_df = analyze_topics_with_sentiment(texts, embeddings, data)


In [14]:
results_1gram2gram, grid_summary1, topic_model1, data_df1 = analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[10, 20, 30], n_words_list=[5, 10], ngram_range=(1, 2))



Analyzing with 10 topics, 5 words, ngram_range=(1, 2)


2025-04-22 22:19:40,891 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:20:22,525 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:20:22,525 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:20:22,904 - BERTopic - Cluster - Completed ✓
2025-04-22 22:20:22,904 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:20:25,983 - BERTopic - Representation - Completed ✓
2025-04-22 22:20:25,999 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:20:26,044 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:20:29,014 - BERTopic - Representation - Completed ✓
2025-04-22 22:20:29,017 - BERTopic - Topic reduction - Reduced number of topics from 249 to 10


Calculating coherence score...
Coherence Score: 0.7011

Analyzing with 10 topics, 10 words, ngram_range=(1, 2)


2025-04-22 22:21:26,365 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:21:45,466 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:21:45,466 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:21:45,845 - BERTopic - Cluster - Completed ✓
2025-04-22 22:21:45,845 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:21:49,117 - BERTopic - Representation - Completed ✓
2025-04-22 22:21:49,117 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:21:49,173 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:21:52,435 - BERTopic - Representation - Completed ✓
2025-04-22 22:21:52,451 - BERTopic - Topic reduction - Reduced number of topics from 249 to 10


Calculating coherence score...
Coherence Score: 0.6078

Analyzing with 20 topics, 5 words, ngram_range=(1, 2)


2025-04-22 22:22:42,977 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:23:02,415 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:23:02,415 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:23:02,815 - BERTopic - Cluster - Completed ✓
2025-04-22 22:23:02,816 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:23:06,064 - BERTopic - Representation - Completed ✓
2025-04-22 22:23:06,064 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:23:06,120 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:23:09,282 - BERTopic - Representation - Completed ✓
2025-04-22 22:23:09,284 - BERTopic - Topic reduction - Reduced number of topics from 249 to 20


Calculating coherence score...
Coherence Score: 0.7959

Analyzing with 20 topics, 10 words, ngram_range=(1, 2)


2025-04-22 22:23:59,295 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:24:17,695 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:24:17,695 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:24:18,050 - BERTopic - Cluster - Completed ✓
2025-04-22 22:24:18,050 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:24:21,179 - BERTopic - Representation - Completed ✓
2025-04-22 22:24:21,195 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:24:21,238 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:24:24,262 - BERTopic - Representation - Completed ✓
2025-04-22 22:24:24,279 - BERTopic - Topic reduction - Reduced number of topics from 249 to 20


Calculating coherence score...
Coherence Score: 0.6770

Analyzing with 30 topics, 5 words, ngram_range=(1, 2)


2025-04-22 22:25:14,080 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:25:31,863 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:25:31,863 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:25:32,250 - BERTopic - Cluster - Completed ✓
2025-04-22 22:25:32,250 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:25:35,363 - BERTopic - Representation - Completed ✓
2025-04-22 22:25:35,382 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:25:35,418 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:25:38,468 - BERTopic - Representation - Completed ✓
2025-04-22 22:25:38,480 - BERTopic - Topic reduction - Reduced number of topics from 249 to 30


Calculating coherence score...
Coherence Score: 0.7571

Analyzing with 30 topics, 10 words, ngram_range=(1, 2)


2025-04-22 22:26:27,481 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:26:45,281 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:26:45,281 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:26:45,647 - BERTopic - Cluster - Completed ✓
2025-04-22 22:26:45,647 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:26:48,996 - BERTopic - Representation - Completed ✓
2025-04-22 22:26:48,997 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:26:49,045 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:26:52,247 - BERTopic - Representation - Completed ✓
2025-04-22 22:26:52,248 - BERTopic - Topic reduction - Reduced number of topics from 249 to 30


Calculating coherence score...
Coherence Score: 0.6096


In [15]:
for topic_id in topic_model1.get_topics():
    print(f"Topic {topic_id}: {[w for w, _ in topic_model1.get_topic(topic_id)[:5]]}")

topic_df = pd.DataFrame(topic_model1.get_topic_info())
display(topic_df)


Topic -1: ['onderzoek', 'hb', 'bloedverlie', 'anemie', 'controle']
Topic 0: ['oraal', 'stuk', 'oraal stuk', 'tablet', 'onderzoek']
Topic 1: ['anemie', 'waarvoor', 'starten', 'volgen', 'obinutuzumab']
Topic 2: ['diabete', 'diabetisch', 'diabete mellitus', 'mellitus', 'voet']
Topic 3: ['anemie', 'waarvoor', 'hypertensie', 'trombocytose', 'goed']
Topic 4: ['diabete lunch', 'lunch', 'lunch slapen', 'basisdosering', 'slapen']
Topic 5: ['echo', 'atriumfibrilleren', 'rivaroxaban', 'controle', 'hypertensie']
Topic 6: ['controle', 'decursus', 'uitvoeren controle', 'decursus podo', 'podo decursus']
Topic 7: ['peggy', 'innen peggy', 'innen', 'controle', 'tp']
Topic 8: ['lab', 'diana', 'labbon', 'tevoren', 'opsturen']
Topic 9: ['osteoporose', 'denosumab', 'fractuur', 'injectie', 'bisfosfonaat']
Topic 10: ['beloop verpleegkundig', 'verpleegkundig', 'beloop', 'ct', 'meenemen']
Topic 11: ['hyperparathyreoïdie', 'intern', 'intern geneeskun', 'geneeskun', 'secundair hyperparathyreoïdie']
Topic 12: ['aa

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1674,-1_onderzoek_hb_bloedverlie_anemie,"[onderzoek, hb, bloedverlie, anemie, controle,...",[weledelgeleer heer drs anj daaem ve kenmerk p...
1,0,4166,0_oraal_stuk_oraal stuk_tablet,"[oraal, stuk, oraal stuk, tablet, onderzoek, a...",[reden komst verwijzing reden komst rectaal bl...
2,1,808,1_anemie_waarvoor_starten_volgen,"[anemie, waarvoor, starten, volgen, obinutuzum...",[carpaal tunnel syndroom mammacarcinoom profyl...
3,2,656,2_diabete_diabetisch_diabete mellitus_mellitus,"[diabete, diabetisch, diabete mellitus, mellit...",[diabete mellitus type hypertensie nierfunctie...
4,3,267,3_anemie_waarvoor_hypertensie_trombocytose,"[anemie, waarvoor, hypertensie, trombocytose, ...",[hypertensie ptca rcx acuut onderwandinfarct t...
5,4,206,4_diabete lunch_lunch_lunch slapen_basisdosering,"[diabete lunch, lunch, lunch slapen, basisdose...","[diabete lunch slapen basisdosering, diabete l..."
6,5,198,5_echo_atriumfibrilleren_rivaroxaban_controle,"[echo, atriumfibrilleren, rivaroxaban, control...",[hypertensie struma neusseptumcorrectie blaasr...
7,6,191,6_controle_decursus_uitvoeren controle_decursu...,"[controle, decursus, uitvoeren controle, decur...","[decursus podo Decursus uitvoeren controle, de..."
8,7,188,7_peggy_innen peggy_innen_controle,"[peggy, innen peggy, innen, controle, tp, krij...","[controle innen peggy, tp controle innen peggy..."
9,8,163,8_lab_diana_labbon_tevoren,"[lab, diana, labbon, tevoren, opsturen, tevore...","[lab, lab, lab]"


In [16]:
# Print detailed results summary
print_results_summary(results_1gram2gram, texts)

key: (10, 5, (1, 2))
key: (10, 10, (1, 2))
key: (20, 5, (1, 2))
key: (20, 10, (1, 2))
key: (30, 5, (1, 2))
key: (30, 10, (1, 2))

=== Results for 10 topics with 5 words (ngram_range=(1, 2)) ===
Coherence Score: 0.7011

Topics and their key words:
Topic 0: oraal, stuk, onderzoek, anemie, oraal stuk, tablet, opname, waarvoor, goed, hb
Topic 1: controle, aantal, aantal controle, decursus, uitvoeren controle, podo decursus, decursus podo, podo, decursus uitvoeren, uitvoeren
Topic 2: lab, controle, beloop, hb, overig actie, actie, transferrin, overig, ijzer, recidiveren
Topic 3: diabete lunch, lunch, lunch slapen, slapen, basisdosering, slapen basisdosering, basisdosering diabete, diabete, slapen diabete, sn
Topic 4: beloop verpleegkundig, verpleegkundig, bespreeklijst trauma, trauma radiologie, radiologie seh, bespreeklijst, beloop, radiologie, trauma, seh
Topic 5: vocht toediening, toediening soort, soort infuus, soort, infuus nacl, vocht, toediening, nacl vocht, nacl, infuus
Topic 6: sep

In [17]:
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"topic_model_results_ngram1_2_{timestamp}.csv"
grid_summary1.to_csv(filename, index=False)
print(f"📁 Saved topic modeling summary to '{filename}'")

📁 Saved topic modeling summary to 'topic_model_results_ngram1_2_20250422_222739.csv'


In [18]:
results_test, grid_summary_test, topic_model_test, data_df_test = analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[10, 20, 30], n_words_list=[5, 10], ngram_range=(2, 3))



Analyzing with 10 topics, 5 words, ngram_range=(2, 3)


2025-04-22 22:27:42,362 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:28:00,132 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:28:00,147 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:28:00,517 - BERTopic - Cluster - Completed ✓
2025-04-22 22:28:00,517 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:28:06,568 - BERTopic - Representation - Completed ✓
2025-04-22 22:28:06,583 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:28:06,636 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:28:12,021 - BERTopic - Representation - Completed ✓
2025-04-22 22:28:12,043 - BERTopic - Topic reduction - Reduced number of topics from 249 to 10


Calculating coherence score...
Coherence Score: 0.8063

Analyzing with 10 topics, 10 words, ngram_range=(2, 3)


2025-04-22 22:29:34,468 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:29:52,467 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:29:52,467 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:29:52,900 - BERTopic - Cluster - Completed ✓
2025-04-22 22:29:52,900 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:29:58,784 - BERTopic - Representation - Completed ✓
2025-04-22 22:29:58,800 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:29:58,866 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:30:04,434 - BERTopic - Representation - Completed ✓
2025-04-22 22:30:04,456 - BERTopic - Topic reduction - Reduced number of topics from 249 to 10


Calculating coherence score...
Coherence Score: 0.8114

Analyzing with 20 topics, 5 words, ngram_range=(2, 3)


2025-04-22 22:31:24,091 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:31:42,039 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:31:42,041 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:31:42,408 - BERTopic - Cluster - Completed ✓
2025-04-22 22:31:42,408 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:31:48,454 - BERTopic - Representation - Completed ✓
2025-04-22 22:31:48,473 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:31:48,528 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:31:54,088 - BERTopic - Representation - Completed ✓
2025-04-22 22:31:54,108 - BERTopic - Topic reduction - Reduced number of topics from 249 to 20


Calculating coherence score...
Coherence Score: 0.8411

Analyzing with 20 topics, 10 words, ngram_range=(2, 3)


2025-04-22 22:33:14,871 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:33:32,744 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:33:32,744 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:33:33,119 - BERTopic - Cluster - Completed ✓
2025-04-22 22:33:33,135 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:33:39,205 - BERTopic - Representation - Completed ✓
2025-04-22 22:33:39,227 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:33:39,294 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:33:44,896 - BERTopic - Representation - Completed ✓
2025-04-22 22:33:44,932 - BERTopic - Topic reduction - Reduced number of topics from 249 to 20


Calculating coherence score...
Coherence Score: 0.7764

Analyzing with 30 topics, 5 words, ngram_range=(2, 3)


2025-04-22 22:35:04,788 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:35:22,778 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:35:22,779 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:35:23,156 - BERTopic - Cluster - Completed ✓
2025-04-22 22:35:23,156 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:35:29,392 - BERTopic - Representation - Completed ✓
2025-04-22 22:35:29,395 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:35:29,460 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:35:35,240 - BERTopic - Representation - Completed ✓
2025-04-22 22:35:35,263 - BERTopic - Topic reduction - Reduced number of topics from 249 to 30


Calculating coherence score...
Coherence Score: 0.7531

Analyzing with 30 topics, 10 words, ngram_range=(2, 3)


2025-04-22 22:36:58,298 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:37:16,390 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:37:16,390 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:37:16,790 - BERTopic - Cluster - Completed ✓
2025-04-22 22:37:16,791 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:37:23,036 - BERTopic - Representation - Completed ✓
2025-04-22 22:37:23,054 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:37:23,100 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:37:29,096 - BERTopic - Representation - Completed ✓
2025-04-22 22:37:29,115 - BERTopic - Topic reduction - Reduced number of topics from 249 to 30


Calculating coherence score...
Coherence Score: 0.7017


In [19]:
for topic_id in topic_model_test.get_topics():
    print(f"Topic {topic_id}: {[w for w, _ in topic_model_test.get_topic(topic_id)[:5]]}")

topic_df_test = pd.DataFrame(topic_model_test.get_topic_info())
display(topic_df_test)

Topic -1: ['lichamelijk onderzoek', 'oraal stuk', 'aanvullen onderzoek', 'rectaal bloedverlie', 'reden komst']
Topic 0: ['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'aanvullen onderzoek', 'lichamelijk onderzoek']
Topic 1: ['progressie cll', 'exacerbatie copd', 'opname exacerbatie copd', 'opname exacerbatie', 'graad ii']
Topic 2: ['diabete mellitus', 'mellitus type', 'diabete mellitus type', 'diabetisch retinopathie', 'diabete vpk']
Topic 3: ['goed lv functie', 'lv functie', 'essentiel trombocytose', 'ptca rcx', 'alcoholgebruik verminderde intake']
Topic 4: ['diabete lunch', 'diabete lunch slapen', 'lunch slapen', 'lunch slapen basisdosering', 'slapen basisdosering']
Topic 5: ['echo cor', 'antistoff vinden', 'matig ti', 'accepteren atriumfibrilleren', 'rate controle']
Topic 6: ['uitvoeren controle', 'decursus podo', 'podo decursus', 'decursus podo decursus', 'decursus uitvoeren']
Topic 7: ['innen peggy', 'krijgen remicade', 'tp controle', 'krijgen venofer', 'controle mirjam']
Top

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1674,-1_lichamelijk onderzoek_oraal stuk_aanvullen ...,"[lichamelijk onderzoek, oraal stuk, aanvullen ...",[reden komst verwijzing reden komst rectaal bl...
1,0,4166,0_oraal stuk_tablet oraal_tablet oraal stuk_aa...,"[oraal stuk, tablet oraal, tablet oraal stuk, ...",[weledelgeleer vrouwe drs mej oerlemans street...
2,1,808,1_progressie cll_exacerbatie copd_opname exace...,"[progressie cll, exacerbatie copd, opname exac...",[carpaal tunnel syndroom mammacarcinoom profyl...
3,2,656,2_diabete mellitus_mellitus type_diabete melli...,"[diabete mellitus, mellitus type, diabete mell...",[diabete mellitus type hypertensie nierfunctie...
4,3,267,3_goed lv functie_lv functie_essentiel tromboc...,"[goed lv functie, lv functie, essentiel trombo...",[hypertensie ptca rcx acuut onderwandinfarct t...
5,4,206,4_diabete lunch_diabete lunch slapen_lunch sla...,"[diabete lunch, diabete lunch slapen, lunch sl...","[diabete lunch slapen basisdosering, diabete l..."
6,5,198,5_echo cor_antistoff vinden_matig ti_acceptere...,"[echo cor, antistoff vinden, matig ti, accepte...",[hypertensie struma neusseptumcorrectie blaasr...
7,6,191,6_uitvoeren controle_decursus podo_podo decurs...,"[uitvoeren controle, decursus podo, podo decur...","[decursus podo Decursus uitvoeren controle, de..."
8,7,188,7_innen peggy_krijgen remicade_tp controle_kri...,"[innen peggy, krijgen remicade, tp controle, k...","[controle innen peggy, controle innen peggy, t..."
9,8,163,8_tevoren lab_lab diana_diana lab_lab urine,"[tevoren lab, lab diana, diana lab, lab urine,...",[vervolgafspraak staan plannen tevoren lab oz ...


In [20]:
# Print detailed results summary
print_results_summary(results_test, texts)

key: (10, 5, (2, 3))
key: (10, 10, (2, 3))
key: (20, 5, (2, 3))
key: (20, 10, (2, 3))
key: (30, 5, (2, 3))
key: (30, 10, (2, 3))

=== Results for 10 topics with 5 words (ngram_range=(2, 3)) ===
Coherence Score: 0.8063

Topics and their key words:
Topic 0: oraal stuk, tablet oraal, tablet oraal stuk, aanvullen onderzoek, lichamelijk onderzoek, int knoppen, mcv fl, rectaal bloedverlie, diabete mellitus, overig actie
Topic 1: aantal controle, uitvoeren controle, decursus podo decursus, podo decursus, decursus podo, podo decursus uitvoeren, decursus uitvoeren, decursus uitvoeren controle, controle decursus podo, controle decursus
Topic 2: overig actie, videocapsule bloed, melena gastroscopie focus, recidiveren melena gastroscopie, focus videocapsule, gastroscopie focus videocapsule, focus videocapsule bloed, vg recidiveren melaena, videocapsule bloed proximaal, recidiveren melaena angiodysplasie
Topic 3: diabete lunch, diabete lunch slapen, lunch slapen, lunch slapen basisdosering, slapen 

In [21]:
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"topic_model_results_test_2to3gram_{timestamp}.csv"
grid_summary_test.to_csv(filename, index=False)
print(f"📁 Saved topic modeling summary to '{filename}'")

📁 Saved topic modeling summary to 'topic_model_results_test_2to3gram_20250422_223858.csv'


In [22]:
results_1to3, grid_summary_1to3, topic_model_1to3, data_df_1to3 = analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[10, 20, 30], n_words_list=[5, 10], ngram_range=(1, 3))



Analyzing with 10 topics, 5 words, ngram_range=(1, 3)


2025-04-22 22:39:01,480 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:39:19,524 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:39:19,526 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:39:19,945 - BERTopic - Cluster - Completed ✓
2025-04-22 22:39:19,945 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:39:26,725 - BERTopic - Representation - Completed ✓
2025-04-22 22:39:26,742 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:39:26,808 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:39:32,658 - BERTopic - Representation - Completed ✓
2025-04-22 22:39:32,683 - BERTopic - Topic reduction - Reduced number of topics from 249 to 10


Calculating coherence score...
Coherence Score: 0.6834

Analyzing with 10 topics, 10 words, ngram_range=(1, 3)


2025-04-22 22:41:02,743 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:41:20,943 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:41:20,943 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:41:21,343 - BERTopic - Cluster - Completed ✓
2025-04-22 22:41:21,358 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:41:28,077 - BERTopic - Representation - Completed ✓
2025-04-22 22:41:28,102 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:41:28,143 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:41:34,077 - BERTopic - Representation - Completed ✓
2025-04-22 22:41:34,106 - BERTopic - Topic reduction - Reduced number of topics from 249 to 10


Calculating coherence score...
Coherence Score: 0.6829

Analyzing with 20 topics, 5 words, ngram_range=(1, 3)


2025-04-22 22:43:03,973 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:43:21,897 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:43:21,897 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:43:22,318 - BERTopic - Cluster - Completed ✓
2025-04-22 22:43:22,318 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:43:29,307 - BERTopic - Representation - Completed ✓
2025-04-22 22:43:29,322 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:43:29,380 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:43:35,446 - BERTopic - Representation - Completed ✓
2025-04-22 22:43:35,477 - BERTopic - Topic reduction - Reduced number of topics from 249 to 20


Calculating coherence score...
Coherence Score: 0.7710

Analyzing with 20 topics, 10 words, ngram_range=(1, 3)


2025-04-22 22:45:06,798 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:45:24,692 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:45:24,692 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:45:25,121 - BERTopic - Cluster - Completed ✓
2025-04-22 22:45:25,121 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:45:32,192 - BERTopic - Representation - Completed ✓
2025-04-22 22:45:32,208 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:45:32,269 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:45:38,370 - BERTopic - Representation - Completed ✓
2025-04-22 22:45:38,404 - BERTopic - Topic reduction - Reduced number of topics from 249 to 20


Calculating coherence score...
Coherence Score: 0.7053

Analyzing with 30 topics, 5 words, ngram_range=(1, 3)


2025-04-22 22:47:29,045 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:47:47,615 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:47:47,615 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:47:48,066 - BERTopic - Cluster - Completed ✓
2025-04-22 22:47:48,066 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:47:55,016 - BERTopic - Representation - Completed ✓
2025-04-22 22:47:55,043 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:47:55,082 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:48:01,499 - BERTopic - Representation - Completed ✓
2025-04-22 22:48:01,516 - BERTopic - Topic reduction - Reduced number of topics from 249 to 30


Calculating coherence score...
Coherence Score: 0.7260

Analyzing with 30 topics, 10 words, ngram_range=(1, 3)


2025-04-22 22:49:34,969 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:49:53,051 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:49:53,054 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:49:53,517 - BERTopic - Cluster - Completed ✓
2025-04-22 22:49:53,517 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:50:00,567 - BERTopic - Representation - Completed ✓
2025-04-22 22:50:00,600 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:50:00,652 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:50:06,984 - BERTopic - Representation - Completed ✓
2025-04-22 22:50:07,001 - BERTopic - Topic reduction - Reduced number of topics from 249 to 30


Calculating coherence score...
Coherence Score: 0.6181


In [23]:
for topic_id in topic_model_1to3.get_topics():
    print(f"Topic {topic_id}: {[w for w, _ in topic_model_1to3.get_topic(topic_id)[:5]]}")

topic_df_1to3 = pd.DataFrame(topic_model_1to3.get_topic_info())
display(topic_df_1to3)

Topic -1: ['onderzoek', 'hb', 'bloedverlie', 'anemie', 'controle']
Topic 0: ['oraal', 'stuk', 'oraal stuk', 'tablet', 'onderzoek']
Topic 1: ['anemie', 'waarvoor', 'starten', 'volgen', 'opname']
Topic 2: ['diabete', 'diabetisch', 'diabete mellitus', 'mellitus', 'voet']
Topic 3: ['anemie', 'waarvoor', 'hypertensie', 'goed', 'lab']
Topic 4: ['diabete lunch', 'lunch', 'diabete lunch slapen', 'lunch slapen', 'basisdosering']
Topic 5: ['echo', 'atriumfibrilleren', 'rivaroxaban', 'controle', 'hypertensie']
Topic 6: ['controle', 'decursus', 'uitvoeren controle', 'decursus podo decursus', 'decursus podo']
Topic 7: ['peggy', 'innen peggy', 'innen', 'controle', 'tp']
Topic 8: ['lab', 'diana', 'labbon', 'tevoren', 'opsturen']
Topic 9: ['osteoporose', 'denosumab', 'fractuur', 'injectie', 'dexa']
Topic 10: ['beloop verpleegkundig', 'verpleegkundig', 'beloop', 'ct', 'meenemen']
Topic 11: ['hyperparathyreoïdie', 'intern', 'intern geneeskun', 'geneeskun', 'opname intern geneeskun']
Topic 12: ['aantal',

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1674,-1_onderzoek_hb_bloedverlie_anemie,"[onderzoek, hb, bloedverlie, anemie, controle,...",[reden komst verwijzing reden komst rectaal bl...
1,0,4166,0_oraal_stuk_oraal stuk_tablet,"[oraal, stuk, oraal stuk, tablet, onderzoek, a...",[weledelgeleer heer drs boer waalreseweg zip K...
2,1,808,1_anemie_waarvoor_starten_volgen,"[anemie, waarvoor, starten, volgen, opname, ob...",[carpaal tunnel syndroom mammacarcinoom profyl...
3,2,656,2_diabete_diabetisch_diabete mellitus_mellitus,"[diabete, diabetisch, diabete mellitus, mellit...",[diabete mellitus type hypertensie nierfunctie...
4,3,267,3_anemie_waarvoor_hypertensie_goed,"[anemie, waarvoor, hypertensie, goed, lab, tro...",[hypertensie ptca rcx acuut onderwandinfarct t...
5,4,206,4_diabete lunch_lunch_diabete lunch slapen_lun...,"[diabete lunch, lunch, diabete lunch slapen, l...","[diabete lunch slapen basisdosering, diabete l..."
6,5,198,5_echo_atriumfibrilleren_rivaroxaban_controle,"[echo, atriumfibrilleren, rivaroxaban, control...",[hypertensie struma neusseptumcorrectie blaasr...
7,6,191,6_controle_decursus_uitvoeren controle_decursu...,"[controle, decursus, uitvoeren controle, decur...","[decursus podo Decursus uitvoeren controle, de..."
8,7,188,7_peggy_innen peggy_innen_controle,"[peggy, innen peggy, innen, controle, tp, krij...","[controle innen peggy, controle innen peggy, t..."
9,8,163,8_lab_diana_labbon_tevoren,"[lab, diana, labbon, tevoren, opsturen, tevore...","[lab, lab, lab]"


In [24]:
# Print detailed results summary
print_results_summary(results_1to3, texts)

key: (10, 5, (1, 3))
key: (10, 10, (1, 3))
key: (20, 5, (1, 3))
key: (20, 10, (1, 3))
key: (30, 5, (1, 3))
key: (30, 10, (1, 3))

=== Results for 10 topics with 5 words (ngram_range=(1, 3)) ===
Coherence Score: 0.6834

Topics and their key words:
Topic 0: oraal, stuk, onderzoek, anemie, oraal stuk, tablet, opname, waarvoor, goed, hb
Topic 1: controle, aantal, aantal controle, decursus, uitvoeren controle, decursus podo decursus, podo decursus, decursus podo, podo, uitvoeren
Topic 2: lab, controle, beloop, hb, overig actie, actie, transferrin, overig, ijzer, recidiveren
Topic 3: diabete lunch, lunch, diabete lunch slapen, lunch slapen, slapen, basisdosering, lunch slapen basisdosering, slapen basisdosering, diabete, basisdosering diabete lunch
Topic 4: beloop verpleegkundig, verpleegkundig, beloop, bespreeklijst trauma radiologie, radiologie seh, bespreeklijst, bespreeklijst trauma, trauma radiologie seh, trauma radiologie, radiologie
Topic 5: toediening soort infuus, toediening soort, 

In [25]:
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"topic_model_results_1to3_{timestamp}.csv"
grid_summary_1to3.to_csv(filename, index=False)
print(f"📁 Saved topic modeling summary to '{filename}'")

📁 Saved topic modeling summary to 'topic_model_results_1to3_20250422_225142.csv'


In [None]:
results_2to5gram, grid_summary2, topic_model2, data_df2 = analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[10, 20], n_words_list=[5, 10], ngram_range=(2, 5), calculate_coherence=True)



Analyzing with 10 topics, 5 words, ngram_range=(2, 5)


2025-04-22 22:51:45,715 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-22 22:52:04,105 - BERTopic - Dimensionality - Completed ✓
2025-04-22 22:52:04,105 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-22 22:52:04,753 - BERTopic - Cluster - Completed ✓
2025-04-22 22:52:04,753 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-22 22:52:20,030 - BERTopic - Representation - Completed ✓
2025-04-22 22:52:20,083 - BERTopic - Topic reduction - Reducing number of topics
2025-04-22 22:52:20,140 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-22 22:52:34,428 - BERTopic - Representation - Completed ✓
2025-04-22 22:52:34,493 - BERTopic - Topic reduction - Reduced number of topics from 249 to 10


Calculating coherence score...


In [None]:
for topic_id in topic_model2.get_topics():
    print(f"Topic {topic_id}: {[w for w, _ in topic_model2.get_topic(topic_id)[:5]]}")

topic_df2 = pd.DataFrame(topic_model2.get_topic_info())
display(topic_df2)


Topic -1: ['lichamelijk onderzoek', 'aanvullen onderzoek', 'oraal stuk', 'rectaal bloedverlie', 'reden komst']
Topic 0: ['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'aanvullen onderzoek', 'lichamelijk onderzoek']
Topic 1: ['aantal controle', 'uitvoeren controle', 'decursus podo', 'decursus podo decursus', 'podo decursus']
Topic 2: ['beloop verpleegkundig', 'tevoren lab', 'diana lab', 'lab diana', 'lab lab']
Topic 3: ['basisdosering basisdosering', 'basisdosering basisdosering basisdosering', 'basisdosering basisdosering basisdosering basisdosering', 'diabete lunch', 'lunch slapen']
Topic 4: ['krijgen remicade', 'krijgen remicade iv', 'krijgen venofer', 'remicade iv', 'beloop bld lijst']
Topic 5: ['vocht vocht', 'toediening soort infuus', 'vocht toediening', 'vocht toediening soort', 'vocht toediening soort infuus']
Topic 6: ['denosumab inj', 'injectie denosumab', 'overig actie', 'starten denosumab', 'osteoporose baseren']
Topic 7: ['videocapsule bloed', 'gastroscopie focus video

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1683,-1_lichamelijk onderzoek_aanvullen onderzoek_o...,"[lichamelijk onderzoek, aanvullen onderzoek, o...",[pneumothorax laat keer hypertensie retinopath...
1,0,6245,0_oraal stuk_tablet oraal_tablet oraal stuk_aa...,"[oraal stuk, tablet oraal, tablet oraal stuk, ...",[vervolg consult type vervolgconsult verkort i...
2,1,321,1_aantal controle_uitvoeren controle_decursus ...,"[aantal controle, uitvoeren controle, decursus...","[decursus podo Decursus uitvoeren controle, de..."
3,2,268,2_beloop verpleegkundig_tevoren lab_diana lab_...,"[beloop verpleegkundig, tevoren lab, diana lab...",[initials streetnaam cd Kenmerk patientid betr...
4,3,249,3_basisdosering basisdosering_basisdosering ba...,"[basisdosering basisdosering, basisdosering ba...",[diabete lunch slapen basisdosering basisdoser...
5,4,144,4_krijgen remicade_krijgen remicade iv_krijgen...,"[krijgen remicade, krijgen remicade iv, krijge...",[krijgen remicade bijzonderheid hierna krijgen...
6,5,95,5_vocht vocht_toediening soort infuus_vocht to...,"[vocht vocht, toediening soort infuus, vocht t...","[vocht vocht toediening soort infuus Nacl, voc..."
7,6,88,6_denosumab inj_injectie denosumab_overig acti...,"[denosumab inj, injectie denosumab, overig act...",[consult osteoporose verpleegkundig reden koms...
8,7,81,7_videocapsule bloed_gastroscopie focus videoc...,"[videocapsule bloed, gastroscopie focus videoc...",[recidiveren melena Gastroscopie focus videoca...
9,8,59,8_decursus podo_decursus podo decursus_podo de...,"[decursus podo, decursus podo decursus, podo d...",[kwaliteitsindicator diabetisch voetwond soort...


In [None]:
# Print detailed results summary
print_results_summary(results_2to5gram, texts)


=== Results for 10 topics with 5 words (ngram_range=(2, 5)) ===
Coherence Score: Not available

Topics and their key words:
Topic 0: oraal stuk, tablet oraal, tablet oraal stuk, aanvullen onderzoek, lichamelijk onderzoek, mcv fl, rectaal bloedverlie, int knoppen, diabete mellitus, overig actie
Topic 1: aantal controle, uitvoeren controle, decursus podo decursus, decursus podo, podo decursus, decursus uitvoeren, podo decursus uitvoeren, decursus podo decursus uitvoeren, decursus podo decursus uitvoeren controle, decursus uitvoeren controle
Topic 2: basisdosering basisdosering, basisdosering basisdosering basisdosering, basisdosering basisdosering basisdosering basisdosering, diabete lunch, lunch slapen, diabete lunch slapen, slapen basisdosering, slapen basisdosering basisdosering, basisdosering basisdosering diabete, basisdosering diabete
Topic 3: vocht vocht, toediening soort infuus, vocht toediening, vocht toediening soort, vocht toediening soort infuus, toediening soort, vocht voch

In [None]:
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"topic_model_results_2to5gram_{timestamp}.csv"
grid_summary2.to_csv(filename, index=False)
print(f"📁 Saved topic modeling summary to '{filename}'")

📁 Saved topic modeling summary to 'topic_model_results_2to5gram_20250419_200818.csv'


In [None]:
results_1to5gram, grid_summary1to5gram, topic_model1to5gram, data_df1to5gram = analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[10, 20], n_words_list=[5, 10], ngram_range=(1, 5), calculate_coherence=True)


Analyzing with 10 topics, 5 words, ngram_range=(1, 5)


2025-04-19 20:22:46,210 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-19 20:23:04,293 - BERTopic - Dimensionality - Completed ✓
2025-04-19 20:23:04,293 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-19 20:23:04,694 - BERTopic - Cluster - Completed ✓
2025-04-19 20:23:04,694 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-19 20:23:19,910 - BERTopic - Representation - Completed ✓
2025-04-19 20:23:19,974 - BERTopic - Topic reduction - Reducing number of topics
2025-04-19 20:23:20,027 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-19 20:23:34,511 - BERTopic - Representation - Completed ✓
2025-04-19 20:23:34,578 - BERTopic - Topic reduction - Reduced number of topics from 261 to 10


Skipping coherence calculation...

Analyzing with 10 topics, 10 words, ngram_range=(1, 5)


2025-04-19 20:23:37,645 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-19 20:23:55,928 - BERTopic - Dimensionality - Completed ✓
2025-04-19 20:23:55,928 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-19 20:23:56,361 - BERTopic - Cluster - Completed ✓
2025-04-19 20:23:56,361 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-19 20:24:12,995 - BERTopic - Representation - Completed ✓
2025-04-19 20:24:13,055 - BERTopic - Topic reduction - Reducing number of topics
2025-04-19 20:24:13,115 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-19 20:24:28,377 - BERTopic - Representation - Completed ✓
2025-04-19 20:24:28,436 - BERTopic - Topic reduction - Reduced number of topics from 261 to 10


Skipping coherence calculation...

Analyzing with 20 topics, 5 words, ngram_range=(1, 5)


2025-04-19 20:24:32,146 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-19 20:24:51,031 - BERTopic - Dimensionality - Completed ✓
2025-04-19 20:24:51,031 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-19 20:24:51,445 - BERTopic - Cluster - Completed ✓
2025-04-19 20:24:51,445 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-19 20:25:07,560 - BERTopic - Representation - Completed ✓
2025-04-19 20:25:07,612 - BERTopic - Topic reduction - Reducing number of topics
2025-04-19 20:25:07,673 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-19 20:25:22,971 - BERTopic - Representation - Completed ✓
2025-04-19 20:25:23,030 - BERTopic - Topic reduction - Reduced number of topics from 261 to 20


Skipping coherence calculation...

Analyzing with 20 topics, 10 words, ngram_range=(1, 5)


2025-04-19 20:25:26,885 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-19 20:25:45,344 - BERTopic - Dimensionality - Completed ✓
2025-04-19 20:25:45,344 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-19 20:25:45,760 - BERTopic - Cluster - Completed ✓
2025-04-19 20:25:45,760 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-19 20:26:02,380 - BERTopic - Representation - Completed ✓
2025-04-19 20:26:02,450 - BERTopic - Topic reduction - Reducing number of topics
2025-04-19 20:26:02,502 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-19 20:26:18,482 - BERTopic - Representation - Completed ✓
2025-04-19 20:26:18,538 - BERTopic - Topic reduction - Reduced number of topics from 261 to 20


Skipping coherence calculation...


In [None]:
for topic_id in topic_model1to5gram.get_topics():
    print(f"Topic {topic_id}: {[w for w, _ in topic_model1to5gram.get_topic(topic_id)[:5]]}")

topic_df1to5gram = pd.DataFrame(topic_model1to5gram.get_topic_info())
display(topic_df1to5gram)


Topic -1: ['onderzoek', 'hb', 'controle', 'voltooien', 'seh']
Topic 0: ['oraal', 'stuk', 'anemie', 'onderzoek', 'oraal stuk']
Topic 1: ['controle', 'aantal', 'decursus', 'aantal controle', 'uitvoeren controle']
Topic 2: ['lab', 'beloop verpleegkundig', 'beloop', 'verpleegkundig', 'controle']
Topic 3: ['basisdosering', 'basisdosering basisdosering', 'basisdosering basisdosering basisdosering', 'basisdosering basisdosering basisdosering basisdosering', 'lunch']
Topic 4: ['beloop', 'krijgen', 'krijgen remicade', 'remicade', 'venofer']
Topic 5: ['vocht', 'vocht vocht', 'vocht toediening soort', 'vocht toediening', 'toediening soort infuus']
Topic 6: ['osteoporose', 'fractuur', 'denosumab', 'injectie', 'dexa']
Topic 7: ['recidiveren', 'gastroscopie', 'videocapsule bloed', 'gastroscopie focus videocapsule bloed proximaal', 'melena gastroscopie focus videocapsule']
Topic 8: ['wond', 'decursus', 'decursus podo', 'podo decursus', 'decursus podo decursus']
Topic 9: ['eosinofiel', 'enteritis', 'e

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1683,-1_onderzoek_hb_controle_voltooien,"[onderzoek, hb, controle, voltooien, seh, bloe...",[medisch dossier vk sputovamo leeftijd registr...
1,0,6245,0_oraal_stuk_anemie_onderzoek,"[oraal, stuk, anemie, onderzoek, oraal stuk, t...",[vervolg consult type vervolgconsult verkort i...
2,1,321,1_controle_aantal_decursus_aantal controle,"[controle, aantal, decursus, aantal controle, ...","[decursus podo Decursus uitvoeren controle, de..."
3,2,268,2_lab_beloop verpleegkundig_beloop_verpleegkundig,"[lab, beloop verpleegkundig, beloop, verpleegk...",[initials streetnaam cd Kenmerk patientid betr...
4,3,249,3_basisdosering_basisdosering basisdosering_ba...,"[basisdosering, basisdosering basisdosering, b...",[diabete lunch slapen basisdosering basisdoser...
5,4,144,4_beloop_krijgen_krijgen remicade_remicade,"[beloop, krijgen, krijgen remicade, remicade, ...",[komen balie laat keer verkeerd meegekreg kuij...
6,5,95,5_vocht_vocht vocht_vocht toediening soort_voc...,"[vocht, vocht vocht, vocht toediening soort, v...","[vocht vocht toediening soort infuus Nacl, voc..."
7,6,88,6_osteoporose_fractuur_denosumab_injectie,"[osteoporose, fractuur, denosumab, injectie, d...",[vervolg consult type vervolgconsult verkort i...
8,7,81,7_recidiveren_gastroscopie_videocapsule bloed_...,"[recidiveren, gastroscopie, videocapsule bloed...",[recidiveren melena Gastroscopie focus videoca...
9,8,59,8_wond_decursus_decursus podo_podo decursus,"[wond, decursus, decursus podo, podo decursus,...",[kwaliteitsindicator diabetisch voetwond soort...


In [None]:
# Print detailed results summary
print_results_summary(results_21to5gram, texts)

key: (10, 5, (1, 5))
key: (10, 10, (1, 5))
key: (20, 5, (1, 5))
key: (20, 10, (1, 5))

=== Results for 10 topics with 5 words (ngram_range=(1, 5)) ===
Coherence Score: Not available

Topics and their key words:
Topic 0: oraal, stuk, onderzoek, anemie, oraal stuk, tablet, opname, waarvoor, goed, hb
Topic 1: controle, aantal, decursus, aantal controle, uitvoeren controle, decursus podo decursus, decursus podo, podo decursus, podo, uitvoeren
Topic 2: basisdosering, basisdosering basisdosering, basisdosering basisdosering basisdosering, basisdosering basisdosering basisdosering basisdosering, lunch, diabete lunch, lunch slapen, diabete lunch slapen, slapen, diabete
Topic 3: vocht, vocht vocht, vocht toediening soort, vocht toediening, toediening soort infuus, toediening soort, vocht vocht toediening soort infuus, vocht vocht toediening soort, vocht vocht toediening, vocht toediening soort infuus
Topic 4: wond, decursus, decursus podo decursus, decursus podo, podo decursus, podo, nee, leide

In [None]:
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"topic_model_results_1to5gram_{timestamp}.csv"
grid_summary1to5gram.to_csv(filename, index=False)
print(f"📁 Saved topic modeling summary to '{filename}'")

📁 Saved topic modeling summary to 'topic_model_results_1to5gram_20250419_202829.csv'


In [None]:
# Access the best model:
best_config = grid_summary.iloc[0]
best_model = results[(best_config['n_topics'], best_config['n_words'])]['model']
best_model

<bertopic._bertopic.BERTopic at 0x2158acc3ed0>

In [None]:
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"topic_model_results_{timestamp}.csv"
grid_summary.to_csv(filename, index=False)
print(f"📁 Saved topic modeling summary to '{filename}'")

📁 Saved topic modeling summary to 'topic_model_results_20250418_211048.csv'


In [None]:
data_df 
data_df[data_df['event_tags'].apply(lambda tags: 'Topic 0' not in tags)]

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text,event_tags
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,2020-11-25 13:48:00,2020-11-25 13:48:00,gastroscopie betreffen mw initials lastname ad...,"[gastroscopie, betreffen, initials, adresgegev...",gastroscopie betreffen initials adresgegevens ...,[Unclear]
8,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus\r\n-Reden van opname:...,2016-08-23 14:59:00,2016-08-23 14:59:00,samenvatting decursus reden opnaam melena spec...,"[decursus, opnaam, melena, specieel, gastrosco...",decursus opnaam melena specieel gastroscopie b...,[Unclear]
9,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Consult,Samenvatting: \nConclusie\r\n-Samenvatting: Me...,2016-08-23 14:59:00,2016-08-23 14:59:00,samenvatting conclusie samenvatting Melena ace...,"[Melena, acenocoumarol, gebruik, gastroscopie,...",Melena acenocoumarol gebruik gastroscopie afwi...,[Unclear]
28,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Overige aantekeningen",Samenvatting: \nMemo\r\n-Memo: trombosedienst ...,2015-02-20 14:11:00,2015-02-20 14:11:00,samenvatting memo memo trombosedienst belen vp...,"[trombosedienst, belen, vpo, colo, inr, afspre...",trombosedienst belen vpo colo inr afspreken do...,[Unclear]
30,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, SEH",Samenvatting: \nSEPSIS - VPK\r\n-Bestaat er op...,2013-10-22 22:49:00,2013-10-22 22:49:00,samenvatting sepsis vpk bestaan basis anamnees...,"[sepsis, vpk, bestaan, basis, infectie]",sepsis vpk bestaan basis infectie,[Unclear]
...,...,...,...,...,...,...,...,...,...
9505,FAA79717FF2C725767E9469350ACECF640E5FCBC,Consult,Samenvatting: \nOpdrachten medewerker INT\r\n[...,2012-07-31 11:10:00,2012-07-31 11:10:00,samenvatting opdracht medewerk uitinen opdrach...,[Eliane],Eliane,[Topic 22]
9506,FAA79717FF2C725767E9469350ACECF640E5FCBC,"Consult, Polikliniek: vervolgconsult",Samenvatting: \nVervolg consult Diabetes\r\n-D...,2012-07-31 07:47:00,2012-07-31 07:47:00,samenvatting vervolg consult diabete datum con...,"[vervolg, consult, diabete, consult, intern, d...",vervolg consult diabete consult intern diabete...,[Topic 1]
9507,FAA79717FF2C725767E9469350ACECF640E5FCBC,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus Podo\r\n-Decursus: Sc...,2012-06-20 17:27:00,2012-06-20 17:27:00,samenvatting decursus podo decursus screening ...,"[decursus, podo, decursus, screening, pulsatie...",decursus podo decursus screening pulsatie prop...,[Unclear]
9508,FAA79717FF2C725767E9469350ACECF640E5FCBC,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus Podo\r\n-Decursus: VB...,2012-04-24 14:27:00,2012-04-24 14:27:00,samenvatting decursus podo Decursus vb uitvoer...,"[decursus, podo, Decursus, uitvoeren]",decursus podo Decursus uitvoeren,[Topic 12]


In [None]:
# topic_keywords = {
#     topic_num: ", ".join([word for word, _ in topic_model.get_topic(topic_num)[:5]])
#     for topic_num in topic_model.get_topics().keys()
#     if topic_num != -1  # exclude outliers
# }


In [None]:
# data_df["topic_id"] = topics
# data_df["topic_keywords"] = data_df["topic_id"].map(lambda t: topic_keywords.get(t, "Outlier"))
# data_df

In [None]:
# data_df 
# data_df[data_df['event_tags'].apply(lambda tags: 'Topic 0' not in tags)]

In [None]:
# threshold = 0.1
# doc_keywords = []

# for prob in probs:
#     if isinstance(prob, float):  # fallback
#         topic_ids = [0] if prob > threshold else []
#     else:
#         topic_ids = [i for i, p in enumerate(prob) if p > threshold]
    
#     keywords = [topic_keywords.get(t, "Outlier") for t in topic_ids]
#     doc_keywords.append("; ".join(keywords) if keywords else "Unclear")

# data_df["topic_keywords2"] = doc_keywords
# data_df


In [None]:
data_df 
data_df[data_df['event_tags'].apply(lambda tags: 'Topic 0' not in tags)]

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text,event_tags
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,2020-11-25 13:48:00,2020-11-25 13:48:00,gastroscopie betreffen mw initials lastname ad...,"[gastroscopie, betreffen, initials, adresgegev...",gastroscopie betreffen initials adresgegevens ...,[Unclear]
8,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus\r\n-Reden van opname:...,2016-08-23 14:59:00,2016-08-23 14:59:00,samenvatting decursus reden opnaam melena spec...,"[decursus, opnaam, melena, specieel, gastrosco...",decursus opnaam melena specieel gastroscopie b...,[Unclear]
9,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Consult,Samenvatting: \nConclusie\r\n-Samenvatting: Me...,2016-08-23 14:59:00,2016-08-23 14:59:00,samenvatting conclusie samenvatting Melena ace...,"[Melena, acenocoumarol, gebruik, gastroscopie,...",Melena acenocoumarol gebruik gastroscopie afwi...,[Unclear]
28,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Overige aantekeningen",Samenvatting: \nMemo\r\n-Memo: trombosedienst ...,2015-02-20 14:11:00,2015-02-20 14:11:00,samenvatting memo memo trombosedienst belen vp...,"[trombosedienst, belen, vpo, colo, inr, afspre...",trombosedienst belen vpo colo inr afspreken do...,[Unclear]
30,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, SEH",Samenvatting: \nSEPSIS - VPK\r\n-Bestaat er op...,2013-10-22 22:49:00,2013-10-22 22:49:00,samenvatting sepsis vpk bestaan basis anamnees...,"[sepsis, vpk, bestaan, basis, infectie]",sepsis vpk bestaan basis infectie,[Unclear]
...,...,...,...,...,...,...,...,...,...
9505,FAA79717FF2C725767E9469350ACECF640E5FCBC,Consult,Samenvatting: \nOpdrachten medewerker INT\r\n[...,2012-07-31 11:10:00,2012-07-31 11:10:00,samenvatting opdracht medewerk uitinen opdrach...,[Eliane],Eliane,[Topic 22]
9506,FAA79717FF2C725767E9469350ACECF640E5FCBC,"Consult, Polikliniek: vervolgconsult",Samenvatting: \nVervolg consult Diabetes\r\n-D...,2012-07-31 07:47:00,2012-07-31 07:47:00,samenvatting vervolg consult diabete datum con...,"[vervolg, consult, diabete, consult, intern, d...",vervolg consult diabete consult intern diabete...,[Topic 1]
9507,FAA79717FF2C725767E9469350ACECF640E5FCBC,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus Podo\r\n-Decursus: Sc...,2012-06-20 17:27:00,2012-06-20 17:27:00,samenvatting decursus podo decursus screening ...,"[decursus, podo, decursus, screening, pulsatie...",decursus podo decursus screening pulsatie prop...,[Unclear]
9508,FAA79717FF2C725767E9469350ACECF640E5FCBC,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus Podo\r\n-Decursus: VB...,2012-04-24 14:27:00,2012-04-24 14:27:00,samenvatting decursus podo Decursus vb uitvoer...,"[decursus, podo, Decursus, uitvoeren]",decursus podo Decursus uitvoeren,[Topic 12]


In [None]:
# Filter rows where 'straathof' appears in the 'text' column
found_reports = data[data['text'].str.contains('bitterbal', case=False, na=False)]

# Display the filtered rows
found_reports.head()

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text
4800,8CAE818529D2702AD874E91A5403942857128837,Consult,Samenvatting: \nDiabetes scorelijst\r\n-Datum:...,2017-05-23 10:59:00,2017-05-23 10:59:00,samenvatting diabete scorelijzen datum insulin...,"[diabete, insulin, bitterball, eten]",diabete insulin bitterball eten


In [None]:
print(found_reports['alltext'].iloc[0])
print(found_reports['verslagen_report_content'].iloc[0])

samenvatting diabete scorelijzen datum insulin avond extra bitterball eten
Samenvatting: 
Diabetes scorelijst
-Datum: 22-05-2017
-N: 9.2
-Insuline avond extra: bitterballen
gegeten


In [None]:
found_reports = data[data['alltext'].str.contains('basisdosering basisdosering basisdosering', case=False, na=False)]
found_reports#.head()

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text


In [None]:
# # Normalize spacing and lowercase before searching
# normalized = data['alltext'].str.lower().str.replace(r'\s+', ' ', regex=True)
# found_reports = data[normalized.str.contains(r'\bbasisdosering(?:\s+basisdosering){2}\b')]
# found_reports.head()


Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text


In [None]:
# # Fuzzy count of how many times "basisdosering" appears
# data['count_basisdosering'] = data['alltext'].str.lower().str.count('basisdosering')
# found_reports = data[data['count_basisdosering'] >= 3]
# found_reports


Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text,count_basisdosering
4941,8CAE818529D2702AD874E91A5403942857128837,Consult,Samenvatting: \nDiabetes scorelijst\r\n-Datum:...,2014-11-25 11:16:00,2014-11-25 11:16:00,samenvatting diabete scorelijzen datum lunch a...,"[diabete, lunch, slapen, basisdosering, basisd...",diabete lunch slapen basisdosering basisdoseri...,5
4943,8CAE818529D2702AD874E91A5403942857128837,Consult,Samenvatting: \nDiabetes scorelijst\r\n-Datum:...,2014-11-25 11:15:00,2014-11-25 11:15:00,samenvatting diabete scorelijzen datum lunch a...,"[diabete, lunch, slapen, basisdosering, basisd...",diabete lunch slapen basisdosering basisdoseri...,5
4944,8CAE818529D2702AD874E91A5403942857128837,Consult,Samenvatting: \nDiabetes scorelijst\r\n-Datum:...,2014-11-25 11:15:00,2014-11-25 11:15:00,samenvatting diabete scorelijzen datum lunch a...,"[diabete, lunch, slapen, basisdosering, basisd...",diabete lunch slapen basisdosering basisdoseri...,5
4946,8CAE818529D2702AD874E91A5403942857128837,Consult,Samenvatting: \nDiabetes scorelijst\r\n-Datum:...,2014-10-28 11:28:00,2014-10-28 11:28:00,samenvatting diabete scorelijzen datum lunch a...,"[diabete, lunch, slapen, basisdosering, basisd...",diabete lunch slapen basisdosering basisdoseri...,5
4947,8CAE818529D2702AD874E91A5403942857128837,Consult,Samenvatting: \nDiabetes scorelijst\r\n-N: 9.2...,2014-10-28 11:28:00,2014-10-28 11:28:00,samenvatting diabete scorelijzen lunch avond s...,"[diabete, lunch, slapen, basisdosering, basisd...",diabete lunch slapen basisdosering basisdoseri...,5
...,...,...,...,...,...,...,...,...,...
6106,B3EED5BDCD189C8E78AEC6626C504EA16EC0C290,Consult,Samenvatting: \nDiabetes scorelijst\r\n-Datum:...,2013-01-30 11:35:00,2013-01-30 11:35:00,samenvatting diabete scorelijzen datum lunch a...,"[diabete, lunch, slapen, basisdosering, basisd...",diabete lunch slapen basisdosering basisdoseri...,5
6117,B3EED5BDCD189C8E78AEC6626C504EA16EC0C290,Consult,Samenvatting: \nDiabetes scorelijst\r\n-Datum:...,2012-04-11 11:02:00,2012-04-11 11:02:00,samenvatting diabete scorelijzen datum lunch a...,"[diabete, lunch, basisdosering, basisdosering,...",diabete lunch basisdosering basisdosering basi...,5
7618,D30EB04A0947E012105B292E735B8AD3450A12C6,Consult,Samenvatting: \nDiabetes scorelijst\r\n-Datum:...,2017-10-10 17:14:00,2017-10-10 17:14:00,samenvatting diabete scorelijzen datum lunch a...,"[diabete, lunch, slapen, basisdosering, basisd...",diabete lunch slapen basisdosering basisdoseri...,3
7623,D30EB04A0947E012105B292E735B8AD3450A12C6,Consult,Samenvatting: \nDiabetes scorelijst\r\n-Datum:...,2017-10-03 15:47:00,2017-10-03 15:47:00,samenvatting diabete scorelijzen datum uur ont...,"[diabete, ontbijt, lunch, lunch, slapen, basis...",diabete ontbijt lunch lunch slapen basisdoseri...,3


In [None]:
i = 147
print(found_reports['alltext'].iloc[i])
print(found_reports['verslagen_report_content'].iloc[i])

samenvatting diabete scorelijzen datum lunch avond basisdosering basisdosering ochtend basisdosering middag basisdosering avond basisdosering nacht
Samenvatting: 
Diabetes scorelijst
-Datum: 10-04-2012
-N: 11.8
-Voor Lunch: 10.0
-Voor avond: 12.5
[ Basisdosering ]
-Basisdosering ochtend: 20
-Basisdosering middag: 4
-Basisdosering avond: 20
-Basisdosering nacht: 40


In [None]:
combined_freq_df = pd.merge(freq_df_neg, freq_df_pos, on='number', how='outer', suffixes=('_neg', '_pos'))
print(combined_freq_df.head(50))

NameError: name 'freq_df_neg' is not defined