In [1]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

from sklearn.feature_extraction.text import CountVectorizer
import plotly.graph_objects as go
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import ast

from nltk.corpus import stopwords
dutch_stopwords = stopwords.words('dutch')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# def create_bertopic_model(n_topics, min_topic_size=10):
#     embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
#     umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
#     hdbscan_model = HDBSCAN(min_cluster_size=min_topic_size, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
#     # vectorizer_model = CountVectorizer(stop_words=dutch_stopwords)
#     vectorizer_model = CountVectorizer(ngram_range=(1,2), stop_words=dutch_stopwords)
#     # vectorizer_model = CountVectorizer(stop_words='english')
    
#     topic_model = BERTopic(
#         embedding_model=embedding_model,
#         umap_model=umap_model,
#         hdbscan_model=hdbscan_model,
#         vectorizer_model=vectorizer_model,
#         nr_topics=n_topics,
#         verbose=True,
#         calculate_probabilities=True
#     )
    
#     return topic_model

def create_bertopic_model(n_topics, min_topic_size=10, ngram_range=(1, 1)):
    embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=min_topic_size, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    vectorizer_model = CountVectorizer(stop_words=dutch_stopwords, ngram_range=ngram_range)

    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        nr_topics=n_topics,
        verbose=True
    )
    return topic_model



In [3]:
# def calculate_coherence_score(texts, topic_words):
#     # Convert each document to a list if it's a string representation of a list
#     texts = [ast.literal_eval(doc) if isinstance(doc, str) else doc for doc in texts]
    
#     dictionary = Dictionary(texts)

#     coherence_model = CoherenceModel(
#         topics=topic_words,  
#         texts=texts,
#         dictionary=dictionary,
#         coherence='c_v'
#     )

#     return coherence_model.get_coherence()

from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

def calculate_coherence_score(texts, topic_words, ngram_range=(1, 1)):
    # Zorg dat tokens correct zijn
    texts = [ast.literal_eval(doc) if isinstance(doc, str) else doc for doc in texts]

    # Maak n-gram strings van tokens
    joined_texts = [" ".join(doc) for doc in texts]
    vectorizer = CountVectorizer(ngram_range=ngram_range).fit(joined_texts)

    # Maak per document lijst met gegenereerde n-grams
    ngram_texts = [list(ngrams) for ngrams in vectorizer.inverse_transform(vectorizer.transform(joined_texts))]

    # Maak Gensim dictionary en coherence model
    dictionary = Dictionary(ngram_texts)
    coherence_model = CoherenceModel(
        topics=topic_words,
        texts=ngram_texts,
        dictionary=dictionary,
        coherence='c_v'
    )
    return coherence_model.get_coherence()



In [4]:
# import plotly.graph_objects as go
# import pandas as pd

# def analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[30], n_words_list=[5]):
#     results = {}
#     summary_data = []

#     coherence_fig = go.Figure()
    
#     for n_topics in n_topics_list:
#         coherence_scores = []
        
#         for n_words in n_words_list:
#             print(f"\n Analyzing with {n_topics} topics and top {n_words} words...")

#             # Create and fit the BERTopic model
#             topic_model = create_bertopic_model(n_topics)
#             # topics, probs = topic_model.fit_transform(texts, embeddings, calculate_probabilities=True)
#             # Fit model
#             topics = topic_model.fit(texts, embeddings)
#             # Get topic probabilities for each doc
#             topics, probs = topic_model.transform(texts)



#             topic_info = topic_model.get_topic_info()


#             # Build readable labels
#             topic_labels = {}
#             for topic_id, words in topic_words.items():
#                 label = "_".join(words[:5])  # Or however many keywords you want
#                 topic_labels[topic_id] = label

#             # Apply it to tags
#             doc_topic_labels = []
#             threshold = 0.1
#             for prob in probs:
#                 if prob is None:
#                     doc_topic_labels.append(["Outlier"])
#                 else:
#                     tags = [topic_labels[i] for i, p in enumerate(prob) if p > threshold]
#                     doc_topic_labels.append(tags if tags else ["Unclear"])


#             # Extract keywords per topic
#             topic_word_list = []
#             topic_words = {}

#             for topic in range(len(set(topics)) - 1):
#                 words = topic_model.get_topic(topic)[:n_words]
#                 topic_word_list.append([word for word, _ in words])
#                 topic_words[topic] = [word for word, _ in words]

#             # Calculate coherence
#             coherence = calculate_coherence_score(data['tokens'].tolist(), topic_word_list)
#             print(f" Coherence Score (C_v): {coherence:.4f}")
#             coherence_scores.append(coherence)

#             # Save results
#             results[(n_topics, n_words)] = {
#                 'model': topic_model,
#                 'topics': topics,
#                 'topic_info': topic_info,
#                 'topic_words': topic_words,
#                 'coherence': coherence
#             }

#             # Save interactive visualizations
#             topic_model.visualize_topics().write_html(f'topic_visualization_{n_topics}_{n_words}.html')
#             topic_model.visualize_heatmap().write_html(f'topic_heatmap_{n_topics}_{n_words}.html')

#             # Append summary data
#             summary_data.append({
#                 'n_topics': n_topics,
#                 'n_words': n_words,
#                 'coherence': coherence
#             })

#         # Add to coherence plot
#         coherence_fig.add_trace(go.Scatter(
#             x=n_words_list,
#             y=coherence_scores,
#             mode='lines+markers',
#             name=f'{n_topics} topics'
#         ))

#     coherence_fig.update_layout(
#         title='Coherence Scores across Different Configurations',
#         xaxis_title='Number of Words per Topic',
#         yaxis_title='Coherence Score (C_v)',
#         showlegend=True
#     )
#     coherence_fig.write_html('coherence_scores.html')

#     results[(n_topics, n_words)]['doc_tags'] = doc_topic_tags

#     # Create and print summary
#     grid_summary = pd.DataFrame(summary_data).sort_values(by='coherence', ascending=False)
#     print("\n Top Configurations by Coherence:")
#     print(grid_summary.head(3).to_string(index=False))

#     data[f'topic_tags_{n_topics}_{n_words}'] = doc_topic_tags
#     data[f'topic_main_{n_topics}_{n_words}'] = topics


#     return results, grid_summary, topic_model, data


In [5]:
# def analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[30], n_words_list=[5], ngram_range=(1, 1)):
#     import plotly.graph_objects as go
#     import pandas as pd
#     from tqdm import tqdm

#     results = {}
#     summary_data = []
#     all_doc_tags = {}  # Store tags per (n_topics, n_words) combination

#     coherence_fig = go.Figure()

#     for n_topics in n_topics_list:
#         coherence_scores = []

#         for n_words in n_words_list:
#             print(f"\nAnalyzing with {n_topics} topics and top {n_words} words...")

#             # Step 1: Create & fit model
#             topic_model = create_bertopic_model(n_topics, ngram_range=ngram_range)
#             topic_model.fit(texts, embeddings)
#             topics, probs = topic_model.transform(texts)
#             print(f"Topics: {topics}")


#             topic_info = topic_model.get_topic_info()

#             # Step 2: Extract keywords per topic
#             topic_word_list = []
#             topic_words = {}

#             for topic in range(len(set(topics)) - 1):
#                 words = topic_model.get_topic(topic)[:n_words]
#                 word_list = [word for word, _ in words]
#                 topic_word_list.append(word_list)
#                 print('topic_words:', word_list)
#                 topic_words[topic] = word_list

#             # Step 3: Calculate coherence
#             coherence = calculate_coherence_score(data['tokens'].tolist(), topic_word_list)
#             print(f"Coherence Score (C_v): {coherence:.4f}")
#             coherence_scores.append(coherence)

#             # Step 4: Assign per-document tags based on topic probabilities
#             threshold = 0.1
#             doc_tags = []
#             for prob in probs:
#                 print(f"Probabilities: {prob}")
#                 if prob is None:
#                     doc_tags.append(["Outlier"])
#                     print("Outlier detected")
#                 else:
#                     tags = [f"Topic {i}" for i, p in enumerate(prob) if p > threshold]
#                     doc_tags.append(tags if tags else ["Unclear"])
#                     print(f"Tags: {tags}")

#             all_doc_tags[(n_topics, n_words)] = doc_tags

#             # Step 5: Save results
#             results[(n_topics, n_words)] = {
#                 'model': topic_model,
#                 'topics': topics,
#                 'probs': probs,
#                 'topic_info': topic_info,
#                 'topic_words': topic_words,
#                 'coherence': coherence,
#                 'doc_tags': doc_tags
#             }

#             topic_model.visualize_topics().write_html(f'topic_visualization_{n_topics}_{n_words}.html')
#             topic_model.visualize_heatmap().write_html(f'topic_heatmap_{n_topics}_{n_words}.html')

#             summary_data.append({
#                 'n_topics': n_topics,
#                 'n_words': n_words,
#                 'coherence': coherence
#             })

#         # Plot coherence line
#         coherence_fig.add_trace(go.Scatter(
#             x=n_words_list,
#             y=coherence_scores,
#             mode='lines+markers',
#             name=f'{n_topics} topics'
#         ))

#     coherence_fig.update_layout(
#         title='Coherence Scores across Topic Configs',
#         xaxis_title='Top Words per Topic',
#         yaxis_title='Coherence Score (C_v)',
#         showlegend=True
#     )
#     coherence_fig.write_html('coherence_scores.html')

#     grid_summary = pd.DataFrame(summary_data).sort_values(by='coherence', ascending=False)
#     print("\n🏆 Top Configurations by Coherence:")
#     print(grid_summary.head(3).to_string(index=False))

#     # Final output: tags for highest-coherence config
#     best_config = grid_summary.iloc[0][['n_topics', 'n_words']].astype(int).tolist()
#     best_tags = all_doc_tags[tuple(best_config)]
#     data_with_tags = data.copy()
#     data_with_tags['event_tags'] = best_tags

#     return results, grid_summary, topic_model, data_with_tags


In [6]:
def analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[20], n_words_list=[5], ngram_range=(1, 2), calculate_coherence=True):
    results = {}
    summary_data = []
    all_doc_tags = {}
    data_df = data.copy()

    for n_topics in n_topics_list:
        for n_words in n_words_list:
            print(f"\nAnalyzing with {n_topics} topics, {n_words} words, ngram_range={ngram_range}")

            topic_model = create_bertopic_model(n_topics, ngram_range=ngram_range)
            topics, probs = topic_model.fit_transform(texts, embeddings)
            topic_info = topic_model.get_topic_info()

            # Verwerk keywords per topic
            topic_words = {}
            for topic in set(topics):
                if topic != -1:
                    words = topic_model.get_topic(topic)[:n_words]
                    topic_words[topic] = [word for word, _ in words]

            # Coherence berekenen
            topic_word_list = list(topic_words.values())
            if calculate_coherence == True:
                # coherence = calculate_coherence_score(data['tokens'].tolist(), topic_word_list)
                print("Calculating coherence score...")
                coherence = calculate_coherence_score(data['tokens'].tolist(), topic_word_list, ngram_range=ngram_range)
            else:
                print("Skipping coherence calculation...")
                coherence = None
            



            print(f"Coherence Score: {coherence:.4f}")

            # Per document: top topics (≥ threshold)
            doc_tags = []
            threshold = 0.1
            for i, topic in enumerate(topics):
                if probs is None or isinstance(probs[i], float):
                    # fallback als probs None of 1D zijn
                    tags = [f"Topic {topic}"]
                else:
                    tags = [f"Topic {j}" for j, p in enumerate(probs[i]) if p > threshold]
                doc_tags.append(tags if tags else ["Unclear"])

            # Per document: keywords van toegekende topic (voor extra inzicht)
            topic_keywords_per_doc = [
                topic_words.get(t, []) if t in topic_words else [] for t in topics
            ]

            # Update dataframe
            data_df[f'topic_{n_topics}_{n_words}'] = topics
            data_df[f'tags_{n_topics}_{n_words}'] = doc_tags
            data_df[f'keywords_{n_topics}_{n_words}'] = topic_keywords_per_doc

            # # Opslaan
            # results[(n_topics, n_words)] = {
            #     'model': topic_model,
            #     'topics': topics,
            #     'probs': probs,
            #     'topic_words': topic_words,
            #     'coherence': coherence
            # }

            results[(n_topics, n_words, ngram_range)] = {
                'model': topic_model,
                'topics': topics,
                'probs': probs,
                'topic_words': topic_words,
                'coherence': coherence,
                'topic_info': topic_info
            }


            summary_data.append({
                'n_topics': n_topics,
                'n_words': n_words,
                'ngram_range': str(ngram_range),
                'coherence': coherence
            })


    grid_summary = pd.DataFrame(summary_data).sort_values(by='coherence', ascending=False)
    return results, grid_summary, topic_model, data_df


In [7]:
def print_results_summary(results, texts, save_to_file=True, filename=None):
    import datetime

    output_lines = []

    # for (n_topics, n_words), result in results.items():
    for key, result in results.items():
        if isinstance(key, tuple) and len(key) == 3:
            n_topics, n_words, ngram_range = key
        else:
            n_topics, n_words = key
            ngram_range = "(1, 1)"

        # output_lines.append(f"\n=== Results for {n_topics} topics with {n_words} words ===")
        output_lines.append(f"\n=== Results for {n_topics} topics with {n_words} words (ngram_range={ngram_range}) ===")
        output_lines.append(f"Coherence Score: {result['coherence']:.4f}")
        
        output_lines.append("\nTopics and their key words:")
        topic_info = result['topic_info']
        for _, row in topic_info.iterrows():
            topic_num = row['Topic']
            if topic_num != -1:
                words = row['Representation']
                output_lines.append(f"Topic {topic_num}: {', '.join(words)}")

        topic_sizes = topic_info[topic_info['Topic'] != -1]['Count'].tolist()
        output_lines.append("\nTopic sizes: " + str(topic_sizes))
        output_lines.append("\n" + "="*50)

    full_output = "\n".join(output_lines)

    # Print to console
    print(full_output)

    # Optionally save to file
    if save_to_file:
        if not filename:
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"topic_results_summary_{timestamp}.txt"
        
        with open(filename, "w", encoding="utf-8") as f:
            f.write(full_output)
        print(f"\n📁 Summary saved to: {filename}")


In [8]:
def fix_character_tokens(char_list):
    text = ''.join(char_list)
    tokens = text.split()  # crude but often works
    return tokens

def flatten_nested_char_lists(nested_list):
    return [''.join(token) for token in nested_list if isinstance(token, list)]
    # return [token for token in nested_list if token != []]




In [9]:
import ast

# Load your data
data = pd.read_csv('a:/df_cleaned.csv')

# Parse the stringified lists into real Python lists
data['tokens'] = data['tokens'].apply(lambda x: ast.literal_eval(x))


remove_list = [
        # Units, measurements, filler symbols
        'mg', 'mmoll', 'mmolL', 'x', 'per', 'dag', 'min', 'uur', 'ml', 'eenhed',

        # Admin & structure
        'samenvatting', 'memo', 'beleid', 'conclusie', 'aanvullend', 'afgewerken', 
        'opdracht', 'opdrachten', 'rapportage', 'diversen', 'contact', 'afspraak', 
        'tijd', 'tijdsduur', 'datum', 'poli', 'recept', 'gefaxt', 'bellen', 
        'akkoord', 'nodig', 'bekende', 'bekend', 'memo', 'scorelijzen', 'naslag',

        # Identifying or privacy-sensitive terms
        'bsn', 'city', 'postcode', 'firstname', 'lastname', 'streetname', 'phonenumber', 'voicemail', 
        'mw', 'dhr', 'mevrouw', 'meneer', 'zoon', 'mevr', 'mvr', 'dhr'
        'inge', 'valkenburg', 'peter', 'miriam', 'debby', 'eliane',

        # Clinical history / often uninformative by itself
        'anamnese', 'anamnees', 'voorgeschiedenis', 'huisarts', 
        'hoofdbehandelaar',

        # Temporal or ambiguous
        'sinds', 'dagen', 'weken', 'maanden', 'week', 'avond', 'nacht', 'ochtend', 'middag', 
        'extra', 'stop', 'gehad', 'gezien', 'zien', 'dd', 
        'ivm', 'links', 'rechts', 'linker', 'dr', 'overige', 'algemeen', 'patiënt', 'patiënte',

        # Admin/communication
        'verzoek', 'mail', 'mailen', 'verstuuren', 'brief', 'uitinen', 'ak', 'bespreeklijst', 
        'wijzigingopmerking',
        'voicemail', 'telefonisch', 'mobiel', 'ingesproek', 'aanleiding', 'telefoon', 'email', 'bereiken', 'svp', 'contactpersoon'
        'terugbellen', 'gemaild', 'insproken', 'voicemail',

        # Unclear / possibly noise
        'eenhed', 'aangeeft', 'scorelijzen', 'inten', 'intn', 'vb', 'sub', 

        # extra         
        'regelen', 'opmerking', 'bespreeklijst', 'sehperiode', 'bedrijf', 
        'ivb', 'mtps', 'cp', 'pat', 'huisadres', 'gg', 
        'medewerker', 'medewerk', 'laboratorium', 'apotheek', 'maand', 'tc', 
        'wonen', 'gezondheidsinstelling', 'leven', 'varken', 'soms', 'jaar', 'mgdag', 'lateraal',
        'bespreking', 'wondfoto', 'cze',         
        
        'vrijdag', 'maandag', 'donderdag', 'woensdag', 'zaterdag', 'zondag',      
        'juli', 'augustus', 'september', 'oktober', 'november', 'december', 'januari', 'februari', 'maart', 'april', 'mei', 'jun',
        
        
        # 'voltooid', 'verdenking', 'waarvoor', 'reden', 'waarschijnlijk', 'mogelijk', 'stuk', 'basisdosering', 'probleem', 'probleemlijst', 'actie',
        # 'nee', 'ja',  'arts',  'radiologie', 'internist', 'evaluatie', 'intake', 'controle',


        ]

data['tokens'] = data['tokens'].apply(lambda tokens: [word for word in tokens if word not in remove_list])

# Remove empty documents
data = data[data['tokens'].apply(len) > 0]

# Convert token lists to strings for BERTopic
data['text'] = data['tokens'].apply(lambda tokens: ' '.join(tokens))
texts = data['text'].tolist()

# Debug prints
print("First 3 token lists:", data['tokens'].head(3).tolist())
print("First 3 texts:", data['text'].head(3).tolist())
print("Total documents after filtering:", len(texts))


First 3 token lists: [['aj', 'dingemans', 'streetnaam', 'Kenmerk', 'patientid', 'betreffen', 'initials', 'geb', 'birthdate', 'streetnaam', 'zip', 'tel', 'geacht', 'collega', 'bovengenoemde', 'opnemen', 'afdeling', 'maag', 'darm', 'leverziekt', 'verband', 'melaena', 'rectaal', 'bloedverlie', 'diep', 'veneuaz', 'trombose', 'longembolie', 'cholecystectomie', 'diverticulitis', 'atriumfibrilleren', 'spontaan', 'conversie', 'sinusritme', 'melena', 'waarvoor', 'verklaring', 'vinden', 'verband', 'stabiel', 'hb', 'overleg', 'expectatief', 'vermoeidheid', 'sinusbradycardie', 'waarvoor', 'metoprolol', 'tambocor', 'vanmiddag', 'fors', 'Helderrood', 'bloedverlie', 'stolsel', 'vermengen', 'ontlasting', 'zwart', 'kleur', 'zeuren', 'pijn', 'bovenbuik', 'maagpijn', 'waarvoor', 'stoppen', 'koffie', 'drinken', 'vet', 'eten', 'ontlasting', 'intaak', 'bloed', 'zwart', 'verkleuring', 'bemerken', 'tractus', 'bijdragen', 'mn', 'lwklachten', 'all', 'penicilline', 'urticaria', 'lichamelijk', 'onderzoek', 'contr

In [None]:
# Load embeddings

# embeddings = np.load('embeddings_model6.npy')
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
embeddings = embedding_model.encode(texts, show_progress_bar=True)
np.save('embeddings_model7.npy', embeddings)

# print("embeddings:", embeddings)
print("Embeddings shape:", embeddings.shape)



In [None]:
# Analyze with different numbers of topics and words
# results, grid_summary, topic_model, data_df = analyze_topics_with_sentiment(texts, embeddings, data)


In [None]:
results_1gram2gram, grid_summary1, topic_model1, data_df1 = analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[10, 20, 30], n_words_list=[5, 10], ngram_range=(1, 2))



Analyzing with 10 topics, 5 words, ngram_range=(1, 2)


2025-04-19 02:06:38,372 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-19 02:07:15,834 - BERTopic - Dimensionality - Completed ✓
2025-04-19 02:07:15,834 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-19 02:07:16,264 - BERTopic - Cluster - Completed ✓
2025-04-19 02:07:16,264 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-19 02:07:19,149 - BERTopic - Representation - Completed ✓
2025-04-19 02:07:19,156 - BERTopic - Topic reduction - Reducing number of topics
2025-04-19 02:07:19,206 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-19 02:07:22,043 - BERTopic - Representation - Completed ✓
2025-04-19 02:07:22,048 - BERTopic - Topic reduction - Reduced number of topics from 261 to 10


Coherence Score: 0.8162

Analyzing with 10 topics, 10 words, ngram_range=(1, 2)


2025-04-19 02:08:10,136 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-19 02:08:28,320 - BERTopic - Dimensionality - Completed ✓
2025-04-19 02:08:28,320 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-19 02:08:28,698 - BERTopic - Cluster - Completed ✓
2025-04-19 02:08:28,698 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-19 02:08:31,982 - BERTopic - Representation - Completed ✓
2025-04-19 02:08:32,001 - BERTopic - Topic reduction - Reducing number of topics
2025-04-19 02:08:32,048 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-19 02:08:35,082 - BERTopic - Representation - Completed ✓
2025-04-19 02:08:35,096 - BERTopic - Topic reduction - Reduced number of topics from 261 to 10


Coherence Score: 0.6746

Analyzing with 20 topics, 5 words, ngram_range=(1, 2)


2025-04-19 02:09:23,554 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-19 02:09:41,649 - BERTopic - Dimensionality - Completed ✓
2025-04-19 02:09:41,649 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-19 02:09:42,033 - BERTopic - Cluster - Completed ✓
2025-04-19 02:09:42,034 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-19 02:09:45,350 - BERTopic - Representation - Completed ✓
2025-04-19 02:09:45,350 - BERTopic - Topic reduction - Reducing number of topics
2025-04-19 02:09:45,403 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-19 02:09:48,449 - BERTopic - Representation - Completed ✓
2025-04-19 02:09:48,452 - BERTopic - Topic reduction - Reduced number of topics from 261 to 20


Coherence Score: 0.7779

Analyzing with 20 topics, 10 words, ngram_range=(1, 2)


2025-04-19 02:10:52,922 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-19 02:11:11,202 - BERTopic - Dimensionality - Completed ✓
2025-04-19 02:11:11,202 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-19 02:11:11,588 - BERTopic - Cluster - Completed ✓
2025-04-19 02:11:11,588 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-19 02:11:14,972 - BERTopic - Representation - Completed ✓
2025-04-19 02:11:14,988 - BERTopic - Topic reduction - Reducing number of topics
2025-04-19 02:11:15,029 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-19 02:11:18,205 - BERTopic - Representation - Completed ✓
2025-04-19 02:11:18,205 - BERTopic - Topic reduction - Reduced number of topics from 261 to 20


Coherence Score: 0.6443

Analyzing with 30 topics, 5 words, ngram_range=(1, 2)


2025-04-19 02:12:07,785 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-19 02:12:25,650 - BERTopic - Dimensionality - Completed ✓
2025-04-19 02:12:25,652 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-19 02:12:26,018 - BERTopic - Cluster - Completed ✓
2025-04-19 02:12:26,018 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-19 02:12:29,274 - BERTopic - Representation - Completed ✓
2025-04-19 02:12:29,282 - BERTopic - Topic reduction - Reducing number of topics
2025-04-19 02:12:29,325 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-19 02:12:32,385 - BERTopic - Representation - Completed ✓
2025-04-19 02:12:32,402 - BERTopic - Topic reduction - Reduced number of topics from 261 to 30


Coherence Score: 0.7449

Analyzing with 30 topics, 10 words, ngram_range=(1, 2)


2025-04-19 02:13:20,587 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-19 02:13:38,419 - BERTopic - Dimensionality - Completed ✓
2025-04-19 02:13:38,419 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-19 02:13:38,803 - BERTopic - Cluster - Completed ✓
2025-04-19 02:13:38,803 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-19 02:13:42,078 - BERTopic - Representation - Completed ✓
2025-04-19 02:13:42,085 - BERTopic - Topic reduction - Reducing number of topics
2025-04-19 02:13:42,124 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-19 02:13:45,220 - BERTopic - Representation - Completed ✓
2025-04-19 02:13:45,236 - BERTopic - Topic reduction - Reduced number of topics from 261 to 30


Coherence Score: 0.6379


In [None]:
for topic_id in topic_model1.get_topics():
    print(f"Topic {topic_id}: {[w for w, _ in topic_model1.get_topic(topic_id)[:5]]}")

topic_df = pd.DataFrame(topic_model1.get_topic_info())
display(topic_df)


Topic -1: ['onderzoek', 'hb', 'voltooien', 'seh', 'controle']
Topic 0: ['oraal', 'stuk', 'oraal stuk', 'tablet', 'anemie']
Topic 1: ['diabete', 'diabetisch', 'diabete mellitus', 'mellitus', 'voet']
Topic 2: ['obinutuzumab', 'cll', 'anemie', 'progressie', 'volgen']
Topic 3: ['controle', 'aantal', 'aantal controle', 'decursus', 'uitvoeren controle']
Topic 4: ['basisdosering', 'basisdosering basisdosering', 'lunch', 'diabete lunch', 'lunch slapen']
Topic 5: ['lab', 'diana', 'tevoren', 'opsturen', 'tevoren lab']
Topic 6: ['krijgen remicade', 'remicade', 'krijgen', 'beloop', 'venofer']
Topic 7: ['hyperparathyreoïdie', 'intern', 'intern geneeskun', 'geneeskun', 'secundair hyperparathyreoïdie']
Topic 8: ['beloop verpleegkundig', 'verpleegkundig', 'beloop', 'ct', 'meenemen']
Topic 9: ['oog', 'extracaps phaco', 'extracaps', 'phaco lens', 'lens']
Topic 10: ['dermatomyositis', 'waarvoor', 'tacrolimus', 'duodenumcarcinoom', 'metastaseren duodenumcarcinoom']
Topic 11: ['vocht', 'vocht vocht', 'voch

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1683,-1_onderzoek_hb_voltooien_seh,"[onderzoek, hb, voltooien, seh, controle, bloe...",[medisch dossier vk sputovamo leeftijd registr...
1,0,4884,0_oraal_stuk_oraal stuk_tablet,"[oraal, stuk, oraal stuk, tablet, anemie, onde...",[weledelgeleer heer drs pwcmj meuffel parallel...
2,1,580,1_diabete_diabetisch_diabete mellitus_mellitus,"[diabete, diabetisch, diabete mellitus, mellit...",[diabete mellitus type hypertensie nierfunctie...
3,2,443,2_obinutuzumab_cll_anemie_progressie,"[obinutuzumab, cll, anemie, progressie, volgen...",[carpaal tunnel syndroom mammacarcinoom profyl...
4,3,302,3_controle_aantal_aantal controle_decursus,"[controle, aantal, aantal controle, decursus, ...","[aantal controle, aantal controle, aantal cont..."
5,4,249,4_basisdosering_basisdosering basisdosering_lu...,"[basisdosering, basisdosering basisdosering, l...",[diabete lunch slapen basisdosering basisdoser...
6,5,159,5_lab_diana_tevoren_opsturen,"[lab, diana, tevoren, opsturen, tevoren lab, d...","[lab, lab, lab]"
7,6,144,6_krijgen remicade_remicade_krijgen_beloop,"[krijgen remicade, remicade, krijgen, beloop, ...","[krijgen remicade bijzonderheid infusie, krijg..."
8,7,132,7_hyperparathyreoïdie_intern_intern geneeskun_...,"[hyperparathyreoïdie, intern, intern geneeskun...",[vervolg consult type vervolgconsult verkort i...
9,8,109,8_beloop verpleegkundig_verpleegkundig_beloop_ct,"[beloop verpleegkundig, verpleegkundig, beloop...",[initials ad Kenmerk patientid betreffen heer ...


In [None]:
# Print detailed results summary
print_results_summary(results_1gram2gram, texts)


=== Results for 10 topics with 5 words (ngram_range=(1, 2)) ===
Coherence Score: 0.8162

Topics and their key words:
Topic 0: oraal, stuk, onderzoek, anemie, oraal stuk, tablet, opname, waarvoor, goed, hb
Topic 1: controle, aantal, decursus, aantal controle, uitvoeren controle, podo decursus, decursus podo, podo, decursus uitvoeren, uitvoeren
Topic 2: basisdosering, basisdosering basisdosering, lunch, diabete lunch, lunch slapen, slapen, diabete, slapen basisdosering, basisdosering diabete, slapen diabete
Topic 3: vocht, vocht vocht, vocht toediening, toediening soort, soort infuus, soort, toediening, infuus nacl, infuus, nacl
Topic 4: wond, decursus, decursus podo, podo decursus, podo, leiden wond, nee, leiden, wonddebridement, zijde mtp
Topic 5: trauma radiologie, bespreeklijst, bespreeklijst trauma, radiologie seh, seh bespreeklijst, radiologie, trauma, seh, radiologeibespreking vd, radiologeibespreking
Topic 6: sepsis arts, arts sepsis, sepsis, arts, trail, phantasi trail, phantas

In [None]:
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"topic_model_results_ngram1_2_{timestamp}.csv"
grid_summary1.to_csv(filename, index=False)
print(f"📁 Saved topic modeling summary to '{filename}'")

📁 Saved topic modeling summary to 'topic_model_results_ngram1_2_20250419_021432.csv'


In [None]:
results_test, grid_summary_test, topic_model_test, data_df_test = analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[10, 20], n_words_list=[5, 10], ngram_range=(2, 3))



Analyzing with 10 topics, 5 words, ngram_range=(2, 3)


2025-04-19 02:14:35,716 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-19 02:14:53,672 - BERTopic - Dimensionality - Completed ✓
2025-04-19 02:14:53,673 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-19 02:14:54,037 - BERTopic - Cluster - Completed ✓
2025-04-19 02:14:54,037 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-19 02:14:59,921 - BERTopic - Representation - Completed ✓
2025-04-19 02:14:59,938 - BERTopic - Topic reduction - Reducing number of topics
2025-04-19 02:14:59,982 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-19 02:15:05,280 - BERTopic - Representation - Completed ✓
2025-04-19 02:15:05,288 - BERTopic - Topic reduction - Reduced number of topics from 261 to 10


Coherence Score: 0.8318

Analyzing with 10 topics, 10 words, ngram_range=(2, 3)


2025-04-19 02:16:30,408 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-19 02:16:49,089 - BERTopic - Dimensionality - Completed ✓
2025-04-19 02:16:49,089 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-19 02:16:49,492 - BERTopic - Cluster - Completed ✓
2025-04-19 02:16:49,492 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-19 02:16:55,822 - BERTopic - Representation - Completed ✓
2025-04-19 02:16:55,843 - BERTopic - Topic reduction - Reducing number of topics
2025-04-19 02:16:55,895 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-19 02:17:01,791 - BERTopic - Representation - Completed ✓
2025-04-19 02:17:01,812 - BERTopic - Topic reduction - Reduced number of topics from 261 to 10


Coherence Score: 0.8108

Analyzing with 20 topics, 5 words, ngram_range=(2, 3)


2025-04-19 02:18:22,674 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-19 02:18:40,724 - BERTopic - Dimensionality - Completed ✓
2025-04-19 02:18:40,724 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-19 02:18:41,124 - BERTopic - Cluster - Completed ✓
2025-04-19 02:18:41,124 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-19 02:18:47,524 - BERTopic - Representation - Completed ✓
2025-04-19 02:18:47,541 - BERTopic - Topic reduction - Reducing number of topics
2025-04-19 02:18:47,601 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-19 02:18:53,573 - BERTopic - Representation - Completed ✓
2025-04-19 02:18:53,595 - BERTopic - Topic reduction - Reduced number of topics from 261 to 20


Coherence Score: 0.8011

Analyzing with 20 topics, 10 words, ngram_range=(2, 3)


2025-04-19 02:20:15,669 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-19 02:20:33,957 - BERTopic - Dimensionality - Completed ✓
2025-04-19 02:20:33,959 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-19 02:20:34,359 - BERTopic - Cluster - Completed ✓
2025-04-19 02:20:34,375 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-19 02:20:40,947 - BERTopic - Representation - Completed ✓
2025-04-19 02:20:40,966 - BERTopic - Topic reduction - Reducing number of topics
2025-04-19 02:20:41,013 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-19 02:20:47,152 - BERTopic - Representation - Completed ✓
2025-04-19 02:20:47,175 - BERTopic - Topic reduction - Reduced number of topics from 261 to 20


Coherence Score: 0.7563


In [None]:
for topic_id in topic_model_test.get_topics():
    print(f"Topic {topic_id}: {[w for w, _ in topic_model_test.get_topic(topic_id)[:5]]}")

topic_df_test = pd.DataFrame(topic_model_test.get_topic_info())
display(topic_df_test)

Topic -1: ['lichamelijk onderzoek', 'aanvullen onderzoek', 'oraal stuk', 'reden komst', 'rectaal bloedverlie']
Topic 0: ['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'aanvullen onderzoek', 'lichamelijk onderzoek']
Topic 1: ['aantal controle', 'uitvoeren controle', 'decursus podo decursus', 'podo decursus', 'decursus podo']
Topic 2: ['beloop verpleegkundig', 'tevoren lab', 'diana lab', 'lab diana', 'lab lab']
Topic 3: ['basisdosering basisdosering', 'basisdosering basisdosering basisdosering', 'diabete lunch', 'lunch slapen', 'diabete lunch slapen']
Topic 4: ['krijgen remicade', 'krijgen remicade iv', 'krijgen venofer', 'remicade iv', 'beloop bld lijst']
Topic 5: ['vocht vocht', 'vocht vocht toediening', 'vocht toediening soort', 'vocht toediening', 'toediening soort']
Topic 6: ['denosumab inj', 'injectie denosumab', 'overig actie', 'starten denosumab', 'osteoporose baseren']
Topic 7: ['videocapsule bloed', 'videocapsule bloed proximaal', 'vg recidiveren melaena', 'melena gastrosc

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1683,-1_lichamelijk onderzoek_aanvullen onderzoek_o...,"[lichamelijk onderzoek, aanvullen onderzoek, o...",[medisch dossier vk sputovamo leeftijd registr...
1,0,6245,0_oraal stuk_tablet oraal_tablet oraal stuk_aa...,"[oraal stuk, tablet oraal, tablet oraal stuk, ...",[vervolg consult type vervolgconsult verkort i...
2,1,321,1_aantal controle_uitvoeren controle_decursus ...,"[aantal controle, uitvoeren controle, decursus...","[decursus podo Decursus uitvoeren controle, de..."
3,2,268,2_beloop verpleegkundig_tevoren lab_diana lab_...,"[beloop verpleegkundig, tevoren lab, diana lab...","[beloop verpleegkundig, initials ad Kenmerk pa..."
4,3,249,3_basisdosering basisdosering_basisdosering ba...,"[basisdosering basisdosering, basisdosering ba...",[diabete lunch slapen basisdosering basisdoser...
5,4,144,4_krijgen remicade_krijgen remicade iv_krijgen...,"[krijgen remicade, krijgen remicade iv, krijge...","[beloop bld lijst hoog novomix ophogen, krijge..."
6,5,95,5_vocht vocht_vocht vocht toediening_vocht toe...,"[vocht vocht, vocht vocht toediening, vocht to...","[vocht vocht toediening soort infuus Nacl, voc..."
7,6,88,6_denosumab inj_injectie denosumab_overig acti...,"[denosumab inj, injectie denosumab, overig act...",[vervolg consult type vervolgconsult verkort i...
8,7,81,7_videocapsule bloed_videocapsule bloed proxim...,"[videocapsule bloed, videocapsule bloed proxim...",[recidiveren melena Gastroscopie focus videoca...
9,8,59,8_decursus podo_decursus podo decursus_podo de...,"[decursus podo, decursus podo decursus, podo d...",[kwaliteitsindicator diabetisch voetwond soort...


In [None]:
# Print detailed results summary
print_results_summary(results_test, texts)


=== Results for 10 topics with 5 words (ngram_range=(2, 3)) ===
Coherence Score: 0.8318

Topics and their key words:
Topic 0: oraal stuk, tablet oraal, tablet oraal stuk, aanvullen onderzoek, lichamelijk onderzoek, mcv fl, rectaal bloedverlie, int knoppen, diabete mellitus, overig actie
Topic 1: aantal controle, uitvoeren controle, decursus podo, podo decursus, decursus podo decursus, decursus uitvoeren, podo decursus uitvoeren, decursus uitvoeren controle, controle decursus podo, controle decursus
Topic 2: basisdosering basisdosering, basisdosering basisdosering basisdosering, diabete lunch, lunch slapen, diabete lunch slapen, slapen basisdosering, slapen basisdosering basisdosering, basisdosering basisdosering diabete, basisdosering diabete, lunch slapen basisdosering
Topic 3: vocht vocht, vocht vocht toediening, vocht toediening soort, vocht toediening, toediening soort, toediening soort infuus, soort infuus, soort infuus nacl, infuus nacl, infuus nacl vocht
Topic 4: decursus podo 

In [None]:
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"topic_model_results_test_2to3gram_{timestamp}.csv"
grid_summary_test.to_csv(filename, index=False)
print(f"📁 Saved topic modeling summary to '{filename}'")

📁 Saved topic modeling summary to 'topic_model_results_test_2to3gram_20250419_022207.csv'


In [None]:
results_2to5gram, grid_summary2, topic_model2, data_df2 = analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[10, 20], n_words_list=[5, 10], ngram_range=(2, 5), calculate_coherence=False)



Analyzing with 10 topics, 5 words, ngram_range=(2, 5)


2025-04-19 02:22:10,141 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-19 02:22:27,994 - BERTopic - Dimensionality - Completed ✓
2025-04-19 02:22:27,994 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-19 02:22:28,378 - BERTopic - Cluster - Completed ✓
2025-04-19 02:22:28,378 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-19 02:22:41,977 - BERTopic - Representation - Completed ✓
2025-04-19 02:22:42,031 - BERTopic - Topic reduction - Reducing number of topics
2025-04-19 02:22:42,077 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-19 02:22:55,261 - BERTopic - Representation - Completed ✓
2025-04-19 02:22:55,311 - BERTopic - Topic reduction - Reduced number of topics from 261 to 10


In [None]:
for topic_id in topic_model2.get_topics():
    print(f"Topic {topic_id}: {[w for w, _ in topic_model2.get_topic(topic_id)[:5]]}")

topic_df2 = pd.DataFrame(topic_model2.get_topic_info())
display(topic_df2)


Topic -1: ['lichamelijk onderzoek', 'aanvullen onderzoek', 'rectaal bloedverlie', 'tablet oraal', 'medisch dossier']
Topic 0: ['tablet oraal', 'aanvullen onderzoek', 'lichamelijk onderzoek', 'int knoppen', 'mcv fl']
Topic 1: ['diabete lunch', 'lunch slapen', 'diabete lunch slapen', 'slapen diabete', 'lunch slapen diabete']
Topic 2: ['sepsis sepsis', 'innen peggy', 'krijgen remicade', 'beloop uitvoeren', 'beloop uitvoeren beloop']
Topic 3: ['beloop verpleegkundig', 'onderzoek bloed prikken', 'bijgevoegd bloeduitslag', 'medicatie meenemen', 'onderzoek bloed']
Topic 4: ['knoppen consult', 'consult knoppen', 'knoppen consult knoppen', 'verkort dossier', 'verkort verkort dossier']
Topic 5: ['vocht vocht', 'toediening soort infuus', 'vocht toediening', 'vocht toediening soort', 'vocht toediening soort infuus']
Topic 6: ['decursus podo', 'decursus podo decursus', 'podo decursus', 'uitvoeren decursus', 'uitvoeren decursus podo']
Topic 7: ['aantal aantal', 'mdlarts aantal', 'aantal venofer', 'a

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1634,-1_lichamelijk onderzoek_aanvullen onderzoek_r...,"[lichamelijk onderzoek, aanvullen onderzoek, r...",[komst verwijzing komst pijn rbbten hnpoperati...
1,0,6508,0_tablet oraal_aanvullen onderzoek_lichamelijk...,"[tablet oraal, aanvullen onderzoek, lichamelij...",[vervolg consult type vervolgconsult verkort i...
2,1,251,1_diabete lunch_lunch slapen_diabete lunch sla...,"[diabete lunch, lunch slapen, diabete lunch sl...","[diabete lunch slapen, diabete lunch slapen, d..."
3,2,238,2_sepsis sepsis_innen peggy_krijgen remicade_b...,"[sepsis sepsis, innen peggy, krijgen remicade,...","[sepsis sepsis, sepsis sepsis, sepsis sepsis]"
4,3,116,3_beloop verpleegkundig_onderzoek bloed prikke...,"[beloop verpleegkundig, onderzoek bloed prikke...",[initials streetnaam cd Kenmerk patientid betr...
5,4,108,4_knoppen consult_consult knoppen_knoppen cons...,"[knoppen consult, consult knoppen, knoppen con...",[vervolgconsult type registratie verkort Verko...
6,5,95,5_vocht vocht_toediening soort infuus_vocht to...,"[vocht vocht, toediening soort infuus, vocht t...","[vocht vocht toediening soort infuus Nacl, voc..."
7,6,93,6_decursus podo_decursus podo decursus_podo de...,"[decursus podo, decursus podo decursus, podo d...","[decursus podo Decursus uitvoeren, decursus po..."
8,7,88,7_aantal aantal_mdlarts aantal_aantal venofer_...,"[aantal aantal, mdlarts aantal, aantal venofer...","[mdlarts aantal, mdlarts aantal, mdlarts aantal]"
9,8,87,8_injectie denosumab_denosumab inj_starten den...,"[injectie denosumab, denosumab inj, starten de...",[gebruiken denosumab inj osteoporose baseren w...


In [None]:
# Print detailed results summary
print_results_summary(results_2to5gram, texts)

In [None]:
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"topic_model_results_2to5gram_{timestamp}.csv"
grid_summary2.to_csv(filename, index=False)
print(f"📁 Saved topic modeling summary to '{filename}'")

In [None]:
results_21to5gram, grid_summary1to5gram, topic_model1to5gram, data_df1to5gram = analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[20], n_words_list=[5], ngram_range=(1, 5), calculate_coherence=False)


Analyzing with 20 topics, 5 words, ngram_range=(1, 5)


2025-04-18 22:41:52,103 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-18 22:42:11,620 - BERTopic - Dimensionality - Completed ✓
2025-04-18 22:42:11,620 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-18 22:42:12,182 - BERTopic - Cluster - Completed ✓
2025-04-18 22:42:12,182 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-18 22:42:26,584 - BERTopic - Representation - Completed ✓
2025-04-18 22:42:26,632 - BERTopic - Topic reduction - Reducing number of topics
2025-04-18 22:42:26,728 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-18 22:42:42,205 - BERTopic - Representation - Completed ✓
2025-04-18 22:42:42,279 - BERTopic - Topic reduction - Reduced number of topics from 250 to 20


In [None]:
for topic_id in topic_model1to5gram.get_topics():
    print(f"Topic {topic_id}: {[w for w, _ in topic_model1to5gram.get_topic(topic_id)[:5]]}")

topic_df1to5gram = pd.DataFrame(topic_model1to5gram.get_topic_info())
display(topic_df1to5gram)


In [None]:
# Print detailed results summary
print_results_summary(topic_df1to5gram, texts)

In [None]:
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"topic_model_results_1to5gram_{timestamp}.csv"
grid_summary1to5gram.to_csv(filename, index=False)
print(f"📁 Saved topic modeling summary to '{filename}'")

NameError: name 'grid_summary' is not defined

In [None]:
# Access the best model:
best_config = grid_summary.iloc[0]
best_model = results[(best_config['n_topics'], best_config['n_words'])]['model']
best_model

<bertopic._bertopic.BERTopic at 0x2158acc3ed0>

In [None]:
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"topic_model_results_{timestamp}.csv"
grid_summary.to_csv(filename, index=False)
print(f"📁 Saved topic modeling summary to '{filename}'")

📁 Saved topic modeling summary to 'topic_model_results_20250418_211048.csv'


In [None]:
data_df 
data_df[data_df['event_tags'].apply(lambda tags: 'Topic 0' not in tags)]

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text,event_tags
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,2020-11-25 13:48:00,2020-11-25 13:48:00,gastroscopie betreffen mw initials lastname ad...,"[gastroscopie, betreffen, initials, adresgegev...",gastroscopie betreffen initials adresgegevens ...,[Unclear]
8,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus\r\n-Reden van opname:...,2016-08-23 14:59:00,2016-08-23 14:59:00,samenvatting decursus reden opnaam melena spec...,"[decursus, opnaam, melena, specieel, gastrosco...",decursus opnaam melena specieel gastroscopie b...,[Unclear]
9,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Consult,Samenvatting: \nConclusie\r\n-Samenvatting: Me...,2016-08-23 14:59:00,2016-08-23 14:59:00,samenvatting conclusie samenvatting Melena ace...,"[Melena, acenocoumarol, gebruik, gastroscopie,...",Melena acenocoumarol gebruik gastroscopie afwi...,[Unclear]
28,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Overige aantekeningen",Samenvatting: \nMemo\r\n-Memo: trombosedienst ...,2015-02-20 14:11:00,2015-02-20 14:11:00,samenvatting memo memo trombosedienst belen vp...,"[trombosedienst, belen, vpo, colo, inr, afspre...",trombosedienst belen vpo colo inr afspreken do...,[Unclear]
30,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, SEH",Samenvatting: \nSEPSIS - VPK\r\n-Bestaat er op...,2013-10-22 22:49:00,2013-10-22 22:49:00,samenvatting sepsis vpk bestaan basis anamnees...,"[sepsis, vpk, bestaan, basis, infectie]",sepsis vpk bestaan basis infectie,[Unclear]
...,...,...,...,...,...,...,...,...,...
9505,FAA79717FF2C725767E9469350ACECF640E5FCBC,Consult,Samenvatting: \nOpdrachten medewerker INT\r\n[...,2012-07-31 11:10:00,2012-07-31 11:10:00,samenvatting opdracht medewerk uitinen opdrach...,[Eliane],Eliane,[Topic 22]
9506,FAA79717FF2C725767E9469350ACECF640E5FCBC,"Consult, Polikliniek: vervolgconsult",Samenvatting: \nVervolg consult Diabetes\r\n-D...,2012-07-31 07:47:00,2012-07-31 07:47:00,samenvatting vervolg consult diabete datum con...,"[vervolg, consult, diabete, consult, intern, d...",vervolg consult diabete consult intern diabete...,[Topic 1]
9507,FAA79717FF2C725767E9469350ACECF640E5FCBC,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus Podo\r\n-Decursus: Sc...,2012-06-20 17:27:00,2012-06-20 17:27:00,samenvatting decursus podo decursus screening ...,"[decursus, podo, decursus, screening, pulsatie...",decursus podo decursus screening pulsatie prop...,[Unclear]
9508,FAA79717FF2C725767E9469350ACECF640E5FCBC,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus Podo\r\n-Decursus: VB...,2012-04-24 14:27:00,2012-04-24 14:27:00,samenvatting decursus podo Decursus vb uitvoer...,"[decursus, podo, Decursus, uitvoeren]",decursus podo Decursus uitvoeren,[Topic 12]


In [None]:
# topic_keywords = {
#     topic_num: ", ".join([word for word, _ in topic_model.get_topic(topic_num)[:5]])
#     for topic_num in topic_model.get_topics().keys()
#     if topic_num != -1  # exclude outliers
# }


In [None]:
# data_df["topic_id"] = topics
# data_df["topic_keywords"] = data_df["topic_id"].map(lambda t: topic_keywords.get(t, "Outlier"))
# data_df

In [None]:
# data_df 
# data_df[data_df['event_tags'].apply(lambda tags: 'Topic 0' not in tags)]

In [None]:
# threshold = 0.1
# doc_keywords = []

# for prob in probs:
#     if isinstance(prob, float):  # fallback
#         topic_ids = [0] if prob > threshold else []
#     else:
#         topic_ids = [i for i, p in enumerate(prob) if p > threshold]
    
#     keywords = [topic_keywords.get(t, "Outlier") for t in topic_ids]
#     doc_keywords.append("; ".join(keywords) if keywords else "Unclear")

# data_df["topic_keywords2"] = doc_keywords
# data_df


In [None]:
data_df 
data_df[data_df['event_tags'].apply(lambda tags: 'Topic 0' not in tags)]

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text,event_tags
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,2020-11-25 13:48:00,2020-11-25 13:48:00,gastroscopie betreffen mw initials lastname ad...,"[gastroscopie, betreffen, initials, adresgegev...",gastroscopie betreffen initials adresgegevens ...,[Unclear]
8,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus\r\n-Reden van opname:...,2016-08-23 14:59:00,2016-08-23 14:59:00,samenvatting decursus reden opnaam melena spec...,"[decursus, opnaam, melena, specieel, gastrosco...",decursus opnaam melena specieel gastroscopie b...,[Unclear]
9,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Consult,Samenvatting: \nConclusie\r\n-Samenvatting: Me...,2016-08-23 14:59:00,2016-08-23 14:59:00,samenvatting conclusie samenvatting Melena ace...,"[Melena, acenocoumarol, gebruik, gastroscopie,...",Melena acenocoumarol gebruik gastroscopie afwi...,[Unclear]
28,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Overige aantekeningen",Samenvatting: \nMemo\r\n-Memo: trombosedienst ...,2015-02-20 14:11:00,2015-02-20 14:11:00,samenvatting memo memo trombosedienst belen vp...,"[trombosedienst, belen, vpo, colo, inr, afspre...",trombosedienst belen vpo colo inr afspreken do...,[Unclear]
30,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, SEH",Samenvatting: \nSEPSIS - VPK\r\n-Bestaat er op...,2013-10-22 22:49:00,2013-10-22 22:49:00,samenvatting sepsis vpk bestaan basis anamnees...,"[sepsis, vpk, bestaan, basis, infectie]",sepsis vpk bestaan basis infectie,[Unclear]
...,...,...,...,...,...,...,...,...,...
9505,FAA79717FF2C725767E9469350ACECF640E5FCBC,Consult,Samenvatting: \nOpdrachten medewerker INT\r\n[...,2012-07-31 11:10:00,2012-07-31 11:10:00,samenvatting opdracht medewerk uitinen opdrach...,[Eliane],Eliane,[Topic 22]
9506,FAA79717FF2C725767E9469350ACECF640E5FCBC,"Consult, Polikliniek: vervolgconsult",Samenvatting: \nVervolg consult Diabetes\r\n-D...,2012-07-31 07:47:00,2012-07-31 07:47:00,samenvatting vervolg consult diabete datum con...,"[vervolg, consult, diabete, consult, intern, d...",vervolg consult diabete consult intern diabete...,[Topic 1]
9507,FAA79717FF2C725767E9469350ACECF640E5FCBC,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus Podo\r\n-Decursus: Sc...,2012-06-20 17:27:00,2012-06-20 17:27:00,samenvatting decursus podo decursus screening ...,"[decursus, podo, decursus, screening, pulsatie...",decursus podo decursus screening pulsatie prop...,[Unclear]
9508,FAA79717FF2C725767E9469350ACECF640E5FCBC,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus Podo\r\n-Decursus: VB...,2012-04-24 14:27:00,2012-04-24 14:27:00,samenvatting decursus podo Decursus vb uitvoer...,"[decursus, podo, Decursus, uitvoeren]",decursus podo Decursus uitvoeren,[Topic 12]


In [None]:
# Filter rows where 'straathof' appears in the 'text' column
found_reports = data[data['text'].str.contains('bitterbal', case=False, na=False)]

# Display the filtered rows
found_reports.head()

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text
4800,8CAE818529D2702AD874E91A5403942857128837,Consult,Samenvatting: \nDiabetes scorelijst\r\n-Datum:...,2017-05-23 10:59:00,2017-05-23 10:59:00,samenvatting diabete scorelijzen datum insulin...,"[diabete, insulin, bitterball, eten]",diabete insulin bitterball eten


In [None]:
print(found_reports['alltext'].iloc[0])
print(found_reports['verslagen_report_content'].iloc[0])

samenvatting diabete scorelijzen datum insulin avond extra bitterball eten
Samenvatting: 
Diabetes scorelijst
-Datum: 22-05-2017
-N: 9.2
-Insuline avond extra: bitterballen
gegeten
