In [1]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

from sklearn.feature_extraction.text import CountVectorizer
import plotly.graph_objects as go
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import ast

from nltk.corpus import stopwords
dutch_stopwords = stopwords.words('dutch')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_bertopic_model(n_topics, min_topic_size=10):
    embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=min_topic_size, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    vectorizer_model = CountVectorizer(stop_words=dutch_stopwords)
    # vectorizer_model = CountVectorizer(stop_words='english')
    
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        nr_topics=n_topics,
        verbose=True
    )
    
    return topic_model

In [3]:
def calculate_coherence_score(texts, topic_words):
    # Convert each document to a list if it's a string representation of a list
    texts = [ast.literal_eval(doc) if isinstance(doc, str) else doc for doc in texts]
    
    dictionary = Dictionary(texts)

    coherence_model = CoherenceModel(
        topics=topic_words,  
        texts=texts,
        dictionary=dictionary,
        coherence='c_v'
    )

    return coherence_model.get_coherence()

In [4]:
# def analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[5, 10, 15], n_words_list=[5, 10, 15]):
# # def analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[5], n_words_list=[5]):
#     results = {}
    
#     # to create figure for coherence scores across different configurations
#     coherence_fig = go.Figure()
    
#     for n_topics in n_topics_list:
#         coherence_scores = []
#         for n_words in n_words_list:
#                 print(f"\nAnalyzing with {n_topics} topics and {n_words} words per topic...")
                
#                 # Create and fit the BERTopic model
#                 topic_model = create_bertopic_model(n_topics)
                
#                 topics, probs = topic_model.fit_transform(texts, embeddings)
                
#                 topic_info = topic_model.get_topic_info()
                
#                 # Get topics with specified number of words
#                 topic_words = {}
#                 for topic in range(-1, len(set(topics))-1):
#                     words = topic_model.get_topic(topic)[:n_words]
#                     topic_words[topic] = [word for word, _ in words]
            

#                 for topic in range(-1, len(set(topics)) - 1):
#                     print(f"Topic {topic}: {topic_model.get_topic(topic)}")

#                 # Get topics with specified number of words (words only, no probabilities)
#                 topic_word_list = []
#                 for topic in range(len(set(topics)) - 1):
#                     topic_words = [word for word, _ in topic_model.get_topic(topic)[:n_words]]
#                     topic_word_list.append(topic_words)


#                 # Coherence score
#                 coherence = calculate_coherence_score(data['tokens'].tolist(), topic_word_list)
#                 print(f"Coherence Score (C_v): {coherence}")
#                 coherence_scores.append(coherence)
                
                
#                 results[(n_topics, n_words)] = {
#                     'model': topic_model,
#                     'topics': topics,
#                     'topic_info': topic_info,
#                     'topic_words': topic_words,
#                     'coherence': coherence
#                 }
                
#                 # to save the visualizations
#                 fig_topics = topic_model.visualize_topics()
#                 fig_topics.write_html(f'topic_visualization_{n_topics}_{n_words}.html')
                
#                 fig_heatmap = topic_model.visualize_heatmap()
#                 fig_heatmap.write_html(f'topic_heatmap_{n_topics}_{n_words}.html')
                
            
#         # Add coherence scores to the plot
#         coherence_fig.add_trace(go.Scatter(
#             x=n_words_list,
#             y=coherence_scores,
#             mode='lines+markers',
#             name=f'{n_topics} topics'
#         ))
    
#     coherence_fig.update_layout(
#         title='Coherence Scores across Different Configurations',
#         xaxis_title='Number of Words per Topic',
#         yaxis_title='Coherence Score (C_v)',
#         showlegend=True
#     )
#     coherence_fig.write_html('coherence_scores.html')
    
#     return results

In [5]:
import plotly.graph_objects as go
import pandas as pd

def analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[20, 25, 30, 40, 50], n_words_list=[5, 10]):
    results = {}
    summary_data = []

    coherence_fig = go.Figure()
    
    for n_topics in n_topics_list:
        coherence_scores = []
        
        for n_words in n_words_list:
            print(f"\n Analyzing with {n_topics} topics and top {n_words} words...")

            # Create and fit the BERTopic model
            topic_model = create_bertopic_model(n_topics)
            topics, probs = topic_model.fit_transform(texts, embeddings)
            topic_info = topic_model.get_topic_info()

            # Extract keywords per topic
            topic_word_list = []
            topic_words = {}

            for topic in range(len(set(topics)) - 1):
                words = topic_model.get_topic(topic)[:n_words]
                topic_word_list.append([word for word, _ in words])
                topic_words[topic] = [word for word, _ in words]

            # Calculate coherence
            coherence = calculate_coherence_score(data['tokens'].tolist(), topic_word_list)
            print(f" Coherence Score (C_v): {coherence:.4f}")
            coherence_scores.append(coherence)

            # Save results
            results[(n_topics, n_words)] = {
                'model': topic_model,
                'topics': topics,
                'topic_info': topic_info,
                'topic_words': topic_words,
                'coherence': coherence
            }

            # Save interactive visualizations
            topic_model.visualize_topics().write_html(f'topic_visualization_{n_topics}_{n_words}.html')
            topic_model.visualize_heatmap().write_html(f'topic_heatmap_{n_topics}_{n_words}.html')

            # Append summary data
            summary_data.append({
                'n_topics': n_topics,
                'n_words': n_words,
                'coherence': coherence
            })

        # Add to coherence plot
        coherence_fig.add_trace(go.Scatter(
            x=n_words_list,
            y=coherence_scores,
            mode='lines+markers',
            name=f'{n_topics} topics'
        ))

    coherence_fig.update_layout(
        title='Coherence Scores across Different Configurations',
        xaxis_title='Number of Words per Topic',
        yaxis_title='Coherence Score (C_v)',
        showlegend=True
    )
    coherence_fig.write_html('coherence_scores.html')

    # Create and print summary
    grid_summary = pd.DataFrame(summary_data).sort_values(by='coherence', ascending=False)
    print("\n Top Configurations by Coherence:")
    print(grid_summary.head(3).to_string(index=False))

    return results, grid_summary


In [6]:
# def print_results_summary(results, texts):
#     for (n_topics, n_words), result in results.items():
#         print(f"\n=== Results for {n_topics} topics with {n_words} words ===")
#         print(f"Coherence Score: {result['coherence']:.4f}")
        
#         print("\nTopics and their key words:")
#         topic_info = result['topic_info']
#         for _, row in topic_info.iterrows():
#             topic_num = row['Topic']
#             if topic_num != -1:
#                 words = row['Representation']
#                 print(f"Topic {topic_num}: {', '.join(words)}")

#         topic_sizes = topic_info[topic_info['Topic'] != -1]['Count'].tolist()
#         print("\nTopic sizes:", topic_sizes)
#         print("\n" + "="*50)

    
#         # # Print per-document topic assignment and top words
#         # model = result['model']
#         # topics = result['topics']
#         # docs = texts

#         # print("\nTop topic keywords per document:")
#         # for doc, topic in zip(docs, topics):
#         #     if topic != -1:
#         #         topic_words = model.get_topic(topic)
#         #         top_words = ", ".join([word for word, _ in topic_words[:5]])  # Top 5 words
#         #         print(f"Document: {doc[:100]}...")  # preview first 100 chars
#         #         print(f"Assigned Topic: {topic}")
#         #         print(f"Topic Keywords: {top_words}")
#         #         print("-" * 50)


In [7]:
def print_results_summary(results, texts, save_to_file=True, filename=None):
    import datetime

    output_lines = []

    for (n_topics, n_words), result in results.items():
        output_lines.append(f"\n=== Results for {n_topics} topics with {n_words} words ===")
        output_lines.append(f"Coherence Score: {result['coherence']:.4f}")
        
        output_lines.append("\nTopics and their key words:")
        topic_info = result['topic_info']
        for _, row in topic_info.iterrows():
            topic_num = row['Topic']
            if topic_num != -1:
                words = row['Representation']
                output_lines.append(f"Topic {topic_num}: {', '.join(words)}")

        topic_sizes = topic_info[topic_info['Topic'] != -1]['Count'].tolist()
        output_lines.append("\nTopic sizes: " + str(topic_sizes))
        output_lines.append("\n" + "="*50)

    full_output = "\n".join(output_lines)

    # Print to console
    print(full_output)

    # Optionally save to file
    if save_to_file:
        if not filename:
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"topic_results_summary_{timestamp}.txt"
        
        with open(filename, "w", encoding="utf-8") as f:
            f.write(full_output)
        print(f"\n📁 Summary saved to: {filename}")


In [8]:
def fix_character_tokens(char_list):
    text = ''.join(char_list)
    tokens = text.split()  # crude but often works
    return tokens

def flatten_nested_char_lists(nested_list):
    return [''.join(token) for token in nested_list if isinstance(token, list)]
    # return [token for token in nested_list if token != []]




In [None]:
import ast

# Load your data
data = pd.read_csv('a:/df_cleaned.csv')

# Parse the stringified lists into real Python lists
data['tokens'] = data['tokens'].apply(lambda x: ast.literal_eval(x))

# Remove unwanted tokens
# remove_list = ['mg', 'x', 'per', 'dag', 'samenvatting', 'beleid', 'conclusie', 'mmolL', 'waarvoor', 'goed', 'wel', 'beloop', 
#         'voorgeschiedenis', 'opdrachten', 'gehad', 'aanvullend', 'bekende', 'voltooid', 'mogelijk', 'gezien', 'city', 'bsn', 
#         'nodig', 'firstname', 'streetname', 'lastname', 'postcode', 'anamnese',
#         'dd', 'stuk', 'ivm', 'rechts', 'links', 'dr', 'sinds', 'huisarts', 'datum', 'dagen', 'min', 'extra', 'weken', 'algemeen', 
#         'patiënte', 'overige','linker', 'week', 'accepteren', 'maanden', 'waarschijnlijk', 'reden', 'uur', 'verdenking', 'ontslag', 
#         'stop', 'tijd', 'patiënt', 'onderzoek']

# remove_list = [
#     'mg', 'x', 'per', 'dag', 'mmolL',
#     'firstname', 'lastname', 'postcode', 'streetname', 'city', 'bsn',
#     'datum', 'dagen', 'min', 'uur', 'tijd', 'week', 'weken', 'maanden',
#     'dd', 'stuk', 'ivm', 'rechts', 'links', 'dr', 'aj', 'dhr', 'mw'
#     ]

# remove_list = ['mg', 'x', 'per', 'dag', 'samenvatting', 'beleid', 'conclusie', 'mmoll', 'waarvoor',  
#         'voorgeschiedenis', 'opdrachten', 'gehad', 'aanvullend', 'bekende', 'voltooid', 'mogelijk', 'gezien', 'city', 'bsn', 
#         'nodig', 'firstname', 'streetname', 'lastname', 'postcode', 'anamnese',
#         'dd', 'stuk', 'ivm', 'rechts', 'links', 'dr', 'sinds', 'huisarts', 'datum', 'dagen', 'min', 'extra', 'weken', 'algemeen', 
#         'patiënte', 'overige','linker', 'week', 'accepteren', 'maanden', 'waarschijnlijk', 'reden', 'uur', 'verdenking', 'ontslag', 
#         'stop', 'tijd', 'patiënt', 'memo', 'afspraak', 'opdracht', 'controle', 'poli',
#         'medewerk', 'afgewerken', 'medewerker', 'tijdsduur', 'diversen', 'rapportage',
#         'bekend', 'contact', 'intake', 'evaluatie', 'arts',
#         'recept', 'gefaxt', 'bellen', 'nee', 'aangeeft', 'eenhed' ]

remove_list = [
        # Units, measurements, filler symbols
        'mg', 'mmoll', 'mmolL', 'x', 'per', 'dag', 'min', 'uur', 'ml', 'eenhed',

        # Admin & structure
        'samenvatting', 'memo', 'beleid', 'conclusie', 'aanvullend', 'voltooid', 'afgewerken', 
        'opdracht', 'opdrachten', 'controle', 'rapportage', 'diversen', 'contact', 'afspraak', 
        'evaluatie', 'intake', 'tijd', 'tijdsduur', 'datum', 'poli', 'recept', 'gefaxt', 'bellen', 
        'akkoord', 'nodig', 'bekende', 'bekend', 'memo', 'scorelijzen', 'naslag',

        # Identifying or privacy-sensitive terms
        'bsn', 'city', 'postcode', 'firstname', 'lastname', 'streetname', 'phonenumber', 'voicemail', 
        'mw', 'dhr', 'mevrouw', 'meneer', 'zoon', 'mevr', 'mvr', 'dhr'
        'inge', 'valkenburg', 'peter', 'miriam', 'debby', 'eliane',

        # Clinical history / often uninformative by itself
        'anamnese', 'anamnees', 'voorgeschiedenis', 'reden', 'verdenking', 'huisarts', 
        'arts', 'hoofdbehandelaar', 'radiologie',

        # Temporal or ambiguous
        'sinds', 'dagen', 'weken', 'maanden', 'week', 'avond', 'nacht', 'ochtend', 'middag', 
        'extra', 'stop', 'waarschijnlijk', 'mogelijk', 'gehad', 'gezien', 'zien', 'stuk', 'dd', 
        'ivm', 'links', 'rechts', 'linker', 'dr', 'overige', 'algemeen', 'patiënt', 'patiënte',

        # Admin/communication
        'verzoek', 'mail', 'mailen', 'verstuuren', 'brief', 'uitinen', 'ak', 'bespreeklijst', 
        'wijzigingopmerking',
        'voicemail', 'telefonisch', 'mobiel', 'ingesproek', 'aanleiding', 'telefoon', 'email', 'bereiken', 'svp', 'contactpersoon'
        'terugbellen', 'gemaild', 'insproken', 'voicemail',

        # Unclear / possibly noise
        'eenhed', 'aangeeft', 'scorelijzen', 'inten', 'intn', 'vb', 'sub', 'basisdosering', 'waarvoor',

        # extra         
        'actie', 'regelen', 'opmerking', 'bespreeklijst', 'sehperiode', 'bedrijf', 
        'probleem', 'probleemlijst', 'ivb', 'mtps', 'cp', 'pat', 'huisadres', 'gg', 
        'medewerker', 'medewerk', 'laboratorium', 'apotheek', 'maand', 'tc', 'internist', 
        'wonen', 'gezondheidsinstelling', 'leven', 'varken', 'soms', 'jaar', 'mgdag', 'lateraal',
        'bespreking', 'wondfoto', 'cze',         
        
        'vrijdag', 'maandag', 'donderdag', 'woensdag', 'zaterdag', 'zondag',      
        'juli', 'augustus', 'september', 'oktober', 'november', 'december', 'januari', 'februari', 'maart', 'april', 'mei', 'jun',
        'nee', 'ja',  
        
        ]

data['tokens'] = data['tokens'].apply(lambda tokens: [word for word in tokens if word not in remove_list])

# Remove empty documents
data = data[data['tokens'].apply(len) > 0]

# Convert token lists to strings for BERTopic
data['text'] = data['tokens'].apply(lambda tokens: ' '.join(tokens))
texts = data['text'].tolist()

# Debug prints
print("First 3 token lists:", data['tokens'].head(3).tolist())
print("First 3 texts:", data['text'].head(3).tolist())
print("Total documents after filtering:", len(texts))


First 3 token lists: [['aj', 'dingemans', 'streetnaam', 'Kenmerk', 'patientid', 'betreffen', 'initials', 'geb', 'birthdate', 'streetnaam', 'zip', 'tel', 'geacht', 'collega', 'bovengenoemde', 'opnemen', 'afdeling', 'maag', 'darm', 'leverziekt', 'verband', 'melaena', 'rectaal', 'bloedverlie', 'diep', 'veneuaz', 'trombose', 'longembolie', 'cholecystectomie', 'diverticulitis', 'atriumfibrilleren', 'spontaan', 'conversie', 'sinusritme', 'melena', 'verklaring', 'vinden', 'verband', 'stabiel', 'hb', 'overleg', 'expectatief', 'vermoeidheid', 'sinusbradycardie', 'metoprolol', 'tambocor', 'vanmiddag', 'fors', 'Helderrood', 'bloedverlie', 'stolsel', 'vermengen', 'ontlasting', 'zwart', 'kleur', 'zeuren', 'pijn', 'bovenbuik', 'maagpijn', 'stoppen', 'koffie', 'drinken', 'vet', 'eten', 'ontlasting', 'intaak', 'bloed', 'zwart', 'verkleuring', 'bemerken', 'tractus', 'bijdragen', 'mn', 'lwklachten', 'all', 'penicilline', 'urticaria', 'lichamelijk', 'onderzoek', 'hr', 'bpm', 'nibp', 'mmhg', 'temp', 'alg'

In [10]:
# Load embeddings

# embeddings = np.load('embeddings_model1.npy')
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
embeddings = embedding_model.encode(texts, show_progress_bar=True)
np.save('embeddings_model6.npy', embeddings)

# print("embeddings:", embeddings)
print("Embeddings shape:", embeddings.shape)



Batches: 100%|██████████| 296/296 [04:35<00:00,  1.07it/s]

Embeddings shape: (9447, 384)





In [11]:
# Analyze with different numbers of topics and words
results, grid_summary = analyze_topics_with_sentiment(texts, embeddings, data)



 Analyzing with 20 topics and top 5 words...


2025-04-17 16:13:30,202 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-17 16:14:09,161 - BERTopic - Dimensionality - Completed ✓
2025-04-17 16:14:09,161 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-17 16:14:09,523 - BERTopic - Cluster - Completed ✓
2025-04-17 16:14:09,539 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-17 16:14:10,492 - BERTopic - Representation - Completed ✓
2025-04-17 16:14:10,492 - BERTopic - Topic reduction - Reducing number of topics
2025-04-17 16:14:10,535 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-17 16:14:11,334 - BERTopic - Representation - Completed ✓
2025-04-17 16:14:11,334 - BERTopic - Topic reduction - Reduced number of topics from 250 to 20


 Coherence Score (C_v): 0.5752

 Analyzing with 20 topics and top 10 words...


2025-04-17 16:14:34,429 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-17 16:14:52,251 - BERTopic - Dimensionality - Completed ✓
2025-04-17 16:14:52,252 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-17 16:14:52,619 - BERTopic - Cluster - Completed ✓
2025-04-17 16:14:52,634 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-17 16:14:53,653 - BERTopic - Representation - Completed ✓
2025-04-17 16:14:53,670 - BERTopic - Topic reduction - Reducing number of topics
2025-04-17 16:14:53,702 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-17 16:14:54,553 - BERTopic - Representation - Completed ✓
2025-04-17 16:14:54,553 - BERTopic - Topic reduction - Reduced number of topics from 250 to 20


 Coherence Score (C_v): 0.5007

 Analyzing with 25 topics and top 5 words...


2025-04-17 16:15:10,888 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-17 16:15:28,620 - BERTopic - Dimensionality - Completed ✓
2025-04-17 16:15:28,620 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-17 16:15:28,987 - BERTopic - Cluster - Completed ✓
2025-04-17 16:15:28,987 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-17 16:15:30,056 - BERTopic - Representation - Completed ✓
2025-04-17 16:15:30,059 - BERTopic - Topic reduction - Reducing number of topics
2025-04-17 16:15:30,086 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-17 16:15:30,953 - BERTopic - Representation - Completed ✓
2025-04-17 16:15:30,970 - BERTopic - Topic reduction - Reduced number of topics from 250 to 25


 Coherence Score (C_v): 0.6180

 Analyzing with 25 topics and top 10 words...


2025-04-17 16:15:47,457 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-17 16:16:05,123 - BERTopic - Dimensionality - Completed ✓
2025-04-17 16:16:05,123 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-17 16:16:05,520 - BERTopic - Cluster - Completed ✓
2025-04-17 16:16:05,520 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-17 16:16:06,519 - BERTopic - Representation - Completed ✓
2025-04-17 16:16:06,519 - BERTopic - Topic reduction - Reducing number of topics
2025-04-17 16:16:06,564 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-17 16:16:07,418 - BERTopic - Representation - Completed ✓
2025-04-17 16:16:07,422 - BERTopic - Topic reduction - Reduced number of topics from 250 to 25


 Coherence Score (C_v): 0.5347

 Analyzing with 30 topics and top 5 words...


2025-04-17 16:16:24,586 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-17 16:16:42,837 - BERTopic - Dimensionality - Completed ✓
2025-04-17 16:16:42,838 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-17 16:16:43,221 - BERTopic - Cluster - Completed ✓
2025-04-17 16:16:43,221 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-17 16:16:44,237 - BERTopic - Representation - Completed ✓
2025-04-17 16:16:44,238 - BERTopic - Topic reduction - Reducing number of topics
2025-04-17 16:16:44,275 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-17 16:16:45,121 - BERTopic - Representation - Completed ✓
2025-04-17 16:16:45,139 - BERTopic - Topic reduction - Reduced number of topics from 250 to 30


 Coherence Score (C_v): 0.6413

 Analyzing with 30 topics and top 10 words...


2025-04-17 16:17:04,113 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-17 16:17:21,755 - BERTopic - Dimensionality - Completed ✓
2025-04-17 16:17:21,755 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-17 16:17:22,121 - BERTopic - Cluster - Completed ✓
2025-04-17 16:17:22,121 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-17 16:17:23,121 - BERTopic - Representation - Completed ✓
2025-04-17 16:17:23,121 - BERTopic - Topic reduction - Reducing number of topics
2025-04-17 16:17:23,155 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-17 16:17:24,017 - BERTopic - Representation - Completed ✓
2025-04-17 16:17:24,021 - BERTopic - Topic reduction - Reduced number of topics from 250 to 30


 Coherence Score (C_v): 0.5580

 Analyzing with 40 topics and top 5 words...


2025-04-17 16:17:43,656 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-17 16:18:01,572 - BERTopic - Dimensionality - Completed ✓
2025-04-17 16:18:01,588 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-17 16:18:01,939 - BERTopic - Cluster - Completed ✓
2025-04-17 16:18:01,939 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-17 16:18:02,989 - BERTopic - Representation - Completed ✓
2025-04-17 16:18:02,989 - BERTopic - Topic reduction - Reducing number of topics
2025-04-17 16:18:03,024 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-17 16:18:03,891 - BERTopic - Representation - Completed ✓
2025-04-17 16:18:03,897 - BERTopic - Topic reduction - Reduced number of topics from 252 to 40


 Coherence Score (C_v): 0.6469

 Analyzing with 40 topics and top 10 words...


2025-04-17 16:18:24,973 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-17 16:18:43,125 - BERTopic - Dimensionality - Completed ✓
2025-04-17 16:18:43,125 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-17 16:18:43,505 - BERTopic - Cluster - Completed ✓
2025-04-17 16:18:43,505 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-17 16:18:44,508 - BERTopic - Representation - Completed ✓
2025-04-17 16:18:44,508 - BERTopic - Topic reduction - Reducing number of topics
2025-04-17 16:18:44,539 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-17 16:18:45,406 - BERTopic - Representation - Completed ✓
2025-04-17 16:18:45,406 - BERTopic - Topic reduction - Reduced number of topics from 250 to 40


 Coherence Score (C_v): 0.5664

 Analyzing with 50 topics and top 5 words...


2025-04-17 16:19:07,680 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-17 16:19:25,498 - BERTopic - Dimensionality - Completed ✓
2025-04-17 16:19:25,514 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-17 16:19:25,846 - BERTopic - Cluster - Completed ✓
2025-04-17 16:19:25,846 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-17 16:19:26,783 - BERTopic - Representation - Completed ✓
2025-04-17 16:19:26,783 - BERTopic - Topic reduction - Reducing number of topics
2025-04-17 16:19:26,815 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-17 16:19:27,608 - BERTopic - Representation - Completed ✓
2025-04-17 16:19:27,608 - BERTopic - Topic reduction - Reduced number of topics from 252 to 50


 Coherence Score (C_v): 0.6431

 Analyzing with 50 topics and top 10 words...


2025-04-17 16:19:52,039 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-17 16:20:10,652 - BERTopic - Dimensionality - Completed ✓
2025-04-17 16:20:10,653 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-17 16:20:11,029 - BERTopic - Cluster - Completed ✓
2025-04-17 16:20:11,029 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-17 16:20:11,985 - BERTopic - Representation - Completed ✓
2025-04-17 16:20:11,986 - BERTopic - Topic reduction - Reducing number of topics
2025-04-17 16:20:12,022 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-17 16:20:12,855 - BERTopic - Representation - Completed ✓
2025-04-17 16:20:12,859 - BERTopic - Topic reduction - Reduced number of topics from 250 to 50


 Coherence Score (C_v): 0.5675

 Top Configurations by Coherence:
 n_topics  n_words  coherence
       40        5   0.646914
       50        5   0.643080
       30        5   0.641312


In [12]:
grid_summary

Unnamed: 0,n_topics,n_words,coherence
6,40,5,0.646914
8,50,5,0.64308
4,30,5,0.641312
2,25,5,0.618025
0,20,5,0.575176
9,50,10,0.567492
7,40,10,0.566372
5,30,10,0.558042
3,25,10,0.534691
1,20,10,0.500743


In [None]:
# Access the best model:
best_config = grid_summary.iloc[0]
best_model = results[(best_config['n_topics'], best_config['n_words'])]['model']
best_model

In [14]:
# Print detailed results summary
print_results_summary(results, texts)


=== Results for 20 topics with 5 words ===
Coherence Score: 0.5752

Topics and their key words:
Topic 0: oraal, anemie, onderzoek, tablet, opname, goed, hb, mmoll, hypertensie, medicatie
Topic 1: lab, anemie, ferriprief, gastroscopie, overig, ferritine, hb, beloop, ijzerinfuus, coloscopie
Topic 2: lunch, slapen, diabete, lev, ontbijt, insulin, lantus, spuiten, slaaptable, bitterball
Topic 3: sepsis, tp, peggy, innen, beloop, remicade, krijgen, balie, venofer, mirjam
Topic 4: verpleegkundig, beloop, ct, meenemen, ctbuik, werkdag, dagbehandeling, contrastverpleegkundige, routenummer, bijgevoegd
Topic 5: eten, knoppen, consult, verkort, kg, gaan, gewicht, labaanvraag, goed, mdlarts
Topic 6: vocht, soort, toediening, infuus, nacl, uitscheiding, totaal, medicatie, divers, ring
Topic 7: aantal, mdlarts, venofer, jp, lm, db, remicade, jb, mp, ed
Topic 8: decursus, podo, uitvoeren, wonddebridement, wond, tcc, mtp, ulcus, osa, service
Topic 9: osteoporose, denosumab, fractuur, injectie, dexa, 

In [15]:
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"topic_model_results_{timestamp}.csv"
grid_summary.to_csv(filename, index=False)
print(f"📁 Saved topic modeling summary to '{filename}'")


📁 Saved topic modeling summary to 'topic_model_results_20250417_162033.csv'


In [16]:
# Filter rows where 'straathof' appears in the 'text' column
found_reports = data[data['text'].str.contains(' eliane ', case=False, na=False)]

# Display the filtered rows
found_reports.head()

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text


In [None]:
# print(found_reports['alltext'].iloc[1])
# print(found_reports['verslagen_report_content'].iloc[1])

IndexError: single positional indexer is out-of-bounds

NameError: name 'topic_model' is not defined