In [1]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

from sklearn.feature_extraction.text import CountVectorizer
import plotly.graph_objects as go
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import ast

from nltk.corpus import stopwords
dutch_stopwords = stopwords.words('dutch')

import re
import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_bertopic_model(n_topics, min_topic_size=50, ngram_range=(1, 1)):
    print(f"Creating BERTopic model with {n_topics} topics and min_topic_size={min_topic_size}...")
    embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=min_topic_size, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    vectorizer_model = CountVectorizer(stop_words=dutch_stopwords, ngram_range=ngram_range)

    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        nr_topics=n_topics,
        verbose=True
    )
    return topic_model

In [3]:

def calculate_coherence_score(texts, topic_words, ngram_range=(1, 1)):
    if not topic_words or len(topic_words) == 0:
        print("No topic words provided.")
        return None


    # Zorg dat tokens correct zijn
    texts = [ast.literal_eval(doc) if isinstance(doc, str) else doc for doc in texts]

    # Maak n-gram strings van tokens
    joined_texts = [" ".join(doc) for doc in texts]
    vectorizer = CountVectorizer(ngram_range=ngram_range).fit(joined_texts)

    # Maak per document lijst met gegenereerde n-grams
    ngram_texts = [list(ngrams) for ngrams in vectorizer.inverse_transform(vectorizer.transform(joined_texts))]

    # Maak Gensim dictionary en coherence model
    dictionary = Dictionary(ngram_texts)
    coherence_model = CoherenceModel(
        topics=topic_words,
        texts=ngram_texts,
        dictionary=dictionary,
        coherence='c_v'
    )
    return coherence_model.get_coherence()

In [4]:
def analyze_topics_with_sentiment(texts, embeddings, data, n_topics_list=[20], n_words_list=[5], 
                                  ngram_range=(1, 2), min_topic_size_list=[10], calculate_coherence=True):
    results = {}
    summary_data = []
    all_doc_tags = {}
    data_df = data.copy()

    for min_size in min_topic_size_list:
        for n_topics in n_topics_list:
            for n_words in n_words_list:
                # print(f"\nAnalyzing with {n_topics} topics, {n_words} words, ngram_range={ngram_range}")
                print(f"\nAnalyzing with min_topic_size={min_size}, n_topics={n_topics}, {n_words} words, ngram_range={ngram_range}")


                topic_model = create_bertopic_model(n_topics, ngram_range=ngram_range, min_topic_size=min_size)
                topics, probs = topic_model.fit_transform(texts, embeddings)

                topic_info = topic_model.get_topic_info()

                # Verwerk keywords per topic
                topic_words = {}
                for topic in set(topics):
                    if topic != -1:
                        words = topic_model.get_topic(topic)[:n_words]
                        topic_words[topic] = [word for word, _ in words]

                topic_word_list = list(topic_words.values())

                # Calculate coherence          
                if calculate_coherence == True:
                    # print(f"Number of topics: {len(topics)}")
                    # print(f"Number of probabilities: {len(probs) if probs is not None else 'None'}")
                    # print(f"Probs: {probs}")
                    # coherence = calculate_coherence_score(data['tokens'].tolist(), topic_word_list)
                    print("Calculating coherence score...")

                    if not topic_word_list or len(topic_word_list) == 0:
                        print("No topics were found. Skipping coherence calculation.")
                        coherence = None

                    else:
                        coherence = calculate_coherence_score(data['tokens'].tolist(), topic_word_list, ngram_range=ngram_range)
                        print(f"Coherence Score: {coherence:.4f}")
                else:
                    print("Skipping coherence calculation...")
                    coherence = None
                    print("Coherence Score (C_v):", coherence)
                        

                # Per document: top topics (≥ threshold)
                doc_tags = []
                threshold = 0.1
                for i, topic in enumerate(topics):
                    if probs is None or isinstance(probs[i], float):
                        # fallback als probs None of 1D zijn
                        tags = [f"Topic {topic}"]
                    else:
                        tags = [f"Topic {j}" for j, p in enumerate(probs[i]) if p > threshold]
                    doc_tags.append(tags if tags else ["Unclear"])

                # Per document: keywords van toegekende topic (voor extra inzicht)
                topic_keywords_per_doc = [
                    topic_words.get(t, []) if t in topic_words else [] for t in topics
                ]

                # Update dataframe
                data_df[f'topic_{n_topics}_{n_words}_{min_size}'] = topics
                data_df[f'tags_{n_topics}_{n_words}_{min_size}'] = doc_tags
                data_df[f'keywords_{n_topics}_{n_words}_{min_size}'] = topic_keywords_per_doc


                results[(n_topics, n_words, ngram_range, min_size)] = {
                    'model': topic_model,
                    'topics': topics,
                    'probs': probs,
                    'topic_words': topic_words,
                    'coherence': coherence,
                    'topic_info': topic_info
                }


                summary_data.append({
                    'min_topic_size': min_size,
                    'n_topics': n_topics,
                    'n_words': n_words,
                    'ngram_range': str(ngram_range),
                    'coherence': coherence
                })


    grid_summary = pd.DataFrame(summary_data).sort_values(by='coherence', ascending=False)
    return results, grid_summary, topic_model, data_df


In [5]:
def print_results_summary(results, texts, save_to_file=True, filename=None):
    import datetime

    output_lines = []

    # for (n_topics, n_words), result in results.items():
    for key, result in results.items():
        print('key:', key)
        if isinstance(key, tuple) and len(key) == 4:
            n_topics, n_words, ngram_range, min_size = key

        elif isinstance(key, tuple) and len(key) == 3:
            n_topics, n_words, min_size = key
            ngram_range = "(1, 1)"
            # min_size = 'Not given'
        else:
            n_topics, n_words = key
            ngram_range = "(1, 1)"
            min_size = 'Not given'

        # output_lines.append(f"\n=== Results for {n_topics} topics with {n_words} words ===")
        output_lines.append(f"\n=== Results for {n_topics} topics with {n_words} words, (ngram_range={ngram_range}) and min_topic_size:{min_size} ===")
        # output_lines.append(f"Coherence Score: {result['coherence']:.4f}")
        coherence = result.get('coherence')
        if coherence is not None:
            output_lines.append(f"Coherence Score: {coherence:.4f}")
        else:
            output_lines.append("Coherence Score: Not available")

        
        output_lines.append("\nTopics and their key words:")
        topic_info = result['topic_info']
        for _, row in topic_info.iterrows():
            topic_num = row['Topic']
            if topic_num != -1:
                words = row['Representation']
                output_lines.append(f"Topic {topic_num}: {', '.join(words)}")

        topic_sizes = topic_info[topic_info['Topic'] != -1]['Count'].tolist()
        output_lines.append("\nTopic sizes: " + str(topic_sizes))
        output_lines.append("\n" + "="*50)

    full_output = "\n".join(output_lines)

    # Print to console
    print(full_output)

    # Optionally save to file
    if save_to_file:
        if not filename:
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"topic_results_summary_{timestamp}.txt"
        
        with open(filename, "w", encoding="utf-8") as f:
            f.write(full_output)
        print(f"\n📁 Summary saved to: {filename}")

In [6]:
def fix_character_tokens(char_list):
    text = ''.join(char_list)
    tokens = text.split()  # crude but often works
    return tokens

def flatten_nested_char_lists(nested_list):
    return [''.join(token) for token in nested_list if isinstance(token, list)]
    # return [token for token in nested_list if token != []]




In [7]:
import ast

# Load your data
data = pd.read_csv('a:/df_cleaned.csv')

# Parse the stringified lists into real Python lists
data['tokens'] = data['tokens'].apply(lambda x: ast.literal_eval(x))


remove_list = [
        # Units, measurements, filler symbols
        'mg', 'mmoll', 'mmolL', 'x', 'per', 'dag', 'min', 'uur', 'ml', 'eenhed',

        # Admin & structure
        'samenvatting', 'memo', 'beleid', 'conclusie', 'aanvullend', 'afgewerken', 
        'opdracht', 'opdrachten', 'rapportage', 'diversen', 'contact', 'afspraak', 
        'tijd', 'tijdsduur', 'datum', 'poli', 'recept', 'gefaxt', 'bellen', 
        'akkoord', 'nodig', 'bekende', 'bekend', 'memo', 'scorelijzen', 'naslag',

        # Identifying or privacy-sensitive terms
        'bsn', 'city', 'postcode', 'firstname', 'lastname', 'streetname', 'phonenumber', 'voicemail', 
        'mw', 'dhr', 'mevrouw', 'meneer', 'zoon', 'mevr', 'mvr', 'dhr'
        'inge', 'valkenburg', 'peter', 'miriam', 'debby', 'eliane',

        # Clinical history / often uninformative by itself
        'anamnese', 'anamnees', 'voorgeschiedenis', 'huisarts', 
        'hoofdbehandelaar',

        # Temporal or ambiguous
        'sinds', 'dagen', 'weken', 'maanden', 'week', 'avond', 'nacht', 'ochtend', 'middag', 
        'extra', 'stop', 'gehad', 'gezien', 'zien', 'dd', 
        'ivm', 'links', 'rechts', 'linker', 'dr', 'overige', 'algemeen', 'patiënt', 'patiënte',

        # Admin/communication
        'verzoek', 'mail', 'mailen', 'verstuuren', 'brief', 'uitinen', 'ak', 'bespreeklijst', 
        'wijzigingopmerking',
        'voicemail', 'telefonisch', 'mobiel', 'ingesproek', 'aanleiding', 'telefoon', 'email', 'bereiken', 'svp', 'contactpersoon'
        'terugbellen', 'gemaild', 'insproken', 'voicemail',

        # Unclear / possibly noise
        'eenhed', 'aangeeft', 'scorelijzen', 'inten', 'intn', 'vb', 'sub', 

        # extra         
        'regelen', 'opmerking', 'bespreeklijst', 'sehperiode', 'bedrijf', 
        'ivb', 'mtps', 'cp', 'pat', 'huisadres', 'gg', 
        'medewerker', 'medewerk', 'laboratorium', 'apotheek', 'maand', 'tc', 
        'wonen', 'gezondheidsinstelling', 'leven', 'varken', 'soms', 'jaar', 'mgdag', 'lateraal',
        'bespreking', 'wondfoto', 'cze',         
        
        'vrijdag', 'maandag', 'donderdag', 'woensdag', 'zaterdag', 'zondag',      
        'juli', 'augustus', 'september', 'oktober', 'november', 'december', 'januari', 'februari', 'maart', 'april', 'mei', 'jun',
        
        
        # 'voltooid', 'verdenking', 'waarvoor', 'reden', 'waarschijnlijk', 'mogelijk', 'stuk', 'basisdosering', 'probleem', 'probleemlijst', 'actie',
        # 'nee', 'ja',  'arts',  'radiologie', 'internist', 'evaluatie', 'intake', 'controle',


        ]

data['tokens'] = data['tokens'].apply(lambda tokens: [word for word in tokens if word not in remove_list])

# Remove double words that are next to each other
for i in range(len(data)):
    tokens = data['tokens'].loc[i]
    # print("tokens list length:", len(tokens))
    new_tokens = []
    for j in range(len(tokens) - 1):
        if tokens[j] == tokens[j + 1]:
            print("Duplicate token found:", tokens[j], 'and', tokens[j + 1])
        else:
            new_tokens.append(tokens[j])
    # Append the last token since it won't be checked in the loop
    if tokens:
        new_tokens.append(tokens[-1])
    data['tokens'].loc[i] = new_tokens
    # print("New tokens list length:", len(new_tokens))

# Remove empty documents
data = data[data['tokens'].apply(len) > 0]

# Convert token lists to strings for BERTopic
data['text'] = data['tokens'].apply(lambda tokens: ' '.join(tokens))
texts = data['text'].tolist()

# Debug prints
print("First 3 token lists:", data['tokens'].head(3).tolist())
print("First 3 texts:", data['text'].head(3).tolist())
print("Total documents after filtering:", len(texts))


Duplicate token found: bloedgroep and bloedgroep
Duplicate token found: coloscopie and coloscopie
Duplicate token found: acenocoumarol and acenocoumarol
Duplicate token found: Fentanyl and Fentanyl
Duplicate token found: bloed and bloed
Duplicate token found: coloscopie and coloscopie
Duplicate token found: visite and visite
Duplicate token found: visite and visite
Duplicate token found: duodenum and duodenum
Duplicate token found: seh and seh
Duplicate token found: visite and visite
Duplicate token found: visite and visite
Duplicate token found: pijnscore and pijnscore
Duplicate token found: gibloeding and gibloeding
Duplicate token found: triage and triage
Duplicate token found: intern and intern
Duplicate token found: collumcaar and collumcaar
Duplicate token found: pijnscore and pijnscore
Duplicate token found: triage and triage
Duplicate token found: def and def
Duplicate token found: vocht and vocht
Duplicate token found: collumcaar and collumcaar
Duplicate token found: pijnscore

In [8]:
# Load embeddings

### model7 is from before removing duplicates
### model8 is with removal of duplicates based on words that are next to each other

embeddings = np.load('embeddings_model8.npy')
# embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# embeddings = embedding_model.encode(texts, show_progress_bar=True)
# np.save('embeddings_model8.npy', embeddings)

# print("embeddings:", embeddings)
print("Embeddings shape:", embeddings.shape)

Embeddings shape: (9493, 384)


In [9]:
# def generate_event_labels(topic_words_dict, label_style="title"):
#     label_dict = {}
#     for topic_id, words in topic_words_dict.items():
#         if not words: continue
#         label = " / ".join(words[:3])  # pick top n-grams
#         if label_style == "title":
#             label = label.replace("_", " ").title()
#         label_dict[topic_id] = label
#     return label_dict

# # Apply
# event_labels = generate_event_labels(results_2to3[(20, 5, (2, 3))]['topic_words'])
# event_labels

In [10]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd

# # Extract probabilities and topic labels
# probs = results_2to3[(20, 5, (2, 3))]['probs']
# topics = results_2to3[(20, 5, (2, 3))]['topics']
# labels = generate_event_labels(results_2to3[(20, 5, (2, 3))]['topic_words'])

# # Build a DataFrame (optional: filter to a subset of docs for legibility)
# df_probs = pd.DataFrame(probs).iloc[:50]  # first 50 docs
# df_probs.columns = [labels.get(i, f"Topic {i}") for i in df_probs.columns]

# # Heatmap
# plt.figure(figsize=(14, 8))
# sns.heatmap(df_probs, cmap="YlGnBu", cbar_kws={'label': 'Topic Relevance'}, linewidths=0.1)
# plt.title("Document vs Topic Heatmap (Top 50 reports)")
# plt.xlabel("Topics (Events)")
# plt.ylabel("Documents")
# plt.tight_layout()
# plt.show()


In [11]:
# results_2to3, grid_summary_2to3, topic_model_2to3, data_df_2to3 = analyze_topics_with_sentiment(texts, 
#                                                                                                 embeddings, 
#                                                                                                 data, 
#                                                                                                 n_topics_list=[5], 
#                                                                                                 n_words_list=[5], 
#                                                                                                 min_topic_size_list=[10],
#                                                                                                 ngram_range=(2, 3))


In [12]:
# results_2to3

In [13]:
# topic_info = topic_model_2to3.get_topic_info()
# topic_info

In [14]:
# data_df_2to3.head(5)

In [15]:
data

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",2020-11-26 15:06:00,2020-11-26 15:06:00,dhr aj dingemans huisarts streetnaam city datu...,"[aj, dingemans, streetnaam, Kenmerk, patientid...",aj dingemans streetnaam Kenmerk patientid betr...
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies obv diver...,2020-11-26 09:53:00,2020-11-26 09:53:00,samenvatting rectaal bloedverlie obvn divertik...,"[rectaal, bloedverlie, obvn, divertikelbloedin...",rectaal bloedverlie obvn divertikelbloeding ac...
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,2020-11-25 14:13:00,2020-11-25 14:13:00,coloscopie betreffen mw initials lastname adre...,"[coloscopie, betreffen, initials, adresgegeven...",coloscopie betreffen initials adresgegevens st...
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,2020-11-25 13:48:00,2020-11-25 13:48:00,gastroscopie betreffen mw initials lastname ad...,"[gastroscopie, betreffen, initials, adresgegev...",gastroscopie betreffen initials adresgegevens ...
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies ; eenmali...,2020-11-25 08:47:00,2020-11-25 08:47:00,samenvatting rectaal bloedverlie eenmalig hd h...,"[rectaal, bloedverlie, eenmalig, hd, hbstabiel...",rectaal bloedverlie eenmalig hd hbstabiel inr ...
...,...,...,...,...,...,...,...,...
9572,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,Consult,Samenvatting: \n1e consult\r\n-Type 1e consult...,2015-03-20 08:13:00,2015-03-20 08:13:00,samenvatting consult type consult uitbreiden a...,"[consult, type, consult, uitbreiden, consult, ...",consult type consult uitbreiden consult reden ...
9573,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus\r\n-Type decursus: De...,2015-01-14 15:39:00,2015-01-14 15:39:00,samenvatting decursus type decursus decursus s...,"[decursus, type, decursus, Rvo, nstemi, uwi, r...",decursus type decursus Rvo nstemi uwi rvc prom...
9574,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,"Consult, SEH",Samenvatting: \nVerpleegkundige verslaglegging...,2014-12-21 09:31:00,2014-12-21 09:31:00,samenvatting verpleegkundig verslaglegging ver...,"[verpleegkundig, verslaglegging, verantwoordel...",verpleegkundig verslaglegging verantwoordelijk...
9575,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,"Consult, SEH",Samenvatting: \nMedisch Dossier\r\n[ Vk Sputov...,2010-11-10 21:03:00,2010-11-10 21:03:00,samenvatting medisch dossier vk sputovamo leef...,"[medisch, dossier, vk, sputovamo, leeftijd, re...",medisch dossier vk sputovamo leeftijd registra...


In [16]:
texts = data['text'].tolist()
texts

['aj dingemans streetnaam Kenmerk patientid betreffen initials geb birthdate streetnaam zip tel geacht collega bovengenoemde opnemen afdeling maag darm leverziekt verband melaena rectaal bloedverlie diep veneuaz trombose longembolie cholecystectomie diverticulitis atriumfibrilleren spontaan conversie sinusritme melena waarvoor verklaring vinden verband stabiel hb overleg expectatief vermoeidheid sinusbradycardie waarvoor metoprolol tambocor vanmiddag fors Helderrood bloedverlie stolsel vermengen ontlasting zwart kleur zeuren pijn bovenbuik maagpijn waarvoor stoppen koffie drinken vet eten ontlasting intaak bloed zwart verkleuring bemerken tractus bijdragen mn lwklachten all penicilline urticaria lichamelijk onderzoek controle hr bpm nibp mmhg temp alg acuut ziek duidelijk anemisch hh pearl lymfadenopathie Cor souffle pulm vag beiderzijds bijgeluiod abd normaal peristaltiek wisselen tympanie soepel abdomen mild drukpijn epigastrio loslaatpijn murphy rt Helderrood bloed handschoen feces 

In [17]:
# embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# embeddings = embedding_model.encode(texts, show_progress_bar=True)

In [18]:
# results_2to3, grid_summary_2to3, topic_model_2to3, data_df_2to3 = analyze_topics_with_sentiment(texts, 
#                                                                                                 embeddings, 
#                                                                                                 data, 
#                                                                                                 n_topics_list=[5], 
#                                                                                                 n_words_list=[5], 
#                                                                                                 min_topic_size_list=[10],
#                                                                                                 ngram_range=(2, 3))

In [19]:
# tag_df = data[data['verslagen_report_tags'] == 'Klinische Brief']
# tag_df

In [20]:
for tag in data['verslagen_report_tags'].unique():
    tag_df = data[data['verslagen_report_tags'] == tag]
    print(f"Tag {tag} - Number of documents: {len(tag_df)}")

Tag Klinische Brief - Number of documents: 318
Tag Consult, Kliniek: vervolgconsult - Number of documents: 2248
Tag Poliklinische Brief - Number of documents: 481
Tag Spoedeisende Hulp - Number of documents: 250
Tag Consult, SEH - Number of documents: 1177
Tag Consult - Number of documents: 1553
Tag Consult, Kliniek: eerste consult - Number of documents: 139
Tag Consult, Polikliniek: vervolgconsult - Number of documents: 1992
Tag Consult, Polikliniek: eerste consult - Number of documents: 128
Tag Consult, Overige aantekeningen - Number of documents: 674
Tag Consult, Telefonisch consult - Number of documents: 452
Tag Radiologieverslag, ECG - Number of documents: 1
Tag Consult, Thuisbehandeling - Number of documents: 14
Tag Consult, Multidisciplinair overleg - Number of documents: 11
Tag Algemeen - Number of documents: 14
Tag Consult, Dagbehandeling - Number of documents: 9
Tag Consult, Intercollegiaal consult - Number of documents: 12
Tag Consult, Research - Number of documents: 1
Tag C

In [21]:
# import matplotlib.pyplot as plt

# tag_counts = data['verslagen_report_tags'].value_counts()
# tag_counts.plot(kind='barh', figsize=(10, 6))
# plt.title('Document Count per Tag')
# plt.xlabel('Number of Documents')
# plt.ylabel('Tag')
# plt.tight_layout()
# plt.show()


In [22]:
def get_ttr(tag_df):
    tag_tokens = tag_df['tokens'].sum()
    total_tokens = len(tag_tokens)
    unique_tokens = len(set(tag_tokens))
    ttr = unique_tokens / total_tokens if total_tokens else 0
   
    return ttr

In [23]:
def choose_topic_params(doc_count, ttr):
    if doc_count >= 2000:
        if ttr < 0.07:
            return 5, 100  # very repetitive
        elif ttr < 0.15:
            return 8, 100
        else:
            return 10, 100  # high diversity, allow more topics

    elif doc_count >= 1000:
        if ttr < 0.1:
            return 4, 50
        else:
            return 6, 50

    elif doc_count >= 300:
        if ttr < 0.1:
            return 3, 15
        elif ttr < 0.2:
            return 4, 15
        else:
            return 5, 15

    elif doc_count >= 100:
        return (4, 5) if ttr >= 0.25 else (3, 5)

    elif doc_count >= 30:
        return (2, 5) if ttr >= 0.3 else (2, 5)

    else:
        return None, None



# 


In [24]:
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
tag_results = {}

for tag in data['verslagen_report_tags'].unique():
    print(f"\nAnalyzing tag: {tag}")
    tag_df = data[data['verslagen_report_tags'] == tag]
    n_docs = len(tag_df)
    ttr = get_ttr(tag_df)

    print(f"Number of documents: {n_docs}, TTR: {ttr:.4f}")


    if n_docs < 20:
        print(f"Skipping tag '{tag}' due to insufficient data: {n_docs} documents.")
        break

    # Tiered parameter settings

    n_topics, min_topic_size = choose_topic_params(n_docs, ttr)
    print(f"Chosen parameters - n_topics: {n_topics}, min_topic_size: {min_topic_size}")


    tag_texts = tag_df['text'].tolist()
    tag_embeddings = embedding_model.encode(tag_texts, show_progress_bar=True)



    results, grid_summary, topic_model, data_df = analyze_topics_with_sentiment(
        tag_texts,
        tag_embeddings,
        tag_df,
        n_topics_list=[n_topics],
        n_words_list=[5],
        min_topic_size_list=[min_topic_size],
        ngram_range=(2, 3)
    )

    tag_results[tag] = {
        'results': results,
        'grid_summary': grid_summary,
        'topic_model': topic_model,
        'data_df': data_df
    }



Analyzing tag: Klinische Brief
Number of documents: 318, TTR: 0.0676
Chosen parameters - n_topics: 3, min_topic_size: 15


Batches: 100%|██████████| 10/10 [00:13<00:00,  1.39s/it]



Analyzing with min_topic_size=15, n_topics=3, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 3 topics and min_topic_size=15...


2025-05-16 00:20:43,989 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-16 00:20:54,639 - BERTopic - Dimensionality - Completed ✓
2025-05-16 00:20:54,639 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-16 00:20:54,657 - BERTopic - Cluster - Completed ✓
2025-05-16 00:20:54,658 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-16 00:20:55,250 - BERTopic - Representation - Completed ✓
2025-05-16 00:20:55,253 - BERTopic - Topic reduction - Reducing number of topics
2025-05-16 00:20:55,253 - BERTopic - Topic reduction - Number of topics (3) is equal or higher than the clustered topics(3).
2025-05-16 00:20:55,254 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-16 00:20:55,990 - BERTopic - Representation - Completed ✓


Calculating coherence score...
Coherence Score: 0.3204

Analyzing tag: Consult, Kliniek: vervolgconsult
Number of documents: 2248, TTR: 0.0493
Chosen parameters - n_topics: 5, min_topic_size: 100


Batches: 100%|██████████| 71/71 [01:26<00:00,  1.22s/it]



Analyzing with min_topic_size=100, n_topics=5, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 5 topics and min_topic_size=100...


2025-05-16 00:22:54,460 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-16 00:23:03,818 - BERTopic - Dimensionality - Completed ✓
2025-05-16 00:23:03,820 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-16 00:23:03,985 - BERTopic - Cluster - Completed ✓
2025-05-16 00:23:03,986 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-16 00:23:05,355 - BERTopic - Representation - Completed ✓
2025-05-16 00:23:05,360 - BERTopic - Topic reduction - Reducing number of topics
2025-05-16 00:23:05,361 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(4).
2025-05-16 00:23:05,361 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-16 00:23:06,922 - BERTopic - Representation - Completed ✓


Calculating coherence score...
Coherence Score: 0.7763

Analyzing tag: Poliklinische Brief
Number of documents: 481, TTR: 0.0970
Chosen parameters - n_topics: 3, min_topic_size: 15


Batches: 100%|██████████| 16/16 [00:20<00:00,  1.28s/it]



Analyzing with min_topic_size=15, n_topics=3, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 3 topics and min_topic_size=15...


2025-05-16 00:24:04,431 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-16 00:24:05,398 - BERTopic - Dimensionality - Completed ✓
2025-05-16 00:24:05,399 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-16 00:24:05,415 - BERTopic - Cluster - Completed ✓
2025-05-16 00:24:05,416 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-16 00:24:05,823 - BERTopic - Representation - Completed ✓
2025-05-16 00:24:05,825 - BERTopic - Topic reduction - Reducing number of topics
2025-05-16 00:24:05,837 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-16 00:24:06,264 - BERTopic - Representation - Completed ✓
2025-05-16 00:24:06,268 - BERTopic - Topic reduction - Reduced number of topics from 5 to 3


Calculating coherence score...
Coherence Score: 0.7423

Analyzing tag: Spoedeisende Hulp
Number of documents: 250, TTR: 0.1027
Chosen parameters - n_topics: 3, min_topic_size: 5


Batches: 100%|██████████| 8/8 [00:11<00:00,  1.39s/it]



Analyzing with min_topic_size=5, n_topics=3, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 3 topics and min_topic_size=5...


2025-05-16 00:24:40,833 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-16 00:24:41,285 - BERTopic - Dimensionality - Completed ✓
2025-05-16 00:24:41,286 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-16 00:24:41,295 - BERTopic - Cluster - Completed ✓
2025-05-16 00:24:41,296 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-16 00:24:41,821 - BERTopic - Representation - Completed ✓
2025-05-16 00:24:41,824 - BERTopic - Topic reduction - Reducing number of topics
2025-05-16 00:24:41,828 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-16 00:24:42,374 - BERTopic - Representation - Completed ✓
2025-05-16 00:24:42,377 - BERTopic - Topic reduction - Reduced number of topics from 13 to 3


Calculating coherence score...
Coherence Score: 0.4954

Analyzing tag: Consult, SEH
Number of documents: 1177, TTR: 0.0740
Chosen parameters - n_topics: 4, min_topic_size: 50


Batches: 100%|██████████| 37/37 [00:40<00:00,  1.10s/it]



Analyzing with min_topic_size=50, n_topics=4, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 4 topics and min_topic_size=50...


2025-05-16 00:25:49,524 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-16 00:25:53,060 - BERTopic - Dimensionality - Completed ✓
2025-05-16 00:25:53,061 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-16 00:25:53,105 - BERTopic - Cluster - Completed ✓
2025-05-16 00:25:53,106 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-16 00:25:54,708 - BERTopic - Representation - Completed ✓
2025-05-16 00:25:54,713 - BERTopic - Topic reduction - Reducing number of topics
2025-05-16 00:25:54,722 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-16 00:25:56,110 - BERTopic - Representation - Completed ✓
2025-05-16 00:25:56,117 - BERTopic - Topic reduction - Reduced number of topics from 5 to 4


Calculating coherence score...
Coherence Score: 0.3993

Analyzing tag: Consult
Number of documents: 1553, TTR: 0.1312
Chosen parameters - n_topics: 6, min_topic_size: 50


Batches: 100%|██████████| 49/49 [00:26<00:00,  1.86it/s]



Analyzing with min_topic_size=50, n_topics=6, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 6 topics and min_topic_size=50...


2025-05-16 00:26:59,905 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-16 00:27:06,110 - BERTopic - Dimensionality - Completed ✓
2025-05-16 00:27:06,111 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-16 00:27:06,172 - BERTopic - Cluster - Completed ✓
2025-05-16 00:27:06,173 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-16 00:27:06,500 - BERTopic - Representation - Completed ✓
2025-05-16 00:27:06,503 - BERTopic - Topic reduction - Reducing number of topics
2025-05-16 00:27:06,513 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-16 00:27:06,845 - BERTopic - Representation - Completed ✓
2025-05-16 00:27:06,848 - BERTopic - Topic reduction - Reduced number of topics from 8 to 6


Calculating coherence score...
Coherence Score: 0.6620

Analyzing tag: Consult, Kliniek: eerste consult
Number of documents: 139, TTR: 0.1682
Chosen parameters - n_topics: 3, min_topic_size: 5


Batches: 100%|██████████| 5/5 [00:06<00:00,  1.22s/it]



Analyzing with min_topic_size=5, n_topics=3, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 3 topics and min_topic_size=5...


2025-05-16 00:27:33,884 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-16 00:27:34,120 - BERTopic - Dimensionality - Completed ✓
2025-05-16 00:27:34,120 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-16 00:27:34,127 - BERTopic - Cluster - Completed ✓
2025-05-16 00:27:34,128 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-16 00:27:34,423 - BERTopic - Representation - Completed ✓
2025-05-16 00:27:34,425 - BERTopic - Topic reduction - Reducing number of topics
2025-05-16 00:27:34,430 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-16 00:27:34,760 - BERTopic - Representation - Completed ✓
2025-05-16 00:27:34,763 - BERTopic - Topic reduction - Reduced number of topics from 6 to 3


Calculating coherence score...
Coherence Score: 0.2985

Analyzing tag: Consult, Polikliniek: vervolgconsult
Number of documents: 1992, TTR: 0.0477
Chosen parameters - n_topics: 4, min_topic_size: 50


Batches: 100%|██████████| 63/63 [01:18<00:00,  1.24s/it]



Analyzing with min_topic_size=50, n_topics=4, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 4 topics and min_topic_size=50...


2025-05-16 00:29:19,734 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-16 00:29:28,581 - BERTopic - Dimensionality - Completed ✓
2025-05-16 00:29:28,582 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-16 00:29:28,670 - BERTopic - Cluster - Completed ✓
2025-05-16 00:29:28,672 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-16 00:29:29,697 - BERTopic - Representation - Completed ✓
2025-05-16 00:29:29,702 - BERTopic - Topic reduction - Reducing number of topics
2025-05-16 00:29:29,716 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-16 00:29:30,782 - BERTopic - Representation - Completed ✓
2025-05-16 00:29:30,818 - BERTopic - Topic reduction - Reduced number of topics from 8 to 4


Calculating coherence score...
Coherence Score: 0.5134

Analyzing tag: Consult, Polikliniek: eerste consult
Number of documents: 128, TTR: 0.2415
Chosen parameters - n_topics: 3, min_topic_size: 5


Batches: 100%|██████████| 4/4 [00:05<00:00,  1.33s/it]



Analyzing with min_topic_size=5, n_topics=3, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 3 topics and min_topic_size=5...


2025-05-16 00:30:06,457 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-16 00:30:06,684 - BERTopic - Dimensionality - Completed ✓
2025-05-16 00:30:06,684 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-16 00:30:06,691 - BERTopic - Cluster - Completed ✓
2025-05-16 00:30:06,692 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-16 00:30:06,867 - BERTopic - Representation - Completed ✓
2025-05-16 00:30:06,868 - BERTopic - Topic reduction - Reducing number of topics
2025-05-16 00:30:06,873 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-16 00:30:07,055 - BERTopic - Representation - Completed ✓
2025-05-16 00:30:07,057 - BERTopic - Topic reduction - Reduced number of topics from 7 to 3


Calculating coherence score...
Coherence Score: 0.6455

Analyzing tag: Consult, Overige aantekeningen
Number of documents: 674, TTR: 0.2209
Chosen parameters - n_topics: 5, min_topic_size: 15


Batches: 100%|██████████| 22/22 [00:12<00:00,  1.81it/s]



Analyzing with min_topic_size=15, n_topics=5, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 5 topics and min_topic_size=15...


2025-05-16 00:30:37,990 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-16 00:30:39,458 - BERTopic - Dimensionality - Completed ✓
2025-05-16 00:30:39,459 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-16 00:30:39,475 - BERTopic - Cluster - Completed ✓
2025-05-16 00:30:39,475 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-16 00:30:39,640 - BERTopic - Representation - Completed ✓
2025-05-16 00:30:39,642 - BERTopic - Topic reduction - Reducing number of topics
2025-05-16 00:30:39,648 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-16 00:30:39,782 - BERTopic - Representation - Completed ✓
2025-05-16 00:30:39,784 - BERTopic - Topic reduction - Reduced number of topics from 9 to 5


Calculating coherence score...
Coherence Score: 0.4411

Analyzing tag: Consult, Telefonisch consult
Number of documents: 452, TTR: 0.1054
Chosen parameters - n_topics: 4, min_topic_size: 15


Batches: 100%|██████████| 15/15 [00:18<00:00,  1.21s/it]



Analyzing with min_topic_size=15, n_topics=4, 5 words, ngram_range=(2, 3)
Creating BERTopic model with 4 topics and min_topic_size=15...


2025-05-16 00:31:15,954 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-16 00:31:17,132 - BERTopic - Dimensionality - Completed ✓
2025-05-16 00:31:17,133 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-16 00:31:17,156 - BERTopic - Cluster - Completed ✓
2025-05-16 00:31:17,159 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-16 00:31:17,401 - BERTopic - Representation - Completed ✓
2025-05-16 00:31:17,403 - BERTopic - Topic reduction - Reducing number of topics
2025-05-16 00:31:17,409 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-16 00:31:17,636 - BERTopic - Representation - Completed ✓
2025-05-16 00:31:17,639 - BERTopic - Topic reduction - Reduced number of topics from 8 to 4


Calculating coherence score...
Coherence Score: 0.6841

Analyzing tag: Radiologieverslag, ECG
Number of documents: 1, TTR: 1.0000
Skipping tag 'Radiologieverslag, ECG' due to insufficient data: 1 documents.


In [25]:
import joblib

joblib.dump(tag_results, "tag_results_2.joblib")


# tag_results_1 = joblib.load("tag_results_1.joblib")
# tag_results_1


['tag_results_2.joblib']

In [None]:
# tag_results = tag_results_1

NameError: name 'tag_results_1' is not defined

In [27]:
df_test = tag_results['Klinische Brief']['data_df']

# size_key = None

# col = df_test.columns[8]
# if col.startswith('topic_'):
#     print("Topic column found:", col)

for col in df_test.columns:
    if col.startswith('topic_'):
        print("Topic column found:", col)
        # size_key = set(col.split('_', 1)[1] for col in df_test.columns if col.startswith('topic_'))
        # print("Size keys:", size_key)
        df_test['Topic'] = df_test[col]
        df_test.drop(columns=[col], inplace=True)
    elif col.startswith('tags_'):
        print("Tags column found:", col)
        df_test['Tags'] = df_test[col]
        df_test.drop(columns=[col], inplace=True)
    elif col.startswith('keywords_'):
        print("Keywords column found:", col)
        df_test['Keywords'] = df_test[col]
        df_test.drop(columns=[col], inplace=True)
    else:
        print("No topic column found.")

# df_test['Topic_sizes'] = df_test['Topic']



# for i in range(len(df_test)):
#     df_test['Topic_sizes'].loc[i] = size_key

df_test


# id_vars = [col for col in df_test.columns if not col.startswith(('topic_', 'tags_', 'keywords_'))]
# id_vars


No topic column found.
No topic column found.
No topic column found.
No topic column found.
No topic column found.
No topic column found.
No topic column found.
No topic column found.
Topic column found: topic_3_5_15
Tags column found: tags_3_5_15
Keywords column found: keywords_3_5_15


Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text,Topic,Tags,Keywords
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",2020-11-26 15:06:00,2020-11-26 15:06:00,dhr aj dingemans huisarts streetnaam city datu...,"[aj, dingemans, streetnaam, Kenmerk, patientid...",aj dingemans streetnaam Kenmerk patientid betr...,1,[Topic 1],"[oraal stuk, tablet oraal, tablet oraal stuk, ..."
64,088C9FD98B8B2CBCB597C17C07AC1845B21F0849,Klinische Brief,"Mw. C.M. Staal, huisarts\r\n[STREETNAME] RE [...",2023-01-29 12:08:00,2023-01-29 12:08:00,mw cm staal huisarts streetname re city datum ...,"[cm, staal, re, kenmerk, patientid, betreffen,...",cm staal re kenmerk patientid betreffen heer i...,-1,[Topic -1],[]
71,088C9FD98B8B2CBCB597C17C07AC1845B21F0849,Klinische Brief,"Mw. C.M. Staal, huisarts\r\n[STREETNAME] RE [...",2023-01-20 11:53:00,2023-01-20 11:53:00,mw cm staal huisarts streetname re city datum ...,"[cm, staal, re, kenmerk, patientid, betreffen,...",cm staal re kenmerk patientid betreffen heer i...,-1,[Topic -1],[]
134,088C9FD98B8B2CBCB597C17C07AC1845B21F0849,Klinische Brief,"Mw. C.M. Staal, huisarts\r\nFransebaan 586\r\n...",2017-11-13 18:22:00,2017-11-13 18:22:00,mw cm staal huisarts fransebaan re city datum ...,"[cm, staal, fransebaan, re, kenmerk, patientid...",cm staal fransebaan re kenmerk patientid betre...,1,[Topic 1],"[oraal stuk, tablet oraal, tablet oraal stuk, ..."
236,0A5645E02FA818D1629926B6BEFA81CF91C25A46,Klinische Brief,Aan de weledelgeleerde heer\r\ndrs. L.J.A.L. H...,2021-01-21 18:32:00,2021-01-21 18:32:00,weledelgeleer heer drs ljal hendrikx huisarts ...,"[weledelgeleer, heer, drs, ljal, hendrikx, gro...",weledelgeleer heer drs ljal hendrikx groenstra...,0,[Topic 0],"[oraal stuk, tablet oraal, tablet oraal stuk, ..."
...,...,...,...,...,...,...,...,...,...,...,...
9236,F8D116F01EE0039678998F393FA337C10AD4F4E2,Klinische Brief,Aan de weledelgeleerde vrouwe\r\ndrs. H.N.J. G...,2020-09-30 12:24:00,2020-09-30 12:24:00,weledelgeleer vrouwe drs hnj gie huisarts stre...,"[weledelgeleer, vrouwe, drs, hnj, gie, streetn...",weledelgeleer vrouwe drs hnj gie streetnaam pb...,-1,[Topic -1],[]
9239,F8D116F01EE0039678998F393FA337C10AD4F4E2,Klinische Brief,Aan de weledelgeleerde vrouwe\r\ndrs. H.N.J. G...,2020-09-29 15:07:00,2020-09-29 15:07:00,weledelgeleer vrouwe drs hnj gie huisarts stre...,"[weledelgeleer, vrouwe, drs, hnj, gie, streetn...",weledelgeleer vrouwe drs hnj gie streetnaam pb...,1,[Topic 1],"[oraal stuk, tablet oraal, tablet oraal stuk, ..."
9245,F8D116F01EE0039678998F393FA337C10AD4F4E2,Klinische Brief,Aan de weledelgeleerde vrouwe\r\ndrs. H.N.J. G...,2020-09-24 18:47:00,2020-09-24 18:47:00,weledelgeleer vrouwe drs hnj gie huisarts stre...,"[weledelgeleer, vrouwe, drs, hnj, gie, streetn...",weledelgeleer vrouwe drs hnj gie streetnaam pb...,-1,[Topic -1],[]
9246,F8D116F01EE0039678998F393FA337C10AD4F4E2,Klinische Brief,Aan de weledelgeleerde vrouwe\r\ndrs. H.N.J. G...,2020-09-24 17:16:00,2020-09-24 17:16:00,weledelgeleer vrouwe drs hnj gie huisarts stre...,"[weledelgeleer, vrouwe, drs, hnj, gie, streetn...",weledelgeleer vrouwe drs hnj gie streetnaam pb...,1,[Topic 1],"[oraal stuk, tablet oraal, tablet oraal stuk, ..."


In [28]:
for tag in tag_results.keys():
    print(f"Tag: {tag}")

Tag: Klinische Brief
Tag: Consult, Kliniek: vervolgconsult
Tag: Poliklinische Brief
Tag: Spoedeisende Hulp
Tag: Consult, SEH
Tag: Consult
Tag: Consult, Kliniek: eerste consult
Tag: Consult, Polikliniek: vervolgconsult
Tag: Consult, Polikliniek: eerste consult
Tag: Consult, Overige aantekeningen
Tag: Consult, Telefonisch consult


In [29]:
import pandas as pd

all_dfs = []

for tag in tag_results.keys():
    df_test = tag_results[tag]['data_df'].copy()
    
    for col in df_test.columns:
        if col.startswith('topic_'):
            # print("Topic column found:", col)
            df_test['Topic'] = df_test[col]
            df_test.drop(columns=[col], inplace=True)
        elif col.startswith('tags_'):
            # print("Tags column found:", col)
            df_test['Tags'] = df_test[col]
            df_test.drop(columns=[col], inplace=True)
        elif col.startswith('keywords_'):
            # print("Keywords column found:", col)
            df_test['Keywords'] = df_test[col]
            df_test.drop(columns=[col], inplace=True)
        else:
            # print("No topic column found.")
            continue

    # df['tag'] = tag  # Add tag info for traceability
    all_dfs.append(df_test)


combined_df = pd.concat(all_dfs, ignore_index=True)
# combined_df.to_csv("combined_tag_results.csv", index=False)
combined_df


Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text,Topic,Tags,Keywords
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",2020-11-26 15:06:00,2020-11-26 15:06:00,dhr aj dingemans huisarts streetnaam city datu...,"[aj, dingemans, streetnaam, Kenmerk, patientid...",aj dingemans streetnaam Kenmerk patientid betr...,1,[Topic 1],"[oraal stuk, tablet oraal, tablet oraal stuk, ..."
1,088C9FD98B8B2CBCB597C17C07AC1845B21F0849,Klinische Brief,"Mw. C.M. Staal, huisarts\r\n[STREETNAME] RE [...",2023-01-29 12:08:00,2023-01-29 12:08:00,mw cm staal huisarts streetname re city datum ...,"[cm, staal, re, kenmerk, patientid, betreffen,...",cm staal re kenmerk patientid betreffen heer i...,-1,[Topic -1],[]
2,088C9FD98B8B2CBCB597C17C07AC1845B21F0849,Klinische Brief,"Mw. C.M. Staal, huisarts\r\n[STREETNAME] RE [...",2023-01-20 11:53:00,2023-01-20 11:53:00,mw cm staal huisarts streetname re city datum ...,"[cm, staal, re, kenmerk, patientid, betreffen,...",cm staal re kenmerk patientid betreffen heer i...,-1,[Topic -1],[]
3,088C9FD98B8B2CBCB597C17C07AC1845B21F0849,Klinische Brief,"Mw. C.M. Staal, huisarts\r\nFransebaan 586\r\n...",2017-11-13 18:22:00,2017-11-13 18:22:00,mw cm staal huisarts fransebaan re city datum ...,"[cm, staal, fransebaan, re, kenmerk, patientid...",cm staal fransebaan re kenmerk patientid betre...,1,[Topic 1],"[oraal stuk, tablet oraal, tablet oraal stuk, ..."
4,0A5645E02FA818D1629926B6BEFA81CF91C25A46,Klinische Brief,Aan de weledelgeleerde heer\r\ndrs. L.J.A.L. H...,2021-01-21 18:32:00,2021-01-21 18:32:00,weledelgeleer heer drs ljal hendrikx huisarts ...,"[weledelgeleer, heer, drs, ljal, hendrikx, gro...",weledelgeleer heer drs ljal hendrikx groenstra...,0,[Topic 0],"[oraal stuk, tablet oraal, tablet oraal stuk, ..."
...,...,...,...,...,...,...,...,...,...,...,...
9407,F8C241E69061BB450778B7A4CC336FF520469A0A,"Consult, Telefonisch consult",Samenvatting: \nVoorgeschiedenis\n1994 cervixc...,2018-12-21 20:13:00,2018-12-21 20:13:00,samenvatting voorgeschiedenis cervixcarcinoom ...,"[cervixcarcinoom, hypertensie, hypothyreoïdie,...",cervixcarcinoom hypertensie hypothyreoïdie tot...,2,[Topic 2],"[totaal heup, polycythaemia vera, toename anem..."
9408,F8D116F01EE0039678998F393FA337C10AD4F4E2,"Consult, Telefonisch consult",Samenvatting: \n2016 geen osteoporose\n2016 hy...,2020-06-26 15:00:00,2020-06-26 15:00:00,samenvatting osteoporose hypocalciurie april d...,"[osteoporose, hypocalciurie, dementie, Jun, pr...",osteoporose hypocalciurie dementie Jun prol Re...,0,[Topic 0],"[overig actie, aanvullen onderzoek, dun darm, ..."
9409,F8D116F01EE0039678998F393FA337C10AD4F4E2,"Consult, Telefonisch consult",Samenvatting: \n2016 geen osteoporose\n2016 hy...,2020-04-16 11:57:00,2020-04-16 11:57:00,samenvatting osteoporose hypocalciurie april d...,"[osteoporose, hypocalciurie, dementie, Jun, pr...",osteoporose hypocalciurie dementie Jun prol Re...,0,[Topic 0],"[overig actie, aanvullen onderzoek, dun darm, ..."
9410,F8D116F01EE0039678998F393FA337C10AD4F4E2,"Consult, Telefonisch consult",Samenvatting: \n2016 geen osteoporose\n2016 hy...,2020-03-27 12:14:00,2020-03-27 12:14:00,samenvatting osteoporose hypocalciurie april d...,"[osteoporose, hypocalciurie, dementie, Jun, pr...",osteoporose hypocalciurie dementie Jun prol Re...,0,[Topic 0],"[overig actie, aanvullen onderzoek, dun darm, ..."


In [30]:
print(type(combined_df['Keywords'][1]))

for i in range(len(combined_df)):
    if isinstance(combined_df['Keywords'].loc[i], list):
        combined_df['Keywords'].loc[i] = str(combined_df['Keywords'].loc[i])
    else:
        combined_df['Keywords'].loc[i] = combined_df['Keywords'].loc[i]

combined_df['Keywords'].unique()

<class 'list'>


array(["['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'mcv fl', 'vriendelijk groet']",
       '[]',
       "['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'mcv fl', 'document vervangen document']",
       "['aanvullen onderzoek', 'rectaal bloedverlie', 'lichamelijk onderzoek', 'beloop vpk', 'hd stabiel']",
       "['int knoppen', 'decursus type decursus', 'decursus type', 'knoppen int', 'type decursus']",
       "['decursus podo decursus', 'decursus podo', 'podo decursus', 'uitvoeren controle', 'controle decursus podo']",
       "['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'streetnaam zip', 'geacht collega']",
       "['gastroscopie betreffen', 'collega gastroscopie uitvoeren', 'collega gastroscopie', 'gastroscopie uitvoeren', 'geacht collega gastroscopie']",
       "['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'spoedeisen hulp', 'rectaal bloedverlie']",
       "['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'zwart ontlasting', 'angiodysplasie jejunum']",

In [31]:
from collections import Counter

vocab_stats = []

for tag in data['verslagen_report_tags'].unique():
    tag_tokens = data[data['verslagen_report_tags'] == tag]['tokens'].sum()
    total_tokens = len(tag_tokens)
    unique_tokens = len(set(tag_tokens))
    ttr = unique_tokens / total_tokens if total_tokens else 0
    vocab_stats.append({
        'tag': tag,
        'doc_count': len(data[data['verslagen_report_tags'] == tag]),
        'total_tokens': total_tokens,
        'unique_tokens': unique_tokens,
        'ttr': round(ttr, 3)
    })

import pandas as pd
vocab_df = pd.DataFrame(vocab_stats).sort_values(by='doc_count', ascending=False)
print(vocab_df)


                                     tag  doc_count  total_tokens  \
1       Consult, Kliniek: vervolgconsult       2248        265311   
7   Consult, Polikliniek: vervolgconsult       1992        250257   
5                                Consult       1553         54825   
4                           Consult, SEH       1177        209532   
9         Consult, Overige aantekeningen        674         20433   
2                    Poliklinische Brief        481         80430   
10          Consult, Telefonisch consult        452         46465   
0                        Klinische Brief        318        128438   
3                      Spoedeisende Hulp        250         90433   
6       Consult, Kliniek: eerste consult        139         36881   
8   Consult, Polikliniek: eerste consult        128         23738   
18                 Consult, Samenvatting         16          1098   
12             Consult, Thuisbehandeling         14           753   
14                              Al

In [39]:
import pandas as pd

# Define the data
new_labels = [
    [0, "['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'mcv fl', 'vriendelijk groet']", "Oral Medication"],
    [1, "[]", "Outlier"],
    [2, "['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'mcv fl', 'document vervangen document']", "Oral Meds & Documentation"],
    [3, "['aanvullen onderzoek', 'rectaal bloedverlie', 'lichamelijk onderzoek', 'beloop vpk', 'hd stabiel']", "GI Bleed Assessment"],
    [4, "['int knoppen', 'decursus type decursus', 'decursus type', 'knoppen int', 'type decursus']", "Note Types"],
    [5, "['decursus podo decursus', 'decursus podo', 'podo decursus', 'uitvoeren controle', 'controle decursus podo']", "Podiatry Follow-up"],
    [6, "['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'streetnaam zip', 'geacht collega']", "Oral Meds & Correspondence"],
    [7, "['gastroscopie betreffen', 'collega gastroscopie uitvoeren', 'collega gastroscopie', 'gastroscopie uitvoeren', 'geacht collega gastroscopie']", "Gastroscopy Request"],
    [8, "['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'spoedeisen hulp', 'rectaal bloedverlie']", "GI Emergency"],
    [9, "['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'zwart ontlasting', 'angiodysplasie jejunum']", "GI Bleed (Jejunum)"],
    [10, "['reden komst', 'oraal stuk', 'komst verwijzing', 'aanvullen onderzoek', 'lichamelijk onderzoek']", "Intake & Initial Assessment"],
    [11, "['sputovamo leeftijd', 'medisch dossier', 'triage nee', 'naam functie', 'toediening medicatie']", "Patient Intake"],
    [12, "['beloop verpleegkundig', 'soort infuus medicatie', 'infuus medicatie', 'vocht toediening soort', 'vocht toediening']", "Infusion & Fluid Management"],
    [13, "['indicatie aanvraag', 'asa klasse', 'antibiotisch profylaxe', 'antibiotisch profylaxe nvt', 'profylaxe nvt']", "Surgical Prophylaxis"],
    [14, "['lab diana', 'tevoren lab', 'diana lab', 'lab int', 'lab oz']", "Lab Requests"],
    [15, "['aantal controle', 'controle controle', 'mdlarts aantal', 'controle aantal', 'controle aantal controle']", "Follow-up Planning"],
    [16, "['diabete mellitus', 'communicatie specialistisch', 'specialistisch verpleegkundigen communicatie', 'communicatie specialistisch verpleegkundigen', 'verpleegkundigen communicatie']", "Diabetes & Specialist Communication"],
    [17, "['diabete lunch', 'diabete lunch slapen', 'lunch slapen', 'basisdosering diabete lunch', 'basisdosering diabete']", "Diabetes Lifestyle Planning"],
    [18, "['oraal stuk', 'reden komst', 'int knoppen', 'arts assistent', 'rectaal bloedverlie']", "Initial Triage"],
    [19, "['anemie wd', 'actief bloeding', 'streef hb', 'anemie hb', 'dun darm']", "Anemia & GI Bleed"],
    [20, "['overig actie', 'knoppen consult', 'lichamelijk onderzoek', 'consult knoppen', 'knoppen consult knoppen']", "Consultation Workflow"],
    [21, "['diabete mellitus', 'oraal stuk', 'diabete vpk', 'mellitus type', 'diabete mellitus type']", "Diabetes Medication"],
    [22, "['intern geneeskun', 'opname intern geneeskun', 'opname intern', 'secundair hyperparathyreoïdie', 'status niertransplantatie']", "Internal Medicine Admission"],
    [23, "['oraal stuk', 'lichamelijk onderzoek', 'overig actie', 'reden komst', 'gewicht kg']", "Admission Assessment"],
    [24, "['intake type', 'intake type consult', 'consult osteoporose', 'consult osteoporose intake', 'osteoporose intake']", "Osteoporosis Intake"],
    [25, "['intern geneeskun', 'opname intern', 'ferriprief anemie', 'opname intern geneeskun', 'chronisch nierinsufficiëntie']", "Iron Deficiency & CKD"],
    [26, "['krijgen remicade', 'gb krijgen', 'gb krijgen remicade', 'remicade iv', 'krijgen remicade iv']", "Remicade Administration"],
    [27, "['lab labbon', 'lab formulier', 'blank osnp', 'ca nf', 'lab formulier opgesturen']", "Lab Documentation"],
    [28, "['allergisch reaktie', 'uitslag videocapsule', 'thuis prikken', 'laten prikken', 'videocapsule via']", "Allergy & Capsule Diagnostics"],
    [29, "['overig actie', 'aanvullen onderzoek', 'dun darm', 'hemoglobine mmoll', 'ferriprief anemie']", "Iron Deficiency Workup"],
    [30, "['progressie cll', 'graad ii', 'hormonaal therapie', 'lobulair carcinoom', 'ductaal lobulair carcinoom']", "Cancer Progression (CLL/Breast)"],
    [31, "['totaal heup', 'polycythaemia vera', 'toename anemie', 'myelofibrose duidelijk progressie', 'progressie splenomegalie']", "Hematologic Disorders"]
]

# Create the DataFrame
df_new_labels = pd.DataFrame(new_labels, columns=["Index", "Topics", "Label"])

# Display the DataFrame
df_new_labels

Unnamed: 0,Index,Topics,Label
0,0,"['oraal stuk', 'tablet oraal', 'tablet oraal s...",Oral Medication
1,1,[],Outlier
2,2,"['oraal stuk', 'tablet oraal', 'tablet oraal s...",Oral Meds & Documentation
3,3,"['aanvullen onderzoek', 'rectaal bloedverlie',...",GI Bleed Assessment
4,4,"['int knoppen', 'decursus type decursus', 'dec...",Note Types
5,5,"['decursus podo decursus', 'decursus podo', 'p...",Podiatry Follow-up
6,6,"['oraal stuk', 'tablet oraal', 'tablet oraal s...",Oral Meds & Correspondence
7,7,"['gastroscopie betreffen', 'collega gastroscop...",Gastroscopy Request
8,8,"['oraal stuk', 'tablet oraal', 'tablet oraal s...",GI Emergency
9,9,"['oraal stuk', 'tablet oraal', 'tablet oraal s...",GI Bleed (Jejunum)


In [40]:
combined_df['new_topic_label'] = combined_df['Keywords']

for i in range(len(combined_df)):
    for j in range(len(df_new_labels)):
        if df_new_labels['Topics'].loc[j] == combined_df['Keywords'].loc[i]:
            print("Found match:", df_new_labels['Topics'].loc[j], "==", combined_df['Keywords'].loc[i])
            combined_df['new_topic_label'].loc[i] = df_new_labels['Label'].loc[j]
            break

Found match: ['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'mcv fl', 'vriendelijk groet'] == ['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'mcv fl', 'vriendelijk groet']
Found match: [] == []
Found match: [] == []
Found match: ['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'mcv fl', 'vriendelijk groet'] == ['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'mcv fl', 'vriendelijk groet']
Found match: ['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'mcv fl', 'document vervangen document'] == ['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'mcv fl', 'document vervangen document']
Found match: ['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'mcv fl', 'vriendelijk groet'] == ['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'mcv fl', 'vriendelijk groet']
Found match: ['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'mcv fl', 'vriendelijk groet'] == ['oraal stuk', 'tablet oraal', 'tablet oraal stuk', 'mcv fl', 'vriendelijk groet']
Found match: [] == []
Foun

In [43]:
combined_df['verslagen_report_tags_new_topic_label'] = combined_df['verslagen_report_tags'] + " | " + combined_df['new_topic_label']
combined_df

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,text,Topic,Tags,Keywords,new_topic_label,verslagen_report_tags_new_topic_label
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",2020-11-26 15:06:00,2020-11-26 15:06:00,dhr aj dingemans huisarts streetnaam city datu...,"[aj, dingemans, streetnaam, Kenmerk, patientid...",aj dingemans streetnaam Kenmerk patientid betr...,1,[Topic 1],"['oraal stuk', 'tablet oraal', 'tablet oraal s...",Oral Medication,Klinische Brief | Oral Medication
1,088C9FD98B8B2CBCB597C17C07AC1845B21F0849,Klinische Brief,"Mw. C.M. Staal, huisarts\r\n[STREETNAME] RE [...",2023-01-29 12:08:00,2023-01-29 12:08:00,mw cm staal huisarts streetname re city datum ...,"[cm, staal, re, kenmerk, patientid, betreffen,...",cm staal re kenmerk patientid betreffen heer i...,-1,[Topic -1],[],Outlier,Klinische Brief | Outlier
2,088C9FD98B8B2CBCB597C17C07AC1845B21F0849,Klinische Brief,"Mw. C.M. Staal, huisarts\r\n[STREETNAME] RE [...",2023-01-20 11:53:00,2023-01-20 11:53:00,mw cm staal huisarts streetname re city datum ...,"[cm, staal, re, kenmerk, patientid, betreffen,...",cm staal re kenmerk patientid betreffen heer i...,-1,[Topic -1],[],Outlier,Klinische Brief | Outlier
3,088C9FD98B8B2CBCB597C17C07AC1845B21F0849,Klinische Brief,"Mw. C.M. Staal, huisarts\r\nFransebaan 586\r\n...",2017-11-13 18:22:00,2017-11-13 18:22:00,mw cm staal huisarts fransebaan re city datum ...,"[cm, staal, fransebaan, re, kenmerk, patientid...",cm staal fransebaan re kenmerk patientid betre...,1,[Topic 1],"['oraal stuk', 'tablet oraal', 'tablet oraal s...",Oral Medication,Klinische Brief | Oral Medication
4,0A5645E02FA818D1629926B6BEFA81CF91C25A46,Klinische Brief,Aan de weledelgeleerde heer\r\ndrs. L.J.A.L. H...,2021-01-21 18:32:00,2021-01-21 18:32:00,weledelgeleer heer drs ljal hendrikx huisarts ...,"[weledelgeleer, heer, drs, ljal, hendrikx, gro...",weledelgeleer heer drs ljal hendrikx groenstra...,0,[Topic 0],"['oraal stuk', 'tablet oraal', 'tablet oraal s...",Oral Meds & Documentation,Klinische Brief | Oral Meds & Documentation
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9407,F8C241E69061BB450778B7A4CC336FF520469A0A,"Consult, Telefonisch consult",Samenvatting: \nVoorgeschiedenis\n1994 cervixc...,2018-12-21 20:13:00,2018-12-21 20:13:00,samenvatting voorgeschiedenis cervixcarcinoom ...,"[cervixcarcinoom, hypertensie, hypothyreoïdie,...",cervixcarcinoom hypertensie hypothyreoïdie tot...,2,[Topic 2],"['totaal heup', 'polycythaemia vera', 'toename...",Hematologic Disorders,"Consult, Telefonisch consult | Hematologic Dis..."
9408,F8D116F01EE0039678998F393FA337C10AD4F4E2,"Consult, Telefonisch consult",Samenvatting: \n2016 geen osteoporose\n2016 hy...,2020-06-26 15:00:00,2020-06-26 15:00:00,samenvatting osteoporose hypocalciurie april d...,"[osteoporose, hypocalciurie, dementie, Jun, pr...",osteoporose hypocalciurie dementie Jun prol Re...,0,[Topic 0],"['overig actie', 'aanvullen onderzoek', 'dun d...",Iron Deficiency Workup,"Consult, Telefonisch consult | Iron Deficiency..."
9409,F8D116F01EE0039678998F393FA337C10AD4F4E2,"Consult, Telefonisch consult",Samenvatting: \n2016 geen osteoporose\n2016 hy...,2020-04-16 11:57:00,2020-04-16 11:57:00,samenvatting osteoporose hypocalciurie april d...,"[osteoporose, hypocalciurie, dementie, Jun, pr...",osteoporose hypocalciurie dementie Jun prol Re...,0,[Topic 0],"['overig actie', 'aanvullen onderzoek', 'dun d...",Iron Deficiency Workup,"Consult, Telefonisch consult | Iron Deficiency..."
9410,F8D116F01EE0039678998F393FA337C10AD4F4E2,"Consult, Telefonisch consult",Samenvatting: \n2016 geen osteoporose\n2016 hy...,2020-03-27 12:14:00,2020-03-27 12:14:00,samenvatting osteoporose hypocalciurie april d...,"[osteoporose, hypocalciurie, dementie, Jun, pr...",osteoporose hypocalciurie dementie Jun prol Re...,0,[Topic 0],"['overig actie', 'aanvullen onderzoek', 'dun d...",Iron Deficiency Workup,"Consult, Telefonisch consult | Iron Deficiency..."
