In [1]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from nltk.corpus import stopwords
dutch_stopwords = stopwords.words('dutch')
from sklearn.feature_extraction.text import CountVectorizer
import plotly.graph_objects as go
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import ast

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# embedding_model1 = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# embedding_model2 = SentenceTransformer("distiluse-base-multilingual-cased-v2")
# # embedding_model3 = SentenceTransformer("KPN/bert-base-dutch-cased")
# # embedding_model4 = SentenceTransformer("GroNLP/bert-base-dutch-cased")

# def prep_embeddings(docs):
#     """
#     Prepares the embeddings for the documents using the specified embedding model.
#     """
#     embeddings1 = embedding_model1.encode(docs, show_progress_bar=True)
#     embeddings2 = embedding_model2.encode(docs, show_progress_bar=True)

#     return [embeddings1, embeddings2]

In [3]:
def get_embedding(n_models, embeddings):
    """
    Get the embedding model.
    """
    if n_models == 1:
        embedding_model = embedding_model1
        embeddings = embeddings
    elif n_models == 2:
        embedding_model = embedding_model2
        embeddings = embeddings
    # elif n_models == 3:
    #     embedding_model = embedding_model3
    # elif n_models == 4:
    #     embedding_model = embedding_model4
    else:
        raise ValueError("Invalid number of models.")
    return embedding_model, embeddings

In [4]:
def create_bertopic_model(n_topics, min_topic_size=10):
    # embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    # embedding_model, embeddings = get_embedding(n_models)
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=min_topic_size, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    vectorizer_model = CountVectorizer(stop_words=dutch_stopwords)
    # vectorizer_model = CountVectorizer(stop_words='english')
    
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        nr_topics=n_topics,
        verbose=True
    )
    
    return topic_model

In [5]:
def calculate_coherence_score(texts, topic_words):
    # Convert each document to a list if it's a string representation of a list
    texts = [ast.literal_eval(doc) if isinstance(doc, str) else doc for doc in texts]
    
    dictionary = Dictionary(texts)

    coherence_model = CoherenceModel(
        topics=topic_words,  
        texts=texts,
        dictionary=dictionary,
        coherence='c_v'
    )

    return coherence_model.get_coherence()

In [6]:
# def analyze_topics_with_sentiment(texts, embeddings, n_topics_list=[5, 10, 15], n_words_list=[5, 10, 15]):
def analyze_topics_with_sentiment(texts, embeddings, n_topics_list=[5], n_words_list=[5]):
    results = {}

    # n_models_list = [1, 2]
    
    # to create figure for coherence scores across different configurations
    coherence_fig = go.Figure()
    
    for n_topics in n_topics_list:
        coherence_scores = []
        for n_words in n_words_list:
            # for n_models in n_models_list:
                # print(f"\nAnalyzing with {n_topics} topics and {n_words} words per topic and embedding model {n_models}...")
                print(f"\nAnalyzing with {n_topics} topics and {n_words} words per topic...")
                
                # if n_models == 1:
                #     embeddings = embedding1
                # elif n_models == 2:
                #     embeddings = embeddings2

                topic_model = create_bertopic_model(n_topics)
                
                topics, probs = topic_model.fit_transform(texts, embeddings)
                
                topic_info = topic_model.get_topic_info()
                
                # Get topics with specified number of words
                topic_words = {}
                for topic in range(-1, len(set(topics))-1):
                    words = topic_model.get_topic(topic)[:n_words]
                    topic_words[topic] = [word for word, _ in words]
            

                for topic in range(-1, len(set(topics)) - 1):
                    print(f"Topic {topic}: {topic_model.get_topic(topic)}")

                # Get topics with specified number of words (words only, no probabilities)
                topic_word_list = []
                for topic in range(len(set(topics)) - 1):
                    topic_words = [word for word, _ in topic_model.get_topic(topic)[:n_words]]
                    topic_word_list.append(topic_words)


                # Coherence score
                coherence = calculate_coherence_score(texts, topic_word_list)
                coherence_scores.append(coherence)
                
                # # Topic-sentiment relationship to see how much the topics relate to a positive stock result
                # topic_sentiment = {}
                # for topic_num in range(-1, len(set(topics))-1):
                #     topic_docs = [i for i, t in enumerate(topics) if t == topic_num]
                #     if topic_docs:
                #         topic_targets = [targets[i] for i in topic_docs]
                #         positive_ratio = sum(topic_targets) / len(topic_targets)
                #         topic_sentiment[topic_num] = positive_ratio
                
                results[(n_topics, n_words)] = {
                    'model': topic_model,
                    'topics': topics,
                    'topic_info': topic_info,
                    'topic_words': topic_words,
                    # 'topic_sentiment': topic_sentiment,
                    'coherence': coherence
                }
                
                # to save the visualizations
                fig_topics = topic_model.visualize_topics()
                fig_topics.write_html(f'topic_visualization_{n_topics}_{n_words}.html')
                
                fig_heatmap = topic_model.visualize_heatmap()
                fig_heatmap.write_html(f'topic_heatmap_{n_topics}_{n_words}.html')
                
                # # Topic-sentiment visualization
                # sentiment_data = pd.DataFrame.from_dict(topic_sentiment, orient='index', columns=['positive_ratio'])
                # fig_sentiment = go.Figure(data=[
                #     go.Bar(x=sentiment_data.index, y=sentiment_data['positive_ratio'])
                # ])
                # fig_sentiment.update_layout(
                #     title=f'Topic-Sentiment Relationship ({n_topics} topics)',
                #     xaxis_title='Topic Number',
                #     yaxis_title='Ratio of Positive Stock Movement',
                #     yaxis_range=[0, 1]
                # )
                # fig_sentiment.write_html(f'topic_sentiment_{n_topics}_{n_words}.html')
            
        # Add coherence scores to the plot
        coherence_fig.add_trace(go.Scatter(
            x=n_words_list,
            y=coherence_scores,
            mode='lines+markers',
            name=f'{n_topics} topics'
        ))
    
    coherence_fig.update_layout(
        title='Coherence Scores across Different Configurations',
        xaxis_title='Number of Words per Topic',
        yaxis_title='Coherence Score (C_v)',
        showlegend=True
    )
    coherence_fig.write_html('coherence_scores.html')
    
    return results

In [7]:
# def print_results_summary(results):
#     for (n_topics, n_words), result in results.items():
#         print(f"\n=== Results for {n_topics} topics with {n_words} words ===")
#         print(f"Coherence Score: {result['coherence']:.4f}")
        
#         # Print topics and their words
#         print("\nTopics and their key words:")
#         print("results:", result)

#         topic_info = result['topic_info']
#         for index, row in topic_info.iterrows():
#             topic_num = row['Topic']
#             if topic_num != -1:  # skip outlier topic
#                 words = row['Representation']
#                 print(f"Topic {topic_num}: {words}")


        
#         # If `result['topic_words']` is a list, we iterate directly over it
#         for topic_num, words in enumerate(result['topic_words']):
#             print('topic_num:', topic_num)
#             print('words:', words)
#             if topic_num != -1:  
#                 print(f"Topic {topic_num}: {words}")
        
       
#         # Print topic sizes
#         topic_sizes = result['topic_info']['Count'].tolist()
#         print("\nTopic sizes:", topic_sizes)
        
#         print("\n" + "="*50)

def print_results_summary(results, texts):
    for (n_topics, n_words), result in results.items():
        print(f"\n=== Results for {n_topics} topics with {n_words} words ===")
        print(f"Coherence Score: {result['coherence']:.4f}")
        
        print("\nTopics and their key words:")
        topic_info = result['topic_info']
        for _, row in topic_info.iterrows():
            topic_num = row['Topic']
            if topic_num != -1:
                words = row['Representation']
                print(f"Topic {topic_num}: {', '.join(words)}")

        topic_sizes = topic_info[topic_info['Topic'] != -1]['Count'].tolist()
        print("\nTopic sizes:", topic_sizes)
        print("\n" + "="*50)

    
        # Print per-document topic assignment and top words
        model = result['model']
        topics = result['topics']
        docs = texts

        print("\nTop topic keywords per document:")
        for doc, topic in zip(docs, topics):
            if topic != -1:
                topic_words = model.get_topic(topic)
                top_words = ", ".join([word for word, _ in topic_words[:5]])  # Top 5 words
                print(f"Document: {doc[:100]}...")  # preview first 100 chars
                print(f"Assigned Topic: {topic}")
                print(f"Topic Keywords: {top_words}")
                print("-" * 50)



In [8]:
def main():
    data = pd.read_csv('a:/df_cleaned.csv')


    remove_list = ['mg', 'x', 'per', 'dag', 'samenvatting', 'beleid', 'conclusie', 'mmolL', 'waarvoor', 'goed', 'wel', 'beloop', 
            'voorgeschiedenis', 'opdrachten', 'gehad', 'aanvullend', 'bekende', 'voltooid', 'mogelijk', 'gezien', 'city', 'bsn', 
            'nodig', 'firstname', 'streetname', 'lastname', 'postcode', 'anamnese',
            'dd', 'stuk', 'ivm', 'rechts', 'links', 'dr', 'sinds', 'huisarts', 'datum', 'dagen', 'min', 'extra', 'weken', 'algemeen', 
            'patiënte', 'overige','linker', 'week', 'accepteren', 'maanden', 'waarschijnlijk', 'reden', 'uur', 'verdenking', 'ontslag', 
            'stop', 'tijd', 'patiënt', 'onderzoek']

    data['tokens'] = data['tokens']#.apply(lambda tokens: ''.join(tokens))
    print("tokens:", data['tokens'])
    
        
    # Convert the tokens column (which is a list of words) into a single string for each document
    data['text'] = data['tokens'].apply(lambda tokens: [word for word in tokens if word not in remove_list])
    print("text:", data['text'])
    
#     data['tokens'] = data['tokens'].apply(ast.literal_eval)
    
    texts = data['text'].tolist() 
    print("texts:", texts)

    # embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    # # embedding_model2 = SentenceTransformer("distiluse-base-multilingual-cased-v2")
    # embeddings = embedding_model.encode(texts, show_progress_bar=True)
    # # embeddings2 = embedding_model2.encode(docs, show_progress_bar=True)

    import numpy as np
    # # Save the embeddings to a file
    # np.save('embeddings_model1.npy', embeddings)
    # # Load the embeddings from the file
    embeddings = np.load('embeddings_model1.npy')
    print("embeddings:", embeddings)    

    # targets = merged_df['target'].tolist()


    # for i in range(len(data['tokens'])):
    #     data['tokens'][i] = f"{data['tokens'][i]}"

    # Analyze with different numbers of topics and words
    results = analyze_topics_with_sentiment(texts, embeddings)
    
    # Print detailed results summary
    print_results_summary(results, texts)

In [None]:
if __name__ == "__main__":
    main()

tokens: 0       ['dhr', 'aj', 'dingemans', 'huisarts', 'street...
1       ['samenvatting', 'rectaal', 'bloedverlie', 'ob...
2       ['coloscopie', 'betreffen', 'mw', 'initials', ...
3       ['gastroscopie', 'betreffen', 'mw', 'initials'...
4       ['samenvatting', 'rectaal', 'bloedverlie', 'ee...
                              ...                        
9572    ['samenvatting', 'consult', 'type', 'consult',...
9573    ['samenvatting', 'decursus', 'type', 'decursus...
9574    ['samenvatting', 'verpleegkundig', 'verslagleg...
9575    ['samenvatting', 'medisch', 'dossier', 'vk', '...
9576    ['samenvatting', 'verpleegkundig', 'verslagleg...
Name: tokens, Length: 9577, dtype: object
text: 0       [[, ', d, h, r, ', ,,  , ', a, j, ', ,,  , ', ...
1       [[, ', s, a, m, e, n, v, a, t, t, i, n, g, ', ...
2       [[, ', c, o, l, o, s, c, o, p, i, e, ', ,,  , ...
3       [[, ', g, a, s, t, r, o, s, c, o, p, i, e, ', ...
4       [[, ', s, a, m, e, n, v, a, t, t, i, n, g, ', ...
                

In [None]:
data = pd.read_csv('a:/df_cleaned.csv')

# # Adjustments for word removal
# merged_df['tokens'] = merged_df['tokens'].apply(ast.literal_eval)

# print(data.head())
# print(data.columns)

    
# Convert the tokens column (which is a list of words) into a single string for each document
data['text'] = data['tokens'].apply(lambda tokens: ''.join(tokens))    
print(data['text'].head())

data['tokens'] = data['tokens'].apply(ast.literal_eval)

texts = data['text'].tolist() 

remove_list = ['mg', 'x', 'per', 'dag', 'samenvatting', 'beleid', 'conclusie', 'mmolL', 'waarvoor', 'goed', 'wel', 'beloop', 
            'voorgeschiedenis', 'opdrachten', 'gehad', 'aanvullend', 'bekende', 'voltooid', 'mogelijk', 'gezien', 'city', 'bsn', 
            'nodig', 'firstname', 'streetname', 'lastname', 'postcode', 'anamnese',
            'dd', 'stuk', 'ivm', 'rechts', 'links', 'dr', 'sinds', 'huisarts', 'datum', 'dagen', 'min', 'extra', 'weken', 'algemeen', 
            'patiënte', 'overige','linker', 'week', 'accepteren', 'maanden', 'waarschijnlijk', 'reden', 'uur', 'verdenking', 'ontslag', 
            'stop', 'tijd', 'patiënt', 'onderzoek']

data['tokens'] = data['tokens'].apply(lambda tokens: [word for word in tokens if word not in remove_list])

data['text'] = data['tokens'].apply(lambda tokens: ' '.join(tokens))   
texts = data['text'].tolist() 
print(data['tokens'].head())
print(data['text'].head())
print(texts[:5])    

0    ['dhr', 'aj', 'dingemans', 'huisarts', 'street...
1    ['samenvatting', 'rectaal', 'bloedverlie', 'ob...
2    ['coloscopie', 'betreffen', 'mw', 'initials', ...
3    ['gastroscopie', 'betreffen', 'mw', 'initials'...
4    ['samenvatting', 'rectaal', 'bloedverlie', 'ee...
Name: text, dtype: object
0    [dhr, aj, dingemans, streetnaam, Kenmerk, pati...
1    [rectaal, bloedverlie, obvn, divertikelbloedin...
2    [coloscopie, betreffen, mw, initials, adresgeg...
3    [gastroscopie, betreffen, mw, initials, adresg...
4    [rectaal, bloedverlie, eenmalig, hd, hbstabiel...
Name: tokens, dtype: object
0    dhr aj dingemans streetnaam Kenmerk patientid ...
1    rectaal bloedverlie obvn divertikelbloeding ac...
2    coloscopie betreffen mw initials adresgegevens...
3    gastroscopie betreffen mw initials adresgegeve...
4    rectaal bloedverlie eenmalig hd hbstabiel inr ...
Name: text, dtype: object
['dhr aj dingemans streetnaam Kenmerk patientid betreffen mevrouw initials geb birthdate street

In [None]:
# data = pd.read_csv('a:/df_cleaned.csv')

# # # Adjustments for word removal
# # merged_df['tokens'] = merged_df['tokens'].apply(ast.literal_eval)

# # print(data.head())
# # print(data.columns)

    
# # Convert the tokens column (which is a list of words) into a single string for each document
# remove_list = ['mg', 'x', 'per', 'dag', 'samenvatting', 'beleid', 'conclusie', 'mmolL', 'waarvoor', 'goed', 'wel', 'beloop', 
#             'voorgeschiedenis', 'opdrachten', 'gehad', 'aanvullend', 'bekende', 'voltooid', 'mogelijk', 'gezien', 'city', 'bsn', 
#             'nodig', 'firstname', 'streetname', 'lastname', 'postcode', 'anamnese',
#             'dd', 'stuk', 'ivm', 'rechts', 'links', 'dr', 'sinds', 'huisarts', 'datum', 'dagen', 'min', 'extra', 'weken', 'algemeen', 
#             'patiënte', 'overige','linker', 'week', 'accepteren', 'maanden', 'waarschijnlijk', 'reden', 'uur', 'verdenking', 'ontslag', 
#             'stop', 'tijd', 'patiënt', 'onderzoek']

# data['tokens'] = data['tokens'].apply(lambda tokens: [word for word in tokens if word not in remove_list])

# data['text'] = data['tokens'].apply(lambda tokens: ' '.join(tokens))   
# texts = data['text'].tolist() 

# embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# # embedding_model2 = SentenceTransformer("distiluse-base-multilingual-cased-v2")
# embeddings = embedding_model.encode(texts, show_progress_bar=True)


# # Save the embeddings to a file
# np.save('embeddings_model1.npy', embeddings)
# # # Load the embeddings from the file
# embeddings = np.load('embeddings_model1.npy')

In [None]:
# import numpy as np
# # Save the embeddings to a file
# np.save('embeddings_model1.npy', embeddings)
# # # Load the embeddings from the file
# embeddings = np.load('embeddings_model1.npy')