In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # to avoid warning when using a tokenizer to calculate the coherence score during evaluation of a topic model
import sys
import json
import pandas as pd
import pickle
import numpy as np
from sklearn.decomposition import PCA
from bertopic import BERTopic
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired
import pandas as pd
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import Parallel, delayed
from sklearn.metrics import silhouette_score
#from sklearn.cluster import KMeans
from hdbscan import HDBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.model_selection import ParameterGrid
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from bertopic import BERTopic
from typing import Optional, Tuple, Dict, Union, List
import datetime
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources needed below: the stop word lists and the 'punkt_tab' tokenizer data
# ('wordnet' is downloaded as well, although it is not used in the visible part of this notebook)
for nltk_resource in ('stopwords', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

# Put the parent of the current working directory on the python path so that the custom `util` modules can be imported
parent_dir = os.path.abspath(os.pardir)
sys.path.append(parent_dir)

from util.clustering_utils import *

# Clustering Experiment

In this notebook, we will cluster the chats using the chat representations created in 02_feature_engineering with BERTopic.

To address the randomness inherent in topic modeling, we will run the model multiple times, analyzing the average evaluation metrics and selecting the iteration that is closest to these averages.

The overall process will be consistent for each feature, with some adjustments to accommodate their specific characteristics:

1. **Load the Chat Representations:** Import the chat representations.

2. **Cluster Using BERTopic:** Apply BERTopic to the chat embeddings.

3. **Evaluate Performance:** Analyze the average evaluation metrics across multiple runs.

4. **Visualize Clusters:** Create visualizations to represent the clustered topics.

5. **Inspect Representative Texts:** Identify the most representative texts for each topic.


Finally, we'll compare the results by:

1. **Comparing Evaluation Metrics:** Analyze the evaluation metrics across different models to identify the best-performing one and assess the impact of the inclusion of different features on the clustering results.

2. **Comparing Topic Assignments:** Evaluate how topic assignments differ from those of the base model to understand the effect different features have on clustering results.

3. **Qualitatively assess the topics:** Use the most representative texts and visualisations to qualitatively assess the results of each model.


## 1. Preparations


Before we start clustering the data, we have to:

1. **Load the Dataset:** In order to inspect messages later on, we'll need to load the dataset we used to create the chat representations.

2. **Create Chat-Text Aggregations:** Since we will use chat embeddings generated in a previous notebook rather than relying on BERTopic to create its own embeddings, we need to create custom text documents associated with each representation. These will be passed to BERTopic in order to ensure that the topics identified by BERTopic are interpretable.

#### 1. Load the Dataset

First, we'll load the dataset.

In [None]:
# Location of the preprocessed dataset, relative to the notebook's working directory
dataset_path = os.path.join(os.getcwd(), '../data/preprocessed/df_preprocessed.pkl')

# NOTE: unpickling can execute arbitrary code, so only load files that were produced by this project
with open(dataset_path, mode='rb') as dataset_file:
    df = pickle.load(dataset_file)

# show the (rows, columns) of the loaded dataset
df.shape

#### 2. Create Chat-Text-Aggregations for Topic-Interpretability

To ensure the interpretability of the topics generated by BERTopic, we need to provide it with the message texts we used to create features for each chat. 

To make sure that the "chat-text-aggregations" used for this purpose are as meaningful as possible, we will perform the following operations: 

- **Basic preprocessing**, including lowercasing, stop word removal, removal of punctuation and digits, and tokenization.

- **Removing custom stopwords** specific to the Telegram domain, such as:

    - Telegram chat handles, which are frequently used to "sign" each  message in broadcast chats and could distort the analysis of the most common words.

    - Common social media call-to-action phrases, such as "share," "follow," and "comment," which are often repeated irrespective of topic.

- **Multilingual processing:** Since our corpus is multilingual, language-dependent preprocessing will be applied only to messages in the most frequent languages, as the other languages contribute only a marginal number of messages.
- **TF-IDF filtering:** We will filter out words below a certain TF-IDF threshold to ensure that only distinctive terms are included in the aggregation.

Afterwards, we will aggregate the messages and webpage previews for each chat into a single string. This string, along with the chat vector representations created earlier, will be passed to BERTopic as a basis for its topic description.

##### 1. Preprocess Data

**Define the Preprocessing Function**

In [3]:
def preprocess(df: pd.DataFrame, text_column: str) -> pd.DataFrame:
    """
    Clean and tokenize the texts in a specified column of a DataFrame.

    The following steps are applied in order: removal of frequently referenced chat handles, removal of URLs,
    lowercasing, removal of punctuation and digits, removal of English/German stop words (including
    call-to-action words) and tokenization with NLTK's `word_tokenize`.

    Parameters:
        df (pd.DataFrame): the DataFrame containing the text to preprocess. It must contain the column
            `referenced_chat_handles` and a language column holding the values "English"/"German":
            `<text_column>_lang` is used if it exists, otherwise `message_text_lang`.
            The DataFrame is modified in place.
        text_column (str): the name of the column that contains the text to clean.

    Returns:
        pd.DataFrame: The same DataFrame with additional columns for the cleaned text (`<text_column>_cleaned`)
        and the tokenized text, re-joined with single spaces (`<text_column>_preprocessed`).
    """

    print("Preprocessing messages...")

    cleaned_column = f"{text_column}_cleaned"
    preprocessed_column = f"{text_column}_preprocessed"

    # Select the stop word list by the language of the processed column itself (e.g. `webpage_description_lang`).
    # Fall back to the language of the message text if no such column exists.
    lang_column = f"{text_column}_lang"
    if lang_column not in df.columns:
        lang_column = "message_text_lang"

    # get stop words
    stop_words_en = set(stopwords.words('english'))
    stop_words_de = set(stopwords.words('german'))

    cta_stop_words_en = {'click', 'tap', 'press', 'subscribe', 'follow', 'share', 'like', 'comment',
                        'join', 'sign', 'visit', 'download', 'register', 'give', 'message', 'chat', 'group', 'channel', 'bot', 'reply'}
    cta_stop_words_de = {'klicken', 'tippen', 'drücken', 'abonnieren', 'folgen', 'teilen', 'mögen', 'kommentieren',
                        'beitreten', 'anmelden', 'besuchen', 'herunterladen', 'registrieren', 'geben', 'message', 'chat', 'group', 'channel', 'bot', 'reply'}

    stop_words_en = stop_words_en.union(cta_stop_words_en)
    stop_words_de = stop_words_de.union(cta_stop_words_de)
    print("Stop words loaded")

    # get frequent chat handles (referenced more than 100 times in the whole dataset)
    handle_counts = df["referenced_chat_handles"].explode().value_counts()
    frequent_chat_handles = [str(handle) for handle in handle_counts[handle_counts > 100].index]
    print("Frequent chat handles loaded")

    # create regex patterns
    def create_pattern(words):
        """Build a regex that matches any of `words` as a whole token."""
        # longest alternative first, so that the result does not depend on the iteration order of a set
        alternatives = sorted({word for word in words if word}, key=lambda word: (-len(word), word))
        if not alternatives:
            # nothing to remove: return a pattern that can never match
            return r'(?!)'
        # Lookarounds instead of `\b`: `\b` requires a word character on one side of the boundary, so a
        # pattern like `\b@handle\b` could never match an "@handle" that follows a space or starts the text.
        # For words that start and end with a word character both forms are equivalent.
        return rf'(?<!\w)(?:{"|".join(map(re.escape, alternatives))})(?!\w)'

    def remove_punctuation(texts):
        """Replace punctuation with spaces, collapse whitespace runs and strip the result."""
        return texts.str.replace(f"[{re.escape(string.punctuation)}]", ' ', regex=True).str.replace(r'\s+', ' ', regex=True).str.strip()

    frequent_chat_pattern = create_pattern(frequent_chat_handles)
    # after punctuation removal a leading "@" is gone from the text, so match the bare handle names then
    bare_chat_pattern = create_pattern(handle.lstrip('@') for handle in frequent_chat_handles)
    stop_words_en_pattern = create_pattern(stop_words_en)
    stop_words_de_pattern = create_pattern(stop_words_de)
    print("Regex-Patterns created")

    # remove the most frequent chat handles
    df[cleaned_column] = df[text_column].str.replace(frequent_chat_pattern, '', regex=True, flags=re.IGNORECASE).str.strip()
    print("Handles removed")

    # remove URLs
    pattern = r"(https?:\/\/[^\s/$.?#].[^\s]*[^\s.,?!)](?![\])]))|(www\.[^\s/$.?#].[^\s]*[^\s.,?!)](?![\])]))|(t\.me\/[^\s.,?!)]*)"
    df[cleaned_column] = df[cleaned_column].str.replace(pattern, '', regex=True).str.strip()
    print("URLs removed")

    # lowercase text
    df[cleaned_column] = df[cleaned_column].str.lower()
    print("Lowercase")

    # remove punctuation
    df[cleaned_column] = remove_punctuation(df[cleaned_column])
    print("Punctuation removed")

    # remove the most frequent chat handles that included an @
    df[cleaned_column] = df[cleaned_column].str.replace(bare_chat_pattern, '', regex=True, flags=re.IGNORECASE).str.strip()
    print("Handles with @ removed")

    # remove punctuation again (handles may contain punctuation themselves, e.g. underscores)
    df[cleaned_column] = remove_punctuation(df[cleaned_column])
    print("Punctuation removed")

    # remove digits
    df[cleaned_column] = df[cleaned_column].str.replace(r'\d+', '', regex=True).str.strip()
    print("Digits removed")

    # remove english and german stop words; texts in other languages are left untouched
    for language, stop_words_pattern in (("English", stop_words_en_pattern), ("German", stop_words_de_pattern)):
        is_language = df[lang_column] == language
        df.loc[is_language, cleaned_column] = \
            df.loc[is_language, cleaned_column].str.replace(stop_words_pattern, '', regex=True, flags=re.IGNORECASE).str.strip()
        print(f"{language} stop words removed")

    # fill NaN with empty string
    df[cleaned_column] = df[cleaned_column].fillna('')

    # tokenize text and join the tokens again, which also normalises the whitespace left behind by the removals
    df[preprocessed_column] = df[cleaned_column].apply(lambda text: ' '.join(word_tokenize(text)) if text else '')
    print("Tokenized")

    return df

**Apply Preprocessing to Message Texts**

In [None]:
# check, if the data was already preprocessed
# (the preprocessed message texts are cached as a pickle; the following cells only recompute them if the file is missing)
preprocessen_msg_path = os.path.join(os.getcwd(), '../data/preprocessed/preprocessed_msgs_viz.pkl')
already_preprocessed = os.path.exists(preprocessen_msg_path)
# display the flag as the cell output
already_preprocessed

In [None]:
if already_preprocessed:
    # reuse the cached result of an earlier run
    print("Loading preprocessed messages...")
    preprocessed_msg = pd.read_pickle(preprocessen_msg_path)
    df["message_text_preprocessed"] = preprocessed_msg

else:
    # clean and tokenize the message texts ...
    df = preprocess(df, "message_text")

    # ... and cache the resulting column for later runs
    df["message_text_preprocessed"].to_pickle(preprocessen_msg_path)
    print("Preprocessed messages saved")

# show five random messages next to their preprocessed version, without truncating the column width
with pd.option_context('display.max_colwidth', None):
    display(df[["message_text", "message_text_preprocessed", "referenced_chat_handles"]].sample(5))

**Apply Preprocessing to Webpage Previews**

In [None]:
# check, if the data was already preprocessed
# NOTE: `already_preprocessed` is reused here and now refers to the webpage previews, not to the message texts
preprocessen_web_path = os.path.join(os.getcwd(), '../data/preprocessed/preprocessed_web_viz.pkl')
already_preprocessed = os.path.exists(preprocessen_web_path)
# display the flag as the cell output
already_preprocessed

In [7]:
# only preprocess the webpage previews if no cached result exists; the cached column is loaded in the emoji-removal cell below
if not already_preprocessed:
    # apply preprocessing
    # (adds the columns `webpage_description_cleaned` and `webpage_description_preprocessed` to the DataFrame)
    df = preprocess(df, "webpage_description")

**Remove Emojis**

In [None]:
# create a list of emoji-unicodes using data from "https://unicode.org/Public/emoji/15.1/"
if not already_preprocessed:

    # list the paths to the unicode-files
    path_1 = os.path.join(os.getcwd(), "../data/auxiliary/emoji_unicode/emoji-sequences.txt")
    path_2 = os.path.join(os.getcwd(), "../data/auxiliary/emoji_unicode/emoji-test.txt")
    path_3 = os.path.join(os.getcwd(), "../data/auxiliary/emoji_unicode/emoji-zwj-sequences.txt")
    file_paths = [path_1, path_2, path_3]

    # load all emojis from the unicode-files
    emoji_sequences = load_emoji_list(file_paths)

    # create a regex pattern from the emoji sequences
    # A regex alternation takes the first alternative that matches, so longer sequences (e.g. ZWJ sequences) have to
    # come before the shorter emojis they start with. Otherwise only a part of the sequence would be removed.
    emoji_alternatives = sorted({emoji for emoji in emoji_sequences if emoji}, key=lambda emoji: (-len(emoji), emoji))
    emoji_pattern = '|'.join(re.escape(emoji) for emoji in emoji_alternatives)
    print("Emoji pattern created")

    def demojize_chunk(chunk, emoji_pattern):
        """Return `chunk` with all emojis in `webpage_description_preprocessed` replaced by a space."""
        # an empty pattern matches at every position and would put a space between all characters
        if not emoji_pattern:
            return chunk
        # remove emojis
        return chunk.assign(
            webpage_description_preprocessed=chunk["webpage_description_preprocessed"].str.replace(emoji_pattern, " ", regex=True)
        )

    n_jobs = 3  # Use three cores (seems to be fastest?)

    # Split by row position and only hand the affected column to the workers. This avoids `np.array_split` on a
    # DataFrame (deprecated in recent pandas versions) and copying the whole DataFrame to every worker.
    web_column = df[["webpage_description_preprocessed"]]
    chunk_positions = np.array_split(np.arange(len(web_column)), n_jobs)
    chunks = [web_column.iloc[positions] for positions in chunk_positions]

    # remove emojis in parallel for each chunk
    df_chunks = Parallel(n_jobs=n_jobs)(delayed(demojize_chunk)(chunk, emoji_pattern) for chunk in chunks)

    # The chunks are contiguous and in order, so the cleaned values can be written back by position. The index of `df`
    # is kept as it is, because later cells select rows by index label and align cached columns on the index.
    df["webpage_description_preprocessed"] = pd.concat(df_chunks)["webpage_description_preprocessed"].to_numpy()

    # save the preprocessed data
    df["webpage_description_preprocessed"].to_pickle(preprocessen_web_path)
    print("Preprocessed webpage previews saved")

# simply load the preprocessed data, if it was already preprocessed
else:
    print("Loading preprocessed webpage previews...")
    preprocessed_web_previews = pd.read_pickle(preprocessen_web_path)
    df["webpage_description_preprocessed"] = preprocessed_web_previews

##### 2. Apply TF-IDF-Filtering

**Apply TF-IDF-Filtering to Message Texts**

In [None]:
# the TF-IDF-filtered message texts are cached as a pickle; check whether that cache already exists
tfidf_path = os.path.join(os.getcwd(), '../data/preprocessed/tfidf_msgs_viz.pkl')
already_tfidf = os.path.exists(tfidf_path)
# display the flag as the cell output
already_tfidf

In [None]:
if not already_tfidf:
    # isolate English and German texts and prepare them for TF-IDF vectorization
    english_texts = df[df["message_text_lang"] == "English"]["message_text_preprocessed"]
    german_texts = df[df["message_text_lang"] == "German"]["message_text_preprocessed"]

    # create and fit TF-IDF vectorizers based on the isolated texts
    tfidf_vectorizer_en = TfidfVectorizer(ngram_range=(1,1))
    tfidf_vectorizer_de = TfidfVectorizer(ngram_range=(1,1)) 
    tfidf_vectorizer_en.fit(english_texts)
    tfidf_vectorizer_de.fit(german_texts)

    feature_names_en = tfidf_vectorizer_en.get_feature_names_out()
    feature_names_de = tfidf_vectorizer_de.get_feature_names_out()

    def apply_tf_idf_threshold(row, tfidf_vectorizer_en, tfidf_vectorizer_de, threshold):
        """
        Reduce the preprocessed message text of a row to its distinctive words.

        Parameters:
            row: a row of the DataFrame with the fields `message_text_lang` and `message_text_preprocessed`.
            tfidf_vectorizer_en: the TF-IDF vectorizer fitted on the English texts.
            tfidf_vectorizer_de: the TF-IDF vectorizer fitted on the German texts.
            threshold: only words with a TF-IDF value above this threshold are kept.

        Returns:
            str: the kept words in the order of the vectorizer's vocabulary, joined by spaces. Each word appears at
            most once. Texts that are neither English nor German are returned unchanged.
        """

        if row["message_text_lang"] == "English":
            tfidf_vectorizer = tfidf_vectorizer_en
            feature_names = feature_names_en
        elif row["message_text_lang"] == "German":
            tfidf_vectorizer = tfidf_vectorizer_de
            feature_names = feature_names_de
        else:
            return row["message_text_preprocessed"]

        tfidf_matrix = tfidf_vectorizer.transform([row["message_text_preprocessed"]])

        if threshold < 0:
            # words that do not occur in the text (value 0) pass a negative threshold, so the dense row is needed
            kept_indices = np.where(tfidf_matrix.toarray().flatten() > threshold)[0]
        else:
            # Only look at the stored (non-zero) values of the sparse row instead of creating a dense array with
            # one entry per vocabulary word for every single message. Sorting keeps the vocabulary order.
            kept_indices = np.sort(tfidf_matrix.indices[tfidf_matrix.data > threshold])

        distinctive_words = [feature_names[i] for i in kept_indices]

        return ' '.join(distinctive_words)

    # TODO: Change column name
    # apply the threshold to the TF-IDF values
    df["message_text_tfidf"] = df.apply(lambda x: apply_tf_idf_threshold(x, tfidf_vectorizer_en, tfidf_vectorizer_de, 0.15), axis=1)

    # save the preprocessed messages
    df["message_text_tfidf"].to_pickle(tfidf_path)

else:
    print("Loading tf-idf filtered messages...")
    tfidf_filtered_msg = pd.read_pickle(tfidf_path)
    df["message_text_tfidf"] = tfidf_filtered_msg

**Apply TF-IDF-Filtering to Webpage Previews**

In [None]:
# the TF-IDF-filtered webpage previews are cached as a pickle; check whether that cache already exists
# NOTE: `tfidf_path` and `already_tfidf` are reused here and now refer to the webpage previews
tfidf_path = os.path.join(os.getcwd(), '../data/preprocessed/tfidf_web_viz.pkl')
already_tfidf = os.path.exists(tfidf_path)
# display the flag as the cell output
already_tfidf

In [None]:
if not already_tfidf:
    # isolate English and German texts and prepare them for TF-IDF vectorization
    english_texts = df[df["webpage_description_lang"] == "English"]["webpage_description_preprocessed"]
    german_texts = df[df["webpage_description_lang"] == "German"]["webpage_description_preprocessed"]

    # create and fit TF-IDF vectorizers based on the isolated texts
    tfidf_vectorizer_en = TfidfVectorizer(ngram_range=(1,1))
    tfidf_vectorizer_de = TfidfVectorizer(ngram_range=(1,1)) 
    tfidf_vectorizer_en.fit(english_texts)
    tfidf_vectorizer_de.fit(german_texts)

    feature_names_en = tfidf_vectorizer_en.get_feature_names_out()
    feature_names_de = tfidf_vectorizer_de.get_feature_names_out()

    def apply_tf_idf_threshold_web(row, tfidf_vectorizer_en, tfidf_vectorizer_de, threshold):
        """
        Reduce the preprocessed webpage preview of a row to its distinctive words.

        Parameters:
            row: a row of the DataFrame with the fields `webpage_description_lang` and
                `webpage_description_preprocessed`.
            tfidf_vectorizer_en: the TF-IDF vectorizer fitted on the English previews.
            tfidf_vectorizer_de: the TF-IDF vectorizer fitted on the German previews.
            threshold: only words with a TF-IDF value above this threshold are kept.

        Returns:
            str: the kept words in the order of the vectorizer's vocabulary, joined by spaces. Each word appears at
            most once. Previews that are neither English nor German are returned unchanged.
        """

        if row["webpage_description_lang"] == "English":
            tfidf_vectorizer = tfidf_vectorizer_en
            feature_names = feature_names_en
        elif row["webpage_description_lang"] == "German":
            tfidf_vectorizer = tfidf_vectorizer_de
            feature_names = feature_names_de
        else:
            return row["webpage_description_preprocessed"]

        tfidf_matrix = tfidf_vectorizer.transform([row["webpage_description_preprocessed"]])

        if threshold < 0:
            # words that do not occur in the text (value 0) pass a negative threshold, so the dense row is needed
            kept_indices = np.where(tfidf_matrix.toarray().flatten() > threshold)[0]
        else:
            # Only look at the stored (non-zero) values of the sparse row instead of creating a dense array with
            # one entry per vocabulary word for every single preview. Sorting keeps the vocabulary order.
            kept_indices = np.sort(tfidf_matrix.indices[tfidf_matrix.data > threshold])

        distinctive_words = [feature_names[i] for i in kept_indices]

        return ' '.join(distinctive_words)

    # apply the threshold to the TF-IDF values
    df["webpage_description_tfidf"] = df.apply(lambda x: apply_tf_idf_threshold_web(x, tfidf_vectorizer_en, tfidf_vectorizer_de, 0.15), axis=1)

    # save the preprocessed messages
    df["webpage_description_tfidf"].to_pickle(tfidf_path)

else:
    print("Loading tf-idf-filtered messages...")
    tfidf_filtered_msg = pd.read_pickle(tfidf_path)
    df["webpage_description_tfidf"] = tfidf_filtered_msg

##### 3. Create Chat-Text-Aggregations for each Chat

In [None]:
# build one text document per chat by joining the TF-IDF-filtered texts of all its messages with spaces
grouped = df.groupby("telegram_chat_id")
chat_texts = grouped["message_text_tfidf"].agg(lambda texts: " ".join(texts)).astype(str)
chat_texts

##### 4. Create Webpage-Preview-Aggregations for each Chat

In [None]:
# build one text document per chat by joining the TF-IDF-filtered webpage previews of all its messages with spaces
grouped = df.groupby("telegram_chat_id")
chat_webpage_previews = grouped["webpage_description_tfidf"].agg(lambda previews: " ".join(previews)).astype(str)
chat_webpage_previews

## 2. Basic Chat Embeddings

To start off, we'll cluster the chats in our dataset using an aggregation of the embeddings of the messages sent inside of them.

This model will act as a baseline. 

#### 1. Load Chat Representations

First, we load the chat representations we created in the notebook `02_feature_engineering`

In [None]:
# load the chat vectors that were created from the message embeddings
base_path = os.path.join(os.getcwd(), '../features/0_base_chat_vectors.npy')
base_chat_vectors = np.load(base_path, allow_pickle=True)
print(f"Number of chat vectors: {base_chat_vectors.shape[0]}")

# `np.load` returns a NumPy array for .npy files, which has no `.iloc`. Index by position instead and only use
# `.iloc` if a pandas object was unpickled from the file.
first_chat_vector = base_chat_vectors.iloc[0] if hasattr(base_chat_vectors, "iloc") else base_chat_vectors[0]
print(f"Vector Dimension: {np.shape(first_chat_vector)}")
base_chat_vectors

#### 2. Cluster the embeddings

Now we can use BERTopic to cluster the embeddings. 

In addition to clustering, BERTopic will also automatically create summaries for each topic using a representational model. The method chosen here, KeyBERT-inspired, extracts keywords from all documents assigned to a topic using BERT embeddings.

In [None]:
# Load the sentence-transformer that was used to create the embeddings; it is passed on to the experiment for the
# representational model. A local copy below ../data/models/ is used if present, otherwise the model is downloaded once.
current_path = os.getcwd()
model_dir = os.path.join(current_path, "../data/models/")
model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
model_path = os.path.join(model_dir, model_name)

if os.path.isdir(model_path):
    print("Model already downloaded. Loading...")
    transformer_model = SentenceTransformer(model_path)
else:
    print("Model not found. Downloading...")
    transformer_model = SentenceTransformer(model_name)
    transformer_model.save(model_path)
    print(f"Model saved to {model_path}")

# find out whether results for the base embeddings exist already
feature_name = "base"
base_done = is_processed(feature_name)

if not base_done:
    # no results yet: run the experiment for the base embeddings
    print("Running experiment for base embeddings...")
    base_evaluation_metrics, base_topics, base_probabilities, base_topic_model = run_experiment(
        chat_embeddings=base_chat_vectors,
        chat_texts=chat_texts,
        n=1,
        topic_model_dir_path=os.path.join(os.getcwd(), "../results/base_embeddings/topic_models/"),
        feature_name=feature_name,
        used_embedding_model=transformer_model,
    )
else:
    # results exist: load them instead of running the experiment again
    print("Base embeddings already processed. Loading results...")
    base_evaluation_metrics, base_topic_model, representative_messages, representative_webpages = load_data(feature_name)

#### 3. Inspect the average evaluation metrics

In [None]:
# Print every evaluation metric with a readable label: underscores become spaces and the first four characters of
# the key are cut off (presumably a prefix such as "avg_" - TODO confirm against `run_experiment`).
# The label is built outside of the f-string, because reusing double quotes inside a double-quoted f-string is a
# SyntaxError before Python 3.12.
for key, value in base_evaluation_metrics.items():
    metric_label = key.replace("_", " ")[4:].title()
    print(f"{metric_label}: {value}")

#### 4. Visualise and explore the results of the  most average topic model

In [None]:
# Visualise the topics of the base model. `create_topic_visualisations` is not defined in this notebook
# (presumably it comes from `util.clustering_utils`), so what exactly is rendered is not visible here.
create_topic_visualisations(base_topic_model, base_chat_vectors, chat_texts)

#### 5. Inspect the most representative messages

**1. Create Topic Vectors**

We average the embeddings of all chats assigned to a topic to find that topic's center. This vector will be used as the topic representation.

In [19]:
# the topic vectors are only needed to compute the representative messages, which are loaded from disk if the
# base embeddings were already processed
if not base_done:
    topic_vectors = create_topic_vectors(base_topic_model, base_chat_vectors)
    # a bare expression inside an `if` block produces no cell output, so the result is displayed explicitly
    display(topic_vectors)

**2. For each topic, extract the top n messages closest to the topic vector they were assigned to**

First, we add the message-text-embeddings created in `02_feature_engineering` back to the messages in the DataFrame. 
We will then use them to compare the messages to the topic vectors created earlier to find the most representative messages for each topic.

In [20]:
if not base_done:
    # load the message embeddings
    message_embeddings_path = os.path.join(os.getcwd(), '../features/0_message_embeddings.npy')
    message_embeddings = np.load(message_embeddings_path, allow_pickle=True)

    # check that there is exactly one embedding for each message in the dataframe
    assert len(message_embeddings) == len(df), (
        f"Got {len(message_embeddings)} message embeddings for {len(df)} messages"
    )

    # Create a series where each element is a message vector. The series reuses the index of the dataframe, so
    # that the vectors are assigned by row position (assumes the embeddings are stored in the row order of `df`).
    message_embeddings_series = pd.Series(list(message_embeddings), index=df.index)

    # add the message embeddings to the dataframe
    df["message_vector"] = message_embeddings_series
    display(df[["message_text", "message_vector"]].head())

In [None]:
if not base_done:
    # find the ten messages that are closest to the topic vector of the topic they belong to
    representative_messages = get_representative_texts(
        df=df,
        topic_model=base_topic_model,
        topic_vectors=topic_vectors,
        chat_vectors=base_chat_vectors,
        n=10,
        feature_name="base",
        text_column="message_text",
        text_embeddings_column="message_vector",
        text_preprocessed_column="message_text_preprocessed",
    )

    # save representative messages (the topic ids are converted to plain ints for the JSON export)
    representative_messages_path = os.path.join(os.getcwd(), '../results/base_embeddings/representative_messages.json')
    representative_messages = {int(topic): messages for topic, messages in representative_messages.items()}
    with open(representative_messages_path, 'w') as jsonfile:
        json.dump(representative_messages, jsonfile, indent=4)

# print representative messages as a numbered list per topic
for topic, messages in representative_messages.items():
    print(f"Topic {topic}:")
    for rank, message in enumerate(messages, start=1):
        print(f"{rank}. {message.strip()}")
    print("\n")

## 3. Filtered Chat Embeddings

#### 1. Load the filtered Chat Embeddings

First, we load the chat representations we created in the notebook `02_feature_engineering`

In [None]:
# load the chat vectors that were created from the filtered messages
filtered_path = os.path.join(os.getcwd(), '../features/1_filtered_chat_vectors.npy')
filtered_chat_vectors = np.load(filtered_path, allow_pickle=True)
print(f"Number of chat vectors: {filtered_chat_vectors.shape[0]}")

# `np.load` returns a NumPy array for .npy files, which has no `.iloc`. Index by position instead and only use
# `.iloc` if a pandas object was unpickled from the file.
first_chat_vector = filtered_chat_vectors.iloc[0] if hasattr(filtered_chat_vectors, "iloc") else filtered_chat_vectors[0]
print(f"Vector Dimension: {np.shape(first_chat_vector)}")
filtered_chat_vectors

#### 2. Create filtered Chat-Message-Text-Aggregations

Now we filter the dataset to remove all Forwarded/Original-Message-Pairs using the indices we saved in  `02_feature_engineering`.

In [None]:
# load the saved row indices and keep only these rows of the dataset
# (`.loc` selects by index label, so the saved indices have to refer to the index of `df`)
indices_path = os.path.join(os.getcwd(), "../features/1_implicit_ref_filtered_indices.npy")
filtered_rows_indices = np.load(indices_path)
df_references_filtered = df.loc[filtered_rows_indices]
# show the (rows, columns) of the filtered dataset
df_references_filtered.shape

As we removed some messages from our dataset, we need to create updated chat-text-aggregations that reflect these changes. 

In [None]:
# build one text document per chat from the messages that remain after filtering
grouped = df_references_filtered.groupby("telegram_chat_id")
filtered_chat_texts = grouped["message_text_tfidf"].agg(lambda texts: " ".join(texts)).astype(str)
filtered_chat_texts

#### 3. Cluster the embeddings

Now we can use BERTopic to cluster the embeddings. Again, we will run the model multiple times and inspect the average results of the topic models.

In [None]:
# Load the model we used for the embeddings, in order to reuse it for the representational model
current_path = os.getcwd()
model_dir = os.path.join(current_path, "../data/models/")
model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
model_path = os.path.join(model_dir, model_name)

if not os.path.isdir(model_path):
    print("Model not found. Downloading...")
    transformer_model = SentenceTransformer(model_name)
    transformer_model.save(model_path)
    print(f"Model saved to {model_path}")
else:
    print(f"Model already downloaded. Loading...")
    transformer_model = SentenceTransformer(model_path)

# check, if the filtered embeddings were already processed
feature_name = "filtered"
filtered_done = is_processed(feature_name)

# load the results, if the filtered embeddings were already processed
if filtered_done:
    print("Filtered embeddings already processed. Loading results...")
    filtered_evaluation_metrics, filtered_topic_model, representative_messages, representative_webpages = load_data(feature_name)

# run the experiment, if the filtered embeddings were not already processed
else:
    print("Running experiment for filtered embeddings...")
    filtered_evaluation_metrics, filtered_topics, filtered_probabilities, filtered_topic_model = run_experiment(
        chat_embeddings=filtered_chat_vectors, 
        chat_texts=filtered_chat_texts, 
        n=1, 
        topic_model_dir_path=os.path.join(os.getcwd(), "../results/filtered_embeddings/topic_models/"),
        feature_name="filtered",
        used_embedding_model=transformer_model
)

#### 3. Inspect the average evaluation metrics

In [None]:
# Print every evaluation metric with a readable label: underscores become spaces and the first four characters of
# the key are cut off (presumably a prefix such as "avg_" - TODO confirm against `run_experiment`).
# The label is built outside of the f-string, because reusing double quotes inside a double-quoted f-string is a
# SyntaxError before Python 3.12.
for key, value in filtered_evaluation_metrics.items():
    metric_label = key.replace("_", " ")[4:].title()
    print(f"{metric_label}: {value}")

#### 4. Visualise and explore the results

In [None]:
# Visualise the topics of the filtered model. `create_topic_visualisations` is not defined in this notebook
# (presumably it comes from `util.clustering_utils`), so what exactly is rendered is not visible here.
create_topic_visualisations(filtered_topic_model, filtered_chat_vectors, filtered_chat_texts)

#### 5. Inspect the most representative messages

Again, we'll inspect the most representative messages for each topic by comparing them to topic vectors derived from the topic assignments returned by the topic model.

**1. Create Topic Vectors**

We average the embeddings of all chats assigned to a topic to find that topic's center. This vector will be used as the topic representation.

In [28]:
# the topic vectors are only needed to compute the representative messages, which are loaded from disk if the
# filtered embeddings were already processed
if not filtered_done:
    topic_vectors = create_topic_vectors(filtered_topic_model, filtered_chat_vectors)
    # a bare expression inside an `if` block produces no cell output, so the result is displayed explicitly
    display(topic_vectors)

**2. For each topic, extract the top n messages closest to the topic vector they were assigned to**

Again, we add the embeddings created in `02_feature_engineering` back to the messages in the DataFrame and use them to find the most representative messages for each topic.

In [29]:
if not filtered_done:
    # filter the df to only include the messages that were used in the filtered chat vectors. We can use the indices saved in the feature engineering notebook
    # (the copy makes sure that adding a column below does not write into a slice of `df`)
    filtered_rows_indices = np.load(indices_path)
    df_filtered = df.loc[filtered_rows_indices].copy()

    # load the message embeddings
    message_embeddings_path = os.path.join(os.getcwd(), '../features/0_message_embeddings.npy')
    message_embeddings = np.load(message_embeddings_path, allow_pickle=True)

    # check that there is exactly one embedding for each message in the unfiltered dataframe
    assert len(message_embeddings) == len(df), (
        f"Got {len(message_embeddings)} message embeddings for {len(df)} messages"
    )

    # Create a series where each element is a message vector. The series reuses the index of the unfiltered
    # dataframe (assumes the embeddings are stored in the row order of `df`).
    message_embeddings_series = pd.Series(list(message_embeddings), index=df.index)

    # add the message embeddings to the dataframe
    # The vectors are selected exactly like the rows of `df_filtered` and assigned by position.
    df_filtered["message_vector"] = message_embeddings_series.loc[filtered_rows_indices].to_numpy()
    display(df_filtered[["message_text", "message_vector"]].head())

In [None]:
if not filtered_done:
    # find the ten messages that are closest to the topic vector of the topic they belong to
    representative_messages = get_representative_texts(
        df=df_filtered,
        topic_model=filtered_topic_model,
        topic_vectors=topic_vectors,
        chat_vectors=filtered_chat_vectors,
        n=10,
        feature_name="filtered",
        text_column="message_text",
        text_embeddings_column="message_vector",
        text_preprocessed_column="message_text_preprocessed",
    )

    # save representative messages (the topic ids are converted to plain ints for the JSON export)
    representative_messages_path = os.path.join(os.getcwd(), '../results/filtered_embeddings/representative_messages.json')
    representative_messages = {int(topic): messages for topic, messages in representative_messages.items()}
    with open(representative_messages_path, 'w') as jsonfile:
        json.dump(representative_messages, jsonfile, indent=4)

# print representative messages as a numbered list per topic
for topic, messages in representative_messages.items():
    print(f"Topic {topic}:")
    for rank, message in enumerate(messages, start=1):
        print(f"{rank}. {message.strip()}")
    print("\n")

## 4. Webpage Preview Embeddings

#### 1. Load Chat Representations

First, we load the chat representations we created based on webpage previews in the notebook `02_feature_engineering`

In [None]:
# load the chat vectors that were created from the webpage previews
webpreview_path = os.path.join(os.getcwd(), '../features/3_webpreview_chat_vectors.npy')
webpreview_chat_vectors = np.load(webpreview_path, allow_pickle=True)
print(f"Number of chat vectors: {webpreview_chat_vectors.shape[0]}")

# `np.load` returns a NumPy array for .npy files, which has no `.iloc`. Index by position instead and only use
# `.iloc` if a pandas object was unpickled from the file.
first_chat_vector = webpreview_chat_vectors.iloc[0] if hasattr(webpreview_chat_vectors, "iloc") else webpreview_chat_vectors[0]
print(f"Vector Dimension: {np.shape(first_chat_vector)}")
webpreview_chat_vectors

#### 2. Cluster the embeddings

Now we can use BERTopic to cluster the embeddings. 

Again, we'll run the experiment multiple times in order to inspect average results and models. 

In [None]:
# Load the sentence-transformer that was used to create the embeddings; it is passed on to the experiment for the
# representational model. A local copy below ../data/models/ is used if present, otherwise the model is downloaded once.
current_path = os.getcwd()
model_dir = os.path.join(current_path, "../data/models/")
model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
model_path = os.path.join(model_dir, model_name)

if os.path.isdir(model_path):
    print("Model already downloaded. Loading...")
    transformer_model = SentenceTransformer(model_path)
else:
    print("Model not found. Downloading...")
    transformer_model = SentenceTransformer(model_name)
    transformer_model.save(model_path)
    print(f"Model saved to {model_path}")

# find out whether results for the webpreview embeddings exist already
feature_name = "webpreview"
webpreview_done = is_processed(feature_name, True)

if not webpreview_done:
    # no results yet: run the experiment for the webpreview embeddings
    print("Running experiment for webpreview embeddings...")
    webpreview_evaluation_metrics, webpreview_topics, webpreview_probabilities, webpreview_topic_model = run_experiment(
        chat_embeddings=webpreview_chat_vectors,
        chat_texts=chat_webpage_previews,
        n=1,
        topic_model_dir_path=os.path.join(os.getcwd(), "../results/webpreview_embeddings/topic_models/"),
        feature_name=feature_name,
        used_embedding_model=transformer_model,
    )
else:
    # results exist: load them instead of running the experiment again
    print("Webpreview embeddings already processed. Loading results...")
    webpreview_evaluation_metrics, webpreview_topic_model, representative_messages, representative_webpreviews = load_data(feature_name, True)

#### 3. Inspect the average evaluation metrics

In [None]:
# Print every evaluation metric with a human-readable label.
for key, value in webpreview_evaluation_metrics.items():
    # Drop the averaging prefix only when it is present; a blind `[4:]` slice
    # would cut into keys that do not start with "avg_".
    label = key
    for prefix in ("average_", "avg_"):
        if label.startswith(prefix):
            label = label[len(prefix):]
            break
    # single quotes inside the f-string keep this valid on Python < 3.12
    print(f"{label.replace('_', ' ').title()}: {value}")

#### 4. Visualize and explore the Results

In [None]:
# Build the topic visualisations for the webpreview topic model, passing the chat vectors
# and the per-chat webpage-preview texts (helper is not defined in this notebook section).
create_topic_visualisations(webpreview_topic_model, webpreview_chat_vectors, chat_webpage_previews)

#### 5. Inspect the most representative messages

**1. Create Topic Vectors**

In [43]:
# Topic vectors are only computed when the experiment was run in this session.
# NOTE(review): when `webpreview_done` is True, `topic_vectors` is not (re)assigned here.
if not webpreview_done:
    topic_vectors = create_topic_vectors(webpreview_topic_model, webpreview_chat_vectors)
    display(topic_vectors)

**2. For each topic, extract the top messages closest to the topic vector they were assigned to**

Again, we add the embeddings created in `02_feature_engineering` back to the messages in the DataFrame and use them to compare them to the topic vectors created earlier to find the most representative messages for each topic.

In [44]:
# Attach the per-message embeddings to the message DataFrame (only needed when
# the representative messages have to be recomputed).
if not webpreview_done:
    message_embeddings_path = os.path.join(os.getcwd(), '../features/0_message_embeddings.npy')
    # NOTE(review): allow_pickle=True can execute code embedded in the file; only load trusted feature files.
    message_embeddings = np.load(message_embeddings_path, allow_pickle=True)

    # there must be exactly one embedding per message row
    # (the previous check compared the series with its own source and could never fail)
    assert len(message_embeddings) == len(df), (
        f"Got {len(message_embeddings)} message embeddings for {len(df)} messages"
    )

    # one vector per row; reusing the DataFrame index makes the assignment positional,
    # like the webpage-preview vectors, instead of silently inserting NaN on index mismatch
    message_embeddings_series = pd.Series(list(message_embeddings), index=df.index)

    # add the message embeddings to the dataframe
    df["message_vector"] = message_embeddings_series
    display(df[["message_text", "message_vector"]].head())

In [None]:
# Compare the message embeddings with the topic vectors and store the result,
# unless representative messages were already loaded from disk.
if not webpreview_done:
    representative_messages = get_representative_texts(df,
                                                    webpreview_topic_model,
                                                    topic_vectors,
                                                    webpreview_chat_vectors,
                                                    10,
                                                    "webpreview",
                                                    "message_text",
                                                    "message_vector",
                                                    "message_text_preprocessed")

    # convert the topic keys to plain int before serialising
    representative_messages = {int(topic): messages for topic, messages in representative_messages.items()}

    # save representative messages; create the target directory first so a fresh
    # checkout does not fail with FileNotFoundError (same as the structural section)
    representative_messages_path = os.path.join(os.getcwd(), '../results/webpreview_embeddings/representative_messages.json')
    os.makedirs(os.path.dirname(representative_messages_path), exist_ok=True)
    with open(representative_messages_path, 'w') as jsonfile:
        json.dump(representative_messages, jsonfile, indent=4)

# print representative messages
for topic, messages in representative_messages.items():
    print(f"Topic {topic}:")
    for i, message in enumerate(messages):
        print(f"{i+1}. {message.strip()}")
    print("\n")

#### 6. Inspect the most representative Webpage-Previews

Now, we will inspect the most representative webpage-previews. To do so, we'll compare the webpage-preview-embeddings to the topic-vectors created earlier and inspect the most similar ones.

In [46]:
# Attach the webpage-preview embeddings to the message DataFrame
# (skipped when stored results were loaded).
if not webpreview_done:
    webpreview_embeddings_path = os.path.join(os.getcwd(), '../features/3_webpage_embeddings.npy')
    webpreview_embeddings = np.load(webpreview_embeddings_path)

    # store one embedding per row, converted to a plain Python list
    df["webpreview_vector"] = webpreview_embeddings.tolist()

    # show the preview columns together with their new vector
    preview_columns = ["webpage_title", "webpage_description", "webpreview_vector"]
    display(df[preview_columns].head())

In [None]:
# Compare the webpage embeddings with the topic vectors to get the most similar
# webpage previews, unless they were already loaded from disk.
if not webpreview_done:
    representative_webpreviews = get_representative_texts(df,
                                                        webpreview_topic_model,
                                                        topic_vectors,
                                                        webpreview_chat_vectors,
                                                        10,
                                                        "webpreview",
                                                        "webpage_description",
                                                        "webpreview_vector",
                                                        "webpage_description_preprocessed")

    # convert the topic keys to plain int before serialising
    representative_webpreviews = {int(topic): previews for topic, previews in representative_webpreviews.items()}

    # save representative webpreviews; create the target directory first so a fresh
    # checkout does not fail with FileNotFoundError (same as the structural section)
    representative_webpreview_path = os.path.join(os.getcwd(), '../results/webpreview_embeddings/representative_webpreviews.json')
    os.makedirs(os.path.dirname(representative_webpreview_path), exist_ok=True)
    with open(representative_webpreview_path, 'w') as jsonfile:
        json.dump(representative_webpreviews, jsonfile, indent=4)

# print representative webpreviews (distinct loop names, the inner loop used to shadow `text`)
for topic, previews in representative_webpreviews.items():
    print(f"Topic {topic}:")
    for i, preview in enumerate(previews):
        print(f"{i+1}. {preview.strip()}")
    print("\n")

## 5. Combined Message & Webpage-Preview Embeddings

Next, we combine the message- and web-preview-embeddings and inspect the changes in clustering results.

#### 1. Combine Message-Text- and Webpage-Preview-Vectors

First, we load the chat-vectors we created by averaging the webpage-preview- and message-vectors.

In [48]:
# Load the chat vectors for the combined message + webpage-preview feature.
# NOTE(review): allow_pickle=True can execute code embedded in the file; only load trusted feature files.
combine_vectors_path = os.path.join(os.getcwd(), '../features/3_msg_webpreview_chat_vectors.npy')
combined_vectors = np.load(combine_vectors_path, allow_pickle=True)

#### 2. Create combined Chat-Text-Aggregations

Next, we combine the text aggregations for webpage previews and chat messages in order to use them to make the topics interpretable.

In [None]:
# put both text aggregations side by side, one row per chat
combined_text_dataframe = pd.DataFrame({
    "chat_texts": chat_texts,
    "chat_webpage_previews": chat_webpage_previews
})

# join message text and preview text of every chat with a single space
combined_text_dataframe["combined_texts"] = (
    combined_text_dataframe["chat_texts"] + " " + combined_text_dataframe["chat_webpage_previews"]
)


def _count_words(text):
    """Return the number of whitespace-separated tokens in `text`."""
    return len(text.split())


# word counts of the two inputs and of the joined text
for text_column in ("chat_texts", "chat_webpage_previews", "combined_texts"):
    combined_text_dataframe[f"{text_column}_len"] = combined_text_dataframe[text_column].apply(_count_words)

# the joined text must contain exactly the words of both inputs
expected_lengths = combined_text_dataframe["chat_texts_len"] + combined_text_dataframe["chat_webpage_previews_len"]
assert combined_text_dataframe["combined_texts_len"].equals(expected_lengths)

# keep only the joined texts for the experiment
combined_texts = combined_text_dataframe["combined_texts"]
combined_texts

#### 3. Cluster the Combined Embeddings

Now, we can cluster the chats using the combined message- and webpage-preview embeddings.

In [None]:
# The sentence-transformer that produced the embeddings is handed to the experiment,
# so fetch it from the local model cache or download and cache it first.
model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
current_path = os.getcwd()
model_dir = os.path.join(current_path, "../data/models/")
model_path = os.path.join(model_dir, model_name)

if os.path.isdir(model_path):
    print("Model already downloaded. Loading...")
    transformer_model = SentenceTransformer(model_path)
else:
    print("Model not found. Downloading...")
    transformer_model = SentenceTransformer(model_name)
    transformer_model.save(model_path)
    print(f"Model saved to {model_path}")

# find out whether results for the combined feature already exist
feature_name = "combined_webpreview"
combined_webpreview_done = is_processed(feature_name, True)

if not combined_webpreview_done:
    # nothing stored yet: run the clustering experiment
    print("Running experiment for combined webpreview embeddings...")
    (
        combined_webpreview_evaluation_metrics,
        combined_webpreview_topics,
        combined_webpreview_probabilities,
        combined_webpreview_topic_model,
    ) = run_experiment(
        chat_embeddings=combined_vectors,
        chat_texts=combined_texts,
        n=1,
        topic_model_dir_path=os.path.join(os.getcwd(), "../results/combined_webpreview_embeddings/topic_models/"),
        feature_name="combined_webpreview",
        used_embedding_model=transformer_model
    )
else:
    # stored results are available: reuse them instead of recomputing
    print("Combined webpreview embeddings already processed. Loading results...")
    (
        combined_webpreview_evaluation_metrics,
        combined_webpreview_topic_model,
        representative_messages,
        representative_webpreviews,
    ) = load_data(feature_name, True)

#### 4. Inspect the average evaluation metrics

In [None]:
# Print every evaluation metric with a human-readable label.
for key, value in combined_webpreview_evaluation_metrics.items():
    # Drop the averaging prefix only when it is present; a blind `[4:]` slice
    # would cut into keys that do not start with "avg_".
    label = key
    for prefix in ("average_", "avg_"):
        if label.startswith(prefix):
            label = label[len(prefix):]
            break
    # single quotes inside the f-string keep this valid on Python < 3.12
    print(f"{label.replace('_', ' ').title()}: {value}")

#### 5. Visualise and explore the Results

In [None]:
# Build the topic visualisations for the combined message + webpage-preview model,
# passing the combined chat vectors and the combined per-chat texts.
create_topic_visualisations(combined_webpreview_topic_model, combined_vectors, combined_texts)

#### 6. Inspect the most representative messages

**1. Create Topic Vectors**

In [53]:
# Topic vectors are only computed when the experiment was run in this session.
# NOTE(review): when `combined_webpreview_done` is True, `topic_vectors` is not (re)assigned here.
if not combined_webpreview_done:
    topic_vectors = create_topic_vectors(combined_webpreview_topic_model, combined_vectors)
    display(topic_vectors)

**2. For each topic, extract the top n messages closest to the topic vector they were assigned to**

Like with the features used before, we add the embeddings created in `02_feature_engineering` back to the messages in the DataFrame and use them to compare them to the topic vectors created earlier to find the most representative messages for each topic.

In [54]:
# Attach the per-message embeddings to the message DataFrame (only needed when
# the representative messages have to be recomputed).
if not combined_webpreview_done:
    message_embeddings_path = os.path.join(os.getcwd(), '../features/0_message_embeddings.npy')
    # NOTE(review): allow_pickle=True can execute code embedded in the file; only load trusted feature files.
    message_embeddings = np.load(message_embeddings_path, allow_pickle=True)

    # there must be exactly one embedding per message row
    # (the previous check compared the series with its own source and could never fail)
    assert len(message_embeddings) == len(df), (
        f"Got {len(message_embeddings)} message embeddings for {len(df)} messages"
    )

    # one vector per row; reusing the DataFrame index makes the assignment positional,
    # like the webpage-preview vectors, instead of silently inserting NaN on index mismatch
    message_embeddings_series = pd.Series(list(message_embeddings), index=df.index)

    # add the message embeddings to the dataframe
    df["message_vector"] = message_embeddings_series
    display(df[["message_text", "message_vector"]].head())

In [None]:
# Get the messages closest to the topic centers and store them,
# unless representative messages were already loaded from disk.
if not combined_webpreview_done:
    representative_messages = get_representative_texts(
        df=df,
        topic_model=combined_webpreview_topic_model,
        topic_vectors=topic_vectors,
        chat_vectors=combined_vectors,
        n=10,
        feature_name="combined_webpreview",
        text_column="message_text",
        text_embeddings_column="message_vector",
        text_preprocessed_column="message_text_preprocessed",
    )

    # convert the topic keys to plain int before serialising
    representative_messages = {int(topic): messages for topic, messages in representative_messages.items()}

    # save representative messages; create the target directory first so a fresh
    # checkout does not fail with FileNotFoundError (same as the structural section)
    representative_messages_path = os.path.join(os.getcwd(), '../results/combined_webpreview_embeddings/representative_messages.json')
    os.makedirs(os.path.dirname(representative_messages_path), exist_ok=True)
    with open(representative_messages_path, 'w') as jsonfile:
        json.dump(representative_messages, jsonfile, indent=4)

# print representative messages
for topic, messages in representative_messages.items():
    print(f"Topic {topic}:")
    for i, message in enumerate(messages):
        print(f"{i+1}. {message.strip()}")
    print("\n")

#### 7. Inspect the most representative Webpage-Previews

Now, we will inspect the most representative webpage-previews using the same approach we have already used to find representative messages. We will reuse the topic vectors created while inspecting the most representative messages.

In [56]:
# Attach the webpage-preview embeddings to the message DataFrame
# (skipped when stored results were loaded).
if not combined_webpreview_done:
    webpreview_embeddings_path = os.path.join(os.getcwd(), '../features/3_webpage_embeddings.npy')
    webpreview_embeddings = np.load(webpreview_embeddings_path)

    # store one embedding per row, converted to a plain Python list
    df["webpreview_vector"] = webpreview_embeddings.tolist()

    # a bare expression inside an `if` block is not rendered by the notebook,
    # so display the preview explicitly
    display(df[["webpage_title", "webpage_description", "webpreview_vector"]].head())

In [None]:
# Get the webpage previews closest to the topic centers and store them,
# unless they were already loaded from disk.
if not combined_webpreview_done:
    representative_webpreviews = get_representative_texts(df,
                                                        combined_webpreview_topic_model,
                                                        topic_vectors,
                                                        combined_vectors,
                                                        10,
                                                        "combined_webpreview",
                                                        "webpage_description",
                                                        "webpreview_vector",
                                                        "webpage_description_preprocessed")

    # convert the topic keys to plain int before serialising
    representative_webpreviews = {int(topic): previews for topic, previews in representative_webpreviews.items()}

    # save representative webpreviews; create the target directory first so a fresh
    # checkout does not fail with FileNotFoundError (same as the structural section)
    representative_webpreview_path = os.path.join(os.getcwd(), '../results/combined_webpreview_embeddings/representative_webpreviews.json')
    os.makedirs(os.path.dirname(representative_webpreview_path), exist_ok=True)
    with open(representative_webpreview_path, 'w') as jsonfile:
        json.dump(representative_webpreviews, jsonfile, indent=4)

# print representative webpreviews (distinct loop names, the inner loop used to shadow `text`)
for topic, previews in representative_webpreviews.items():
    print(f"Topic {topic}:")
    for i, preview in enumerate(previews):
        print(f"{i+1}. {preview.strip()}")
    print("\n")

## 6. Structural Vectors

Next, we cluster the chats using their structural attributes. Structural attributes are a chat's connections to other Telegram entities.

For our purposes, we have considered two kinds of connections:

1. Forwarded (fwd) messages between chats.

2. Textual references (ref) to chats or other telegram-entities.

#### 1. Load the Chat-Vectors

To vectorize these connections, we created chat-chat-matrices based on forwards and text based references between chats in `02_feature_engineering`, which we'll load now.

As we used both one-hot encoding and log scaling to normalize the matrices, we'll compare the results of both approaches and continue with the one achieving better scores.

In [27]:
# Locations of the four pickled chat-vector files (forward / reference, log-scaled / one-hot).
working_dir = os.getcwd()
fwd_log_path, fwd_onehot_path, ref_log_path, ref_onehot_path = (
    os.path.join(working_dir, relative_path)
    for relative_path in (
        '../features/2_log_fwd_vectors.pkl',
        '../features/2_onehot_fwd_vectors.pkl',
        '../features/2_log_ref_vectors.pkl',
        '../features/2_onehot_ref_vectors.pkl',
    )
)

# Unpickle them in the same order.
# NOTE(review): unpickling can execute arbitrary code; only read trusted feature files.
fwd_log_vectors, fwd_onehot_vectors, ref_log_vectors, ref_onehot_vectors = map(
    pd.read_pickle,
    (fwd_log_path, fwd_onehot_path, ref_log_path, ref_onehot_path),
)


Now we combine the forward-based and the reference-based chat-vectors to create our feature.

In [28]:
def _concat_vectors(first, second):
    """Join two vectors end to end."""
    return np.concatenate([first, second])


# element-wise concatenation of the forward- and reference-based vectors
structure_log_vectors = fwd_log_vectors.combine(ref_log_vectors, _concat_vectors)
structure_onehot_vectors = fwd_onehot_vectors.combine(ref_onehot_vectors, _concat_vectors)

# sanity check on the second entry: a combined vector must be as long as both parts together
dimension_fwd_vectors = len(fwd_log_vectors.iloc[1])
dimension_ref_vectors = len(ref_log_vectors.iloc[1])
expected_dimension = dimension_fwd_vectors + dimension_ref_vectors
for combined_series in (structure_log_vectors, structure_onehot_vectors):
    assert len(combined_series.iloc[1]) == expected_dimension

#### 2. Cluster the Structural Vectors

**Get average models and evaluation results**

Even though they are not traditional text embeddings, we will pass the structural vectors to BERTopic for clustering. This is possible because BERTopic accepts any custom numerical vector representation of a document instead of generating one from text.

We will then use the message-text-aggregations created earlier to make the topics found by BERTopic interpretable. These documents will only be used for topic labeling and interpretation. The clustering itself will be entirely driven by the chat-chat-matrices we pass as embeddings.


In [None]:
# The sentence-transformer that produced the embeddings is handed to the experiment,
# so fetch it from the local model cache or download and cache it first.
model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
current_path = os.getcwd()
model_dir = os.path.join(current_path, "../data/models/")
model_path = os.path.join(model_dir, model_name)

if os.path.isdir(model_path):
    print("Model already downloaded. Loading...")
    transformer_model = SentenceTransformer(model_path)
else:
    print("Model not found. Downloading...")
    transformer_model = SentenceTransformer(model_name)
    transformer_model.save(model_path)
    print(f"Model saved to {model_path}")

# HDBSCAN parameters kept for reference; they are currently not applied
# alpha = 1.0
# min_cluster_size = 7
# min_samples = 5
# hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size,
#                         min_samples=min_samples,
#                         alpha=alpha,
#                         prediction_data=True)

# experiment on the log-scaled structural vectors
(
    log_structural_evaluation_metrics,
    log_structural_topics,
    log_structural_probabilities,
    log_structural_topic_model,
) = run_experiment(
    chat_embeddings=structure_log_vectors,
    chat_texts=chat_texts,
    n=1,
    topic_model_dir_path=os.path.join(os.getcwd(), "../results/log_structural_embeddings/topic_models/"),
    feature_name="log_structural",
    used_embedding_model=transformer_model
)

# experiment on the one-hot encoded structural vectors
(
    onehot_structural_evaluation_metrics,
    onehot_structural_topics,
    onehot_structural_probabilities,
    onehot_structural_topic_model,
) = run_experiment(
    chat_embeddings=structure_onehot_vectors,
    chat_texts=chat_texts,
    n=1,
    topic_model_dir_path=os.path.join(os.getcwd(), "../results/onehot_structural_embeddings/topic_models/"),
    feature_name="onehot_structural",
    used_embedding_model=transformer_model
)

# print both metric sets one after the other for comparison
metric_reports = (
    ("\n#### Log Structural Evaluation Metrics: ####", log_structural_evaluation_metrics),
    ("\n#### Onehot Structural Evaluation Metrics: ####", onehot_structural_evaluation_metrics),
)
for heading, metrics in metric_reports:
    print(heading)
    for key, value in metrics.items():
        print(f"{key.replace('_', ' ').title()}: {value}")

**Compare Models:**

Next, we compare the average evaluation results of the models created using the two normalization techniques. To determine which model has more favorable evaluation metrics, we will apply the following heuristics:

- **Coherence Score**: Lower values indicate better topic coherence.
- **Silhouette Score**: Higher values suggest better-defined and more distinct clusters.
- **Davies-Bouldin Score**: Lower values reflect more compact and well-separated clusters.
- **Topic Count**: This metric will not be considered, as the number of topics alone does not reliably indicate model quality.
- **Noise Count**: Lower values are preferable, as they imply fewer data points are classified as noise.

Based on these criteria, we will select the model with the more desirable evaluation results for further analysis and inspection.



In [None]:
# Debug-only override (originally marked "TODO: Remove"): zeroing the log-model metrics
# replaces the measured results and therefore falsifies the model comparison below.
# It is disabled by default so the comparison uses the real evaluation metrics.
# NOTE(review): delete this cell once it is no longer needed for testing `compare_averages`.
OVERRIDE_LOG_STRUCTURAL_METRICS = False

if OVERRIDE_LOG_STRUCTURAL_METRICS:
    log_structural_evaluation_metrics["avg_coherence_scores"] = 0
    log_structural_evaluation_metrics["avg_silhouette_scores"] = 0
    log_structural_evaluation_metrics["avg_davies_bouldin_scores"] = 0
    log_structural_evaluation_metrics["average_noise_counts"] = 0

In [None]:
# Choose between the log-scaled (model 1) and the one-hot (model 2) structural model via
# `compare_averages`; the returned tuple is unpacked into the `structural_*` names used below.
structural_topics, structural_propabilities, structural_topic_model, structural_evaluation_metrics, structural_vectors  = compare_averages(
    metrics_model_1 = log_structural_evaluation_metrics,
    topics_model_1 = log_structural_topics,
    propabilities_model_1 = log_structural_probabilities,
    model_1 = log_structural_topic_model,
    vectors_1 = structure_log_vectors,
    metrics_model_2 = onehot_structural_evaluation_metrics,
    topics_model_2 = onehot_structural_topics, 
    propabilities_model_2 = onehot_structural_probabilities,
    model_2 = onehot_structural_topic_model,
    vectors_2 = structure_onehot_vectors
)
    

#### 3. Inspect the average evaluation metrics

In [None]:
# Print every evaluation metric with a human-readable label.
for key, value in structural_evaluation_metrics.items():
    # Drop the averaging prefix only when it is present; a blind `[4:]` slice
    # would cut into keys that do not start with "avg_".
    label = key
    for prefix in ("average_", "avg_"):
        if label.startswith(prefix):
            label = label[len(prefix):]
            break
    # single quotes inside the f-string keep this valid on Python < 3.12
    print(f"{label.replace('_', ' ').title()}: {value}")

#### 4. Visualise and Explore the results

In [None]:
# Build the topic visualisations for the selected structural model, passing its chat vectors
# and the per-chat message texts.
create_topic_visualisations(structural_topic_model, structural_vectors, chat_texts)

#### 5. Inspect the most representative messages

In this approach, chat vectors were generated using a chat-to-chat matrix, rather than by averaging message vectors. As a result, we cannot directly compare message vectors to the topic center to identify the most representative messages.

However, we can still work with the chat vectors. 
Accordingly, we'll use the following approach to find the most representative messages:

1. Create topic vectors by taking the mean of the chat vectors of each topic.
2. Identify the chat whose vector is closest to the topic center.
3. Compute the mean of all message embeddings within this chat. (We can reuse the base chat representations created in `02_feature_engineering`, as they are simply averages of all message vectors of a chat.)
4. Retrieve the messages from this chat that are closest to this mean, as they are likely to be the most representative.


**1. Create Topic Vectors**

In [None]:
# Create the topic vectors for the selected structural model from its chat vectors
# and show them as the cell output.
topic_vectors = create_topic_vectors(structural_topic_model, structural_vectors)
topic_vectors

**2. Find the most representative chat for each topic**

In [None]:
# Create a DataFrame containing the structural vectors, the topics assigned by the topic model
# and the associated topic vectors.
# Building it from a dict names the column explicitly; passing a Series together with
# `columns=[...]` yields an all-NaN column whenever the Series carries a different name.
vector_assignment_df = pd.DataFrame({"structural_chat_vector": structural_vectors})
vector_assignment_df["topic_assignment"] = structural_topic_model.topics_
vector_assignment_df["topic_vectors"] = vector_assignment_df["topic_assignment"].map(topic_vectors)

# drop the rows where the topic assignment is -1 (i.e. the "Other" topic);
# copy so that columns added later do not write into a slice of the original frame
vector_assignment_df = vector_assignment_df[vector_assignment_df["topic_assignment"] != -1].copy()

def cosine_similarity_row(row):
    """Return the cosine similarity between a row's chat vector and its topic vector.

    Reads the `structural_chat_vector` and `topic_vectors` entries of `row`.
    """
    # cosine_similarity works on 2D inputs, so turn each vector into a single-row matrix
    chat_matrix = np.array(row['structural_chat_vector']).reshape(1, -1)
    topic_matrix = np.array(row['topic_vectors']).reshape(1, -1)

    # the result is a 1x1 matrix; unwrap its only entry
    return cosine_similarity(chat_matrix, topic_matrix)[0][0]

# calculate the cosine similarity between the chat vectors and their topic vectors
# (the function is passed directly; wrapping it in a lambda added nothing)
vector_assignment_df["similarity"] = vector_assignment_df.apply(cosine_similarity_row, axis=1)

# per topic, pick the row whose chat vector is most similar to the topic vector
most_similar_index = vector_assignment_df.groupby("topic_assignment")["similarity"].idxmax()
most_similar_chat = vector_assignment_df.loc[most_similar_index]

# Re-key the result by topic and expose the former index as `chat_id`.
# Naming the index before resetting it works whether or not the index already has a name;
# renaming a column called "index" only works for an unnamed index.
most_similar_chats = (
    most_similar_chat
    .rename_axis("chat_id")
    .reset_index()
    .set_index("topic_assignment")
)[["chat_id", "structural_chat_vector"]]

most_similar_chats

In [None]:
# Look up, for the most representative chat of each topic, the base chat vector
# (the chat embedding created from averaged message embeddings) by its chat id.
most_similar_chats["base_chat_vectors"] = most_similar_chats["chat_id"].map(base_chat_vectors)

# These base chat vectors, indexed by topic, are used as the per-topic vectors
# when the representative messages are selected.
avg_chat_vectors = most_similar_chats["base_chat_vectors"]
avg_chat_vectors

**3. For each topic, extract the most representative messages of the chat closest to the topic vector**

As in previous steps, we'll add the message text embeddings from `02_feature_engineering` back to the DataFrame of messages.

For each topic, we'll then extract the most representative messages (i.e. the ones closest to the mean of all message embeddings of this chat) from the chat closest to the topic center.

In [None]:
# load the message embeddings
message_embeddings_path = os.path.join(os.getcwd(), '../features/0_message_embeddings.npy')
# NOTE(review): allow_pickle=True can execute code embedded in the file; only load trusted feature files.
message_embeddings = np.load(message_embeddings_path, allow_pickle=True)

# there must be exactly one embedding per message row
# (the previous check compared the series with its own source and could never fail)
assert len(message_embeddings) == len(df), (
    f"Got {len(message_embeddings)} message embeddings for {len(df)} messages"
)

# one vector per row; reusing the DataFrame index makes the assignment positional
# instead of silently inserting NaN on index mismatch
message_embeddings_series = pd.Series(list(message_embeddings), index=df.index)

# add the message embeddings to the dataframe
df["message_vector"] = message_embeddings_series
df[["message_text", "message_vector"]].head()

In [None]:
# Collect the representative messages for the selected structural topic model.
# `avg_chat_vectors` serves as the per-topic vector; compared with the base chat vectors it
# always leads back to the chat closest to the topic vector.
representative_messages = get_representative_texts(
    df=df,
    topic_model=structural_topic_model,
    topic_vectors=avg_chat_vectors,
    chat_vectors=base_chat_vectors,
    n=10,
    feature_name="structural",
    text_column="message_text",
    text_embeddings_column="message_vector",
    text_preprocessed_column="message_text_preprocessed",
)

# make sure the output directory exists before writing into it
representative_messages_dir = os.path.join(os.getcwd(), '../results/structural_embeddings/')
representative_messages_path = os.path.join(os.getcwd(), '../results/structural_embeddings/representative_messages.json')
os.makedirs(representative_messages_dir, exist_ok=True)

# topic keys are stored as plain int
representative_messages = dict(
    (int(topic), messages) for topic, messages in representative_messages.items()
)
with open(representative_messages_path, 'w') as jsonfile:
    json.dump(representative_messages, jsonfile, indent=4)

# list the messages of every topic, numbered from 1
for topic, messages in representative_messages.items():
    print(f"Topic {topic}:")
    for position, message in enumerate(messages, start=1):
        print(f"{position}. {message.strip()}")
    print("\n")

## 7. Combined Message Embeddings & Structural Vectors

Finally, we will cluster the chats based on a combination of their structural attributes and their averaged message embeddings. We will use the same approach as above.

#### 1. Load the Chat-Vectors

First, we load the structural vectors.

In [61]:
# Locations of the four pickled chat-vector files (forward / reference, log-scaled / one-hot).
working_dir = os.getcwd()
fwd_log_path, fwd_onehot_path, ref_log_path, ref_onehot_path = (
    os.path.join(working_dir, relative_path)
    for relative_path in (
        '../features/2_log_fwd_vectors.pkl',
        '../features/2_onehot_fwd_vectors.pkl',
        '../features/2_log_ref_vectors.pkl',
        '../features/2_onehot_ref_vectors.pkl',
    )
)

# Unpickle them in the same order.
# NOTE(review): unpickling can execute arbitrary code; only read trusted feature files.
fwd_log_vectors, fwd_onehot_vectors, ref_log_vectors, ref_onehot_vectors = map(
    pd.read_pickle,
    (fwd_log_path, fwd_onehot_path, ref_log_path, ref_onehot_path),
)

Now we combine the forward-based and the reference-based chat-vectors to create a single feature.

In [62]:
def _concat_vectors(first, second):
    """Join two vectors end to end."""
    return np.concatenate([first, second])


# log-scaled variant: forward-based vector followed by reference-based vector
structure_log_vectors = fwd_log_vectors.combine(ref_log_vectors, _concat_vectors)

# one-hot variant, combined the same way
structure_onehot_vectors = fwd_onehot_vectors.combine(ref_onehot_vectors, _concat_vectors)

# sanity check on the second entry: a combined vector must be as long as both parts together
dimension_fwd_vectors = len(fwd_log_vectors.iloc[1])
dimension_ref_vectors = len(ref_log_vectors.iloc[1])
expected_dimension = dimension_fwd_vectors + dimension_ref_vectors
for combined_series in (structure_log_vectors, structure_onehot_vectors):
    assert len(combined_series.iloc[1]) == expected_dimension

#### 2. Concatenate the Features

To supplement the text embeddings with structural information, we'll concatenate the new feature vectors and the chat embeddings.

In [63]:
def _append_structure(text_vector, structure_vector):
    """Return the text-based vector followed by the structure-based vector."""
    return np.concatenate([text_vector, structure_vector])


# message-embedding based chat vectors + log-scaled structural vectors
combined_vectors_log = base_chat_vectors.combine(structure_log_vectors, _append_structure)

# message-embedding based chat vectors + one-hot encoded structural vectors
combined_vectors_onehot = base_chat_vectors.combine(structure_onehot_vectors, _append_structure)

# sanity check on the second entry: combined length = text part + structural part
text_dimension = len(base_chat_vectors.iloc[1])
assert len(combined_vectors_log.iloc[1]) == text_dimension + len(structure_log_vectors.iloc[1])
assert len(combined_vectors_onehot.iloc[1]) == text_dimension + len(structure_onehot_vectors.iloc[1])

#### 3. Cluster the Combined Chat Vectors

Now, we can cluster the resulting combined vectors using BERTopic. 

We'll cluster the chats multiple times, using each normalization method and continue with the model that yields the more favorable evaluation metrics.

In [None]:
# Load the model we used for the embeddings, in order to use it for the representational model
current_path = os.getcwd()
model_dir = os.path.join(current_path, "../data/models/")
model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
model_path = os.path.join(model_dir, model_name)

if not os.path.isdir(model_path):
    print("Model not found. Downloading...")
    transformer_model = SentenceTransformer(model_name)
    transformer_model.save(model_path)
    print(f"Model saved to {model_path}")
else:
    print("Model already downloaded. Loading...")
    transformer_model = SentenceTransformer(model_path)

# set the parameters for the HDBSCAN model
# NOTE(review): `hdbscan_model` is built here but not passed to `run_experiment` below,
# so these parameters have no visible effect in this cell. Confirm whether it should be used.
alpha = 1.0
min_cluster_size = 7
min_samples = 5
hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size,
                        min_samples=min_samples,
                        alpha=alpha,
                        prediction_data=True)

# run the experiment for the combined vectors containing structural vectors created using the log transformation
log_combined_structural_evaluation_metrics, log_combined_structural_topics, log_combined_structural_probabilities, log_combined_structural_topic_model = run_experiment(
    chat_embeddings=combined_vectors_log,
    chat_texts=chat_texts,
    n=1,
    topic_model_dir_path=os.path.join(os.getcwd(), "../results/log_combined_structural_embeddings/topic_models/"),
    feature_name="log_combined_structural",
    used_embedding_model=transformer_model
)

# run the experiment for the combined vectors containing structural vectors created using one hot encoding
# (this run previously received `structure_onehot_vectors`, i.e. the structural part only, although the
# model comparison afterwards pairs this model with `combined_vectors_onehot`)
onehot_combined_structural_evaluation_metrics, onehot_combined_structural_topics, onehot_combined_structural_probabilities, onehot_combined_structural_topic_model = run_experiment(
    chat_embeddings=combined_vectors_onehot,
    chat_texts=chat_texts,
    n=1,
    topic_model_dir_path=os.path.join(os.getcwd(), "../results/onehot_combined_structural_embeddings/topic_models/"),
    feature_name="onehot_combined_structural",
    used_embedding_model=transformer_model
)

# compare the evaluation metrics
print("\n#### Log Combined Structural Evaluation Metrics: ####")
for key, value in log_combined_structural_evaluation_metrics.items():
    print(f"{key.replace('_', ' ').title()}: {value}")

print("\n#### Onehot Combined Structural Evaluation Metrics: ####")
for key, value in onehot_combined_structural_evaluation_metrics.items():
    print(f"{key.replace('_', ' ').title()}: {value}")

**Compare Models:**

Like we did with the models created using only structural vectors, we'll compare the average evaluation results of the models created using the two normalization techniques. To determine which model has more favorable evaluation metrics, we will apply the heuristics outlined earlier in this notebook and continue to work with the more favourable model.



In [None]:
# Choose between the log-scaled (model 1) and the one-hot (model 2) combined model via
# `compare_averages`; the returned tuple is unpacked into the `combined_structural_*` names used below.
combined_structural_topics, combined_structural_propabilities, combined_structural_topic_model, combined_structural_metrics, combined_structural_vectors  = compare_averages(
    metrics_model_1 = log_combined_structural_evaluation_metrics,
    topics_model_1 = log_combined_structural_topics,
    propabilities_model_1 = log_combined_structural_probabilities,
    model_1 = log_combined_structural_topic_model,
    vectors_1 = combined_vectors_log,
    metrics_model_2 = onehot_combined_structural_evaluation_metrics,
    topics_model_2 = onehot_combined_structural_topics, 
    propabilities_model_2 = onehot_combined_structural_probabilities,
    model_2 = onehot_combined_structural_topic_model,
    vectors_2 = combined_vectors_onehot
)

#### 4. Inspect the average evaluation metrics

In [None]:
# Print every evaluation metric with a human-readable label.
for key, value in combined_structural_metrics.items():
    # Drop the averaging prefix only when it is present; a blind `[4:]` slice
    # would cut into keys that do not start with "avg_".
    label = key
    for prefix in ("average_", "avg_"):
        if label.startswith(prefix):
            label = label[len(prefix):]
            break
    # single quotes inside the f-string keep this valid on Python < 3.12
    print(f"{label.replace('_', ' ').title()}: {value}")

#### 5. Visualise and explore the results

In [None]:
# Build the topic visualisations for the selected combined model, passing its chat vectors
# and the per-chat message texts.
create_topic_visualisations(combined_structural_topic_model, combined_structural_vectors, chat_texts)

#### 6. Inspect the most representative messages

Since the chat vectors were generated by combining chat-cooccurrence vectors and message text embeddings, we can't directly compare individual message vectors to the topic center in order to find the most representative messages.

To address this, we'll append each chat's co-occurrence vector to the vectors of its messages. This will allow us to compare the resulting message vectors to the topic centers, which we'll create by averaging all chat vectors assigned to a given topic.


**1. Create Topic Vectors**

In [None]:
# Create the topic vectors for the selected combined model from its chat vectors
# and show them as the cell output.
topic_vectors = create_topic_vectors(combined_structural_topic_model, combined_structural_vectors)
topic_vectors

**2. Add the structural vector of each chat to its messages**

In [None]:
#TODO: Avoid repeating code

# load the message embeddings
message_embeddings_path = os.path.join(os.getcwd(), '../features/0_message_embeddings.npy')
# NOTE: allow_pickle=True can execute arbitrary code while unpickling; only load files you trust.
message_embeddings = np.load(message_embeddings_path, allow_pickle=True)

# create a series where each element is a message vector
message_embeddings_series = pd.Series(list(message_embeddings))

# check that there is exactly one message embedding per dataframe row
# (the previous check compared the series with the array it was built from and could never fail)
assert message_embeddings_series.shape[0] == len(df), (
    f"Expected {len(df)} message embeddings (one per dataframe row), "
    f"got {message_embeddings_series.shape[0]}"
)

# add the message embeddings to the dataframe
# NOTE(review): assigning a Series aligns on the index, so this assumes `df` has a
# default 0..n-1 index — confirm against the cell that creates `df`.
df["message_vector"] = message_embeddings_series
df[["message_text", "message_vector"]].head()

In [None]:
# isolate the structural part of the combined vectors to be appended to the message embeddings
# NOTE(review): 384 is presumably the length of the text-embedding part that precedes the
# structural part in each combined vector — confirm against the feature-engineering notebook.
TEXT_EMBEDDING_DIM = 384
structural_part = combined_structural_vectors.apply(lambda x: x[TEXT_EMBEDDING_DIM:])

# get the representative messages
representative_messages = get_representative_texts(
    df=df,
    topic_model=combined_structural_topic_model,
    topic_vectors=topic_vectors,
    chat_vectors=combined_structural_vectors,
    n=10,
    feature_name="combined_structural",
    text_column="message_text",
    text_embeddings_column="message_vector",
    text_preprocessed_column="message_text_preprocessed",
    add_structural_info=True,
    structural_embedding_chat_map=structural_part,
)

# save representative messages
# (`json` is already imported in the notebook's import cell; the file path is derived from
# the directory so the two cannot drift apart)
representative_messages_dir = os.path.join(os.getcwd(), '../results/combined_structural_embeddings/')
representative_messages_path = os.path.join(representative_messages_dir, 'representative_messages.json')
os.makedirs(representative_messages_dir, exist_ok=True)
representative_messages = {int(topic): messages for topic, messages in representative_messages.items()} # convert keys to int
with open(representative_messages_path, 'w', encoding='utf-8') as jsonfile:
    json.dump(representative_messages, jsonfile, indent=4)

# print representative messages
for topic, messages in representative_messages.items():
    print(f"Topic {topic}:")
    for i, message in enumerate(messages):
        print(f"{i+1}. {message.strip()}")
    print("\n")

## 8. Evaluation

Finally, we can compare the different approaches based on the evaluation data we collected for each feature and feature combination.

In [None]:
# Gather the evaluation values collected for each feature (combination) into one
# table, indexed by feature name; every list follows the order of "Features".
# NOTE(review): the last "Noise Instances" entry repeats `noise_structural_embeddings`
# for the "Msg + Structural" row — this looks like a copy-paste slip; confirm which
# variable holds the noise count of the combined message + structural model.
evaluation_columns = {
    "Features": [
        "Msg",
        "Filtered",
        "Webpreview",
        "Msg + Webpreview",
        "Structural",
        "Msg + Structural",
    ],
    "Silhouette Score": [
        ss_base_embeddings,
        ss_filtered_embeddings,
        ss_webpreview_embeddings,
        ss_msg_webpreview_embeddings,
        ss_structural_embeddings,
        ss_msg_structural_embeddings,
    ],
    "Topic Count": [
        topic_count_base_embeddings,
        topic_count_filtered_embeddings,
        topic_count_webpreview_embeddings,
        topic_count_msg_webpreview_embeddings,
        topic_count_structural_embeddings,
        topic_count_msg_structural_embeddings,
    ],
    "Noise Instances": [
        noise_base_embeddings,
        noise_filtered_embeddings,
        noise_webpreview_embeddings,
        noise_count_msg_webpreview_embeddings,
        noise_structural_embeddings,
        noise_structural_embeddings,
    ],
}
evaluation_df = pd.DataFrame(evaluation_columns).set_index("Features")
evaluation_df