In [None]:
import pandas as pd
import os
import pickle
import numpy as np
from sklearn.decomposition import PCA
from bertopic import BERTopic
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired
import pandas as pd
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import Parallel, delayed
from sklearn.metrics import silhouette_score

# download the NLTK resources requested below: the stop word lists, the
# 'punkt_tab' tokenizer data and WordNet
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Clustering Experiment

## 1. Preparation

#### 1. Load the Dataset

First, we'll load the dataset.

In [None]:
# path of the preprocessed dataset, resolved relative to the current working directory
dataset_path = os.path.join(os.getcwd(), '../data/preprocessed/df_preprocessed.pkl')

# NOTE: unpickling executes arbitrary code - only load pickle files you created yourself
with open(dataset_path, 'rb') as f:
    df = pickle.load(f)

# show the dimensions of the loaded object
df.shape

#### 2. Create Chat-Text-Aggregations for Topic-Interpretability

To ensure the interpretability of the topics generated by BERTopic, we need to provide it with the texts for each chat. 

To make sure that the "chat-text-aggregations" used for this purpose are as meaningful as possible, we will perform the following operations: 

- **Basic preprocessing**, including lowercasing, stop word removal, removal of punctuation and digits, and tokenization.

- **Removing custom stopwords** specific to the Telegram domain, such as:

    - Telegram chat handles, which are frequently used to "sign" each  message in broadcast chats and could distort the analysis of the most common words.

    - Common social media call-to-action phrases, such as "share," "follow," and "comment," which are often repeated irrespective of topic.

- **Multilingual processing:** Since our corpus is multilingual, language-dependent preprocessing will be applied only to messages in the most frequent languages, as the other languages contribute only a marginal number of messages.
- **TF-IDF filtering:** We will filter out words below a certain TF-IDF threshold to ensure that only distinctive terms are included in the aggregation.

Afterwards, we will aggregate the messages and webpage previews for each chat into a single string. This string, along with the chat vector representations created earlier, will be passed to BERTopic as a basis for its topic description.

#### 1. Preprocess Data

**Define the Preprocessing Function**

In [21]:
def preprocess(df:pd.DataFrame, text_column: str, min_handle_count: int = 100) -> pd.DataFrame:
    """
    Clean and tokenize the texts stored in `text_column`.

    The following steps are applied in order: removal of frequently referenced
    chat handles, URL removal, lowercasing, punctuation removal, digit removal,
    language-specific stop word removal (English and German rows only) and
    tokenization with NLTK's `word_tokenize`.

    Two columns are added to `df`: `{text_column}_cleaned` (text after the
    cleaning steps) and `{text_column}_preprocessed` (tokens joined by single
    spaces). Note that `df` is modified in place and also returned.

    Args:
        df (pd.DataFrame): Data containing `text_column`, "referenced_chat_handles"
            and a language column (see below).
        text_column (str): Name of the column holding the raw texts.
        min_handle_count (int): A chat handle is removed from the texts if it is
            referenced more often than this number of times in the whole dataset.

    Returns:
        pd.DataFrame: `df` with the two additional columns.
    """

    print("Preprocessing messages...")

    cleaned_column = f"{text_column}_cleaned"
    preprocessed_column = f"{text_column}_preprocessed"

    # the language of a text is stored in "<text_column>_lang"; fall back to the
    # message language if no column-specific language information is available
    lang_column = f"{text_column}_lang"
    if lang_column not in df.columns:
        lang_column = "message_text_lang"

    # get stop words 
    stop_words_en = set(stopwords.words('english'))
    stop_words_de = set(stopwords.words('german'))

    cta_stop_words_en = {'click', 'tap', 'press', 'subscribe', 'follow', 'share', 'like', 'comment',
                        'join', 'sign', 'visit', 'download', 'register', 'give', 'message', 'chat', 'group', 'channel', 'bot', 'reply'}
    cta_stop_words_de = {'klicken', 'tippen', 'drücken', 'abonnieren', 'folgen', 'teilen', 'mögen', 'kommentieren',
                        'beitreten', 'anmelden', 'besuchen', 'herunterladen', 'registrieren', 'geben', 'message', 'chat', 'group', 'channel', 'bot', 'reply'}

    stop_words_en = stop_words_en.union(cta_stop_words_en)
    stop_words_de = stop_words_de.union(cta_stop_words_de)
    print("Stop words loaded")

    # get frequent chat handles (stored without a leading "@", so that both
    # "@handle" and "handle" can be matched in the texts)
    handle_counts = df["referenced_chat_handles"].explode().value_counts()
    frequent_chat_handles = [
        str(handle).strip().lstrip("@")
        for handle in handle_counts[handle_counts > min_handle_count].index
    ]
    frequent_chat_handles = [handle for handle in frequent_chat_handles if handle]
    print("Frequent chat handles loaded")

    # create regex patterns
    def create_pattern(words):
        return rf'\b(?:{"|".join(map(re.escape, words))})\b'

    stop_words_en_pattern = create_pattern(stop_words_en)
    stop_words_de_pattern = create_pattern(stop_words_de)
    print("Regex-Patterns created")

    # remove the most frequent chat handles
    cleaned = df[text_column]
    if frequent_chat_handles:
        # "\b" cannot be used here: there is no word boundary between a space and
        # "@", so "\b@handle" never matches. Lookarounds allow an optional "@".
        frequent_chat_pattern = rf'(?<!\w)@?(?:{"|".join(map(re.escape, frequent_chat_handles))})(?!\w)'
        cleaned = cleaned.str.replace(frequent_chat_pattern, '', regex=True, flags=re.IGNORECASE)
    cleaned = cleaned.str.strip()
    print("Handles removed")

    # remove URLs
    pattern = r"(https?:\/\/[^\s/$.?#].[^\s]*[^\s.,?!)](?![\])]))|(www\.[^\s/$.?#].[^\s]*[^\s.,?!)](?![\])]))|(t\.me\/[^\s.,?!)]*)"
    cleaned = cleaned.str.replace(pattern, '', regex=True).str.strip()
    print("URLs removed")

    # lowercase text
    cleaned = cleaned.str.lower()
    print("Lowercase")

    # remove punctuation (replaced by spaces, afterwards whitespace is collapsed)
    cleaned = (
        cleaned
        .str.replace(f"[{re.escape(string.punctuation)}]", ' ', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )
    print("Punctuation removed")

    # remove digits
    cleaned = cleaned.str.replace(r'\d+', '', regex=True).str.strip()
    print("Digits removed")

    # remove stop words, only for rows written in the respective language
    for language, stop_words_pattern in (("English", stop_words_en_pattern),
                                         ("German", stop_words_de_pattern)):
        language_mask = df[lang_column] == language
        cleaned.loc[language_mask] = (
            cleaned.loc[language_mask]
            .str.replace(stop_words_pattern, '', regex=True, flags=re.IGNORECASE)
            .str.strip()
        )
        print(f"{language} stop words removed")

    # fill NaN with empty string
    df[cleaned_column] = cleaned.fillna('')

    # tokenize the text and join the tokens again; this also normalizes the
    # whitespace left behind by the removal steps above
    #TODO:  Lemmatize???
    #TODO: Remove Emoj
    df[preprocessed_column] = df[cleaned_column].apply(lambda x: ' '.join(word_tokenize(x)) if x else '')
    print("Tokenized")

    return df


**Apply Preprocessing to Message Texts**

In [None]:
# check, if the data was already preprocessed
# (a pickle file at this path is treated as the cached result of a previous run)
preprocessen_msg_path = os.path.join(os.getcwd(), '../data/preprocessed/preprocessed_msgs_viz.pkl')
already_preprocessed = os.path.exists(preprocessen_msg_path)
already_preprocessed

In [None]:
if already_preprocessed:
    # reuse the cached result of an earlier run
    print("Loading preprocessed messages...")
    preprocessed_msg = pd.read_pickle(preprocessen_msg_path)
    df["message_text_preprocessed"] = preprocessed_msg

else:
    # run the preprocessing and cache the resulting column
    df = preprocess(df, "message_text")
    df["message_text_preprocessed"].to_pickle(preprocessen_msg_path)
    print("Preprocessed messages saved")

# display five random samples without truncating long texts
with pd.option_context('display.max_colwidth', None):
    display(
        df[["message_text", "message_text_preprocessed", "referenced_chat_handles"]].sample(5)
    )

**Apply Preprocessing to Webpage Previews**

In [None]:
# check, if the data was already preprocessed
# (this overwrites the `already_preprocessed` flag set for the message texts above)
preprocessen_web_path = os.path.join(os.getcwd(), '../data/preprocessed/preprocessed_web_viz.pkl')
already_preprocessed = os.path.exists(preprocessen_web_path)
already_preprocessed

In [25]:
# only preprocess the webpage previews if no cached result exists
if not already_preprocessed:

    # apply preprocessing
    df = preprocess(df, "webpage_description")

**Remove Emojis**

In [None]:
# create a list of emoji-unicodes using data from "https://unicode.org/Public/emoji/15.1/"
if not already_preprocessed:
    
    def load_emoji_list(file_paths: list[str]) -> list[str]:
        """
        Load a list of all emoji from the given file paths.

        Lines of the form "XXXX..YYYY ; ..." are expanded into one entry per code
        point of the range. Lines of the form "XXXX [XXXX ...] ; ..." become one
        entry made of all listed code points. All other lines are ignored.

        Args:
            file_paths (list): A list of file paths to load emoji sequences from.
        Returns:
            list: A list of unicode sequences representing the loaded emoji sequences.
        """
        
        unicode_list = []

        # match lines with unicode, including ranges like 231A..231B 
        range_pattern = re.compile(r"([0-9A-Fa-f]{4,6})\.\.([0-9A-Fa-f]{4,6})\s*;\s*")
        code_point_pattern = re.compile(r"([0-9A-Fa-f]{4,6}(?:\s[0-9A-Fa-f]{4,6})*)\s*;\s*")

        for file_path in file_paths:
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    range_match = range_pattern.match(line)

                    # add elements of ranges as individual codes to list
                    if range_match:
                        start_code, end_code = range_match.groups()
                        start_int = int(start_code, 16)
                        end_int = int(end_code, 16)
                        unicode_list.extend(chr(code) for code in range(start_int, end_int + 1))
                        continue

                    code_match = code_point_pattern.match(line)
                    if code_match:
                        # create (zwj) sequences by combining all code points of the line
                        code_point_list = code_match.group(1).split()
                        unicode_list.append(''.join(chr(int(code, 16)) for code in code_point_list))
        print("Emoji sequences loaded")
        return unicode_list

    # list the paths to the unicode-files
    path_1 = os.path.join(os.getcwd(), "../data/auxiliary/emoji_unicode/emoji-sequences.txt")
    path_2 = os.path.join(os.getcwd(), "../data/auxiliary/emoji_unicode/emoji-test.txt")
    path_3 = os.path.join(os.getcwd(), "../data/auxiliary/emoji_unicode/emoji-zwj-sequences.txt")
    file_paths = [path_1, path_2, path_3]

    # load all emojis from the unicode-files
    emoji_sequences = load_emoji_list(file_paths)

    # an empty pattern would match between all characters and insert spaces everywhere
    if not emoji_sequences:
        raise ValueError("No emoji sequences could be loaded from the unicode files")

    # Deduplicate and put the longest sequences first: regex alternation picks the
    # first alternative that matches, so multi-code-point sequences must be tried
    # before the single emoji they start with. Otherwise joiners/modifiers are left behind.
    emoji_sequences = sorted(set(emoji_sequences), key=lambda sequence: (-len(sequence), sequence))

    # create a regex pattern from the emoji sequence
    emoji_pattern = '|'.join(re.escape(emoji) for emoji in emoji_sequences)
    print("Emoji pattern created")

    def demojize_chunk(chunk, emoji_pattern):
        # replace every emoji in a chunk (pd.Series of texts) by a space
        return chunk.str.replace(emoji_pattern, " ", regex=True)

    n_jobs = 3  # Use three cores (seems to be fastest?)

    # Split only the text column into chunks (by position), so that the workers
    # do not need to receive a copy of the entire DataFrame
    web_texts = df["webpage_description_preprocessed"]
    chunk_positions = np.array_split(np.arange(len(web_texts)), n_jobs)
    chunks = [web_texts.iloc[positions] for positions in chunk_positions]

    # remove emojis in parallel for each chunk
    cleaned_chunks = Parallel(n_jobs=n_jobs)(delayed(demojize_chunk)(chunk, emoji_pattern) for chunk in chunks)

    # Write the result back by position. The index of `df` is left untouched, as
    # later cells select rows of `df` by index label.
    df["webpage_description_preprocessed"] = pd.concat(cleaned_chunks).to_numpy()

    # save the preprocessed data
    df["webpage_description_preprocessed"].to_pickle(preprocessen_web_path)
    print("Preprocessed webpage previews saved")

# simply load the preprocessed data, if it was already preprocessed
else:
    print("Loading preprocessed webpage previews...")
    preprocessed_web_previews = pd.read_pickle(preprocessen_web_path)
    df["webpage_description_preprocessed"] = preprocessed_web_previews

#### 2. Apply TF-IDF-Filtering

**Apply TF-IDF-Filtering to Message Texts**

In [None]:
# check, if the TF-IDF-filtered message texts were already cached by a previous run
tfidf_path = os.path.join(os.getcwd(), '../data/preprocessed/tfidf_msgs_viz.pkl')
already_tfidf = os.path.exists(tfidf_path)
already_tfidf

In [None]:
if not already_tfidf:
    # only words with a TF-IDF weight above this value are kept (must be >= 0)
    tfidf_threshold = 0.15

    preprocessed_texts = df["message_text_preprocessed"]

    # texts in other languages than English/German are kept unchanged
    filtered_texts = preprocessed_texts.to_numpy(dtype=object, copy=True)

    for language in ("English", "German"):
        # isolate the texts of one language
        language_mask = df["message_text_lang"].eq(language).fillna(False).to_numpy(dtype=bool)
        if not language_mask.any():
            print(f"No {language} texts found, skipping TF-IDF filtering for this language")
            continue

        # Fit the vectorizer on all texts of the language and transform them in one
        # sparse batch, instead of creating a dense vocabulary-sized vector per row
        tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1))
        tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_texts[language_mask]).tocsr()
        tfidf_matrix.sort_indices()  # keeps the words of each text in vocabulary order
        feature_names = tfidf_vectorizer.get_feature_names_out()

        # walk over the rows of the sparse matrix and keep the distinctive words
        distinctive_texts = []
        for row_start, row_end in zip(tfidf_matrix.indptr[:-1], tfidf_matrix.indptr[1:]):
            word_ids = tfidf_matrix.indices[row_start:row_end]
            weights = tfidf_matrix.data[row_start:row_end]
            distinctive_texts.append(' '.join(feature_names[word_ids[weights > tfidf_threshold]]))

        filtered_texts[language_mask] = np.array(distinctive_texts, dtype=object)

    # TODO: Change column name
    # store the filtered texts (assigned by position, the index of df is not changed)
    df["message_text_tfidf"] = filtered_texts

    # save the preprocessed messages
    df["message_text_tfidf"].to_pickle(tfidf_path)

else:
    print("Loading tf-idf filtered messages...")
    tfidf_filtered_msg = pd.read_pickle(tfidf_path)
    df["message_text_tfidf"] = tfidf_filtered_msg

**Apply TF-IDF-Filtering to Webpage Previews**

In [None]:
# check, if the TF-IDF-filtered webpage previews were already cached by a previous run
# (reuses the names `tfidf_path` / `already_tfidf` from the message-text cells)
tfidf_path = os.path.join(os.getcwd(), '../data/preprocessed/tfidf_web_viz.pkl')
already_tfidf = os.path.exists(tfidf_path)
already_tfidf

In [None]:
if not already_tfidf:
    # only words with a TF-IDF weight above this value are kept (must be >= 0)
    tfidf_threshold = 0.15

    preprocessed_previews = df["webpage_description_preprocessed"]

    # previews in other languages than English/German are kept unchanged
    filtered_previews = preprocessed_previews.to_numpy(dtype=object, copy=True)

    for language in ("English", "German"):
        # isolate the previews of one language
        language_mask = df["webpage_description_lang"].eq(language).fillna(False).to_numpy(dtype=bool)
        if not language_mask.any():
            print(f"No {language} webpage previews found, skipping TF-IDF filtering for this language")
            continue

        # Fit the vectorizer on all previews of the language and transform them in one
        # sparse batch, instead of creating a dense vocabulary-sized vector per row
        tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1))
        tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_previews[language_mask]).tocsr()
        tfidf_matrix.sort_indices()  # keeps the words of each preview in vocabulary order
        feature_names = tfidf_vectorizer.get_feature_names_out()

        # walk over the rows of the sparse matrix and keep the distinctive words
        distinctive_previews = []
        for row_start, row_end in zip(tfidf_matrix.indptr[:-1], tfidf_matrix.indptr[1:]):
            word_ids = tfidf_matrix.indices[row_start:row_end]
            weights = tfidf_matrix.data[row_start:row_end]
            distinctive_previews.append(' '.join(feature_names[word_ids[weights > tfidf_threshold]]))

        filtered_previews[language_mask] = np.array(distinctive_previews, dtype=object)

    # store the filtered previews (assigned by position, the index of df is not changed)
    df["webpage_description_tfidf"] = filtered_previews

    # save the preprocessed webpage previews
    df["webpage_description_tfidf"].to_pickle(tfidf_path)

else:
    print("Loading tf-idf-filtered webpage previews...")
    tfidf_filtered_msg = pd.read_pickle(tfidf_path)
    df["webpage_description_tfidf"] = tfidf_filtered_msg

#### 3. Create Chat-Text-Aggregations for each Chat

In [None]:
# build one document per chat by joining all of its TF-IDF-filtered message texts
grouped = df.groupby("telegram_chat_id")
chat_texts = (
    grouped["message_text_tfidf"]
    .agg(" ".join)
    .astype(str)
)
chat_texts

#### 4. Create Webpage-Preview-Aggregations for each Chat

In [None]:
# build one document per chat by joining all of its TF-IDF-filtered webpage previews
grouped = df.groupby("telegram_chat_id")
chat_webpage_previews = (
    grouped["webpage_description_tfidf"]
    .agg(" ".join)
    .astype(str)
)
chat_webpage_previews

## 2. Basic Chat Embeddings

#### 1. Load Chat Representations

First, we load the chat representations we created in the notebook `02_feature_engineering`

In [None]:
base_path = os.path.join(os.getcwd(), '../features/0_base_chat_vectors.npy')

# np.load returns a NumPy array, which has neither `.iloc` nor `.combine`. We
# therefore wrap the loaded vectors in a Series holding one vector per chat.
# NOTE: allow_pickle=True executes pickled code - only load files you created yourself.
# NOTE(review): the Series gets a positional index; the chat ids are not stored in the .npy file.
base_chat_vectors = pd.Series(list(np.load(base_path, allow_pickle=True)))
print(f"Number of chat vectors: {base_chat_vectors.shape[0]}")
print(f"Vector Dimension: {base_chat_vectors.iloc[0].shape}")
base_chat_vectors.head()

#### 2. Cluster the embeddings

Now we can use BERTopic to cluster the embeddings.

In [None]:
# The sentence transformer that produced the embeddings is also handed to BERTopic,
# which uses it for the representation model
model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
current_path = os.getcwd()
model_dir = os.path.join(current_path, "../data/models/")
model_path = os.path.join(model_dir, model_name)

# Use the local copy if there is one, otherwise download the model and store it
if os.path.isdir(model_path):
    print(f"Model already downloaded. Loading...")
    model = SentenceTransformer(model_path)
else:
    print("Model not found. Downloading...")
    model = SentenceTransformer(model_name)
    model.save(model_path)
    print(f"Model saved to {model_path}")

##### Use KeyBERTInspired



In [None]:
def apply_bertTopic(chat_embeddings: pd.Series, chat_texts: pd.Series): 
    """
    Cluster precomputed chat vectors with BERTopic.

    The clustering is based on `chat_embeddings`; `chat_texts` are passed as the
    documents from which BERTopic derives the topic representations
    (using `KeyBERTInspired`). The globally loaded sentence transformer `model`
    is passed to BERTopic as embedding model.

    Args:
        chat_embeddings: One vector per chat (anything `np.vstack` can stack).
        chat_texts: One text aggregation per chat, in the same order as `chat_embeddings`.

    Returns:
        tuple: (topics, probabilities, fitted BERTopic model)

    Raises:
        ValueError: If the number of vectors and the number of texts differ.
    """

    # prepare the embeddings by stacking them into a 2D array (one row per chat)
    chat_embeddings = np.vstack(chat_embeddings)
    docs = list(chat_texts)

    # BERTopic needs exactly one document per embedding; fail early with a clear message
    if chat_embeddings.shape[0] != len(docs):
        raise ValueError(
            f"Number of chat vectors ({chat_embeddings.shape[0]}) does not match "
            f"the number of chat texts ({len(docs)})"
        )
    print("Preperation: Done")

    # create your representation model
    representation_model = KeyBERTInspired() #TODO: Configure?

    #TODO: Random state?

    # initiate the BERTopic model
    topic_model = BERTopic(embedding_model=model, 
                           verbose=True, 
                           calculate_probabilities=True, 
                           representation_model=representation_model)
    print("Loading model: Done")

    # fit the model to the precomputed embeddings
    topics, propabilities = topic_model.fit_transform(embeddings = chat_embeddings, documents = docs)
    print("Model fitting: Done")

    return topics, propabilities, topic_model

# cluster the base chat vectors; the chat texts serve as documents for the topic representations
topics, propabilities, topic_model = apply_bertTopic(base_chat_vectors, chat_texts)

#### 3. Visualise and explore the results

In [None]:
def create_topic_visualisations(topic_model, embeddings, texts_aggregations,
                                top_n_topics: int = 13, n_words: int = 20, random_state=None):
    """
    Show a document map, a topic bar chart and the topic hierarchy of a fitted model.

    Args:
        topic_model: The fitted BERTopic model.
        embeddings: The chat vectors the model was fitted on (anything `np.vstack` can stack).
        texts_aggregations: The documents the model was fitted on, in the same order.
        top_n_topics (int): Number of topics shown in the bar chart.
        n_words (int): Number of words requested per topic in the bar chart.
        random_state: Optional seed for the UMAP projection of the document map.
            The default `None` leaves the projection unseeded.
    """
    # imported here, as UMAP is only needed for this visualisation
    from umap import UMAP

    docs = list(texts_aggregations)
    embeddings = np.vstack(embeddings)

    print("Topic Map:")
    # reduce the embeddings to two dimensions for plotting
    reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine',
                              random_state=random_state).fit_transform(embeddings)

    # Visualize documents using UMAP embeddings
    topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings).show()

    print(f"Bar Chart, displaying the top {top_n_topics} topics and top {n_words} words per topic:")
    topic_model.visualize_barchart(top_n_topics=top_n_topics, n_words=n_words).show()

    print("Hierarchical Topics:")
    # pass the documents as a list, exactly as they are passed to visualize_documents
    hierarchical_topics = topic_model.hierarchical_topics(docs)
    topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).show()

# visualise the topics found for the base chat vectors
create_topic_visualisations(topic_model, base_chat_vectors, chat_texts)

#### 4. Evaluate Clusters

In [None]:
def get_evaluations(chat_embeddings, propabilities, topic_model, text_aggregations):
    """
    Compute simple quality measures for a fitted topic model.

    Args:
        chat_embeddings: The chat vectors the model was fitted on (anything `np.vstack` can stack).
        propabilities: Not used by this function; kept so that existing calls keep working.
        topic_model: The fitted BERTopic model.
        text_aggregations: The documents the model was fitted on, in the same order
            as `chat_embeddings`.

    Returns:
        silhouette_score_result (float): The silhouette score of the chat embeddings,
            computed without the "Other" (-1) topic. NaN if the score is undefined,
            i.e. if there are fewer than two topics besides "Other" or every
            remaining chat forms its own topic.
        topic_count (int): The number of topics (including the "Other" (-1) topic)
        noise_count (int): The number of chats assigned to the "Other" (-1) topic
    """

    # get the topic assigned to each document
    document_info = topic_model.get_document_info(text_aggregations)
    topics = document_info['Topic'].to_numpy()

    # prepare the embeddings by stacking them
    chat_embeddings = np.vstack(chat_embeddings)

    # ignore the "Other" topic for the silhouette score
    is_clustered = topics != -1
    filtered_embeddings = chat_embeddings[is_clustered]
    filtered_topics = topics[is_clustered]

    # the silhouette score is only defined for 2 <= number of labels <= number of samples - 1
    cluster_count = len(np.unique(filtered_topics))
    if 2 <= cluster_count < len(filtered_topics):
        silhouette_score_result = silhouette_score(X=filtered_embeddings, labels=filtered_topics)
    else:
        print("Silhouette Score is undefined for this clustering")
        silhouette_score_result = float("nan")
    print(f'Silhouette Score: {silhouette_score_result}')

    # calculate the number of topics found
    topic_count = len(np.unique(topics))
    print(f'Topic Count: {topic_count}')

    # calculate the number of noise points
    noise_count = int(np.sum(~is_clustered))
    print(f'Noise Count: {noise_count}')

    return silhouette_score_result, topic_count, noise_count
    
# keep the evaluation results of the base chat vectors for the final comparison
(ss_base_embeddings, 
 topic_count_base_embeddings, 
 noise_base_embeddings) = get_evaluations(base_chat_vectors, 
                                          propabilities, 
                                          topic_model, 
                                          chat_texts)

## 3. Filtered Chat Embeddings

#### 1. Load the filtered Chat Embeddings

First, we load the chat representations we created in the notebook `02_feature_engineering`

In [None]:
filtered_path = os.path.join(os.getcwd(), '../features/1_filtered_chat_vectors.npy')

# np.load returns a NumPy array, which has no `.iloc`. We therefore wrap the
# loaded vectors in a Series holding one vector per chat.
# NOTE: allow_pickle=True executes pickled code - only load files you created yourself.
filtered_chat_vectors = pd.Series(list(np.load(filtered_path, allow_pickle=True)))
print(f"Number of chat vectors: {filtered_chat_vectors.shape[0]}")
print(f"Vector Dimension: {filtered_chat_vectors.iloc[0].shape}")
filtered_chat_vectors.head()

#### 2. Create filtered Chat-Message-Text-Aggregations

Now we filter the dataset to remove all forwarded/original message pairs, using the indices we saved in `02_feature_engineering`.

In [None]:
# load the saved row indices and select the matching rows of df
# NOTE: `.loc` selects by index label, so the index of df has to be the one the indices were saved for
indices_path = os.path.join(os.getcwd(), "../features/1_implicit_ref_filtered_indices.npy")
filtered_rows_indices = np.load(indices_path)
df_references_filtered = df.loc[filtered_rows_indices]
df_references_filtered.shape

As the dataset already contains the preprocessed message texts, we simply need to aggregate them again to create Chat-Message-Text-Aggregations that exclude original/forward pairs.

In [None]:
# build one document per chat from the remaining (filtered) rows
grouped = df_references_filtered.groupby("telegram_chat_id")
filtered_chat_texts = (
    grouped["message_text_tfidf"]
    .agg(" ".join)
    .astype(str)
)
filtered_chat_texts

#### 3. Cluster the embeddings

Now we can use BERTopic to cluster the embeddings.

In [None]:
# The sentence transformer that produced the embeddings is also handed to BERTopic,
# which uses it for the representation model
model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
current_path = os.getcwd()
model_dir = os.path.join(current_path, "../data/models/")
model_path = os.path.join(model_dir, model_name)

# Use the local copy if there is one, otherwise download the model and store it
if os.path.isdir(model_path):
    print(f"Model already downloaded. Loading...")
    model = SentenceTransformer(model_path)
else:
    print("Model not found. Downloading...")
    model = SentenceTransformer(model_name)
    model.save(model_path)
    print(f"Model saved to {model_path}")

In [None]:
# cluster the filtered chat vectors; the filtered chat texts serve as documents
topics, propabilities, topic_model = apply_bertTopic(filtered_chat_vectors, filtered_chat_texts)

#### 3. Visualise and explore the results

In [None]:
# visualise the topics found for the filtered chat vectors
create_topic_visualisations(topic_model, filtered_chat_vectors, filtered_chat_texts)

#### 4. Evaluate the results

In [None]:
# keep the evaluation results of the filtered chat vectors for the final comparison
(ss_filtered_embeddings, 
 topic_count_filtered_embeddings,
 noise_filtered_embeddings) = get_evaluations(filtered_chat_vectors,
                                              propabilities,
                                              topic_model,
                                              filtered_chat_texts)

## 4. Webpage Preview Embeddings

#### 1. Load Chat Representations

First, we load the chat representations we created in the notebook `02_feature_engineering`

In [None]:
webpreview_path = os.path.join(os.getcwd(), '../features/3_webpreview_chat_vectors.npy')

# np.load returns a NumPy array, which has no `.iloc`. We therefore wrap the
# loaded vectors in a Series holding one vector per chat.
# NOTE: allow_pickle=True executes pickled code - only load files you created yourself.
webpreview_chat_vectors = pd.Series(list(np.load(webpreview_path, allow_pickle=True)))
print(f"Number of chat vectors: {webpreview_chat_vectors.shape[0]}")
print(f"Vector Dimension: {webpreview_chat_vectors.iloc[0].shape}")
webpreview_chat_vectors.head()

#### 2. Cluster the embeddings

Now we can use BERTopic to cluster the embeddings.

In [None]:
# The sentence transformer that produced the embeddings is also handed to BERTopic,
# which uses it for the representation model
model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
current_path = os.getcwd()
model_dir = os.path.join(current_path, "../data/models/")
model_path = os.path.join(model_dir, model_name)

# Use the local copy if there is one, otherwise download the model and store it
if os.path.isdir(model_path):
    print(f"Model already downloaded. Loading...")
    model = SentenceTransformer(model_path)
else:
    print("Model not found. Downloading...")
    model = SentenceTransformer(model_name)
    model.save(model_path)
    print(f"Model saved to {model_path}")

In [None]:
# cluster the webpage-preview chat vectors; the preview aggregations serve as documents
topics, propabilities, topic_model = apply_bertTopic(webpreview_chat_vectors, chat_webpage_previews)

#### 3. Visualize and explore the Results

In [None]:
# visualise the topics found for the webpage-preview chat vectors
create_topic_visualisations(topic_model, webpreview_chat_vectors, chat_webpage_previews)

#### 4. Evaluate the Results

In [None]:
# keep the evaluation results of the webpage-preview chat vectors for the final comparison
(ss_webpreview_embeddings, 
 topic_count_webpreview_embeddings,
 noise_webpreview_embeddings) = get_evaluations(webpreview_chat_vectors, 
                                                propabilities,
                                                topic_model,
                                                chat_webpage_previews)

## 5. Combined Message & Webpage-Preview Embeddings

Next, we combine the two kinds of text embeddings and inspect the changes in clustering results.

#### 1. Combine Message-Text- and Webpage-Preview-Vectors

First, we load the chat-vectors we created by taking the mean of the webpage-preview vectors and the message vectors.

In [53]:
# load the combined message/webpage-preview chat vectors
# NOTE: allow_pickle=True executes pickled code - only load files you created yourself
combine_vectors_path = os.path.join(os.getcwd(), '../features/3_msg_webpreview_chat_vectors.npy')
combined_vectors = np.load(combine_vectors_path, allow_pickle=True)

#### 2. Create combined Chat-Text-Aggregations

Next, we aggregate the Text-Aggregations for Webpage-Previews and Chat-Messages in order to use them  to make the topics interpretable

In [None]:
# put both text aggregations side by side (one row per chat)
combined_text_dataframe = pd.DataFrame({
    "chat_texts": chat_texts,
    "chat_webpage_previews": chat_webpage_previews
})

# join the message texts and the webpage previews of each chat with a space
combined_text_dataframe["combined_texts"] = (
    combined_text_dataframe["chat_texts"] + " " + combined_text_dataframe["chat_webpage_previews"]
)

# count the words of every text column
for text_column in ("chat_texts", "chat_webpage_previews", "combined_texts"):
    combined_text_dataframe[f"{text_column}_len"] = combined_text_dataframe[text_column].apply(lambda x: len(x.split()))

# sanity check: the combined texts have to contain exactly the words of both parts
expected_len = combined_text_dataframe["chat_texts_len"] + combined_text_dataframe["chat_webpage_previews_len"]
assert combined_text_dataframe["combined_texts_len"].equals(expected_len)

# get the combined texts
combined_texts = combined_text_dataframe["combined_texts"]
combined_texts

#### 3. Cluster the Combined Embeddings

In [None]:
# Cluster the combined vectors. The probabilities are kept as well, because the
# evaluation cell passes `propabilities` on and would otherwise receive the
# stale values of the previously fitted model.
topics, propabilities, topic_model = apply_bertTopic(combined_vectors, combined_texts)

#### 4. Visualise and explore the Results

In [None]:
# visualise the topics found for the combined message/webpage-preview vectors
create_topic_visualisations(topic_model, combined_vectors, combined_texts)

#### 5. Evaluate the Results

In [None]:
# keep the evaluation results of the combined message/webpage-preview vectors for the final comparison
(ss_msg_webpreview_embeddings, 
 topic_count_msg_webpreview_embeddings, 
 noise_count_msg_webpreview_embeddings) = get_evaluations(combined_vectors,
                                                          propabilities, 
                                                          topic_model,
                                                          combined_texts)

## 6. Structural Vectors

Now, we conduct chat-clustering using a chat's structural attributes. Structural attributes are a chat's connections to other Telegram entities.

For our purposes, we have considered two kinds of connections:

1. Forwarded (fwd) messages between chats.

2. Textual references (ref) to chats or other telegram-entities.

#### 1. Load the Chat-Vectors

To vectorize these connections, we created chat-chat-matrices based on forwards and text based references between chats in `02_feature_engineering`, which we'll load now.

In [58]:
# forward-based chat vectors (log-scaled and one-hot variant)
fwd_log_path = os.path.join(os.getcwd(), '../features/2_log_fwd_vectors.pkl')
fwd_log_vectors = pd.read_pickle(fwd_log_path)
fwd_onehot_path = os.path.join(os.getcwd(), '../features/2_onehot_fwd_vectors.pkl')
fwd_onehot_vectors = pd.read_pickle(fwd_onehot_path)

# reference-based chat vectors (log-scaled and one-hot variant)
ref_log_path = os.path.join(os.getcwd(), '../features/2_log_ref_vectors.pkl')
ref_log_vectors = pd.read_pickle(ref_log_path)
ref_onehot_path = os.path.join(os.getcwd(), '../features/2_onehot_ref_vectors.pkl')
ref_onehot_vectors = pd.read_pickle(ref_onehot_path)



Now we combine the forward-based and the reference-based chat-vectors to create our feature.

In [59]:
# append the reference-based vector of each chat to its forward-based vector
structure_log_vectors = fwd_log_vectors.combine(
    ref_log_vectors, lambda x, y: np.concatenate([x,y])
)
structure_onehot_vectors = fwd_onehot_vectors.combine(
    ref_onehot_vectors, lambda x, y: np.concatenate([x,y])
)

# sanity check on the second chat: the combined dimension is the sum of both parts
dimension_fwd_vectors = len(fwd_log_vectors.iloc[1])
dimension_ref_vectors = len(ref_log_vectors.iloc[1])
for structure_vectors in (structure_log_vectors, structure_onehot_vectors):
    assert len(structure_vectors.iloc[1]) == dimension_fwd_vectors + dimension_ref_vectors

#### 2. Cluster the Structural Vectors

Even though they are not text embeddings, we will pass the structural vectors to BERTopic for clustering. This is possible, as BERTopic can accept any kind of numerical custom embeddings instead of generating them from text.

We will again use the TF-IDF-filtered Message-Text-Aggregations (`chat_texts`) to make the topics interpretable. These documents will only be used for topic labeling and interpretation. The clustering itself will be entirely driven by the chat-chat-matrices we pass as embeddings.


In [None]:
# Cluster the structural vectors. The probabilities are kept as well, because the
# evaluation cell passes `propabilities` on and would otherwise receive the
# stale values of the previously fitted model.
topics, propabilities, topic_model = apply_bertTopic(structure_log_vectors, chat_texts)

#### 3. Visualise and Explore the results

In [None]:
# visualise the topics found for the structural vectors
create_topic_visualisations(topic_model, structure_log_vectors, chat_texts)

#### 4. Evaluate the Results

In [None]:
# keep the evaluation results of the structural vectors for the final comparison
(ss_structural_embeddings, 
topic_count_structural_embeddings, 
noise_structural_embeddings) = get_evaluations(structure_log_vectors,
                                               propabilities,
                                               topic_model, 
                                               chat_texts)

## 7. Combined Structural Vectors & Message Embeddings

Now, we combine the message embeddings with a chat's structural attributes and cluster the chats based on both. Structural attributes are a chat's connections to other Telegram entities.

For our purposes, we have considered two kinds of connections:

1. Forwarded (fwd) messages between chats.

2. Textual references (ref) to chats or other telegram-entities.

#### 1. Load the Chat-Vectors

To vectorize these connections, we created chat-chat-matrices based on forwards and text based references between chats in `02_feature_engineering`, which we'll load now.

In [63]:
# forward-based chat vectors (log-scaled and one-hot variant)
fwd_log_path = os.path.join(os.getcwd(), '../features/2_log_fwd_vectors.pkl')
fwd_log_vectors = pd.read_pickle(fwd_log_path)
fwd_onehot_path = os.path.join(os.getcwd(), '../features/2_onehot_fwd_vectors.pkl')
fwd_onehot_vectors = pd.read_pickle(fwd_onehot_path)

# reference-based chat vectors (log-scaled and one-hot variant)
ref_log_path = os.path.join(os.getcwd(), '../features/2_log_ref_vectors.pkl')
ref_log_vectors = pd.read_pickle(ref_log_path)
ref_onehot_path = os.path.join(os.getcwd(), '../features/2_onehot_ref_vectors.pkl')
ref_onehot_vectors = pd.read_pickle(ref_onehot_path)



Now we combine the forward-based and the reference-based chat-vectors to create our feature.

In [64]:
# append the reference-based vector of each chat to its forward-based vector
structure_log_vectors = fwd_log_vectors.combine(
    ref_log_vectors, lambda x, y: np.concatenate([x,y])
)
structure_onehot_vectors = fwd_onehot_vectors.combine(
    ref_onehot_vectors, lambda x, y: np.concatenate([x,y])
)

# sanity check on the second chat: the combined dimension is the sum of both parts
dimension_fwd_vectors = len(fwd_log_vectors.iloc[1])
dimension_ref_vectors = len(ref_log_vectors.iloc[1])
for structure_vectors in (structure_log_vectors, structure_onehot_vectors):
    assert len(structure_vectors.iloc[1]) == dimension_fwd_vectors + dimension_ref_vectors

#### 2. Concatenate the Features

To supplement the text embeddings with structural information while still using BERTopic for clustering, we'll concatenate the new feature vectors and the chat embeddings.

In [None]:
# combine the message embedding based chat vectors with the log-scaled structure-based chat vectors
if isinstance(base_chat_vectors, pd.Series) and not isinstance(base_chat_vectors.index, pd.RangeIndex):
    # both Series carry index labels: align the vectors on these labels
    combined_vectors = base_chat_vectors.combine(structure_log_vectors, lambda x, y: np.concatenate([x, y]))
else:
    # The base chat vectors are loaded from a .npy file and therefore carry no chat
    # labels, so the vectors are paired by position.
    # NOTE(review): this assumes both vector collections list the chats in the same order - confirm.
    base_vectors = list(base_chat_vectors)
    structure_vectors = list(structure_log_vectors)
    assert len(base_vectors) == len(structure_vectors), "Both vector collections must contain one vector per chat"
    combined_vectors = pd.Series(
        [np.concatenate([x, y]) for x, y in zip(base_vectors, structure_vectors)],
        index=structure_log_vectors.index,
    )

# check if the combined vectors have the expected length of a sum of the message text embeddings and the structure-based vectors
assert len(combined_vectors.iloc[0]) == len(list(base_chat_vectors)[0]) + len(structure_log_vectors.iloc[0])

combined_vectors.head()

#### 3. Cluster the Combined Chat Vectors

Now, we can cluster the resulting combined vectors using BERTopic. As we only use the messages as a textual feature, we will reuse the TF-IDF-filtered Chat-Message-Text-Aggregations (`chat_texts`) we created earlier.

In [None]:
# Cluster the combined vectors. The probabilities are kept as well, because the
# evaluation cell passes `propabilities` on and would otherwise receive the
# stale values of the previously fitted model.
topics, propabilities, topic_model = apply_bertTopic(combined_vectors, chat_texts)

#### 4. Visualise and explore the results

In [None]:
# visualise the topics found for the combined message/structure vectors
create_topic_visualisations(topic_model, combined_vectors, chat_texts)

#### 5. Evaluate the Results

In [None]:
(ss_msg_structural_embeddings, 
 topic_count_msg_structural_embeddings,
 noise_structural_embeddings) = get_evaluations(combined_vectors,
                                                propabilities, 
                                                topic_model, 
                                                chat_texts)

## 8. Evaluation

Finally, we can compare the different approaches based on the evaluation data we collected for each feature and feature combination.

In [None]:
evaluation_df = pd.DataFrame({
    "Model": ["Base", "Filtered", "Webpreview", "Msg + Webpreview", "Structural", "Msg + Structural"],
    "Silhouette Score": [ss_base_embeddings, ss_filtered_embeddings, ss_webpreview_embeddings, ss_msg_webpreview_embeddings, ss_structural_embeddings, ss_msg_structural_embeddings],
    "Topic Count": [topic_count_base_embeddings, topic_count_filtered_embeddings, topic_count_webpreview_embeddings, topic_count_msg_webpreview_embeddings, topic_count_structural_embeddings, topic_count_msg_structural_embeddings], 
    "Noise Instances": [noise_base_embeddings, noise_filtered_embeddings, noise_webpreview_embeddings, noise_count_msg_webpreview_embeddings, noise_structural_embeddings, noise_structural_embeddings]
}).set_index("Model")

evaluation_df