# Part 2: Feature Engineering

## Imports and settings

In [1]:
import json
import os
import re
import sys
import unicodedata

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import Parallel, delayed
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Make the parent of the notebook directory importable so custom project modules can be loaded.
# The membership check keeps the cell idempotent: re-running it does not add duplicate entries.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [3]:
## Re-Import the dataset


First, we'll import the cleaned dataset and check whether the datatypes were retained.

In [4]:
# Locations of the cleaned CSV and of the JSON file that stores its column dtypes
path = os.path.join(os.getcwd(), "../data/csv/cleaned_data.csv")
dtypes_path = os.path.join(os.getcwd(), '../data/auxiliary/cleaned_data_dtypes.json')

# Read the saved column -> dtype mapping
with open(dtypes_path, 'r') as f:
    dtypes_dict = json.load(f)

# Datetime columns are handed to `parse_dates`; all remaining columns are handed to `dtype`
datetime_cols = [name for name in dtypes_dict if dtypes_dict[name] == 'datetime64[ns]']
dtype_dict_nodate = {
    name: kind
    for name, kind in dtypes_dict.items()
    if kind != 'datetime64[ns]'
}

# Import the cleaned dataset with the column types defined above
df = pd.read_csv(
    path,
    dtype=dtype_dict_nodate,
    parse_dates=datetime_cols,
    low_memory=False,
)
df.shape

(1581498, 38)

In [5]:
# Put the saved dtypes next to the dtypes of the freshly imported dataframe
dtypes_df = pd.DataFrame({
    'Saved Data Types': dtypes_dict,
    'Current Data Types': df.dtypes
})
display(dtypes_df)

# Keep only the rows in which the two columns disagree and report how many there are
mismatch_mask = dtypes_df['Saved Data Types'].ne(dtypes_df['Current Data Types'])
differences_df = dtypes_df[mismatch_mask]
print(f"Number of mismatched data-types: {differences_df.shape[0]}")

Unnamed: 0,Saved Data Types,Current Data Types
chat_handle,object,object
chat_name,object,object
chat_type,object,object
collection_time,datetime64[ns],datetime64[ns]
fwd_from_chat_handle,object,object
fwd_from_chat_id,Int64,Int64
fwd_from_user_name,object,object
is_fwd,bool,bool
is_group_elem,bool,bool
is_reply,bool,bool


Number of mismatched data-types: 0


## Preprocessing

Before we create the features, we preprocess the columns that contain text we want to embed using SBERT. 

SBERT is a bidirectional encoder, meaning it considers both preceding and following words, thereby accounting for a sentence's structure and context. To preserve as much of this structure and context as possible, we will retain stopwords and regular punctuation. The preprocessing steps will include:

1. **Combining Text Fields**: We will concatenate the webpage title and description into a single text field for embedding later on.

2. **Handling Missing Values**: Any NaN values in the text columns will be replaced with empty strings to ensure seamless processing later on.

3. **Removing Markdown Artifacts**: Since Telegram supports markdown, we will clean the text by removing any artifacts related to Telegram's markdown formatting.

4. **Removing URLs**: We will remove any URLs present in the text.

5. **Removing Emojis**: We remove the emojis but retain a version of the text that still includes them, as they might be useful later on.

6. **Normalize styled text**: Some Telegram Users use styled text (for example: 𝒾𝒹𝑜𝓇𝓊). We normalize them to include them in the embedding. 

7. **Removing multiple Whitespaces**


If we have already performed the preprocessing in earlier runs of the notebook, we'll re-load the preprocessed dataframe. 


In [6]:
# A pickle written by an earlier run of this notebook marks the data as already preprocessed.
# In that case the cached dataframe replaces the freshly imported one.
preprocessed_path = os.path.join(os.getcwd(), "../data/preprocessed/df_preprocessed.pkl")
preprocessed = os.path.isfile(preprocessed_path)

if preprocessed:
    print(f"Already preprocessed: {preprocessed}")
    df = pd.read_pickle(preprocessed_path)

Already preprocessed: True


In [7]:
# Combine webpage title and description into one text field.
# Missing parts are treated as empty strings, so a preview that has only a title (or only a
# description) keeps that text instead of becoming NaN. The two parts are joined with a space
# so the last word of the title is not glued to the first word of the description.
if not preprocessed:
    df["webpage_texts"] = (
        df["webpage_title"].fillna('')
        .str.cat(df["webpage_description"].fillna(''), sep=' ')
        .str.strip()
    )

In [8]:
# Replace missing values in both text columns with empty strings
if not preprocessed:
    for text_col in ("message_text", "webpage_texts"):
        df[text_col] = df[text_col].fillna('')

In [9]:
# Strip URLs (http(s)://... and www....), including an optional pair of surrounding parentheses
if not preprocessed:
    url_pattern = r"\(?\bhttps?:\/\/[^\s/$.?#].[^\s]*[^\s.,?!)]\)?|\(?\bwww\.[^\s/$.?#].[^\s]*[^\s.,?!)]\)?"
    for text_col in ("message_text", "webpage_texts"):
        df[text_col] = df[text_col].str.replace(url_pattern, '', regex=True)

In [10]:
# Strip the characters Telegram uses for markdown formatting from both text columns
if not preprocessed:
    bold_pattern = r"\*\*|__"
    italic_pattern = r"\*|_"
    strikethrough_pattern = r"~~"
    link_brackets_pattern = r"[()\[\]]"

    # the patterns are applied one after another, in this order, to each column
    markdown_patterns = (
        bold_pattern,
        italic_pattern,
        strikethrough_pattern,
        link_brackets_pattern,
    )

    for text_col in ("message_text", "webpage_texts"):
        cleaned = df[text_col]
        for pattern in markdown_patterns:
            cleaned = cleaned.str.replace(pattern, '', regex=True)
        df[text_col] = cleaned

In [11]:
# Second URL pass: URLs that were wrapped in brackets are only exposed after the
# markdown cleanup above has removed those brackets
if not preprocessed:
    url_pattern = r"\(?\bhttps?:\/\/[^\s/$.?#].[^\s]*[^\s.,?!)]\)?|\(?\bwww\.[^\s/$.?#].[^\s]*[^\s.,?!)]\)?"
    for text_col in ("message_text", "webpage_texts"):
        without_urls = df[text_col].str.replace(url_pattern, '', regex=True)
        df[text_col] = without_urls

In [12]:
# create a list of emoji-unicodes using data from "https://unicode.org/Public/emoji/15.1/"
if not preprocessed:
    
    def load_emoji_list(file_paths: list[str]) -> list[str]:
        """
        Load all emoji listed in the given Unicode emoji data files.

        A line starting with a code point range (e.g. ``231A..231B ;``) contributes one
        single-character entry per code point of the range. A line starting with one or more
        space-separated code points followed by ``;`` contributes one entry made of those code
        points joined together (e.g. a ZWJ sequence). All other lines are skipped.

        The result is de-duplicated and ordered from the longest to the shortest entry. This
        matters when the entries are joined into a regex alternation: the regex engine takes
        the first alternative that matches, so a multi-code-point sequence has to be listed
        before the single emoji it starts with. Otherwise only the first emoji of the sequence
        is matched and joiners / modifiers are left behind in the text.

        Args:
            file_paths (list): A list of file paths to load emoji sequences from.
        Returns:
            list: Unique emoji strings, longest first. Entries of equal length keep the order
            in which they were first read.
        """

        # match lines with unicode, including ranges like 231A..231B
        range_pattern = re.compile(r"([0-9A-Fa-f]{4,6})\.\.([0-9A-Fa-f]{4,6})\s*;\s*")
        code_point_pattern = re.compile(r"([0-9A-Fa-f]{4,6}(?:\s[0-9A-Fa-f]{4,6})*)\s*;\s*")

        unicode_list = []

        for file_path in file_paths:
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    range_match = range_pattern.match(line)

                    # add elements of ranges as individual codes to list
                    if range_match:
                        start_code, end_code = range_match.groups()
                        first, last = int(start_code, 16), int(end_code, 16)
                        unicode_list.extend(chr(code) for code in range(first, last + 1))
                        continue

                    code_match = code_point_pattern.match(line)
                    if code_match:
                        # build the full sequence by joining all code points of the line
                        code_points = code_match.group(1).split()
                        unicode_list.append(''.join(chr(int(code, 16)) for code in code_points))

        # drop duplicates (the same emoji is listed in several files) while keeping first-seen order
        unique_emoji = list(dict.fromkeys(unicode_list))
        # longest first, so sequences win over their own prefixes in a regex alternation
        unique_emoji.sort(key=len, reverse=True)

        return unique_emoji

    # the three unicode-files that are read for emoji
    emoji_file_names = ("emoji-sequences.txt", "emoji-test.txt", "emoji-zwj-sequences.txt")
    file_paths = [
        os.path.join(os.getcwd(), f"../data/auxiliary/emoji_unicode/{file_name}")
        for file_name in emoji_file_names
    ]

    # collect every emoji found in those files
    emoji_sequences = load_emoji_list(file_paths)

    # one regex alternation with every emoji escaped literally
    emoji_pattern = '|'.join(map(re.escape, emoji_sequences))

In [13]:
# remove emojis (this will take a while, so we parallelize the process)

def demojize_chunk(chunk, emoji_pattern):
    """
    Remove emojis from the "message_text" column of a chunk of the dataframe.

    Args:
        chunk (pd.DataFrame): Rows to process. Must contain a string column "message_text".
        emoji_pattern (str): Regex that matches every emoji to remove.
    Returns:
        pd.DataFrame: A new dataframe in which "message_text_emoji" holds the unchanged text
        and every regex match in "message_text" is replaced by a single space. The input
        chunk is left untouched, so passing a slice of a larger dataframe is safe.
    """
    text_with_emoji = chunk["message_text"]
    return chunk.assign(
        # keep text including emojis in a separate column
        message_text_emoji=text_with_emoji,
        # remove emojis
        message_text=text_with_emoji.str.replace(emoji_pattern, " ", regex=True),
    )

n_jobs = 3  # Use three cores (seems to be fastest?)

# apply the preprocessing in parallel to each chunk
if not preprocessed:
    # Split by row position instead of calling np.array_split on the dataframe itself:
    # that call goes through DataFrame.swapaxes, which is deprecated in recent pandas.
    # Splitting the positions yields the same chunk boundaries.
    row_blocks = np.array_split(np.arange(len(df)), n_jobs)
    chunks = [df.iloc[rows] for rows in row_blocks]
    df_chunks = Parallel(n_jobs=n_jobs)(delayed(demojize_chunk)(chunk, emoji_pattern) for chunk in chunks)
    # the chunks come back in submission order; the index is renumbered from 0
    df = pd.concat(df_chunks, ignore_index=True)

In [14]:
# normalize styled scripts

def normalize_style(text):
    """
    Convert styled Unicode text (e.g. "𝒾𝒹𝑜𝓇𝓊") to its plain form.

    NFKC normalization replaces compatibility characters, which include the mathematical
    script, fraktur and (bold) sans-serif alphabets, with their plain equivalents. The
    styled letters are therefore normalized rather than deleted, so the words written with
    them stay in the text that is embedded later on.

    Args:
        text (str): The text to normalize.
    Returns:
        str: The NFKC-normalized text.
    """
    return unicodedata.normalize('NFKC', text)

# run the normalization on every message
if not preprocessed:
    df["message_text"] = df["message_text"].apply(normalize_style)


In [15]:
# collapse every run of whitespace characters into a single space
if not preprocessed:
    whitespace_run = r'\s+'
    df['message_text'] = df['message_text'].str.replace(whitespace_run, ' ', regex=True)

In [16]:
# save preprocessed dataframe
if not preprocessed:
    path = os.path.join(os.getcwd(), "../data/preprocessed/df_preprocessed.pkl")
    # create the target directory if needed, so the result of the (slow) preprocessing
    # is not lost to a missing-directory error when saving
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_pickle(path)

## Feature 1: Embeddings

First, we will create document embeddings based on a chat's text. 

This is the most frequently used approach to vectorizing Telegram-Chats and will serve as a baseline for comparison in this experiment.

### Load model

In [17]:
# The model is stored inside the project directory so it only has to be downloaded once
current_path = os.getcwd()
model_dir = os.path.join(current_path, "../data/models/")
model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
model_path = os.path.join(model_dir, model_name)

if os.path.isdir(model_path):
    # a local copy exists: load it from disk
    print(f"Model already downloaded. Loading...")
    model = SentenceTransformer(model_path)
else:
    # no local copy yet: fetch the model by name and store it for later runs
    print("Model not found. Downloading...")
    model = SentenceTransformer(model_name)
    model.save(model_path)
    print(f"Model saved to {model_path}")

Model already downloaded. Loading...


### Create Message-Embeddings

To create the baseline chat-embeddings, we first define the functions we'll use to create the embeddings for each message. To speed the process up, we'll also provide a function to do so in parallel. 


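**TODO (open question):** Does running the embedding step in parallel really speed it up? This still needs to be verified.
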
Afterwards, we'll take the mean of the message-embeddings of each chat to create document embeddings.

In [18]:
def get_embeddings(text, default_embedding, model):
    """
    Get embeddings for the given text using a specified model.
    Parameters:
    text (str): The input text to encode.
    default_embedding: The default embedding to return if the input text is empty or NaN.
    model: The model used for encoding the text. Only its `encode` method is used.
    Returns:
    numpy.ndarray: The embeddings of the input text, or `default_embedding` if the text is
    missing or consists of whitespace only.
    """

    # nothing to encode: fall back to the default vector
    if pd.isna(text) or text.strip() == '':
        return default_embedding

    return model.encode(text, convert_to_tensor=False)


def process_chunk_with_saving(chunk, chunk_index, default_embedding, model, tmp_dir):
    """
    Process a chunk of text data by computing embeddings for each text and saving the embeddings for future use.

    If a file for this chunk index already exists in `tmp_dir`, it is loaded and returned
    without computing anything. Otherwise the embeddings are computed and written to
    `chunk_<chunk_index>.npy`. The file is first written under a temporary name and then
    renamed, so an interrupted run never leaves a half-written chunk file that a later run
    would try to load.

    Args:
        chunk (list): A list of text data.
        chunk_index (int): The index of the chunk.
        default_embedding: The default embedding to use if a text is empty or NaN.
        model: The embedding model to use for computing embeddings.
        tmp_dir (str): The directory to temporarily store the embeddings of the chunk.
    Returns:
        numpy.ndarray: The computed embeddings for the chunk.
    """

    # create path to temporarily store this chunk
    chunk_path = os.path.join(tmp_dir, f"chunk_{chunk_index}.npy")

    if os.path.isfile(chunk_path):
        # load the precomputed embeddings if they were already created
        print(f"Chunk #{chunk_index} already embedded. Loading...")
        return np.load(chunk_path)

    print(f"Processing chunk #{chunk_index}...")
    embeddings = np.array([get_embeddings(text, default_embedding, model) for text in chunk])

    # Save the embeddings for this chunk: write to a temporary file, then rename it.
    # The temporary name ends in ".npy" because np.save appends that suffix otherwise.
    partial_path = os.path.join(tmp_dir, f"chunk_{chunk_index}.partial.npy")
    np.save(partial_path, embeddings)
    os.replace(partial_path, chunk_path)

    return embeddings


Now we create the embeddings. 
To avoid redundant calculations, we'll only calculate the embeddings if we have not saved them yet. If they are already in our project, we'll simply load them.

In [19]:
# The message embeddings count as created only if both result files (array and csv) exist
message_embeddings_path = os.path.join(os.getcwd(), '../features/message_embeddings.npy')
message_embeddings_df_path = os.path.join(os.getcwd(), '../features/message_embeddings.csv')
already_vectorized = all(
    os.path.isfile(result_file)
    for result_file in (message_embeddings_path, message_embeddings_df_path)
)
print(f"Already created: {already_vectorized}")

Already created: True


In [20]:
# calculate embeddings if they have not already been created

if not already_vectorized:

    print("Embeddings not yet created. Vectorizing...")

    df_embeddings = df.copy()

    # set environment variable to control tokenizers parallelism
    os.environ["TOKENIZERS_PARALLELISM"] = "true"

    # define default embedding (used for empty or missing messages)
    default_embedding = np.zeros((model.get_sentence_embedding_dimension(),))

    # Split the message column into chunks for parallel processing. The row positions are
    # split rather than the Series itself, because np.array_split on a pandas object goes
    # through the deprecated `swapaxes`; the chunk boundaries are the same.
    num_chunks = 3  # three seems to work fastest
    message_texts = df_embeddings["message_text"]
    chunk_positions = np.array_split(np.arange(len(message_texts)), num_chunks)
    df_chunks = [message_texts.iloc[positions] for positions in chunk_positions]

    # set up the directory to save intermediate results
    output_dir = os.path.join(os.getcwd(), "../tmp")
    os.makedirs(output_dir, exist_ok=True)

    # make sure the directory for the final results exists before starting the expensive
    # computation, so saving cannot fail on a missing directory afterwards
    os.makedirs(os.path.dirname(message_embeddings_path), exist_ok=True)

    # process each chunk in parallel and save intermediate results to limit the impact of crashes
    results = Parallel(n_jobs=num_chunks)(
        delayed(process_chunk_with_saving)(chunk, chunk_index, default_embedding, model, output_dir) 
        for chunk_index, chunk in enumerate(df_chunks)
    )

    # Combine the chunk results once. float64 matches what the saved array contained when
    # it was rebuilt from the list column, without a second pass through Python lists.
    embeddings_array = np.concatenate(results).astype(np.float64, copy=False)
    df_embeddings["message_text_embeddings"] = embeddings_array.tolist()

    # save the final df with embeddings as a csv
    df_embeddings.to_csv(message_embeddings_df_path)
    print(df_embeddings.shape)

    # save the embeddings separately as a np array
    np.save(message_embeddings_path, embeddings_array)

else:
    # loading the whole dataframe takes about 45min. Instead we'll load only the embedding array and add it to the dataframe we loaded earlier
    print("Embeddings already created. Loading Embeddings...")
    embeddings = np.load(message_embeddings_path)
    df_embeddings = df.copy()
    df_embeddings["message_text_embeddings"] = embeddings.tolist() # converting to list to save one array per row
    

Embeddings already created. Loading Embeddings...


#### Create Webpage Preview Embeddings

In [21]:
# The webpage embeddings count as created only if both result files (array and csv) exist
webpage_embeddings_path = os.path.join(os.getcwd(), '../features/webpage_embeddings.npy')
webpage_embeddings_df_path = os.path.join(os.getcwd(), '../features/webpage_embeddings.csv')
wp_already_vectorized = all(
    os.path.isfile(result_file)
    for result_file in (webpage_embeddings_path, webpage_embeddings_df_path)
)
print(f"Already created: {wp_already_vectorized}")

Already created: True


: 

In [22]:
# calculate embeddings if they have not already been created

if not wp_already_vectorized:

    print("Embeddings not yet created. Vectorizing...")

    # set environment variable to control tokenizers parallelism
    os.environ["TOKENIZERS_PARALLELISM"] = "true"

    # define default embedding (used for empty or missing texts)
    default_embedding = np.zeros((model.get_sentence_embedding_dimension(),))

    # NOTE(review): the preprocessing section builds a cleaned "webpage_texts" column
    # (title + description) "for embedding later on", but the raw "webpage_description"
    # column is embedded here. Confirm which column is intended.
    # The row positions are split rather than the Series itself, because np.array_split on a
    # pandas object goes through the deprecated `swapaxes`; the chunk boundaries are the same.
    num_chunks = 3 # Three seems to be fastest on the machine the code was written on.
    webpage_descriptions = df_embeddings["webpage_description"]
    chunk_positions = np.array_split(np.arange(len(webpage_descriptions)), num_chunks)
    df_chunks = [webpage_descriptions.iloc[positions] for positions in chunk_positions]

    # Set up the directory to save intermediate results. The chunk files are named only by
    # chunk index, so this step needs a directory of its own: in a directory shared with
    # another embedding step, that step's chunk files would be found and loaded as if they
    # were the webpage embeddings.
    output_dir = os.path.join(os.getcwd(), "../tmp/webpage_description")
    os.makedirs(output_dir, exist_ok=True)

    # make sure the directory for the final results exists before starting the expensive
    # computation, so saving cannot fail on a missing directory afterwards
    os.makedirs(os.path.dirname(webpage_embeddings_path), exist_ok=True)

    # process each chunk in parallel and save intermediate results to limit the impact of crashes
    results = Parallel(n_jobs=num_chunks)(
        delayed(process_chunk_with_saving)(chunk, chunk_index, default_embedding, model, output_dir) 
        for chunk_index, chunk in enumerate(df_chunks)
    )

    # Combine the chunk results once. float64 matches what the saved array contained when
    # it was rebuilt from the list column, without a second pass through Python lists.
    embeddings_array = np.concatenate(results).astype(np.float64, copy=False)
    df_embeddings["webpage_description_embeddings"] = embeddings_array.tolist()

    # save the results as a csv-file
    df_embeddings.to_csv(webpage_embeddings_df_path)
    print(df_embeddings.shape)

    # save the embeddings separately as a numpy array
    np.save(webpage_embeddings_path, embeddings_array)


else:
    # loading the whole dataframe takes about 45min. Instead we'll load only the embedding array and add it to the dataframe we loaded earlier
    print("Webpage embeddings already created. Loading Embeddings...") 
    embeddings = np.load(webpage_embeddings_path)
    df_embeddings["webpage_description_embeddings"] = embeddings.tolist()
    

Webpage embeddings already created. Loading Embeddings...


In [None]:
# show the column holding the webpage embeddings
df_embeddings.loc[:, "webpage_description_embeddings"]

## Feature 1: (Control for) Implicit References in Text-Embeddings

**TODO:** Find a better name for this feature, e.g. "Non-Repeated Messages"?

**TODO:** Should the handles of other Telegram channels (`@...`) be removed from the text as well?

To filter out implicit connections between chats containing forwarded messages and chats that contain their original versions, we simply remove all instances where both a forwarded message and its source are present in our dataset.

To avoid repeating the operation each time we run the notebook, we'll save the indices of the remaining messages. These indices will be used in subsequent runs to filter the data.

In [None]:
# The dataset counts as filtered only if both the filtered csv and the saved row indices exist
feature_1_path = os.path.join(os.getcwd(), "../features/implicit_ref_filtered.csv")
indices_path = os.path.join(os.getcwd(), "../data/auxiliary/references_filtered_indices.npy")

filtered = all(os.path.isfile(saved_file) for saved_file in (indices_path, feature_1_path))
print(f"Data already filtered: {filtered}")

In [None]:
def _has_counterpart_in_group(group):
    """
    Flag the rows of one group of messages that share the same text.

    A row is flagged (True) if its forward-source chat id occurs among the chat ids of the
    group, or if its own chat id occurs among the forward-source chat ids of the group.
    """
    source_is_in_group = group['fwd_from_chat_id'].isin(group['telegram_chat_id'])
    chat_is_a_source = group['telegram_chat_id'].isin(group['fwd_from_chat_id'])
    return source_is_in_group | chat_is_a_source


if filtered:
    print("Data already filtered. Using saved indices to (re)filter the data...")
    filtered_rows_indices = np.load(indices_path)
    df_references_filtered = df_embeddings.loc[filtered_rows_indices]

else:
    print("Data not yet filtered. Filtering...")

    df_references_filtered = df_embeddings.copy()

    # messages with identical text form one group
    grouped = df_references_filtered.groupby("message_text")

    # one boolean per row; True marks a message for which the original or a forwarded
    # instance is also part of the dataset
    fwds_with_source = grouped.apply(_has_counterpart_in_group, include_groups=False)

    # keep only the rows that were not flagged
    msg_to_keep = fwds_with_source[fwds_with_source == False].reset_index()
    msg_to_keep_indices = msg_to_keep["level_1"].to_list() # level 1 contains the original row indices
    df_references_filtered = df_references_filtered.loc[msg_to_keep_indices]

    print(f"Removed {abs(df.shape[0] - df_references_filtered.shape[0])} messages.")

    print("Saving filtered data...")
    df_references_filtered.to_csv(feature_1_path)

    print("Saving indices...")
    indices_array = np.array(df_references_filtered.index)
    np.save(indices_path, indices_array)

## Feature 2: Structural Equivalence

1. We construct an adjacency matrix representing the frequency of forwarded messages from one chat to another in our dataset.

2. We correlate the rows of the matrix, ignoring the diagonal, to produce a correlation matrix. 

#### 1. Create the Chat/Feature Matrix

An adjacency matrix is a standard representation of a graph where each cell indicates the number of connections between nodes. In our case, the columns represent the source chats of messages, and the row indices represent the chats in our dataset.

Due to limitations in the data collection process, the current adjacency matrix does not capture all forward-based relationships between chats. Instead, it only reflects the incoming connections observed within the data collection timeframe.

To create a comprehensive adjacency matrix, we will:

1. Isolate Rows with forwarded messages


2. Construct the Adjacency Matrix


3. Add Chats with no forwarded messages

We'll start with isolating rows with forwarded messages and create the initial adjacency matrix.

In [None]:
# keep only the rows whose message was forwarded from a public chat (source chat id present)
has_public_source = df["fwd_from_chat_id"].notna()
fwd_messages = df[has_public_source]

# Adjacency matrix: one row per chat, one column per source chat. Each cell holds the number
# of rows with that combination of telegram_chat_id and fwd_from_chat_id; combinations that
# never occur are filled with 0.
adj_matrix = pd.pivot_table(
    fwd_messages,
    index='telegram_chat_id',
    columns='fwd_from_chat_id',
    aggfunc='size',
    fill_value=0,
)
adj_matrix.head()

Next, we check whether our dataset contains chats without any forwarded messages and whether they are already in the matrix.

In [None]:
# one group per chat
grouped = df.groupby("telegram_chat_id")


def all_nans(series):
    """Return True if every value of the series is missing."""
    return series.isna().all()


# ids of the chats in which no message has a source chat id,
# i.e. chats with 0 messages forwarded from public chats
is_no_fwd_chat = grouped["fwd_from_chat_id"].apply(all_nans)
no_fwd_chats = is_no_fwd_chat[is_no_fwd_chat == True].index

# split those chats by whether the adjacency matrix already has a row for them
index_adj_matrix = adj_matrix.index
in_matrix = [chat_id for chat_id in no_fwd_chats if chat_id in index_adj_matrix]
not_in_matrix = [chat_id for chat_id in no_fwd_chats if chat_id not in index_adj_matrix]

print(f"{len(in_matrix)}/{len(no_fwd_chats)} chats without forwarded messages are already in the matrix.")

Finally, we'll add rows for chats containing no forwarded messages to the matrix.

In [None]:
# Create the adjacency rows for chats without forwarded messages. These chats have no
# connections, so every cell is 0. The frame is created with integer zeros directly instead
# of as an empty (object dtype) frame that is filled afterwards: that way the combined
# matrix stays numeric and no implicit downcasting in `fillna` is needed.
non_fwd_adj_matrix = pd.DataFrame(0, index=not_in_matrix, columns=adj_matrix.columns)

# add them to the initial matrix
adj_matrix_combined = pd.concat([adj_matrix, non_fwd_adj_matrix], axis=0)
adj_matrix_combined

Before moving on, we need to check if each chat is represented in our adjacency matrix.

In [None]:
# number of distinct chats in the data vs. number of rows in the combined matrix
initial_chat_count = df["telegram_chat_id"].nunique()
adj_matrix_chat_count = len(adj_matrix_combined.index)

print(f"Number of chats in dataset: {initial_chat_count}")
print(f"Number of chats in Adjacency Matrix: {adj_matrix_chat_count}")

In [None]:
# Logarithmic scaling of the matrix. np.log1p computes log(x + 1), so cells that are 0 stay 0.
# The result is wrapped in a DataFrame that carries the labels of the combined matrix.
adj_matrix_log_scaled = pd.DataFrame(
    np.log1p(adj_matrix_combined),
    index=adj_matrix_combined.index,
    columns=adj_matrix_combined.columns,
)

# Heatmap of the log-scaled adjacency matrix, drawn without axis ticks
heatmap_size = (30, 10)
plt.figure(figsize=heatmap_size)
sns.heatmap(adj_matrix_log_scaled, cmap='coolwarm', vmax=4, annot=False)
plt.xticks([])
plt.yticks([])
plt.show()

In [None]:
# One-hot encode the adjacency matrix: 1 where a chat forwarded at least one message from a
# source chat, 0 otherwise. The comparison is vectorized over the whole matrix instead of
# calling a Python function once per cell.
adj_matrix_one_hot = adj_matrix_combined.ne(0).astype("int64")

# Display the one hot encoded adjacency matrix
plt.figure(figsize=(30, 10))
sns.heatmap(adj_matrix_one_hot, annot=False, cmap='gist_yarg')
plt.xticks([])
plt.yticks([])
plt.show()

In [None]:
cols = adj_matrix_one_hot.columns

# The matrix only holds 0/1, so the column sum is the number of chats connected to a source.
# Using the sum also handles a column without any connection, for which looking up the
# count of 1s in `value_counts()` would raise a KeyError.
connections_per_source = adj_matrix_one_hot.sum(axis=0)

# get all columns that have only one connection to a chat and drop them
only_once = connections_per_source[connections_per_source == 1].index.tolist()

adj_matrix_one_hot_multiple = adj_matrix_one_hot.drop(axis=1, labels=only_once)
adj_matrix_one_hot_multiple

In [None]:
# Cluster the chats on their one-hot forward profile (rows of the reduced matrix).
# KMeans is imported in the import cell at the top of the notebook.
# NOTE(review): the number of clusters is hard-coded; the notebook does not show how 11 was chosen.
kmeans = KMeans(n_clusters=11, random_state=1, n_init="auto").fit(adj_matrix_one_hot_multiple)
clusters = kmeans.labels_

# number of chats per cluster; the labels are indexed by chat id so they can be traced back
pd.Series(clusters, index=adj_matrix_one_hot_multiple.index).value_counts()