# Part 2: Feature Engineering

## Imports and settings

In [1]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import Parallel, delayed
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

  from tqdm.autonotebook import tqdm, trange


## Re-Import the dataset


First, we'll import the cleaned dataset and check whether the data types were retained.

In [2]:
# Locations of the cleaned dataset and of the dtype mapping saved alongside it
dtypes_path = os.path.join(os.getcwd(), '../data/auxiliary/cleaned_data_dtypes.json')
path = os.path.join(os.getcwd(), "../data/csv/cleaned_data.csv")

# Read the saved column -> dtype mapping
with open(dtypes_path, 'r') as f:
    dtypes_dict = json.load(f)

# Datetime columns are handed to `parse_dates`; every other column keeps its
# saved dtype through the `dtype` argument.
datetime_cols = []
dtype_dict_nodate = {}
for col, dtype in dtypes_dict.items():
    if dtype == 'datetime64[ns]':
        datetime_cols.append(col)
    else:
        dtype_dict_nodate[col] = dtype

# Load the cleaned dataset with the saved dtypes applied
df = pd.read_csv(
    path,
    dtype=dtype_dict_nodate,
    parse_dates=datetime_cols,
    low_memory=False,
)
df.shape

(1581498, 38)

In [3]:
# Put the saved dtypes next to the dtypes of the freshly imported dataframe
dtypes_df = pd.DataFrame(
    {
        'Saved Data Types': dtypes_dict,
        'Current Data Types': df.dtypes,
    }
)
display(dtypes_df)

# Keep only the columns whose dtype changed between saving and re-importing
dtype_changed = dtypes_df['Saved Data Types'].ne(dtypes_df['Current Data Types'])
differences_df = dtypes_df[dtype_changed]
print(f"Number of mismatched data-types: {len(differences_df)}")

Unnamed: 0,Saved Data Types,Current Data Types
chat_handle,object,object
chat_name,object,object
chat_type,object,object
collection_time,datetime64[ns],datetime64[ns]
fwd_from_chat_handle,object,object
fwd_from_chat_id,Int64,Int64
fwd_from_user_name,object,object
is_fwd,bool,bool
is_group_elem,bool,bool
is_reply,bool,bool


Number of mismatched data-types: 0


## Feature 1: Document Embeddings

First, we will create document embeddings from the text of each chat's messages.

This is the most frequently used approach to vectorizing Telegram-Chats and will serve as a baseline for comparison in this experiment.

### Load model

In [4]:
# Where the sentence-transformer model is cached locally
current_path = os.getcwd()
model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
model_dir = os.path.join(current_path, "../data/models/")
model_path = os.path.join(model_dir, model_name)

# Reuse the local copy when there is one; otherwise download it once and save it
if os.path.isdir(model_path):
    print("Model already downloaded. Loading...")
    model = SentenceTransformer(model_path)
else:
    print("Model not found. Downloading...")
    model = SentenceTransformer(model_name)
    model.save(model_path)
    print(f"Model saved to {model_path}")

Model already downloaded. Loading...


### Preprocessing

In [5]:
# WIP: preprocessing is not implemented yet; the embedding cells below read df["message_text"] as loaded.

### Create Embeddings

To create the baseline embeddings, we first define the functions we'll use to create the embeddings. To speed the process up, we'll also provide a function to do so in parallel. 

In [6]:
def get_embeddings(text, default_embedding, model):
    """Embed a single text, falling back to a default for empty input.

    Parameters
    ----------
    text : the text to embed; may be missing (NaN/None).
    default_embedding : value returned when `text` is missing or contains
        only whitespace.
    model : object exposing `encode(text, convert_to_tensor=False)`.

    Returns
    -------
    `default_embedding` for missing/blank text, otherwise the result of
    `model.encode(text, convert_to_tensor=False)`.
    """
    # `or` short-circuits, so `.strip()` is never called on a missing value
    is_blank = pd.isna(text) or text.strip() == ''
    return default_embedding if is_blank else model.encode(text, convert_to_tensor=False)

# Function to process each chunk of data
def process_chunk(chunk, default_embedding, model):
    embeddings = []
    
    for text in chunk:
        embeddings.append(get_embeddings(text, default_embedding, model))
    
    return embeddings

Now we create the embeddings. 
To avoid redundant calculations, we'll only calculate the embeddings if we have not saved them yet. If they are already in our project, we'll simply load them.

In [7]:
# check if the embeddings were already saved.
current_path = os.getcwd()
feature_0_path = os.path.join(current_path, "../features/message_embeddings.csv")
already_vectorized = os.path.isfile(feature_0_path)

In [8]:
# Calculate the embeddings if they have not been created yet; otherwise load
# the cached ones. Either way `df_embeddings` ends up as a copy of `df` with an
# additional "embedding" column.

if not already_vectorized:

    print("Embeddings not yet created. Vectorizing...")

    # Work on a copy so the original DataFrame stays untouched
    df_embeddings = df.copy()

    # Set environment variable to control tokenizers parallelism
    os.environ["TOKENIZERS_PARALLELISM"] = "true"

    # Embedding used for missing / blank messages: a zero vector of model size
    default_embedding = np.zeros((model.get_sentence_embedding_dimension(),))

    # Split the message texts into chunks for parallel processing.
    # Splitting the underlying array (rather than the Series) avoids relying
    # on how np.array_split handles pandas objects.
    num_chunks = 8  # one for each core. for some reason, three seems to be fast as well.
    df_chunks = np.array_split(df_embeddings["message_text"].to_numpy(), num_chunks)

    # Process each chunk in parallel. `process_chunk` needs the model as its
    # third argument, so it is passed explicitly to every job.
    results = Parallel(n_jobs=num_chunks)(
        delayed(process_chunk)(chunk, default_embedding, model) for chunk in df_chunks
    )

    # Chunks come back in input order, so concatenating them restores row order
    df_embeddings["embedding"] = np.concatenate(results).tolist()

    # Save the results to the very file the cache check looks for, so the next
    # run finds them instead of recomputing everything.
    embedding_path = feature_0_path
    os.makedirs(os.path.dirname(embedding_path), exist_ok=True)
    df_embeddings.to_csv(embedding_path)
    print(df_embeddings.shape)  # `shape` is an attribute, not a method

else:
    # Loading the whole dataframe takes about 45min. Instead we'll load only
    # the embedding column and add it to the dataframe we loaded earlier.
    print("Embeddings already created. Loading Embeddings...")
    feature = ['embedding']
    embeddings = pd.read_csv(feature_0_path, skipinitialspace=True, usecols=feature)
    df_embeddings = df.copy()
    # NOTE(review): values read back from CSV are the text form of each vector,
    # not numeric arrays; parse them before doing numeric work on this column.
    df_embeddings["embedding"] = embeddings["embedding"]
    

Embeddings already created. Loading Embeddings...


## Feature 2: Structural Equivalence

1. We construct an adjacency matrix representing the frequency of forwarded messages from one chat to another in our dataset.

2. We correlate the rows of the matrix, ignoring diagonals to produce a correlation matrix. 

#### 1. Create the Chat/Feature Matrix

An adjacency matrix is a standard representation of a graph where each cell indicates the number of connections between nodes. In our case, the columns represent the source chats of messages, and the row indices represent the chats in our dataset.

Due to limitations in the data collection process, the current adjacency matrix does not capture all forward-based relationships between chats. Instead, it only reflects the incoming connections observed within the data collection timeframe.

To create a comprehensive adjacency matrix, we will:

1. Isolate Rows with forwarded messages


2. Construct the Adjacency Matrix


3. Add Chats with no forwarded messages

We'll start with isolating rows with forwarded messages and create the initial adjacency matrix.

In [None]:
# Keep only the messages that were forwarded from a public chat
is_forwarded = df["fwd_from_chat_id"].notna()
fwd_messages = df[is_forwarded]

# Build the adjacency matrix: rows are the chats in the dataset, columns are
# the source chats, and each cell counts how often that pair occurs.
adj_matrix = fwd_messages.pivot_table(
    index='telegram_chat_id',
    columns='fwd_from_chat_id',
    aggfunc='size',  # number of rows per (telegram_chat_id, fwd_from_chat_id) pair
    fill_value=0,    # pairs that never co-occur get a count of 0
)
adj_matrix.head()

Next, we check whether our dataset contains chats without any forwarded messages and whether they are already in the matrix.

In [None]:
def all_nans(series):
    """Return True when every value in `series` is missing."""
    return series.isna().all()

# Group messages by the chat they were sent in
grouped = df.groupby("telegram_chat_id")

# Ids of all chats that have 0 messages forwarded from public chats
chat_has_no_fwd = grouped["fwd_from_chat_id"].apply(all_nans)
no_fwd_chats = chat_has_no_fwd[chat_has_no_fwd == True].index

# Split those chats by whether the adjacency matrix already has a row for them
index_adj_matrix = adj_matrix.index
in_matrix = [chat_id for chat_id in no_fwd_chats if chat_id in index_adj_matrix]
not_in_matrix = [chat_id for chat_id in no_fwd_chats if chat_id not in index_adj_matrix]

print(f"{len(in_matrix)}/{len(no_fwd_chats)} chats without forwarded messages are already in the matrix.")

Finally, we'll add rows for chats containing no forwarded messages to the matrix.

In [None]:
# Create the adjacency rows for chats without forwarded messages.
# The frame is built directly from the scalar 0 so it is integer-typed from the
# start. Creating an empty (all-NaN, object-dtype) frame and filling it in
# place relies on pandas downcasting object columns, which is deprecated.
non_fwd_adj_matrix = pd.DataFrame(0, index=not_in_matrix, columns=adj_matrix.columns)

# Append them below the rows of the initial matrix
adj_matrix_combined = pd.concat([adj_matrix, non_fwd_adj_matrix], axis=0)
adj_matrix_combined

Before moving on, we need to check if each chat is represented in our adjacency matrix.

In [None]:
# Compare the number of distinct chats in the dataset with the number of rows
# (one per chat) in the combined adjacency matrix.
adj_matrix_chat_count = len(adj_matrix_combined)
initial_chat_count = df["telegram_chat_id"].nunique()
print(f"Number of chats in dataset: {initial_chat_count}")
print(f"Number of chats in Adjacency Matrix: {adj_matrix_chat_count}")

In [None]:
# Logarithmic scaling of the counts; log1p computes log(x + 1), so zeros stay 0.
# Applied to a DataFrame, the result keeps the same index and columns.
adj_matrix_log_scaled = np.log1p(adj_matrix_combined)

# Heatmap of the log-scaled adjacency matrix (tick labels hidden)
fig, ax = plt.subplots(figsize=(30, 10))
sns.heatmap(adj_matrix_log_scaled, annot=False, cmap='coolwarm', vmax=4, ax=ax)
ax.set_xticks([])
ax.set_yticks([])
plt.show()

In [None]:
# One-hot encode the adjacency matrix: 1 where a chat has at least one message
# forwarded from the source chat, 0 otherwise. The vectorised comparison gives
# the same values as mapping `0 if x == 0 else 1` over every cell, without a
# Python-level call per cell.
adj_matrix_one_hot = adj_matrix_combined.ne(0).astype("int64")

# Display the one hot encoded adjacency matrix
plt.figure(figsize=(30, 10))
sns.heatmap(adj_matrix_one_hot, annot=False, cmap='gist_yarg')
plt.xticks([])
plt.yticks([])
plt.show()

In [None]:
# Drop source chats (columns) that are connected to exactly one chat.
# In a 0/1 matrix the column sum equals the number of connected chats, so no
# per-column lookup is needed, and a column without any 1 cannot raise a KeyError.
cols = adj_matrix_one_hot.columns
connections_per_source = adj_matrix_one_hot.sum(axis=0)
only_once = connections_per_source.index[connections_per_source == 1].tolist()

adj_matrix_one_hot_multiple = adj_matrix_one_hot.drop(axis=1, labels=only_once)
adj_matrix_one_hot_multiple

In [None]:
# Cluster the chats on their one-hot forwarding profile (one row per chat).
# `random_state` is fixed so the cluster assignment is reproducible.
# TODO: document why 11 clusters were chosen.
kmeans = KMeans(n_clusters=11, random_state=1, n_init="auto").fit(adj_matrix_one_hot_multiple)
clusters = kmeans.labels_

# Number of chats assigned to each cluster
pd.Series(clusters).value_counts()