In [7]:
import pandas as pd
import torch
import numpy as np
import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

In [8]:
# Read the cleaned text dataset and the hub/bridge table from their CSV files
text_df = pd.read_csv('../../../src/nlp/cleaned_dataset.csv')
community_df = pd.read_csv('../../../src/data/distribuitions/hub_bridge_df.csv')

# Inner join: keep only rows where text_df.author matches community_df.id
merged_df = pd.merge(
    text_df,
    community_df,
    how='inner',
    left_on='author',
    right_on='id',
)

In [9]:
# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device : {device}")

# Build the sentence-embedding model directly on the chosen device, so the
# device printed above is the one the model is placed on (instead of letting
# the library auto-select a device and moving the model afterwards)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=str(device))

# Normalise the text column to strings, then drop rows whose text is empty or
# whitespace-only; the index is reset so positions line up with model outputs
merged_df['clean_text'] = merged_df['clean_text'].fillna('').astype(str)
merged_df = merged_df[merged_df['clean_text'].str.strip() != ''].reset_index(drop=True)

# Fit the topic model on the cleaned texts. The column is already str (cast
# above), so it is handed over as a plain list of strings.
# NOTE(review): no random seed is fixed here; BERTopic's documentation says
# results can differ between runs unless a seeded UMAP model is supplied —
# confirm whether reproducible topic ids are required.
topic_model = BERTopic(embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(merged_df['clean_text'].tolist())

Using device : cuda


In [10]:
# Attach the fitted topic labels to the frame as a new "topic" column
merged_df = merged_df.assign(topic=topics)

In [11]:
# Sanity check on the text column: how many values are missing and which
# Python types it contains
text_column = merged_df['clean_text']
missing_mask = text_column.isnull()
print("NaN count:", missing_mask.sum())
print(text_column.apply(type).value_counts().head(10))
# Show a few of the problematic rows (those with a missing text), if any
print(merged_df.loc[missing_mask, ['author','community_id']].head())


NaN count: 0
clean_text
<class 'str'>    98944
Name: count, dtype: int64
Empty DataFrame
Columns: [author, community_id]
Index: []


In [12]:
# Encode every cleaned text with the sentence-embedding model
texts = merged_df['clean_text'].tolist()
embeddings = embedding_model.encode(texts, show_progress_bar=True)

# Keep one vector per row, stored as a plain Python list in an 'embedding' column
merged_df['embedding'] = embeddings.tolist()

Batches:   0%|          | 0/3092 [00:00<?, ?it/s]

In [15]:
# Example: mean embedding over all rows that belong to community 0
in_community_0 = merged_df['community_id'] == 0.0
community_0 = merged_df.loc[in_community_0]

# Stack the per-row vectors into one 2-D array, then average across rows
community_0_matrix = np.vstack(community_0["embedding"].tolist())
community_0_vector = community_0_matrix.mean(axis=0)

print("Community 0 vector shape:", community_0_vector.shape)
print("Community 0 vector: ", community_0_vector)

Community 0 vector shape: (384,)
Community 0 vector:  [ 8.13904081e-03 -9.02695303e-03  1.19124760e-02 -6.00260893e-03
  2.25680581e-02  2.35915556e-02  1.01472429e-02 -2.21193183e-02
 -2.28023825e-02  9.08811209e-03  8.93703638e-03  1.94498937e-02
  1.36193151e-02 -4.38184027e-03 -1.82475415e-03  2.13859875e-02
  1.07231266e-02 -1.22948486e-02 -3.47818965e-02  2.88639617e-02
 -1.61233481e-02  3.84674968e-03 -1.15983462e-02  2.08112402e-02
 -1.44574515e-02  9.94728362e-03 -3.73844498e-03 -4.29198993e-03
  1.33636184e-02 -1.84575954e-02  1.86187737e-02  2.94283077e-02
 -1.64756464e-03  1.63752596e-02  1.13541777e-02 -5.88031030e-03
  1.38086157e-02  1.15333178e-02  1.58351785e-02 -3.17303256e-02
  1.07157155e-03 -5.08416909e-02 -1.19155248e-02 -1.57662108e-02
 -1.03000455e-02  5.28701514e-03  2.05821541e-03  5.26911898e-03
 -1.91424139e-03 -1.70309936e-02 -1.77808217e-02 -6.60975760e-03
 -7.52085836e-03 -9.04692154e-03  8.82760907e-03 -2.48910127e-02
 -1.28335187e-02  2.48204119e-02  6.

In [14]:
# Write merged_df to CSV without the index.
# NOTE(review): list-valued cells (e.g. an 'embedding' column) are written as
# text, so they presumably need parsing when the file is read back — confirm
# with whatever consumes this file.
output_path = '../../../src/nlp/topic_modeling/global_embeddings.csv'
merged_df.to_csv(
    output_path,
    index=False,
    sep=',',
    encoding='utf-8',
)