In [1]:
import torch
import pandas as pd
import time
import ast
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english') + stopwords.words('spanish'))


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bowenyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/bowenyi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import string

In [4]:
import networkx as nx
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np


  from tqdm.autonotebook import tqdm, trange


In [5]:
en_data = pd.read_csv("en_combined.csv")

In [6]:
es_data = pd.read_csv("es_combined.csv")

In [7]:
en_data.columns

Index(['id', 'text', 'lang', 'epoch', 'hashtags', 'links', 'replyCount',
       'retweetCount', 'likeCount', 'quoteCount', 'conversationId',
       'mentionedUsers', 'id_str', 'followersCount', 'friendsCount',
       'statusesCount'],
      dtype='object')

In [8]:
def convert_format(field):
    return ast.literal_eval(field)

en_data['hashtags'] = en_data['hashtags'].apply(convert_format)
en_data['links'] = en_data['links'].apply(convert_format)
en_data['mentionedUsers'] = en_data['mentionedUsers'].apply(convert_format)

es_data['hashtags'] = es_data['hashtags'].apply(convert_format)
es_data['links'] = es_data['links'].apply(convert_format)
es_data['mentionedUsers'] = es_data['mentionedUsers'].apply(convert_format)


### Text similarity network construction

#### 1. Cleaning text

In [9]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Remove emojis
    text = text.encode("ascii", "ignore").decode("utf-8")
    # Tokenize and remove stopwords
    words = word_tokenize(text.lower())
    words = [word for word in words if word not in stop_words]

    return " ".join(words)
    

#### 2. Network construction

In [10]:
model = SentenceTransformer("distiluse-base-multilingual-cased-v1").to(device)



In [19]:
def construct_text_similarity_network(dataframe, threshold=0.90):
    # Step 1: Clean the text column
    dataframe['cleaned_text'] = dataframe['text'].apply(clean_text)
    
    # Step 2: Filter out tweets with fewer than 4 words
    dataframe = dataframe[dataframe['cleaned_text'].apply(lambda x: len(x.split()) >= 4)]
    
    # Step 3: Group tweets by user
    user_text_map = dataframe.groupby('id_str')['cleaned_text'].apply(list)
    user_text_map = {user: texts for user, texts in user_text_map.items() if len(texts) >= 10}

    # Step 4: Generate embeddings for each user
    user_embeddings = {}
    for i, (user, texts) in enumerate(user_text_map.items()):
        # Encode all texts for the user
        embeddings = model.encode(texts, device='cuda', batch_size=64)
        # Average embeddings to get a single user-level vector
        user_embeddings[user] = np.mean(embeddings, axis=0)

    # Step 5: Compute cosine similarity matrix
    user_ids = list(user_embeddings.keys())
    user_vectors = np.array(list(user_embeddings.values()))
    similarity_matrix = cosine_similarity(user_vectors)

    # Step 6: Build the user-user similarity graph
    G = nx.Graph()
    for i, user1 in enumerate(user_ids):
        for j, user2 in enumerate(user_ids):
            if i != j and similarity_matrix[i, j] > threshold:
                G.add_edge(user1, user2, weight=similarity_matrix[i, j])

    return G

In [21]:
en_text_graph = construct_text_similarity_network(en_data)


In [22]:
en_text_graph.size()

4058

In [14]:
es_text_graph = construct_text_similarity_network(es_data)
es_text_graph.size()

1515595

#### We need a higher similarity threshold for Spanish data

In [16]:
es_text_graph = construct_text_similarity_network(es_data,0.95)
es_text_graph.size()

14280

In [48]:
# pickle.dump(es_text_graph, open('es_text_graph.gpickle', 'wb'))

In [3]:
# es_text_graph = pickle.load(open('es_text_graph.gpickle', 'rb'))
# en_text_graph = pickle.load(open('en_text_graph.gpickle', 'rb'))


In [23]:
nx.write_gexf(en_text_graph, "en_text_graph_new.gexf")

In [18]:
nx.write_gexf(es_text_graph, "es_text_graph.gexf")

### 3. Cluster analysis

In [9]:
en_clusters = pd.read_csv("en_text_sim.csv")
es_clusters = pd.read_csv("es_text_sim.csv")

In [11]:
from bertopic import BERTopic
from collections import Counter
from bertopic.representation import KeyBERTInspired


def get_top_topics(cluster_df, data_df):
    """
    Analyzes the top topics in a cluster using BERTopic.

    Parameters:
    - cluster_df: DataFrame representing the cluster, containing user IDs in the 'Id' column and 'eigencentrality'.
    - data_df: DataFrame containing the raw tweet data, with user IDs in 'id_str' and tweets in 'text'.
    """
    # Extract tweets for users in the cluster
    texts = []
    for _, row in cluster_df.iterrows():
        user = row['Id']
        user_data = data_df[data_df['id_str'] == user]
        
        for _, tweet_row in user_data.iterrows():
            tweet = tweet_row['text']
            if isinstance(tweet, str) and len(tweet) > 0:
                texts.append(tweet)

    # Check if texts are non-empty
    if not texts:
        print("No texts available in this cluster.")
        return

    # Apply BERTopic
    representation_model = KeyBERTInspired()
    topic_model = BERTopic(language="multilingual", representation_model=representation_model)
    topics, probs = topic_model.fit_transform(texts)

    # Get the most common topics
    print(topic_model.get_topic(10))
    
    # Cluster metrics
    print(f"\nCluster size: {cluster_df.shape[0]}")
    print(f"Avg Eigenvector Centrality (EC): {cluster_df['eigencentrality'].mean()}")
    print("Top 2 Nodes by EC:")
    print(cluster_df.nlargest(2, 'eigencentrality')[['Id', 'eigencentrality']])


In [38]:
get_top_topics(en_clusters[en_clusters['modularity_class']==0], en_data)

[('abortion', 0.60028183), ('abortions', 0.5542473), ('fetuses', 0.37792125), ('pregnancy', 0.33204708), ('unborn', 0.3180533), ('roe', 0.30129647), ('reproductive', 0.27235496), ('neutered', 0.26331052), ('newsweek', 0.24960382), ('fertilization', 0.24601033)]

Cluster size: 242
Avg Eigenvector Centrality (EC): 0.20524888016528928
Top 2 Nodes by EC:
                       Id  eigencentrality
3   '1607179494267453440'           1.0000
48  '1323861935385800705'           0.9997


In [39]:
get_top_topics(en_clusters[en_clusters['modularity_class']==6], en_data)

[('bullet', 0.4730395), ('gunfire', 0.47082508), ('assassination', 0.4590474), ('shooting', 0.44375038), ('shot', 0.41272205), ('shooter', 0.39944214), ('gunmen', 0.36273488), ('pennsylvania', 0.34602836), ('sniper', 0.34067425), ('shoot', 0.33194247)]

Cluster size: 155
Avg Eigenvector Centrality (EC): 0.1851809741935484
Top 2 Nodes by EC:
             Id  eigencentrality
90   '14662354'         0.846619
107  '18956073'         0.824424


In [44]:
get_top_topics(en_clusters[en_clusters['modularity_class']==10], en_data)

[('trump2024tosaveamerica', 0.5242352), ('trump2024', 0.5153598), ('donaldjtrumpjr', 0.46091014), ('trump', 0.4561502), ('donald', 0.443261), ('realdonaldtrump', 0.41920254), ('trumpanzees', 0.40702057), ('trumpmeltdown', 0.39976558), ('trumps', 0.39621258), ('melania', 0.38993993)]

Cluster size: 103
Avg Eigenvector Centrality (EC): 0.03263343689320389
Top 2 Nodes by EC:
              Id  eigencentrality
326  '576975457'         0.259018
23   '626760400'         0.224929


#### Spanish

In [12]:
get_top_topics(es_clusters[es_clusters['modularity_class']==0], es_data)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[('trump', 0.74134916), ('trumpmeltdown', 0.6016231), ('trump2024', 0.50835246), ('obama', 0.5025191), ('discurso', 0.3884017), ('retórica', 0.36591637), ('mítines', 0.3581076), ('libertario', 0.35769534), ('trumpisunfitforoffice', 0.3180873), ('audiencia', 0.31041327)]

Cluster size: 332
Avg Eigenvector Centrality (EC): 0.15878569879518073
Top 2 Nodes by EC:
             Id  eigencentrality
76   '16676396'         1.000000
42  '133945128'         0.959601


In [13]:
get_top_topics(es_clusters[es_clusters['modularity_class']==72], es_data)

[('fbi', 0.44429463), ('trump', 0.3996332), ('sospechoso', 0.38954106), ('identificó', 0.34184307), ('sospechosos', 0.32849008), ('identificar', 0.27406016), ('atacante', 0.2710208), ('identifica', 0.25448278), ('crooks', 0.24684718), ('atentado', 0.23638453)]

Cluster size: 281
Avg Eigenvector Centrality (EC): 0.2314304412811388
Top 2 Nodes by EC:
              Id  eigencentrality
150  '520653311'         0.909223
5    '200267797'         0.895046
