In [1]:
import pandas as pd
import time
import ast
import pickle
import networkx as nx
from collections import Counter
from scipy.sparse import csr_matrix

from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np


In [2]:
# Load the English (en) and Spanish (es) datasets from the working directory.
# Later cells read their 'id_str', 'hashtags', 'links' and 'mentionedUsers' columns.
en_data = pd.read_csv("en_combined.csv")
es_data = pd.read_csv("es_combined.csv")

In [3]:
def convert_format(field):
    """Parse a stringified Python literal (e.g. a list of dicts) read from CSV.

    Parameters
    ----------
    field : object
        Cell value. Strings (and ``ast`` nodes) are evaluated with
        ``ast.literal_eval``; any other value -- an already parsed list,
        NaN for a missing cell, ... -- is returned unchanged.

    Returns
    -------
    object
        The parsed literal for string input, otherwise ``field`` itself.

    Raises
    ------
    ValueError, SyntaxError
        If ``field`` is a string that is not a valid Python literal.
    """
    # literal_eval raises on values that are already parsed or missing, which
    # made the conversion cell fail when executed a second time. Passing such
    # values through keeps the cell idempotent.
    if not isinstance(field, (str, ast.AST)):
        return field
    return ast.literal_eval(field)

# Turn the stringified list columns of both datasets into Python objects.
for frame in (en_data, es_data):
    for column in ('hashtags', 'links', 'mentionedUsers'):
        frame[column] = frame[column].apply(convert_format)


### 1. Preprocessing

In [27]:
def filter_users_by_hashtags(dataframe, min_unique_hashtags=6):
    """Keep only users who used at least ``min_unique_hashtags`` distinct hashtags.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs an ``id_str`` column (used as the user key) and a ``hashtags``
        column whose cells are lists of dicts carrying a ``'text'`` entry.
        Cells that are not lists, and list items that are not dicts or lack
        ``'text'``, are ignored.
    min_unique_hashtags : int, default 6
        Inclusive lower bound on the number of distinct lower-cased hashtags
        a user must have.

    Returns
    -------
    filtered_data : pandas.DataFrame
        Copy of the rows of ``dataframe`` whose ``id_str`` is a qualifying user.
    user_hashtag_df : pandas.DataFrame
        Columns ``user`` and ``hashtag``; one row per hashtag occurrence
        (lower-cased), restricted to qualifying users.
    """
    # Iterate the two needed columns directly instead of DataFrame.iterrows():
    # no per-row Series is built and the id values keep their original dtype.
    user_hashtag_map = [
        (user, tag['text'].lower())
        for user, hashtags in zip(dataframe['id_str'], dataframe['hashtags'])
        if isinstance(hashtags, list)
        for tag in hashtags
        if isinstance(tag, dict) and 'text' in tag
    ]

    user_hashtag_df = pd.DataFrame(user_hashtag_map, columns=['user', 'hashtag'])

    # Count unique hashtags per user
    user_hashtag_counts = user_hashtag_df.groupby('user')['hashtag'].nunique()

    # Users meeting the (inclusive) threshold
    users_with_min_hashtags = user_hashtag_counts[user_hashtag_counts >= min_unique_hashtags].index

    # Filter the original dataframe to include only these users
    filtered_data = dataframe[dataframe['id_str'].isin(users_with_min_hashtags)].copy()

    # Report the number of qualifying users, not the number of retained rows
    # (a user may own several rows), and describe the threshold as inclusive.
    print(f"{len(users_with_min_hashtags)} users have shared at least {min_unique_hashtags} unique hashtags")

    return filtered_data, user_hashtag_df[user_hashtag_df['user'].isin(users_with_min_hashtags)]
    

In [32]:
def construct_co_hashtag_network(user_hashtag_df, min_df=5, threshold=0.7):
    """Build a weighted user-user graph from the similarity of hashtag usage.

    Steps: count hashtag occurrences per user, drop hashtags used by fewer
    than ``min_df`` users, TF-IDF transform the counts (L2-normalised, smoothed
    IDF), compute pairwise cosine similarity between users and connect every
    pair whose similarity is strictly greater than ``threshold``.

    Parameters
    ----------
    user_hashtag_df : pandas.DataFrame
        Columns ``user`` and ``hashtag``, one row per hashtag occurrence.
    min_df : int, default 5
        Minimum number of distinct users that must have used a hashtag for it
        to be kept.
    threshold : float, default 0.7
        Exclusive lower bound on cosine similarity for an edge to be created.

    Returns
    -------
    networkx.Graph
        Undirected graph; each edge carries the similarity as ``weight``.
        Only users with at least one edge appear as nodes. An empty graph is
        returned when there is no input or no hashtag reaches ``min_df``.
    """
    G = nx.Graph()

    # Nothing to pivot: return the empty graph instead of failing downstream.
    if user_hashtag_df.empty:
        return G

    # Create a user-hashtag count matrix
    user_hashtag_matrix = user_hashtag_df.pivot_table(index='user', columns='hashtag', aggfunc=len, fill_value=0)

    # Apply minimum document frequency (min_df)
    hashtag_frequencies = (user_hashtag_matrix > 0).sum(axis=0)
    valid_hashtags = hashtag_frequencies[hashtag_frequencies >= min_df].index
    if len(valid_hashtags) == 0:
        # A matrix with zero feature columns cannot be TF-IDF transformed.
        return G
    filtered_user_hashtag_matrix = user_hashtag_matrix[valid_hashtags]

    # Convert to sparse matrix
    user_hashtag_sparse_matrix = csr_matrix(filtered_user_hashtag_matrix.values)

    # Apply TF-IDF transformation
    tfidf_transformer = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
    tfidf_matrix = tfidf_transformer.fit_transform(user_hashtag_sparse_matrix)

    # Compute cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Build the user-user graph. For each user only the part of the row to the
    # right of the diagonal is scanned, and the threshold test is vectorised,
    # so Python-level work is proportional to the number of edges rather than
    # to the number of user pairs. Edge insertion order matches a plain
    # nested (i, j > i) loop.
    user_ids = filtered_user_hashtag_matrix.index.tolist()
    for i, user1 in enumerate(user_ids):
        upper_row = similarity_matrix[i, i + 1:]
        for offset in np.flatnonzero(upper_row > threshold):
            # Store a plain Python float so the weight serialises uniformly.
            G.add_edge(user1, user_ids[i + 1 + offset], weight=float(upper_row[offset]))

    return G

In [37]:
# Apply the hashtag filter to each dataset with the function's default
# min_unique_hashtags=6.
en_data_filtered, en_user_hashtag_df = filter_users_by_hashtags(en_data)
es_data_filtered, es_user_hashtag_df = filter_users_by_hashtags(es_data)


15701 users have shared more than 6 tags
92218 users have shared more than 6 tags


In [38]:
# Construct Co-Hashtag Networks (function defaults: min_df=5, threshold=0.7)
en_co_hashtag_graph = construct_co_hashtag_network(en_user_hashtag_df)
es_co_hashtag_graph = construct_co_hashtag_network(es_user_hashtag_df)


In [39]:
# Graph.size() without a weight argument is the number of edges.
en_co_hashtag_graph.size()

1865

In [40]:
# Graph.size() without a weight argument is the number of edges.
es_co_hashtag_graph.size()

5520

In [41]:
# Save the English co-hashtag graph as a GEXF file in the working directory.
nx.write_gexf(en_co_hashtag_graph, "en_co_hashtag_graph_new.gexf")

In [42]:
# Save the Spanish co-hashtag graph as a GEXF file in the working directory.
nx.write_gexf(es_co_hashtag_graph, "es_co_hashtag_graph_new.gexf")

### 2. Cluster Analysis

In [43]:
# Load the per-node cluster tables; later cells use their 'Id',
# 'modularity_class' and 'eigencentrality' columns.
# NOTE(review): no cell shown here writes these CSV files -- presumably they
# were exported from an external graph tool run on the GEXF files; confirm
# and document the provenance.
en_clusters = pd.read_csv("en_co_hashtag.csv")
es_clusters = pd.read_csv("es_co_hashtag.csv")


In [51]:
from collections import Counter

def get_top_tags(df, data):
    """
    Prints the 10 most common tags in a cluster of nodes, followed by the
    cluster size, the mean eigencentrality and the two rows with the highest
    eigencentrality. Returns None.

    Parameters:
    - df: DataFrame representing the cluster, containing user IDs in the 'Id'
      column and an 'eigencentrality' column.
    - data: DataFrame containing the raw tweet data for the respective
      language; its 'id_str' column is matched against 'Id' and its 'hashtags'
      cells are expected to be lists of dicts with a 'text' entry (cells that
      are not lists are skipped).
    """
    tag_counts = Counter()

    # Iterate the 'Id' column itself rather than df.iterrows(): iterrows casts
    # every row to a single dtype, so in an all-numeric cluster frame integer
    # Ids become floats and IDs above 2**53 would match the wrong users.
    for user in df['Id']:
        user_hashtags = data.loc[data['id_str'] == user, 'hashtags']

        for hashtags in user_hashtags:
            if isinstance(hashtags, list):
                tag_counts.update(
                    tag['text'].lower() for tag in hashtags if 'text' in tag
                )

    print(tag_counts.most_common(10))
    print(f"Cluster size: {df.shape[0]}")
    print(f"Avg EC: {df['eigencentrality'].mean()}")
    print(f"Max EC: {df.sort_values(by='eigencentrality', ascending=False).head(2)}")


In [52]:
# Summarise the English cluster with modularity_class 15.
get_top_tags(en_clusters[en_clusters['modularity_class']==15], en_data)

[('trump2024', 456), ('maga', 160), ('trump', 63), ('nahbabynah', 29), ('trumpvance2024', 25), ('maga2024', 24), ('jokes', 22), ('trump2024vance', 7), ('biden', 7), ('makeamericagreatagain', 7)]
Cluster size: 72
Avg EC: 0.16504269444444442
Max EC:                        Id                  Label  modularity_class  \
84  '1841523728166899712'  '1841523728166899712'                15   
87             '32921811'             '32921811'                15   

    eigencentrality       Size  
84         0.980311  25.724232  
87         0.980311  25.724232  


In [53]:
# Summarise the English cluster with modularity_class 102.
get_top_tags(en_clusters[en_clusters['modularity_class']==102], en_data)

[('maga', 148), ('borderobserver', 27), ('trump2024', 15), ('trump', 11), ('rightbias', 7), ('m666', 6), ('frontpage', 6), ('americong', 6), ('operationsecondchancebuses', 5), ('smart', 5)]
Cluster size: 35
Avg EC: 0.7191521142857144
Max EC:                         Id                  Label  modularity_class  \
144  '1483144269946892290'  '1483144269946892290'               102   
147  '1608390506710503425'  '1608390506710503425'               102   

     eigencentrality  Size  
144              1.0  26.0  
147              1.0  26.0  


In [54]:
# Summarise the English cluster with modularity_class 1.
get_top_tags(en_clusters[en_clusters['modularity_class']==1], en_data)

[('biden', 86), ('trump', 39), ('maga', 21), ('ridinwithbiden', 4), ('republican', 4), ('democrats', 3), ('trump2024', 3), ('publishtheera', 3), ('equalrightsamendment', 3), ('harris', 3)]
Cluster size: 34
Avg EC: 0.11914432352941175
Max EC:              Id        Label  modularity_class  eigencentrality       Size
22  '841346448'  '841346448'                 1         0.195507  14.731993
24  '866219268'  '866219268'                 1         0.189038  14.641395


#### Only the blue cluster is about Biden. StrokerAC90 is the highest in two clusters. 

#### Spanish

In [57]:
# Summarise the Spanish cluster with modularity_class 49.
get_top_tags(es_clusters[es_clusters['modularity_class']==49], es_data)

[('mundo', 1673), ('estadosunidos', 152), ('américanoticias', 94), ('diariodemexico', 94), ('biden', 93), ('todoestáenln', 76), ('elcomentario', 66), ('joebiden', 60), ('radioamérica', 51), ('olivanoticias', 50)]
Cluster size: 52
Avg EC: 0.7594760384615385
Max EC:                         Id                  Label  modularity_class  \
477             '18193312'             '18193312'                49   
474  '1765936395896414208'  '1765936395896414208'                49   

     eigencentrality  Size  
477              1.0  28.0  
474              1.0  28.0  


In [58]:
# Summarise the Spanish cluster with modularity_class 7.
get_top_tags(es_clusters[es_clusters['modularity_class']==7], es_data)

[('trump2024', 754), ('calleymadurocae', 265), ('entiranianosevota', 140), ('rebelionylibertad', 117), ('trump', 113), ('antimudpsuv', 104), ('magazolanosfortrump', 81), ('maga', 75), ('biden', 46), ('trump2024tosaveamerica', 46)]
Cluster size: 69
Avg EC: 0.16560021739130432
Max EC:                         Id                  Label  modularity_class  \
157  '1360542971649753089'  '1360542971649753089'                 7   
179             '95726181'             '95726181'                 7   

     eigencentrality       Size  
157         0.347077  18.856369  
179         0.347077  18.856369  


In [61]:
# Summarise the Spanish cluster with modularity_class 14.
get_top_tags(es_clusters[es_clusters['modularity_class']==14], es_data)

[('internacional', 1155), ('noticiasalmomento', 359), ('noticias', 184), ('estadosunidos', 172), ('euvzla', 103), ('eeuu', 101), ('biden', 83), ('internacionales', 70), ('donaldtrump', 65), ('nacional', 63)]
Cluster size: 67
Avg EC: 0.16403810447761194
Max EC:                       Id                 Label  modularity_class  \
8            '259876618'           '259876618'                14   
15  '759364749897314304'  '759364749897314304'                14   

    eigencentrality       Size  
8          0.342636  18.794167  
15         0.342636  18.794167  


In [60]:
# Summarise the Spanish cluster with modularity_class 1.
get_top_tags(es_clusters[es_clusters['modularity_class']==1], es_data)

[('internacionales', 958), ('elnuevodiariord', 121), ('peruinformado', 110), ('estadosunidos', 68), ('biden', 63), ('eeuu', 53), ('trump', 38), ('hechosdigital', 37), ('internacional', 37), ('nacionales', 32)]
Cluster size: 44
Avg EC: 0.2574433409090909
Max EC:                Id         Label  modularity_class  eigencentrality       Size
115   '168693277'   '168693277'                 1         0.357420  19.001207
125  '3279794756'  '3279794756'                 1         0.352432  18.931358


#### The largest cluster has a low average eigencentrality (Avg EC).