In [5]:
import pandas as pd
import time
import ast
import pickle
import networkx as nx
from collections import Counter
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np


In [2]:
# Load the English and Spanish datasets from CSV files in the working directory.
en_data = pd.read_csv("en.csv")
es_data = pd.read_csv("es.csv")

In [3]:
def convert_format(field):
    """Parse a stringified Python literal (e.g. a CSV cell) into the real object.

    Parameters
    ----------
    field : str or any
        The text of a Python literal such as ``"[{'text': 'a'}]"``. Any
        non-string value is passed through untouched.

    Returns
    -------
    object
        The evaluated literal for string input, otherwise ``field`` itself.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a string is not a valid
        Python literal.
    """
    # literal_eval rejects non-string objects with ValueError. Passing them
    # through keeps missing cells (NaN) intact and makes the conversion safe
    # to re-run on columns that were already parsed.
    if not isinstance(field, str):
        return field
    return ast.literal_eval(field)

# Turn the stringified list columns of both datasets back into Python objects.
# Order is preserved: all English columns first, then all Spanish columns.
for frame in (en_data, es_data):
    for column in ('hashtags', 'links', 'mentionedUsers'):
        frame[column] = frame[column].apply(convert_format)


#### 1. Preprocessing

In [46]:
# Keep only the rows whose hashtag list has at least `min_hashtags` entries
def filter_users_by_hashtags(dataframe, min_hashtags=3):
    """Return the rows of ``dataframe`` that carry enough hashtags.

    The filter works row by row: each row's ``hashtags`` value is counted on
    its own, and no aggregation across rows is performed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``hashtags`` column. Values that are not lists count
        as zero hashtags.
    min_hashtags : int, default 3
        Minimum number of entries a row's hashtag list must have to be kept.

    Returns
    -------
    pandas.DataFrame
        A filtered copy with an extra ``hashtag_count`` column. The input
        frame is not modified.
    """
    hashtag_counts = dataframe['hashtags'].apply(
        lambda tags: len(tags) if isinstance(tags, list) else 0
    )
    # Work on a copy so the caller's frame does not gain the helper column.
    filtered_data = dataframe.copy()
    filtered_data['hashtag_count'] = hashtag_counts
    return filtered_data[filtered_data['hashtag_count'] >= min_hashtags]

# Apply the hashtag-count filter to the English and Spanish datasets;
# the unfiltered frames stay available as en_data / es_data.
en_data_filtered = filter_users_by_hashtags(en_data)
es_data_filtered = filter_users_by_hashtags(es_data)


In [47]:
# (rows, columns) of the filtered English data
en_data_filtered.shape

(14636, 17)

In [48]:
# (rows, columns) of the filtered Spanish data
es_data_filtered.shape

(23104, 17)

In [49]:
def extract_hashtag_sequences(dataframe):
    """Map each ``id_str`` to the list of hashtag texts found in its row.

    Rows whose ``hashtags`` value is not a list are skipped. Within a list,
    only entries containing a ``'text'`` key contribute. When the same
    ``id_str`` occurs in several rows, the last such row wins because each
    assignment replaces the previous one.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide ``id_str`` and ``hashtags`` columns.

    Returns
    -------
    dict
        ``{id_str: [text, ...]}`` in row order of first insertion.
    """
    sequences_by_user = {}
    for _, record in dataframe.iterrows():
        user_id = record['id_str']
        tags = record['hashtags']
        if not isinstance(tags, list):
            continue
        texts = []
        for tag in tags:
            if 'text' in tag:
                texts.append(tag['text'])
        sequences_by_user[user_id] = texts
    return sequences_by_user
    
# Build the id_str -> hashtag-text list mapping for each filtered dataset.
en_hashtag_sequences = extract_hashtag_sequences(en_data_filtered)
es_hashtag_sequences = extract_hashtag_sequences(es_data_filtered)

In [50]:
# The mapping has one key per distinct id_str, so its length is the user count.
print(f"{len(en_hashtag_sequences)} users have shared no fewer than 3 hashtags in English data")

11034 users have shared no fewer than 3 hashtags in English data


In [51]:
# The mapping has one key per distinct id_str, so its length is the user count.
print(f"{len(es_hashtag_sequences)} users have shared no fewer than 3 hashtags in Spanish data")

7655 users have shared no fewer than 3 hashtags in Spanish data


In [52]:
# Build the Co-Hashtag Network
def construct_co_hashtag_network(hashtag_sequences):
    """Link every pair of users whose hashtag lists hold the same tags.

    Two users are connected when their hashtag lists are equal after sorting,
    i.e. they contain the same tags with the same multiplicities, regardless
    of order. Users that match nobody are not added to the graph, because
    nodes are only created through edges.

    Parameters
    ----------
    hashtag_sequences : dict
        Mapping of user identifier to a list of hashtag values. The values
        must be sortable and hashable (e.g. strings).

    Returns
    -------
    networkx.Graph
        Undirected graph with one edge per matching user pair.
    """
    G = nx.Graph()

    # Sort each user's tags once and bucket users by that signature, instead
    # of re-sorting both lists for every pair of users.
    users_by_signature = {}
    for user, sequence in hashtag_sequences.items():
        signature = tuple(sorted(sequence))
        users_by_signature.setdefault(signature, []).append(user)

    # Every bucket forms a clique: connect each member to all later members.
    for members in users_by_signature.values():
        for position, user1 in enumerate(members):
            for user2 in members[position + 1:]:
                G.add_edge(user1, user2)

    return G

# Create one Co-Hashtag Network per language from the extracted
# id_str -> hashtag list mappings.
en_co_hashtag_graph = construct_co_hashtag_network(en_hashtag_sequences)
es_co_hashtag_graph = construct_co_hashtag_network(es_hashtag_sequences)



In [44]:
# Export the English co-hashtag graph as a GEXF file in the working directory.
nx.write_gexf(en_co_hashtag_graph, "en_co_hashtag_graph.gexf")

In [45]:
# Export the Spanish co-hashtag graph as a GEXF file in the working directory.
nx.write_gexf(es_co_hashtag_graph, "es_co_hashtag_graph.gexf")