In [1]:
import torch
import pandas as pd
import ast
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
import pickle

In [2]:
import networkx as nx
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np


  from tqdm.autonotebook import tqdm, trange


In [3]:
en_data = pd.read_csv("en.csv")

In [4]:
es_data = pd.read_csv("es.csv")

In [9]:
es_data.head(1)

Unnamed: 0,id,text,lang,epoch,hashtags,links,replyCount,retweetCount,likeCount,quoteCount,conversationId,mentionedUsers,id_str,followersCount,friendsCount,statusesCount
0,1.80089e+18,@stayintham Mi amiga la que quiere ahorrar,es,1718201000.0,[],[],1.0,0.0,0.0,0.0,1.800879e+18,"[{'id_str': '1197160866002718722', 'name': 'lu...",'966746102',144,220,3081


In [14]:
en_data.columns

Index(['id', 'text', 'lang', 'epoch', 'hashtags', 'links', 'replyCount',
       'retweetCount', 'likeCount', 'quoteCount', 'conversationId',
       'mentionedUsers', 'id_str', 'followersCount', 'friendsCount',
       'statusesCount'],
      dtype='object')

In [5]:
def convert_format(field):
    """Parse a stringified Python literal (e.g. a CSV cell) into the real object.

    Parameters
    ----------
    field : Any
        Usually the ``str`` form of a literal such as ``"[]"`` or
        ``"[{'id_str': '1'}]"``.

    Returns
    -------
    Any
        The object produced by ``ast.literal_eval`` for string input. Any
        non-string input is returned unchanged.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a string is not a valid
        Python literal.
    """
    # Non-strings are either missing values (NaN) or values that were already
    # parsed; passing them through keeps the conversion cell safe to re-run.
    if not isinstance(field, str):
        return field
    return ast.literal_eval(field)

# These columns are stored in the CSVs as stringified Python literals;
# convert each of them back into real Python objects, first for the English
# frame and then for the Spanish one.
for _frame in (en_data, es_data):
    for _column in ('hashtags', 'links', 'mentionedUsers'):
        _frame[_column] = _frame[_column].apply(convert_format)



### 1. Network

#### 1.1 Text similarity network

In [8]:
# Sentence-embedding model, loaded once and moved to `device`.
# construct_text_similarity_network reads it as a global.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2").to(device)

def construct_text_similarity_network(dataframe, threshold=0.7):
    """Build an undirected user-user graph from the similarity of users' texts.

    Every user (``id_str``) is represented by the mean of the embeddings that
    the module-level ``model`` produces for that user's string-valued ``text``
    entries. Two users are linked when the cosine similarity of their mean
    embeddings is strictly greater than ``threshold``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide ``id_str`` and ``text`` columns. Rows whose ``text`` is
        not a ``str`` (e.g. NaN) are ignored.
    threshold : float, default 0.7
        Minimum (exclusive) cosine similarity for an edge to be created.

    Returns
    -------
    networkx.Graph
        Edges carry the similarity as a plain ``float`` ``weight`` attribute.
        Only users with at least one edge appear as nodes. The graph is empty
        when no row has usable text.
    """
    # Group texts by user; zip over the two columns avoids building a Series
    # per row as iterrows() does.
    user_text_map = defaultdict(list)
    for user, text in zip(dataframe['id_str'], dataframe['text']):
        if isinstance(text, str):
            user_text_map[user].append(text)

    G = nx.Graph()
    if not user_text_map:
        # cosine_similarity rejects an empty array, so stop here.
        return G

    # One mean embedding per user, in the same order as user_ids.
    user_ids = list(user_text_map)
    user_vectors = np.array([
        np.mean(model.encode(texts, device=device), axis=0)
        for texts in user_text_map.values()
    ])
    similarity_matrix = cosine_similarity(user_vectors)

    # Build user-user graph. The similarity matrix is symmetric and the graph
    # is undirected, so the strict upper triangle covers each pair exactly once.
    above_threshold = np.triu(similarity_matrix > threshold, k=1)
    for i, j in np.argwhere(above_threshold):
        G.add_edge(user_ids[i], user_ids[j], weight=float(similarity_matrix[i, j]))

    return G


In [None]:
# Build the text-similarity graph for each language's dataframe.
# Each call runs the sentence encoder once per user.
en_text_graph = construct_text_similarity_network(en_data)
es_text_graph = construct_text_similarity_network(es_data)


In [None]:
# Persist the text-similarity graphs. The context managers close (and flush)
# each file even if pickling raises.
with open('en_text_graph.gpickle', 'wb') as graph_file:
    pickle.dump(en_text_graph, graph_file)
with open('es_text_graph.gpickle', 'wb') as graph_file:
    pickle.dump(es_text_graph, graph_file)


#### 1.2 Co-hashtag network

#### 1.3 Co-URL network

In [6]:
def construct_co_url_network(dataframe, threshold=0.7):
    """Build an undirected user-user graph from the URLs users have shared.

    Each user (``id_str``) is represented by a binary vector over all
    ``expanded_url`` values found in the ``links`` column. Two users are
    linked when the cosine similarity of their vectors is strictly greater
    than ``threshold``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide ``id_str`` and ``links`` columns. ``links`` cells that
        are not lists are ignored. List items are accessed with
        ``.get('expanded_url')``, so they are expected to be dict-like.
    threshold : float, default 0.7
        Minimum (exclusive) cosine similarity for an edge to be created.

    Returns
    -------
    networkx.Graph
        Edges carry the similarity as a plain ``float`` ``weight`` attribute.
        Only users with at least one edge appear as nodes. The graph is empty
        when no user has a non-empty ``expanded_url``.
    """
    # 1: Extract user-URL mappings
    user_url_map = defaultdict(list)
    for user, links in zip(dataframe['id_str'], dataframe['links']):
        if not isinstance(links, list):
            continue
        for link in links:
            expanded_url = link.get('expanded_url')
            if expanded_url:
                user_url_map[user].append(expanded_url)

    G = nx.Graph()
    if not user_url_map:
        # cosine_similarity rejects an empty matrix, so stop here.
        return G

    # 2: Create user-URL matrix (rows representing users, cols representing URLs).
    # A dict gives O(1) column lookup, and first-seen order keeps the column
    # layout reproducible between runs (set iteration order is not).
    users = list(user_url_map)
    url_index = {}
    for url_list in user_url_map.values():
        for url in url_list:
            url_index.setdefault(url, len(url_index))

    user_url_matrix = np.zeros((len(users), len(url_index)))
    for i, user in enumerate(users):
        for url in user_url_map[user]:
            user_url_matrix[i, url_index[url]] = 1

    # 3: Compute cosine similarity between rows (users)
    similarity_matrix = cosine_similarity(user_url_matrix)

    # 4: Build user-user graph. The similarity matrix is symmetric and the
    # graph is undirected, so the strict upper triangle covers each pair once.
    above_threshold = np.triu(similarity_matrix > threshold, k=1)
    for i, j in np.argwhere(above_threshold):
        G.add_edge(users[i], users[j], weight=float(similarity_matrix[i, j]))

    return G

In [7]:
# Construct Co-URL networks for English and Spanish data.
# Each call uses the default similarity cut-off hard-coded in
# construct_co_url_network.
en_co_url_graph = construct_co_url_network(en_data)
es_co_url_graph = construct_co_url_network(es_data)


In [9]:
# Save graphs for further use. The context managers close (and flush) each
# file even if pickling raises.
with open('en_co_url_graph.pickle', 'wb') as graph_file:
    pickle.dump(en_co_url_graph, graph_file)
with open('es_co_url_graph.pickle', 'wb') as graph_file:
    pickle.dump(es_co_url_graph, graph_file)


#### 1.4 Network Fusion
- Might consider temporal indicators (epoch)
- Might also add linguistic features like formality and toxicity, if time allows