In [1]:
import pandas as pd
import time
import plotly.graph_objects as go 
import ast
import pickle
import networkx as nx
from collections import Counter
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfTransformer

In [11]:
import tldextract

def extract_base_domain(url):
    extracted = tldextract.extract(url)
    return f"{extracted.domain}.{extracted.suffix}"

In [12]:
en_data = pd.read_csv("en_combined.csv")
es_data = pd.read_csv("es_combined.csv")

In [13]:
def convert_format(field):
    return ast.literal_eval(field)

en_data['hashtags'] = en_data['hashtags'].apply(convert_format)
en_data['links'] = en_data['links'].apply(convert_format)
en_data['mentionedUsers'] = en_data['mentionedUsers'].apply(convert_format)

es_data['hashtags'] = es_data['hashtags'].apply(convert_format)
es_data['links'] = es_data['links'].apply(convert_format)
es_data['mentionedUsers'] = es_data['mentionedUsers'].apply(convert_format)


#### 1. Network construction

In [19]:
# Overly general domains to exclude
en_exclude_domains = ["youtu.be", "x.com", "youtube.com", "dlvr.it", "trib.al", "ift.tt", "tiktok.com", "bit.ly", "yahoo.com"]
es_exclude_domains = ["dlvr.it", "youtu.be", "youtube.com", "x.com", "bit.ly", "buff.ly", "ift.tt", "ow.ly", "tinyurl.com", "short.gy", "trib.al", "acortar.link", "uni.vi"]


In [None]:
def preprocess_urls(dataframe, exclude_domains):
    # Extract user-URL mappings
    user_url_map = [
        (user, extract_base_domain(url['expanded_url'])) 
        for user, urls in zip(dataframe['id_str'], dataframe['links'])
        if isinstance(urls, list) 
        for url in urls 
        if (
            isinstance(url, dict) and 
            'expanded_url' in url and 
            len(url['expanded_url']) != 0 and 
            extract_base_domain(url['expanded_url']) not in exclude_domains
        )
    ]

    # Create a DataFrame for user-URL pairs
    user_url_df = pd.DataFrame(user_url_map, columns=["user", "url"])
    
    # Count number of valid URLs per user
    user_url_counts = user_url_df.groupby('user')['url'].nunique()
    users_with_min_links = user_url_counts[user_url_counts >= 3].index

    # Filter DataFrame to include only users with >= 3 URLs
    user_url_df = user_url_df[user_url_df['user'].isin(users_with_min_links)]
    unique_users = user_url_df['user'].nunique()
    print(f"Unique users sharing >= 3 unique URLs: {unique_users}")
    
    return user_url_df


In [None]:
def construct_full_co_domain_network(dataframe, min_df=3, threshold=0.6, lang='en'):
    # Extract and preprocess URLs
    if lang == 'en':
        user_url_df = preprocess_urls(dataframe, en_exclude_domains)
    else:
        user_url_df = preprocess_urls(dataframe, es_exclude_domains)

    # Pivot the user-URL DataFrame to create a user-URL matrix
    user_url_matrix = user_url_df.pivot_table(index="user", columns="url", aggfunc=len, fill_value=0)

    # Calculate URL frequencies (column sums)
    url_frequencies = user_url_matrix.sum(axis=0)

    # Filter URLs based on min_df and max_df
    valid_urls = url_frequencies[(url_frequencies >= min_df)].index
    filtered_user_url_matrix = user_url_matrix[valid_urls]

    # Convert to sparse matrix
    user_url_sparse_matrix = csr_matrix(filtered_user_url_matrix.values)

    # Apply TF-IDF transformation
    tfidf_transformer = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
    tfidf_matrix = tfidf_transformer.fit_transform(user_url_sparse_matrix)

    # Compute cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Build user-user graph
    user_ids = filtered_user_url_matrix.index
    G = nx.Graph()
    for i, user1 in enumerate(user_ids):
        for j, user2 in enumerate(user_ids):
            if i != j and similarity_matrix[i, j] > threshold:
                G.add_edge(user1, user2, weight=similarity_matrix[i, j])

    return G


In [None]:
en_co_url_graph = construct_full_co_domain_network(en_data)


In [None]:
en_co_url_graph.size()

In [None]:
# nx.write_gexf(en_co_url_graph, "co_url_network.gexf")

In [None]:
es_co_url_graph = construct_full_co_domain_network(es_data, lang='es')


In [None]:
es_co_url_graph.size()

In [None]:
# nx.write_gexf(es_co_url_graph, "es_co_domain_network_new.gexf")
# nx.write_gexf(en_co_url_graph, "en_co_domain_network_new.gexf")

### 2. Cluster analysis

#### English

In [25]:
en_clusters = pd.read_csv("en_co_domain.csv")

In [26]:
en_clusters_64 = en_clusters[en_clusters['modularity_class']==64]


In [27]:
en_clusters_64 = en_clusters_64.sort_values(by='eigencentrality', ascending=False)


In [52]:
from collections import Counter

def get_top_domains(df, lang, data, exclude_domains):
    """
    Prints the 20 most common domains in a cluster of nodes.

    Parameters:
    - df: DataFrame representing the cluster, containing user IDs in the 'Id' column.
    - lang: Language ('en' or 'es') for processing.
    - data: DataFrame containing the raw tweet data for the respective language.
    - exclude_domains: List of domains to exclude from counting.
    """
    domains = []

    # Iterate over each user in the cluster
    for i, r in df.iterrows():
        user = r['Id'] 
        user_data = data[data['id_str'] == user] 
        
        for _, row in user_data.iterrows():
            urls = row['links']
            if isinstance(urls, list):
                for url in urls:
                    if (
                        isinstance(url, dict) and 
                        'expanded_url' in url and 
                        len(url['expanded_url']) > 0
                    ):
                        base_domain = extract_base_domain(url['expanded_url'])
                        if base_domain not in exclude_domains:
                            domains.append(base_domain)

    print(Counter(domains).most_common(20))
    print(f"Cluster size: {df.shape[0]}")
    print(f"Avg EC: {df['eigencentrality'].mean()}")
    print(f"Max EC: {df.sort_values(by='eigencentrality', ascending=False).head(2)}")


In [45]:
get_top_domains(en_clusters_64, 'en', en_data, en_exclude_domains)


[('breitbart.com', 259), ('foxnews.com', 65), ('nypost.com', 32), ('dailycaller.com', 26), ('thefederalist.com', 24), ('washingtonexaminer.com', 23), ('dailymail.co.uk', 21), ('redstate.com', 15), ('justthenews.com', 14), ('townhall.com', 12), ('americanthinker.com', 12), ('thepostmillennial.com', 12), ('zerohedge.com', 10), ('pjmedia.com', 10), ('westernjournal.com', 7), ('theconservativetreehouse.com', 7), ('newsmax.com', 6), ('rumble.com', 6), ('hotair.com', 6), ('dailywire.com', 6)]
Cluster size: 72
Avg EC: 0.42782433333333336
Max EC:                Id         Label  modularity_class  componentnumber  \
122  '4899465914'  '4899465914'                64                0   

     eigencentrality  Size  
122              1.0  10.0  


In [46]:
get_top_domains(en_clusters[en_clusters['modularity_class']==39], 'en', en_data, en_exclude_domains)

[('foxnews.com', 138), ('foxbusiness.com', 20), ('nypost.com', 11), ('breitbart.com', 9), ('washingtonexaminer.com', 7), ('dailycaller.com', 7), ('redstate.com', 5), ('babylonbee.com', 5), ('msn.com', 4), ('theguardian.com', 4), ('newsbreak.com', 4), ('newsbreakapp.com', 3), ('dailymail.co.uk', 3), ('politico.com', 2), ('washingtontimes.com', 2), ('rumble.com', 2), ('pbs.org', 2), ('justthenews.com', 2), ('thefederalist.com', 2), ('dailywire.com', 2)]
Cluster size: 40
Avg EC: 0.168047925
Max EC:               Id        Label  modularity_class  componentnumber  \
119  '343429091'  '343429091'                39                0   

     eigencentrality      Size  
119         0.604227  6.436669  


In [47]:
get_top_domains(en_clusters[en_clusters['modularity_class']==43], 'en', en_data, en_exclude_domains)

[('rawstory.com', 164), ('newsweek.com', 29), ('mediaite.com', 27), ('palmerreport.com', 23), ('alternet.org', 21), ('crooksandliars.com', 19), ('msn.com', 14), ('politicususa.com', 13), ('thedailybeast.com', 11), ('substack.com', 9), ('thehill.com', 6), ('salon.com', 6), ('msnbc.com', 6), ('theguardian.com', 6), ('dailykos.com', 6), ('huffpost.com', 6), ('apnews.com', 5), ('axios.com', 5), ('washingtonpost.com', 4), ('newrepublic.com', 3)]
Cluster size: 54
Avg EC: 0.023622148148148146
Max EC:              Id       Label  modularity_class  componentnumber  \
102  '20817529'  '20817529'                43                0   

     eigencentrality      Size  
102         0.070835  1.634285  


In [53]:
get_top_domains(en_clusters[en_clusters['modularity_class']==18], 'en', en_data, en_exclude_domains)

[('washingtonpost.com', 49), ('theguardian.com', 26), ('msnbc.com', 24), ('thedailybeast.com', 15), ('thehill.com', 11), ('dailykos.com', 11), ('apnews.com', 10), ('twitter.com', 10), ('nbcnews.com', 8), ('newrepublic.com', 7), ('cnn.com', 6), ('wsj.com', 5), ('newsweek.com', 5), ('alternet.org', 5), ('nytimes.com', 5), ('lgbtqnation.com', 4), ('politico.com', 4), ('substack.com', 4), ('propublica.org', 4), ('bbc.com', 3)]
Cluster size: 46
Avg EC: 0.0028649565217391316
Max EC:                         Id                  Label  modularity_class  \
209  '1431355664098603010'  '1431355664098603010'                18   
171             '23847371'             '23847371'                18   

     componentnumber  eigencentrality      Size  
209                0         0.027825  1.247045  
171                0         0.009219  1.079529  


#### Spanish:

In [50]:
es_clusters = pd.read_csv("es_co_domain.csv")

In [54]:
get_top_domains(es_clusters[es_clusters['modularity_class']==28], 'es', es_data, es_exclude_domains)

[('clarin.com', 731), ('abc.es', 343), ('cnn.com', 292), ('euronews.com', 105), ('google.com', 69), ('enter.co', 62), ('noticiasuno.com', 39), ('elcomercio.com', 32), ('colombia.com', 28), ('las2orillas.co', 27), ('elnacional.com', 26), ('BBC.com', 8), ('eltiempo.com', 7), ('reforma.com', 4), ('elnacional.com.do', 4), ('elpais.com', 4), ('washingtonpost.com', 3), ('lanacion.com.ar', 3), ('nytimes.com', 2), ('elmundo.es', 2)]
Cluster size: 141
Avg EC: 0.8161753617021277
Max EC:                         Id                  Label  modularity_class  \
567  '1609913670011858945'  '1609913670011858945'                28   
607           '3206275589'           '3206275589'                28   

     eigencentrality      Size  
567         1.000000  9.000000  
607         0.997883  8.987295  
