# Script for creating an edgelist from the users_stream gzip files
Author: Ali Salloum 23.3.2023

- The mapping from an author_id to the candidate's screen name (nickname) can be done with the candidates-2023 list.
- Each edge is directed: the first node is the retweeter, the second node is the author of the retweeted tweet.

In [1]:
import os
import gzip
import json

import pandas as pd
from tqdm import tqdm

import networkx as nx

In [None]:
# Directory that holds the gzipped tweet files
data_dir = "../users_stream"

# Every entry of the directory is read, in sorted name order
twitter_files = sorted(os.listdir(data_dir))

# All parsed JSON objects from all files, in reading order
raw_data = []

for tweets_file in tqdm(twitter_files):
    tweets_path = os.path.join(data_dir, tweets_file)
    with gzip.open(tweets_path, "rb") as f_tweets:
        # Each line of a file is parsed as one JSON document
        raw_data.extend(json.loads(line) for line in f_tweets)

In [None]:
# Helper functions

def remove_duplicates(data):
    """Return the objects of ``data`` with repeated objects dropped.

    The first occurrence of every object is kept and the input order is
    preserved. The number of objects before and after is printed.

    Objects are grouped by their canonical JSON text (keys sorted) and only
    compared with ``==`` inside their group, so the work grows roughly
    linearly with the input instead of quadratically. Two objects count as
    duplicates only if they are equal AND serialise identically (e.g. ``1``
    and ``1.0`` are kept apart).

    Parameters
    ----------
    data : list
        Parsed JSON objects, e.g. the output of ``json.loads``.

    Returns
    -------
    list
        A new list without duplicates; ``data`` itself is not modified.
    """
    # Report the size of the argument, not of any module-level list.
    print(f"Number of json objects found is {len(data)}.")

    no_duplicates = []
    # canonical JSON text -> objects already kept that serialise to that text
    seen = {}

    for obj in tqdm(data):
        try:
            # sort_keys makes the text independent of dict key order,
            # in line with how dict equality works.
            key = json.dumps(obj, sort_keys=True)
        except (TypeError, ValueError):
            # Not JSON-serialisable: share one bucket and rely on == alone.
            key = None

        bucket = seen.setdefault(key, [])
        if obj not in bucket:
            bucket.append(obj)
            no_duplicates.append(obj)

    # Report what is actually left after deduplication.
    print(f"After removing the duplicates, we are left with {len(no_duplicates)} objects.")

    return no_duplicates

def check_retweet_status(obj):
    """Tell whether a tweet object is a retweet.

    Only the first entry of ``obj["referenced_tweets"]`` is inspected; the
    tweet counts as a retweet when that entry's ``"type"`` is ``"retweeted"``.
    Objects without a ``"referenced_tweets"`` key are never retweets.
    """
    if "referenced_tweets" not in obj:
        return False

    first_reference = obj["referenced_tweets"][0]
    return first_reference["type"] == "retweeted"

def edge_formation(data):
    """Build the retweet edge list from tweet objects.

    Duplicated objects are removed first. Every remaining object that
    ``check_retweet_status`` accepts yields one row
    ``[retweeter author_id, retweeted author_id, created_at]``; all other
    objects are skipped.

    Returns
    -------
    list
        One three-element list per retweet, in input order.
    """
    unique_tweets = remove_duplicates(data)

    # The retweeted author is read from the first referenced tweet, the same
    # entry that check_retweet_status inspects.
    return [
        [
            tweet["author_id"],
            tweet["referenced_tweets"][0]["tweet"]["author_id"],
            tweet["created_at"],
        ]
        for tweet in tqdm(unique_tweets)
        if check_retweet_status(tweet)
    ]

In [None]:
# Turn the loaded tweet objects into [retweeter, retweeted, timestamp] rows
edge_list = edge_formation(raw_data)

# Store the rows as a CSV file with neither a header row nor an index column
df = pd.DataFrame(data=edge_list)
df.to_csv("./users_stream_edgelist.txt", header=False, index=False)

Refining

In [3]:
# Load the processed edge list. The column names are supplied explicitly and
# header inference is switched off: with the default settings pandas would
# treat the first edge as a header row and silently drop it.
# NOTE(review): assumes the processed file has no header row, like the edge
# list saved with header=False earlier in this notebook -- confirm.
df = pd.read_csv(
    "./users_stream_processed/users_stream_edgelists.txt",
    header=None,
    names=["source", "target", "timestamp"],
)

# Count how many rows share the same (source, target) pair; that count
# becomes the edge weight.
df_weights = df.value_counts(["source", "target"]).reset_index()
df_weights.columns = ["source", "target", "weight"]

# Directed, weighted graph: one edge per distinct (source, target) pair.
G = nx.from_pandas_edgelist(df_weights, edge_attr="weight", create_using=nx.DiGraph())



In [4]:
# Candidate tables used to annotate the graph nodes
candidates = pd.read_csv(filepath_or_buffer="candidates-2023.csv")
candidates_full = pd.read_csv(filepath_or_buffer="candidates2023-complete.csv")

In [5]:
# id -> screen name, taken from the short candidate table
id_2_candidate = {
    candidate_id: screen_name
    for candidate_id, screen_name in zip(candidates["id"], candidates["screen_name"])
}

# screen name -> attribute, taken from the complete candidate table.
# The column names are Finnish; each dictionary name gives the meaning.
full_screen_names = candidates_full["screen_name"]
candidate_2_party = dict(zip(full_screen_names, candidates_full["puolue"]))
candidate_2_age = dict(zip(full_screen_names, candidates_full["ikä"]))
candidate_2_sex = dict(zip(full_screen_names, candidates_full["sukupuoli"]))
candidate_2_hometown = dict(zip(full_screen_names, candidates_full["kotikunta"]))
candidate_2_lang = dict(zip(full_screen_names, candidates_full["kieli"]))

In [29]:
# Label every node with its screen name; nodes whose id is not a key of
# id_2_candidate get the placeholder "NA".
for node in G.nodes():
    G.nodes[node]["screen_name"] = id_2_candidate.get(node, "NA")

In [30]:
# Per-node attribute values keyed by node; "NA" marks nodes whose screen
# name is not a key of candidate_2_party.
party_attributes = {}
sex_attributes = {}
language_attributes = {}

for node, node_data in G.nodes(data=True):
    node_screen_name = node_data["screen_name"]

    if node_screen_name not in candidate_2_party:
        party_attributes[node] = sex_attributes[node] = language_attributes[node] = "NA"
        continue

    # rstrip() removes trailing whitespace from the party label
    party_attributes[node] = candidate_2_party[node_screen_name].rstrip()
    sex_attributes[node] = candidate_2_sex[node_screen_name]
    language_attributes[node] = candidate_2_lang[node_screen_name]

# Attach the collected values to the graph under their attribute names
for attribute_name, attribute_values in (
    ("party", party_attributes),
    ("sex", sex_attributes),
    ("language", language_attributes),
):
    nx.set_node_attributes(G, attribute_values, attribute_name)

In [31]:
# Write the whole graph G, including its node and edge attributes, to a GraphML file.
nx.write_graphml_lxml(G, "candidates-full.graphml")

In [37]:
# Connected components of the undirected version of G, largest first
Gcc = list(nx.connected_components(G.to_undirected()))
Gcc.sort(key=len, reverse=True)

# Subgraph of the directed graph restricted to the largest component
G0 = G.subgraph(Gcc[0])


In [45]:
# Write G0 (the subgraph of G on its largest connected component) to its own GraphML file.
nx.write_graphml_lxml(G0, "candidates-full-gc.graphml")