In [2]:
import networkx as nx
import json
from datetime import datetime

def criar_arquivo_gephi_temporal(caminho_arquivo_jsonl, arquivo_saida_gexf):
    """
    Cria um arquivo .gexf temporal para o Gephi.
    Nós = usuários; Arestas = interações (menções/respostas) com stance e tempo.
    """
    G = nx.DiGraph()
    # Grafo dinâmico + formato de tempo em datetime ISO
    G.graph["mode"] = "dynamic"
    G.graph["timeformat"] = "datetime"  # Gephi lê 'start'/'end' como strings ISO

    def to_iso_z(ts: str) -> str:
        # Aceita "....Z" ou com offset; zera microssegundos e força 'Z' (UTC)
        dt = datetime.fromisoformat(ts.replace("Z", "+00:00"))
        dt = dt.replace(microsecond=0)
        # Gephi lida bem com 'Z'
        return dt.isoformat().replace("+00:00", "Z")

    with open(caminho_arquivo_jsonl, "r", encoding="utf-8") as f:
        for line in f:
            try:
                tweet = json.loads(line)
                autor = tweet.get("user")
                stance = tweet.get("stance")
                timestamp_str = tweet.get("created_at_iso")
                if not autor or not stance or not timestamp_str:
                    continue

                valid_timestamp = to_iso_z(timestamp_str)

                # garante existência de nó do autor
                G.add_node(autor)

                # coletar alvos da interação (menções + reply-to)
                mencoes = [m.get("username") for m in tweet.get("mentions", []) if m.get("username")]
                usuario_respondido = tweet.get("in_reply_to_user")
                if usuario_respondido:
                    mencoes.append(usuario_respondido)

                # criar arestas com tempo e stance
                for mencionado in set(mencoes):
                    if not mencionado:
                        continue
                    G.add_node(mencionado)
                    # start=end => evento instantâneo naquele timestamp
                    G.add_edge(
                        autor,
                        mencionado,
                        stance=stance,
                        start=valid_timestamp,
                        end=valid_timestamp,
                    )
            except (json.JSONDecodeError, KeyError, ValueError) as e:
                print(f"Erro ao processar linha: {line.strip()}. Erro: {e}")
                continue

    print(f"Grafo temporal criado com {G.number_of_nodes()} nós e {G.number_of_edges()} arestas.")

    try:
        # Deixe no default (1.2draft) ou especifique '1.2draft'
        nx.write_gexf(G, arquivo_saida_gexf)  # ou version='1.2draft'
        print(f"Arquivo '{arquivo_saida_gexf}' criado com sucesso!")
    except Exception as e:
        print(f"Ocorreu um erro ao salvar o arquivo: {e}")


In [4]:
# --- Como usar a função ---
# Certifique-se que este arquivo existe e contém os dados no formato esperado
arquivo_de_tweets_classificados = 'tweets_classified_wagner_schwartz.jsonl' 
nome_do_arquivo_gephi_temporal = 'wagner_schwartz_temporal.gexf'

criar_arquivo_gephi_temporal(arquivo_de_tweets_classificados, nome_do_arquivo_gephi_temporal)

Grafo temporal criado com 1098 nós e 848 arestas.
Arquivo 'eduardo_bueno_temporal.gexf' criado com sucesso!


In [5]:

# network_builder.py
# Build a user-user interaction network (reply / retweet / mention) from your JSONL
# and export to GEXF for Gephi (with stance + metrics on nodes and edge weights).

import json
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import networkx as nx
import pandas as pd


# -----------------------------
# Config
# -----------------------------
INPUT_JSONL = 'tweets_classified_wagner_schwartz.jsonl' # adjust path if needed
OUTPUT_GEXF = 'wagner_schwartz_temporal.gexf'
OUTPUT_CSV_NODES = "network_nodes.csv"
OUTPUT_CSV_EDGES = "network_edges.csv"

# Choose which interactions to include as edges
INCLUDE_REPLIES = True
INCLUDE_RETWEETS = True
INCLUDE_MENTIONS = True
INCLUDE_QUOTES = True   # will only be used if we have a quoted user field

# Edge weighting strategy: how much to weight each interaction type
W_REPLY = 1.0
W_RETWEET = 1.0
W_MENTION = 0.5
W_QUOTE = 1.0


# -----------------------------
# Load
# -----------------------------
def load_jsonl(path):
    recs = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                recs.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return pd.DataFrame(recs)


df = load_jsonl(INPUT_JSONL)

# Normalize/ensure expected columns exist
for col in [
    "user", "stance", "is_retweet", "is_quote", "in_reply_to_user",
    "mentions", "created_at_iso", "like_count", "retweet_count",
    "reply_count", "quote_count", "id"
]:
    if col not in df.columns:
        if col == "mentions":
            df[col] = [[] for _ in range(len(df))]
        else:
            df[col] = None

# Parse datetime
if "created_at_iso" in df.columns:
    df["created_at_iso"] = pd.to_datetime(df["created_at_iso"], errors="coerce")


# -----------------------------
# Build directed graph
# -----------------------------
G = nx.DiGraph()

# Mark dynamic/time metadata so Gephi recognizes temporal attributes (optional)
G.graph["mode"] = "dynamic"
G.graph["timeformat"] = "datetime"

# Aggregate edge weights before adding to graph (reduces duplicates)
edge_accumulator = defaultdict(lambda: {"weight": 0.0, "types": defaultdict(int)})

def add_user_if_needed(u, stance=None):
    if not G.has_node(u):
        G.add_node(u)
        if stance:
            G.nodes[u]["stance"] = stance

def add_edge(u, v, w, etype, time=None):
    key = (u, v)
    edge_accumulator[key]["weight"] += w
    edge_accumulator[key]["types"][etype] += 1
    # Optional: keep earliest interaction time for temporal start
    if time is not None:
        if "start" not in edge_accumulator[key]:
            edge_accumulator[key]["start"] = time
        else:
            # keep the earliest
            edge_accumulator[key]["start"] = min(edge_accumulator[key]["start"], time)

for _, row in df.iterrows():
    u = row.get("user")
    if not u:
        continue

    stance = row.get("stance") or "neutro"
    add_user_if_needed(u, stance=stance)

    t = row.get("created_at_iso")

    # Replies (u -> replied_user)
    if INCLUDE_REPLIES and row.get("in_reply_to_user"):
        v = row["in_reply_to_user"]
        if v:
            add_user_if_needed(v)  # stance unknown for targets we haven't seen tweeting
            add_edge(u, v, W_REPLY, "reply", time=t)

    # Retweets
    # NOTE: In some datasets, the original author isn't provided. If your JSONL
    # encodes the original author elsewhere (e.g., "retweeted_user"), swap in here.
    if INCLUDE_RETWEETS and row.get("is_retweet") and row.get("in_reply_to_user"):
        v = row["in_reply_to_user"]
        if v:
            add_user_if_needed(v)
            add_edge(u, v, W_RETWEET, "retweet", time=t)

    # Mentions (u -> each mentioned @user)
    if INCLUDE_MENTIONS and isinstance(row.get("mentions"), list):
        for v in row["mentions"]:
            if v and v != u:
                add_user_if_needed(v)
                add_edge(u, v, W_MENTION, "mention", time=t)

    # Quotes (if you have a quoted user field; many datasets don’t—kept here for completeness)
    # Example placeholder: row.get("quoted_user")
    quoted_user = row.get("quoted_user")
    if INCLUDE_QUOTES and row.get("is_quote") and quoted_user:
        v = quoted_user
        add_user_if_needed(v)
        add_edge(u, v, W_QUOTE, "quote", time=t)

# Add nodes’ numeric activity/engagement aggregates
user_group = df.groupby("user", dropna=True)
for u, g in user_group:
    if not G.has_node(u):
        continue
    G.nodes[u]["tweets"] = int(len(g))
    G.nodes[u]["likes_sum"] = int(g["like_count"].fillna(0).sum())
    G.nodes[u]["retweets_sum"] = int(g["retweet_count"].fillna(0).sum())
    G.nodes[u]["replies_sum"] = int(g["reply_count"].fillna(0).sum())
    G.nodes[u]["quotes_sum"] = int(g["quote_count"].fillna(0).sum())
    # (Re)assign stance as the mode for users with mixed stances
    mode_stance = g["stance"].dropna().mode()
    if len(mode_stance):
        G.nodes[u]["stance"] = mode_stance.iloc[0]

# Materialize edges in the graph
for (u, v), info in edge_accumulator.items():
    G.add_edge(u, v,
               weight=float(info["weight"]),
               reply_count=int(info["types"].get("reply", 0)),
               retweet_count=int(info["types"].get("retweet", 0)),
               mention_count=int(info["types"].get("mention", 0)),
               quote_count=int(info["types"].get("quote", 0)),
               # temporal start for edge (first observed interaction)
               start=info.get("start").isoformat() if isinstance(info.get("start"), pd.Timestamp) else None
               )

print(f"Graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")


# -----------------------------
# Centralities (on the directed graph)
# -----------------------------
# Use weight for degree-like metrics; for PR, weight is a 'weight' edge attribute
in_deg = dict(G.in_degree(weight="weight"))
out_deg = dict(G.out_degree(weight="weight"))
nx.set_node_attributes(G, in_deg, "in_strength")
nx.set_node_attributes(G, out_deg, "out_strength")

try:
    pr = nx.pagerank(G, weight="weight", alpha=0.85, max_iter=100)
    nx.set_node_attributes(G, pr, "pagerank")
except nx.PowerIterationFailedConvergence:
    print("PageRank didn't converge; skipping.")

btw = nx.betweenness_centrality(G, weight="weight", normalized=True, k=None)
nx.set_node_attributes(G, btw, "betweenness")


# -----------------------------
# Export: GEXF + CSVs for quick inspection
# -----------------------------
# Gephi sometimes complains if None appears in attributes; clean them
for n, data in G.nodes(data=True):
    for k, v in list(data.items()):
        if v is None:
            G.nodes[n][k] = ""

for u, v, data in G.edges(data=True):
    for k, v2 in list(data.items()):
        if v2 is None:
            G.edges[u, v][k] = ""

nx.write_gexf(G, OUTPUT_GEXF)
print(f"Saved GEXF → {OUTPUT_GEXF}")

# Also dump flat CSVs
nodes_rows = []
for n, d in G.nodes(data=True):
    row = {"user": n}
    row.update(d)
    nodes_rows.append(row)

edges_rows = []
for u, v, d in G.edges(data=True):
    row = {"source": u, "target": v}
    row.update(d)
    edges_rows.append(row)

pd.DataFrame(nodes_rows).to_csv(OUTPUT_CSV_NODES, index=False)
pd.DataFrame(edges_rows).to_csv(OUTPUT_CSV_EDGES, index=False)
print(f"Saved nodes CSV → {OUTPUT_CSV_NODES}")
print(f"Saved edges CSV → {OUTPUT_CSV_EDGES}")


TypeError: unhashable type: 'dict'