# 03 – Entropic Embeddings and Cognitive Distances
This notebook derives symbolic embeddings from the SWOW graph and connects them to the symbolic manifold $(\alpha, \kappa, E_r)$. It uses network embeddings, entropy measures, and topological proximity to operationalize cognitive curvature and symbolic unpredictability.

In [None]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pathlib, sys, os
import pickle
import warnings; warnings.filterwarnings("ignore")

from node2vec import Node2Vec        # pip install node2vec==0.4.6
from sklearn.decomposition import PCA
import seaborn as sns

# --- Paths ---
ROOT = pathlib.Path().resolve()
DATA = ROOT / "data"                 # adjust to match the actual project layout
GRAPH_PATH = DATA / "swow_graph.gpickle"

if not GRAPH_PATH.exists():
    raise FileNotFoundError(f"Grafo não encontrado em {GRAPH_PATH}")

# `nx.read_gpickle` was removed in NetworkX 3.0. It was a thin wrapper around
# `pickle.load`, so reading the file directly works on old and new versions.
# SECURITY: unpickling can execute arbitrary code -- only load graph files
# produced by a trusted step of this pipeline.
with open(GRAPH_PATH, "rb") as fh:
    G = pickle.load(fh)

# Fail early with a clear message if the pickle does not hold a graph.
if not isinstance(G, nx.Graph):
    raise TypeError(
        f"Expected a networkx graph in {GRAPH_PATH}, got {type(G).__name__}"
    )

print(f"Graph loaded: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

## Embedding the Graph with Node2Vec
We use Node2Vec to learn continuous feature representations for each node based on structural similarity and proximity.

In [None]:
# Node2Vec parameters tuned for 5k–100k nodes

# Use half of the available cores, but at least one. `os.cpu_count()` may
# return None when the count cannot be determined, so guard before dividing.
n_workers = max(1, (os.cpu_count() or 1) // 2)

# NOTE(review): no seed is set, so walks (and therefore embeddings) differ
# between runs -- consider fixing a seed if reproducibility matters.
node2vec = Node2Vec(
    G,
    dimensions=64,
    walk_length=40,
    num_walks=150,
    workers=n_workers,
    p=1,  # return param
    q=1,  # in–out param
)

model = node2vec.fit(window=10, min_count=1, batch_words=256)

# Embedding dataframe: one row per node, one column per embedding dimension.
# node2vec stores node ids as strings in its walks, so the trained vocabulary
# is keyed by `str(node)`. Looking up a raw integer node would instead be
# treated by gensim as a positional index and could return another node's
# vector, hence the explicit `str(...)` on both the key and the lookup.
embedding_df = pd.DataFrame.from_dict(
    {str(node): model.wv[str(node)] for node in G.nodes()},
    orient="index",
)
embedding_df.index = embedding_df.index.astype(str)  # guarantees string labels
embedding_df.head()

## Dimensionality Reduction
To visualize the embeddings and use them in symbolic models, we project them onto their first two principal components (2D) using PCA.

In [None]:
# Fit PCA on the embedding dimensions only. Those are the integer-named
# columns created by `DataFrame.from_dict`; derived columns added later
# ("x", "y", "E_r", ...) have string names. Selecting explicitly keeps this
# cell idempotent: re-running it no longer feeds the previous projection (or
# the entropy column) back into the PCA.
emb_cols = [c for c in embedding_df.columns if isinstance(c, (int, np.integer))]

pca = PCA(n_components=2, random_state=42)
coords = pca.fit_transform(embedding_df[emb_cols].to_numpy())
embedding_df[["x", "y"]] = coords

# Plot a random subsample (at most 5000 nodes) so the scatter stays readable.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=embedding_df.sample(n=min(len(embedding_df), 5000), random_state=1),
    x="x", y="y", alpha=0.4, linewidth=0, ax=ax,
)
ax.set_title("Node Embeddings – PCA (64 → 2)")
ax.axis("equal")
fig.tight_layout()
plt.show()

## Cognitive Symbolic Metrics (Curvature and Entropy)
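We estimate local entropy and divergence using the neighbors of each node. Here the local entropy $E_r$ of a node $v$ is the Shannon entropy (in bits) of the normalized degree distribution of its neighbors: $E_r(v) = -\sum_{u \in N(v)} p_u \log_2 p_u$, with $p_u = \deg(u) / \sum_{w \in N(v)} \deg(w)$.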

In [None]:
def local_entropy(node, graph=None):
    """Shannon entropy (in bits) of the degree distribution of a node's neighbors.

    Each neighbor ``u`` of ``node`` gets probability
    ``deg(u) / sum(deg(w) for w in neighbors)``; the function returns
    ``-sum(p * log2(p))`` over those probabilities.

    Parameters
    ----------
    node : hashable
        Node whose neighborhood is analysed.
    graph : optional
        Object exposing ``neighbors(node)`` and ``degree(node)``. Defaults to
        the notebook-level graph ``G`` when omitted.

    Returns
    -------
    float
        Entropy in bits; ``0.0`` when the node has no neighbors or when the
        neighbors carry no degree mass.
    """
    graph = G if graph is None else graph

    degrees = np.array(
        [graph.degree(neighbor) for neighbor in graph.neighbors(node)],
        dtype=float,
    )
    total = degrees.sum()
    if degrees.size == 0 or total <= 0:
        return 0.0

    probs = degrees / total
    # Drop zero-probability entries: 0 * log2(0) would evaluate to NaN.
    probs = probs[probs > 0]

    # max(0.0, ...) turns the "-0.0" produced for a single neighbor into 0.0.
    return max(0.0, float(-np.sum(probs * np.log2(probs))))

# `embedding_df` is indexed by *string* labels, while the graph may be keyed
# by other types (e.g. integers). Compute the entropy per graph node and join
# on the stringified label, so lookups never hit the graph with a key it does
# not contain. Labels without a matching node end up as NaN.
entropy_by_label = {str(node): local_entropy(node) for node in G.nodes()}
embedding_df["E_r"] = embedding_df.index.map(entropy_by_label)
embedding_df.head()

In [None]:
# Persist embeddings + entropy for use in the following notebooks.
OUT = DATA / "swow_embeddings_entropy.parquet"

# The embedding dimensions have integer column names (0, 1, ...), mixed with
# string names such as "E_r". Some pandas/pyarrow versions refuse to write
# non-string column names to Parquet, so stringify them on a renamed copy at
# write time; the in-memory frame is left untouched.
embedding_df.rename(columns=str).to_parquet(OUT)
print(f"Saved embeddings to {OUT}")