# Method Comparison: Co-occurrence vs Transformers

Compare classical co-occurrence networks with transformer-based similarity networks on the same dataset.

Sections:
1. Setup and Imports
2. Step 1: Basic Concepts
3. Step 2: Core Implementation (build both networks)
4. Step 3: Practical Examples (timings, samples)
5. Step 4: Visualization and Analysis (metrics, plots)

## Section 1: Setup and Imports

In [None]:
import time
from pathlib import Path
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

# Local imports
import sys
sys.path.insert(0, str(Path.cwd().parent))

from src.semantic.transformers_enhanced import TransformerSemanticNetwork

# Optional dependency: the co-occurrence builder may not be importable here.
# Start from None so later cells can detect its absence and skip that half of
# the comparison; a successful import below rebinds the name to the module.
cooccur = None
try:
    from src.semantic import build_semantic_network as cooccur
except Exception as e:
    print(f"Warning: co-occurrence module not available: {e}")

# Plot defaults applied to the figures drawn in this notebook
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Load the sample corpus, reusing the cached CSV when one already exists
sample_path = Path.cwd() / 'sample_news.csv'
if sample_path.exists():
    df = pd.read_csv(sample_path)
else:
    # No cached copy yet: make the working directory importable so the local
    # sample_data module can be found, generate the dataset, and cache it.
    sys.path.insert(0, str(Path.cwd()))
    from sample_data import generate_news_dataset
    df = generate_news_dataset(n_docs=200)
    df.to_csv(sample_path, index=False)

# The network builders below consume the raw document texts as a plain list
texts = df['text'].tolist()
print(f"Loaded {len(texts)} documents")

## Section 2: Step 1 — Basic Concepts

- Co-occurrence: connects terms that appear together within a shared context window; edges are weighted by PPMI (positive pointwise mutual information)
- Transformers: connects documents (or terms) whose embeddings have a high cosine similarity

## Section 3: Step 2 — Core Implementation

We will build:
- Transformer document similarity network
- Co-occurrence term network (if module available)

In [None]:
# Build transformer document network.
# The builder is constructed before the timer starts, so only the
# build_document_network() call itself is measured.
builder = TransformerSemanticNetwork()

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
edges_transformer = builder.build_document_network(
    documents=texts,
    # NOTE(review): presumably the minimum cosine similarity for an edge and the
    # maximum neighbours kept per document -- confirm against
    # TransformerSemanticNetwork.build_document_network.
    similarity_threshold=0.3,
    top_k=10,
)
elapsed_transformer = time.perf_counter() - start

print(f"Transformer network: {len(edges_transformer)} edges in {elapsed_transformer:.2f}s")

# Build co-occurrence network if available.
# Both names stay None when the build is skipped or fails; later cells check
# for None before using them.
edges_cooccur = None
elapsed_cooccur = None

if cooccur is not None:
    import tempfile

    # perf_counter() is monotonic, so the interval is immune to clock changes
    start = time.perf_counter()
    try:
        # Imported lazily: the DataFrame-based helper may be missing even when
        # the module itself imported, in which case we report it and move on.
        from src.semantic.build_semantic_network import build_semantic_from_df

        # The helper is handed an output directory and the edge list is read
        # back from edges.csv inside it. A TemporaryDirectory removes that
        # directory on exit (mkdtemp() would leave it behind on every run);
        # the edges are fully loaded into memory before the cleanup happens.
        with tempfile.TemporaryDirectory() as tmpdir:
            build_semantic_from_df(df, tmpdir, min_df=5, topk=20)
            edges_path = Path(tmpdir) / 'edges.csv'
            if edges_path.exists():
                edges_cooccur = pd.read_csv(edges_path)
                # Timing covers the build plus reading the edge list back
                elapsed_cooccur = time.perf_counter() - start
                print(f"Co-occurrence network: {len(edges_cooccur)} edges in {elapsed_cooccur:.2f}s")
            else:
                print("Co-occurrence edges.csv not found; skipping.")
    except Exception as e:
        # Deliberately best-effort: the comparison is optional, so report the
        # failure and let the notebook continue with the transformer network.
        print(f"Co-occurrence build failed: {e}")
else:
    print("Co-occurrence module unavailable; skipping.")

## Section 4: Step 3 — Practical Examples

Preview a few edges from each network, then build NetworkX graphs from the edge lists for the metrics computed in the next section.

In [None]:
# Show the first few rows of each edge list as a sanity check
print("Transformer edges sample:")
print(edges_transformer.head())

if edges_cooccur is not None:
    print("\nCo-occurrence edges sample:")
    print(edges_cooccur.head())

# Build simple graphs for metrics.
# Edges are added in bulk straight from the columns instead of row by row with
# DataFrame.iterrows(): iterrows() builds a Series per row (slow) and coerces
# each row to a single dtype, so integer node ids sitting next to a float
# weight would come back as floats.
G_trans = nx.Graph()
G_trans.add_weighted_edges_from(
    zip(
        edges_transformer['src'],
        edges_transformer['dst'],
        edges_transformer['weight'],
    )
)

# G_co is only defined when the co-occurrence build produced an edge list
if edges_cooccur is not None:
    G_co = nx.Graph()
    # Prefer the conventional column names; otherwise fall back to position
    # (first column = source, second = target, last = weight).
    src_col = 'source' if 'source' in edges_cooccur.columns else edges_cooccur.columns[0]
    dst_col = 'target' if 'target' in edges_cooccur.columns else edges_cooccur.columns[1]
    w_col = 'weight' if 'weight' in edges_cooccur.columns else edges_cooccur.columns[-1]
    # The edge attribute is always stored as 'weight', whatever the column is called
    G_co.add_weighted_edges_from(
        zip(edges_cooccur[src_col], edges_cooccur[dst_col], edges_cooccur[w_col])
    )

## Section 5: Step 4 — Visualization and Analysis

Compute and compare simple metrics and visualizations.

In [None]:
def summarize_graph(G):
    """Return basic size and connectivity statistics for graph ``G``.

    The result is a dict with the keys ``nodes``, ``edges``, ``density``,
    ``avg_degree`` and ``components``. ``avg_degree`` is the mean of the
    node degrees, or 0 when the graph has no nodes.
    """
    node_count = G.number_of_nodes()

    # np.mean of an empty list would warn and yield NaN, so guard the empty graph
    if node_count:
        mean_degree = np.mean([deg for _node, deg in G.degree()])
    else:
        mean_degree = 0

    stats = {}
    stats['nodes'] = node_count
    stats['edges'] = G.number_of_edges()
    stats['density'] = nx.density(G)
    stats['avg_degree'] = mean_degree
    stats['components'] = nx.number_connected_components(G)
    return stats

# Summary statistics for each graph that was built
summary_trans = summarize_graph(G_trans)
print("Transformer graph:", summary_trans)

if edges_cooccur is not None:
    summary_co = summarize_graph(G_co)
    print("Co-occurrence graph:", summary_co)

# Degree distributions on a 1x2 grid: transformer graph on the left,
# co-occurrence graph on the right (right panel only when that graph exists)
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.hist(
    [deg for _node, deg in G_trans.degree()],
    bins=20,
    color='skyblue',
    edgecolor='black',
)
plt.title('Transformer Degree Distribution')

if edges_cooccur is not None:
    plt.subplot(1, 2, 2)
    plt.hist(
        [deg for _node, deg in G_co.degree()],
        bins=20,
        color='lightcoral',
        edgecolor='black',
    )
    plt.title('Co-occurrence Degree Distribution')

plt.tight_layout()
plt.show()

# Build times; elapsed_cooccur is None when the co-occurrence build did not complete
print("\nTiming Summary:")
print(f"Transformer network build time: {elapsed_transformer:.2f}s")
if elapsed_cooccur is None:
    print("Co-occurrence network build time: N/A")
else:
    print(f"Co-occurrence network build time: {elapsed_cooccur:.2f}s")