# Graph Basics & Connections

This notebook demonstrates:
1. **Setup & Connection** - Connect to the CodeNav backend API
2. **Graph Statistics** - Explore the analyzed codebase structure
3. **Node Exploration** - Export nodes and links with test/stdlib filtering, then list entry points and cross-language seams
4. **NetworkX Analysis** - Build in-memory graphs for analysis
5. **Degree Analysis** - Identify hubs and leaf nodes

- **Backend**: CodeNav API at http://localhost:8000 by default (override with the `CODENAV_API_URL` environment variable)
- **Analysis**: Real-time code graph queries with filtering

## Section 1: Setup & Connection

Import libraries and connect to the CodeNav backend API.

In [None]:
# Import required libraries
import sys
from pathlib import Path

# Data science libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from collections import defaultdict

# Add utils to path
sys.path.insert(0, str(Path.cwd() / 'utils'))
from graph_client import GraphClient

# Configure visualization
plt.style.use('dark_background')
%matplotlib inline

print("‚úÖ Libraries imported successfully")

In [None]:
# Initialize client and connect to CodeNav API
# Uses CODENAV_API_URL env var or defaults to localhost:8000
# NOTE(review): that lookup would happen inside GraphClient (imported from the
# utils directory), which is not visible in this notebook — confirm.
# `client` is reused by every later cell, so run this cell before any of them.
client = GraphClient()
# Top-level `await` relies on the notebook kernel executing cells in an async
# context (IPython autoawait); it is not valid in a plain Python script.
await client.connect()

## Section 2: Graph Statistics

Fetch graph statistics from the backend API to understand the codebase structure.

In [None]:
# Fetch graph statistics
stats = await client.get_stats()

print("üìä Graph Statistics:")
print(f"   ‚Ä¢ Total Nodes: {stats.total_nodes}")
print(f"   ‚Ä¢ Total Relationships: {stats.total_relationships}")
print(f"   ‚Ä¢ Seam Count: {stats.seam_count}")

# Show language distribution
if stats.languages:
    print("\nüìä Languages:")
    for lang, count in stats.languages.items():
        print(f"   ‚Ä¢ {lang}: {count} nodes")
    
    # Visualize
    plt.figure(figsize=(10, 5))
    plt.bar(stats.languages.keys(), stats.languages.values(), color='#4F46E5')
    plt.xlabel('Language', fontsize=12)
    plt.ylabel('Node Count', fontsize=12)
    plt.title('Node Distribution by Language', fontsize=14, fontweight='bold')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Show node type distribution
if stats.node_types:
    print("\nüìä Node Types:")
    for ntype, count in sorted(stats.node_types.items(), key=lambda x: -x[1]):
        print(f"   ‚Ä¢ {ntype}: {count}")

In [None]:
# Export graph with filtering (exclude tests and stdlib for cleaner view)
data = await client.export_graph(
    exclude_stdlib=True,
    exclude_tests=True,
    include_private=True,
    limit=5000
)

# Convert to DataFrames
nodes_df = pd.DataFrame(data.get('nodes', []))
links_df = pd.DataFrame(data.get('links', []))

print(f"üì¶ Exported Graph:")
print(f"   ‚Ä¢ Nodes: {len(nodes_df)}")
print(f"   ‚Ä¢ Links: {len(links_df)}")

# Show filter stats if available
filter_stats = data.get('stats', {}).get('filterStats', {})
if filter_stats:
    print(f"\nüîç Filter Statistics:")
    print(f"   ‚Ä¢ Filtered by tests: {filter_stats.get('filtered_by_tests', 0)}")
    print(f"   ‚Ä¢ Filtered by stdlib: {filter_stats.get('filtered_by_stdlib', 0)}")

# Show first few nodes
if not nodes_df.empty:
    print("\nüìã Sample Nodes:")
    display_cols = [c for c in ['name', 'type', 'language', 'file'] if c in nodes_df.columns]
    print(nodes_df[display_cols].head(10).to_string())

## Section 3: Entry Points & Seams

Explore detected entry points (CLI commands, HTTP handlers) and cross-language seams.

In [None]:
# Get entry points
entry_points = await client.get_entry_points(limit=20)

print("üöÄ Entry Points (Top-level handlers):\n")
if entry_points:
    for i, ep in enumerate(entry_points[:10], 1):
        name = ep.get('name', 'unknown')
        ep_type = ep.get('type', ep.get('entry_type', 'unknown'))
        file_path = ep.get('file', ep.get('location', {}).get('file_path', ''))
        print(f"{i:2d}. [{ep_type}] {name}")
        if file_path:
            print(f"       üìÅ {file_path}")
    print(f"\n‚úÖ Found {len(entry_points)} entry points")
else:
    print("No entry points detected")

In [None]:
# Get cross-language seams
seams = await client.get_seams(limit=20)

print("ü™° Cross-Language Seams:\n")
if seams:
    for i, seam in enumerate(seams[:10], 1):
        source = seam.get('source', seam.get('source_name', 'unknown'))
        target = seam.get('target', seam.get('target_name', 'unknown'))
        seam_type = seam.get('type', seam.get('seam_type', 'seam'))
        print(f"{i:2d}. {source} ‚Üî {target} [{seam_type}]")
    print(f"\n‚úÖ Found {len(seams)} seams")
else:
    print("No cross-language seams detected (single-language codebase)")

## Section 4: Building NetworkX Graph

Create an in-memory NetworkX graph for local analysis and visualization.

In [None]:
# Build NetworkX graph (excludes tests and stdlib by default)
print("üî® Building NetworkX graph...\n")

G = await client.build_networkx_graph(
    exclude_stdlib=True,
    exclude_tests=True,
    directed=True
)

print(f"‚úÖ Graph Statistics:")
print(f"   ‚Ä¢ Nodes: {G.number_of_nodes()}")
print(f"   ‚Ä¢ Edges: {G.number_of_edges()}")
print(f"   ‚Ä¢ Density: {nx.density(G):.4f}")

# Check connectivity
if G.number_of_nodes() > 0:
    weakly_connected = nx.number_weakly_connected_components(G)
    print(f"   ‚Ä¢ Weakly Connected Components: {weakly_connected}")
    
    # Largest component
    largest_cc = max(nx.weakly_connected_components(G), key=len)
    print(f"   ‚Ä¢ Largest Component Size: {len(largest_cc)} nodes")

In [None]:
print("üìà Degree Analysis\n")

if G.number_of_nodes() > 0:
    # In-degree distribution (how many functions call each function)
    in_degrees = dict(G.in_degree())
    top_called = sorted(in_degrees.items(), key=lambda x: x[1], reverse=True)[:10]

    print("Top 10 Most Called (Highest In-Degree):")
    for func, degree in top_called:
        name = G.nodes[func].get('name', func)
        print(f"   {name}: {degree} callers")

    # Out-degree distribution (how many functions each function calls)
    out_degrees = dict(G.out_degree())
    top_callers = sorted(out_degrees.items(), key=lambda x: x[1], reverse=True)[:10]

    print("\nTop 10 Most Calls (Highest Out-Degree):")
    for func, degree in top_callers:
        name = G.nodes[func].get('name', func)
        print(f"   {name}: {degree} calls")

    # Visualize degree distributions
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # In-degree histogram
    in_degree_values = [v for v in in_degrees.values() if v > 0]
    if in_degree_values:
        axes[0].hist(in_degree_values, bins=30, color='#4F46E5', alpha=0.7, edgecolor='white')
        axes[0].set_xlabel('In-Degree (Callers)', fontsize=11)
        axes[0].set_ylabel('Count', fontsize=11)
        axes[0].set_title('In-Degree Distribution', fontsize=12, fontweight='bold')
        axes[0].set_yscale('log')

    # Out-degree histogram
    out_degree_values = [v for v in out_degrees.values() if v > 0]
    if out_degree_values:
        axes[1].hist(out_degree_values, bins=30, color='#EC4899', alpha=0.7, edgecolor='white')
        axes[1].set_xlabel('Out-Degree (Callees)', fontsize=11)
        axes[1].set_ylabel('Count', fontsize=11)
        axes[1].set_title('Out-Degree Distribution', fontsize=12, fontweight='bold')
        axes[1].set_yscale('log')

    plt.tight_layout()
    plt.show()

    print(f"\n‚úÖ Analysis complete")
else:
    print("‚ö†Ô∏è  Graph is empty")

## Summary

This notebook demonstrated:

1. ✅ **Connection Setup** - Connected to CodeNav API using modern async/await
2. ✅ **Graph Statistics** - Explored node counts, languages, and types
3. ✅ **Filtering** - Used exclude_tests and exclude_stdlib for cleaner graphs
4. ✅ **Entry Points** - Found CLI commands, HTTP handlers
5. ✅ **NetworkX Analysis** - Built in-memory graph for analysis
6. ✅ **Degree Analysis** - Identified hubs and leaf nodes

### Next Notebooks:
- **02**: Centrality Analysis (PageRank, betweenness)
- **03**: Community Detection (Louvain, module boundaries)
- **04**: Architectural Patterns (god objects, cycles, seams)
- **05**: Ontology Extraction (domain vocabulary)
- **06**: C4 Diagram Generation