# Graph Basics & Connections

This notebook demonstrates:
1. **Setup & Connection Management** - Initialize Redis, Memgraph, and Backend API
2. **Loading Graph Data** - Fetch graph statistics and nodes from the Backend API (Redis is connected here, but its CDC stream is not consumed in this notebook)
3. **Basic Cypher Queries** - Execute queries against Memgraph
4. **Graph Statistics & Visualization** - Build a NetworkX graph and explore the code graph structure

**Data Source**: Code-graph-mcp repository (489 nodes, 4475 edges)
**Analysis Type**: Real-time event-driven graph updates

## Section 1: Setup & Connection Management

Initialize connections to all three backends: Redis (CDC events), Memgraph (graph queries), and HTTP API (metadata).

In [None]:
# Import required libraries
import os
import sys
import asyncio
import json
from typing import Dict, List, Optional, Any
from dataclasses import dataclass
from datetime import datetime

# Data science libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from collections import defaultdict

# Async HTTP and database clients
import httpx
import redis.asyncio as redis
from neo4j import GraphDatabase
from neo4j.exceptions import ServiceUnavailable

# Reaching this line means every import above resolved.
print("‚úÖ Libraries imported successfully")

# Configure visualization
# Apply matplotlib's 'dark_background' style to the figures drawn below.
plt.style.use('dark_background')
# IPython magic (not plain Python): render figures inline in the cell output.
%matplotlib inline

print("‚úÖ Visualization configured")

In [None]:
# Make the shared notebook utilities importable; index 0 means this
# directory is searched before every other entry on the import path.
sys.path[:0] = ['/home/jovyan/work/utils']

# Service endpoints: an environment variable wins, otherwise the
# hard-coded fallback URL is used.
REDIS_URL = os.environ.get('REDIS_URL', 'redis://redis:6379')
MEMGRAPH_URL = os.environ.get('MEMGRAPH_URL', 'bolt://memgraph:7687')
BACKEND_API_URL = os.environ.get('BACKEND_API_URL', 'http://code-graph-http:8000')

# Echo the effective endpoints, one per line.
print(
    f"üìå Redis: {REDIS_URL}",
    f"üìå Memgraph: {MEMGRAPH_URL}",
    f"üìå Backend API: {BACKEND_API_URL}",
    sep="\n",
)

In [None]:
# Helper class for managing connections
class GraphConnections:
    """Manages connections to Redis, Memgraph, and Backend API.

    Redis and Memgraph are treated as optional: ``connect()`` reports a
    failure on stdout and leaves the matching attribute as ``None`` instead
    of raising.

    Attributes:
        redis_client: async Redis client, or ``None`` when not connected.
        memgraph_driver: Bolt driver, or ``None`` when not connected.
        http_client: ``httpx.AsyncClient`` bound to ``api_url``, or ``None``
            before ``connect()`` / after ``close()``.
    """

    def __init__(self, redis_url: str, memgraph_url: str, api_url: str):
        """Store the endpoints; no connection is opened here."""
        self.redis_url = redis_url
        self.memgraph_url = memgraph_url
        self.api_url = api_url

        self.redis_client = None
        self.memgraph_driver = None
        self.http_client = None

    async def connect(self):
        """Establish all connections.

        Each backend is probed independently (Redis ``PING``, Memgraph
        ``RETURN 1``). A backend that fails its probe is closed again and its
        attribute reset to ``None`` so later code can rely on
        "attribute is set" meaning "connection verified".
        """
        try:
            # Redis connection
            self.redis_client = await redis.from_url(self.redis_url, decode_responses=True)
            await self.redis_client.ping()
            print("‚úÖ Redis connected")
        except Exception as e:
            print(f"‚ö†Ô∏è  Redis connection failed: {e}")
            # Do not keep a client whose ping failed.
            await self._discard_redis_client()

        try:
            # Memgraph connection
            self.memgraph_driver = GraphDatabase.driver(self.memgraph_url)
            with self.memgraph_driver.session() as session:
                result = session.run("RETURN 1")
                result.consume()
            print("‚úÖ Memgraph connected")
        except ServiceUnavailable as e:
            print(f"‚ö†Ô∏è  Memgraph not available: {e}")
            self._discard_memgraph_driver()
        except Exception as e:
            print(f"‚ö†Ô∏è  Memgraph connection failed: {e}")
            self._discard_memgraph_driver()

        # HTTP client: only constructed here, the API itself is not probed.
        self.http_client = httpx.AsyncClient(base_url=self.api_url)
        print("‚úÖ HTTP client ready")

    async def close(self):
        """Close all connections.

        Every backend is closed even if an earlier one fails to close, and
        all attributes are reset to ``None``, so calling ``close()`` again is
        a no-op apart from the confirmation message.
        """
        await self._discard_redis_client()
        self._discard_memgraph_driver()

        client, self.http_client = self.http_client, None
        if client is not None:
            try:
                await client.aclose()
            except Exception as e:
                print(f"HTTP client close failed: {e}")
        print("‚úÖ All connections closed")

    async def _discard_redis_client(self):
        """Close the Redis client (best effort) and reset it to ``None``."""
        client, self.redis_client = self.redis_client, None
        if client is None:
            return
        # Prefer aclose() where the client offers it; otherwise fall back to
        # the close() coroutine the original code relied on.
        closer = getattr(client, "aclose", None) or client.close
        try:
            await closer()
        except Exception as e:
            print(f"Redis close failed: {e}")

    def _discard_memgraph_driver(self):
        """Close the Memgraph driver (best effort) and reset it to ``None``."""
        driver, self.memgraph_driver = self.memgraph_driver, None
        if driver is None:
            return
        try:
            driver.close()
        except Exception as e:
            print(f"Memgraph close failed: {e}")

# Initialize connections
# Module-level handle read by the helper functions in later cells.
connections = GraphConnections(REDIS_URL, MEMGRAPH_URL, BACKEND_API_URL)

# For Jupyter, we need to handle async differently
# A dedicated event loop is created and installed as the current loop; later
# cells reuse `loop` to drive their coroutines to completion.
# NOTE(review): run_until_complete() raises RuntimeError when another event
# loop is already running in this thread — confirm this works on the target kernel.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(connections.connect())

# Printed unconditionally: connect() reports Redis/Memgraph failures on
# stdout instead of raising, so this line does not prove they are reachable.
print("\n‚úÖ All systems initialized!")

## Section 2: Loading Graph Data & Statistics

Fetch basic graph statistics from the backend API. This shows the current state of the analyzed codebase.

In [None]:
async def get_graph_stats():
    """Fetch graph statistics from backend API.

    Returns:
        The decoded JSON body of ``GET /api/graph/stats``, or ``None`` when
        the request fails, the server answers with an error status, or the
        body is not valid JSON. The error is printed, not raised.
    """
    try:
        response = await connections.http_client.get('/api/graph/stats')
        # httpx does not raise on 4xx/5xx by itself; without this check an
        # error payload would be returned as if it were real statistics.
        response.raise_for_status()
        return response.json()
    except Exception as e:
        print(f"Error fetching stats: {e}")
        return None

# Fetch and display stats
stats = loop.run_until_complete(get_graph_stats())

if stats:
    print("üìä Graph Statistics:")
    print(f"   ‚Ä¢ Nodes: {stats.get('total_nodes', 0)}")
    print(f"   ‚Ä¢ Relationships: {stats.get('total_relationships', 0)}")
    # `or []` covers a null value; str() keeps join() from failing on a
    # non-string entry.
    languages = stats.get('languages') or []
    print(f"   ‚Ä¢ Languages: {', '.join(str(lang) for lang in languages)}")
    print(f"   ‚Ä¢ Entry Points: {stats.get('entry_points', 0)}")

    # Visualize node distribution by language (skipped when the breakdown is
    # missing, null or empty, so no blank chart is drawn).
    lang_dist = stats.get('nodes_by_language')
    if lang_dist:
        plt.figure(figsize=(10, 6))
        plt.bar(list(lang_dist.keys()), list(lang_dist.values()), color='#4F46E5')
        plt.xlabel('Language', fontsize=12)
        plt.ylabel('Node Count', fontsize=12)
        plt.title('Node Distribution by Language', fontsize=14, fontweight='bold')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    # Confirmation depends on the stats being loaded, not on the chart.
    print(f"\n‚úÖ Graph loaded with {stats.get('total_nodes', 0)} nodes")

In [None]:
async def get_all_nodes(limit: int = 1000):
    """Fetch nodes from the graph via the backend search endpoint.

    Args:
        limit: Value sent as the ``limit`` query parameter. The default
            matches the previously hard-coded 1000.
            NOTE(review): presumably the backend truncates larger graphs to
            this many rows — confirm.

    Returns:
        A DataFrame built from the ``results`` list of the JSON response.
        An empty DataFrame is returned when the request fails, the server
        answers with an error status, or the body cannot be decoded; the
        error is printed, not raised.
    """
    try:
        response = await connections.http_client.get(
            '/api/graph/nodes/search', params={'limit': limit}
        )
        # Surface 4xx/5xx instead of parsing an error body as node data.
        response.raise_for_status()
        data = response.json()
        return pd.DataFrame(data.get('results', []))
    except Exception as e:
        print(f"Error fetching nodes: {e}")
        return pd.DataFrame()

# Fetch nodes
nodes_df = loop.run_until_complete(get_all_nodes())

if not nodes_df.empty:
    print(f"Loaded {len(nodes_df)} nodes")
    print("\nFirst 10 nodes:")
    # Only select columns the payload actually contains, so a missing field
    # does not abort the cell with KeyError.
    preview_columns = [
        column
        for column in ('name', 'type', 'file', 'language')
        if column in nodes_df.columns
    ]
    print(nodes_df.head(10)[preview_columns].to_string())

    # Show node type distribution
    if 'type' in nodes_df.columns:
        print("\n\nNode Type Distribution:")
        print(nodes_df['type'].value_counts())

## Section 3: Basic Cypher Queries (via Memgraph)

If Memgraph is available, execute Cypher queries to explore the graph structure.

In [None]:
def run_cypher_query(query: str, params: Dict = None) -> List[Dict]:
    """Run one Cypher statement on Memgraph and return its rows.

    Args:
        query: Cypher text to execute.
        params: Optional query parameters; a falsy value is sent as ``{}``.

    Returns:
        One dict per result record (``record.data()``). An empty list is
        returned when no Memgraph driver is available or when the query
        raises; both cases print a message instead of raising.
    """
    driver = connections.memgraph_driver
    if not driver:
        print("‚ö†Ô∏è  Memgraph not available")
        return []

    bindings = params if params else {}
    try:
        with driver.session() as session:
            rows = []
            # Records are materialized while the session is still open.
            for record in session.run(query, bindings):
                rows.append(record.data())
            return rows
    except Exception as e:
        print(f"Query error: {e}")
        return []

# Query 1: Entry points (top-level functions)
print("üîç Query 1: Entry Points (Top-level functions)\n")

# At most 20 Function nodes whose is_entry_point property is true.
query1 = """
MATCH (f:Function {is_entry_point: true})
RETURN f.name as name, f.file as file, f.language as language
LIMIT 20
"""
results = run_cypher_query(query1)
if not results:
    # An empty list means either no match or no usable Memgraph connection.
    print("No results or Memgraph unavailable")
else:
    entry_points_df = pd.DataFrame(results)
    print(entry_points_df.to_string())
    print(f"\nFound {len(entry_points_df)} entry points")

In [None]:
print("\n" + "="*60)
print("üîç Query 2: Function Hubs (Most called functions)\n")

# Functions with more than two incoming CALLS relationships, highest count
# first, capped at 15 rows.
query2 = """
MATCH (f:Function)<-[:CALLS]-(callers)
WITH f, count(callers) as caller_count
WHERE caller_count > 2
RETURN f.name as name, f.file as file, caller_count
ORDER BY caller_count DESC
LIMIT 15
"""
results = run_cypher_query(query2)
if not results:
    # An empty list means either no match or no usable Memgraph connection.
    print("No results or Memgraph unavailable")
else:
    hubs_df = pd.DataFrame(results)
    print(hubs_df.to_string())

    # Visualize top hubs: horizontal bars for the first ten rows.
    top_10 = hubs_df.iloc[:10]
    plt.figure(figsize=(12, 6))
    plt.barh(top_10['name'], top_10['caller_count'], color='#EC4899')
    plt.xlabel('Number of Callers', fontsize=12)
    plt.title('Top 10 Function Hubs (Most Called)', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()
    print(f"\n‚úÖ Found {len(results)} function hubs")

In [None]:
print("\n" + "="*60)
print("üîç Query 3: Call Chain Analysis\n")

# Find a function and trace its callers
if not nodes_df.empty:
    # Collect candidate function names defensively: the payload may lack the
    # columns, or contain no rows of type 'Function'.
    if {'type', 'name'}.issubset(nodes_df.columns):
        function_names = nodes_df.loc[nodes_df['type'] == 'Function', 'name'].tolist()
    else:
        function_names = []

    if not function_names:
        print("No Function nodes available for call chain analysis")
    else:
        sample_function = function_names[0]

        # The name is passed as the $name query parameter instead of being
        # spliced into the Cypher text, so quotes or backslashes in a
        # function name can neither break nor alter the query.
        query3 = """
        MATCH path = (caller:Function)-[:CALLS*1..3]->(:Function {name: $name})
        RETURN
            length(path) as distance,
            [node in nodes(path) | node.name] as call_path
        ORDER BY distance
        LIMIT 10
        """

        print(f"Analyzing callers of function: {sample_function}\n")
        results = run_cypher_query(query3, {'name': sample_function})

        if results:
            # Up to 10 paths are fetched; only the 5 shortest are printed.
            for i, result in enumerate(results[:5], 1):
                # str() guards against nodes without a name property.
                path = " ‚Üí ".join(str(name) for name in result['call_path'])
                print(f"{i}. [{result['distance']} hops] {path}")
        else:
            print("No call paths found or Memgraph unavailable")

## Section 4: Building NetworkX Graph from Backend Data

Create an in-memory NetworkX graph for fast local analysis and visualization.

In [None]:
async def get_all_relationships(limit: int = 5000):
    """Fetch relationships from the graph via the backend API.

    Args:
        limit: Value sent as the ``limit`` query parameter. The default
            matches the previously hard-coded 5000.
            NOTE(review): presumably the backend truncates larger graphs to
            this many rows — confirm.

    Returns:
        The ``results`` list of the JSON response. An empty list is returned
        when the request fails, the server answers with an error status, or
        the body cannot be decoded; the error is printed, not raised.
    """
    try:
        response = await connections.http_client.get(
            '/api/graph/relationships', params={'limit': limit}
        )
        # Surface 4xx/5xx instead of parsing an error body as edge data.
        response.raise_for_status()
        data = response.json()
        return data.get('results', [])
    except Exception as e:
        print(f"Error fetching relationships: {e}")
        return []

# Build NetworkX graph
print("üî® Building NetworkX graph...\n")

# Create directed graph
G = nx.DiGraph()

# Add nodes
# Nodes are keyed by name, so rows sharing a name collapse into one node
# (the attributes of the last such row win).
if not nodes_df.empty:
    for _, row in nodes_df.iterrows():
        G.add_node(
            row['name'],
            type=row.get('type'),
            file=row.get('file'),
            language=row.get('language'),
        )
    # Report what the graph actually holds, not the number of input rows.
    print(f"‚úÖ Added {G.number_of_nodes()} nodes")

# Add edges
# Relationships whose endpoints are not both known nodes are skipped.
relationships = loop.run_until_complete(get_all_relationships())
for rel in relationships:
    source = rel.get('source_name')
    target = rel.get('target_name')
    if source in G and target in G:
        G.add_edge(source, target, relationship_type=rel.get('relationship_type'))

# The diameter is only defined for a non-empty, strongly connected graph.
# The emptiness check comes first because networkx raises on connectivity
# queries against a graph with no nodes (e.g. when the node fetch failed).
if G.number_of_nodes() == 0:
    diameter = 'N/A (empty graph)'
elif nx.is_strongly_connected(G):
    diameter = nx.diameter(G)
else:
    diameter = 'N/A (disconnected)'

print(f"‚úÖ Added {G.number_of_edges()} edges")
print("\nGraph Statistics:")
print(f"   ‚Ä¢ Nodes: {G.number_of_nodes()}")
print(f"   ‚Ä¢ Edges: {G.number_of_edges()}")
print(f"   ‚Ä¢ Density: {nx.density(G):.4f}")
print(f"   ‚Ä¢ Diameter: {diameter}")

In [None]:
def _plot_degree_histogram(ax, values, color, xlabel, title):
    """Draw one 50-bin, log-scaled degree histogram on the given axes."""
    ax.hist(values, bins=50, color=color, alpha=0.7)
    ax.set_xlabel(xlabel, fontsize=11)
    ax.set_ylabel('Number of Functions', fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_yscale('log')


print("\n" + "="*60)
print("üìà Basic Graph Analysis\n")

# Degree tables for every node of G: incoming and outgoing edge counts.
# NOTE(review): the labels below say "callers"/"calls", but G holds every
# relationship type returned by the API — confirm they are all call edges.
in_degrees = {node: degree for node, degree in G.in_degree()}
out_degrees = {node: degree for node, degree in G.out_degree()}

# Ten highest-degree nodes; sorting on the negated count is stable, so ties
# keep their original order.
top_called = sorted(in_degrees.items(), key=lambda item: -item[1])[:10]
top_callers = sorted(out_degrees.items(), key=lambda item: -item[1])[:10]

print("Top 10 Most Called Functions:")
for func, degree in top_called:
    print(f"   {func}: {degree} callers")

print("\nTop 10 Functions That Call Most Others:")
for func, degree in top_callers:
    print(f"   {func}: {degree} calls")

# Visualize degree distributions side by side: in-degree left, out-degree right.
in_degree_values = list(in_degrees.values())
out_degree_values = list(out_degrees.values())

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
_plot_degree_histogram(
    axes[0], in_degree_values, '#4F46E5',
    'In-Degree (Number of Callers)', 'In-Degree Distribution',
)
_plot_degree_histogram(
    axes[1], out_degree_values, '#EC4899',
    'Out-Degree (Number of Calls)', 'Out-Degree Distribution',
)

plt.tight_layout()
plt.show()

print(f"\n‚úÖ Analysis complete")

## Summary

This notebook demonstrated:

1. ✅ **Connection Setup** - Initialized async connections to Redis, Memgraph, and Backend API
2. ✅ **Graph Statistics** - Loaded and visualized graph metadata
3. ✅ **Cypher Queries** - Executed queries against Memgraph (entry points, hubs, call chains)
4. ✅ **NetworkX Analysis** - Built in-memory graph for local analysis
5. ✅ **Degree Analysis** - Identified hubs and leaf nodes

### Next Steps:
- **Notebook 02**: Centrality Analysis (PageRank, betweenness, closeness)
- **Notebook 03**: Community Detection (Louvain algorithm, module boundaries)
- **Notebook 04**: Architectural Patterns (seams, coupling, god functions)
- **Notebook 05**: Ontology Extraction (domain vocabulary, concept mapping)
- **Notebook 06**: C4 Diagram Generation (architecture visualization)