## Section 1: Setup & Load Graph

In [None]:
import asyncio
import json
import os
import re
import sys
import threading
from collections import defaultdict, Counter
from typing import Dict, List, Set, Tuple

import httpx
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import redis.asyncio as redis
from neo4j import GraphDatabase

# Apply matplotlib's 'dark_background' style to every figure drawn below.
plt.style.use('dark_background')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

print("‚úÖ Libraries loaded")

In [None]:
# Service endpoints. Each one is taken from the environment when the variable
# is set and otherwise falls back to the default hostname given here.
REDIS_URL = os.environ.get('REDIS_URL', 'redis://redis:6379')
MEMGRAPH_URL = os.environ.get('MEMGRAPH_URL', 'bolt://memgraph:7687')
BACKEND_API_URL = os.environ.get('BACKEND_API_URL', 'http://code-graph-http:8000')

class GraphConnections:
    """Client handles for Redis, Memgraph and the backend HTTP API.

    ``redis_client`` and ``memgraph_driver`` stay ``None`` unless
    :meth:`connect` verified the corresponding service, so callers can test
    them against ``None`` to find out whether a backend is usable.
    """

    def __init__(self, redis_url: str, memgraph_url: str, api_url: str):
        """Store the endpoints; no network traffic happens until ``connect``."""
        self.redis_url = redis_url
        self.memgraph_url = memgraph_url
        self.api_url = api_url
        self.redis_client = None
        self.memgraph_driver = None
        self.http_client = None

    async def connect(self):
        """Probe Redis and Memgraph, then create the HTTP client.

        Redis and Memgraph are best-effort: a failure is reported on stdout
        and the matching attribute is left as ``None`` instead of raising.
        The HTTP client is always created; it is not probed here.
        """
        try:
            client = await redis.from_url(self.redis_url, decode_responses=True)
            await client.ping()
            # Publish the client only after the ping proved it works.
            self.redis_client = client
            print("‚úÖ Redis connected")
        except Exception as exc:
            self.redis_client = None
            print(f"‚ö†Ô∏è  Redis unavailable: {exc}")

        driver = None
        try:
            driver = GraphDatabase.driver(self.memgraph_url)
            with driver.session() as session:
                session.run("RETURN 1").consume()
            self.memgraph_driver = driver
            print("‚úÖ Memgraph connected")
        except Exception as exc:
            # `except Exception` (not a bare except) lets KeyboardInterrupt and
            # task cancellation propagate.
            print(f"‚ö†Ô∏è  Memgraph unavailable: {exc}")
            if driver is not None:
                # Release the driver created for the failed probe.
                driver.close()
            self.memgraph_driver = None

        self.http_client = httpx.AsyncClient(base_url=self.api_url)
        print("‚úÖ HTTP client ready")

class _BackgroundLoop:
    """Asyncio event loop hosted in a daemon thread with a blocking front end.

    A Jupyter kernel already runs an asyncio loop in its main thread, and
    asyncio refuses to start a second loop via ``run_until_complete`` while
    another one is running in the same thread. Hosting our loop in its own
    thread avoids that error (and avoids replacing the kernel thread's current
    loop) while keeping the ``loop.run_until_complete(coro)`` call shape.
    """

    def __init__(self):
        self._loop = asyncio.new_event_loop()
        self._thread = threading.Thread(
            target=self._loop.run_forever,
            name="notebook-async-loop",
            daemon=True,  # do not keep the kernel process alive on shutdown
        )
        self._thread.start()

    def run_until_complete(self, coro):
        """Run ``coro`` on the background loop; block until it returns or raises."""
        return asyncio.run_coroutine_threadsafe(coro, self._loop).result()


connections = GraphConnections(REDIS_URL, MEMGRAPH_URL, BACKEND_API_URL)
# All coroutines in this notebook run on this one loop, so the async clients
# created by connect() are always used from the loop they were created on.
loop = _BackgroundLoop()
loop.run_until_complete(connections.connect())

print("\n‚úÖ Connected")

In [None]:
# Load graph data
async def get_all_nodes():
    """Fetch up to 1000 graph nodes from the backend API as a DataFrame.

    Returns:
        A DataFrame built from the ``results`` list of the JSON response, or
        an empty DataFrame when the request fails. Failures are reported on
        stdout rather than raised so the notebook keeps running.
    """
    try:
        response = await connections.http_client.get('/api/graph/nodes/search?limit=1000')
        # Turn 4xx/5xx replies into errors so an error payload is not mistaken
        # for "the graph has no nodes".
        response.raise_for_status()
        return pd.DataFrame(response.json().get('results', []))
    except Exception as e:
        print(f"‚ö†Ô∏è  Could not load nodes: {e}")
        return pd.DataFrame()

async def get_all_relationships():
    """Fetch up to 5000 graph relationships from the backend API.

    Returns:
        The ``results`` list of the JSON response, or an empty list when the
        request fails. Failures are reported on stdout rather than raised so
        the notebook keeps running.
    """
    try:
        response = await connections.http_client.get('/api/graph/relationships?limit=5000')
        # Turn 4xx/5xx replies into errors so an error payload is not mistaken
        # for "the graph has no relationships".
        response.raise_for_status()
        return response.json().get('results', [])
    except Exception as e:
        print(f"‚ö†Ô∏è  Could not load relationships: {e}")
        return []

# Drive both fetches to completion on `loop`. Each helper returns an empty
# result instead of raising when its request fails, so the counts printed
# below can legitimately be 0.
nodes_df = loop.run_until_complete(get_all_nodes())
relationships = loop.run_until_complete(get_all_relationships())

print(f"‚úÖ Loaded {len(nodes_df)} nodes, {len(relationships)} relationships")

## Section 2: Extract Entity Types

Identify different types of entities in the codebase.

In [None]:
# Analyze entity types
# A failed or empty load yields a frame without a 'type' column; fall back to
# an empty distribution so this cell (and later cells that read
# `entity_types`) do not fail with KeyError.
if 'type' in nodes_df.columns:
    entity_types = nodes_df['type'].value_counts()
else:
    entity_types = pd.Series(dtype='int64')

if entity_types.empty:
    print("No entity type data available")
else:
    print("üìä Entity Type Distribution:\n")
    print(entity_types.to_string())

    # Visualize
    fig, ax = plt.subplots(figsize=(12, 6))
    entity_types.plot(kind='barh', ax=ax, color='#4F46E5')
    ax.set_xlabel('Count', fontsize=12)
    ax.set_ylabel('Entity Type', fontsize=12)
    ax.set_title('Code Entity Type Distribution', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

## Section 3: Extract Domain Vocabulary

Identify key naming patterns and domain concepts.

In [None]:
# Extract terms from names using camelCase/snake_case splitting
def split_name(name: str) -> List[str]:
    """Split a camelCase / snake_case identifier into lowercase terms.

    The name is split on underscores, then at camelCase boundaries. Runs of
    capitals are kept together as one term, so ``HTTPServer`` yields
    ``['http', 'server']`` rather than losing the acronym letter by letter.

    Args:
        name: Identifier to tokenize. Non-string values (e.g. ``None`` or NaN
            from a DataFrame column) produce an empty list.

    Returns:
        The terms in order of appearance, lowercased, keeping only those
        longer than two characters.
    """
    if not isinstance(name, str):
        return []

    terms = []
    # First split on underscore
    for part in name.split('_'):
        # Insert a space before a capital that follows a non-capital
        # ("userName" -> "user Name") and before the last capital of an
        # acronym run that starts a new word ("HTTPServer" -> "HTTP Server").
        spaced = re.sub(r'(?<=[^A-Z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', ' ', part)
        terms.extend(t.lower() for t in spaced.split())

    return [t for t in terms if len(t) > 2]  # Filter short terms

# Extract all terms
# A failed or empty load has no 'name' column, and individual names may be
# null; handle both instead of raising so `term_frequency` is always defined.
if 'name' in nodes_df.columns:
    node_names = nodes_df['name'].dropna()
else:
    node_names = []
all_terms = [term for name in node_names for term in split_name(str(name))]

# Count term frequency
term_frequency = Counter(all_terms)

print("üéØ Top 30 Domain Concepts (Most Frequent Terms):\n")
for term, count in term_frequency.most_common(30):
    print(f"  {term:20s}: {count:3d} occurrences")

In [None]:
# Horizontal bar chart of the 15 most frequent terms, most frequent on top.
top_terms = dict(term_frequency.most_common(15))

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(y=list(top_terms), width=list(top_terms.values()), color='#8B5CF6')
# barh draws the first entry at the bottom; flip so rank 1 is at the top.
ax.invert_yaxis()
ax.set_title('Top Domain Concepts in Codebase', fontsize=14, fontweight='bold')
ax.set_ylabel('Domain Concept', fontsize=12)
ax.set_xlabel('Frequency', fontsize=12)
plt.tight_layout()
plt.show()

## Section 4: Entity by Language

Analyze entity distribution across programming languages.

In [None]:
# Language distribution
# A failed or empty load has no 'language' / 'type' columns; fall back to an
# empty distribution so this cell does not raise KeyError and later cells that
# read `language_dist` still work.
if 'language' in nodes_df.columns:
    language_dist = nodes_df['language'].value_counts()
else:
    language_dist = pd.Series(dtype='int64')

if language_dist.empty or 'type' not in nodes_df.columns:
    print("No language/type data available")
else:
    print("üåç Language Distribution:\n")
    print(language_dist.to_string())

    # Entity type by language
    print("\nüìä Entity Types by Language:\n")
    cross_tab = pd.crosstab(nodes_df['language'], nodes_df['type'])
    print(cross_tab.to_string())

    # Visualize
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    language_dist.plot(kind='bar', ax=ax1, color='#EC4899')
    ax1.set_title('Entities by Language', fontsize=12, fontweight='bold')
    ax1.set_ylabel('Count', fontsize=11)
    ax1.set_xlabel('Language', fontsize=11)

    cross_tab.plot(kind='bar', ax=ax2, stacked=True)
    ax2.set_title('Entity Types by Language', fontsize=12, fontweight='bold')
    ax2.set_ylabel('Count', fontsize=11)
    ax2.set_xlabel('Language', fontsize=11)
    ax2.legend(title='Entity Type', fontsize=9)

    plt.tight_layout()
    plt.show()

## Section 5: Relationship Patterns

Analyze types of relationships in the codebase.

In [None]:
# Tabulate relationships and chart how often each relationship type occurs.
rel_df = pd.DataFrame(relationships)

if rel_df.empty or 'relationship_type' not in rel_df.columns:
    # Nothing to count: no relationships loaded, or they carry no type field.
    print("No relationship type data available")
else:
    rel_types = rel_df['relationship_type'].value_counts()

    print("üîó Relationship Type Distribution:\n")
    print(rel_types.to_string())

    # Pie chart with one Set3 colour per relationship type.
    fig, ax = plt.subplots(figsize=(10, 6))
    rel_types.plot(
        kind='pie',
        ax=ax,
        autopct='%1.1f%%',
        colors=plt.cm.Set3(range(len(rel_types))),
    )
    ax.set_title('Relationship Types in Code Graph', fontsize=14, fontweight='bold')
    ax.set_ylabel('')
    plt.tight_layout()
    plt.show()

## Section 6: Semantic Ontology Export

Generate an RDF-like semantic representation of the code graph.

In [None]:
# Create ontology representation
def _json_null_if_missing(value):
    """Return None for None/NaN so json.dumps emits null, not the non-JSON token NaN."""
    if value is None or (isinstance(value, float) and value != value):
        return None
    return value


# Tokenize every node name once; the entity and concept sections below both
# reuse the result instead of calling split_name again per concept.
# An empty load has no 'name' column, in which case there is nothing to add.
node_names = list(nodes_df['name']) if 'name' in nodes_df.columns else []
name_terms = [(name, split_name(name)) for name in node_names]

ontology = {
    "entities": {},
    "relationships": [],
    "concepts": {},
    "metadata": {
        "total_entities": len(nodes_df),
        "total_relationships": len(relationships),
        "languages": list(language_dist.index),
        "entity_types": list(entity_types.index),
        "top_concepts": dict(term_frequency.most_common(20))
    }
}

# Add entities
# NOTE(review): entities are keyed by name, so nodes sharing a name overwrite
# each other and len(ontology['entities']) can be below total_entities.
for row, (entity_id, terms) in zip(nodes_df.to_dict('records'), name_terms):
    ontology["entities"][entity_id] = {
        "type": _json_null_if_missing(row['type']),
        "language": _json_null_if_missing(row.get('language')),
        "file": _json_null_if_missing(row.get('file')),
        "terms": terms
    }

# Add relationships
ontology["relationships"].extend(
    {
        "source": rel.get('source_name'),
        "target": rel.get('target_name'),
        "type": rel.get('relationship_type')
    }
    for rel in relationships
)

# Add concept hierarchy
for concept, count in term_frequency.most_common(50):
    ontology["concepts"][concept] = {
        "frequency": count,
        "entities": [name for name, terms in name_terms if concept in terms]
    }

print("‚úÖ Ontology Generated")
print(f"   Entities: {len(ontology['entities'])}")
print(f"   Relationships: {len(ontology['relationships'])}")
print(f"   Concepts: {len(ontology['concepts'])}")

# Export as JSON
ontology_json = json.dumps(ontology, indent=2)
print(f"\nüìÑ Ontology size: {len(ontology_json)} bytes")

## Summary

This notebook extracted domain ontology from the code graph:

1. ✅ **Entity Types** - Classes, functions, variables, modules
2. ✅ **Domain Vocabulary** - Key naming patterns and concepts
3. ✅ **Language Analysis** - Multi-language entity distribution
4. ✅ **Relationships** - Connection types and patterns
5. ✅ **Semantic Ontology** - RDF-like knowledge graph

### Use Cases:
- Auto-generate API documentation
- Semantic code search
- Domain model extraction
- Knowledge management

### Next Step:
- **Notebook 06**: C4 Diagram Generation (visualize architecture)