# Layer 2: Logical Blueprint - Relationships and Data Model Discovery

Interactive exploration of primary keys, foreign keys, relationships, cardinality patterns, and logical data model structure.

**Author:** Data Archaeologist Team  
**Version:** 2.0  
**Date:** 2025-08-28

In [None]:
# Import required libraries
import sys
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import networkx as nx
import ipywidgets as widgets
from IPython.display import display, HTML
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add project root to path.
# Path('.').parent is still Path('.') (a bare relative "." has no parent
# component), which put the notebook directory itself on sys.path. Resolving
# the working directory to an absolute path first makes .parent the directory
# one level up, i.e. the same place '../config.json' is read from.
PROJECT_ROOT = Path.cwd().resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))

from data_archaeologist.core.database_connection import DatabaseConnection
from data_archaeologist.layer2_logical.primary_key_detection import PrimaryKeyDetection
from data_archaeologist.layer2_logical.foreign_key_detection import ForeignKeyDetection
from data_archaeologist.layer2_logical.cardinality_analysis import CardinalityAnalysis

# Configure plotting
# NOTE(review): 'seaborn-v0_8' looks like the style name shipped by newer
# matplotlib releases (older ones expose it as 'seaborn') — confirm the
# installed matplotlib version provides it.
plt.style.use('seaborn-v0_8')
# Use the "husl" palette for seaborn plots drawn after this point.
sns.set_palette("husl")

# Reaching this line means every import above resolved without raising.
print("✅ All imports successful")

In [None]:
# Load configuration and initialize components
config_file = '../config.json'

# Decode the file as UTF-8 explicitly: JSON is UTF-8 by specification, whereas
# open() without an encoding falls back to the platform's locale encoding and
# can mis-read non-ASCII values (e.g. on Windows).
with open(config_file, 'r', encoding='utf-8') as f:
    config = json.load(f)

# The keys of the top-level "environments" mapping are the selectable
# environment names; "analysis_settings" is optional.
environments = list(config['environments'].keys())
analysis_settings = config.get('analysis_settings', {})

# The connection helper is handed the path of the same config file.
db_connection = DatabaseConnection(config_file)

# Initialize analysis components
pk_detector = PrimaryKeyDetection()
fk_detector = ForeignKeyDetection()
cardinality_analyzer = CardinalityAnalysis()

print(f"Available environments: {environments}")
print("Layer 2 components initialized: PK Detection, FK Detection, Cardinality Analysis")

In [None]:
# Environment selection for Layer 2 analysis

# Dropdown over the configured environments; the first one is preselected,
# or nothing at all when the configuration lists no environments.
env_dropdown = widgets.Dropdown(
    description='Environment:',
    options=environments,
    value=next(iter(environments), None),
    style=dict(description_width='initial'),
)

analysis_output = widgets.Output()

# Stack heading, selector and output area vertically.
display(
    widgets.VBox(
        children=[
            widgets.HTML("<h3>Select Environment for Logical Model Analysis</h3>"),
            env_dropdown,
            analysis_output,
        ]
    )
)

In [None]:
# Primary Key Discovery and Analysis

def analyze_primary_keys(environment):
    """Survey primary key coverage for every base table in an environment.

    Lists the base tables outside the system schemas, asks ``pk_detector``
    about each of them, shows a 2x2 Plotly dashboard (coverage pie, schema
    pie, per-schema bars, table of tables lacking a declared key) and prints
    a text summary.

    Args:
        environment: Environment name passed to
            ``db_connection.execute_query`` and to the detector.

    Returns:
        A DataFrame with one row per table (``schema``, ``table``,
        ``full_name``, ``has_declared_pk``, ``declared_pk_columns``,
        ``candidate_keys``, ``unique_columns``, ``analysis_status``), or
        ``None`` when no tables are found or the analysis fails.
    """
    print(f"🔑 Analyzing Primary Keys in {environment}...")

    try:
        # Get all tables first
        tables_query = """
        SELECT DISTINCT table_schema, table_name
        FROM information_schema.tables 
        WHERE table_schema NOT IN ('information_schema', 'pg_catalog', 'pg_toast')
        AND table_type = 'BASE TABLE'
        ORDER BY table_schema, table_name
        """

        tables = db_connection.execute_query(environment, tables_query)

        if not tables:
            print("No tables found")
            return None

        # One record per table. Each record starts out describing a failed
        # analysis and is overwritten only when detection succeeds, so a
        # table that cannot be analyzed still shows up with status 'error'.
        records = []
        for entry in tables:
            schema = entry['table_schema']
            table_name = entry['table_name']

            record = {
                'schema': schema,
                'table': table_name,
                'full_name': f"{schema}.{table_name}",
                'has_declared_pk': False,
                'declared_pk_columns': [],
                'candidate_keys': [],
                'unique_columns': [],
                'analysis_status': 'error',
            }

            try:
                pk_info = pk_detector.detect_primary_key(db_connection, environment, schema, table_name)
                record.update(
                    has_declared_pk=pk_info.get('has_declared_pk', False),
                    declared_pk_columns=pk_info.get('declared_pk_columns', []),
                    candidate_keys=pk_info.get('candidate_keys', []),
                    unique_columns=pk_info.get('unique_columns', []),
                    analysis_status=pk_info.get('status', 'analyzed'),
                )
            except Exception as e:
                print(f"Warning: Could not analyze {schema}.{table_name}: {e}")

            records.append(record)

        pk_df = pd.DataFrame(records)
        declared = pk_df['has_declared_pk']
        lacking_pk = pk_df[declared == False]

        # Dashboard layout: two pies on top, bar chart and table below.
        fig = make_subplots(
            rows=2,
            cols=2,
            subplot_titles=(
                'Primary Key Coverage',
                'Schema Distribution',
                'Key Type Analysis',
                'Tables Without Primary Keys',
            ),
            specs=[
                [{"type": "pie"}, {"type": "pie"}],
                [{"secondary_y": False}, {"type": "table"}],
            ],
        )

        # Top-left: share of tables with / without a declared key.
        coverage = declared.value_counts()
        coverage_labels = ['Has PK' if flag else 'No PK' for flag in coverage.index]
        fig.add_trace(
            go.Pie(labels=coverage_labels, values=coverage.values, name='PK Coverage'),
            row=1, col=1,
        )

        # Top-right: number of tables per schema.
        per_schema = pk_df['schema'].value_counts()
        fig.add_trace(
            go.Pie(labels=per_schema.index, values=per_schema.values, name='Schema Distribution'),
            row=1, col=2,
        )

        # Bottom-left: per-schema counts split by key presence. A series is
        # drawn only when that flag value actually occurs in the data.
        by_schema = pk_df.groupby(['schema', 'has_declared_pk']).size().unstack(fill_value=0)
        bar_series = (
            (True, 'With PK', 'green'),
            (False, 'Without PK', 'red'),
        )
        for flag, label, color in bar_series:
            if flag not in by_schema.columns:
                continue
            fig.add_trace(
                go.Bar(x=by_schema.index, y=by_schema[flag], name=label, marker_color=color),
                row=2, col=1,
            )

        # Bottom-right: first ten tables without a declared key.
        preview = lacking_pk[['full_name', 'analysis_status']].head(10)
        fig.add_trace(
            go.Table(
                header=dict(
                    values=['Table', 'Status'],
                    fill_color='paleturquoise',
                    align='left',
                ),
                cells=dict(
                    values=[preview['full_name'], preview['analysis_status']],
                    fill_color='lavender',
                    align='left',
                ),
            ),
            row=2, col=2,
        )

        fig.update_layout(
            height=800,
            title_text=f"Primary Key Analysis - {environment.title()}",
            showlegend=True,
        )
        fig.show()

        # Summary statistics
        total_tables = len(pk_df)
        tables_with_pk = len(pk_df[declared == True])
        tables_without_pk = total_tables - tables_with_pk

        print("\n📊 Primary Key Summary:")
        print(f"Total tables: {total_tables}")
        print(f"Tables with declared PK: {tables_with_pk} ({tables_with_pk/total_tables*100:.1f}%)")
        print(f"Tables without PK: {tables_without_pk} ({tables_without_pk/total_tables*100:.1f}%)")
        print(f"Schemas analyzed: {pk_df['schema'].nunique()}")

        if tables_without_pk > 0:
            print("\n🔴 Tables without primary keys:")
            for _, row in lacking_pk.head(10).iterrows():
                candidates = row['candidate_keys']
                suffix = f" (has {len(candidates)} candidate keys)" if candidates else ""
                print(f"  • {row['full_name']}{suffix}")

        return pk_df

    except Exception as e:
        print(f"❌ Error in primary key analysis: {e}")
        return None

# Primary key analysis button
pk_button = widgets.Button(
    description='Analyze Primary Keys',
    button_style='primary',
    icon='key',
)

pk_output = widgets.Output()


def on_pk_click(b):
    """Run the primary key analysis for the currently selected environment.

    Everything printed or plotted is captured in ``pk_output``, which is
    cleared first so repeated clicks do not stack results.
    """
    with pk_output:
        pk_output.clear_output()
        selected = env_dropdown.value
        if not selected:
            print("Please select an environment first")
            return
        analyze_primary_keys(selected)


pk_button.on_click(on_pk_click)

display(widgets.VBox(children=[pk_button, pk_output]))

In [None]:
# Foreign Key Discovery and Relationship Mapping

def analyze_foreign_keys(environment):
    """Discover foreign key relationships and chart them as a table graph.

    Lists the base tables outside the system schemas, asks ``fk_detector``
    for the foreign keys of each one, builds a directed graph whose edges run
    from the referencing table to the referenced table, shows a 2x2 Plotly
    dashboard and prints a summary.

    Args:
        environment: Environment name passed to
            ``db_connection.execute_query`` and to the detector.

    Returns:
        ``(fk_df, G)``: a DataFrame with one row per detected relationship
        and the ``networkx.DiGraph`` built from it. Every other exit (no
        tables, no relationships, or an unexpected error) returns
        ``(None, None)``, so callers can always unpack two values.
    """
    print(f"🔗 Analyzing Foreign Key Relationships in {environment}...")

    try:
        # Get all tables
        tables_query = """
        SELECT DISTINCT table_schema, table_name
        FROM information_schema.tables 
        WHERE table_schema NOT IN ('information_schema', 'pg_catalog', 'pg_toast')
        AND table_type = 'BASE TABLE'
        ORDER BY table_schema, table_name
        """

        tables = db_connection.execute_query(environment, tables_query)

        if not tables:
            print("No tables found")
            # Same 2-tuple shape as the success and error paths.
            return None, None

        fk_relationships = []

        # Analyze each table for foreign keys; a failure on one table is
        # reported and the remaining tables are still processed.
        for table in tables:
            schema = table['table_schema']
            table_name = table['table_name']

            try:
                fk_info = fk_detector.detect_foreign_keys(db_connection, environment, schema, table_name)

                for fk in fk_info.get('foreign_keys') or ():
                    fk_relationships.append({
                        'source_schema': schema,
                        'source_table': table_name,
                        'source_column': fk.get('column'),
                        'target_schema': fk.get('referenced_schema'),
                        'target_table': fk.get('referenced_table'),
                        'target_column': fk.get('referenced_column'),
                        'constraint_name': fk.get('constraint_name'),
                        'relationship_type': fk.get('relationship_type', 'foreign_key'),
                        'confidence': fk.get('confidence', 1.0)
                    })

            except Exception as e:
                print(f"Warning: Could not analyze FK for {schema}.{table_name}: {e}")

        if not fk_relationships:
            print("No foreign key relationships found")
            # Same 2-tuple shape as the success and error paths.
            return None, None

        # Convert to DataFrame
        fk_df = pd.DataFrame(fk_relationships)

        # Create network graph: nodes are "schema.table" names.
        # NOTE: a DiGraph holds a single edge per (source, target) pair, so
        # several foreign keys between the same two tables collapse into one
        # edge (the last one's attributes win) and count once in the degree
        # figures computed below.
        G = nx.DiGraph()

        for _, row in fk_df.iterrows():
            source_node = f"{row['source_schema']}.{row['source_table']}"
            target_node = f"{row['target_schema']}.{row['target_table']}"

            G.add_node(source_node, schema=row['source_schema'])
            G.add_node(target_node, schema=row['target_schema'])

            # Add edge (relationship)
            G.add_edge(source_node, target_node,
                       source_col=row['source_column'],
                       target_col=row['target_column'],
                       constraint=row['constraint_name'])

        # Create visualizations
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=(
                'Relationship Network',
                'Schema Connections',
                'Table Connectivity Analysis',
                'Top Connected Tables'
            ),
            specs=[[{"secondary_y": False}, {"type": "pie"}],
                   [{"secondary_y": False}, {"type": "table"}]]
        )

        # Network visualization (simplified)
        if len(G.nodes()) <= 50:  # Only for smaller networks
            pos = nx.spring_layout(G, k=3, iterations=50)

            # Extract node positions
            node_x = [pos[node][0] for node in G.nodes()]
            node_y = [pos[node][1] for node in G.nodes()]
            node_text = list(G.nodes())

            # Extract edge positions; the None entries break the line between
            # consecutive edges so they are drawn as separate segments.
            edge_x = []
            edge_y = []
            for source, target in G.edges():
                x0, y0 = pos[source]
                x1, y1 = pos[target]
                edge_x.extend([x0, x1, None])
                edge_y.extend([y0, y1, None])

            # Add edges
            fig.add_trace(
                go.Scatter(
                    x=edge_x, y=edge_y,
                    line=dict(width=0.5, color='#888'),
                    hoverinfo='none',
                    mode='lines',
                    showlegend=False
                ),
                row=1, col=1
            )

            # Add nodes
            fig.add_trace(
                go.Scatter(
                    x=node_x, y=node_y,
                    mode='markers+text',
                    text=node_text,
                    textposition="middle center",
                    hoverinfo='text',
                    marker=dict(size=10, color='lightblue'),
                    showlegend=False
                ),
                row=1, col=1
            )
        else:
            # For large networks, show summary
            fig.add_annotation(
                text=f"Network too large to display<br>{len(G.nodes())} tables, {len(G.edges())} relationships",
                xref="x", yref="y",
                x=0.5, y=0.5,
                showarrow=False,
                row=1, col=1
            )

        # Schema connections: relationship count per (source, target) schema pair
        schema_connections = fk_df.groupby(['source_schema', 'target_schema']).size().reset_index(name='count')
        if len(schema_connections) > 0:
            schema_labels = [
                f"{source} -> {target}"
                for source, target in zip(
                    schema_connections['source_schema'],
                    schema_connections['target_schema'],
                )
            ]
            fig.add_trace(
                go.Pie(
                    labels=schema_labels,
                    values=schema_connections['count'],
                    name='Schema Connections'
                ),
                row=1, col=2
            )

        # Table connectivity analysis: in-degree = times referenced,
        # out-degree = references made.
        in_degree = dict(G.in_degree())
        out_degree = dict(G.out_degree())

        connectivity_data = [
            {
                'table': node,
                'incoming': in_degree[node],
                'outgoing': out_degree[node],
                'total': in_degree[node] + out_degree[node],
            }
            for node in G.nodes()
        ]

        connectivity_df = pd.DataFrame(connectivity_data).sort_values('total', ascending=False)

        # Ten most connected tables; used by both the bar chart and the table.
        top_tables = connectivity_df.head(10)

        if len(connectivity_df) > 0:
            fig.add_trace(
                go.Bar(
                    x=top_tables['table'],
                    y=top_tables['incoming'],
                    name='Incoming',
                    marker_color='lightcoral'
                ),
                row=2, col=1
            )

            fig.add_trace(
                go.Bar(
                    x=top_tables['table'],
                    y=top_tables['outgoing'],
                    name='Outgoing',
                    marker_color='lightblue'
                ),
                row=2, col=1
            )

        # Top connected tables
        fig.add_trace(
            go.Table(
                header=dict(
                    values=['Table', 'Incoming FKs', 'Outgoing FKs', 'Total'],
                    fill_color='paleturquoise',
                    align='left'
                ),
                cells=dict(
                    values=[
                        top_tables['table'],
                        top_tables['incoming'],
                        top_tables['outgoing'],
                        top_tables['total']
                    ],
                    fill_color='lavender',
                    align='left'
                )
            ),
            row=2, col=2
        )

        fig.update_layout(
            height=800,
            title_text=f"Foreign Key Relationship Analysis - {environment.title()}",
            showlegend=True
        )

        fig.update_xaxes(tickangle=45, row=2, col=1)
        fig.show()

        # Summary statistics. The graph is non-empty here because at least
        # one relationship was recorded, so iloc[0] is safe.
        busiest = connectivity_df.iloc[0]
        print("\n📊 Foreign Key Summary:")
        print(f"Total relationships: {len(fk_df)}")
        print(f"Connected tables: {len(G.nodes())}")
        print(f"Schema connections: {len(schema_connections)}")
        print(f"Most connected table: {busiest['table']} ({busiest['total']} connections)")

        return fk_df, G

    except Exception as e:
        print(f"❌ Error in foreign key analysis: {e}")
        return None, None

# Foreign key analysis button
fk_button = widgets.Button(
    description='Analyze Foreign Keys',
    button_style='success',
    icon='link',
)

fk_output = widgets.Output()


def on_fk_click(b):
    """Run the foreign key analysis for the currently selected environment.

    Everything printed or plotted is captured in ``fk_output``, which is
    cleared first so repeated clicks do not stack results.
    """
    with fk_output:
        fk_output.clear_output()
        selected = env_dropdown.value
        if not selected:
            print("Please select an environment first")
            return
        analyze_foreign_keys(selected)


fk_button.on_click(on_fk_click)

display(widgets.VBox(children=[fk_button, fk_output]))

In [None]:
# Cardinality Pattern Analysis

def analyze_cardinality_patterns(environment, sample_size=10000):
    """Profile column cardinality for a sample of tables in an environment.

    Takes the first 20 base tables (ordered by schema and name) outside the
    system schemas, asks ``cardinality_analyzer`` for per-column statistics,
    shows a 2x2 Plotly dashboard (ratio histogram, category pie, high/low
    bars per table, key-candidate table) and prints a text summary.

    Args:
        environment: Environment name passed to
            ``db_connection.execute_query`` and to the analyzer.
        sample_size: Forwarded unchanged to
            ``cardinality_analyzer.analyze_cardinality``.

    Returns:
        A DataFrame with one row per analyzed column, or ``None`` when no
        tables or no column statistics are found or the analysis fails.
    """
    print(f"🔢 Analyzing Cardinality Patterns in {environment}...")

    try:
        # Get sample of tables for cardinality analysis
        tables_query = """
        SELECT table_schema, table_name
        FROM information_schema.tables 
        WHERE table_schema NOT IN ('information_schema', 'pg_catalog', 'pg_toast')
        AND table_type = 'BASE TABLE'
        ORDER BY table_schema, table_name
        LIMIT 20
        """

        tables = db_connection.execute_query(environment, tables_query)

        if not tables:
            print("No tables found")
            return None

        # One row per column. Rows are appended one at a time so that columns
        # collected before a failure inside a table are kept.
        column_rows = []
        for entry in tables:
            schema = entry['table_schema']
            table_name = entry['table_name']

            try:
                cardinality_info = cardinality_analyzer.analyze_cardinality(
                    db_connection, environment, schema, table_name, sample_size
                )

                for stat in cardinality_info.get('column_stats') or ():
                    column_rows.append({
                        'schema': schema,
                        'table': table_name,
                        'full_table': f"{schema}.{table_name}",
                        'column': stat['column_name'],
                        'distinct_count': stat.get('distinct_count', 0),
                        'total_count': stat.get('total_count', 0),
                        'cardinality_ratio': stat.get('cardinality_ratio', 0),
                        'cardinality_category': stat.get('cardinality_category', 'unknown'),
                        'null_percentage': stat.get('null_percentage', 0),
                    })

            except Exception as e:
                print(f"Warning: Could not analyze cardinality for {schema}.{table_name}: {e}")

        if not column_rows:
            print("No cardinality data found")
            return None

        card_df = pd.DataFrame(column_rows)
        categories = card_df['cardinality_category']

        # Dashboard layout: histogram and pie on top, bars and table below.
        fig = make_subplots(
            rows=2,
            cols=2,
            subplot_titles=(
                'Cardinality Distribution',
                'Cardinality Categories',
                'High vs Low Cardinality by Table',
                'Potential Key Candidates',
            ),
            specs=[
                [{"secondary_y": False}, {"type": "pie"}],
                [{"secondary_y": False}, {"type": "table"}],
            ],
        )

        # Top-left: distribution of the cardinality ratio over all columns.
        fig.add_trace(
            go.Histogram(
                x=card_df['cardinality_ratio'],
                nbinsx=20,
                name='Cardinality Ratio',
                marker_color='skyblue',
            ),
            row=1, col=1,
        )

        # Top-right: number of columns per cardinality category.
        category_counts = categories.value_counts()
        fig.add_trace(
            go.Pie(
                labels=category_counts.index,
                values=category_counts.values,
                name='Categories',
            ),
            row=1, col=2,
        )

        # Bottom-left: per-table column counts for the 'high' and 'low'
        # categories. A series is drawn only when the category occurs.
        per_table = card_df.groupby(['full_table', 'cardinality_category']).size().unstack(fill_value=0)
        bar_series = (
            ('high', 'High Cardinality', 'orange'),
            ('low', 'Low Cardinality', 'lightgreen'),
        )
        for category, label, color in bar_series:
            if category not in per_table.columns:
                continue
            fig.add_trace(
                go.Bar(x=per_table.index, y=per_table[category], name=label, marker_color=color),
                row=2, col=1,
            )

        # Potential key candidates: ratio above 0.9 with more than 100
        # distinct values, ten highest ratios first.
        is_candidate = (card_df['cardinality_ratio'] > 0.9) & (card_df['distinct_count'] > 100)
        key_candidates = (
            card_df[is_candidate]
            .sort_values('cardinality_ratio', ascending=False)
            .head(10)
        )

        candidate_names = [
            f"{full_table}.{column}"
            for full_table, column in zip(key_candidates['full_table'], key_candidates['column'])
        ]
        candidate_ratios = [f"{ratio:.3f}" for ratio in key_candidates['cardinality_ratio']]

        # Bottom-right: the key candidates as a table.
        fig.add_trace(
            go.Table(
                header=dict(
                    values=['Table.Column', 'Distinct Count', 'Cardinality Ratio'],
                    fill_color='paleturquoise',
                    align='left',
                ),
                cells=dict(
                    values=[candidate_names, key_candidates['distinct_count'], candidate_ratios],
                    fill_color='lavender',
                    align='left',
                ),
            ),
            row=2, col=2,
        )

        fig.update_layout(
            height=800,
            title_text=f"Cardinality Pattern Analysis - {environment.title()}",
            showlegend=True,
        )

        fig.update_xaxes(tickangle=45, row=2, col=1)
        fig.show()

        # Summary statistics
        total_columns = len(card_df)
        high_card_cols = len(card_df[categories == 'high'])
        low_card_cols = len(card_df[categories == 'low'])
        potential_keys = len(key_candidates)

        print("\n📊 Cardinality Summary:")
        print(f"Total columns analyzed: {total_columns}")
        print(f"High cardinality columns: {high_card_cols} ({high_card_cols/total_columns*100:.1f}%)")
        print(f"Low cardinality columns: {low_card_cols} ({low_card_cols/total_columns*100:.1f}%)")
        print(f"Potential key candidates: {potential_keys}")

        if potential_keys > 0:
            print("\n🔑 Top key candidates:")
            top_five = key_candidates.head(5)
            for full_table, column, ratio in zip(
                top_five['full_table'], top_five['column'], top_five['cardinality_ratio']
            ):
                print(f"  • {full_table}.{column} (ratio: {ratio:.3f})")

        return card_df

    except Exception as e:
        print(f"❌ Error in cardinality analysis: {e}")
        return None

# Cardinality analysis button
card_button = widgets.Button(
    description='Analyze Cardinality Patterns',
    button_style='warning',
    icon='sort-numeric-up',
)

card_output = widgets.Output()


def on_card_click(b):
    """Run the cardinality analysis for the currently selected environment.

    Everything printed or plotted is captured in ``card_output``, which is
    cleared first so repeated clicks do not stack results.
    """
    with card_output:
        card_output.clear_output()
        selected = env_dropdown.value
        if not selected:
            print("Please select an environment first")
            return
        analyze_cardinality_patterns(selected)


card_button.on_click(on_card_click)

display(widgets.VBox(children=[card_button, card_output]))

## Layer 2 Analysis Complete ✅

This notebook provided logical-level analysis including:

- **Primary Key Discovery** - Identified declared and candidate primary keys
- **Foreign Key Mapping** - Discovered relationships and built network graphs
- **Cardinality Analysis** - Analyzed data distribution patterns and key candidates
- **Relationship Visualization** - Created network diagrams and connectivity analysis

### Key Insights:
- Tables without primary keys need attention for data integrity
- Foreign key relationships reveal the logical data model structure
- High-cardinality columns are potential unique identifiers
- Network connectivity shows central vs peripheral tables

### Data Quality Recommendations:
- Add primary keys to tables that lack them
- Verify high-cardinality columns as potential keys
- Document discovered relationships for better understanding
- Consider indexing strategies based on connectivity patterns

### Next Steps:
- **03_layer3_business_story.ipynb** - Discover business processes and domain insights
- **04_multi_env_parallel_run.ipynb** - Compare logical models across environments