# Layer 3: Business Story - Domain Knowledge and Process Discovery

Interactive exploration of business processes, domain patterns, naming conventions, and actionable insights from the data architecture.

**Author:** Data Archaeologist Team  
**Version:** 2.0  
**Date:** 2025-08-28

In [None]:
# Import required libraries
import sys
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import networkx as nx
import ipywidgets as widgets
from IPython.display import display, HTML
from pathlib import Path
from collections import Counter, defaultdict
import re
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Add project root to path.
# Path('.').parent evaluates to Path('.') itself, so a relative '.' can never
# reach the parent directory. Start from the absolute working directory so
# that .parent really is one level up (the same place '../config.json' points).
PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

from data_archaeologist.core.database_connection import DatabaseConnection
from data_archaeologist.layer3_business.business_inference import BusinessInference

# Configure plotting: select the matplotlib style sheet and the seaborn
# default colour palette used by any static charts in this notebook.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a
# matplotlib release that ships the renamed seaborn styles — confirm against
# the pinned matplotlib version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Reached only if every import and the style setup above succeeded.
print("✅ All imports successful")

In [None]:
# Load configuration and initialize components.
# The path is relative to the notebook's working directory.
config_file = '../config.json'

# Pin the encoding instead of relying on the platform default, so the same
# config file parses identically on every operating system.
with open(config_file, 'r', encoding='utf-8') as f:
    config = json.load(f)

# Environment names are the keys of the "environments" mapping, in file order.
# A config without that key raises KeyError here, which is intentional: no
# analysis cell below can run without at least the environment definitions.
environments = list(config['environments'])
analysis_settings = config.get('analysis_settings', {})

# DatabaseConnection receives the config path (not the parsed dict).
db_connection = DatabaseConnection(config_file)
business_inference = BusinessInference()

print(f"Available environments: {environments}")
print("Business inference engine initialized")

In [None]:
# Environment picker used by the Layer 3 analysis cells.

env_dropdown = widgets.Dropdown(
    options=environments,
    # First configured environment, or None when the config lists none.
    value=(environments or [None])[0],
    description='Environment:',
    style=dict(description_width='initial'),
)

analysis_output = widgets.Output()

# Stack heading, dropdown and output area vertically and render them.
display(
    widgets.VBox(
        children=[
            widgets.HTML("<h3>Select Environment for Business Story Analysis</h3>"),
            env_dropdown,
            analysis_output,
        ]
    )
)

In [None]:
# Naming Convention and Domain Pattern Analysis

# Keyword lists used to attribute tables to a business domain.
_BUSINESS_DOMAINS = {
    'user_management': ['user', 'account', 'profile', 'auth', 'login', 'password'],
    'financial': ['payment', 'invoice', 'transaction', 'billing', 'price', 'cost', 'amount'],
    'inventory': ['product', 'item', 'stock', 'inventory', 'warehouse', 'sku'],
    'sales': ['order', 'sale', 'customer', 'purchase', 'cart', 'checkout'],
    'content': ['post', 'article', 'content', 'media', 'file', 'document'],
    'analytics': ['log', 'event', 'metric', 'stat', 'report', 'analytics'],
    'system': ['config', 'setting', 'system', 'admin', 'migration', 'schema'],
}

# Generic words that say nothing about the business domain.
_TECHNICAL_WORDS = frozenset(
    {'id', 'name', 'type', 'date', 'time', 'created', 'updated', 'deleted', 'status'}
)

# Splits camelCase/PascalCase while keeping acronym runs together:
# "HTTPServer" -> ["HTTP", "Server"], "userID" -> ["user", "ID"].
_CAMEL_WORD_RE = re.compile(r'[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+')


def _split_identifier(name):
    """Return ``(convention, words)`` for one table or column identifier.

    ``convention`` is 'underscore', 'camelCase' or 'single_word'.
    """
    if '_' in name:
        # Drop empty fragments from leading/trailing/double underscores so
        # they are not counted as a '' prefix or suffix.
        return 'underscore', [part for part in name.split('_') if part]
    # camelCase needs an uppercase letter after the first character AND some
    # lowercase letter; an all-caps name such as "USERS" is a single word and
    # must not be split into individual letters.
    if any(c.isupper() for c in name[1:]) and any(c.islower() for c in name):
        return 'camelCase', _CAMEL_WORD_RE.findall(name)
    return 'single_word', [name]


def _extract_naming_patterns(names):
    """Count prefixes, suffixes, common words and separator styles in *names*."""
    patterns = {
        'prefixes': Counter(),
        'suffixes': Counter(),
        'common_words': Counter(),
        'separators': Counter(),
    }

    for name in names:
        convention, words = _split_identifier(name)
        patterns['separators'][convention] += 1

        lowered = [word.lower() for word in words]
        if lowered:
            patterns['prefixes'][lowered[0]] += 1
        if len(lowered) > 1:
            patterns['suffixes'][lowered[-1]] += 1

        # Only words longer than two characters are considered meaningful.
        patterns['common_words'].update(word for word in lowered if len(word) > 2)

    return patterns


def _score_business_domains(df, business_domains):
    """Score every ``schema.table`` against each domain's keyword list.

    Each DataFrame row (one column of one table) contributes one point per
    keyword found in "<table_name> <column_name>". Because the table name is
    part of every row's text, a keyword in the table name scores once per
    column of that table.

    NOTE(review): keywords are matched as substrings, so e.g. 'stat' also
    matches a 'status' column and 'file' matches 'profile'. Scores are a
    heuristic, not an exact classification.
    """
    domain_scores = defaultdict(lambda: defaultdict(int))

    rows = zip(df['table_schema'], df['table_name'], df['column_name'])
    for schema, table, column in rows:
        text_to_analyze = f"{table} {column}".lower()
        for domain, keywords in business_domains.items():
            hits = sum(1 for keyword in keywords if keyword in text_to_analyze)
            if hits:
                domain_scores[f"{schema}.{table}"][domain] += hits

    return domain_scores


def analyze_naming_conventions(environment):
    """Analyze naming patterns to infer business domains.

    Queries ``information_schema`` for every base table and column of
    *environment*, derives naming statistics, scores tables against the
    business-domain keyword lists, renders a 2x2 plotly figure and prints a
    textual summary.

    Args:
        environment: Name of the configured environment to query.

    Returns:
        A dict with keys 'table_patterns', 'column_patterns', 'domain_scores'
        and 'business_terms', or None when no schema rows were found or any
        error occurred (the error is printed, not raised).
    """
    print(f"📝 Analyzing Naming Conventions and Domain Patterns in {environment}...")

    try:
        # Get all tables and columns
        schema_query = """
        SELECT 
            t.table_schema,
            t.table_name,
            c.column_name,
            c.data_type,
            c.is_nullable
        FROM information_schema.tables t
        JOIN information_schema.columns c ON t.table_name = c.table_name 
            AND t.table_schema = c.table_schema
        WHERE t.table_schema NOT IN ('information_schema', 'pg_catalog', 'pg_toast')
        AND t.table_type = 'BASE TABLE'
        ORDER BY t.table_schema, t.table_name, c.ordinal_position
        """

        results = db_connection.execute_query(environment, schema_query)

        if not results:
            print("No schema information found")
            return None

        df = pd.DataFrame(results)

        # Naming patterns are computed over distinct names ...
        table_names = df['table_name'].unique()
        column_names = df['column_name'].unique()
        table_patterns = _extract_naming_patterns(table_names)
        column_patterns = _extract_naming_patterns(column_names)

        # ... but the reported totals must count actual objects: same-named
        # tables in different schemas are different tables, and every row of
        # df is exactly one column of one table.
        table_count = len(df[['table_schema', 'table_name']].drop_duplicates())
        column_count = len(df)

        domain_scores = _score_business_domains(df, _BUSINESS_DOMAINS)

        # Create visualizations
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=(
                'Common Table Name Patterns',
                'Business Domain Distribution',
                'Naming Convention Analysis',
                'Top Business Terms'
            ),
            specs=[[{"secondary_y": False}, {"type": "pie"}],
                   [{"type": "pie"}, {"type": "table"}]]
        )

        # Common table prefixes
        top_prefixes = dict(table_patterns['prefixes'].most_common(10))
        fig.add_trace(
            go.Bar(
                x=list(top_prefixes.keys()),
                y=list(top_prefixes.values()),
                name='Table Prefixes',
                marker_color='lightblue'
            ),
            row=1, col=1
        )

        # Business domain distribution (sum of per-table scores)
        domain_totals = Counter()
        for table_scores in domain_scores.values():
            domain_totals.update(table_scores)

        # The pie is skipped when no keyword matched anywhere.
        if domain_totals:
            fig.add_trace(
                go.Pie(
                    labels=list(domain_totals.keys()),
                    values=list(domain_totals.values()),
                    name='Business Domains'
                ),
                row=1, col=2
            )

        # Naming convention analysis
        separator_counts = dict(table_patterns['separators'])
        fig.add_trace(
            go.Pie(
                labels=list(separator_counts.keys()),
                values=list(separator_counts.values()),
                name='Naming Conventions'
            ),
            row=2, col=1
        )

        # Top business terms across table and column names
        all_words = Counter()
        all_words.update(table_patterns['common_words'])
        all_words.update(column_patterns['common_words'])

        # Filter out common technical words and very short words
        business_words = Counter({
            word: count for word, count in all_words.items()
            if word not in _TECHNICAL_WORDS and len(word) > 3
        })
        top_business_words = dict(business_words.most_common(10))

        fig.add_trace(
            go.Table(
                header=dict(
                    values=['Business Term', 'Frequency'],
                    fill_color='paleturquoise',
                    align='left'
                ),
                cells=dict(
                    values=[
                        list(top_business_words.keys()),
                        list(top_business_words.values())
                    ],
                    fill_color='lavender',
                    align='left'
                )
            ),
            row=2, col=2
        )

        fig.update_layout(
            height=800,
            title_text=f"Naming Convention & Domain Analysis - {environment.title()}",
            showlegend=False
        )

        fig.show()

        # Business domain analysis summary
        print("\n📊 Business Domain Analysis:")
        print(f"Total tables: {table_count}")
        print(f"Total columns: {column_count}")
        print(f"Identified domains: {sum(1 for score in domain_totals.values() if score > 0)}")

        if domain_totals:
            print("\n🏢 Top business domains:")
            grand_total = sum(domain_totals.values())
            ranked = sorted(domain_totals.items(), key=lambda x: x[1], reverse=True)
            for domain, score in ranked[:5]:
                percentage = (score / grand_total) * 100
                print(f"  • {domain.replace('_', ' ').title()}: {score} occurrences ({percentage:.1f}%)")

        # Tables by domain: the primary domain is the highest-scoring one.
        print("\n📋 Tables by primary domain:")
        for table, scores in sorted(domain_scores.items()):
            if scores:
                primary_domain = max(scores, key=scores.get)
                print(f"  • {table} → {primary_domain.replace('_', ' ').title()} (score: {scores[primary_domain]})")

        return {
            'table_patterns': table_patterns,
            'column_patterns': column_patterns,
            'domain_scores': domain_scores,
            'business_terms': top_business_words
        }

    except Exception as e:
        # Notebook boundary: report the failure in the cell output instead of
        # raising into the widget callback.
        print(f"❌ Error in naming convention analysis: {e}")
        return None

# Button that triggers the naming-convention analysis
naming_button = widgets.Button(
    description='Analyze Naming Conventions',
    button_style='primary',
    icon='tag',
)

naming_output = widgets.Output()


def on_naming_click(b):
    """Run the naming-convention analysis for the selected environment.

    Everything printed or displayed goes into ``naming_output``, which is
    cleared first. ``b`` is the argument supplied by ``on_click``; it is unused.
    """
    with naming_output:
        naming_output.clear_output()
        selected_env = env_dropdown.value
        if not selected_env:
            print("Please select an environment first")
            return
        analyze_naming_conventions(selected_env)


naming_button.on_click(on_naming_click)

display(widgets.VBox(children=[naming_button, naming_output]))

In [None]:
# Data Quality and Business Process Insights

# Column-name fragments that mark a column as temporal.
# NOTE(review): these are substring matches, so 'date' also matches names
# such as 'updated' or 'candidate', and 'time' matches 'timeout'.
_TIMESTAMP_NAME_HINTS = ('date', 'time', 'created', 'updated', 'deleted', 'modified')

# Type names that are temporal when matched exactly.
_TEMPORAL_EXACT_TYPES = frozenset({'date', 'datetime'})


def _quote_identifier(identifier):
    """Return *identifier* double-quoted for SQL, doubling embedded quotes."""
    return '"' + str(identifier).replace('"', '""') + '"'


def _is_timestamp_column(column_name, data_type):
    """Tell whether a column looks temporal by its type or by its name."""
    type_name = (data_type or '').lower()
    # information_schema reports the long type names ('timestamp without time
    # zone', 'timestamp with time zone'), so match on the prefix rather than
    # on the short aliases 'timestamp' / 'timestamptz' only.
    if type_name.startswith('timestamp') or type_name in _TEMPORAL_EXACT_TYPES:
        return True
    lowered_name = column_name.lower()
    return any(hint in lowered_name for hint in _TIMESTAMP_NAME_HINTS)


def _infer_process_type(timestamp_cols):
    """Map a table's temporal column names to a business process label."""
    lowered = [col.lower() for col in timestamp_cols]

    def mentions(word):
        return any(word in col for col in lowered)

    if mentions('created'):
        return 'crud_operations' if mentions('updated') else 'append_only'
    if mentions('deleted'):
        return 'soft_delete'
    if lowered:
        return 'temporal_tracking'
    return 'unknown'


def analyze_data_quality_insights(environment, max_tables=20):
    """Analyze data quality patterns to infer business processes.

    For up to *max_tables* base tables (ordered by schema and name) this
    collects the temporal columns, infers a process type from them, counts
    rows and computes the share of nullable columns. It then renders a 2x2
    plotly figure and prints a summary with recommendations.

    Args:
        environment: Name of the configured environment to query.
        max_tables: Maximum number of tables to sample. Defaults to 20, the
            previously hard-coded limit.

    Returns:
        A dict with keys 'quality_insights' (DataFrame), 'timestamp_patterns'
        (DataFrame) and 'business_recommendations' (dict), or None when no
        tables were found, nothing could be analyzed, or an error occurred
        (the error is printed, not raised).
    """
    print(f"🔍 Analyzing Data Quality Insights in {environment}...")

    try:
        # Get sample of tables for quality analysis
        tables_query = """
        SELECT table_schema, table_name
        FROM information_schema.tables 
        WHERE table_schema NOT IN ('information_schema', 'pg_catalog', 'pg_toast')
        AND table_type = 'BASE TABLE'
        ORDER BY table_schema, table_name
        LIMIT %s
        """

        tables = db_connection.execute_query(environment, tables_query, (int(max_tables),))

        if not tables:
            print("No tables found")
            return None

        quality_insights = []
        timestamp_patterns = []

        # Column metadata lookup, reused for every table
        columns_query = """
                SELECT column_name, data_type, is_nullable
                FROM information_schema.columns
                WHERE table_schema = %s AND table_name = %s
                ORDER BY ordinal_position
                """

        # Analyze each table for quality patterns
        for table in tables:
            schema = table['table_schema']
            table_name = table['table_name']
            full_name = f"{schema}.{table_name}"

            # One failing table must not abort the whole analysis.
            try:
                columns = db_connection.execute_query(environment, columns_query, (schema, table_name))

                if not columns:
                    continue

                # Analyze timestamp patterns (business processes)
                timestamp_cols = [
                    col['column_name'] for col in columns
                    if _is_timestamp_column(col['column_name'], col['data_type'])
                ]
                process_type = _infer_process_type(timestamp_cols)

                # Recorded before the row count so the process type is kept
                # even if counting the table fails.
                timestamp_patterns.append({
                    'schema': schema,
                    'table': table_name,
                    'full_name': full_name,
                    'timestamp_columns': timestamp_cols,
                    'timestamp_count': len(timestamp_cols),
                    'process_type': process_type
                })

                # Identifiers cannot be bound as parameters, so quote them
                # explicitly (including any embedded double quotes).
                count_query = (
                    'SELECT COUNT(*) as row_count FROM '
                    f'{_quote_identifier(schema)}.{_quote_identifier(table_name)}'
                )
                count_result = db_connection.execute_query(environment, count_query)
                row_count = count_result[0]['row_count'] if count_result else 0

                # Analyze nullable columns (data quality indicator)
                nullable_cols = [col for col in columns if col['is_nullable'] == 'YES']
                nullable_ratio = len(nullable_cols) / len(columns)

                quality_insights.append({
                    'schema': schema,
                    'table': table_name,
                    'full_name': full_name,
                    'row_count': row_count,
                    'column_count': len(columns),
                    'nullable_columns': len(nullable_cols),
                    'nullable_ratio': nullable_ratio,
                    'timestamp_columns': len(timestamp_cols),
                    'process_type': process_type,
                    'quality_score': 1 - nullable_ratio  # Simple quality score
                })

            except Exception as e:
                print(f"Warning: Could not analyze {schema}.{table_name}: {e}")

        if not quality_insights:
            print("No quality insights found")
            return None

        # Convert to DataFrames
        quality_df = pd.DataFrame(quality_insights)
        timestamp_df = pd.DataFrame(timestamp_patterns)

        # Create visualizations
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=(
                'Data Quality Score Distribution',
                'Business Process Types',
                'Table Size vs Quality Correlation',
                'Timestamp Pattern Analysis'
            ),
            specs=[[{"secondary_y": False}, {"type": "pie"}],
                   [{"secondary_y": False}, {"type": "table"}]]
        )

        # Quality score distribution
        fig.add_trace(
            go.Histogram(
                x=quality_df['quality_score'],
                nbinsx=10,
                name='Quality Score',
                marker_color='lightgreen'
            ),
            row=1, col=1
        )

        # Business process types
        process_counts = timestamp_df['process_type'].value_counts()
        fig.add_trace(
            go.Pie(
                labels=process_counts.index,
                values=process_counts.values,
                name='Process Types'
            ),
            row=1, col=2
        )

        # Size vs Quality correlation
        fig.add_trace(
            go.Scatter(
                x=quality_df['row_count'],
                y=quality_df['quality_score'],
                mode='markers',
                text=quality_df['full_name'],
                name='Tables',
                marker=dict(
                    size=8,
                    color=quality_df['quality_score'],
                    colorscale='RdYlGn',
                    showscale=True
                )
            ),
            row=2, col=1
        )

        # Timestamp pattern analysis: table count and mean number of
        # temporal columns per process type.
        timestamp_summary = timestamp_df.groupby('process_type').agg({
            'table': 'count',
            'timestamp_count': 'mean'
        }).round(1).reset_index()

        fig.add_trace(
            go.Table(
                header=dict(
                    values=['Process Type', 'Table Count', 'Avg Timestamps'],
                    fill_color='paleturquoise',
                    align='left'
                ),
                cells=dict(
                    values=[
                        timestamp_summary['process_type'],
                        timestamp_summary['table'],
                        timestamp_summary['timestamp_count']
                    ],
                    fill_color='lavender',
                    align='left'
                )
            ),
            row=2, col=2
        )

        fig.update_layout(
            height=800,
            title_text=f"Data Quality & Business Process Analysis - {environment.title()}",
            showlegend=True
        )

        fig.show()

        # Business insights summary
        avg_quality = quality_df['quality_score'].mean()
        high_quality_tables = len(quality_df[quality_df['quality_score'] > 0.8])
        low_quality_tables = len(quality_df[quality_df['quality_score'] < 0.5])

        print("\n📊 Data Quality Insights:")
        print(f"Average quality score: {avg_quality:.2f}")
        print(f"High quality tables (>80%): {high_quality_tables}")
        print(f"Low quality tables (<50%): {low_quality_tables}")

        print("\n🔄 Business Process Insights:")
        analyzed_tables = len(timestamp_df)
        for process_type, count in process_counts.items():
            percentage = (count / analyzed_tables) * 100
            print(f"  • {process_type.replace('_', ' ').title()}: {count} tables ({percentage:.1f}%)")

        # Actionable recommendations
        print("\n💡 Actionable Recommendations:")

        if low_quality_tables > 0:
            print(f"  • Review {low_quality_tables} low-quality tables for data governance improvements")

        crud_tables = len(timestamp_df[timestamp_df['process_type'] == 'crud_operations'])
        if crud_tables > 0:
            print(f"  • {crud_tables} tables support full CRUD operations - consider audit trails")

        append_tables = len(timestamp_df[timestamp_df['process_type'] == 'append_only'])
        if append_tables > 0:
            print(f"  • {append_tables} tables are append-only - good for analytics and reporting")

        large_tables = quality_df[quality_df['row_count'] > 100000]
        if len(large_tables) > 0:
            print(f"  • {len(large_tables)} large tables may benefit from partitioning or archival")

        return {
            'quality_insights': quality_df,
            'timestamp_patterns': timestamp_df,
            'business_recommendations': {
                'avg_quality': avg_quality,
                'process_distribution': dict(process_counts)
            }
        }

    except Exception as e:
        # Notebook boundary: report the failure in the cell output instead of
        # raising into the widget callback.
        print(f"❌ Error in data quality analysis: {e}")
        return None

# Button that triggers the data-quality analysis
quality_button = widgets.Button(
    description='Analyze Data Quality Insights',
    button_style='success',
    icon='check-circle',
)

quality_output = widgets.Output()


def on_quality_click(b):
    """Run the data-quality analysis for the selected environment.

    Everything printed or displayed goes into ``quality_output``, which is
    cleared first. ``b`` is the argument supplied by ``on_click``; it is unused.
    """
    with quality_output:
        quality_output.clear_output()
        selected_env = env_dropdown.value
        if not selected_env:
            print("Please select an environment first")
            return
        analyze_data_quality_insights(selected_env)


quality_button.on_click(on_quality_click)

display(widgets.VBox(children=[quality_button, quality_output]))

## Layer 3 Business Story Complete ✅

This notebook provided business-level insights including:

- **Domain Discovery** - Identified business domains by matching table and column names against domain keyword lists
- **Process Analysis** - Inferred business process types (CRUD, append-only, soft-delete, temporal tracking) from timestamp column patterns
- **Quality Insights** - Scored each table by its share of non-nullable columns as a simple data-quality indicator, and related the score to table size
- **Strategic Recommendations** - Generated actionable recommendations for business value

### Key Business Insights:
- **Naming conventions** reveal organizational structure and business domains
- **Timestamp patterns** indicate business process maturity (CRUD vs append-only)
- **Data quality scores** (the share of non-nullable columns per table) give a rough first signal of business process reliability
- **Table relationships** map to real-world business workflows

### Strategic Value:
- Prioritize data governance initiatives based on business impact
- Identify automation opportunities in high-volume, high-quality processes
- Plan system modernization around core business domains
- Optimize reporting and analytics for business-critical data flows

### Actionable Outcomes:
- **Data Governance Roadmap** - Focus on high-impact, low-quality areas
- **Process Optimization** - Automate mature, stable business processes
- **Architecture Evolution** - Plan migrations around business domain boundaries
- **Investment Priorities** - Direct resources to highest business value opportunities

### Next Steps:
- **04_multi_env_parallel_run.ipynb** - Compare business stories across environments
- **Executive Summary Report** - Present findings to business stakeholders
- **Implementation Roadmap** - Plan concrete actions based on recommendations