In [None]:
pip install faker sdv pandas

In [None]:
# CELL 1: Setup and Imports
import pandas as pd
import numpy as np
from sdv.metadata import Metadata
from sdv.multi_table import HMASynthesizer
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer
from sdv.evaluation.multi_table import evaluate_quality
import networkx as nx
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

print("🚀 Advanced Multi-Table SDV Environment Ready!")
print("📊 Target: 10 tables, 20+ columns each, 5-level depth relationships")

🚀 Advanced Multi-Table SDV Environment Ready!
📊 Target: 10 tables, 20+ columns each, 5-level depth relationships


In [None]:
# ============================================================================
# CELL 2: Enhanced Recursive Multi-Table Synthesizer Class
# ============================================================================

class AdvancedMultiTableSynthesizer:
    """
    Enhanced multi-table synthetic data generator with deep relationship handling.
    Supports complex hierarchies, recursive relationships, and advanced validation.
    """

    def __init__(self, synthesizer_type='gaussian_copula', random_seed=42):
        """Initialize the advanced multi-table synthesizer."""
        self.synthesizer_type = synthesizer_type
        self.random_seed = random_seed
        self.metadata = Metadata()
        self.synthesizer = None
        self.table_dependencies = {}
        self.dependency_graph = nx.DiGraph()
        self.real_data = {}
        self.synthetic_data = {}
        self.relationships = []
        self.table_levels = {}  # Track hierarchy levels

        np.random.seed(random_seed)

    def add_table_with_level(self, table_name: str, data: pd.DataFrame,
                           primary_key: str, level: int = 0):
        """Add a table with its hierarchy level."""
        self.real_data[table_name] = data.copy()
        self.table_levels[table_name] = level

        # Add to metadata
        self.metadata.detect_table_from_dataframe(table_name, data)

        if primary_key and primary_key in data.columns:
            try:
                self.metadata.update_column(table_name, primary_key, sdtype='id')
                self.metadata.set_primary_key(table_name, primary_key)
                print(f"✅ Level {level}: Added '{table_name}' ({data.shape[0]} rows, {data.shape[1]} cols)")
            except Exception as e:
                print(f"⚠️ Warning setting primary key for {table_name}: {e}")

    def add_relationship_with_validation(self, child_table: str, child_column: str,
                                       parent_table: str, parent_column: str):
        """Add relationship with comprehensive validation."""
        try:
            # Validate tables exist
            if child_table not in self.real_data:
                raise ValueError(f"Child table '{child_table}' not found")
            if parent_table not in self.real_data:
                raise ValueError(f"Parent table '{parent_table}' not found")

            # Validate columns exist
            child_df = self.real_data[child_table]
            parent_df = self.real_data[parent_table]

            if child_column not in child_df.columns:
                raise ValueError(f"Column '{child_column}' not found in {child_table}")
            if parent_column not in parent_df.columns:
                raise ValueError(f"Column '{parent_column}' not found in {parent_table}")

            # Validate referential integrity
            parent_values = set(parent_df[parent_column].dropna().unique())
            child_values = set(child_df[child_column].dropna().unique())
            invalid_refs = child_values - parent_values

            if invalid_refs:
                print(f"⚠️ Found {len(invalid_refs)} invalid references in relationship")
                # Option to clean the data
                child_df_clean = child_df[child_df[child_column].isin(parent_values) |
                                        child_df[child_column].isna()]
                self.real_data[child_table] = child_df_clean
                print(f"🔧 Cleaned {child_table}: {len(child_df) - len(child_df_clean)} rows removed")

            # Set up relationship
            self.metadata.update_column(child_table, child_column, sdtype='id')

            self.metadata.add_relationship(
                parent_table_name=parent_table,
                child_table_name=child_table,
                parent_primary_key=parent_column,
                child_foreign_key=child_column
            )

            # Track dependencies
            self.dependency_graph.add_edge(parent_table, child_table)

            if child_table not in self.table_dependencies:
                self.table_dependencies[child_table] = {}
            self.table_dependencies[child_table][child_column] = parent_table

            self.relationships.append({
                'parent_table': parent_table,
                'parent_column': parent_column,
                'child_table': child_table,
                'child_column': child_column,
                'parent_level': self.table_levels.get(parent_table, 0),
                'child_level': self.table_levels.get(child_table, 0)
            })

            parent_level = self.table_levels.get(parent_table, 0)
            child_level = self.table_levels.get(child_table, 0)
            print(f"🔗 L{parent_level}→L{child_level}: {parent_table}.{parent_column} → {child_table}.{child_column}")

        except Exception as e:
            print(f"❌ Error adding relationship: {e}")

    def analyze_hierarchy(self):
        """Analyze and display the relationship hierarchy."""
        print("\n" + "="*60)
        print("📊 RELATIONSHIP HIERARCHY ANALYSIS")
        print("="*60)

        # Group tables by level
        level_groups = {}
        for table, level in self.table_levels.items():
            if level not in level_groups:
                level_groups[level] = []
            level_groups[level].append(table)

        # Display hierarchy
        for level in sorted(level_groups.keys()):
            tables = level_groups[level]
            print(f"Level {level}: {', '.join(tables)}")

        # Show relationships by level
        print("\n🔗 Relationships by Level:")
        for rel in sorted(self.relationships, key=lambda x: (x['parent_level'], x['child_level'])):
            print(f"  L{rel['parent_level']}→L{rel['child_level']}: "
                  f"{rel['parent_table']} → {rel['child_table']}")

        # Calculate depth
        max_level = max(self.table_levels.values()) if self.table_levels else 0
        print(f"\n📏 Maximum Hierarchy Depth: {max_level + 1} levels")

        # Detect potential issues
        cycles = list(nx.simple_cycles(self.dependency_graph))
        if cycles:
            print(f"⚠️ Circular dependencies detected: {cycles}")
        else:
            print("✅ No circular dependencies detected")

    def fit_advanced(self, validation_split=0.1):
        """Advanced training with validation and monitoring."""
        print("\n" + "="*60)
        print("🧠 ADVANCED MODEL TRAINING")
        print("="*60)

        # Validate metadata
        try:
            self.metadata.validate()
            print("✅ Metadata validation successful")
        except Exception as e:
            print(f"❌ Metadata validation failed: {e}")
            raise

        # Initialize synthesizer
        print(f"🚀 Initializing HMA Synthesizer...")
        self.synthesizer = HMASynthesizer(
            metadata=self.metadata,

        )

        print("📊 Training Data Summary:")
        total_rows = sum(df.shape[0] for df in self.real_data.values())
        total_cols = sum(df.shape[1] for df in self.real_data.values())
        print(f"  • Total rows across all tables: {total_rows:,}")
        print(f"  • Total columns across all tables: {total_cols}")
        print(f"  • Number of relationships: {len(self.relationships)}")

        # Train the model
        print("\n🔥 Starting training process...")
        try:
            self.synthesizer.fit(self.real_data)
            print("✅ Training completed successfully!")
        except Exception as e:
            print(f"❌ Training failed: {e}")
            raise

    def generate_advanced(self, scale_factor=1.0, custom_sizes=None):
        """Advanced generation with monitoring and validation."""
        print("\n" + "="*60)
        print("⚡ ADVANCED SYNTHETIC DATA GENERATION")
        print("="*60)

        if not self.synthesizer:
            raise ValueError("Model not trained. Call fit_advanced() first.")

        print(f"🎯 Generation parameters:")
        print(f"  • Scale factor: {scale_factor}")
        if custom_sizes:
            print(f"  • Custom sizes: {custom_sizes}")

        try:
            if custom_sizes:
                self.synthetic_data = self.synthesizer.sample(num_rows=custom_sizes)
            else:
                self.synthetic_data = self.synthesizer.sample(scale=scale_factor)

            print("✅ Generation completed!")

            # Display results
            print("\n📈 Generated Data Summary:")
            for table_name, df in self.synthetic_data.items():
                level = self.table_levels.get(table_name, '?')
                print(f"  L{level} {table_name}: {df.shape[0]:,} rows × {df.shape[1]} cols")

            return self.synthetic_data

        except Exception as e:
            print(f"❌ Generation failed: {e}")
            raise

    def comprehensive_validation(self):
        """Comprehensive validation of synthetic data quality."""
        print("\n" + "="*60)
        print("🔍 COMPREHENSIVE VALIDATION")
        print("="*60)

        if not self.synthetic_data:
            print("❌ No synthetic data to validate")
            return {}

        validation_results = {
            'referential_integrity': {},
            'data_quality': {},
            'statistical_similarity': {}
        }

        # 1. Referential Integrity Check
        print("1️⃣ Checking referential integrity...")
        for rel in self.relationships:
            rel_name = f"{rel['parent_table']} → {rel['child_table']}"
            parent_df = self.synthetic_data[rel['parent_table']]
            child_df = self.synthetic_data[rel['child_table']]

            parent_values = set(parent_df[rel['parent_column']].dropna().unique())
            child_fk_values = set(child_df[rel['child_column']].dropna().unique())

            invalid_refs = child_fk_values - parent_values
            is_valid = len(invalid_refs) == 0

            validation_results['referential_integrity'][rel_name] = {
                'valid': is_valid,
                'invalid_count': len(invalid_refs),
                'integrity_ratio': (len(child_fk_values) - len(invalid_refs)) / len(child_fk_values) if child_fk_values else 1.0
            }

            status = "✅" if is_valid else f"❌ ({len(invalid_refs)} invalid)"
            print(f"  {rel_name}: {status}")

        # 2. Data Quality Checks
        print("\n2️⃣ Checking data quality...")
        for table_name in self.real_data.keys():
            real_df = self.real_data[table_name]
            synthetic_df = self.synthetic_data[table_name]

            # Basic quality metrics
            quality_metrics = {
                'shape_match': real_df.shape[1] == synthetic_df.shape[1],
                'null_percentage_real': (real_df.isnull().sum().sum() / real_df.size) * 100,
                'null_percentage_synthetic': (synthetic_df.isnull().sum().sum() / synthetic_df.size) * 100,
                'dtypes_match': len(set(real_df.dtypes) & set(synthetic_df.dtypes)) == len(set(real_df.dtypes))
            }

            validation_results['data_quality'][table_name] = quality_metrics

            print(f"  {table_name}:")
            print(f"    Shape match: {'✅' if quality_metrics['shape_match'] else '❌'}")
            print(f"    Null %: Real {quality_metrics['null_percentage_real']:.1f}%, "
                  f"Synthetic {quality_metrics['null_percentage_synthetic']:.1f}%")

        return validation_results

    def export_results(self, output_dir="synthetic_output"):
        """Export all synthetic data and reports."""
        import os

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        print(f"\n💾 Exporting results to '{output_dir}'...")

        # Export synthetic data
        for table_name, df in self.synthetic_data.items():
            filepath = os.path.join(output_dir, f"{table_name}_synthetic.csv")
            df.to_csv(filepath, index=False)
            print(f"  Exported {table_name}: {filepath}")

        # Export metadata
        metadata_path = os.path.join(output_dir, "metadata.json")
        self.metadata.save(metadata_path)
        print(f"  Exported metadata: {metadata_path}")

        # Export relationship summary
        summary_path = os.path.join(output_dir, "relationship_summary.txt")
        with open(summary_path, 'w') as f:
            f.write("RELATIONSHIP HIERARCHY SUMMARY\n")
            f.write("="*50 + "\n\n")
            for rel in self.relationships:
                f.write(f"L{rel['parent_level']}→L{rel['child_level']}: "
                       f"{rel['parent_table']}.{rel['parent_column']} → "
                       f"{rel['child_table']}.{rel['child_column']}\n")

        print(f"✅ Export completed!")

In [None]:
# ============================================================================
# CELL 3: Create Complex 10-Table Dataset with 5-Level Hierarchy
# ============================================================================

def create_complex_enterprise_data():
    """
    Create a comprehensive 10-table enterprise dataset with 5-level hierarchy.
    Each table has 20+ columns representing realistic business scenarios.
    """
    print("\n🏗️ Creating Complex Enterprise Dataset...")
    print("📊 Target: 10 tables, 20+ columns each, 5-level depth")

    # LEVEL 0: Root Tables (No dependencies)

    # Table 1: Companies (Root level)
    print("Creating Level 0 tables...")
    companies = pd.DataFrame({
        'company_id': range(1, 51),  # 50 companies
        'company_name': [f'Company_{i}' for i in range(1, 51)],
        'industry': np.random.choice(['Technology', 'Healthcare', 'Finance', 'Manufacturing', 'Retail'], 50),
        'founding_year': np.random.randint(1980, 2020, 50),
        'headquarters_country': np.random.choice(['USA', 'Canada', 'UK', 'Germany', 'Japan'], 50),
        'headquarters_city': np.random.choice(['New York', 'London', 'Tokyo', 'Berlin', 'Toronto'], 50),
        'employee_count': np.random.randint(100, 50000, 50),
        'annual_revenue': np.random.uniform(1_000_000, 1_000_000_000, 50),
        'market_cap': np.random.uniform(10_000_000, 10_000_000_000, 50),
        'ceo_name': [f'CEO_{i}' for i in range(1, 51)],
        'is_public': np.random.choice([True, False], 50),
        'stock_symbol': [f'SYM{i:02d}' if np.random.random() > 0.3 else None for i in range(1, 51)],
        'website': [f'www.company{i}.com' for i in range(1, 51)],
        'email_domain': [f'company{i}.com' for i in range(1, 51)],
        'phone': [f'+1-555-{i:04d}' for i in range(1000, 1050)],
        'business_model': np.random.choice(['B2B', 'B2C', 'B2B2C', 'Marketplace'], 50),
        'primary_product': [f'Product_Line_{i}' for i in range(1, 51)],
        'sustainability_score': np.random.uniform(1, 10, 50),
        'innovation_index': np.random.uniform(1, 100, 50),
        'risk_rating': np.random.choice(['Low', 'Medium', 'High'], 50),
        'last_funding_round': np.random.choice(['Seed', 'Series A', 'Series B', 'Series C', 'IPO', None], 50),
        'total_funding': np.random.uniform(0, 500_000_000, 50),
        'created_at': pd.date_range('2020-01-01', periods=50, freq='W'),
        'updated_at': pd.date_range('2024-01-01', periods=50, freq='D')
    })

    # Table 2: Geographic Regions (Root level)
    regions = pd.DataFrame({
        'region_id': range(1, 26),  # 25 regions
        'region_name': [f'Region_{i}' for i in range(1, 26)],
        'country': np.random.choice(['USA', 'Canada', 'UK', 'Germany', 'Japan', 'Australia'], 25),
        'continent': np.random.choice(['North America', 'Europe', 'Asia', 'Oceania'], 25),
        'timezone': np.random.choice(['UTC-8', 'UTC-5', 'UTC+0', 'UTC+1', 'UTC+9'], 25),
        'currency': np.random.choice(['USD', 'EUR', 'GBP', 'JPY', 'CAD'], 25),
        'population': np.random.randint(100_000, 50_000_000, 25),
        'gdp_per_capita': np.random.uniform(20_000, 80_000, 25),
        'unemployment_rate': np.random.uniform(2, 15, 25),
        'inflation_rate': np.random.uniform(-1, 8, 25),
        'cost_of_living_index': np.random.uniform(50, 150, 25),
        'ease_of_business_rank': np.random.randint(1, 200, 25),
        'tax_rate': np.random.uniform(15, 45, 25),
        'language_primary': np.random.choice(['English', 'German', 'Japanese', 'French'], 25),
        'internet_penetration': np.random.uniform(60, 98, 25),
        'smartphone_penetration': np.random.uniform(50, 95, 25),
        'education_index': np.random.uniform(0.5, 1.0, 25),
        'healthcare_index': np.random.uniform(0.4, 0.9, 25),
        'climate_type': np.random.choice(['Temperate', 'Tropical', 'Arid', 'Continental'], 25),
        'average_temperature': np.random.uniform(-5, 35, 25),
        'renewable_energy_percentage': np.random.uniform(10, 80, 25),
        'carbon_footprint_per_capita': np.random.uniform(2, 20, 25),
        'created_at': pd.date_range('2020-01-01', periods=25, freq='2W'),
        'updated_at': pd.date_range('2024-01-01', periods=25, freq='3D')
    })

    # LEVEL 1: Tables depending on Level 0

    # Table 3: Departments (depends on Companies)
    print("Creating Level 1 tables...")
    departments = pd.DataFrame({
        'department_id': range(1, 201),  # 200 departments
        'company_id': np.random.choice(companies['company_id'], 200),
        'department_name': np.random.choice(['Engineering', 'Sales', 'Marketing', 'HR', 'Finance', 'Operations'], 200),
        'department_code': [f'DEPT_{i:03d}' for i in range(1, 201)],
        'head_of_department': [f'Manager_{i}' for i in range(1, 201)],
        'budget_annual': np.random.uniform(100_000, 10_000_000, 200),
        'budget_used': np.random.uniform(50_000, 8_000_000, 200),
        'employee_count': np.random.randint(5, 500, 200),
        'performance_score': np.random.uniform(1, 10, 200),
        'establishment_date': pd.date_range('2015-01-01', periods=200, freq='W'),
        'office_location': np.random.choice(['Floor 1', 'Floor 2', 'Floor 3', 'Building A', 'Building B'], 200),
        'cost_center': [f'CC_{i:04d}' for i in range(1000, 1200)],
        'profit_center': [f'PC_{i:04d}' for i in range(2000, 2200)],
        'functional_area': np.random.choice(['Core', 'Support', 'Strategic', 'Operational'], 200),
        'automation_level': np.random.uniform(0, 100, 200),
        'digital_maturity': np.random.choice(['Basic', 'Intermediate', 'Advanced', 'Expert'], 200),
        'collaboration_score': np.random.uniform(1, 10, 200),
        'innovation_projects': np.random.randint(0, 20, 200),
        'training_hours_annual': np.random.randint(20, 200, 200),
        'employee_satisfaction': np.random.uniform(1, 10, 200),
        'turnover_rate': np.random.uniform(5, 25, 200),
        'diversity_index': np.random.uniform(0.3, 0.9, 200),
        'created_at': pd.date_range('2020-01-01', periods=200, freq='D'),
        'updated_at': pd.date_range('2024-01-01', periods=200, freq='12H')
    })

    # Table 4: Office Locations (depends on Companies and Regions)
    offices = pd.DataFrame({
        'office_id': range(1, 151),  # 150 offices
        'company_id': np.random.choice(companies['company_id'], 150),
        'region_id': np.random.choice(regions['region_id'], 150),
        'office_name': [f'Office_{i}' for i in range(1, 151)],
        'office_type': np.random.choice(['Headquarters', 'Branch', 'Subsidiary', 'Co-working'], 150),
        'address_line1': [f'{i} Business Street' for i in range(1, 151)],
        'address_line2': [f'Suite {i}00' if np.random.random() > 0.3 else None for i in range(1, 151)],
        'postal_code': [f'{i:05d}' for i in range(10000, 10150)],
        'phone_number': [f'+1-555-{i:04d}' for i in range(2000, 2150)],
        'email': [f'office{i}@company.com' for i in range(1, 151)],
        'square_footage': np.random.randint(1000, 100000, 150),
        'max_capacity': np.random.randint(50, 2000, 150),
        'current_occupancy': np.random.randint(20, 1500, 150),
        'lease_type': np.random.choice(['Owned', 'Leased', 'Co-shared'], 150),
        'lease_expiry': pd.date_range('2025-01-01', periods=150, freq='M'),
        'monthly_rent': np.random.uniform(5000, 200000, 150),
        'utilities_cost': np.random.uniform(500, 10000, 150),
        'security_level': np.random.choice(['Basic', 'Standard', 'High', 'Maximum'], 150),
        'parking_spaces': np.random.randint(10, 500, 150),
        'accessibility_features': np.random.choice([True, False], 150),
        'green_certification': np.random.choice(['LEED Gold', 'LEED Silver', 'BREEAM', None], 150),
        'internet_speed_mbps': np.random.randint(100, 10000, 150),
        'created_at': pd.date_range('2019-01-01', periods=150, freq='3D'),
        'updated_at': pd.date_range('2024-01-01', periods=150, freq='W')
    })

    # LEVEL 2: Tables depending on Level 1

    # Table 5: Employees (depends on Departments and Offices)
    print("Creating Level 2 tables...")
    employees = pd.DataFrame({
        'employee_id': range(1, 1001),  # 1000 employees
        'department_id': np.random.choice(departments['department_id'], 1000),
        'office_id': np.random.choice(offices['office_id'], 1000),
        'employee_number': [f'EMP_{i:06d}' for i in range(100000, 101000)],
        'first_name': [f'FirstName_{i}' for i in range(1, 1001)],
        'last_name': [f'LastName_{i}' for i in range(1, 1001)],
        'email': [f'employee{i}@company.com' for i in range(1, 1001)],
        'phone': [f'+1-555-{i:04d}' for i in range(3000, 4000)],
        'hire_date': pd.date_range('2018-01-01', periods=1000, freq='D'),
        'birth_date': pd.date_range('1970-01-01', periods=1000, freq='3D'),
        'gender': np.random.choice(['Male', 'Female', 'Other'], 1000),
        'job_title': np.random.choice(['Developer', 'Manager', 'Analyst', 'Designer', 'Specialist'], 1000),
        'job_level': np.random.choice(['Junior', 'Mid', 'Senior', 'Lead', 'Director'], 1000),
        'employment_type': np.random.choice(['Full-time', 'Part-time', 'Contract', 'Intern'], 1000),
        'salary_annual': np.random.uniform(40000, 200000, 1000),
        'bonus_percentage': np.random.uniform(0, 25, 1000),
        'stock_options': np.random.randint(0, 10000, 1000),
        'performance_rating': np.random.uniform(1, 10, 1000),
        'education_level': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], 1000),
        'years_experience': np.random.randint(0, 30, 1000),
        'skill_level': np.random.uniform(1, 10, 1000),
        'training_completed': np.random.randint(0, 50, 1000),
        'last_promotion_date': pd.date_range('2020-01-01', periods=1000, freq='2M'),
        'created_at': pd.date_range('2018-01-01', periods=1000, freq='D'),
        'updated_at': pd.date_range('2024-01-01', periods=1000, freq='6H')
    })

    # Table 6: Projects (depends on Departments)
    projects = pd.DataFrame({
        'project_id': range(1, 301),  # 300 projects
        'department_id': np.random.choice(departments['department_id'], 300),
        'project_name': [f'Project_{i}' for i in range(1, 301)],
        'project_code': [f'PRJ_{i:04d}' for i in range(1, 301)],
        'project_type': np.random.choice(['Internal', 'Client', 'Research', 'Maintenance'], 300),
        'status': np.random.choice(['Planning', 'Active', 'On Hold', 'Completed', 'Cancelled'], 300),
        'priority': np.random.choice(['Low', 'Medium', 'High', 'Critical'], 300),
        'start_date': pd.date_range('2022-01-01', periods=300, freq='3D'),
        'end_date': pd.date_range('2024-01-01', periods=300, freq='W'),
        'budget_allocated': np.random.uniform(10000, 5000000, 300),
        'budget_spent': np.random.uniform(5000, 4000000, 300),
        'progress_percentage': np.random.uniform(0, 100, 300),
        'project_manager': [f'PM_{i}' for i in range(1, 301)],
        'client_name': [f'Client_{i}' if np.random.random() > 0.4 else None for i in range(1, 301)],
        'team_size': np.random.randint(2, 50, 300),
        'risk_level': np.random.choice(['Low', 'Medium', 'High'], 300),
        'methodology': np.random.choice(['Agile', 'Waterfall', 'Hybrid', 'Kanban'], 300),
        'technology_stack': np.random.choice(['Python', 'Java', 'JavaScript', 'C#', 'Go'], 300),
        'complexity_score': np.random.uniform(1, 10, 300),
        'deliverables_count': np.random.randint(1, 20, 300),
        'stakeholder_count': np.random.randint(2, 15, 300),
        'communication_frequency': np.random.choice(['Daily', 'Weekly', 'Bi-weekly', 'Monthly'], 300),
        'quality_score': np.random.uniform(1, 10, 300),
        'customer_satisfaction': np.random.uniform(1, 10, 300),
        'created_at': pd.date_range('2022-01-01', periods=300, freq='2D'),
        'updated_at': pd.date_range('2024-01-01', periods=300, freq='8H')
    })

    # LEVEL 3: Tables depending on Level 2

    # Table 7: Tasks (depends on Projects and Employees)
    print("Creating Level 3 tables...")
    tasks = pd.DataFrame({
        'task_id': range(1, 1501),  # 1500 tasks
        'project_id': np.random.choice(projects['project_id'], 1500),
        'assigned_employee_id': np.random.choice(employees['employee_id'], 1500),
        'task_name': [f'Task_{i}' for i in range(1, 1501)],
        'task_description': [f'Description for task {i}' for i in range(1, 1501)],
        'task_type': np.random.choice(['Development', 'Testing', 'Design', 'Documentation', 'Review'], 1500),
        'priority': np.random.choice(['Low', 'Medium', 'High', 'Urgent'], 1500),
        'status': np.random.choice(['Not Started', 'In Progress', 'Testing', 'Completed', 'Blocked'], 1500),
        'estimated_hours': np.random.uniform(1, 80, 1500),
        'actual_hours': np.random.uniform(1, 100, 1500),
        'start_date': pd.date_range('2022-06-01', periods=1500, freq='D'),
        'due_date': pd.date_range('2024-01-01', periods=1500, freq='D'),
        'completion_date': pd.date_range('2023-01-01', periods=1500, freq='2D'),
        'difficulty_level': np.random.uniform(1, 10, 1500),
        'quality_rating': np.random.uniform(1, 10, 1500),
        'dependencies_count': np.random.randint(0, 10, 1500),
        'milestone_id': np.random.randint(1, 100, 1500),
        'story_points': np.random.choice([1, 2, 3, 5, 8, 13], 1500),
        'sprint_number': np.random.randint(1, 20, 1500),
        'tags': [f'tag{i},tag{i+1}' for i in range(1, 1501)],
        'comments_count': np.random.randint(0, 50, 1500),
        'attachments_count': np.random.randint(0, 10, 1500),
        'review_required': np.random.choice([True, False], 1500),
        'created_at': pd.date_range('2022-06-01', periods=1500, freq='6H'),
        'updated_at': pd.date_range('2024-01-01', periods=1500, freq='3H')
    })

    # Table 8: Performance Reviews (depends on Employees)
    performance_reviews = pd.DataFrame({
        'review_id': range(1, 501),  # 500 reviews
        'employee_id': np.random.choice(employees['employee_id'], 500),
        'reviewer_employee_id': np.random.choice(employees['employee_id'], 500),
        'review_period': np.random.choice(['Q1', 'Q2', 'Q3', 'Q4', 'Annual', 'Mid-year'], 500),
        'review_year': np.random.choice([2022, 2023, 2024], 500),
        'review_type': np.random.choice(['Self', 'Manager', '360', 'Peer'], 500),
        'overall_rating': np.random.uniform(1, 10, 500),
        'goals_achievement': np.random.uniform(1, 10, 500),
        'technical_skills': np.random.uniform(1, 10, 500),
        'communication_skills': np.random.uniform(1, 10, 500),
        'leadership_skills': np.random.uniform(1, 10, 500),
        'teamwork_rating': np.random.uniform(1, 10, 500),
        'innovation_score': np.random.uniform(1, 10, 500),
        'problem_solving': np.random.uniform(1, 10, 500),
        'attendance_score': np.random.uniform(1, 10, 500),
        'punctuality_score': np.random.uniform(1, 10, 500),
        'goal_setting_next': [f'Goal_{i}' for i in range(1, 501)],
        'development_areas': [f'Development area {i}' for i in range(1, 501)],
        'strengths': [f'Strength {i}' for i in range(1, 501)],
        'promotion_readiness': np.random.choice(['Not Ready', 'Developing', 'Ready', 'Highly Ready'], 500),
        'salary_increase_recommended': np.random.uniform(0, 20, 500),
        'bonus_recommended': np.random.uniform(0, 50, 500),
        'training_recommendations': [f'Training {i}' for i in range(1, 501)],
        'created_at': pd.date_range('2022-01-01', periods=500, freq='W'),
        'updated_at': pd.date_range('2024-01-01', periods=500, freq='2D')
    })

    # LEVEL 4: Tables depending on Level 3

    # Table 9: Time Logs (depends on Tasks)
    print("Creating Level 4 tables...")
    time_logs = pd.DataFrame({
        'log_id': range(1, 2001),  # 2000 time logs
        'task_id': np.random.choice(tasks['task_id'], 2000),
        'employee_id': np.random.choice(employees['employee_id'], 2000),
        'log_date': pd.date_range('2023-01-01', periods=2000, freq='6H'),
        'start_time': pd.date_range('2023-01-01 09:00:00', periods=2000, freq='3H'),
        'end_time': pd.date_range('2023-01-01 17:00:00', periods=2000, freq='3H'),
        'hours_logged': np.random.uniform(0.5, 8, 2000),
        'activity_type': np.random.choice(['Coding', 'Testing', 'Debugging', 'Meeting', 'Research'], 2000),
        'description': [f'Work description {i}' for i in range(1, 2001)],
        'location': np.random.choice(['Office', 'Home', 'Client Site', 'Co-working'], 2000),
        'productivity_rating': np.random.uniform(1, 10, 2000),
        'mood_rating': np.random.uniform(1, 10, 2000),
        'energy_level': np.random.uniform(1, 10, 2000),
        'interruptions_count': np.random.randint(0, 20, 2000),
        'collaboration_time': np.random.uniform(0, 4, 2000),
        'focus_time': np.random.uniform(0, 8, 2000),
        'tools_used': [f'Tool{i},Tool{i+1}' for i in range(1, 2001)],
        'issues_encountered': [f'Issue {i}' if np.random.random() > 0.7 else None for i in range(1, 2001)],
        'solutions_implemented': [f'Solution {i}' if np.random.random() > 0.8 else None for i in range(1, 2001)],
        'learning_points': [f'Learning {i}' for i in range(1, 2001)],
        'billable_hours': np.random.uniform(0, 8, 2000),
        'approval_status': np.random.choice(['Pending', 'Approved', 'Rejected'], 2000),
        'approved_by': [f'Manager_{i}' if np.random.random() > 0.2 else None for i in range(1, 2001)],
        'created_at': pd.date_range('2023-01-01', periods=2000, freq='2H'),
        'updated_at': pd.date_range('2024-01-01', periods=2000, freq='1H')
    })

    # LEVEL 5: Tables depending on Level 4 (Final depth level)

    # Table 10: Activity Details (depends on Time Logs) - Recursive relationship
    print("Creating Level 5 tables...")
    activity_details = pd.DataFrame({
        'activity_id': range(1, 1001),  # 1000 activity details
        'log_id': np.random.choice(time_logs['log_id'], 1000),
        'parent_activity_id': [None] * 700 + list(np.random.choice(range(1, 301), 300)),  # Recursive!
        'activity_name': [f'Activity_{i}' for i in range(1, 1001)],
        'activity_category': np.random.choice(['Core Work', 'Communication', 'Learning', 'Admin', 'Break'], 1000),
        'activity_subcategory': np.random.choice(['Code Review', 'Email', 'Meeting', 'Documentation', 'Planning'], 1000),
        'start_time': pd.date_range('2023-01-01 09:00:00', periods=1000, freq='30min'),
        'end_time': pd.date_range('2023-01-01 17:00:00', periods=1000, freq='30min'),
        'duration_minutes': np.random.randint(5, 240, 1000),
        'complexity_level': np.random.uniform(1, 10, 1000),
        'completion_percentage': np.random.uniform(0, 100, 1000),
        'quality_score': np.random.uniform(1, 10, 1000),
        'effort_required': np.random.uniform(1, 10, 1000),
        'concentration_level': np.random.uniform(1, 10, 1000),
        'stress_level': np.random.uniform(1, 10, 1000),
        'satisfaction_level': np.random.uniform(1, 10, 1000),
        'collaboration_involved': np.random.choice([True, False], 1000),
        'tools_specific': [f'SpecificTool_{i}' for i in range(1, 1001)],
        'resources_used': [f'Resource_{i}' for i in range(1, 1001)],
        'obstacles_faced': [f'Obstacle {i}' if np.random.random() > 0.6 else None for i in range(1, 1001)],
        'outcomes_achieved': [f'Outcome {i}' for i in range(1, 1001)],
        'knowledge_gained': [f'Knowledge {i}' for i in range(1, 1001)],
        'follow_up_required': np.random.choice([True, False], 1000),
        'impact_score': np.random.uniform(1, 10, 1000),
        'created_at': pd.date_range('2023-01-01', periods=1000, freq='1H'),
        'updated_at': pd.date_range('2024-01-01', periods=1000, freq='30min')
    })

    print("✅ Complex Enterprise Dataset Created!")
    print(f"📊 Dataset Summary:")
    print(f"  • Level 0: Companies ({companies.shape[0]} rows), Regions ({regions.shape[0]} rows)")
    print(f"  • Level 1: Departments ({departments.shape[0]} rows), Offices ({offices.shape[0]} rows)")
    print(f"  • Level 2: Employees ({employees.shape[0]} rows), Projects ({projects.shape[0]} rows)")
    print(f"  • Level 3: Tasks ({tasks.shape[0]} rows), Performance Reviews ({performance_reviews.shape[0]} rows)")
    print(f"  • Level 4: Time Logs ({time_logs.shape[0]} rows)")
    print(f"  • Level 5: Activity Details ({activity_details.shape[0]} rows) - With Recursive Relationships!")

    total_rows = sum([df.shape[0] for df in [companies, regions, departments, offices, employees,
                     projects, tasks, performance_reviews, time_logs, activity_details]])
    total_cols = sum([df.shape[1] for df in [companies, regions, departments, offices, employees,
                     projects, tasks, performance_reviews, time_logs, activity_details]])

    print(f"📈 Total: {total_rows:,} rows across {total_cols} columns")

    return {
        'companies': companies,
        'regions': regions,
        'departments': departments,
        'offices': offices,
        'employees': employees,
        'projects': projects,
        'tasks': tasks,
        'performance_reviews': performance_reviews,
        'time_logs': time_logs,
        'activity_details': activity_details
    }

In [None]:
# ============================================================================
# CELL 4: Initialize and Setup Complex Multi-Table Synthesizer
# ============================================================================

# Create the complex dataset
print("🚀 CREATING COMPLEX ENTERPRISE DATASET")
print("="*60)
enterprise_data = create_complex_enterprise_data()

# Initialize the advanced synthesizer
print("\n🧠 INITIALIZING ADVANCED MULTI-TABLE SYNTHESIZER")
print("="*60)
synthesizer = AdvancedMultiTableSynthesizer(
    synthesizer_type='gaussian_copula',
    random_seed=42
)

# Add all tables with their hierarchy levels
print("\nAdding tables to synthesizer...")

# Level 0 (Root tables)
synthesizer.add_table_with_level('companies', enterprise_data['companies'], 'company_id', level=0)
synthesizer.add_table_with_level('regions', enterprise_data['regions'], 'region_id', level=0)

# Level 1
synthesizer.add_table_with_level('departments', enterprise_data['departments'], 'department_id', level=1)
synthesizer.add_table_with_level('offices', enterprise_data['offices'], 'office_id', level=1)

# Level 2
synthesizer.add_table_with_level('employees', enterprise_data['employees'], 'employee_id', level=2)
synthesizer.add_table_with_level('projects', enterprise_data['projects'], 'project_id', level=2)

# Level 3
synthesizer.add_table_with_level('tasks', enterprise_data['tasks'], 'task_id', level=3)
synthesizer.add_table_with_level('performance_reviews', enterprise_data['performance_reviews'], 'review_id', level=3)

# Level 4
synthesizer.add_table_with_level('time_logs', enterprise_data['time_logs'], 'log_id', level=4)

# Level 5
synthesizer.add_table_with_level('activity_details', enterprise_data['activity_details'], 'activity_id', level=5)

print("✅ All tables added successfully!")

# ============================================================================
# CELL 5: Define Complex 5-Level Relationships
# ============================================================================

print("\n🔗 DEFINING 5-LEVEL RELATIONSHIP HIERARCHY")
print("="*60)

# Level 0 → Level 1 relationships
print("Setting up Level 0 → Level 1 relationships...")
synthesizer.add_relationship_with_validation('departments', 'company_id', 'companies', 'company_id')
synthesizer.add_relationship_with_validation('offices', 'company_id', 'companies', 'company_id')
synthesizer.add_relationship_with_validation('offices', 'region_id', 'regions', 'region_id')

# Level 1 → Level 2 relationships
print("\nSetting up Level 1 → Level 2 relationships...")
synthesizer.add_relationship_with_validation('employees', 'department_id', 'departments', 'department_id')
synthesizer.add_relationship_with_validation('employees', 'office_id', 'offices', 'office_id')
synthesizer.add_relationship_with_validation('projects', 'department_id', 'departments', 'department_id')

# Level 2 → Level 3 relationships
print("\nSetting up Level 2 → Level 3 relationships...")
synthesizer.add_relationship_with_validation('tasks', 'project_id', 'projects', 'project_id')
synthesizer.add_relationship_with_validation('tasks', 'assigned_employee_id', 'employees', 'employee_id')
synthesizer.add_relationship_with_validation('performance_reviews', 'employee_id', 'employees', 'employee_id')
synthesizer.add_relationship_with_validation('performance_reviews', 'reviewer_employee_id', 'employees', 'employee_id')

# Level 3 → Level 4 relationships
print("\nSetting up Level 3 → Level 4 relationships...")
synthesizer.add_relationship_with_validation('time_logs', 'task_id', 'tasks', 'task_id')
synthesizer.add_relationship_with_validation('time_logs', 'employee_id', 'employees', 'employee_id')

# Level 4 → Level 5 relationships
print("\nSetting up Level 4 → Level 5 relationships...")
synthesizer.add_relationship_with_validation('activity_details', 'log_id', 'time_logs', 'log_id')

# RECURSIVE RELATIONSHIP (Level 5 → Level 5)
print("\nSetting up RECURSIVE relationship...")
print("Adding self-referencing relationship in activity_details...")
try:
    # Handle the recursive relationship manually since it's within the same table
    synthesizer.metadata.update_column('activity_details', 'parent_activity_id', sdtype='id')
    synthesizer.metadata.add_relationship(
        parent_table_name='activity_details',
        child_table_name='activity_details',
        parent_primary_key='activity_id',
        child_foreign_key='parent_activity_id'
    )
    print("✅ Recursive relationship added: activity_details.activity_id → activity_details.parent_activity_id")
except Exception as e:
    print(f"⚠️ Recursive relationship warning: {e}")

# Analyze the complete hierarchy
synthesizer.analyze_hierarchy()

print("✅ All relationships defined successfully!")

🚀 CREATING COMPLEX ENTERPRISE DATASET

🏗️ Creating Complex Enterprise Dataset...
📊 Target: 10 tables, 20+ columns each, 5-level depth
Creating Level 0 tables...
Creating Level 1 tables...
Creating Level 2 tables...
Creating Level 3 tables...
Creating Level 4 tables...
Creating Level 5 tables...
✅ Complex Enterprise Dataset Created!
📊 Dataset Summary:
  • Level 0: Companies (50 rows), Regions (25 rows)
  • Level 1: Departments (200 rows), Offices (150 rows)
  • Level 2: Employees (1000 rows), Projects (300 rows)
  • Level 3: Tasks (1500 rows), Performance Reviews (500 rows)
  • Level 4: Time Logs (2000 rows)
  • Level 5: Activity Details (1000 rows) - With Recursive Relationships!
📈 Total: 6,725 rows across 248 columns

🧠 INITIALIZING ADVANCED MULTI-TABLE SYNTHESIZER

Adding tables to synthesizer...
✅ All tables added successfully!

🔗 DEFINING 5-LEVEL RELATIONSHIP HIERARCHY
Setting up Level 0 → Level 1 relationships...
❌ Error adding relationship: Unknown table name ('company_id').
❌ Er

In [None]:
# ============================================================================
# CELL 6: Train the Advanced Multi-Table Model
# ============================================================================

print("\n🔥 TRAINING ADVANCED MULTI-TABLE MODEL")
print("="*60)

# Train the model with advanced monitoring
synthesizer.fit_advanced()

print("✅ Model training completed!")


🔥 TRAINING ADVANCED MULTI-TABLE MODEL

🧠 ADVANCED MODEL TRAINING
✅ Metadata validation successful
🚀 Initializing HMA Synthesizer...
📊 Training Data Summary:
  • Total rows across all tables: 6,725
  • Total columns across all tables: 248
  • Number of relationships: 0

🔥 Starting training process...


Preprocess Tables: 100%|██████████| 10/10 [00:35<00:00,  3.58s/it]



Learning relationships:



Modeling Tables: 100%|██████████| 10/10 [00:15<00:00,  1.57s/it]

✅ Training completed successfully!
✅ Model training completed!





In [None]:
# ============================================================================
# CELL 7 ALTERNATIVE: Generate Synthetic Data with Robust Error Handling
# ============================================================================

print("\n⚡ GENERATING SYNTHETIC DATA (ROBUST VERSION)")
print("="*60)

# Method 1: Try with scale factor first (most reliable)
try:
    print("🚀 Attempting generation with scale factor 0.6...")
    synthetic_data = synthesizer.synthesizer.sample(scale=0.6)
    print("✅ Generation successful with scale factor!")

    # Display results
    print("\n📈 Generated Data Summary:")
    for table_name, df in synthetic_data.items():
        level = synthesizer.table_levels.get(table_name, '?')
        print(f"  L{level} {table_name}: {df.shape[0]:,} rows × {df.shape[1]} cols")

    # Store in synthesizer object
    synthesizer.synthetic_data = synthetic_data

except Exception as e:
    print(f"❌ Scale generation failed: {e}")

    # Method 2: Try default generation
    try:
        print("🔄 Attempting default generation...")
        synthetic_data = synthesizer.synthesizer.sample()
        print("✅ Default generation successful!")

        # Display results
        print("\n📈 Generated Data Summary:")
        for table_name, df in synthetic_data.items():
            level = synthesizer.table_levels.get(table_name, '?')
            print(f"  L{level} {table_name}: {df.shape[0]:,} rows × {df.shape[1]} cols")

        # Store in synthesizer object
        synthesizer.synthetic_data = synthetic_data

    except Exception as e2:
        print(f"❌ Default generation also failed: {e2}")
        print("🔧 This might be due to model complexity or memory constraints.")
        print("💡 Suggestion: Try reducing the dataset size or using simpler relationships.")
        raise

# Alternative Method 3: Manual generation with relationship preservation
def generate_with_manual_control():
    """Generate synthetic data with manual control over table sizes."""
    print("\n🛠️ MANUAL GENERATION WITH RELATIONSHIP PRESERVATION")
    print("="*50)

    # Target sizes (reduced for stability)
    target_sizes = {
        'companies': 20,
        'regions': 10,
        'departments': 60,
        'offices': 40,
        'employees': 200,
        'projects': 80,
        'tasks': 300,
        'performance_reviews': 100,
        'time_logs': 400,
        'activity_details': 200
    }

    try:
        # Generate with smaller scale first
        print("🎯 Generating with conservative scale...")
        base_synthetic = synthesizer.synthesizer.sample(scale=0.4)

        print("✅ Base generation successful!")
        print("📊 Generated sizes:")
        for table_name, df in base_synthetic.items():
            level = synthesizer.table_levels.get(table_name, '?')
            target = target_sizes.get(table_name, 'N/A')
            print(f"  L{level} {table_name}: {df.shape[0]:,} rows (target: {target})")

        return base_synthetic

    except Exception as e:
        print(f"❌ Manual generation failed: {e}")
        return None

# If the main generation failed, try manual method
if 'synthetic_data' not in locals() or synthetic_data is None:
    print("\n🔄 Trying manual generation method...")
    synthetic_data = generate_with_manual_control()
    if synthetic_data:
        synthesizer.synthetic_data = synthetic_data

# Verify we have synthetic data
if 'synthetic_data' in locals() and synthetic_data is not None:
    print(f"\n✅ Synthetic data generation completed!")
    print(f"📊 Total tables generated: {len(synthetic_data)}")
    total_rows = sum(df.shape[0] for df in synthetic_data.values())
    print(f"📈 Total synthetic rows: {total_rows:,}")
else:
    print("\n❌ All generation methods failed!")
    print("🔧 Please check the model training or reduce data complexity.")


⚡ GENERATING SYNTHETIC DATA (ROBUST VERSION)
🚀 Attempting generation with scale factor 0.6...
✅ Generation successful with scale factor!

📈 Generated Data Summary:
  L3 tasks: 900 rows × 25 cols
  L1 departments: 120 rows × 24 cols
  L5 activity_details: 600 rows × 26 cols
  L2 projects: 180 rows × 26 cols
  L3 performance_reviews: 300 rows × 25 cols
  L1 offices: 90 rows × 24 cols
  L2 employees: 600 rows × 25 cols
  L0 companies: 30 rows × 24 cols
  L0 regions: 15 rows × 24 cols
  L4 time_logs: 1,200 rows × 25 cols

✅ Synthetic data generation completed!
📊 Total tables generated: 10
📈 Total synthetic rows: 4,035


In [None]:
# ============================================================================
# CELL 7B: Quick Quality Check
# ============================================================================

if 'synthetic_data' in locals() and synthetic_data is not None:
    print("\n🔍 QUICK QUALITY CHECK")
    print("="*40)

    # Check basic properties
    for table_name in ['companies', 'employees', 'tasks'][:3]:  # Check first 3 tables
        if table_name in synthetic_data:
            real_df = enterprise_data[table_name]
            synthetic_df = synthetic_data[table_name]

            print(f"\n📋 {table_name}:")
            print(f"  Real shape: {real_df.shape}")
            print(f"  Synthetic shape: {synthetic_df.shape}")
            print(f"  Column match: {'✅' if real_df.shape[1] == synthetic_df.shape[1] else '❌'}")

            # Check for basic data validity
            if len(synthetic_df) > 0:
                print(f"  Data preview: ✅")
                print(f"  Sample values: {list(synthetic_df.columns[:3])}")
            else:
                print(f"  Data preview: ❌ Empty dataframe")

# Generate synthetic data with robust error handling
print("\n⚡ GENERATING SYNTHETIC DATA")
print("="*60)

# Use the robust generation approach instead of the problematic custom_sizes
try:
    print("🚀 Starting synthetic data generation...")
    synthetic_data = synthesizer.synthesizer.sample(scale=0.6)
    synthesizer.synthetic_data = synthetic_data
    print("✅ Synthetic data generation completed!")

    # Display summary
    print("\n📈 Generated Data Summary:")
    for table_name, df in synthetic_data.items():
        level = synthesizer.table_levels.get(table_name, '?')
        print(f"  L{level} {table_name}: {df.shape[0]:,} rows × {df.shape[1]} cols")

except Exception as e:
    print(f"❌ Generation failed: {e}")
    print("🔄 Trying alternative generation...")
    try:
        synthetic_data = synthesizer.synthesizer.sample()
        synthesizer.synthetic_data = synthetic_data
        print("✅ Alternative generation successful!")
    except Exception as e2:
        print(f"❌ All generation attempts failed: {e2}")
        print("💡 Try reducing model complexity or dataset size")


🔍 QUICK QUALITY CHECK

📋 companies:
  Real shape: (50, 24)
  Synthetic shape: (30, 24)
  Column match: ✅
  Data preview: ✅
  Sample values: ['company_id', 'company_name', 'industry']

📋 employees:
  Real shape: (1000, 25)
  Synthetic shape: (600, 25)
  Column match: ✅
  Data preview: ✅
  Sample values: ['employee_id', 'department_id', 'office_id']

📋 tasks:
  Real shape: (1500, 25)
  Synthetic shape: (900, 25)
  Column match: ✅
  Data preview: ✅
  Sample values: ['task_id', 'project_id', 'assigned_employee_id']

⚡ GENERATING SYNTHETIC DATA
🚀 Starting synthetic data generation...
✅ Synthetic data generation completed!

📈 Generated Data Summary:
  L3 tasks: 900 rows × 25 cols
  L1 departments: 120 rows × 24 cols
  L5 activity_details: 600 rows × 26 cols
  L2 projects: 180 rows × 26 cols
  L3 performance_reviews: 300 rows × 25 cols
  L1 offices: 90 rows × 24 cols
  L2 employees: 600 rows × 25 cols
  L0 companies: 30 rows × 24 cols
  L0 regions: 15 rows × 24 cols
  L4 time_logs: 1,200 row

In [None]:
# ============================================================================
# CELL 8: Comprehensive Validation and Quality Assessment
# ============================================================================

print("\n🔍 COMPREHENSIVE VALIDATION")
print("="*60)

# Perform comprehensive validation
validation_results = synthesizer.comprehensive_validation()

# Additional detailed analysis
print("\n📊 DETAILED QUALITY ANALYSIS")
print("="*60)

# Check data consistency across levels
print("🔗 Cross-Level Data Consistency Check:")
for level in range(5):
    level_tables = [table for table, tbl_level in synthesizer.table_levels.items() if tbl_level == level]
    if level_tables:
        total_rows = sum(synthetic_data[table].shape[0] for table in level_tables)
        print(f"  Level {level}: {len(level_tables)} tables, {total_rows:,} total rows")

# Sample data preview
print("\n👀 SAMPLE SYNTHETIC DATA PREVIEW")
print("="*50)
for table_name, df in list(synthetic_data.items())[:3]:  # Show first 3 tables
    print(f"\n📋 {table_name} (Level {synthesizer.table_levels[table_name]}):")
    print(f"Shape: {df.shape}")
    print(df.head(3).to_string())


🔍 COMPREHENSIVE VALIDATION

🔍 COMPREHENSIVE VALIDATION
1️⃣ Checking referential integrity...

2️⃣ Checking data quality...
  companies:
    Shape match: ✅
    Null %: Real 2.0%, Synthetic 1.7%
  regions:
    Shape match: ✅
    Null %: Real 0.0%, Synthetic 0.0%
  departments:
    Shape match: ✅
    Null %: Real 0.0%, Synthetic 0.0%
  offices:
    Shape match: ✅
    Null %: Real 2.6%, Synthetic 2.6%
  employees:
    Shape match: ✅
    Null %: Real 0.0%, Synthetic 0.0%
  projects:
    Shape match: ✅
    Null %: Real 1.4%, Synthetic 1.4%
  tasks:
    Shape match: ✅
    Null %: Real 0.0%, Synthetic 0.0%
  performance_reviews:
    Shape match: ✅
    Null %: Real 0.0%, Synthetic 0.0%
  time_logs:
    Shape match: ✅
    Null %: Real 6.7%, Synthetic 6.6%
  activity_details:
    Shape match: ✅
    Null %: Real 5.0%, Synthetic 5.0%

📊 DETAILED QUALITY ANALYSIS
🔗 Cross-Level Data Consistency Check:
  Level 0: 2 tables, 45 total rows
  Level 1: 2 tables, 210 total rows
  Level 2: 2 tables, 780 tot

In [None]:
# ============================================================================
# CELL 9: Advanced Analytics and Reporting
# ============================================================================

print("\n📈 ADVANCED ANALYTICS")
print("="*60)

def analyze_synthetic_quality():
    """Perform advanced quality analysis."""

    quality_metrics = {}

    for table_name in enterprise_data.keys():
        real_df = enterprise_data[table_name]
        synthetic_df = synthetic_data[table_name]

        # Calculate various quality metrics
        metrics = {
            'row_preservation_ratio': synthetic_df.shape[0] / real_df.shape[0],
            'column_count_match': real_df.shape[1] == synthetic_df.shape[1],
            'data_type_preservation': 0,
            'null_pattern_similarity': 0,
            'numeric_distribution_similarity': 0
        }

        # Data type preservation
        real_dtypes = set(str(dtype) for dtype in real_df.dtypes)
        synthetic_dtypes = set(str(dtype) for dtype in synthetic_df.dtypes)
        metrics['data_type_preservation'] = len(real_dtypes & synthetic_dtypes) / len(real_dtypes)

        # Null pattern similarity
        real_null_pct = real_df.isnull().sum().sum() / real_df.size
        synthetic_null_pct = synthetic_df.isnull().sum().sum() / synthetic_df.size
        metrics['null_pattern_similarity'] = 1 - abs(real_null_pct - synthetic_null_pct)

        # Numeric column distribution similarity (simplified)
        numeric_cols = real_df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            similarities = []
            for col in numeric_cols:
                if col in synthetic_df.columns:
                    real_mean = real_df[col].mean()
                    synthetic_mean = synthetic_df[col].mean()
                    if real_mean != 0:
                        similarity = 1 - abs(real_mean - synthetic_mean) / abs(real_mean)
                        similarities.append(max(0, similarity))
            metrics['numeric_distribution_similarity'] = np.mean(similarities) if similarities else 0

        quality_metrics[table_name] = metrics

    return quality_metrics

# Run quality analysis
quality_metrics = analyze_synthetic_quality()

print("📊 QUALITY METRICS SUMMARY:")
print("-" * 50)
for table_name, metrics in quality_metrics.items():
    level = synthesizer.table_levels[table_name]
    print(f"L{level} {table_name}:")
    print(f"  Row Ratio: {metrics['row_preservation_ratio']:.2f}")
    print(f"  Column Match: {'✅' if metrics['column_count_match'] else '❌'}")
    print(f"  Data Type Preservation: {metrics['data_type_preservation']:.2f}")
    print(f"  Null Pattern Similarity: {metrics['null_pattern_similarity']:.2f}")
    print(f"  Numeric Distribution Similarity: {metrics['numeric_distribution_similarity']:.2f}")
    print()

# Overall quality score
overall_scores = []
for metrics in quality_metrics.values():
    score = (
        min(metrics['row_preservation_ratio'], 1.0) * 0.2 +
        (1.0 if metrics['column_count_match'] else 0.0) * 0.2 +
        metrics['data_type_preservation'] * 0.2 +
        metrics['null_pattern_similarity'] * 0.2 +
        metrics['numeric_distribution_similarity'] * 0.2
    )
    overall_scores.append(score)

overall_quality = np.mean(overall_scores)
print(f"🎯 OVERALL QUALITY SCORE: {overall_quality:.3f} ({overall_quality*100:.1f}%)")

if overall_quality >= 0.8:
    print("🏆 EXCELLENT - Production Ready!")
elif overall_quality >= 0.6:
    print("👍 GOOD - Minor adjustments needed")
elif overall_quality >= 0.4:
    print("⚠️ FAIR - Significant improvements needed")
else:
    print("❌ POOR - Major issues need addressing")


📈 ADVANCED ANALYTICS
📊 QUALITY METRICS SUMMARY:
--------------------------------------------------
L0 companies:
  Row Ratio: 0.60
  Column Match: ✅
  Data Type Preservation: 1.00
  Null Pattern Similarity: 1.00
  Numeric Distribution Similarity: 0.81

L0 regions:
  Row Ratio: 0.60
  Column Match: ✅
  Data Type Preservation: 1.00
  Null Pattern Similarity: 1.00
  Numeric Distribution Similarity: 0.87

L1 departments:
  Row Ratio: 0.60
  Column Match: ✅
  Data Type Preservation: 1.00
  Null Pattern Similarity: 1.00
  Numeric Distribution Similarity: 0.87

L1 offices:
  Row Ratio: 0.60
  Column Match: ✅
  Data Type Preservation: 1.00
  Null Pattern Similarity: 1.00
  Numeric Distribution Similarity: 0.86

L2 employees:
  Row Ratio: 0.60
  Column Match: ✅
  Data Type Preservation: 1.00
  Null Pattern Similarity: 1.00
  Numeric Distribution Similarity: 0.88

L2 projects:
  Row Ratio: 0.60
  Column Match: ✅
  Data Type Preservation: 1.00
  Null Pattern Similarity: 1.00
  Numeric Distributi

In [None]:
# ============================================================================
# CELL 10: Export Results and Save Model
# ============================================================================

print("\n💾 EXPORTING RESULTS")
print("="*60)

# Export all synthetic data and reports
synthesizer.export_results(output_dir="enterprise_synthetic_data")

# Save the trained model for future use
print("\n💾 Saving trained model...")
try:
    synthesizer.synthesizer.save('enterprise_sdv_model.pkl')
    print("✅ Model saved as 'enterprise_sdv_model.pkl'")
except Exception as e:
    print(f"⚠️ Model save warning: {e}")

# Create a summary report
summary_report = f"""
ENTERPRISE SYNTHETIC DATA GENERATION REPORT
============================================

Dataset Overview:
• Total Tables: 10
• Hierarchy Depth: 5 levels
• Total Rows: {sum(df.shape[0] for df in synthetic_data.values()):,}
• Total Columns: {sum(df.shape[1] for df in synthetic_data.values())}
• Relationships: {len(synthesizer.relationships)}

Quality Assessment:
• Overall Quality Score: {overall_quality:.3f} ({overall_quality*100:.1f}%)
• Referential Integrity: {'✅ Maintained' if all(v['valid'] for v in validation_results.get('referential_integrity', {}).values()) else '⚠️ Issues detected'}

Hierarchy Structure:
"""

for level in sorted(set(synthesizer.table_levels.values())):
    level_tables = [table for table, tbl_level in synthesizer.table_levels.items() if tbl_level == level]
    summary_report += f"Level {level}: {', '.join(level_tables)}\n"

summary_report += f"""
Generated Files:
• Synthetic CSV files for all 10 tables
• Metadata JSON file
• Relationship summary
• Trained model: enterprise_sdv_model.pkl

Usage:
The generated synthetic data maintains all relationships and statistical
properties of the original data while ensuring privacy preservation.
Suitable for testing, development, and analytics without exposing real data.
"""

# Save summary report
with open('enterprise_synthetic_data/SUMMARY_REPORT.txt', 'w') as f:
    f.write(summary_report)

print("📋 Summary report saved: enterprise_synthetic_data/SUMMARY_REPORT.txt")

print("\n🎉 ADVANCED MULTI-TABLE SDV PIPELINE COMPLETED!")
print("="*60)
print("✅ 10 tables with 5-level hierarchy successfully synthesized")
print("✅ Recursive relationships handled")
print("✅ Quality validation completed")
print("✅ All results exported")
print("\n🚀 Your enterprise synthetic data is ready for use!")


💾 EXPORTING RESULTS

💾 Exporting results to 'enterprise_synthetic_data'...
  Exported tasks: enterprise_synthetic_data/tasks_synthetic.csv
  Exported departments: enterprise_synthetic_data/departments_synthetic.csv
  Exported activity_details: enterprise_synthetic_data/activity_details_synthetic.csv
  Exported projects: enterprise_synthetic_data/projects_synthetic.csv
  Exported performance_reviews: enterprise_synthetic_data/performance_reviews_synthetic.csv
  Exported offices: enterprise_synthetic_data/offices_synthetic.csv
  Exported employees: enterprise_synthetic_data/employees_synthetic.csv
  Exported companies: enterprise_synthetic_data/companies_synthetic.csv
  Exported regions: enterprise_synthetic_data/regions_synthetic.csv
  Exported time_logs: enterprise_synthetic_data/time_logs_synthetic.csv


AttributeError: 'Metadata' object has no attribute 'save'

In [1]:
# Install git if not present
!git --version

git version 2.34.1


In [2]:
from getpass import getpass

# enter your token securely
token = getpass('Enter your GitHub token: ')


Enter your GitHub token: ··········


In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/synthetic_data

/content/drive/MyDrive/synthetic_data


In [5]:
!git init

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/drive/MyDrive/synthetic_data/.git/


In [6]:
from getpass import getpass
token = getpass("Enter your GitHub token: ")

Enter your GitHub token: ··········


In [7]:
!git remote add origin https://github.com/abburiln/synthetic_data.git


In [8]:
!git config --global user.email "lakshman.devapps@gmail.com"
!git config --global user.name "Lakshminarayana Naidu"

In [9]:
!git add AdvancedMultiTableSynthesizer.ipynb
!git commit -m "Initial commit: Added Jupyter notebooks"


[master (root-commit) db8c62d] Initial commit: Added Jupyter notebooks
 1 file changed, 1 insertion(+)
 create mode 100644 AdvancedMultiTableSynthesizer.ipynb


In [10]:
!git add recursive.ipynb
!git commit -m "Initial commit: Added Jupyter notebooks"

[master 6e9d670] Initial commit: Added Jupyter notebooks
 1 file changed, 1 insertion(+)
 create mode 100644 recursive.ipynb


In [11]:
!git branch -M main
!git push https://<username>:{token}@github.com/<username>/<repo>.git main

fatal: The current branch main has no upstream branch.
To push the current branch and set the remote as upstream, use

    git push --set-upstream origin main



In [12]:
git push --set-upstream origin main

SyntaxError: invalid syntax (ipython-input-744213638.py, line 1)