In [1]:
pip install faker sdv

  Attempting uninstall: nvidia-cusolver-cu12
    Found existing installation: nvidia-cusolver-cu12 11.6.3.83
    Uninstalling nvidia-cusolver-cu12-11.6.3.83:
      Successfully uninstalled nvidia-cusolver-cu12-11.6.3.83
Successfully installed boto3-1.40.12 botocore-1.40.12 copulas-0.12.3 ctgan-0.11.0 deepecho-0.7.0 faker-37.5.3 jmespath-1.0.1 nvidia-cublas-cu12-12.4.5.8 nvidia-cuda-cupti-cu12-12.4.127 nvidia-cuda-nvrtc-cu12-12.4.127 nvidia-cuda-runtime-cu12-12.4.127 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.2.1.3 nvidia-curand-cu12-10.3.5.147 nvidia-cusolver-cu12-11.6.1.9 nvidia-cusparse-cu12-12.3.1.170 nvidia-nccl-cu12-2.21.5 nvidia-nvjitlink-cu12-12.4.127 rdt-1.18.0 s3transfer-0.13.1 sdmetrics-0.23.0 sdv-1.26.0


In [18]:
import pandas as pd
import numpy as np
from sdv.metadata import Metadata
from sdv.multi_table import HMASynthesizer
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer
import networkx as nx
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

class RecursiveMultiTableSynthesizer:
    """
    A comprehensive multi-table synthetic data generator that handles
    recursive relationships and complex table dependencies using SDV.
    """

    def __init__(self, synthesizer_type='gaussian_copula'):
        """
        Initialize the recursive multi-table synthesizer.

        Args:
            synthesizer_type (str): Type of synthesizer ('gaussian_copula' or 'ctgan')
        """
        self.synthesizer_type = synthesizer_type
        self.metadata = Metadata()
        self.synthesizer = None
        self.table_dependencies = {}
        self.dependency_graph = nx.DiGraph()
        self.real_data = {}
        self.synthetic_data = {}
        self.relationships = []

    def add_table_data(self, table_name: str, data: pd.DataFrame,
                      primary_key: Optional[str] = None):
        """
        Add a table to the multi-table structure.

        Args:
            table_name (str): Name of the table
            data (pd.DataFrame): The actual data
            primary_key (str): Primary key column name
        """
        self.real_data[table_name] = data.copy()

        # Add table to metadata using the correct method
        self.metadata.detect_table_from_dataframe(table_name, data)

        # Set primary key if provided
        if primary_key and primary_key in data.columns:
            try:
                self.metadata.update_column(table_name, primary_key, sdtype='id')
                self.metadata.set_primary_key(table_name, primary_key)
                print(f"✓ Added table '{table_name}' with primary key '{primary_key}'")
            except Exception as e:
                print(f"Warning setting primary key for {table_name}: {e}")
        else:
            print(f"✓ Added table '{table_name}' (no primary key specified)")

    def add_foreign_key_relationship(self, child_table: str, child_column: str,
                                   parent_table: str, parent_column: str):
        """
        Add a foreign key relationship between tables.

        Args:
            child_table (str): Child table name
            child_column (str): Foreign key column in child table
            parent_table (str): Parent table name
            parent_column (str): Referenced column in parent table
        """
        try:
            # Ensure both tables exist
            if child_table not in self.real_data:
                raise ValueError(f"Child table '{child_table}' not found")
            if parent_table not in self.real_data:
                raise ValueError(f"Parent table '{parent_table}' not found")

            # Mark foreign key column as 'id' type
            self.metadata.update_column(child_table, child_column, sdtype='id')

            # Add relationship to metadata
            self.metadata.add_relationship(
                parent_table_name=parent_table,
                child_table_name=child_table,
                parent_primary_key=parent_column,
                child_foreign_key=child_column
            )

            # Track for dependency graph
            self.dependency_graph.add_edge(parent_table, child_table)

            # Store relationship info
            if child_table not in self.table_dependencies:
                self.table_dependencies[child_table] = {}
            self.table_dependencies[child_table][child_column] = parent_table

            self.relationships.append({
                'parent_table': parent_table,
                'parent_column': parent_column,
                'child_table': child_table,
                'child_column': child_column
            })

            print(f"✓ Added relationship: {parent_table}.{parent_column} -> {child_table}.{child_column}")

        except Exception as e:
            print(f"✗ Error adding relationship: {e}")

    def add_self_referencing_relationship(self, table_name: str,
                                        foreign_key_col: str,
                                        primary_key_col: str):
        """
        Add a self-referencing (recursive) relationship within a table.

        Args:
            table_name (str): Table name
            foreign_key_col (str): Self-referencing foreign key column
            primary_key_col (str): Primary key column being referenced
        """
        try:
            if table_name not in self.real_data:
                raise ValueError(f"Table '{table_name}' not found")

            # Update the foreign key column to be 'id' type
            self.metadata.update_column(table_name, foreign_key_col, sdtype='id')

            # Add self-referencing relationship
            self.metadata.add_relationship(
                parent_table_name=table_name,
                child_table_name=table_name,
                parent_primary_key=primary_key_col,
                child_foreign_key=foreign_key_col
            )

            # Add self-loop to dependency graph
            self.dependency_graph.add_edge(table_name, table_name)

            print(f"✓ Added self-referencing relationship in '{table_name}': {primary_key_col} -> {foreign_key_col}")

        except Exception as e:
            print(f"✗ Error adding self-referencing relationship: {e}")

    def _detect_recursive_relationships(self):
        """
        Detect and handle recursive relationships in the table structure.
        """
        recursive_tables = []

        # Find self-loops (recursive relationships)
        for table in self.dependency_graph.nodes():
            if self.dependency_graph.has_edge(table, table):
                recursive_tables.append(table)

        # Find cycles in dependency graph
        try:
            cycles = list(nx.simple_cycles(self.dependency_graph))
            if cycles:
                print(f"Detected circular dependencies: {cycles}")

        except Exception as e:
            print(f"Error detecting cycles: {e}")

        return recursive_tables

    def _get_synthesis_order(self) -> List[str]:
        """
        Determine the order for synthesizing tables based on dependencies.

        Returns:
            List[str]: Ordered list of table names
        """
        try:
            # Create a copy of the graph without self-loops for topological sort
            temp_graph = self.dependency_graph.copy()
            self_loops = list(nx.selfloop_edges(temp_graph))
            temp_graph.remove_edges_from(self_loops)

            # Topological sort to get dependency order
            synthesis_order = list(nx.topological_sort(temp_graph))

            # Add any tables not in the dependency graph
            all_tables = set(self.real_data.keys())
            ordered_tables = set(synthesis_order)
            remaining_tables = all_tables - ordered_tables
            synthesis_order.extend(list(remaining_tables))

            return synthesis_order

        except nx.NetworkXError:
            # If there are cycles, use a heuristic approach
            print("Circular dependencies detected. Using heuristic ordering.")
            return list(self.real_data.keys())

    def prepare_for_synthesis(self):
        """
        Prepare the data and metadata for synthesis.
        """
        # Detect recursive relationships
        recursive_tables = self._detect_recursive_relationships()
        if recursive_tables:
            print(f"Detected recursive relationships in tables: {recursive_tables}")

        print("\nMetadata Summary:")
        try:
            # Print metadata info
            metadata_dict = self.metadata.to_dict()
            print(f"Tables: {list(metadata_dict.get('tables', {}).keys())}")
            relationships = metadata_dict.get('relationships', [])
            print(f"Relationships: {len(relationships)}")
            for rel in relationships:
                print(f"  - {rel}")
        except Exception as e:
            print(f"Could not display metadata: {e}")

        # Validate metadata
        try:
            self.metadata.validate()
            print("✓ Metadata validation successful!")
        except Exception as e:
            print(f"✗ Metadata validation error: {e}")
            raise

        # Initialize synthesizer
        try:
            print(f"Initializing {self.synthesizer_type} synthesizer...")
            self.synthesizer = HMASynthesizer(metadata=self.metadata)
            print("✓ Synthesizer initialized successfully!")

        except Exception as e:
            print(f"✗ Error initializing synthesizer: {e}")
            raise

    def fit(self):
        """
        Train the multi-table synthesizer on the real data.
        """
        if not self.synthesizer:
            self.prepare_for_synthesis()

        print("Training multi-table synthesizer...")
        try:
            self.synthesizer.fit(self.real_data)
            print("✓ Training completed successfully!")
        except Exception as e:
            print(f"✗ Training error: {e}")
            raise

    def generate_synthetic_data(self, num_rows: Optional[Dict[str, int]] = None,
                              scale: float = 1.0) -> Dict[str, pd.DataFrame]:
        """
        Generate synthetic data for all tables.

        Args:
            num_rows (dict): Specific number of rows per table
            scale (float): Scale factor for data generation

        Returns:
            Dict[str, pd.DataFrame]: Synthetic data for each table
        """
        if not self.synthesizer:
            raise ValueError("Synthesizer not trained. Call fit() first.")

        print(f"Generating synthetic data...")

        try:
            if num_rows:
                self.synthetic_data = self.synthesizer.sample(num_rows=num_rows)
            else:
                self.synthetic_data = self.synthesizer.sample(scale=scale)

            print("✓ Synthetic data generation completed!")

            # Print summary
            for table_name, df in self.synthetic_data.items():
                print(f"  - {table_name}: {df.shape[0]} rows, {df.shape[1]} columns")

            return self.synthetic_data

        except Exception as e:
            print(f"✗ Generation error: {e}")
            raise

    def validate_relationships(self) -> Dict[str, bool]:
        """
        Validate that relationships are maintained in synthetic data.

        Returns:
            Dict[str, bool]: Validation results for each relationship
        """
        validation_results = {}

        if not self.synthetic_data:
            print("No synthetic data to validate.")
            return validation_results

        print("\nValidating relationships...")

        for rel in self.relationships:
            parent_table = rel['parent_table']
            parent_col = rel['parent_column']
            child_table = rel['child_table']
            child_col = rel['child_column']

            if parent_table not in self.synthetic_data or child_table not in self.synthetic_data:
                continue

            parent_df = self.synthetic_data[parent_table]
            child_df = self.synthetic_data[child_table]

            if parent_col in parent_df.columns and child_col in child_df.columns:
                # Get valid parent values (excluding None/NaN)
                parent_values = set(parent_df[parent_col].dropna().values)
                child_fk_values = set(child_df[child_col].dropna().values)

                # Check referential integrity
                invalid_refs = child_fk_values - parent_values
                is_valid = len(invalid_refs) == 0

                relationship_name = f"{parent_table}.{parent_col} -> {child_table}.{child_col}"
                validation_results[relationship_name] = is_valid

                status = "✓ Valid" if is_valid else f"✗ Invalid ({len(invalid_refs)} bad refs)"
                print(f"  {relationship_name}: {status}")

        return validation_results

    def get_summary_statistics(self) -> Dict:
        """
        Get summary statistics comparing real and synthetic data.
        """
        summary = {
            'real_data': {},
            'synthetic_data': {},
            'comparison': {}
        }

        for table_name in self.real_data.keys():
            real_df = self.real_data[table_name]
            summary['real_data'][table_name] = {
                'shape': real_df.shape,
                'numeric_columns': len(real_df.select_dtypes(include=[np.number]).columns),
                'categorical_columns': len(real_df.select_dtypes(include=['object']).columns)
            }

            if table_name in self.synthetic_data:
                synthetic_df = self.synthetic_data[table_name]
                summary['synthetic_data'][table_name] = {
                    'shape': synthetic_df.shape,
                    'numeric_columns': len(synthetic_df.select_dtypes(include=[np.number]).columns),
                    'categorical_columns': len(synthetic_df.select_dtypes(include=['object']).columns)
                }

                # Compare basic statistics
                summary['comparison'][table_name] = {
                    'row_ratio': synthetic_df.shape[0] / real_df.shape[0],
                    'column_match': synthetic_df.shape[1] == real_df.shape[1]
                }

        return summary

# Example usage and testing
def create_sample_data():
    """
    Create sample data with recursive relationships for testing.
    """
    np.random.seed(42)  # For reproducible results

    # Users table (root table)
    users = pd.DataFrame({
        'user_id': range(1, 101),
        'name': [f'User_{i}' for i in range(1, 101)],
        'age': np.random.randint(18, 80, 100),
        'city': np.random.choice(['NYC', 'LA', 'Chicago', 'Houston'], 100)
    })

    # Orders table (depends on users)
    orders = pd.DataFrame({
        'order_id': range(1, 301),
        'user_id': np.random.choice(users['user_id'], 300),
        'amount': np.round(np.random.uniform(10, 1000, 300), 2),
        'status': np.random.choice(['pending', 'completed', 'cancelled'], 300)
    })

    # Order_items table (depends on orders)
    order_items = pd.DataFrame({
        'item_id': range(1, 501),
        'order_id': np.random.choice(orders['order_id'], 500),
        'product_name': np.random.choice(['Product_A', 'Product_B', 'Product_C'], 500),
        'quantity': np.random.randint(1, 10, 500),
        'price': np.round(np.random.uniform(5, 200, 500), 2)
    })

    # Reviews table with self-referencing (recursive) relationship
    review_ids = range(1, 201)
    reviews = pd.DataFrame({
        'review_id': review_ids,
        'user_id': np.random.choice(users['user_id'], 200),
        'rating': np.random.randint(1, 6, 200),
        'comment': [f'Review comment {i}' for i in review_ids],
        # Self-reference: some reviews are replies to other reviews
        'reply_to': [None] * 150 + list(np.random.choice(range(1, 151), 50))
    })

    return users, orders, order_items, reviews

In [13]:
users, orders, order_items, reviews = create_sample_data()

In [15]:
synthesizer = RecursiveMultiTableSynthesizer(synthesizer_type='gaussian_copula')

In [19]:
if __name__ == "__main__":
    print("=== MULTI-TABLE RECURSIVE SYNTHETIC DATA GENERATION ===\n")

    # Create sample data
    users, orders, order_items, reviews = create_sample_data()
    print("✓ Sample data created")

    # Initialize synthesizer
    synthesizer = RecursiveMultiTableSynthesizer()

    # Add tables (order matters - add parent tables first)
    print("\nAdding tables to synthesizer...")
    synthesizer.add_table_data('users', users, primary_key='user_id')
    synthesizer.add_table_data('orders', orders, primary_key='order_id')
    synthesizer.add_table_data('order_items', order_items, primary_key='item_id')
    synthesizer.add_table_data('reviews', reviews, primary_key='review_id')

    # Add relationships
    print("\nAdding relationships...")
    synthesizer.add_foreign_key_relationship('orders', 'user_id', 'users', 'user_id')
    synthesizer.add_foreign_key_relationship('order_items', 'order_id', 'orders', 'order_id')
    synthesizer.add_foreign_key_relationship('reviews', 'user_id', 'users', 'user_id')

    # Add recursive relationship (reviews can reply to other reviews)
    synthesizer.add_self_referencing_relationship('reviews', 'reply_to', 'review_id')

    # Train the synthesizer
    print("\n" + "="*50)
    synthesizer.fit()

    # Generate synthetic data
    print("\n" + "="*50)
    synthetic_data = synthesizer.generate_synthetic_data(scale=0.8)

    # Show sample results
    print("\n=== SAMPLE SYNTHETIC DATA ===")
    for table_name, df in synthetic_data.items():
        print(f"\n{table_name.upper()}:")
        print(df.head(3))

    # Validate relationships
    print("\n" + "="*50)
    validation = synthesizer.validate_relationships()

    # Get summary statistics
    print("\n=== SUMMARY STATISTICS ===")
    summary = synthesizer.get_summary_statistics()
    for table_name, comparison in summary['comparison'].items():
        ratio = comparison['row_ratio']
        print(f"{table_name}: Generated {ratio:.2f}x the original data size")

    print("\n✓ Multi-table recursive synthesis completed!")

=== MULTI-TABLE RECURSIVE SYNTHETIC DATA GENERATION ===

✓ Sample data created

Adding tables to synthesizer...

Adding relationships...
✗ Error adding relationship: Unknown table name ('user_id').
✗ Error adding relationship: Unknown table name ('order_id').
✗ Error adding relationship: Unknown table name ('user_id').
✗ Error adding self-referencing relationship: Unknown table name ('reply_to').


Metadata Summary:
Tables: ['users', 'orders', 'order_items', 'reviews']
Relationships: 0
✓ Metadata validation successful!
Initializing gaussian_copula synthesizer...
✓ Synthesizer initialized successfully!
Training multi-table synthesizer...


Preprocess Tables: 100%|██████████| 4/4 [00:00<00:00, 15.90it/s]



Learning relationships:



Modeling Tables: 100%|██████████| 4/4 [00:00<00:00,  5.72it/s]


✓ Training completed successfully!

Generating synthetic data...
✓ Synthetic data generation completed!
  - order_items: 400 rows, 5 columns
  - reviews: 160 rows, 5 columns
  - orders: 240 rows, 4 columns
  - users: 80 rows, 4 columns

=== SAMPLE SYNTHETIC DATA ===

ORDER_ITEMS:
    item_id  order_id product_name  quantity   price
0  10088105       246    Product_B         5  157.65
1    279585       178    Product_B         5  168.44
2    853788       141    Product_C         3  173.25

REVIEWS:
   review_id  user_id  rating             comment  reply_to
0    5728054       26       2  Review comment 161       NaN
1    7789966        9       1  Review comment 165       NaN
2    3340734       53       3  Review comment 135       NaN

ORDERS:
   order_id  user_id  amount     status
0  11716418       93  124.75    pending
1   7454845       28  647.93  completed
2   6336744       39  697.52    pending

USERS:
    user_id     name  age            city
0   6138286  User_92   47  Johnsonboro

In [9]:
# Initialize synthesizer
synthesizer = RecursiveMultiTableSynthesizer(synthesizer_type='gaussian_copula')

In [11]:
# Add tables with relationships
synthesizer.add_table_data('users', users, primary_key='user_id')

AttributeError: 'MultiTableMetadata' object has no attribute 'detect_from_dataframe'

In [22]:
import pandas as pd
import numpy as np
from sdv.metadata import Metadata
from sdv.multi_table import HMASynthesizer
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer
import networkx as nx
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

class RecursiveMultiTableSynthesizer:
    """
    A comprehensive multi-table synthetic data generator that handles
    recursive relationships and complex table dependencies using SDV.
    """

    def __init__(self, synthesizer_type='gaussian_copula'):
        """
        Initialize the recursive multi-table synthesizer.

        Args:
            synthesizer_type (str): Type of synthesizer ('gaussian_copula' or 'ctgan')
        """
        self.synthesizer_type = synthesizer_type
        self.metadata = Metadata()
        self.synthesizer = None
        self.table_dependencies = {}
        self.dependency_graph = nx.DiGraph()
        self.real_data = {}
        self.synthetic_data = {}
        self.relationships = []

    def add_table_data(self, table_name: str, data: pd.DataFrame,
                      primary_key: Optional[str] = None):
        """
        Add a table to the multi-table structure.

        Args:
            table_name (str): Name of the table
            data (pd.DataFrame): The actual data
            primary_key (str): Primary key column name
        """
        self.real_data[table_name] = data.copy()

        # Add table to metadata using the correct method
        self.metadata.detect_table_from_dataframe(table_name, data)

        # Set primary key if provided
        if primary_key and primary_key in data.columns:
            try:
                self.metadata.update_column(table_name, primary_key, sdtype='id')
                self.metadata.set_primary_key(table_name, primary_key)
                print(f"✓ Added table '{table_name}' with primary key '{primary_key}'")
            except Exception as e:
                print(f"Warning setting primary key for {table_name}: {e}")
        else:
            print(f"✓ Added table '{table_name}' (no primary key specified)")

    def add_foreign_key_relationship(self, child_table: str, child_column: str,
                                   parent_table: str, parent_column: str):
        """
        Add a foreign key relationship between tables.

        Args:
            child_table (str): Child table name
            child_column (str): Foreign key column in child table
            parent_table (str): Parent table name
            parent_column (str): Referenced column in parent table (should be primary key)
        """
        try:
            # Ensure both tables exist
            if child_table not in self.real_data:
                raise ValueError(f"Child table '{child_table}' not found")
            if parent_table not in self.real_data:
                raise ValueError(f"Parent table '{parent_table}' not found")

            # Verify the columns exist
            if child_column not in self.real_data[child_table].columns:
                raise ValueError(f"Column '{child_column}' not found in table '{child_table}'")
            if parent_column not in self.real_data[parent_table].columns:
                raise ValueError(f"Column '{parent_column}' not found in table '{parent_table}'")

            # Mark foreign key column as 'id' type
            self.metadata.update_column(child_table, child_column, sdtype='id')

            # Add relationship to metadata
            # Note: The parent_primary_key should be the actual primary key column name
            # which should match the parent_column parameter
            self.metadata.add_relationship(
                parent_table_name=parent_table,
                child_table_name=child_table,
                parent_primary_key=parent_column,
                child_foreign_key=child_column
            )

            # Track for dependency graph
            self.dependency_graph.add_edge(parent_table, child_table)

            # Store relationship info
            if child_table not in self.table_dependencies:
                self.table_dependencies[child_table] = {}
            self.table_dependencies[child_table][child_column] = parent_table

            self.relationships.append({
                'parent_table': parent_table,
                'parent_column': parent_column,
                'child_table': child_table,
                'child_column': child_column
            })

            print(f"✓ Added relationship: {parent_table}.{parent_column} -> {child_table}.{child_column}")

        except Exception as e:
            print(f"✗ Error adding relationship: {e}")
            # Debug info
            print(f"  Parent table columns: {list(self.real_data[parent_table].columns) if parent_table in self.real_data else 'Table not found'}")
            print(f"  Child table columns: {list(self.real_data[child_table].columns) if child_table in self.real_data else 'Table not found'}")

    def add_self_referencing_relationship(self, table_name: str,
                                        foreign_key_col: str,
                                        primary_key_col: str):
        """
        Add a self-referencing (recursive) relationship within a table.

        Args:
            table_name (str): Table name
            foreign_key_col (str): Self-referencing foreign key column
            primary_key_col (str): Primary key column being referenced
        """
        try:
            if table_name not in self.real_data:
                raise ValueError(f"Table '{table_name}' not found")

            # Update the foreign key column to be 'id' type
            self.metadata.update_column(table_name, foreign_key_col, sdtype='id')

            # Add self-referencing relationship
            self.metadata.add_relationship(
                parent_table_name=table_name,
                child_table_name=table_name,
                parent_primary_key=primary_key_col,
                child_foreign_key=foreign_key_col
            )

            # Add self-loop to dependency graph
            self.dependency_graph.add_edge(table_name, table_name)

            print(f"✓ Added self-referencing relationship in '{table_name}': {primary_key_col} -> {foreign_key_col}")

        except Exception as e:
            print(f"✗ Error adding self-referencing relationship: {e}")

    def _detect_recursive_relationships(self):
        """
        Detect and handle recursive relationships in the table structure.
        """
        recursive_tables = []

        # Find self-loops (recursive relationships)
        for table in self.dependency_graph.nodes():
            if self.dependency_graph.has_edge(table, table):
                recursive_tables.append(table)

        # Find cycles in dependency graph
        try:
            cycles = list(nx.simple_cycles(self.dependency_graph))
            if cycles:
                print(f"Detected circular dependencies: {cycles}")

        except Exception as e:
            print(f"Error detecting cycles: {e}")

        return recursive_tables

    def _get_synthesis_order(self) -> List[str]:
        """
        Determine the order for synthesizing tables based on dependencies.

        Returns:
            List[str]: Ordered list of table names
        """
        try:
            # Create a copy of the graph without self-loops for topological sort
            temp_graph = self.dependency_graph.copy()
            self_loops = list(nx.selfloop_edges(temp_graph))
            temp_graph.remove_edges_from(self_loops)

            # Topological sort to get dependency order
            synthesis_order = list(nx.topological_sort(temp_graph))

            # Add any tables not in the dependency graph
            all_tables = set(self.real_data.keys())
            ordered_tables = set(synthesis_order)
            remaining_tables = all_tables - ordered_tables
            synthesis_order.extend(list(remaining_tables))

            return synthesis_order

        except nx.NetworkXError:
            # If there are cycles, use a heuristic approach
            print("Circular dependencies detected. Using heuristic ordering.")
            return list(self.real_data.keys())

    def prepare_for_synthesis(self):
        """
        Prepare the data and metadata for synthesis.
        """
        # Detect recursive relationships
        recursive_tables = self._detect_recursive_relationships()
        if recursive_tables:
            print(f"Detected recursive relationships in tables: {recursive_tables}")

        print("\nMetadata Summary:")
        try:
            # Print metadata info
            metadata_dict = self.metadata.to_dict()
            print(f"Tables: {list(metadata_dict.get('tables', {}).keys())}")
            relationships = metadata_dict.get('relationships', [])
            print(f"Relationships: {len(relationships)}")
            for rel in relationships:
                print(f"  - {rel}")
        except Exception as e:
            print(f"Could not display metadata: {e}")

        # Validate metadata
        try:
            self.metadata.validate()
            print("✓ Metadata validation successful!")
        except Exception as e:
            print(f"✗ Metadata validation error: {e}")
            raise

        # Initialize synthesizer
        try:
            print(f"Initializing {self.synthesizer_type} synthesizer...")
            self.synthesizer = HMASynthesizer(metadata=self.metadata)
            print("✓ Synthesizer initialized successfully!")

        except Exception as e:
            print(f"✗ Error initializing synthesizer: {e}")
            raise

    def fit(self):
        """
        Train the multi-table synthesizer on the real data.
        """
        if not self.synthesizer:
            self.prepare_for_synthesis()

        print("Training multi-table synthesizer...")
        try:
            self.synthesizer.fit(self.real_data)
            print("✓ Training completed successfully!")
        except Exception as e:
            print(f"✗ Training error: {e}")
            raise

    def generate_synthetic_data(self, num_rows: Optional[Dict[str, int]] = None,
                              scale: float = 1.0) -> Dict[str, pd.DataFrame]:
        """
        Generate synthetic data for all tables.

        Args:
            num_rows (dict): Specific number of rows per table
            scale (float): Scale factor for data generation

        Returns:
            Dict[str, pd.DataFrame]: Synthetic data for each table
        """
        if not self.synthesizer:
            raise ValueError("Synthesizer not trained. Call fit() first.")

        print(f"Generating synthetic data...")

        try:
            # The HMASynthesizer sample method only accepts scale parameter
            if num_rows:
                # Calculate average scale from num_rows
                total_original = sum(len(df) for df in self.real_data.values())
                total_requested = sum(num_rows.get(table, len(df)) for table, df in self.real_data.items())
                calculated_scale = total_requested / total_original if total_original > 0 else 1.0
                print(f"Converting num_rows to scale factor: {calculated_scale:.2f}")
                self.synthetic_data = self.synthesizer.sample(scale=calculated_scale)

                # Then trim tables to requested sizes
                for table_name, requested_rows in num_rows.items():
                    if table_name in self.synthetic_data:
                        current_rows = len(self.synthetic_data[table_name])
                        if current_rows > requested_rows:
                            self.synthetic_data[table_name] = self.synthetic_data[table_name].head(requested_rows)

            else:
                self.synthetic_data = self.synthesizer.sample(scale=scale)

            print("✓ Synthetic data generation completed!")

            # Print summary
            for table_name, df in self.synthetic_data.items():
                print(f"  - {table_name}: {df.shape[0]} rows, {df.shape[1]} columns")

            return self.synthetic_data

        except Exception as e:
            print(f"✗ Generation error: {e}")
            # Try with default parameters
            try:
                print("Retrying with scale=1.0...")
                self.synthetic_data = self.synthesizer.sample(scale=1.0)
                print("✓ Fallback generation successful!")
                return self.synthetic_data
            except Exception as e2:
                print(f"✗ Fallback also failed: {e2}")
                raise

    def validate_relationships(self) -> Dict[str, bool]:
        """
        Validate that relationships are maintained in synthetic data.

        Returns:
            Dict[str, bool]: Validation results for each relationship
        """
        validation_results = {}

        if not self.synthetic_data:
            print("No synthetic data to validate.")
            return validation_results

        print("\nValidating relationships...")

        for rel in self.relationships:
            parent_table = rel['parent_table']
            parent_col = rel['parent_column']
            child_table = rel['child_table']
            child_col = rel['child_column']

            if parent_table not in self.synthetic_data or child_table not in self.synthetic_data:
                continue

            parent_df = self.synthetic_data[parent_table]
            child_df = self.synthetic_data[child_table]

            if parent_col in parent_df.columns and child_col in child_df.columns:
                # Get valid parent values (excluding None/NaN)
                parent_values = set(parent_df[parent_col].dropna().values)
                child_fk_values = set(child_df[child_col].dropna().values)

                # Check referential integrity
                invalid_refs = child_fk_values - parent_values
                is_valid = len(invalid_refs) == 0

                relationship_name = f"{parent_table}.{parent_col} -> {child_table}.{child_col}"
                validation_results[relationship_name] = is_valid

                status = "✓ Valid" if is_valid else f"✗ Invalid ({len(invalid_refs)} bad refs)"
                print(f"  {relationship_name}: {status}")

        return validation_results

    def get_summary_statistics(self) -> Dict:
        """
        Get summary statistics comparing real and synthetic data.
        """
        summary = {
            'real_data': {},
            'synthetic_data': {},
            'comparison': {}
        }

        for table_name in self.real_data.keys():
            real_df = self.real_data[table_name]
            summary['real_data'][table_name] = {
                'shape': real_df.shape,
                'numeric_columns': len(real_df.select_dtypes(include=[np.number]).columns),
                'categorical_columns': len(real_df.select_dtypes(include=['object']).columns)
            }

            if table_name in self.synthetic_data:
                synthetic_df = self.synthetic_data[table_name]
                summary['synthetic_data'][table_name] = {
                    'shape': synthetic_df.shape,
                    'numeric_columns': len(synthetic_df.select_dtypes(include=[np.number]).columns),
                    'categorical_columns': len(synthetic_df.select_dtypes(include=['object']).columns)
                }

                # Compare basic statistics
                summary['comparison'][table_name] = {
                    'row_ratio': synthetic_df.shape[0] / real_df.shape[0],
                    'column_match': synthetic_df.shape[1] == real_df.shape[1]
                }

        return summary

# Example usage and testing
def create_comprehensive_sample_data():
    """
    Create comprehensive sample data with 5 levels of relationships and recursive structures.
    """
    np.random.seed(42)  # For reproducible results

    # LEVEL 1: Companies (Root table)
    companies = pd.DataFrame({
        'company_id': range(1, 21),  # 20 companies
        'company_name': [f'Company_{i}' for i in range(1, 21)],
        'industry': np.random.choice(['Tech', 'Finance', 'Healthcare', 'Retail'], 20),
        'founded_year': np.random.randint(1990, 2020, 20),
        'headquarters': np.random.choice(['NYC', 'SF', 'LA', 'Chicago', 'Boston'], 20)
    })

    # LEVEL 2: Departments (depends on companies)
    departments = pd.DataFrame({
        'dept_id': range(1, 101),  # 100 departments
        'company_id': np.random.choice(companies['company_id'], 100),
        'dept_name': np.random.choice(['Engineering', 'Sales', 'Marketing', 'HR', 'Finance'], 100),
        'budget': np.round(np.random.uniform(50000, 2000000, 100), 2),
        'manager_name': [f'Manager_{i}' for i in range(1, 101)]
    })

    # LEVEL 3: Employees (depends on departments)
    employees = pd.DataFrame({
        'employee_id': range(1, 501),  # 500 employees
        'dept_id': np.random.choice(departments['dept_id'], 500),
        'first_name': [f'FirstName_{i}' for i in range(1, 501)],
        'last_name': [f'LastName_{i}' for i in range(1, 501)],
        'salary': np.round(np.random.uniform(40000, 150000, 500), 2),
        'hire_date': pd.date_range('2020-01-01', periods=500, freq='D')[:500],
        'position': np.random.choice(['Junior', 'Senior', 'Lead', 'Manager'], 500),
        # Self-referencing: some employees report to other employees
        'reports_to': [None] * 400 + list(np.random.choice(range(1, 401), 100))
    })

    # LEVEL 4: Projects (depends on employees as project managers)
    projects = pd.DataFrame({
        'project_id': range(1, 201),  # 200 projects
        'manager_id': np.random.choice(employees['employee_id'], 200),
        'project_name': [f'Project_{i}' for i in range(1, 201)],
        'start_date': pd.date_range('2023-01-01', periods=200, freq='3D')[:200],
        'budget': np.round(np.random.uniform(10000, 500000, 200), 2),
        'status': np.random.choice(['Planning', 'In Progress', 'Completed', 'On Hold'], 200),
        'priority': np.random.choice(['Low', 'Medium', 'High', 'Critical'], 200)
    })

    # LEVEL 5: Tasks (depends on projects)
    tasks = pd.DataFrame({
        'task_id': range(1, 1001),  # 1000 tasks
        'project_id': np.random.choice(projects['project_id'], 1000),
        'assigned_to': np.random.choice(employees['employee_id'], 1000),
        'task_name': [f'Task_{i}' for i in range(1, 1001)],
        'description': [f'Task description {i}' for i in range(1, 1001)],
        'estimated_hours': np.random.randint(1, 40, 1000),
        'actual_hours': np.random.randint(1, 50, 1000),
        'status': np.random.choice(['Todo', 'In Progress', 'Review', 'Done'], 1000),
        # Self-referencing: some tasks depend on other tasks
        'depends_on_task': [None] * 800 + list(np.random.choice(range(1, 801), 200))
    })

    # ADDITIONAL: Employee Skills (Many-to-Many relationship simulation)
    skills = pd.DataFrame({
        'skill_id': range(1, 51),  # 50 different skills
        'skill_name': [f'Skill_{i}' for i in range(1, 51)],
        'category': np.random.choice(['Technical', 'Management', 'Communication', 'Domain'], 50),
        'difficulty_level': np.random.choice(['Beginner', 'Intermediate', 'Advanced', 'Expert'], 50)
    })

    # Employee-Skills mapping (represents many-to-many)
    employee_skills = pd.DataFrame({
        'emp_skill_id': range(1, 1501),  # 1500 skill assignments
        'employee_id': np.random.choice(employees['employee_id'], 1500),
        'skill_id': np.random.choice(skills['skill_id'], 1500),
        'proficiency': np.random.choice(['Beginner', 'Intermediate', 'Advanced', 'Expert'], 1500),
        'years_experience': np.random.randint(1, 10, 1500),
        'certified': np.random.choice([True, False], 1500)
    })

    # Project Teams (Many-to-Many: Projects to Employees)
    project_teams = pd.DataFrame({
        'team_id': range(1, 801),  # 800 team assignments
        'project_id': np.random.choice(projects['project_id'], 800),
        'employee_id': np.random.choice(employees['employee_id'], 800),
        'role': np.random.choice(['Developer', 'Tester', 'Analyst', 'Designer'], 800),
        'allocation_percent': np.random.randint(25, 100, 800),
        'start_date': pd.date_range('2023-01-01', periods=800, freq='2D')[:800]
    })

    return {
        'companies': companies,
        'departments': departments,
        'employees': employees,
        'projects': projects,
        'tasks': tasks,
        'skills': skills,
        'employee_skills': employee_skills,
        'project_teams': project_teams
    }

In [23]:
# Demonstration
if __name__ == "__main__":
    print("=== COMPREHENSIVE MULTI-TABLE RECURSIVE SYNTHETIC DATA GENERATION ===\n")

    # Create comprehensive sample data with 5 levels of relationships
    data_tables = create_comprehensive_sample_data()
    print("✓ Comprehensive sample data created with 5 levels of relationships")

    # Show data summary
    print("\n=== DATA SUMMARY ===")
    for table_name, df in data_tables.items():
        print(f"{table_name}: {df.shape[0]} rows, {df.shape[1]} columns")

    # Initialize synthesizer
    synthesizer = RecursiveMultiTableSynthesizer()

    # Add tables in dependency order (parents first)
    print("\n=== ADDING TABLES ===")
    synthesizer.add_table_data('companies', data_tables['companies'], primary_key='company_id')
    synthesizer.add_table_data('departments', data_tables['departments'], primary_key='dept_id')
    synthesizer.add_table_data('employees', data_tables['employees'], primary_key='employee_id')
    synthesizer.add_table_data('projects', data_tables['projects'], primary_key='project_id')
    synthesizer.add_table_data('tasks', data_tables['tasks'], primary_key='task_id')
    synthesizer.add_table_data('skills', data_tables['skills'], primary_key='skill_id')
    synthesizer.add_table_data('employee_skills', data_tables['employee_skills'], primary_key='emp_skill_id')
    synthesizer.add_table_data('project_teams', data_tables['project_teams'], primary_key='team_id')

    # Add relationships - LEVEL 1 to 5 hierarchy
    print("\n=== ADDING RELATIONSHIPS ===")
    print("Level 1->2: Companies to Departments")
    synthesizer.add_foreign_key_relationship('departments', 'company_id', 'companies', 'company_id')

    print("Level 2->3: Departments to Employees")
    synthesizer.add_foreign_key_relationship('employees', 'dept_id', 'departments', 'dept_id')

    print("Level 3->4: Employees to Projects (as managers)")
    synthesizer.add_foreign_key_relationship('projects', 'manager_id', 'employees', 'employee_id')

    print("Level 4->5: Projects to Tasks")
    synthesizer.add_foreign_key_relationship('tasks', 'project_id', 'projects', 'project_id')

    print("Additional: Tasks assigned to Employees")
    synthesizer.add_foreign_key_relationship('tasks', 'assigned_to', 'employees', 'employee_id')

    # Many-to-Many relationships
    print("Many-to-Many: Employee Skills")
    synthesizer.add_foreign_key_relationship('employee_skills', 'employee_id', 'employees', 'employee_id')
    synthesizer.add_foreign_key_relationship('employee_skills', 'skill_id', 'skills', 'skill_id')

    print("Many-to-Many: Project Teams")
    synthesizer.add_foreign_key_relationship('project_teams', 'project_id', 'projects', 'project_id')
    synthesizer.add_foreign_key_relationship('project_teams', 'employee_id', 'employees', 'employee_id')

    # Recursive/Self-referencing relationships
    print("\nRecursive Relationships:")
    synthesizer.add_self_referencing_relationship('employees', 'reports_to', 'employee_id')
    synthesizer.add_self_referencing_relationship('tasks', 'depends_on_task', 'task_id')

    # Train the synthesizer
    print("\n" + "="*60)
    synthesizer.fit()

    # Generate synthetic data with custom scaling
    print("\n" + "="*60)
    # Generate different scales for different table types
    custom_rows = {
        'companies': 15,      # Fewer companies
        'departments': 80,    # Fewer departments
        'employees': 400,     # Scale down employees
        'projects': 150,      # Scale down projects
        'tasks': 800,         # Scale down tasks
        'skills': 40,         # Fewer skills
        'employee_skills': 1200,  # Scale down skill assignments
        'project_teams': 600      # Scale down team assignments
    }

    synthetic_data = synthesizer.generate_synthetic_data(num_rows=custom_rows)

    # Show sample results from each level
    print("\n" + "="*60)
    print("=== SAMPLE SYNTHETIC DATA BY LEVEL ===")

    level_tables = [
        ('Level 1 - Companies', 'companies'),
        ('Level 2 - Departments', 'departments'),
        ('Level 3 - Employees', 'employees'),
        ('Level 4 - Projects', 'projects'),
        ('Level 5 - Tasks', 'tasks')
    ]

    for level_name, table_name in level_tables:
        if table_name in synthetic_data:
            print(f"\n{level_name.upper()}:")
            df = synthetic_data[table_name]
            print(f"Shape: {df.shape}")
            print(df.head(2))

    # Show recursive relationship examples
    print("\n=== RECURSIVE RELATIONSHIPS EXAMPLES ===")

    # Employee hierarchy
    emp_df = synthetic_data['employees']
    reporting_relationships = emp_df[emp_df['reports_to'].notna()][['employee_id', 'reports_to']].head(5)
    if not reporting_relationships.empty:
        print("\nEmployee Reporting Structure:")
        print(reporting_relationships)

    # Task dependencies
    task_df = synthetic_data['tasks']
    task_dependencies = task_df[task_df['depends_on_task'].notna()][['task_id', 'depends_on_task']].head(5)
    if not task_dependencies.empty:
        print("\nTask Dependencies:")
        print(task_dependencies)

    # Validate all relationships
    print("\n" + "="*60)
    validation = synthesizer.validate_relationships()

    # Get comprehensive summary
    print("\n=== COMPREHENSIVE SUMMARY ===")
    summary = synthesizer.get_summary_statistics()
    print("Data Generation Ratios:")
    for table_name, comparison in summary['comparison'].items():
        ratio = comparison['row_ratio']
        print(f"  {table_name:15}: {ratio:.2f}x original size")

    print(f"\n✓ 5-Level Multi-table Recursive Synthesis Completed!")
    print(f"✓ Generated data for {len(synthetic_data)} interconnected tables")
    print(f"✓ Maintained {len(validation)} relationship constraints")

    # Show relationship validation summary
    valid_relationships = sum(1 for v in validation.values() if v)
    total_relationships = len(validation)
    print(f"✓ Relationship integrity: {valid_relationships}/{total_relationships} valid")

    if valid_relationships == total_relationships:
        print("🎉 All relationships maintained perfectly!")
    elif valid_relationships > total_relationships * 0.8:
        print("✨ Most relationships maintained well!")
    else:
        print("⚠️  Some relationship issues detected - check validation details above")

=== COMPREHENSIVE MULTI-TABLE RECURSIVE SYNTHETIC DATA GENERATION ===

✓ Comprehensive sample data created with 5 levels of relationships

=== DATA SUMMARY ===
companies: 20 rows, 5 columns
departments: 100 rows, 5 columns
employees: 500 rows, 8 columns
projects: 200 rows, 7 columns
tasks: 1000 rows, 9 columns
skills: 50 rows, 4 columns
employee_skills: 1500 rows, 6 columns
project_teams: 800 rows, 6 columns

=== ADDING TABLES ===

=== ADDING RELATIONSHIPS ===
Level 1->2: Companies to Departments
✗ Error adding relationship: Unknown table name ('company_id').
  Parent table columns: ['company_id', 'company_name', 'industry', 'founded_year', 'headquarters']
  Child table columns: ['dept_id', 'company_id', 'dept_name', 'budget', 'manager_name']
Level 2->3: Departments to Employees
✗ Error adding relationship: Unknown table name ('dept_id').
  Parent table columns: ['dept_id', 'company_id', 'dept_name', 'budget', 'manager_name']
  Child table columns: ['employee_id', 'dept_id', 'first_nam

Preprocess Tables: 100%|██████████| 8/8 [00:04<00:00,  1.76it/s]



Learning relationships:



Modeling Tables: 100%|██████████| 8/8 [00:02<00:00,  3.21it/s]


✓ Training completed successfully!

Generating synthetic data...
Converting num_rows to scale factor: 0.79
✓ Synthetic data generation completed!
  - companies: 15 rows, 5 columns
  - project_teams: 600 rows, 6 columns
  - employees: 394 rows, 8 columns
  - projects: 150 rows, 7 columns
  - employee_skills: 1182 rows, 6 columns
  - departments: 79 rows, 5 columns
  - skills: 39 rows, 4 columns
  - tasks: 788 rows, 9 columns

=== SAMPLE SYNTHETIC DATA BY LEVEL ===

LEVEL 1 - COMPANIES:
Shape: (15, 5)
   company_id company_name    industry  founded_year headquarters
0     4780961   Company_18  Healthcare          2003          NYC
1      834739   Company_18      Retail          2009          NYC

LEVEL 2 - DEPARTMENTS:
Shape: (79, 5)
    dept_id  company_id  dept_name      budget manager_name
0  16183965           7  Marketing  1658565.38   Manager_15
1  10585028          15  Marketing  1494141.55   Manager_35

LEVEL 3 - EMPLOYEES:
Shape: (394, 8)
   employee_id  dept_id first_name    la

In [4]:
import pandas as pd
import numpy as np
from sdv.metadata import Metadata
from sdv.multi_table import HMASynthesizer
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer
import networkx as nx
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

class RecursiveMultiTableSynthesizer:
    """
    A comprehensive multi-table synthetic data generator that handles
    recursive relationships and complex table dependencies using SDV.
    """

    def __init__(self, synthesizer_type='gaussian_copula'):
        """
        Initialize the recursive multi-table synthesizer.

        Args:
            synthesizer_type (str): Type of synthesizer ('gaussian_copula' or 'ctgan')
        """
        self.synthesizer_type = synthesizer_type
        self.metadata = Metadata()
        self.synthesizer = None
        self.table_dependencies = {}
        self.dependency_graph = nx.DiGraph()
        self.real_data = {}
        self.synthetic_data = {}
        self.relationships = []

    def add_table_data(self, table_name: str, data: pd.DataFrame,
                      primary_key: Optional[str] = None):
        """
        Add a table to the multi-table structure.

        Args:
            table_name (str): Name of the table
            data (pd.DataFrame): The actual data
            primary_key (str): Primary key column name
        """
        self.real_data[table_name] = data.copy()

        # Add table to metadata using the correct method
        self.metadata.detect_table_from_dataframe(table_name, data)

        # Set primary key if provided
        if primary_key and primary_key in data.columns:
            try:
                self.metadata.update_column(table_name, primary_key, sdtype='id')
                self.metadata.set_primary_key(table_name, primary_key)
                print(f"✓ Added table '{table_name}' with primary key '{primary_key}'")
            except Exception as e:
                print(f"Warning setting primary key for {table_name}: {e}")
        else:
            print(f"✓ Added table '{table_name}' (no primary key specified)")

    def add_foreign_key_relationship(self, child_table: str, child_column: str,
                                   parent_table: str, parent_column: str):
        """
        Add a foreign key relationship between tables.

        Args:
            child_table (str): Child table name
            child_column (str): Foreign key column in child table
            parent_table (str): Parent table name
            parent_column (str): Referenced column in parent table (should be primary key)
        """
        try:
            # Ensure both tables exist
            if child_table not in self.real_data:
                raise ValueError(f"Child table '{child_table}' not found")
            if parent_table not in self.real_data:
                raise ValueError(f"Parent table '{parent_table}' not found")

            # Verify the columns exist
            if child_column not in self.real_data[child_table].columns:
                raise ValueError(f"Column '{child_column}' not found in table '{child_table}'")
            if parent_column not in self.real_data[parent_table].columns:
                raise ValueError(f"Column '{parent_column}' not found in table '{parent_table}'")

            # Mark foreign key column as 'id' type
            self.metadata.update_column(child_table, child_column, sdtype='id')

            # Add relationship to metadata
            # Note: The parent_primary_key should be the actual primary key column name
            # which should match the parent_column parameter
            self.metadata.add_relationship(
                parent_table_name=parent_table,
                child_table_name=child_table,
                parent_primary_key=parent_column,
                child_foreign_key=child_column
            )

            # Track for dependency graph
            self.dependency_graph.add_edge(parent_table, child_table)

            # Store relationship info
            if child_table not in self.table_dependencies:
                self.table_dependencies[child_table] = {}
            self.table_dependencies[child_table][child_column] = parent_table

            self.relationships.append({
                'parent_table': parent_table,
                'parent_column': parent_column,
                'child_table': child_table,
                'child_column': child_column
            })

            print(f"✓ Added relationship: {parent_table}.{parent_column} -> {child_table}.{child_column}")

        except Exception as e:
            print(f"✗ Error adding relationship: {e}")
            # Debug info
            print(f"  Parent table columns: {list(self.real_data[parent_table].columns) if parent_table in self.real_data else 'Table not found'}")
            print(f"  Child table columns: {list(self.real_data[child_table].columns) if child_table in self.real_data else 'Table not found'}")

    def add_self_referencing_relationship(self, table_name: str,
                                        foreign_key_col: str,
                                        primary_key_col: str):
        """
        Add a self-referencing (recursive) relationship within a table.

        Args:
            table_name (str): Table name
            foreign_key_col (str): Self-referencing foreign key column
            primary_key_col (str): Primary key column being referenced
        """
        try:
            if table_name not in self.real_data:
                raise ValueError(f"Table '{table_name}' not found")

            # Update the foreign key column to be 'id' type
            self.metadata.update_column(table_name, foreign_key_col, sdtype='id')

            # Add self-referencing relationship
            self.metadata.add_relationship(
                parent_table_name=table_name,
                child_table_name=table_name,
                parent_primary_key=primary_key_col,
                child_foreign_key=foreign_key_col
            )

            # Add self-loop to dependency graph
            self.dependency_graph.add_edge(table_name, table_name)

            print(f"✓ Added self-referencing relationship in '{table_name}': {primary_key_col} -> {foreign_key_col}")

        except Exception as e:
            print(f"✗ Error adding self-referencing relationship: {e}")

    def _detect_recursive_relationships(self):
        """
        Detect and handle recursive relationships in the table structure.
        """
        recursive_tables = []

        # Find self-loops (recursive relationships)
        for table in self.dependency_graph.nodes():
            if self.dependency_graph.has_edge(table, table):
                recursive_tables.append(table)

        # Find cycles in dependency graph
        try:
            cycles = list(nx.simple_cycles(self.dependency_graph))
            if cycles:
                print(f"Detected circular dependencies: {cycles}")

        except Exception as e:
            print(f"Error detecting cycles: {e}")

        return recursive_tables

    def _get_synthesis_order(self) -> List[str]:
        """
        Determine the order for synthesizing tables based on dependencies.

        Returns:
            List[str]: Ordered list of table names
        """
        try:
            # Create a copy of the graph without self-loops for topological sort
            temp_graph = self.dependency_graph.copy()
            self_loops = list(nx.selfloop_edges(temp_graph))
            temp_graph.remove_edges_from(self_loops)

            # Topological sort to get dependency order
            synthesis_order = list(nx.topological_sort(temp_graph))

            # Add any tables not in the dependency graph
            all_tables = set(self.real_data.keys())
            ordered_tables = set(synthesis_order)
            remaining_tables = all_tables - ordered_tables
            synthesis_order.extend(list(remaining_tables))

            return synthesis_order

        except nx.NetworkXError:
            # If there are cycles, use a heuristic approach
            print("Circular dependencies detected. Using heuristic ordering.")
            return list(self.real_data.keys())

    def prepare_for_synthesis(self):
        """
        Prepare the data and metadata for synthesis.
        """
        # Create metadata if not already created
        if self.metadata is None:
            self.create_metadata()

        # Detect recursive relationships
        recursive_tables = self._detect_recursive_relationships()
        if recursive_tables:
            print(f"Detected recursive relationships in tables: {recursive_tables}")

        print("\nMetadata Summary:")
        try:
            # Print metadata info
            metadata_dict = self.metadata.to_dict()
            print(f"Tables: {list(metadata_dict.get('tables', {}).keys())}")
            relationships = metadata_dict.get('relationships', [])
            print(f"Relationships: {len(relationships)}")
            for rel in relationships:
                print(f"  - {rel}")
        except Exception as e:
            print(f"Could not display metadata: {e}")

        # Validate metadata
        try:
            self.metadata.validate()
            print("✓ Metadata validation successful!")
        except Exception as e:
            print(f"✗ Metadata validation error: {e}")
            raise

        # Initialize synthesizer
        try:
            print(f"Initializing {self.synthesizer_type} synthesizer...")
            self.synthesizer = HMASynthesizer(metadata=self.metadata)
            print("✓ Synthesizer initialized successfully!")

        except Exception as e:
            print(f"✗ Error initializing synthesizer: {e}")
            raise

    def fit(self):
        """
        Train the multi-table synthesizer on the real data.
        """
        if not self.synthesizer:
            self.prepare_for_synthesis()

        print("Training multi-table synthesizer...")
        try:
            self.synthesizer.fit(self.real_data)
            print("✓ Training completed successfully!")
        except Exception as e:
            print(f"✗ Training error: {e}")
            raise

    def generate_synthetic_data(self, num_rows: Optional[Dict[str, int]] = None,
                              scale: float = 1.0) -> Dict[str, pd.DataFrame]:
        """
        Generate synthetic data for all tables.

        Args:
            num_rows (dict): Specific number of rows per table
            scale (float): Scale factor for data generation

        Returns:
            Dict[str, pd.DataFrame]: Synthetic data for each table
        """
        if not self.synthesizer:
            raise ValueError("Synthesizer not trained. Call fit() first.")

        print(f"Generating synthetic data...")

        try:
            # The HMASynthesizer sample method only accepts scale parameter
            if num_rows:
                # Calculate average scale from num_rows
                total_original = sum(len(df) for df in self.real_data.values())
                total_requested = sum(num_rows.get(table, len(df)) for table, df in self.real_data.items())
                calculated_scale = total_requested / total_original if total_original > 0 else 1.0
                print(f"Converting num_rows to scale factor: {calculated_scale:.2f}")
                self.synthetic_data = self.synthesizer.sample(scale=calculated_scale)

                # Then trim tables to requested sizes
                for table_name, requested_rows in num_rows.items():
                    if table_name in self.synthetic_data:
                        current_rows = len(self.synthetic_data[table_name])
                        if current_rows > requested_rows:
                            self.synthetic_data[table_name] = self.synthetic_data[table_name].head(requested_rows)

            else:
                self.synthetic_data = self.synthesizer.sample(scale=scale)

            print("✓ Synthetic data generation completed!")

            # Print summary
            for table_name, df in self.synthetic_data.items():
                print(f"  - {table_name}: {df.shape[0]} rows, {df.shape[1]} columns")

            return self.synthetic_data

        except Exception as e:
            print(f"✗ Generation error: {e}")
            # Try with default parameters
            try:
                print("Retrying with scale=1.0...")
                self.synthetic_data = self.synthesizer.sample(scale=1.0)
                print("✓ Fallback generation successful!")
                return self.synthetic_data
            except Exception as e2:
                print(f"✗ Fallback also failed: {e2}")
                raise

    def validate_relationships(self) -> Dict[str, bool]:
        """
        Validate that relationships are maintained in synthetic data.

        Returns:
            Dict[str, bool]: Validation results for each relationship
        """
        validation_results = {}

        if not self.synthetic_data:
            print("No synthetic data to validate.")
            return validation_results

        print("\nValidating relationships...")

        for rel in self.relationships:
            parent_table = rel['parent_table']
            parent_col = rel['parent_column']
            child_table = rel['child_table']
            child_col = rel['child_column']

            if parent_table not in self.synthetic_data or child_table not in self.synthetic_data:
                continue

            parent_df = self.synthetic_data[parent_table]
            child_df = self.synthetic_data[child_table]

            if parent_col in parent_df.columns and child_col in child_df.columns:
                # Get valid parent values (excluding None/NaN)
                parent_values = set(parent_df[parent_col].dropna().values)
                child_fk_values = set(child_df[child_col].dropna().values)

                # Check referential integrity
                invalid_refs = child_fk_values - parent_values
                is_valid = len(invalid_refs) == 0

                relationship_name = f"{parent_table}.{parent_col} -> {child_table}.{child_col}"
                validation_results[relationship_name] = is_valid

                status = "✓ Valid" if is_valid else f"✗ Invalid ({len(invalid_refs)} bad refs)"
                print(f"  {relationship_name}: {status}")

        return validation_results

    def get_summary_statistics(self) -> Dict:
        """
        Get summary statistics comparing real and synthetic data.
        """
        summary = {
            'real_data': {},
            'synthetic_data': {},
            'comparison': {}
        }

        for table_name in self.real_data.keys():
            real_df = self.real_data[table_name]
            summary['real_data'][table_name] = {
                'shape': real_df.shape,
                'numeric_columns': len(real_df.select_dtypes(include=[np.number]).columns),
                'categorical_columns': len(real_df.select_dtypes(include=['object']).columns)
            }

            if table_name in self.synthetic_data:
                synthetic_df = self.synthetic_data[table_name]
                summary['synthetic_data'][table_name] = {
                    'shape': synthetic_df.shape,
                    'numeric_columns': len(synthetic_df.select_dtypes(include=[np.number]).columns),
                    'categorical_columns': len(synthetic_df.select_dtypes(include=['object']).columns)
                }

                # Compare basic statistics
                summary['comparison'][table_name] = {
                    'row_ratio': synthetic_df.shape[0] / real_df.shape[0],
                    'column_match': synthetic_df.shape[1] == real_df.shape[1]
                }

        return summary

# Example usage and testing
def create_comprehensive_sample_data():
    """
    Create comprehensive sample data with 5 levels of relationships and recursive structures.
    """
    np.random.seed(42)  # For reproducible results

    # LEVEL 1: Companies (Root table)
    companies = pd.DataFrame({
        'company_id': range(1, 21),  # 20 companies
        'company_name': [f'Company_{i}' for i in range(1, 21)],
        'industry': np.random.choice(['Tech', 'Finance', 'Healthcare', 'Retail'], 20),
        'founded_year': np.random.randint(1990, 2020, 20),
        'headquarters': np.random.choice(['NYC', 'SF', 'LA', 'Chicago', 'Boston'], 20)
    })

    # LEVEL 2: Departments (depends on companies)
    departments = pd.DataFrame({
        'dept_id': range(1, 101),  # 100 departments
        'company_id': np.random.choice(companies['company_id'], 100),
        'dept_name': np.random.choice(['Engineering', 'Sales', 'Marketing', 'HR', 'Finance'], 100),
        'budget': np.round(np.random.uniform(50000, 2000000, 100), 2),
        'manager_name': [f'Manager_{i}' for i in range(1, 101)]
    })

    # LEVEL 3: Employees (depends on departments)
    employees = pd.DataFrame({
        'employee_id': range(1, 501),  # 500 employees
        'dept_id': np.random.choice(departments['dept_id'], 500),
        'first_name': [f'FirstName_{i}' for i in range(1, 501)],
        'last_name': [f'LastName_{i}' for i in range(1, 501)],
        'salary': np.round(np.random.uniform(40000, 150000, 500), 2),
        'hire_date': pd.date_range('2020-01-01', periods=500, freq='D')[:500],
        'position': np.random.choice(['Junior', 'Senior', 'Lead', 'Manager'], 500),
        # Self-referencing: some employees report to other employees
        'reports_to': [None] * 400 + list(np.random.choice(range(1, 401), 100))
    })

    # LEVEL 4: Projects (depends on employees as project managers)
    projects = pd.DataFrame({
        'project_id': range(1, 201),  # 200 projects
        'manager_id': np.random.choice(employees['employee_id'], 200),
        'project_name': [f'Project_{i}' for i in range(1, 201)],
        'start_date': pd.date_range('2023-01-01', periods=200, freq='3D')[:200],
        'budget': np.round(np.random.uniform(10000, 500000, 200), 2),
        'status': np.random.choice(['Planning', 'In Progress', 'Completed', 'On Hold'], 200),
        'priority': np.random.choice(['Low', 'Medium', 'High', 'Critical'], 200)
    })

    # LEVEL 5: Tasks (depends on projects)
    tasks = pd.DataFrame({
        'task_id': range(1, 1001),  # 1000 tasks
        'project_id': np.random.choice(projects['project_id'], 1000),
        'assigned_to': np.random.choice(employees['employee_id'], 1000),
        'task_name': [f'Task_{i}' for i in range(1, 1001)],
        'description': [f'Task description {i}' for i in range(1, 1001)],
        'estimated_hours': np.random.randint(1, 40, 1000),
        'actual_hours': np.random.randint(1, 50, 1000),
        'status': np.random.choice(['Todo', 'In Progress', 'Review', 'Done'], 1000),
        # Self-referencing: some tasks depend on other tasks
        'depends_on_task': [None] * 800 + list(np.random.choice(range(1, 801), 200))
    })

    # ADDITIONAL: Employee Skills (Many-to-Many relationship simulation)
    skills = pd.DataFrame({
        'skill_id': range(1, 51),  # 50 different skills
        'skill_name': [f'Skill_{i}' for i in range(1, 51)],
        'category': np.random.choice(['Technical', 'Management', 'Communication', 'Domain'], 50),
        'difficulty_level': np.random.choice(['Beginner', 'Intermediate', 'Advanced', 'Expert'], 50)
    })

    # Employee-Skills mapping (represents many-to-many)
    employee_skills = pd.DataFrame({
        'emp_skill_id': range(1, 1501),  # 1500 skill assignments
        'employee_id': np.random.choice(employees['employee_id'], 1500),
        'skill_id': np.random.choice(skills['skill_id'], 1500),
        'proficiency': np.random.choice(['Beginner', 'Intermediate', 'Advanced', 'Expert'], 1500),
        'years_experience': np.random.randint(1, 10, 1500),
        'certified': np.random.choice([True, False], 1500)
    })

    # Project Teams (Many-to-Many: Projects to Employees)
    project_teams = pd.DataFrame({
        'team_id': range(1, 801),  # 800 team assignments
        'project_id': np.random.choice(projects['project_id'], 800),
        'employee_id': np.random.choice(employees['employee_id'], 800),
        'role': np.random.choice(['Developer', 'Tester', 'Analyst', 'Designer'], 800),
        'allocation_percent': np.random.randint(25, 100, 800),
        'start_date': pd.date_range('2023-01-01', periods=800, freq='2D')[:800]
    })

    return {
        'companies': companies,
        'departments': departments,
        'employees': employees,
        'projects': projects,
        'tasks': tasks,
        'skills': skills,
        'employee_skills': employee_skills,
        'project_teams': project_teams
    }

In [5]:
# Demonstration
if __name__ == "__main__":
    print("=== COMPREHENSIVE MULTI-TABLE RECURSIVE SYNTHETIC DATA GENERATION ===\n")

    # Create comprehensive sample data with 5 levels of relationships
    data_tables = create_comprehensive_sample_data()
    print("✓ Comprehensive sample data created with 5 levels of relationships")

    # Show data summary
    print("\n=== DATA SUMMARY ===")
    for table_name, df in data_tables.items():
        print(f"{table_name}: {df.shape[0]} rows, {df.shape[1]} columns")

    # Initialize synthesizer
    synthesizer = RecursiveMultiTableSynthesizer()

    # Add tables in dependency order (parents first)
    print("\n=== ADDING TABLES ===")
    synthesizer.add_table_data('companies', data_tables['companies'], primary_key='company_id')
    synthesizer.add_table_data('departments', data_tables['departments'], primary_key='dept_id')
    synthesizer.add_table_data('employees', data_tables['employees'], primary_key='employee_id')
    synthesizer.add_table_data('projects', data_tables['projects'], primary_key='project_id')
    synthesizer.add_table_data('tasks', data_tables['tasks'], primary_key='task_id')
    synthesizer.add_table_data('skills', data_tables['skills'], primary_key='skill_id')
    synthesizer.add_table_data('employee_skills', data_tables['employee_skills'], primary_key='emp_skill_id')
    synthesizer.add_table_data('project_teams', data_tables['project_teams'], primary_key='team_id')

    # Add relationships - LEVEL 1 to 5 hierarchy
    print("\n=== ADDING RELATIONSHIPS ===")
    print("Level 1->2: Companies to Departments")
    synthesizer.add_foreign_key_relationship('departments', 'company_id', 'companies', 'company_id')

    print("Level 2->3: Departments to Employees")
    synthesizer.add_foreign_key_relationship('employees', 'dept_id', 'departments', 'dept_id')

    print("Level 3->4: Employees to Projects (as managers)")
    synthesizer.add_foreign_key_relationship('projects', 'manager_id', 'employees', 'employee_id')

    print("Level 4->5: Projects to Tasks")
    synthesizer.add_foreign_key_relationship('tasks', 'project_id', 'projects', 'project_id')

    print("Additional: Tasks assigned to Employees")
    synthesizer.add_foreign_key_relationship('tasks', 'assigned_to', 'employees', 'employee_id')

    # Many-to-Many relationships
    print("Many-to-Many: Employee Skills")
    synthesizer.add_foreign_key_relationship('employee_skills', 'employee_id', 'employees', 'employee_id')
    synthesizer.add_foreign_key_relationship('employee_skills', 'skill_id', 'skills', 'skill_id')

    print("Many-to-Many: Project Teams")
    synthesizer.add_foreign_key_relationship('project_teams', 'project_id', 'projects', 'project_id')
    synthesizer.add_foreign_key_relationship('project_teams', 'employee_id', 'employees', 'employee_id')

    # Recursive/Self-referencing relationships
    print("\nRecursive Relationships:")
    synthesizer.add_self_referencing_relationship('employees', 'reports_to', 'employee_id')
    synthesizer.add_self_referencing_relationship('tasks', 'depends_on_task', 'task_id')

    # Train the synthesizer
    print("\n" + "="*60)
    synthesizer.fit()

    # Generate synthetic data with custom scaling
    print("\n" + "="*60)
    # Generate different scales for different table types
    custom_rows = {
        'companies': 15,      # Fewer companies
        'departments': 80,    # Fewer departments
        'employees': 400,     # Scale down employees
        'projects': 150,      # Scale down projects
        'tasks': 800,         # Scale down tasks
        'skills': 40,         # Fewer skills
        'employee_skills': 1200,  # Scale down skill assignments
        'project_teams': 600      # Scale down team assignments
    }

    synthetic_data = synthesizer.generate_synthetic_data(num_rows=custom_rows)

    # Show sample results from each level
    print("\n" + "="*60)
    print("=== SAMPLE SYNTHETIC DATA BY LEVEL ===")

    level_tables = [
        ('Level 1 - Companies', 'companies'),
        ('Level 2 - Departments', 'departments'),
        ('Level 3 - Employees', 'employees'),
        ('Level 4 - Projects', 'projects'),
        ('Level 5 - Tasks', 'tasks')
    ]

    for level_name, table_name in level_tables:
        if table_name in synthetic_data:
            print(f"\n{level_name.upper()}:")
            df = synthetic_data[table_name]
            print(f"Shape: {df.shape}")
            print(df.head(2))

    # Show recursive relationship examples
    print("\n=== RECURSIVE RELATIONSHIPS EXAMPLES ===")

    # Employee hierarchy
    emp_df = synthetic_data['employees']
    reporting_relationships = emp_df[emp_df['reports_to'].notna()][['employee_id', 'reports_to']].head(5)
    if not reporting_relationships.empty:
        print("\nEmployee Reporting Structure:")
        print(reporting_relationships)

    # Task dependencies
    task_df = synthetic_data['tasks']
    task_dependencies = task_df[task_df['depends_on_task'].notna()][['task_id', 'depends_on_task']].head(5)
    if not task_dependencies.empty:
        print("\nTask Dependencies:")
        print(task_dependencies)

    # Validate all relationships
    print("\n" + "="*60)
    validation = synthesizer.validate_relationships()

    # Get comprehensive summary
    print("\n=== COMPREHENSIVE SUMMARY ===")
    summary = synthesizer.get_summary_statistics()
    print("Data Generation Ratios:")
    for table_name, comparison in summary['comparison'].items():
        ratio = comparison['row_ratio']
        print(f"  {table_name:15}: {ratio:.2f}x original size")

    print(f"\n✓ 5-Level Multi-table Recursive Synthesis Completed!")
    print(f"✓ Generated data for {len(synthetic_data)} interconnected tables")
    print(f"✓ Maintained {len(validation)} relationship constraints")

    # Show relationship validation summary
    valid_relationships = sum(1 for v in validation.values() if v)
    total_relationships = len(validation)
    print(f"✓ Relationship integrity: {valid_relationships}/{total_relationships} valid")

    if valid_relationships == total_relationships:
        print("🎉 All relationships maintained perfectly!")
    elif valid_relationships > total_relationships * 0.8:
        print("✨ Most relationships maintained well!")
    else:
        print("⚠️  Some relationship issues detected - check validation details above")


=== COMPREHENSIVE MULTI-TABLE RECURSIVE SYNTHETIC DATA GENERATION ===

✓ Comprehensive sample data created with 5 levels of relationships

=== DATA SUMMARY ===
companies: 20 rows, 5 columns
departments: 100 rows, 5 columns
employees: 500 rows, 8 columns
projects: 200 rows, 7 columns
tasks: 1000 rows, 9 columns
skills: 50 rows, 4 columns
employee_skills: 1500 rows, 6 columns
project_teams: 800 rows, 6 columns

=== ADDING TABLES ===

=== ADDING RELATIONSHIPS ===
Level 1->2: Companies to Departments
✗ Error adding relationship: Unknown table name ('company_id').
  Parent table columns: ['company_id', 'company_name', 'industry', 'founded_year', 'headquarters']
  Child table columns: ['dept_id', 'company_id', 'dept_name', 'budget', 'manager_name']
Level 2->3: Departments to Employees
✗ Error adding relationship: Unknown table name ('dept_id').
  Parent table columns: ['dept_id', 'company_id', 'dept_name', 'budget', 'manager_name']
  Child table columns: ['employee_id', 'dept_id', 'first_nam

Preprocess Tables: 100%|██████████| 8/8 [00:05<00:00,  1.35it/s]



Learning relationships:



Modeling Tables: 100%|██████████| 8/8 [00:03<00:00,  2.46it/s]


✓ Training completed successfully!

Generating synthetic data...
Converting num_rows to scale factor: 0.79
✓ Synthetic data generation completed!
  - projects: 150 rows, 7 columns
  - tasks: 788 rows, 9 columns
  - skills: 39 rows, 4 columns
  - departments: 79 rows, 5 columns
  - employees: 394 rows, 8 columns
  - employee_skills: 1182 rows, 6 columns
  - project_teams: 600 rows, 6 columns
  - companies: 15 rows, 5 columns

=== SAMPLE SYNTHETIC DATA BY LEVEL ===

LEVEL 1 - COMPANIES:
Shape: (15, 5)
   company_id company_name    industry  founded_year headquarters
0     4780961   Company_18  Healthcare          2003          NYC
1      834739   Company_18      Retail          2009          NYC

LEVEL 2 - DEPARTMENTS:
Shape: (79, 5)
    dept_id  company_id  dept_name      budget manager_name
0  16183965           7  Marketing  1658565.38   Manager_15
1  10585028          15  Marketing  1494141.55   Manager_35

LEVEL 3 - EMPLOYEES:
Shape: (394, 8)
   employee_id  dept_id first_name    la

In [6]:
task_sdf = synthetic_data['tasks']
companies_sdf = synthetic_data['companies']
departments_sdf = synthetic_data['departments']
employees_sdf = synthetic_data['employees']
projectssdf = synthetic_data['projects']
skills_sdf = synthetic_data['skills']
employee_skills_sdf = synthetic_data['employee_skills']
project_teams_sdf = synthetic_data['project_teams']

In [8]:
from sdv.metadata import Metadata

metadata = Metadata.detect_from_dataframes(
    data={
        "task_sdf": task_sdf,
        "companies_sdf": companies_sdf,
        "departments_sdf": departments_sdf,
        "employees_sdf": employees_sdf,
        "projectssdf": projectssdf,
        "skills_sdf": skills_sdf,
        "employee_skills_sdf": employee_skills_sdf,
        "project_teams_sdf": project_teams_sdf
    })

In [9]:
metadata

{
    "tables": {
        "task_sdf": {
            "primary_key": "task_id",
            "columns": {
                "task_id": {
                    "sdtype": "id"
                },
                "project_id": {
                    "sdtype": "id"
                },
                "assigned_to": {
                    "sdtype": "numerical"
                },
                "task_name": {
                    "sdtype": "categorical"
                },
                "description": {
                    "sdtype": "categorical"
                },
                "estimated_hours": {
                    "sdtype": "numerical"
                },
                "actual_hours": {
                    "sdtype": "numerical"
                },
                "status": {
                    "sdtype": "categorical"
                },
                "depends_on_task": {
                    "sdtype": "numerical"
                }
            }
        },
        "companies_sdf": {
           

In [2]:
import pandas as pd
import numpy as np
from sdv.metadata import Metadata
from sdv.multi_table import HMASynthesizer
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer
import networkx as nx
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

class RecursiveMultiTableSynthesizer:
    """
    A comprehensive multi-table synthetic data generator that handles
    recursive relationships and complex table dependencies using SDV.
    """

    def __init__(self, synthesizer_type='gaussian_copula'):
        """
        Initialize the recursive multi-table synthesizer.

        Args:
            synthesizer_type (str): Type of synthesizer ('gaussian_copula' or 'ctgan')
        """
        self.synthesizer_type = synthesizer_type
        self.metadata = Metadata()
        self.synthesizer = None
        self.table_dependencies = {}
        self.dependency_graph = nx.DiGraph()
        self.real_data = {}
        self.synthetic_data = {}
        self.relationships = []

    def add_table_data(self, table_name: str, data: pd.DataFrame,
                      primary_key: Optional[str] = None):
        """
        Add a table to the multi-table structure.

        Args:
            table_name (str): Name of the table
            data (pd.DataFrame): The actual data
            primary_key (str): Primary key column name
        """
        self.real_data[table_name] = data.copy()

        # Add table to metadata using the correct method
        self.metadata.detect_table_from_dataframe(table_name, data)

        # Set primary key if provided
        if primary_key and primary_key in data.columns:
            try:
                self.metadata.update_column(table_name, primary_key, sdtype='id')
                self.metadata.set_primary_key(table_name, primary_key)
                print(f"✓ Added table '{table_name}' with primary key '{primary_key}'")
            except Exception as e:
                print(f"Warning setting primary key for {table_name}: {e}")
        else:
            print(f"✓ Added table '{table_name}' (no primary key specified)")

    def add_foreign_key_relationship(self, child_table: str, child_column: str,
                                   parent_table: str, parent_column: str):
        """
        Add a foreign key relationship between tables.

        Args:
            child_table (str): Child table name
            child_column (str): Foreign key column in child table
            parent_table (str): Parent table name
            parent_column (str): Referenced column in parent table (should be primary key)
        """
        try:
            # Ensure both tables exist
            if child_table not in self.real_data:
                raise ValueError(f"Child table '{child_table}' not found")
            if parent_table not in self.real_data:
                raise ValueError(f"Parent table '{parent_table}' not found")

            # Verify the columns exist
            if child_column not in self.real_data[child_table].columns:
                raise ValueError(f"Column '{child_column}' not found in table '{child_table}'")
            if parent_column not in self.real_data[parent_table].columns:
                raise ValueError(f"Column '{parent_column}' not found in table '{parent_table}'")

            # Mark foreign key column as 'id' type
            self.metadata.update_column(child_table, child_column, sdtype='id')

            # Add relationship to metadata
            # Note: The parent_primary_key should be the actual primary key column name
            # which should match the parent_column parameter
            self.metadata.add_relationship(
                parent_table_name=parent_table,
                child_table_name=child_table,
                parent_primary_key=parent_column,
                child_foreign_key=child_column
            )

            # Track for dependency graph
            self.dependency_graph.add_edge(parent_table, child_table)

            # Store relationship info
            if child_table not in self.table_dependencies:
                self.table_dependencies[child_table] = {}
            self.table_dependencies[child_table][child_column] = parent_table

            self.relationships.append({
                'parent_table': parent_table,
                'parent_column': parent_column,
                'child_table': child_table,
                'child_column': child_column
            })

            print(f"✓ Added relationship: {parent_table}.{parent_column} -> {child_table}.{child_column}")

        except Exception as e:
            print(f"✗ Error adding relationship: {e}")
            # Debug info
            print(f"  Parent table columns: {list(self.real_data[parent_table].columns) if parent_table in self.real_data else 'Table not found'}")
            print(f"  Child table columns: {list(self.real_data[child_table].columns) if child_table in self.real_data else 'Table not found'}")

    def add_self_referencing_relationship(self, table_name: str,
                                        foreign_key_col: str,
                                        primary_key_col: str):
        """
        Add a self-referencing (recursive) relationship within a table.

        Args:
            table_name (str): Table name
            foreign_key_col (str): Self-referencing foreign key column
            primary_key_col (str): Primary key column being referenced
        """
        try:
            if table_name not in self.real_data:
                raise ValueError(f"Table '{table_name}' not found")

            # Update the foreign key column to be 'id' type
            self.metadata.update_column(table_name, foreign_key_col, sdtype='id')

            # Add self-referencing relationship
            self.metadata.add_relationship(
                parent_table_name=table_name,
                child_table_name=table_name,
                parent_primary_key=primary_key_col,
                child_foreign_key=foreign_key_col
            )

            # Add self-loop to dependency graph
            self.dependency_graph.add_edge(table_name, table_name)

            print(f"✓ Added self-referencing relationship in '{table_name}': {primary_key_col} -> {foreign_key_col}")

        except Exception as e:
            print(f"✗ Error adding self-referencing relationship: {e}")

    def _detect_recursive_relationships(self):
        """
        Detect and handle recursive relationships in the table structure.
        """
        recursive_tables = []

        # Find self-loops (recursive relationships)
        for table in self.dependency_graph.nodes():
            if self.dependency_graph.has_edge(table, table):
                recursive_tables.append(table)

        # Find cycles in dependency graph
        try:
            cycles = list(nx.simple_cycles(self.dependency_graph))
            if cycles:
                print(f"Detected circular dependencies: {cycles}")

        except Exception as e:
            print(f"Error detecting cycles: {e}")

        return recursive_tables

    def _get_synthesis_order(self) -> List[str]:
        """
        Determine the order for synthesizing tables based on dependencies.

        Returns:
            List[str]: Ordered list of table names
        """
        try:
            # Create a copy of the graph without self-loops for topological sort
            temp_graph = self.dependency_graph.copy()
            self_loops = list(nx.selfloop_edges(temp_graph))
            temp_graph.remove_edges_from(self_loops)

            # Topological sort to get dependency order
            synthesis_order = list(nx.topological_sort(temp_graph))

            # Add any tables not in the dependency graph
            all_tables = set(self.real_data.keys())
            ordered_tables = set(synthesis_order)
            remaining_tables = all_tables - ordered_tables
            synthesis_order.extend(list(remaining_tables))

            return synthesis_order

        except nx.NetworkXError:
            # If there are cycles, use a heuristic approach
            print("Circular dependencies detected. Using heuristic ordering.")
            return list(self.real_data.keys())

    def prepare_for_synthesis(self):
        """
        Prepare the data and metadata for synthesis.
        """
        # Create metadata if not already created
        if self.metadata is None:
            self.create_metadata()

        # Detect recursive relationships
        recursive_tables = self._detect_recursive_relationships()
        if recursive_tables:
            print(f"Detected recursive relationships in tables: {recursive_tables}")

        print("\nMetadata Summary:")
        try:
            # Print metadata info
            metadata_dict = self.metadata.to_dict()
            print(f"Tables: {list(metadata_dict.get('tables', {}).keys())}")
            relationships = metadata_dict.get('relationships', [])
            print(f"Relationships: {len(relationships)}")
            for rel in relationships:
                print(f"  - {rel}")
        except Exception as e:
            print(f"Could not display metadata: {e}")

        # Validate metadata
        try:
            self.metadata.validate()
            print("✓ Metadata validation successful!")
        except Exception as e:
            print(f"✗ Metadata validation error: {e}")
            raise

        # Initialize synthesizer
        try:
            print(f"Initializing {self.synthesizer_type} synthesizer...")
            self.synthesizer = HMASynthesizer(metadata=self.metadata)
            print("✓ Synthesizer initialized successfully!")

        except Exception as e:
            print(f"✗ Error initializing synthesizer: {e}")
            raise

    def fit(self):
        """
        Train the multi-table synthesizer on the real data.
        """
        if not self.synthesizer:
            self.prepare_for_synthesis()

        print("Training multi-table synthesizer...")
        try:
            self.synthesizer.fit(self.real_data)
            print("✓ Training completed successfully!")
        except Exception as e:
            print(f"✗ Training error: {e}")
            raise

    def generate_synthetic_data(self, num_rows: Optional[Dict[str, int]] = None,
                              scale: float = 1.0) -> Dict[str, pd.DataFrame]:
        """
        Generate synthetic data for all tables.

        Args:
            num_rows (dict): Specific number of rows per table
            scale (float): Scale factor for data generation

        Returns:
            Dict[str, pd.DataFrame]: Synthetic data for each table
        """
        if not self.synthesizer:
            raise ValueError("Synthesizer not trained. Call fit() first.")

        print(f"Generating synthetic data...")

        try:
            # The HMASynthesizer sample method only accepts scale parameter
            if num_rows:
                # Calculate average scale from num_rows
                total_original = sum(len(df) for df in self.real_data.values())
                total_requested = sum(num_rows.get(table, len(df)) for table, df in self.real_data.items())
                calculated_scale = total_requested / total_original if total_original > 0 else 1.0
                print(f"Converting num_rows to scale factor: {calculated_scale:.2f}")
                self.synthetic_data = self.synthesizer.sample(scale=calculated_scale)

                # Then trim tables to requested sizes
                for table_name, requested_rows in num_rows.items():
                    if table_name in self.synthetic_data:
                        current_rows = len(self.synthetic_data[table_name])
                        if current_rows > requested_rows:
                            self.synthetic_data[table_name] = self.synthetic_data[table_name].head(requested_rows)

            else:
                self.synthetic_data = self.synthesizer.sample(scale=scale)

            print("✓ Synthetic data generation completed!")

            # Print summary
            for table_name, df in self.synthetic_data.items():
                print(f"  - {table_name}: {df.shape[0]} rows, {df.shape[1]} columns")

            return self.synthetic_data

        except Exception as e:
            print(f"✗ Generation error: {e}")
            # Try with default parameters
            try:
                print("Retrying with scale=1.0...")
                self.synthetic_data = self.synthesizer.sample(scale=1.0)
                print("✓ Fallback generation successful!")
                return self.synthetic_data
            except Exception as e2:
                print(f"✗ Fallback also failed: {e2}")
                raise

    def validate_relationships(self) -> Dict[str, bool]:
        """
        Validate that relationships are maintained in synthetic data.

        Returns:
            Dict[str, bool]: Validation results for each relationship
        """
        validation_results = {}

        if not self.synthetic_data:
            print("No synthetic data to validate.")
            return validation_results

        print("\nValidating relationships...")

        for rel in self.relationships:
            parent_table = rel['parent_table']
            parent_col = rel['parent_column']
            child_table = rel['child_table']
            child_col = rel['child_column']

            if parent_table not in self.synthetic_data or child_table not in self.synthetic_data:
                continue

            parent_df = self.synthetic_data[parent_table]
            child_df = self.synthetic_data[child_table]

            if parent_col in parent_df.columns and child_col in child_df.columns:
                # Get valid parent values (excluding None/NaN)
                parent_values = set(parent_df[parent_col].dropna().values)
                child_fk_values = set(child_df[child_col].dropna().values)

                # Check referential integrity
                invalid_refs = child_fk_values - parent_values
                is_valid = len(invalid_refs) == 0

                relationship_name = f"{parent_table}.{parent_col} -> {child_table}.{child_col}"
                validation_results[relationship_name] = is_valid

                status = "✓ Valid" if is_valid else f"✗ Invalid ({len(invalid_refs)} bad refs)"
                print(f"  {relationship_name}: {status}")

        return validation_results

    def get_summary_statistics(self) -> Dict:
        """
        Get summary statistics comparing real and synthetic data.
        """
        summary = {
            'real_data': {},
            'synthetic_data': {},
            'comparison': {}
        }

        for table_name in self.real_data.keys():
            real_df = self.real_data[table_name]
            summary['real_data'][table_name] = {
                'shape': real_df.shape,
                'numeric_columns': len(real_df.select_dtypes(include=[np.number]).columns),
                'categorical_columns': len(real_df.select_dtypes(include=['object']).columns)
            }

            if table_name in self.synthetic_data:
                synthetic_df = self.synthetic_data[table_name]
                summary['synthetic_data'][table_name] = {
                    'shape': synthetic_df.shape,
                    'numeric_columns': len(synthetic_df.select_dtypes(include=[np.number]).columns),
                    'categorical_columns': len(synthetic_df.select_dtypes(include=['object']).columns)
                }

                # Compare basic statistics
                summary['comparison'][table_name] = {
                    'row_ratio': synthetic_df.shape[0] / real_df.shape[0],
                    'column_match': synthetic_df.shape[1] == real_df.shape[1]
                }

        return summary

# Example usage and testing
def create_comprehensive_sample_data():
    """
    Create comprehensive sample data with 5 levels of relationships and recursive structures.
    """
    np.random.seed(42)  # For reproducible results

    # LEVEL 1: Companies (Root table)
    companies = pd.DataFrame({
        'company_id': range(1, 21),  # 20 companies
        'company_name': [f'Company_{i}' for i in range(1, 21)],
        'industry': np.random.choice(['Tech', 'Finance', 'Healthcare', 'Retail'], 20),
        'founded_year': np.random.randint(1990, 2020, 20),
        'headquarters': np.random.choice(['NYC', 'SF', 'LA', 'Chicago', 'Boston'], 20)
    })

    # LEVEL 2: Departments (depends on companies)
    departments = pd.DataFrame({
        'dept_id': range(1, 101),  # 100 departments
        'company_id': np.random.choice(companies['company_id'], 100),
        'dept_name': np.random.choice(['Engineering', 'Sales', 'Marketing', 'HR', 'Finance'], 100),
        'budget': np.round(np.random.uniform(50000, 2000000, 100), 2),
        'manager_name': [f'Manager_{i}' for i in range(1, 101)]
    })

    # LEVEL 3: Employees (depends on departments)
    employees = pd.DataFrame({
        'employee_id': range(1, 501),  # 500 employees
        'dept_id': np.random.choice(departments['dept_id'], 500),
        'first_name': [f'FirstName_{i}' for i in range(1, 501)],
        'last_name': [f'LastName_{i}' for i in range(1, 501)],
        'salary': np.round(np.random.uniform(40000, 150000, 500), 2),
        'hire_date': pd.date_range('2020-01-01', periods=500, freq='D')[:500],
        'position': np.random.choice(['Junior', 'Senior', 'Lead', 'Manager'], 500),
        # Self-referencing: some employees report to other employees
        'reports_to': [None] * 400 + list(np.random.choice(range(1, 401), 100))
    })

    # LEVEL 4: Projects (depends on employees as project managers)
    projects = pd.DataFrame({
        'project_id': range(1, 201),  # 200 projects
        'manager_id': np.random.choice(employees['employee_id'], 200),
        'project_name': [f'Project_{i}' for i in range(1, 201)],
        'start_date': pd.date_range('2023-01-01', periods=200, freq='3D')[:200],
        'budget': np.round(np.random.uniform(10000, 500000, 200), 2),
        'status': np.random.choice(['Planning', 'In Progress', 'Completed', 'On Hold'], 200),
        'priority': np.random.choice(['Low', 'Medium', 'High', 'Critical'], 200)
    })

    # LEVEL 5: Tasks (depends on projects)
    tasks = pd.DataFrame({
        'task_id': range(1, 1001),  # 1000 tasks
        'project_id': np.random.choice(projects['project_id'], 1000),
        'assigned_to': np.random.choice(employees['employee_id'], 1000),
        'task_name': [f'Task_{i}' for i in range(1, 1001)],
        'description': [f'Task description {i}' for i in range(1, 1001)],
        'estimated_hours': np.random.randint(1, 40, 1000),
        'actual_hours': np.random.randint(1, 50, 1000),
        'status': np.random.choice(['Todo', 'In Progress', 'Review', 'Done'], 1000),
        # Self-referencing: some tasks depend on other tasks
        'depends_on_task': [None] * 800 + list(np.random.choice(range(1, 801), 200))
    })

    # ADDITIONAL: Employee Skills (Many-to-Many relationship simulation)
    skills = pd.DataFrame({
        'skill_id': range(1, 51),  # 50 different skills
        'skill_name': [f'Skill_{i}' for i in range(1, 51)],
        'category': np.random.choice(['Technical', 'Management', 'Communication', 'Domain'], 50),
        'difficulty_level': np.random.choice(['Beginner', 'Intermediate', 'Advanced', 'Expert'], 50)
    })

    # Employee-Skills mapping (represents many-to-many)
    employee_skills = pd.DataFrame({
        'emp_skill_id': range(1, 1501),  # 1500 skill assignments
        'employee_id': np.random.choice(employees['employee_id'], 1500),
        'skill_id': np.random.choice(skills['skill_id'], 1500),
        'proficiency': np.random.choice(['Beginner', 'Intermediate', 'Advanced', 'Expert'], 1500),
        'years_experience': np.random.randint(1, 10, 1500),
        'certified': np.random.choice([True, False], 1500)
    })

    # Project Teams (Many-to-Many: Projects to Employees)
    project_teams = pd.DataFrame({
        'team_id': range(1, 801),  # 800 team assignments
        'project_id': np.random.choice(projects['project_id'], 800),
        'employee_id': np.random.choice(employees['employee_id'], 800),
        'role': np.random.choice(['Developer', 'Tester', 'Analyst', 'Designer'], 800),
        'allocation_percent': np.random.randint(25, 100, 800),
        'start_date': pd.date_range('2023-01-01', periods=800, freq='2D')[:800]
    })

    return {
        'companies': companies,
        'departments': departments,
        'employees': employees,
        'projects': projects,
        'tasks': tasks,
        'skills': skills,
        'employee_skills': employee_skills,
        'project_teams': project_teams
    }

In [3]:
# Demonstration
if __name__ == "__main__":
    print("=== COMPREHENSIVE MULTI-TABLE RECURSIVE SYNTHETIC DATA GENERATION ===\n")

    # Create comprehensive sample data with 5 levels of relationships
    data_tables = create_comprehensive_sample_data()
    print("✓ Comprehensive sample data created with 5 levels of relationships")

    # Show data summary
    print("\n=== DATA SUMMARY ===")
    for table_name, df in data_tables.items():
        print(f"{table_name}: {df.shape[0]} rows, {df.shape[1]} columns")

    # Initialize synthesizer
    synthesizer = RecursiveMultiTableSynthesizer()

    # Add tables in dependency order (parents first)
    print("\n=== ADDING TABLES ===")
    synthesizer.add_table_data('companies', data_tables['companies'], primary_key='company_id')
    synthesizer.add_table_data('departments', data_tables['departments'], primary_key='dept_id')
    synthesizer.add_table_data('employees', data_tables['employees'], primary_key='employee_id')
    synthesizer.add_table_data('projects', data_tables['projects'], primary_key='project_id')
    synthesizer.add_table_data('tasks', data_tables['tasks'], primary_key='task_id')
    synthesizer.add_table_data('skills', data_tables['skills'], primary_key='skill_id')
    synthesizer.add_table_data('employee_skills', data_tables['employee_skills'], primary_key='emp_skill_id')
    synthesizer.add_table_data('project_teams', data_tables['project_teams'], primary_key='team_id')

    # Add relationships - LEVEL 1 to 5 hierarchy
    print("\n=== ADDING RELATIONSHIPS ===")
    print("Level 1->2: Companies to Departments")
    synthesizer.add_foreign_key_relationship('departments', 'company_id', 'companies', 'company_id')

    print("Level 2->3: Departments to Employees")
    synthesizer.add_foreign_key_relationship('employees', 'dept_id', 'departments', 'dept_id')

    print("Level 3->4: Employees to Projects (as managers)")
    synthesizer.add_foreign_key_relationship('projects', 'manager_id', 'employees', 'employee_id')

    print("Level 4->5: Projects to Tasks")
    synthesizer.add_foreign_key_relationship('tasks', 'project_id', 'projects', 'project_id')

    print("Additional: Tasks assigned to Employees")
    synthesizer.add_foreign_key_relationship('tasks', 'assigned_to', 'employees', 'employee_id')

    # Many-to-Many relationships
    print("Many-to-Many: Employee Skills")
    synthesizer.add_foreign_key_relationship('employee_skills', 'employee_id', 'employees', 'employee_id')
    synthesizer.add_foreign_key_relationship('employee_skills', 'skill_id', 'skills', 'skill_id')

    print("Many-to-Many: Project Teams")
    synthesizer.add_foreign_key_relationship('project_teams', 'project_id', 'projects', 'project_id')
    synthesizer.add_foreign_key_relationship('project_teams', 'employee_id', 'employees', 'employee_id')

    # Recursive/Self-referencing relationships
    print("\nRecursive Relationships:")
    synthesizer.add_self_referencing_relationship('employees', 'reports_to', 'employee_id')
    synthesizer.add_self_referencing_relationship('tasks', 'depends_on_task', 'task_id')

    # Train the synthesizer
    print("\n" + "="*60)
    synthesizer.fit()

    # Generate synthetic data with custom scaling
    print("\n" + "="*60)
    # Generate different scales for different table types
    custom_rows = {
        'companies': 15,      # Fewer companies
        'departments': 80,    # Fewer departments
        'employees': 400,     # Scale down employees
        'projects': 150,      # Scale down projects
        'tasks': 800,         # Scale down tasks
        'skills': 40,         # Fewer skills
        'employee_skills': 1200,  # Scale down skill assignments
        'project_teams': 600      # Scale down team assignments
    }

    synthetic_data = synthesizer.generate_synthetic_data(num_rows=custom_rows)

    # Show sample results from each level
    print("\n" + "="*60)
    print("=== SAMPLE SYNTHETIC DATA BY LEVEL ===")

    level_tables = [
        ('Level 1 - Companies', 'companies'),
        ('Level 2 - Departments', 'departments'),
        ('Level 3 - Employees', 'employees'),
        ('Level 4 - Projects', 'projects'),
        ('Level 5 - Tasks', 'tasks')
    ]

    for level_name, table_name in level_tables:
        if table_name in synthetic_data:
            print(f"\n{level_name.upper()}:")
            df = synthetic_data[table_name]
            print(f"Shape: {df.shape}")
            print(df.head(2))

    # Show recursive relationship examples
    print("\n=== RECURSIVE RELATIONSHIPS EXAMPLES ===")

    # Employee hierarchy
    emp_df = synthetic_data['employees']
    reporting_relationships = emp_df[emp_df['reports_to'].notna()][['employee_id', 'reports_to']].head(5)
    if not reporting_relationships.empty:
        print("\nEmployee Reporting Structure:")
        print(reporting_relationships)

    # Task dependencies
    task_df = synthetic_data['tasks']
    task_dependencies = task_df[task_df['depends_on_task'].notna()][['task_id', 'depends_on_task']].head(5)
    if not task_dependencies.empty:
        print("\nTask Dependencies:")
        print(task_dependencies)

    # Validate all relationships
    print("\n" + "="*60)
    validation = synthesizer.validate_relationships()

    # Get comprehensive summary
    print("\n=== COMPREHENSIVE SUMMARY ===")
    summary = synthesizer.get_summary_statistics()
    print("Data Generation Ratios:")
    for table_name, comparison in summary['comparison'].items():
        ratio = comparison['row_ratio']
        print(f"  {table_name:15}: {ratio:.2f}x original size")

    print(f"\n✓ 5-Level Multi-table Recursive Synthesis Completed!")
    print(f"✓ Generated data for {len(synthetic_data)} interconnected tables")
    print(f"✓ Maintained {len(validation)} relationship constraints")

    # Show relationship validation summary
    valid_relationships = sum(1 for v in validation.values() if v)
    total_relationships = len(validation)
    print(f"✓ Relationship integrity: {valid_relationships}/{total_relationships} valid")

    if valid_relationships == total_relationships:
        print("🎉 All relationships maintained perfectly!")
    elif valid_relationships > total_relationships * 0.8:
        print("✨ Most relationships maintained well!")
    else:
        print("⚠️  Some relationship issues detected - check validation details above")

=== COMPREHENSIVE MULTI-TABLE RECURSIVE SYNTHETIC DATA GENERATION ===

✓ Comprehensive sample data created with 5 levels of relationships

=== DATA SUMMARY ===
companies: 20 rows, 5 columns
departments: 100 rows, 5 columns
employees: 500 rows, 8 columns
projects: 200 rows, 7 columns
tasks: 1000 rows, 9 columns
skills: 50 rows, 4 columns
employee_skills: 1500 rows, 6 columns
project_teams: 800 rows, 6 columns

=== ADDING TABLES ===

=== ADDING RELATIONSHIPS ===
Level 1->2: Companies to Departments
✗ Error adding relationship: Unknown table name ('company_id').
  Parent table columns: ['company_id', 'company_name', 'industry', 'founded_year', 'headquarters']
  Child table columns: ['dept_id', 'company_id', 'dept_name', 'budget', 'manager_name']
Level 2->3: Departments to Employees
✗ Error adding relationship: Unknown table name ('dept_id').
  Parent table columns: ['dept_id', 'company_id', 'dept_name', 'budget', 'manager_name']
  Child table columns: ['employee_id', 'dept_id', 'first_nam

Preprocess Tables: 100%|██████████| 8/8 [00:02<00:00,  3.16it/s]



Learning relationships:



Modeling Tables: 100%|██████████| 8/8 [00:03<00:00,  2.53it/s]


✓ Training completed successfully!

Generating synthetic data...
Converting num_rows to scale factor: 0.79
✓ Synthetic data generation completed!
  - employee_skills: 1182 rows, 6 columns
  - skills: 39 rows, 4 columns
  - companies: 15 rows, 5 columns
  - project_teams: 600 rows, 6 columns
  - projects: 150 rows, 7 columns
  - departments: 79 rows, 5 columns
  - employees: 394 rows, 8 columns
  - tasks: 788 rows, 9 columns

=== SAMPLE SYNTHETIC DATA BY LEVEL ===

LEVEL 1 - COMPANIES:
Shape: (15, 5)
   company_id company_name    industry  founded_year headquarters
0     4780961   Company_18  Healthcare          2003          NYC
1      834739   Company_18      Retail          2009          NYC

LEVEL 2 - DEPARTMENTS:
Shape: (79, 5)
    dept_id  company_id  dept_name      budget manager_name
0  16183965           7  Marketing  1658565.38   Manager_15
1  10585028          15  Marketing  1494141.55   Manager_35

LEVEL 3 - EMPLOYEES:
Shape: (394, 8)
   employee_id  dept_id first_name    la

In [12]:
synth_companies = synthetic_data['companies']
synth_departments = synthetic_data['departments']
synth_employees = synthetic_data['employees']
synth_projects = synthetic_data['projects']
synth_tasks = synthetic_data['tasks']
synth_skills = synthetic_data['skills']
synth_employee_skills = synthetic_data['employee_skills']
synth_project_teams = synthetic_data['project_teams']

In [11]:
synth_companies = synthetic_data['companies']

(15, 5)

In [15]:
from sdv.metadata import Metadata

synth_metadata = Metadata.detect_from_dataframes(
    data={
      "synth_companies": synth_companies,
      "synth_departments": synth_departments,
      "synth_employees": synth_employees,
      "synth_projects": synth_projects,
      "synth_tasks": synth_tasks,
      "synth_skills": synth_skills,
      "synth_employee_skills": synth_employee_skills,
      "synth_project_teams": synth_project_teams,
    })

In [16]:
synth_metadata

{
    "tables": {
        "synth_companies": {
            "columns": {
                "company_id": {
                    "sdtype": "id"
                },
                "company_name": {
                    "sdtype": "categorical"
                },
                "industry": {
                    "sdtype": "categorical"
                },
                "founded_year": {
                    "sdtype": "numerical"
                },
                "headquarters": {
                    "sdtype": "categorical"
                }
            },
            "primary_key": "company_id"
        },
        "synth_departments": {
            "columns": {
                "dept_id": {
                    "sdtype": "id"
                },
                "company_id": {
                    "sdtype": "id"
                },
                "dept_name": {
                    "sdtype": "categorical"
                },
                "budget": {
                    "sdtype": "numerical"
      

In [7]:
 data_tables = create_comprehensive_sample_data()

In [8]:
metadata1 = Metadata.detect_from_dataframes(data=data_tables)

In [9]:
metadata1

{
    "tables": {
        "companies": {
            "columns": {
                "company_id": {
                    "sdtype": "id"
                },
                "company_name": {
                    "sdtype": "categorical"
                },
                "industry": {
                    "sdtype": "categorical"
                },
                "founded_year": {
                    "sdtype": "numerical"
                },
                "headquarters": {
                    "sdtype": "categorical"
                }
            },
            "primary_key": "company_id"
        },
        "departments": {
            "columns": {
                "dept_id": {
                    "sdtype": "id"
                },
                "company_id": {
                    "sdtype": "id"
                },
                "dept_name": {
                    "sdtype": "categorical"
                },
                "budget": {
                    "sdtype": "numerical"
                },

In [17]:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>RecursiveMultiTableSynthesizer Architecture</title>
    <style>
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            margin: 0;
            padding: 20px;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            min-height: 100vh;
        }

        .container {
            max-width: 1400px;
            margin: 0 auto;
            background: rgba(255, 255, 255, 0.95);
            border-radius: 15px;
            padding: 30px;
            box-shadow: 0 20px 40px rgba(0, 0, 0, 0.1);
        }

        h1 {
            text-align: center;
            color: #2c3e50;
            margin-bottom: 30px;
            font-size: 2.5em;
            background: linear-gradient(135deg, #667eea, #764ba2);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            background-clip: text;
        }

        .diagram {
            display: flex;
            flex-direction: column;
            gap: 30px;
            align-items: center;
        }

        .phase {
            width: 100%;
            max-width: 1200px;
            background: #f8f9fa;
            border-radius: 12px;
            padding: 25px;
            border: 2px solid #e9ecef;
            position: relative;
            transition: all 0.3s ease;
        }

        .phase:hover {
            transform: translateY(-5px);
            box-shadow: 0 15px 30px rgba(0, 0, 0, 0.1);
        }

        .phase-title {
            font-size: 1.4em;
            font-weight: bold;
            color: #2c3e50;
            margin-bottom: 20px;
            text-align: center;
            background: linear-gradient(135deg, #667eea, #764ba2);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            background-clip: text;
        }

        .components {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
            gap: 20px;
        }

        .component {
            background: white;
            border-radius: 8px;
            padding: 20px;
            border-left: 4px solid;
            transition: all 0.3s ease;
            position: relative;
            overflow: hidden;
        }

        .component::before {
            content: '';
            position: absolute;
            top: 0;
            left: 0;
            width: 100%;
            height: 3px;
            background: linear-gradient(90deg, transparent, rgba(102, 126, 234, 0.3), transparent);
            animation: shimmer 2s infinite;
        }

        @keyframes shimmer {
            0% { transform: translateX(-100%); }
            100% { transform: translateX(100%); }
        }

        .component.input { border-left-color: #3498db; }
        .component.analysis { border-left-color: #9b59b6; }
        .component.modeling { border-left-color: #e67e22; }
        .component.synthesis { border-left-color: #2ecc71; }
        .component.output { border-left-color: #e74c3c; }

        .component:hover {
            transform: translateY(-3px);
            box-shadow: 0 8px 25px rgba(0, 0, 0, 0.1);
        }

        .component-title {
            font-weight: bold;
            color: #2c3e50;
            margin-bottom: 10px;
            font-size: 1.1em;
        }

        .component-desc {
            color: #6c757d;
            font-size: 0.9em;
            line-height: 1.4;
        }

        .arrow {
            font-size: 2em;
            color: #667eea;
            text-align: center;
            margin: 10px 0;
            animation: bounce 2s infinite;
        }

        @keyframes bounce {
            0%, 20%, 50%, 80%, 100% { transform: translateY(0); }
            40% { transform: translateY(-10px); }
            60% { transform: translateY(-5px); }
        }

        .flow-connections {
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin: 20px 0;
            flex-wrap: wrap;
            gap: 10px;
        }

        .connection-line {
            flex: 1;
            height: 3px;
            background: linear-gradient(90deg, #667eea, #764ba2);
            border-radius: 2px;
            margin: 0 10px;
            animation: flow 3s infinite;
        }

        @keyframes flow {
            0% { background-position: 0% 50%; }
            50% { background-position: 100% 50%; }
            100% { background-position: 0% 50%; }
        }

        .recursive-indicator {
            position: absolute;
            top: 10px;
            right: 15px;
            background: linear-gradient(135deg, #667eea, #764ba2);
            color: white;
            padding: 5px 10px;
            border-radius: 15px;
            font-size: 0.8em;
            font-weight: bold;
        }

        .legend {
            display: flex;
            justify-content: center;
            gap: 30px;
            margin-top: 30px;
            flex-wrap: wrap;
        }

        .legend-item {
            display: flex;
            align-items: center;
            gap: 10px;
        }

        .legend-color {
            width: 20px;
            height: 4px;
            border-radius: 2px;
        }

        .data-flow {
            background: white;
            border-radius: 12px;
            padding: 20px;
            margin-top: 20px;
            border: 2px dashed #667eea;
        }

        .flow-title {
            font-weight: bold;
            color: #2c3e50;
            margin-bottom: 15px;
            text-align: center;
        }

        .flow-steps {
            display: flex;
            justify-content: space-between;
            align-items: center;
            flex-wrap: wrap;
            gap: 15px;
        }

        .flow-step {
            background: linear-gradient(135deg, #667eea, #764ba2);
            color: white;
            padding: 10px 15px;
            border-radius: 20px;
            font-size: 0.9em;
            font-weight: 500;
            min-width: 120px;
            text-align: center;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>RecursiveMultiTableSynthesizer Architecture</h1>

        <div class="diagram">
            <!-- Phase 1: Input & Schema Analysis -->
            <div class="phase">
                <div class="phase-title">Phase 1: Input & Schema Analysis</div>
                <div class="components">
                    <div class="component input">
                        <div class="component-title">Original Database</div>
                        <div class="component-desc">Multi-table relational database with complex relationships, constraints, and business logic</div>
                    </div>
                    <div class="component analysis">
                        <div class="component-title">Schema Parser</div>
                        <div class="component-desc">Extracts table structures, data types, primary/foreign keys, and constraints</div>
                    </div>
                    <div class="component analysis">
                        <div class="component-title">Dependency Mapper</div>
                        <div class="component-desc">Creates dependency graph showing table relationships and identifies cycles</div>
                    </div>
                </div>
            </div>

            <div class="arrow">⬇</div>

            <!-- Phase 2: Data Profiling & Pattern Learning -->
            <div class="phase">
                <div class="phase-title">Phase 2: Data Profiling & Pattern Learning</div>
                <div class="components">
                    <div class="component modeling">
                        <div class="component-title">Statistical Profiler</div>
                        <div class="component-desc">Analyzes data distributions, patterns, nullability, and value ranges per column</div>
                    </div>
                    <div class="component modeling">
                        <div class="component-title">Relationship Analyzer</div>
                        <div class="component-desc">Models correlations within and across tables, identifies functional dependencies</div>
                    </div>
                    <div class="component modeling">
                        <div class="component-title">Constraint Validator</div>
                        <div class="component-desc">Catalogs all integrity constraints, business rules, and validation logic</div>
                    </div>
                </div>
            </div>

            <div class="arrow">⬇</div>

            <!-- Phase 3: Model Training & Preparation -->
            <div class="phase">
                <div class="phase-title">Phase 3: Model Training & Preparation</div>
                <div class="components">
                    <div class="component synthesis">
                        <div class="component-title">Topological Sorter</div>
                        <div class="component-desc">Orders tables for synthesis based on dependencies, handles circular references</div>
                    </div>
                    <div class="component synthesis">
                        <div class="component-title">Multi-Model Generator</div>
                        <div class="component-desc">Trains individual models per table (GANs, VAEs, or statistical models)</div>
                    </div>
                    <div class="component synthesis">
                        <div class="component-title">Cross-Table Coordinator</div>
                        <div class="component-desc">Manages inter-table relationships and ensures referential integrity</div>
                    </div>
                </div>
            </div>

            <div class="arrow">⬇</div>

            <!-- Phase 4: Recursive Synthesis Engine -->
            <div class="phase">
                <div class="recursive-indicator">RECURSIVE</div>
                <div class="phase-title">Phase 4: Recursive Synthesis Engine</div>
                <div class="data-flow">
                    <div class="flow-title">Synthesis Flow</div>
                    <div class="flow-steps">
                        <div class="flow-step">Root Tables</div>
                        <div class="flow-step">Level 1 Deps</div>
                        <div class="flow-step">Level 2 Deps</div>
                        <div class="flow-step">Validation</div>
                        <div class="flow-step">Refinement</div>
                    </div>
                </div>
                <div class="components">
                    <div class="component synthesis">
                        <div class="component-title">Synthesis Orchestrator</div>
                        <div class="component-desc">Manages recursive generation process, maintains global state and progress tracking</div>
                    </div>
                    <div class="component synthesis">
                        <div class="component-title">Constraint Enforcer</div>
                        <div class="component-desc">Ensures all generated data satisfies constraints and business rules in real-time</div>
                    </div>
                    <div class="component synthesis">
                        <div class="component-title">Quality Controller</div>
                        <div class="component-desc">Monitors data quality, triggers regeneration when constraints are violated</div>
                    </div>
                </div>
            </div>

            <div class="arrow">⬇</div>

            <!-- Phase 5: Output & Validation -->
            <div class="phase">
                <div class="phase-title">Phase 5: Output & Validation</div>
                <div class="components">
                    <div class="component output">
                        <div class="component-title">Synthetic Database</div>
                        <div class="component-desc">Complete multi-table synthetic dataset maintaining all original relationships</div>
                    </div>
                    <div class="component output">
                        <div class="component-title">Quality Metrics</div>
                        <div class="component-desc">Statistical comparison reports, privacy analysis, and utility measurements</div>
                    </div>
                    <div class="component output">
                        <div class="component-title">Export Handler</div>
                        <div class="component-desc">Formats output for various targets (SQL dumps, CSV files, API endpoints)</div>
                    </div>
                </div>
            </div>
        </div>

        <div class="legend">
            <div class="legend-item">
                <div class="legend-color" style="background-color: #3498db;"></div>
                <span>Input Layer</span>
            </div>
            <div class="legend-item">
                <div class="legend-color" style="background-color: #9b59b6;"></div>
                <span>Analysis Layer</span>
            </div>
            <div class="legend-item">
                <div class="legend-color" style="background-color: #e67e22;"></div>
                <span>Modeling Layer</span>
            </div>
            <div class="legend-item">
                <div class="legend-color" style="background-color: #2ecc71;"></div>
                <span>Synthesis Layer</span>
            </div>
            <div class="legend-item">
                <div class="legend-color" style="background-color: #e74c3c;"></div>
                <span>Output Layer</span>
            </div>
        </div>
    </div>
</body>
</html>

SyntaxError: invalid decimal literal (ipython-input-959985991.py, line 11)