In [None]:
# Install required packages
!pip install sdv pandas numpy networkx matplotlib seaborn

# Verify installations
import pandas as pd
import numpy as np
import networkx as nx
print("✓ All packages installed successfully!")

## Cell 1: Import Libraries and Setup

In [3]:
import pandas as pd
import numpy as np
from sdv.metadata import Metadata
from sdv.multi_table import HMASynthesizer
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer
import networkx as nx
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
print("✓ All libraries imported successfully!")

✓ All libraries imported successfully!


## Cell 2: Class Definition - Part 1 (Initialization)

In [4]:
class RecursiveMultiTableSynthesizer:
    """
    A comprehensive multi-table synthetic data generator that handles
    recursive relationships and complex table dependencies using SDV.
    """

    def __init__(self, synthesizer_type='gaussian_copula'):
        """
        Initialize the recursive multi-table synthesizer.

        Args:
            synthesizer_type (str): Type of synthesizer ('gaussian_copula' or 'ctgan')
        """
        self.synthesizer_type = synthesizer_type
        self.metadata = Metadata()
        self.synthesizer = None
        self.table_dependencies = {}
        self.dependency_graph = nx.DiGraph()
        self.real_data = {}
        self.synthetic_data = {}
        self.relationships = []

## Cell 3: Class Definition - Part 2 (Add Table Data Method)

In [None]:
def add_table_data(self, table_name: str, data: pd.DataFrame,
                  primary_key: Optional[str] = None):
    """
    Add a table to the multi-table structure.

    Args:
        table_name (str): Name of the table
        data (pd.DataFrame): The actual data
        primary_key (str): Primary key column name
    """
    self.real_data[table_name] = data.copy()

    # Add table to metadata using the correct method
    self.metadata.detect_table_from_dataframe(table_name, data)

    # Set primary key if provided
    if primary_key and primary_key in data.columns:
        try:
            self.metadata.update_column(table_name, primary_key, sdtype='id')
            self.metadata.set_primary_key(table_name, primary_key)
            print(f"✓ Added table '{table_name}' with primary key '{primary_key}'")
        except Exception as e:
            print(f"Warning setting primary key for {table_name}: {e}")
    else:
        print(f"✓ Added table '{table_name}' (no primary key specified)")

# Add method to class
RecursiveMultiTableSynthesizer.add_table_data = add_table_data
print("✓ add_table_data method added to class")

✓ add_table_data method added to class


In [13]:
from sdv.metadata import Metadata
import pandas as pd
from typing import Dict
from sdv.metadata import Metadata

def add_tables(self, tables: Dict[str, pd.DataFrame]) -> None:
    """
    Add multiple tables and generate SDV metadata in one go.

    Args:
        tables (Dict[str, pd.DataFrame]):
            Dictionary of {table_name: DataFrame}.

    Side effects:
        - Stores all tables into self.real_data
        - Creates Metadata object with Metadata.detect_from_dataframes
    """
    if not isinstance(tables, dict) or not all(isinstance(v, pd.DataFrame) for v in tables.values()):
        raise TypeError("tables must be a dict of {str: pd.DataFrame}")

    # Store all real data
    self.real_data = {name: df.copy() for name, df in tables.items()}

    try:
        # Detect metadata for all tables at once
        self.metadata = Metadata.detect_from_dataframes(self.real_data)
        print(f"✓ Added {len(self.real_data)} tables and generated metadata")
    except Exception as e:
        print(f"⚠️ Error detecting metadata: {e}")

# Add method to class
RecursiveMultiTableSynthesizer.add_tables = add_tables
print("✓ add_table_data method added to class")


✓ add_table_data method added to class


## Cell 4: Class Definition - Part 3 (Foreign Key Relationship Method)

In [None]:
def add_foreign_key_relationship(self, child_table: str, child_column: str,
                               parent_table: str, parent_column: str):
    """
    Add a foreign key relationship between tables.

    Args:
        child_table (str): Child table name
        child_column (str): Foreign key column in child table
        parent_table (str): Parent table name
        parent_column (str): Referenced column in parent table (should be primary key)
    """
    try:
        # Ensure both tables exist
        if child_table not in self.real_data:
            raise ValueError(f"Child table '{child_table}' not found")
        if parent_table not in self.real_data:
            raise ValueError(f"Parent table '{parent_table}' not found")

        # Verify the columns exist
        if child_column not in self.real_data[child_table].columns:
            raise ValueError(f"Column '{child_column}' not found in table '{child_table}'")
        if parent_column not in self.real_data[parent_table].columns:
            raise ValueError(f"Column '{parent_column}' not found in table '{parent_table}'")

        # Mark foreign key column as 'id' type
        self.metadata.update_column(child_table, child_column, sdtype='id')

        # Add relationship to metadata
        # Note: The parent_primary_key should be the actual primary key column name
        # which should match the parent_column parameter
        self.metadata.add_relationship(
            parent_table_name=parent_table,
            child_table_name=child_table,
            parent_primary_key=parent_column,
            child_foreign_key=child_column
        )

        # Track for dependency graph
        self.dependency_graph.add_edge(parent_table, child_table)

        # Store relationship info
        if child_table not in self.table_dependencies:
            self.table_dependencies[child_table] = {}
        self.table_dependencies[child_table][child_column] = parent_table

        self.relationships.append({
            'parent_table': parent_table,
            'parent_column': parent_column,
            'child_table': child_table,
            'child_column': child_column
        })

        print(f"✓ Added relationship: {parent_table}.{parent_column} -> {child_table}.{child_column}")

    except Exception as e:
        print(f"✗ Error adding relationship: {e}")
        # Debug info
        print(f"  Parent table columns: {list(self.real_data[parent_table].columns) if parent_table in self.real_data else 'Table not found'}")
        print(f"  Child table columns: {list(self.real_data[child_table].columns) if child_table in self.real_data else 'Table not found'}")

# Add method to class
RecursiveMultiTableSynthesizer.add_foreign_key_relationship = add_foreign_key_relationship
print("✓ add_foreign_key_relationship method added to class")

✓ add_foreign_key_relationship method added to class


## Cell 5: Class Definition - Part 4 (Self-Referencing Relationship Method)

In [None]:
def add_self_referencing_relationship(self, table_name: str,
                                    foreign_key_col: str,
                                    primary_key_col: str):
    """
    Add a self-referencing (recursive) relationship within a table.

    Args:
        table_name (str): Table name
        foreign_key_col (str): Self-referencing foreign key column
        primary_key_col (str): Primary key column being referenced
    """
    try:
        if table_name not in self.real_data:
            raise ValueError(f"Table '{table_name}' not found")

        # Update the foreign key column to be 'id' type
        self.metadata.update_column(table_name, foreign_key_col, sdtype='id')

        # Add self-referencing relationship
        self.metadata.add_relationship(
            parent_table_name=table_name,
            child_table_name=table_name,
            parent_primary_key=primary_key_col,
            child_foreign_key=foreign_key_col
        )

        # Add self-loop to dependency graph
        self.dependency_graph.add_edge(table_name, table_name)

        print(f"✓ Added self-referencing relationship in '{table_name}': {primary_key_col} -> {foreign_key_col}")

    except Exception as e:
        print(f"✗ Error adding self-referencing relationship: {e}")

# Add method to class
RecursiveMultiTableSynthesizer.add_self_referencing_relationship = add_self_referencing_relationship
print("✓ add_self_referencing_relationship method added to class")

✓ add_self_referencing_relationship method added to class


## Cell 6: Class Definition - Part 5 (Helper Methods)

In [6]:
def _detect_recursive_relationships(self):
    """
    Detect and handle recursive relationships in the table structure.
    """
    recursive_tables = []

    # Find self-loops (recursive relationships)
    for table in self.dependency_graph.nodes():
        if self.dependency_graph.has_edge(table, table):
            recursive_tables.append(table)

    # Find cycles in dependency graph
    try:
        cycles = list(nx.simple_cycles(self.dependency_graph))
        if cycles:
            print(f"Detected circular dependencies: {cycles}")

    except Exception as e:
        print(f"Error detecting cycles: {e}")

    return recursive_tables

def _get_synthesis_order(self) -> List[str]:
    """
    Determine the order for synthesizing tables based on dependencies.

    Returns:
        List[str]: Ordered list of table names
    """
    try:
        # Create a copy of the graph without self-loops for topological sort
        temp_graph = self.dependency_graph.copy()
        self_loops = list(nx.selfloop_edges(temp_graph))
        temp_graph.remove_edges_from(self_loops)

        # Topological sort to get dependency order
        synthesis_order = list(nx.topological_sort(temp_graph))

        # Add any tables not in the dependency graph
        all_tables = set(self.real_data.keys())
        ordered_tables = set(synthesis_order)
        remaining_tables = all_tables - ordered_tables
        synthesis_order.extend(list(remaining_tables))

        return synthesis_order

    except nx.NetworkXError:
        # If there are cycles, use a heuristic approach
        print("Circular dependencies detected. Using heuristic ordering.")
        return list(self.real_data.keys())

# Add methods to class
RecursiveMultiTableSynthesizer._detect_recursive_relationships = _detect_recursive_relationships
RecursiveMultiTableSynthesizer._get_synthesis_order = _get_synthesis_order
print("✓ Helper methods added to class")

✓ Helper methods added to class


## Cell 7: Class Definition - Part 6 (Preparation and Training Methods)

In [7]:
def prepare_for_synthesis(self):
    """
    Prepare the data and metadata for synthesis.
    """
    # Create metadata if not already created
    if self.metadata is None:
        self.create_metadata()

    # Detect recursive relationships
    recursive_tables = self._detect_recursive_relationships()
    if recursive_tables:
        print(f"Detected recursive relationships in tables: {recursive_tables}")

    print("\nMetadata Summary:")
    try:
        # Print metadata info
        metadata_dict = self.metadata.to_dict()
        print(f"Tables: {list(metadata_dict.get('tables', {}).keys())}")
        relationships = metadata_dict.get('relationships', [])
        print(f"Relationships: {len(relationships)}")
        for rel in relationships:
            print(f"  - {rel}")
    except Exception as e:
        print(f"Could not display metadata: {e}")

    # Validate metadata
    try:
        self.metadata.validate()
        print("✓ Metadata validation successful!")
    except Exception as e:
        print(f"✗ Metadata validation error: {e}")
        raise

    # Initialize synthesizer
    try:
        print(f"Initializing {self.synthesizer_type} synthesizer...")
        self.synthesizer = HMASynthesizer(metadata=self.metadata)
        print("✓ Synthesizer initialized successfully!")

    except Exception as e:
        print(f"✗ Error initializing synthesizer: {e}")
        raise

def fit(self):
    """
    Train the multi-table synthesizer on the real data.
    """
    if not self.synthesizer:
        self.prepare_for_synthesis()

    print("Training multi-table synthesizer...")
    try:
        self.synthesizer.fit(self.real_data)
        print("✓ Training completed successfully!")
    except Exception as e:
        print(f"✗ Training error: {e}")
        raise

# Add methods to class
RecursiveMultiTableSynthesizer.prepare_for_synthesis = prepare_for_synthesis
RecursiveMultiTableSynthesizer.fit = fit
print("✓ Preparation and training methods added to class")

✓ Preparation and training methods added to class


## Cell 8: Class Definition - Part 7 (Data Generation Method)

In [8]:
def generate_synthetic_data(self, num_rows: Optional[Dict[str, int]] = None,
                          scale: float = 1.0) -> Dict[str, pd.DataFrame]:
    """
    Generate synthetic data for all tables.

    Args:
        num_rows (dict): Specific number of rows per table
        scale (float): Scale factor for data generation

    Returns:
        Dict[str, pd.DataFrame]: Synthetic data for each table
    """
    if not self.synthesizer:
        raise ValueError("Synthesizer not trained. Call fit() first.")

    print(f"Generating synthetic data...")

    try:
        # The HMASynthesizer sample method only accepts scale parameter
        if num_rows:
            # Calculate average scale from num_rows
            total_original = sum(len(df) for df in self.real_data.values())
            total_requested = sum(num_rows.get(table, len(df)) for table, df in self.real_data.items())
            calculated_scale = total_requested / total_original if total_original > 0 else 1.0
            print(f"Converting num_rows to scale factor: {calculated_scale:.2f}")
            self.synthetic_data = self.synthesizer.sample(scale=calculated_scale)

            # Then trim tables to requested sizes
            for table_name, requested_rows in num_rows.items():
                if table_name in self.synthetic_data:
                    current_rows = len(self.synthetic_data[table_name])
                    if current_rows > requested_rows:
                        self.synthetic_data[table_name] = self.synthetic_data[table_name].head(requested_rows)

        else:
            self.synthetic_data = self.synthesizer.sample(scale=scale)

        print("✓ Synthetic data generation completed!")

        # Print summary
        for table_name, df in self.synthetic_data.items():
            print(f"  - {table_name}: {df.shape[0]} rows, {df.shape[1]} columns")

        return self.synthetic_data

    except Exception as e:
        print(f"✗ Generation error: {e}")
        # Try with default parameters
        try:
            print("Retrying with scale=1.0...")
            self.synthetic_data = self.synthesizer.sample(scale=1.0)
            print("✓ Fallback generation successful!")
            return self.synthetic_data
        except Exception as e2:
            print(f"✗ Fallback also failed: {e2}")
            raise

# Add method to class
RecursiveMultiTableSynthesizer.generate_synthetic_data = generate_synthetic_data
print("✓ generate_synthetic_data method added to class")

✓ generate_synthetic_data method added to class


## Cell 9: Class Definition - Part 8 (Validation and Statistics Methods)

In [9]:
def validate_relationships(self) -> Dict[str, bool]:
    """
    Validate that relationships are maintained in synthetic data.

    Returns:
        Dict[str, bool]: Validation results for each relationship
    """
    validation_results = {}

    if not self.synthetic_data:
        print("No synthetic data to validate.")
        return validation_results

    print("\nValidating relationships...")

    for rel in self.relationships:
        parent_table = rel['parent_table']
        parent_col = rel['parent_column']
        child_table = rel['child_table']
        child_col = rel['child_column']

        if parent_table not in self.synthetic_data or child_table not in self.synthetic_data:
            continue

        parent_df = self.synthetic_data[parent_table]
        child_df = self.synthetic_data[child_table]

        if parent_col in parent_df.columns and child_col in child_df.columns:
            # Get valid parent values (excluding None/NaN)
            parent_values = set(parent_df[parent_col].dropna().values)
            child_fk_values = set(child_df[child_col].dropna().values)

            # Check referential integrity
            invalid_refs = child_fk_values - parent_values
            is_valid = len(invalid_refs) == 0

            relationship_name = f"{parent_table}.{parent_col} -> {child_table}.{child_col}"
            validation_results[relationship_name] = is_valid

            status = "✓ Valid" if is_valid else f"✗ Invalid ({len(invalid_refs)} bad refs)"
            print(f"  {relationship_name}: {status}")

    return validation_results

def get_summary_statistics(self) -> Dict:
    """
    Get summary statistics comparing real and synthetic data.
    """
    summary = {
        'real_data': {},
        'synthetic_data': {},
        'comparison': {}
    }

    for table_name in self.real_data.keys():
        real_df = self.real_data[table_name]
        summary['real_data'][table_name] = {
            'shape': real_df.shape,
            'numeric_columns': len(real_df.select_dtypes(include=[np.number]).columns),
            'categorical_columns': len(real_df.select_dtypes(include=['object']).columns)
        }

        if table_name in self.synthetic_data:
            synthetic_df = self.synthetic_data[table_name]
            summary['synthetic_data'][table_name] = {
                'shape': synthetic_df.shape,
                'numeric_columns': len(synthetic_df.select_dtypes(include=[np.number]).columns),
                'categorical_columns': len(synthetic_df.select_dtypes(include=['object']).columns)
            }

            # Compare basic statistics
            summary['comparison'][table_name] = {
                'row_ratio': synthetic_df.shape[0] / real_df.shape[0],
                'column_match': synthetic_df.shape[1] == real_df.shape[1]
            }

    return summary

# Add methods to class
RecursiveMultiTableSynthesizer.validate_relationships = validate_relationships
RecursiveMultiTableSynthesizer.get_summary_statistics = get_summary_statistics
print("✓ Validation and statistics methods added to class")

✓ Validation and statistics methods added to class


## Cell 10: Sample Data Creation Function

In [10]:
# Example usage and testing
def create_sample_data():
    """
    Create sample data with recursive relationships for testing.
    """
    np.random.seed(42)  # For reproducible results

    # Users table (root table)
    users = pd.DataFrame({
        'user_id': range(1, 101),
        'name': [f'User_{i}' for i in range(1, 101)],
        'age': np.random.randint(18, 80, 100),
        'city': np.random.choice(['NYC', 'LA', 'Chicago', 'Houston'], 100)
    })

    # Orders table (depends on users)
    orders = pd.DataFrame({
        'order_id': range(1, 301),
        'user_id': np.random.choice(users['user_id'], 300),
        'amount': np.round(np.random.uniform(10, 1000, 300), 2),
        'status': np.random.choice(['pending', 'completed', 'cancelled'], 300)
    })

    # Order_items table (depends on orders)
    order_items = pd.DataFrame({
        'item_id': range(1, 501),
        'order_id': np.random.choice(orders['order_id'], 500),
        'product_name': np.random.choice(['Product_A', 'Product_B', 'Product_C'], 500),
        'quantity': np.random.randint(1, 10, 500),
        'price': np.round(np.random.uniform(5, 200, 500), 2)
    })

    # Reviews table with self-referencing (recursive) relationship
    review_ids = range(1, 201)
    reviews = pd.DataFrame({
        'review_id': review_ids,
        'user_id': np.random.choice(users['user_id'], 200),
        'rating': np.random.randint(1, 6, 200),
        'comment': [f'Review comment {i}' for i in review_ids],
        # Self-reference: some reviews are replies to other reviews
        'reply_to': [None] * 150 + list(np.random.choice(range(1, 151), 50))
    })

    return users, orders, order_items, reviews

In [None]:
print("=== MULTI-TABLE RECURSIVE SYNTHETIC DATA GENERATION ===\n")

# Create sample data
users, orders, order_items, reviews = create_sample_data()
print("✓ Sample data created")

# Display basic information about each table
print("\n=== SAMPLE DATA OVERVIEW ===")
tables = {'users': users, 'orders': orders, 'order_items': order_items, 'reviews': reviews}

for table_name, df in tables.items():
    print(f"\n{table_name.upper()}:")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    print("  Sample data:")
    print(df.head(3))

In [14]:
# Initialize synthesizer
synthesizer = RecursiveMultiTableSynthesizer()
synthesizer.add_tables(tables)

# Add tables (order matters - add parent tables first)
print("\nAdding tables to synthesizer...")
#synthesizer.add_table_data('users', users, primary_key='user_id')
#synthesizer.add_table_data('orders', orders, primary_key='order_id')
#synthesizer.add_table_data('order_items', order_items, primary_key='item_id')
#synthesizer.add_table_data('reviews', reviews, primary_key='review_id')

print(f"\n✓ Added {len(synthesizer.real_data)} tables to synthesizer")

✓ Added 4 tables and generated metadata

Adding tables to synthesizer...

✓ Added 4 tables to synthesizer


In [None]:
# Add relationships
print("\nAdding relationships...")

# Standard foreign key relationships
synthesizer.add_foreign_key_relationship('orders', 'user_id', 'users', 'user_id')
synthesizer.add_foreign_key_relationship('order_items', 'order_id', 'orders', 'order_id')
synthesizer.add_foreign_key_relationship('reviews', 'user_id', 'users', 'user_id')

# Add recursive relationship (reviews can reply to other reviews)
synthesizer.add_self_referencing_relationship('reviews', 'reply_to', 'review_id')

print(f"\n✓ Added {len(synthesizer.relationships)} relationships")

# Display relationship summary
print("\nRelationship Summary:")
for i, rel in enumerate(synthesizer.relationships, 1):
    rel_type = "Recursive" if rel['parent_table'] == rel['child_table'] else "Standard"
    print(f"  {i}. {rel_type}: {rel['parent_table']}.{rel['parent_column']} -> {rel['child_table']}.{rel['child_column']}")


Adding relationships...
✗ Error adding relationship: Unknown table name ('user_id').
  Parent table columns: ['user_id', 'name', 'age', 'city']
  Child table columns: ['order_id', 'user_id', 'amount', 'status']
✗ Error adding relationship: Unknown table name ('order_id').
  Parent table columns: ['order_id', 'user_id', 'amount', 'status']
  Child table columns: ['item_id', 'order_id', 'product_name', 'quantity', 'price']
✗ Error adding relationship: Unknown table name ('user_id').
  Parent table columns: ['user_id', 'name', 'age', 'city']
  Child table columns: ['review_id', 'user_id', 'rating', 'comment', 'reply_to']
✗ Error adding self-referencing relationship: Unknown table name ('reply_to').

✓ Added 0 relationships

Relationship Summary:


In [15]:
# Train the synthesizer
print("\n" + "="*50)
print("TRAINING SYNTHESIZER")
print("="*50)

import time
start_time = time.time()

synthesizer.fit()

end_time = time.time()
training_time = end_time - start_time
print(f"\n✓ Training completed in {training_time:.2f} seconds")


TRAINING SYNTHESIZER

Metadata Summary:
Tables: ['users', 'orders', 'order_items', 'reviews']
Relationships: 3
  - {'parent_table_name': 'users', 'child_table_name': 'reviews', 'parent_primary_key': 'user_id', 'child_foreign_key': 'user_id'}
  - {'parent_table_name': 'users', 'child_table_name': 'orders', 'parent_primary_key': 'user_id', 'child_foreign_key': 'user_id'}
  - {'parent_table_name': 'orders', 'child_table_name': 'order_items', 'parent_primary_key': 'order_id', 'child_foreign_key': 'order_id'}
✓ Metadata validation successful!
Initializing gaussian_copula synthesizer...
✓ Synthesizer initialized successfully!
Training multi-table synthesizer...


Preprocess Tables: 100%|██████████| 4/4 [00:00<00:00, 22.66it/s]



Learning relationships:


(1/3) Tables 'users' and 'reviews' ('user_id'): 100%|██████████| 90/90 [00:08<00:00, 10.49it/s]
(2/3) Tables 'orders' and 'order_items' ('order_id'): 100%|██████████| 244/244 [00:22<00:00, 10.95it/s]
(3/3) Tables 'users' and 'orders' ('user_id'): 100%|██████████| 94/94 [00:13<00:00,  7.02it/s]





Modeling Tables: 100%|██████████| 4/4 [00:03<00:00,  1.20it/s]

✓ Training completed successfully!

✓ Training completed in 51.51 seconds





In [16]:
# Generate synthetic data
print("\n" + "="*50)
print("GENERATING SYNTHETIC DATA")
print("="*50)

start_time = time.time()
synthetic_data = synthesizer.generate_synthetic_data(scale=0.8)
end_time = time.time()

generation_time = end_time - start_time
print(f"\n✓ Generation completed in {generation_time:.2f} seconds")


GENERATING SYNTHETIC DATA
Generating synthetic data...
✓ Synthetic data generation completed!
  - users: 80 rows, 4 columns
  - reviews: 160 rows, 5 columns
  - orders: 240 rows, 4 columns
  - order_items: 400 rows, 5 columns

✓ Generation completed in 10.79 seconds


In [None]:
# Show sample results
print("\n=== SAMPLE SYNTHETIC DATA ===")
for table_name, df in synthetic_data.items():
    print(f"\n{table_name.upper()}:")
    print(f"Shape: {df.shape}")
    print("Sample rows:")
    print(df.head(3))
    print("-" * 40)

In [None]:
# Validate relationships
print("\n" + "="*50)
print("RELATIONSHIP VALIDATION")
print("="*50)

validation = synthesizer.validate_relationships()

# Summary of validation results
valid_count = sum(validation.values())
total_count = len(validation)

print(f"\nValidation Summary:")
print(f"  Total relationships: {total_count}")
print(f"  Valid relationships: {valid_count}")
print(f"  Invalid relationships: {total_count - valid_count}")
print(f"  Success rate: {(valid_count/total_count)*100:.1f}%" if total_count > 0 else "No relationships to validate")

if valid_count == total_count:
    print("🎉 All relationships are valid!")
elif valid_count > 0:
    print("✨ Most relationships are valid!")
else:
    print("⚠️ Relationship validation issues detected!")


RELATIONSHIP VALIDATION

Validating relationships...

Validation Summary:
  Total relationships: 0
  Valid relationships: 0
  Invalid relationships: 0
No relationships to validate
🎉 All relationships are valid!


In [None]:
# Analyze recursive relationships specifically
print("\n=== RECURSIVE RELATIONSHIP ANALYSIS ===")

# Check reviews that reply to other reviews
reviews_df = synthetic_data['reviews']
reply_reviews = reviews_df[reviews_df['reply_to'].notna()]

print(f"Reviews Analysis:")
print(f"  Total reviews: {len(reviews_df)}")
print(f"  Reviews with replies: {len(reply_reviews)}")
print(f"  Percentage with replies: {(len(reply_reviews)/len(reviews_df))*100:.1f}%")

if not reply_reviews.empty:
    print(f"\nSample reply relationships:")
    sample_replies = reply_reviews[['review_id', 'reply_to', 'comment']].head(5)
    for _, row in sample_replies.iterrows():
        print(f"  Review {row['review_id']} replies to Review {row['reply_to']}: '{row['comment']}'")
else:
    print("\nNo reply relationships found in synthetic data")


=== RECURSIVE RELATIONSHIP ANALYSIS ===
Reviews Analysis:
  Total reviews: 160
  Reviews with replies: 42
  Percentage with replies: 26.2%

Sample reply relationships:
  Review 12533464 replies to Review 75.0: 'Review comment 124'
  Review 12851129 replies to Review 76.0: 'Review comment 182'
  Review 11795186 replies to Review 75.0: 'Review comment 19'
  Review 16771899 replies to Review 75.0: 'Review comment 62'
  Review 4226262 replies to Review 75.0: 'Review comment 11'


In [None]:
# Get summary statistics
print("\n=== SUMMARY STATISTICS ===")
summary = synthesizer.get_summary_statistics()

print("Data Generation Comparison:")
print(f"{'Table':<15} {'Original':<10} {'Synthetic':<10} {'Ratio':<8} {'Status':<10}")
print("-" * 55)

for table_name, comparison in summary['comparison'].items():
    original_rows = summary['real_data'][table_name]['shape'][0]
    synthetic_rows = summary['synthetic_data'][table_name]['shape'][0]
    ratio = comparison['row_ratio']
    status = "✓ Good" if 0.5 <= ratio <= 1.5 else "⚠ Check"

    print(f"{table_name:<15} {original_rows:<10} {synthetic_rows:<10} {ratio:<8.2f} {status:<10}")

print(f"\nColumn Analysis:")
for table_name in summary['real_data'].keys():
    real_info = summary['real_data'][table_name]
    synthetic_info = summary['synthetic_data'][table_name]

    print(f"\n{table_name.upper()}:")
    print(f"  Numeric columns: {real_info['numeric_columns']} -> {synthetic_info['numeric_columns']}")
    print(f"  Categorical columns: {real_info['categorical_columns']} -> {synthetic_info['categorical_columns']}")
    print(f"  Column match: {'✓' if summary['comparison'][table_name]['column_match'] else '✗'}")


=== SUMMARY STATISTICS ===
Data Generation Comparison:
Table           Original   Synthetic  Ratio    Status    
-------------------------------------------------------
users           100        80         0.80     ✓ Good    
orders          300        240        0.80     ✓ Good    
order_items     500        400        0.80     ✓ Good    
reviews         200        160        0.80     ✓ Good    

Column Analysis:

USERS:
  Numeric columns: 2 -> 2
  Categorical columns: 2 -> 2
  Column match: ✓

ORDERS:
  Numeric columns: 3 -> 3
  Categorical columns: 1 -> 1
  Column match: ✓

ORDER_ITEMS:
  Numeric columns: 4 -> 4
  Categorical columns: 1 -> 1
  Column match: ✓

REVIEWS:
  Numeric columns: 4 -> 4
  Categorical columns: 1 -> 1
  Column match: ✓


In [None]:
print("\n" + "="*60)
print("SYNTHESIS COMPLETION SUMMARY")
print("="*60)

# Calculate totals
total_original_rows = sum(summary['real_data'][table]['shape'][0] for table in summary['real_data'])
total_synthetic_rows = sum(summary['synthetic_data'][table]['shape'][0] for table in summary['synthetic_data'])
overall_ratio = total_synthetic_rows / total_original_rows

print(f"📊 OVERALL STATISTICS:")
print(f"  Tables processed: {len(synthetic_data)}")
print(f"  Total original rows: {total_original_rows:,}")
print(f"  Total synthetic rows: {total_synthetic_rows:,}")
print(f"  Overall generation ratio: {overall_ratio:.2f}x")

print(f"\n🔗 RELATIONSHIP STATISTICS:")
print(f"  Total relationships: {len(validation)}")
print(f"  Valid relationships: {sum(validation.values())}")
print(f"  Recursive relationships: {sum(1 for rel in synthesizer.relationships if rel['parent_table'] == rel['child_table'])}")

print(f"\n⏱️ PERFORMANCE METRICS:")
print(f"  Training time: {training_time:.2f} seconds")
print(f"  Generation time: {generation_time:.2f} seconds")
print(f"  Total time: {(training_time + generation_time):.2f} seconds")

print(f"\n✅ QUALITY ASSESSMENT:")
valid_relationships = sum(validation.values())
total_relationships = len(validation)
relationship_score = (valid_relationships / total_relationships) * 100 if total_relationships > 0 else 100

if relationship_score == 100:
    quality_status = "EXCELLENT"
elif relationship_score >= 80:
    quality_status = "GOOD"
elif relationship_score >= 60:
    quality_status = "ACCEPTABLE"
else:
    quality_status = "NEEDS IMPROVEMENT"

print(f"  Relationship integrity: {relationship_score:.1f}%")
print(f"  Overall quality: {quality_status}")

print(f"\n🎯 CONCLUSION:")
if quality_status in ["EXCELLENT", "GOOD"]:
    print("✓ Multi-table recursive synthesis completed successfully!")
    print("✓ Synthetic data maintains referential integrity and relationships")
    print("✓ Data is ready for use in testing, analysis, or model training")
else:
    print("⚠ Synthesis completed with some issues")
    print("⚠ Review relationship validation results above")
    print("⚠ Consider adjusting data generation parameters")

print(f"\n📁 AVAILABLE DATA:")
for table_name, df in synthetic_data.items():
    print(f"  {table_name}: {len(df)} rows x {len(df.columns)} columns")

print(f"\n🏁 Analysis complete!")


SYNTHESIS COMPLETION SUMMARY
📊 OVERALL STATISTICS:
  Tables processed: 4
  Total original rows: 1,100
  Total synthetic rows: 880
  Overall generation ratio: 0.80x

🔗 RELATIONSHIP STATISTICS:
  Total relationships: 0
  Valid relationships: 0
  Recursive relationships: 0

⏱️ PERFORMANCE METRICS:
  Training time: 4.07 seconds
  Generation time: 0.16 seconds
  Total time: 4.22 seconds

✅ QUALITY ASSESSMENT:
  Relationship integrity: 100.0%
  Overall quality: EXCELLENT

🎯 CONCLUSION:
✓ Multi-table recursive synthesis completed successfully!
✓ Synthetic data maintains referential integrity and relationships
✓ Data is ready for use in testing, analysis, or model training

📁 AVAILABLE DATA:
  reviews: 160 rows x 5 columns
  orders: 240 rows x 4 columns
  order_items: 400 rows x 5 columns
  users: 80 rows x 4 columns

🏁 Analysis complete!


In [None]:
users_synth= synthetic_data['users']
orders_synth= synthetic_data['orders']
order_items_synth= synthetic_data['order_items']
reviews_synth= synthetic_data['reviews']

In [None]:
synth_data={
    'users': users_synth,
    'orders': orders_synth,
    'order_items': order_items_synth,
    'reviews': reviews_synth
}

In [None]:
metadata = Metadata.detect_from_dataframes(data=synth_data)

In [None]:
metadata

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

def create_complex_enterprise_data():
    """
    Create complex enterprise sample data with 6+ levels of depth,
    many-to-many relationships, and recursive structures.

    Hierarchy Structure:
    Level 1: Organizations (Root)
    Level 2: Divisions → Organizations
    Level 3: Departments → Divisions
    Level 4: Teams → Departments
    Level 5: Employees → Teams
    Level 6: Projects → Employees (as managers)
    Level 7: Tasks → Projects

    Additional Complex Relationships:
    - Many-to-Many: Employee Skills, Project Teams, Task Dependencies
    - Recursive: Employee Management, Task Dependencies, Team Hierarchies
    - Cross-references: Multiple foreign keys per table
    """

    np.random.seed(42)  # For reproducible results

    print("Creating complex enterprise data with 6+ depth levels...")

    # ===========================================
    # LEVEL 1: ORGANIZATIONS (Root - 25 orgs)
    # ===========================================
    organizations = pd.DataFrame({
        'org_id': range(1, 26),
        'org_name': [f'Enterprise_{i}' for i in range(1, 26)],
        'org_type': np.random.choice(['Corporation', 'LLC', 'Partnership', 'Non-Profit'], 25),
        'industry': np.random.choice(['Technology', 'Finance', 'Healthcare', 'Manufacturing', 'Retail', 'Energy'], 25),
        'founded_year': np.random.randint(1950, 2020, 25),
        'headquarters': np.random.choice(['New York', 'San Francisco', 'Chicago', 'Austin', 'Seattle', 'Boston'], 25),
        'revenue_millions': np.round(np.random.uniform(50, 10000, 25), 2),
        'employee_count': np.random.randint(100, 50000, 25),
        'parent_org_id': [None] * 20 + list(np.random.choice(range(1, 21), 5))  # Some orgs are subsidiaries
    })

    # ===========================================
    # LEVEL 2: DIVISIONS (150 divisions)
    # ===========================================
    divisions = pd.DataFrame({
        'division_id': range(1, 151),
        'org_id': np.random.choice(organizations['org_id'], 150),
        'division_name': np.random.choice([
            'Engineering', 'Sales', 'Marketing', 'Operations', 'Finance', 'HR',
            'Legal', 'R&D', 'Manufacturing', 'Customer Service', 'IT', 'Security'
        ], 150),
        'division_code': [f'DIV_{i:03d}' for i in range(1, 151)],
        'budget_millions': np.round(np.random.uniform(1, 500, 150), 2),
        'established_date': pd.date_range('2000-01-01', '2023-01-01', periods=150),
        'status': np.random.choice(['Active', 'Restructuring', 'New', 'Merging'], 150),
        'region': np.random.choice(['North America', 'Europe', 'Asia Pacific', 'Latin America'], 150)
    })

    # ===========================================
    # LEVEL 3: DEPARTMENTS (400 departments)
    # ===========================================
    departments = pd.DataFrame({
        'dept_id': range(1, 401),
        'division_id': np.random.choice(divisions['division_id'], 400),
        'dept_name': np.random.choice([
            'Software Engineering', 'Data Science', 'DevOps', 'QA Testing', 'Product Management',
            'Sales Operations', 'Account Management', 'Business Development', 'Customer Success',
            'Digital Marketing', 'Content Marketing', 'Brand Management', 'Public Relations',
            'Financial Planning', 'Accounting', 'Treasury', 'Risk Management', 'Compliance',
            'Talent Acquisition', 'Learning & Development', 'Employee Relations', 'Compensation'
        ], 400),
        'dept_code': [f'DEPT_{i:04d}' for i in range(1, 401)],
        'floor_number': np.random.randint(1, 20, 400),
        'budget_thousands': np.round(np.random.uniform(50, 5000, 400), 2),
        'head_count': np.random.randint(5, 100, 400),
        'cost_center': [f'CC_{np.random.randint(1000, 9999)}' for _ in range(400)]
    })

    # ===========================================
    # LEVEL 4: TEAMS (800 teams)
    # ===========================================
    teams = pd.DataFrame({
        'team_id': range(1, 801),
        'dept_id': np.random.choice(departments['dept_id'], 800),
        'team_name': np.random.choice([
            'Alpha Team', 'Beta Squad', 'Gamma Unit', 'Delta Force', 'Echo Group',
            'Frontend Team', 'Backend Team', 'Mobile Team', 'Infrastructure Team',
            'Analytics Team', 'Security Team', 'Platform Team', 'Growth Team',
            'Enterprise Sales', 'SMB Sales', 'Inside Sales', 'Channel Partners',
            'Creative Team', 'Content Team', 'SEO Team', 'Social Media Team'
        ], 800),
        'team_type': np.random.choice(['Development', 'Sales', 'Marketing', 'Operations', 'Support'], 800),
        'team_size': np.random.randint(3, 15, 800),
        'formation_date': pd.date_range('2018-01-01', '2024-01-01', periods=800),
        'parent_team_id': [None] * 600 + list(np.random.choice(range(1, 601), 200)),  # Some teams are sub-teams
        'team_lead_id': None  # Will be filled after employees are created
    })

    # ===========================================
    # LEVEL 5: EMPLOYEES (2000 employees)
    # ===========================================
    # Generate realistic names
    first_names = ['James', 'Mary', 'John', 'Patricia', 'Robert', 'Jennifer', 'Michael', 'Linda',
                  'William', 'Elizabeth', 'David', 'Barbara', 'Richard', 'Susan', 'Joseph', 'Jessica',
                  'Thomas', 'Sarah', 'Christopher', 'Karen', 'Charles', 'Helen', 'Daniel', 'Nancy',
                  'Matthew', 'Betty', 'Anthony', 'Dorothy', 'Mark', 'Lisa', 'Donald', 'Anna']

    last_names = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis',
                 'Rodriguez', 'Martinez', 'Hernandez', 'Lopez', 'Gonzalez', 'Wilson', 'Anderson',
                 'Thomas', 'Taylor', 'Moore', 'Jackson', 'Martin', 'Lee', 'Perez', 'Thompson',
                 'White', 'Harris', 'Sanchez', 'Clark', 'Ramirez', 'Lewis', 'Robinson', 'Walker']

    employees = pd.DataFrame({
        'employee_id': range(1, 2001),
        'team_id': np.random.choice(teams['team_id'], 2000),
        'employee_number': [f'EMP_{i:06d}' for i in range(1, 2001)],
        'first_name': np.random.choice(first_names, 2000),
        'last_name': np.random.choice(last_names, 2000),
        'email': [f'employee{i}@company.com' for i in range(1, 2001)],
        'hire_date': pd.date_range('2015-01-01', '2024-01-01', periods=2000),
        'job_title': np.random.choice([
            'Software Engineer', 'Senior Software Engineer', 'Principal Engineer', 'Engineering Manager',
            'Data Scientist', 'Senior Data Scientist', 'ML Engineer', 'Data Engineer',
            'Product Manager', 'Senior Product Manager', 'Product Owner', 'Business Analyst',
            'Sales Representative', 'Senior Sales Rep', 'Account Manager', 'Sales Manager',
            'Marketing Specialist', 'Marketing Manager', 'Content Creator', 'SEO Specialist',
            'HR Specialist', 'HR Manager', 'Recruiter', 'Training Coordinator',
            'Financial Analyst', 'Senior Financial Analyst', 'Accountant', 'Controller'
        ], 2000),
        'salary': np.round(np.random.normal(85000, 25000, 2000), 2),
        'salary': np.clip(np.random.normal(85000, 25000, 2000), 35000, 250000),  # Realistic salary range
        'employment_status': np.random.choice(['Active', 'On Leave', 'Remote', 'Part-time'], 2000, p=[0.8, 0.05, 0.1, 0.05]),
        'performance_rating': np.random.choice(['Exceeds', 'Meets', 'Below', 'Outstanding'], 2000, p=[0.2, 0.6, 0.15, 0.05]),
        'manager_id': [None] * 1600 + list(np.random.choice(range(1, 1601), 400)),  # Management hierarchy
        'mentor_id': [None] * 1500 + list(np.random.choice(range(1, 1501), 500)),  # Mentorship relationships
        'location': np.random.choice(['Office', 'Remote', 'Hybrid'], 2000, p=[0.4, 0.3, 0.3])
    })

    # Update team leads
    team_leads = employees.groupby('team_id')['employee_id'].first().reset_index()
    teams = teams.merge(team_leads.rename(columns={'employee_id': 'team_lead_id'}), on='team_id', how='left')

    # ===========================================
    # LEVEL 6: PROJECTS (500 projects)
    # ===========================================
    projects = pd.DataFrame({
        'project_id': range(1, 501),
        'project_manager_id': np.random.choice(employees['employee_id'], 500),
        'sponsor_id': np.random.choice(employees['employee_id'], 500),  # Executive sponsor
        'project_name': [f'Project_{chr(65 + i//26)}{chr(65 + i%26)}_{np.random.randint(100, 999)}' for i in range(500)],
        'project_code': [f'PRJ_{i:04d}' for i in range(1, 501)],
        'project_type': np.random.choice([
            'Software Development', 'Infrastructure', 'Data Migration', 'Process Improvement',
            'Marketing Campaign', 'Product Launch', 'Compliance', 'Research', 'Training'
        ], 500),
        'priority': np.random.choice(['Critical', 'High', 'Medium', 'Low'], 500, p=[0.1, 0.3, 0.5, 0.1]),
        'status': np.random.choice(['Planning', 'In Progress', 'Testing', 'Completed', 'On Hold', 'Cancelled'], 500),
        'start_date': pd.date_range('2023-01-01', '2024-06-01', periods=500),
        'planned_end_date': None,  # Will calculate
        'actual_end_date': None,   # Some projects completed
        'budget_thousands': np.round(np.random.uniform(10, 2000, 500), 2),
        'spent_thousands': None,   # Will calculate
        'parent_project_id': [None] * 400 + list(np.random.choice(range(1, 401), 100))  # Sub-projects
    })

    # Calculate planned end dates (3-18 months from start)
    project_durations = np.random.randint(90, 540, 500)  # 3-18 months in days
    projects['planned_end_date'] = projects['start_date'] + pd.to_timedelta(project_durations, unit='D')

    # Some projects are completed
    completed_mask = projects['status'] == 'Completed'
    projects.loc[completed_mask, 'actual_end_date'] = projects.loc[completed_mask, 'planned_end_date'] - pd.to_timedelta(np.random.randint(-30, 60, completed_mask.sum()), unit='D')

    # Calculate spent amounts
    projects['spent_thousands'] = np.round(projects['budget_thousands'] * np.random.uniform(0.1, 1.2, 500), 2)

    # ===========================================
    # LEVEL 7: TASKS (2500 tasks)
    # ===========================================
    tasks = pd.DataFrame({
        'task_id': range(1, 2501),
        'project_id': np.random.choice(projects['project_id'], 2500),
        'assigned_to_id': np.random.choice(employees['employee_id'], 2500),
        'created_by_id': np.random.choice(employees['employee_id'], 2500),
        'task_name': [f'Task_{i}_{np.random.choice(["Analysis", "Development", "Testing", "Review", "Deploy", "Research", "Design", "Document"])}' for i in range(1, 2501)],
        'task_description': [f'Task description for task {i} involving {np.random.choice(["implementation", "analysis", "testing", "documentation", "optimization"])}' for i in range(1, 2501)],
        'task_type': np.random.choice([
            'Development', 'Testing', 'Documentation', 'Research', 'Design',
            'Review', 'Deployment', 'Analysis', 'Planning', 'Training'
        ], 2500),
        'priority': np.random.choice(['Urgent', 'High', 'Medium', 'Low'], 2500, p=[0.05, 0.25, 0.6, 0.1]),
        'status': np.random.choice(['Todo', 'In Progress', 'Review', 'Testing', 'Done', 'Blocked'], 2500),
        'estimated_hours': np.random.randint(1, 80, 2500),
        'actual_hours': np.random.randint(1, 120, 2500),
        'created_date': pd.date_range('2023-01-01', '2024-08-01', periods=2500),
        'due_date': None,  # Will calculate
        'completed_date': None,  # For completed tasks
        'depends_on_task_id': [None] * 2000 + list(np.random.choice(range(1, 2001), 500)),  # Task dependencies
        'parent_task_id': [None] * 2200 + list(np.random.choice(range(1, 2201), 300))  # Subtasks
    })

    # Calculate due dates (1-4 weeks from creation)
    task_durations = np.random.randint(7, 28, 2500)
    tasks['due_date'] = tasks['created_date'] + pd.to_timedelta(task_durations, unit='D')

    # Some tasks are completed
    completed_task_mask = tasks['status'] == 'Done'
    tasks.loc[completed_task_mask, 'completed_date'] = tasks.loc[completed_task_mask, 'due_date'] - pd.to_timedelta(np.random.randint(-7, 14, completed_task_mask.sum()), unit='D')

    # ===========================================
    # MANY-TO-MANY RELATIONSHIPS
    # ===========================================

    # SKILLS CATALOG (200 skills)
    skills = pd.DataFrame({
        'skill_id': range(1, 201),
        'skill_name': np.random.choice([
            'Python', 'Java', 'JavaScript', 'C++', 'C#', 'Go', 'Rust', 'Swift', 'Kotlin',
            'React', 'Angular', 'Vue.js', 'Node.js', 'Django', 'Flask', 'Spring Boot',
            'AWS', 'Azure', 'GCP', 'Docker', 'Kubernetes', 'Terraform', 'Ansible',
            'PostgreSQL', 'MySQL', 'MongoDB', 'Redis', 'Elasticsearch', 'Cassandra',
            'Machine Learning', 'Deep Learning', 'Data Science', 'Statistics', 'R',
            'Project Management', 'Agile', 'Scrum', 'Kanban', 'JIRA', 'Confluence',
            'Leadership', 'Communication', 'Problem Solving', 'Critical Thinking',
            'Sales', 'Marketing', 'Customer Service', 'Negotiation', 'Presentation',
            'Finance', 'Accounting', 'Legal', 'Compliance', 'Risk Management'
        ], 200),
        'skill_category': np.random.choice([
            'Programming', 'Web Development', 'Cloud Platforms', 'Databases', 'DevOps',
            'Data Science', 'Machine Learning', 'Project Management', 'Soft Skills',
            'Business', 'Finance', 'Legal'
        ], 200),
        'skill_level': np.random.choice(['Beginner', 'Intermediate', 'Advanced', 'Expert'], 200),
        'certification_available': np.random.choice([True, False], 200, p=[0.3, 0.7]),
        'market_demand': np.random.choice(['Low', 'Medium', 'High', 'Very High'], 200, p=[0.1, 0.3, 0.4, 0.2])
    })

    # EMPLOYEE SKILLS (Many-to-Many: 6000 skill assignments)
    employee_skills = pd.DataFrame({
        'emp_skill_id': range(1, 6001),
        'employee_id': np.random.choice(employees['employee_id'], 6000),
        'skill_id': np.random.choice(skills['skill_id'], 6000),
        'proficiency_level': np.random.choice(['Beginner', 'Intermediate', 'Advanced', 'Expert'], 6000),
        'years_experience': np.random.randint(0, 15, 6000),
        'certified': np.random.choice([True, False], 6000, p=[0.2, 0.8]),
        'last_used_date': pd.date_range('2020-01-01', '2024-08-01', periods=6000),
        'skill_source': np.random.choice(['Work', 'Education', 'Self-taught', 'Training'], 6000),
        'endorsements': np.random.randint(0, 50, 6000)
    })

    # PROJECT TEAM ASSIGNMENTS (Many-to-Many: 3000 assignments)
    project_teams = pd.DataFrame({
        'assignment_id': range(1, 3001),
        'project_id': np.random.choice(projects['project_id'], 3000),
        'employee_id': np.random.choice(employees['employee_id'], 3000),
        'role': np.random.choice([
            'Developer', 'Senior Developer', 'Tech Lead', 'Architect',
            'Tester', 'QA Lead', 'Business Analyst', 'Product Owner',
            'Designer', 'DevOps Engineer', 'Data Scientist', 'Consultant'
        ], 3000),
        'allocation_percentage': np.random.randint(10, 100, 3000),
        'start_date': pd.date_range('2023-01-01', '2024-06-01', periods=3000),
        'end_date': None,  # Some assignments ongoing
        'hourly_rate': np.round(np.random.uniform(25, 200, 3000), 2),
        'assignment_status': np.random.choice(['Active', 'Completed', 'On Hold'], 3000, p=[0.6, 0.3, 0.1]),
        'performance_score': np.random.randint(1, 5, 3000)
    })

    # Some assignments are completed
    completed_assignment_mask = project_teams['assignment_status'] == 'Completed'
    project_teams.loc[completed_assignment_mask, 'end_date'] = project_teams.loc[completed_assignment_mask, 'start_date'] + pd.to_timedelta(np.random.randint(30, 365, completed_assignment_mask.sum()), unit='D')

    # TASK DEPENDENCIES (Many-to-Many: 1500 dependencies)
    task_dependencies = pd.DataFrame({
        'dependency_id': range(1, 1501),
        'predecessor_task_id': np.random.choice(tasks['task_id'], 1500),
        'successor_task_id': np.random.choice(tasks['task_id'], 1500),
        'dependency_type': np.random.choice(['Finish-to-Start', 'Start-to-Start', 'Finish-to-Finish', 'Start-to-Finish'], 1500, p=[0.7, 0.15, 0.1, 0.05]),
        'lag_days': np.random.randint(0, 14, 1500),
        'created_date': pd.date_range('2023-01-01', '2024-08-01', periods=1500),
        'created_by_id': np.random.choice(employees['employee_id'], 1500),
        'is_critical': np.random.choice([True, False], 1500, p=[0.3, 0.7])
    })

    # TEAM COLLABORATIONS (Many-to-Many: 1200 collaborations)
    team_collaborations = pd.DataFrame({
        'collaboration_id': range(1, 1201),
        'team_1_id': np.random.choice(teams['team_id'], 1200),
        'team_2_id': np.random.choice(teams['team_id'], 1200),
        'collaboration_type': np.random.choice([
            'Joint Project', 'Knowledge Sharing', 'Resource Sharing', 'Cross-Training',
            'Process Integration', 'Strategic Partnership'
        ], 1200),
        'start_date': pd.date_range('2023-01-01', '2024-06-01', periods=1200),
        'end_date': None,  # Some ongoing
        'success_rating': np.random.randint(1, 5, 1200),
        'frequency': np.random.choice(['Daily', 'Weekly', 'Monthly', 'As Needed'], 1200),
        'primary_contact_1': np.random.choice(employees['employee_id'], 1200),
        'primary_contact_2': np.random.choice(employees['employee_id'], 1200)
    })

    # ===========================================
    # ADDITIONAL COMPLEX TABLES
    # ===========================================

    # CERTIFICATIONS (300 certifications)
    certifications = pd.DataFrame({
        'certification_id': range(1, 301),
        'cert_name': np.random.choice([
            'AWS Certified Solutions Architect', 'Google Cloud Professional', 'Microsoft Azure Expert',
            'PMP Certification', 'Scrum Master Certification', 'Six Sigma Black Belt',
            'CISSP Security Certification', 'Salesforce Administrator', 'Oracle DBA Certification',
            'Cisco Network Professional', 'VMware Certified Professional', 'Red Hat Certified Engineer'
        ], 300),
        'issuing_organization': np.random.choice([
            'Amazon', 'Google', 'Microsoft', 'PMI', 'Scrum Alliance', 'ASQ',
            'ISC2', 'Salesforce', 'Oracle', 'Cisco', 'VMware', 'Red Hat'
        ], 300),
        'certification_level': np.random.choice(['Associate', 'Professional', 'Expert', 'Master'], 300),
        'validity_years': np.random.choice([1, 2, 3, 5, None], 300),
        'cost_usd': np.random.randint(100, 5000, 300),
        'difficulty_level': np.random.choice(['Beginner', 'Intermediate', 'Advanced', 'Expert'], 300)
    })

    # EMPLOYEE CERTIFICATIONS (Many-to-Many: 800 certifications earned)
    employee_certifications = pd.DataFrame({
        'emp_cert_id': range(1, 801),
        'employee_id': np.random.choice(employees['employee_id'], 800),
        'certification_id': np.random.choice(certifications['certification_id'], 800),
        'earned_date': pd.date_range('2018-01-01', '2024-08-01', periods=800),
        'expiry_date': None,  # Will calculate
        'score': np.random.randint(70, 100, 800),
        'attempt_number': np.random.randint(1, 4, 800),
        'cost_covered_by_company': np.random.choice([True, False], 800, p=[0.7, 0.3])
    })

    # Calculate expiry dates for certifications with validity periods
    for i, row in employee_certifications.iterrows():
        cert_validity = certifications.loc[certifications['certification_id'] == row['certification_id'], 'validity_years'].iloc[0]
        if cert_validity is not None:
            employee_certifications.at[i, 'expiry_date'] = row['earned_date'] + pd.DateOffset(years=cert_validity)

    # TRAINING PROGRAMS (150 programs)
    training_programs = pd.DataFrame({
        'training_id': range(1, 151),
        'program_name': np.random.choice([
            'Leadership Development', 'Technical Skills Bootcamp', 'Communication Workshop',
            'Data Science Fundamentals', 'Cloud Architecture Training', 'Agile Methodology',
            'Customer Service Excellence', 'Sales Techniques', 'Financial Planning',
            'Diversity & Inclusion', 'Cybersecurity Awareness', 'Project Management'
        ], 150),
        'training_type': np.random.choice(['Online', 'In-Person', 'Hybrid', 'Workshop'], 150),
        'duration_hours': np.random.randint(4, 80, 150),
        'max_participants': np.random.randint(10, 100, 150),
        'cost_per_person': np.random.randint(100, 3000, 150),
        'trainer_id': np.random.choice(employees['employee_id'], 150),
        'department_id': np.random.choice(departments['dept_id'], 150),
        'is_mandatory': np.random.choice([True, False], 150, p=[0.3, 0.7])
    })

    # TRAINING ENROLLMENTS (Many-to-Many: 2000 enrollments)
    training_enrollments = pd.DataFrame({
        'enrollment_id': range(1, 2001),
        'employee_id': np.random.choice(employees['employee_id'], 2000),
        'training_id': np.random.choice(training_programs['training_id'], 2000),
        'enrollment_date': pd.date_range('2023-01-01', '2024-08-01', periods=2000),
        'completion_date': None,  # Will calculate for completed
        'status': np.random.choice(['Enrolled', 'In Progress', 'Completed', 'Dropped'], 2000, p=[0.2, 0.3, 0.4, 0.1]),
        'final_score': np.random.randint(60, 100, 2000),
        'feedback_rating': np.random.randint(1, 5, 2000),
        'completion_certificate': np.random.choice([True, False], 2000, p=[0.6, 0.4])
    })

    # Calculate completion dates for completed trainings
    completed_training_mask = training_enrollments['status'] == 'Completed'
    training_enrollments.loc[completed_training_mask, 'completion_date'] = training_enrollments.loc[completed_training_mask, 'enrollment_date'] + pd.to_timedelta(np.random.randint(7, 60, completed_training_mask.sum()), unit='D')

    # ===========================================
    # RETURN ALL TABLES
    # ===========================================

    tables = {
        # Core hierarchy (7 levels)
        'organizations': organizations,
        'divisions': divisions,
        'departments': departments,
        'teams': teams,
        'employees': employees,
        'projects': projects,
        'tasks': tasks,

        # Supporting tables
        'skills': skills,
        'certifications': certifications,
        'training_programs': training_programs,

        # Many-to-Many relationship tables
        'employee_skills': employee_skills,
        'project_teams': project_teams,
        'task_dependencies': task_dependencies,
        'team_collaborations': team_collaborations,
        'employee_certifications': employee_certifications,
        'training_enrollments': training_enrollments
    }

    # Print summary
    print(f"\n✅ COMPLEX ENTERPRISE DATA CREATED")
    print(f"📊 Total tables: {len(tables)}")
    total_rows = sum(len(df) for df in tables.values())
    print(f"📊 Total rows: {total_rows:,}")

    print(f"\n🏗️ HIERARCHY LEVELS:")
    print(f"  Level 1: Organizations ({len(organizations)} rows)")
    print(f"  Level 2: Divisions ({len(divisions)} rows)")
    print(f"  Level 3: Departments ({len(departments)} rows)")
    print(f"  Level 4: Teams ({len(teams)} rows)")
    print(f"  Level 5: Employees ({len(employees)} rows)")
    print(f"  Level 6: Projects ({len(projects)} rows)")
    print(f"  Level 7: Tasks ({len(tasks)} rows)")

    print(f"\n🔗 RELATIONSHIP TYPES:")
    print(f"  Hierarchical: 6 levels deep")
    print(f"  Many-to-Many: 6 tables")
    print(f"  Recursive: 4 types (orgs, employees, teams, tasks)")
    print(f"  Cross-references: Multiple per table")

    return tables

In [None]:
# Create the complex dataset
enterprise_data = create_complex_enterprise_data()

Creating complex enterprise data with 6+ depth levels...

✅ COMPLEX ENTERPRISE DATA CREATED
📊 Total tables: 16
📊 Total rows: 21,525

🏗️ HIERARCHY LEVELS:
  Level 1: Organizations (25 rows)
  Level 2: Divisions (150 rows)
  Level 3: Departments (400 rows)
  Level 4: Teams (800 rows)
  Level 5: Employees (2000 rows)
  Level 6: Projects (500 rows)
  Level 7: Tasks (2500 rows)

🔗 RELATIONSHIP TYPES:
  Hierarchical: 6 levels deep
  Many-to-Many: 6 tables
  Recursive: 4 types (orgs, employees, teams, tasks)
  Cross-references: Multiple per table


In [None]:
def setup_complex_synthesizer_example():
    """
    Complete example showing how to use the complex enterprise data
    with RecursiveMultiTableSynthesizer
    """

    print("=" * 80)
    print("COMPLEX ENTERPRISE DATA SYNTHESIS EXAMPLE")
    print("=" * 80)

    # Step 1: Create the data
    print("\n1️⃣ Creating complex enterprise dataset...")
    enterprise_data = create_complex_enterprise_data()

    # Step 2: Initialize synthesizer
    print("\n2️⃣ Initializing synthesizer...")
    from sdv.metadata import Metadata
    from sdv.multi_table import HMASynthesizer
    import networkx as nx

    class RecursiveMultiTableSynthesizer:
        def __init__(self, synthesizer_type='gaussian_copula'):
            self.synthesizer_type = synthesizer_type
            self.metadata = Metadata()
            self.synthesizer = None
            self.table_dependencies = {}
            self.dependency_graph = nx.DiGraph()
            self.real_data = {}
            self.synthetic_data = {}
            self.relationships = []

        def add_table_data(self, table_name: str, data, primary_key=None):
            self.real_data[table_name] = data.copy()
            self.metadata.detect_table_from_dataframe(table_name, data)
            if primary_key and primary_key in data.columns:
                try:
                    self.metadata.update_column(table_name, primary_key, sdtype='id')
                    self.metadata.set_primary_key(table_name, primary_key)
                    print(f"✓ Added table '{table_name}' with primary key '{primary_key}'")
                except Exception as e:
                    print(f"Warning setting primary key for {table_name}: {e}")

        def add_foreign_key_relationship(self, child_table, child_column, parent_table, parent_column):
            try:
                self.metadata.update_column(child_table, child_column, sdtype='id')
                self.metadata.add_relationship(
                    parent_table_name=parent_table,
                    child_table_name=child_table,
                    parent_primary_key=parent_column,
                    child_foreign_key=child_column
                )
                self.dependency_graph.add_edge(parent_table, child_table)
                self.relationships.append({
                    'parent_table': parent_table, 'parent_column': parent_column,
                    'child_table': child_table, 'child_column': child_column
                })
                print(f"✓ Added relationship: {parent_table}.{parent_column} -> {child_table}.{child_column}")
            except Exception as e:
                print(f"✗ Error adding relationship: {e}")

        def add_self_referencing_relationship(self, table_name, foreign_key_col, primary_key_col):
            try:
                self.metadata.update_column(table_name, foreign_key_col, sdtype='id')
                self.metadata.add_relationship(
                    parent_table_name=table_name, child_table_name=table_name,
                    parent_primary_key=primary_key_col, child_foreign_key=foreign_key_col
                )
                self.dependency_graph.add_edge(table_name, table_name)
                print(f"✓ Added self-referencing relationship in '{table_name}': {primary_key_col} -> {foreign_key_col}")
            except Exception as e:
                print(f"✗ Error adding self-referencing relationship: {e}")

        def fit(self):
            try:
                self.metadata.validate()
                print("✓ Metadata validation successful!")
                self.synthesizer = HMASynthesizer(metadata=self.metadata)
                print("✓ Synthesizer initialized successfully!")
                print("Training multi-table synthesizer...")
                self.synthesizer.fit(self.real_data)
                print("✓ Training completed successfully!")
            except Exception as e:
                print(f"✗ Error during training: {e}")
                raise

        def generate_synthetic_data(self, scale=0.5):
            if not self.synthesizer:
                raise ValueError("Synthesizer not trained. Call fit() first.")
            try:
                self.synthetic_data = self.synthesizer.sample(scale=scale)
                print("✓ Synthetic data generation completed!")
                for table_name, df in self.synthetic_data.items():
                    print(f"  - {table_name}: {df.shape[0]} rows, {df.shape[1]} columns")
                return self.synthetic_data
            except Exception as e:
                print(f"✗ Generation error: {e}")
                raise

    synthesizer = RecursiveMultiTableSynthesizer()

    # Step 3: Add all tables with primary keys
    print("\n3️⃣ Adding tables to synthesizer...")
    table_configs = [
        ('organizations', 'org_id'),
        ('divisions', 'division_id'),
        ('departments', 'dept_id'),
        ('teams', 'team_id'),
        ('employees', 'employee_id'),
        ('projects', 'project_id'),
        ('tasks', 'task_id'),
        ('skills', 'skill_id'),
        ('certifications', 'certification_id'),
        ('training_programs', 'training_id'),
        ('employee_skills', 'emp_skill_id'),
        ('project_teams', 'assignment_id'),
        ('task_dependencies', 'dependency_id'),
        ('team_collaborations', 'collaboration_id'),
        ('employee_certifications', 'emp_cert_id'),
        ('training_enrollments', 'enrollment_id')
    ]

    for table_name, primary_key in table_configs:
        if table_name in enterprise_data:
            synthesizer.add_table_data(table_name, enterprise_data[table_name], primary_key)

    # Step 4: Add hierarchical relationships (7 levels)
    print("\n4️⃣ Adding hierarchical relationships...")
    hierarchical_relationships = [
        # Level 1 -> 2
        ('divisions', 'org_id', 'organizations', 'org_id'),
        # Level 2 -> 3
        ('departments', 'division_id', 'divisions', 'division_id'),
        # Level 3 -> 4
        ('teams', 'dept_id', 'departments', 'dept_id'),
        # Level 4 -> 5
        ('employees', 'team_id', 'teams', 'team_id'),
        # Level 5 -> 6
        ('projects', 'project_manager_id', 'employees', 'employee_id'),
        ('projects', 'sponsor_id', 'employees', 'employee_id'),
        # Level 6 -> 7
        ('tasks', 'project_id', 'projects', 'project_id'),
        ('tasks', 'assigned_to_id', 'employees', 'employee_id'),
        ('tasks', 'created_by_id', 'employees', 'employee_id'),
    ]

    for child_table, child_col, parent_table, parent_col in hierarchical_relationships:
        synthesizer.add_foreign_key_relationship(child_table, child_col, parent_table, parent_col)

    # Step 5: Add many-to-many relationships
    print("\n5️⃣ Adding many-to-many relationships...")
    many_to_many_relationships = [
        # Employee Skills
        ('employee_skills', 'employee_id', 'employees', 'employee_id'),
        ('employee_skills', 'skill_id', 'skills', 'skill_id'),
        # Project Teams
        ('project_teams', 'project_id', 'projects', 'project_id'),
        ('project_teams', 'employee_id', 'employees', 'employee_id'),
        # Task Dependencies
        ('task_dependencies', 'predecessor_task_id', 'tasks', 'task_id'),
        ('task_dependencies', 'successor_task_id', 'tasks', 'task_id'),
        ('task_dependencies', 'created_by_id', 'employees', 'employee_id'),
        # Team Collaborations
        ('team_collaborations', 'team_1_id', 'teams', 'team_id'),
        ('team_collaborations', 'team_2_id', 'teams', 'team_id'),
        ('team_collaborations', 'primary_contact_1', 'employees', 'employee_id'),
        ('team_collaborations', 'primary_contact_2', 'employees', 'employee_id'),
        # Employee Certifications
        ('employee_certifications', 'employee_id', 'employees', 'employee_id'),
        ('employee_certifications', 'certification_id', 'certifications', 'certification_id'),
        # Training Enrollments
        ('training_enrollments', 'employee_id', 'employees', 'employee_id'),
        ('training_enrollments', 'training_id', 'training_programs', 'training_id'),
        ('training_programs', 'trainer_id', 'employees', 'employee_id'),
        ('training_programs', 'department_id', 'departments', 'dept_id'),
    ]

    for child_table, child_col, parent_table, parent_col in many_to_many_relationships:
        synthesizer.add_foreign_key_relationship(child_table, child_col, parent_table, parent_col)

    # Step 6: Add recursive relationships
    print("\n6️⃣ Adding recursive relationships...")
    recursive_relationships = [
        ('organizations', 'parent_org_id', 'org_id'),
        ('employees', 'manager_id', 'employee_id'),
        ('employees', 'mentor_id', 'employee_id'),
        ('teams', 'parent_team_id', 'team_id'),
        ('teams', 'team_lead_id', 'employee_id'),
        ('projects', 'parent_project_id', 'project_id'),
        ('tasks', 'depends_on_task_id', 'task_id'),
        ('tasks', 'parent_task_id', 'task_id'),
    ]

    for table_name, foreign_key_col, primary_key_col in recursive_relationships:
        if foreign_key_col == primary_key_col:  # Self-referencing
            synthesizer.add_self_referencing_relationship(table_name, foreign_key_col, primary_key_col)
        else:  # Cross-referencing to other table
            synthesizer.add_foreign_key_relationship(table_name, foreign_key_col, 'employees', 'employee_id')

    print(f"\n✅ Setup complete!")
    print(f"📊 Total tables: {len(synthesizer.real_data)}")
    print(f"🔗 Total relationships: {len(synthesizer.relationships)}")

    return synthesizer, enterprise_data

In [None]:
# ==============================================================================
# RELATIONSHIP MAPPING REFERENCE
# ==============================================================================

def print_relationship_mapping():
    """
    Print detailed relationship mapping for reference
    """
    print("\n" + "="*80)
    print("RELATIONSHIP MAPPING REFERENCE")
    print("="*80)

    print("\n🏗️ HIERARCHICAL RELATIONSHIPS (7 Levels):")
    print("Level 1: Organizations (25)")
    print("Level 2: Divisions (150) → Organizations")
    print("Level 3: Departments (400) → Divisions")
    print("Level 4: Teams (800) → Departments")
    print("Level 5: Employees (2000) → Teams")
    print("Level 6: Projects (500) → Employees (managers/sponsors)")
    print("Level 7: Tasks (2500) → Projects")

    print("\n🔄 RECURSIVE RELATIONSHIPS:")
    print("• Organizations → parent_org_id (subsidiaries)")
    print("• Employees → manager_id (management hierarchy)")
    print("• Employees → mentor_id (mentorship relationships)")
    print("• Teams → parent_team_id (sub-teams)")
    print("• Projects → parent_project_id (sub-projects)")
    print("• Tasks → depends_on_task_id (task dependencies)")
    print("• Tasks → parent_task_id (subtasks)")

    print("\n🔗 MANY-TO-MANY RELATIONSHIPS:")
    print("• Employee Skills (6000): Employees ↔ Skills")
    print("• Project Teams (3000): Projects ↔ Employees")
    print("• Task Dependencies (1500): Tasks ↔ Tasks")
    print("• Team Collaborations (1200): Teams ↔ Teams")
    print("• Employee Certifications (800): Employees ↔ Certifications")
    print("• Training Enrollments (2000): Employees ↔ Training Programs")

    print("\n🔀 CROSS-REFERENCES:")
    print("• Teams → team_lead_id (Employees)")
    print("• Projects → project_manager_id, sponsor_id (Employees)")
    print("• Tasks → assigned_to_id, created_by_id (Employees)")
    print("• Training Programs → trainer_id (Employees), department_id (Departments)")
    print("• Task Dependencies → created_by_id (Employees)")
    print("• Team Collaborations → primary_contact_1, primary_contact_2 (Employees)")

    print("\n📊 COMPLEXITY METRICS:")
    print("• Total Tables: 16")
    print("• Total Rows: ~16,000")
    print("• Hierarchy Depth: 7 levels")
    print("• Recursive Types: 7")
    print("• Many-to-Many Tables: 6")
    print("• Cross-Reference Links: 15+")

    print("\n⚡ PERFORMANCE EXPECTATIONS:")
    print("• Training Time: 5-15 minutes")
    print("• Generation Time: 2-8 minutes")
    print("• Memory Usage: 4-12GB")
    print("• Recommended Scale: 0.3-0.7 for initial testing")

In [None]:
if __name__ == "__main__":
    # Run the example
    # - print_relationship_mapping()

    # Uncomment to run full example:
    synthesizer, data = setup_complex_synthesizer_example()
    #
    # # Train and generate (this will take several minutes)
    # print("\n7️⃣ Training synthesizer (this may take 5-15 minutes)...")
    synthesizer.fit()
    #
    # print("\n8️⃣ Generating synthetic data...")
    # synthetic_data = synthesizer.generate_synthetic_data(scale=0.5)
    #
    # print("\n🎉 Complex enterprise synthesis completed!")

COMPLEX ENTERPRISE DATA SYNTHESIS EXAMPLE

1️⃣ Creating complex enterprise dataset...
Creating complex enterprise data with 6+ depth levels...

✅ COMPLEX ENTERPRISE DATA CREATED
📊 Total tables: 16
📊 Total rows: 21,525

🏗️ HIERARCHY LEVELS:
  Level 1: Organizations (25 rows)
  Level 2: Divisions (150 rows)
  Level 3: Departments (400 rows)
  Level 4: Teams (800 rows)
  Level 5: Employees (2000 rows)
  Level 6: Projects (500 rows)
  Level 7: Tasks (2500 rows)

🔗 RELATIONSHIP TYPES:
  Hierarchical: 6 levels deep
  Many-to-Many: 6 tables
  Recursive: 4 types (orgs, employees, teams, tasks)
  Cross-references: Multiple per table

2️⃣ Initializing synthesizer...

3️⃣ Adding tables to synthesizer...

4️⃣ Adding hierarchical relationships...
✗ Error adding relationship: Unknown table name ('org_id').
✗ Error adding relationship: Unknown table name ('division_id').
✗ Error adding relationship: Unknown table name ('dept_id').
✗ Error adding relationship: Unknown table name ('team_id').
✗ Error a

Preprocess Tables: 100%|██████████| 16/16 [00:23<00:00,  1.46s/it]



Learning relationships:



Modeling Tables: 100%|██████████| 16/16 [00:11<00:00,  1.43it/s]

✓ Training completed successfully!





In [None]:
print("\n8️⃣ Generating synthetic data...")
synthetic_data = synthesizer.generate_synthetic_data(scale=0.5)
print("\n🎉 Complex enterprise synthesis completed!")


8️⃣ Generating synthetic data...
✓ Synthetic data generation completed!
  - employee_skills: 3000 rows, 9 columns
  - teams: 400 rows, 9 columns
  - organizations: 12 rows, 9 columns
  - training_programs: 75 rows, 9 columns
  - projects: 250 rows, 14 columns
  - employees: 1000 rows, 14 columns
  - task_dependencies: 750 rows, 8 columns
  - tasks: 1250 rows, 16 columns
  - project_teams: 1500 rows, 10 columns
  - team_collaborations: 600 rows, 10 columns
  - skills: 100 rows, 6 columns
  - divisions: 75 rows, 8 columns
  - certifications: 150 rows, 7 columns
  - training_enrollments: 1000 rows, 9 columns
  - departments: 200 rows, 8 columns
  - employee_certifications: 400 rows, 8 columns

🎉 Complex enterprise synthesis completed!


In [None]:
employee_skills_synth = synthetic_data['employee_skills']
teams_synth = synthetic_data['teams']
organizations_synth = synthetic_data['organizations']
training_programs_synth = synthetic_data['training_programs']
projects_synth = synthetic_data['projects']
employees_synth = synthetic_data['employees']
task_dependencies_synth = synthetic_data['task_dependencies']
tasks_synth = synthetic_data['tasks']
project_teams_synth = synthetic_data['project_teams']
team_collaborations_synth = synthetic_data['team_collaborations']
skills_synth = synthetic_data['skills']
divisions_synth = synthetic_data['divisions']
certifications_synth = synthetic_data['certifications']
training_enrollments_synth = synthetic_data['training_enrollments']
departments_synth = synthetic_data['departments']
employee_certifications_synth = synthetic_data['employee_certifications']

In [None]:
synth_data1={
    'employee_skills_synth': employee_skills_synth,
    'teams_synth': teams_synth,
    'organizations_synth': organizations_synth,
    'training_programs_synth': training_programs_synth,
    'projects_synth': projects_synth,
    'employees_synth': employees_synth,
    'task_dependencies_synth': task_dependencies_synth,
    'tasks_synth': tasks_synth,
    'project_teams_synth': project_teams_synth,
    'team_collaborations_synth': team_collaborations_synth,
    'skills_synth': skills_synth,
    'divisions_synth': divisions_synth,
    'certifications_synth': certifications_synth,
    'training_enrollments_synth': training_enrollments_synth,
    'departments_synth': departments_synth,
    'employee_certifications_synth': employee_certifications_synth
  }

In [None]:
metadata1 = Metadata.detect_from_dataframes(data=synth_data1)

In [None]:
metadata1

In [None]:
import numpy as np
from datetime import datetime, timedelta
import random
import string
import warnings
warnings.filterwarnings('ignore')

def generate_realistic_names(count):
    """Generate realistic first and last names"""
    first_names = [
        'James', 'Mary', 'John', 'Patricia', 'Robert', 'Jennifer', 'Michael', 'Linda',
        'William', 'Elizabeth', 'David', 'Barbara', 'Richard', 'Susan', 'Joseph', 'Jessica',
        'Thomas', 'Sarah', 'Christopher', 'Karen', 'Charles', 'Helen', 'Daniel', 'Nancy',
        'Matthew', 'Betty', 'Anthony', 'Dorothy', 'Mark', 'Lisa', 'Donald', 'Anna',
        'Steven', 'Kimberly', 'Paul', 'Deborah', 'Andrew', 'Rachel', 'Joshua', 'Carolyn',
        'Kenneth', 'Janet', 'Kevin', 'Catherine', 'Brian', 'Maria', 'George', 'Heather',
        'Timothy', 'Diane', 'Ronald', 'Ruth', 'Jason', 'Julie', 'Edward', 'Joyce',
        'Jeffrey', 'Virginia', 'Ryan', 'Victoria', 'Jacob', 'Kelly', 'Gary', 'Christina'
    ]

    last_names = [
        'Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis',
        'Rodriguez', 'Martinez', 'Hernandez', 'Lopez', 'Gonzalez', 'Wilson', 'Anderson',
        'Thomas', 'Taylor', 'Moore', 'Jackson', 'Martin', 'Lee', 'Perez', 'Thompson',
        'White', 'Harris', 'Sanchez', 'Clark', 'Ramirez', 'Lewis', 'Robinson', 'Walker',
        'Young', 'Allen', 'King', 'Wright', 'Scott', 'Torres', 'Nguyen', 'Hill',
        'Flores', 'Green', 'Adams', 'Nelson', 'Baker', 'Hall', 'Rivera', 'Campbell',
        'Mitchell', 'Carter', 'Roberts', 'Gomez', 'Phillips', 'Evans', 'Turner', 'Diaz'
    ]

    names = []
    for _ in range(count):
        first = random.choice(first_names)
        last = random.choice(last_names)
        names.append((first, last))
    return names

def generate_ssn():
    """Generate realistic but fake SSN"""
    return f"{random.randint(100, 999)}-{random.randint(10, 99)}-{random.randint(1000, 9999)}"

def generate_phone():
    """Generate realistic phone number"""
    area_codes = ['212', '213', '214', '215', '216', '303', '312', '313', '404', '415', '512', '602', '702', '713', '718', '801', '816', '972']
    area = random.choice(area_codes)
    exchange = random.randint(200, 999)
    number = random.randint(1000, 9999)
    return f"({area}) {exchange}-{number}"

def generate_email(first_name, last_name, domain=None):
    """Generate realistic email address"""
    domains = ['gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'company.com', 'corporation.net']
    if domain is None:
        domain = random.choice(domains)

    separators = ['.', '_', '']
    separator = random.choice(separators)
    number = random.randint(1, 999) if random.random() < 0.3 else ''

    email = f"{first_name.lower()}{separator}{last_name.lower()}{number}@{domain}"
    return email

def generate_address():
    """Generate realistic address"""
    street_numbers = random.randint(1, 9999)
    street_names = ['Main St', 'Oak Ave', 'Pine Rd', 'Maple Dr', 'Cedar Ln', 'Elm St', 'Park Ave', 'First St', 'Second St', 'Broadway']
    cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'San Antonio', 'San Diego', 'Dallas', 'San Jose']
    states = ['NY', 'CA', 'IL', 'TX', 'AZ', 'PA', 'FL', 'OH', 'NC', 'GA']

    street = f"{street_numbers} {random.choice(street_names)}"
    city = random.choice(cities)
    state = random.choice(states)
    zip_code = f"{random.randint(10000, 99999)}"

    return street, city, state, zip_code

def generate_credit_card():
    """Generate fake credit card number"""
    # Fake Visa format (starts with 4)
    first_four = "4000"
    remaining = ''.join([str(random.randint(0, 9)) for _ in range(12)])
    return f"{first_four}-{remaining[:4]}-{remaining[4:8]}-{remaining[8:]}"

def create_large_pii_dataset():
    """
    Create a large-scale test dataset with 15 tables, many-to-many relationships,
    50+ columns per table, some tables with 100K+ records, and realistic PII data.
    """

    print("🚀 Creating Large-Scale PII Test Dataset...")
    print("⚠️  WARNING: This dataset contains synthetic PII for testing purposes only!")

    np.random.seed(42)
    random.seed(42)

    # =====================================================================
    # TABLE 1: CUSTOMERS (150K records) - Large table with extensive PII
    # =====================================================================
    print("📊 Creating CUSTOMERS table (150,000 records)...")

    customer_names = generate_realistic_names(150000)
    customers_data = []

    for i in range(150000):
        first_name, last_name = customer_names[i]
        street, city, state, zip_code = generate_address()

        customer = {
            'customer_id': i + 1,
            'customer_uuid': f"cust-{random.randint(100000, 999999)}-{random.randint(1000, 9999)}",
            'first_name': first_name,
            'last_name': last_name,
            'full_name': f"{first_name} {last_name}",
            'middle_initial': random.choice(string.ascii_uppercase) if random.random() < 0.7 else None,
            'maiden_name': random.choice(['Anderson', 'Brown', 'Davis', 'Johnson', 'Smith']) if random.random() < 0.3 else None,
            'date_of_birth': pd.Timestamp('1950-01-01') + pd.Timedelta(days=random.randint(0, 25550)),  # 1950-2020
            'age': None,  # Will calculate
            'gender': random.choice(['Male', 'Female', 'Other', 'Prefer not to say']),
            'marital_status': random.choice(['Single', 'Married', 'Divorced', 'Widowed', 'Separated']),
            'ssn': generate_ssn(),
            'drivers_license': f"{random.choice(['NY', 'CA', 'TX', 'FL'])}{random.randint(10000000, 99999999)}",
            'passport_number': f"{''.join(random.choices(string.ascii_uppercase, k=2))}{random.randint(1000000, 9999999)}",
            'email_primary': generate_email(first_name, last_name),
            'email_secondary': generate_email(first_name, last_name) if random.random() < 0.4 else None,
            'phone_primary': generate_phone(),
            'phone_secondary': generate_phone() if random.random() < 0.5 else None,
            'phone_work': generate_phone() if random.random() < 0.6 else None,
            'address_street': street,
            'address_city': city,
            'address_state': state,
            'address_zip': zip_code,
            'address_country': 'USA',
            'mailing_address_same': random.choice([True, False]),
            'mailing_street': street if random.random() < 0.8 else f"{random.randint(1, 9999)} {random.choice(['Oak Ave', 'Pine Rd'])}",
            'mailing_city': city if random.random() < 0.8 else random.choice(['Boston', 'Seattle', 'Denver']),
            'mailing_state': state if random.random() < 0.8 else random.choice(['MA', 'WA', 'CO']),
            'mailing_zip': zip_code if random.random() < 0.8 else f"{random.randint(10000, 99999)}",
            'emergency_contact_name': f"{random.choice(['Alice', 'Bob', 'Carol', 'David'])} {last_name}",
            'emergency_contact_phone': generate_phone(),
            'emergency_contact_relationship': random.choice(['Spouse', 'Parent', 'Sibling', 'Child', 'Friend']),
            'occupation': random.choice(['Engineer', 'Teacher', 'Doctor', 'Lawyer', 'Manager', 'Analyst', 'Sales Rep', 'Consultant']),
            'employer': f"Company_{random.randint(1, 1000)}",
            'annual_income': random.randint(25000, 250000),
            'credit_score': random.randint(300, 850),
            'customer_since': pd.Timestamp('2010-01-01') + pd.Timedelta(days=random.randint(0, 5110)),
            'account_status': random.choice(['Active', 'Inactive', 'Suspended', 'Closed']),
            'vip_status': random.choice(['Gold', 'Silver', 'Bronze', 'Standard']),
            'marketing_consent': random.choice([True, False]),
            'data_sharing_consent': random.choice([True, False]),
            'last_login': pd.Timestamp('2024-01-01') + pd.Timedelta(days=random.randint(-90, 0)),
            'ip_address_last': f"{random.randint(192, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}",
            'browser_fingerprint': ''.join(random.choices(string.ascii_letters + string.digits, k=32)),
            'device_id': f"device-{random.randint(100000, 999999)}",
            'preferred_language': random.choice(['English', 'Spanish', 'French', 'German', 'Chinese']),
            'time_zone': random.choice(['EST', 'PST', 'CST', 'MST']),
            'communication_preference': random.choice(['Email', 'Phone', 'SMS', 'Mail']),
            'newsletter_subscription': random.choice([True, False]),
            'referral_source': random.choice(['Google', 'Facebook', 'Friend', 'Advertisement', 'Other']),
            'loyalty_points': random.randint(0, 50000),
            'lifetime_value': round(random.uniform(100, 50000), 2),
            'risk_score': random.randint(1, 100),
            'fraud_alerts': random.randint(0, 5),
            'notes': f"Customer notes for {first_name} {last_name}" if random.random() < 0.3 else None,
            'created_date': pd.Timestamp('2010-01-01') + pd.Timedelta(days=random.randint(0, 5110)),
            'updated_date': pd.Timestamp('2024-01-01') + pd.Timedelta(days=random.randint(-30, 0)),
            'created_by': f"user_{random.randint(1, 100)}",
            'updated_by': f"user_{random.randint(1, 100)}"
        }

        # Calculate age
        today = pd.Timestamp.now()
        customer['age'] = (today - customer['date_of_birth']).days // 365

        customers_data.append(customer)

    customers = pd.DataFrame(customers_data)

    # =====================================================================
    # TABLE 2: EMPLOYEES (25K records) - Extensive employee PII
    # =====================================================================
    print("📊 Creating EMPLOYEES table (25,000 records)...")

    employee_names = generate_realistic_names(25000)
    employees_data = []

    for i in range(25000):
        first_name, last_name = employee_names[i]
        street, city, state, zip_code = generate_address()

        employee = {
            'employee_id': i + 1,
            'employee_number': f"EMP{i+1:06d}",
            'badge_id': f"BDG{random.randint(100000, 999999)}",
            'first_name': first_name,
            'last_name': last_name,
            'full_name': f"{first_name} {last_name}",
            'preferred_name': first_name if random.random() < 0.9 else random.choice(['Alex', 'Sam', 'Chris', 'Jordan']),
            'middle_name': random.choice(['James', 'Marie', 'Lee', 'Ann', 'Michael']) if random.random() < 0.6 else None,
            'ssn': generate_ssn(),
            'date_of_birth': pd.Timestamp('1960-01-01') + pd.Timedelta(days=random.randint(0, 20075)),
            'age': None,  # Will calculate
            'gender': random.choice(['Male', 'Female', 'Non-binary', 'Prefer not to say']),
            'ethnicity': random.choice(['White', 'Black', 'Hispanic', 'Asian', 'Native American', 'Other', 'Prefer not to say']),
            'nationality': random.choice(['US Citizen', 'Permanent Resident', 'Work Visa', 'Other']),
            'emergency_contact_1_name': f"{random.choice(['Alice', 'Bob', 'Carol', 'David'])} {last_name}",
            'emergency_contact_1_phone': generate_phone(),
            'emergency_contact_1_relationship': random.choice(['Spouse', 'Parent', 'Sibling', 'Child']),
            'emergency_contact_2_name': f"{random.choice(['Eve', 'Frank', 'Grace', 'Henry'])} Smith",
            'emergency_contact_2_phone': generate_phone(),
            'emergency_contact_2_relationship': random.choice(['Friend', 'Relative', 'Neighbor']),
            'personal_email': generate_email(first_name, last_name),
            'work_email': generate_email(first_name, last_name, 'company.com'),
            'personal_phone': generate_phone(),
            'work_phone': generate_phone(),
            'mobile_phone': generate_phone(),
            'home_address_street': street,
            'home_address_city': city,
            'home_address_state': state,
            'home_address_zip': zip_code,
            'work_location': random.choice(['HQ Building A', 'HQ Building B', 'Remote', 'Branch Office 1', 'Branch Office 2']),
            'department_id': random.randint(1, 50),
            'job_title': random.choice(['Software Engineer', 'Data Analyst', 'Product Manager', 'Sales Rep', 'HR Specialist']),
            'job_level': random.choice(['Junior', 'Mid', 'Senior', 'Lead', 'Principal', 'Director']),
            'manager_id': random.randint(1, 1000) if random.random() < 0.8 else None,
            'hire_date': pd.Timestamp('2010-01-01') + pd.Timedelta(days=random.randint(0, 5110)),
            'start_date': pd.Timestamp('2010-01-01') + pd.Timedelta(days=random.randint(0, 5110)),
            'employment_type': random.choice(['Full-time', 'Part-time', 'Contract', 'Intern']),
            'employment_status': random.choice(['Active', 'On Leave', 'Terminated', 'Retired']),
            'salary_annual': random.randint(40000, 300000),
            'salary_hourly': None,  # Will calculate for hourly employees
            'bonus_target': random.randint(5000, 50000),
            'commission_rate': round(random.uniform(0, 0.15), 3) if random.random() < 0.3 else None,
            'benefits_401k': random.choice([True, False]),
            'benefits_health_insurance': random.choice([True, False]),
            'benefits_dental': random.choice([True, False]),
            'benefits_vision': random.choice([True, False]),
            'vacation_days_accrued': random.randint(0, 30),
            'sick_days_accrued': random.randint(0, 10),
            'performance_rating': random.choice(['Exceeds', 'Meets', 'Below', 'Outstanding']),
            'last_review_date': pd.Timestamp('2023-01-01') + pd.Timedelta(days=random.randint(0, 365)),
            'next_review_date': pd.Timestamp('2024-01-01') + pd.Timedelta(days=random.randint(0, 365)),
            'security_clearance': random.choice(['None', 'Public Trust', 'Secret', 'Top Secret']) if random.random() < 0.2 else 'None',
            'training_completed': random.randint(0, 20),
            'certifications_count': random.randint(0, 5),
            'languages_spoken': random.choice(['English', 'English, Spanish', 'English, French', 'English, Mandarin']),
            'education_level': random.choice(['High School', 'Associates', 'Bachelors', 'Masters', 'PhD']),
            'university': random.choice(['Harvard', 'MIT', 'Stanford', 'Berkeley', 'NYU', 'State University']),
            'graduation_year': random.randint(1980, 2024),
            'previous_employer': f"Previous Company {random.randint(1, 100)}" if random.random() < 0.7 else None,
            'background_check_date': pd.Timestamp('2020-01-01') + pd.Timedelta(days=random.randint(0, 1460)),
            'drug_test_date': pd.Timestamp('2020-01-01') + pd.Timedelta(days=random.randint(0, 1460)),
            'i9_form_completed': random.choice([True, False]),
            'w4_form_completed': random.choice([True, False]),
            'direct_deposit_account': f"****{random.randint(1000, 9999)}",
            'direct_deposit_routing': f"0{random.randint(10000000, 99999999)}",
            'tax_state': state,
            'withholding_allowances': random.randint(0, 5),
            'created_date': pd.Timestamp('2010-01-01') + pd.Timedelta(days=random.randint(0, 5110)),
            'updated_date': pd.Timestamp('2024-01-01') + pd.Timedelta(days=random.randint(-30, 0))
        }

        # Calculate age
        today = pd.Timestamp.now()
        employee['age'] = (today - employee['date_of_birth']).days // 365

        # Calculate hourly rate for part-time employees
        if employee['employment_type'] in ['Part-time', 'Contract']:
            employee['salary_hourly'] = round(employee['salary_annual'] / 2080, 2)

        employees_data.append(employee)

    employees = pd.DataFrame(employees_data)

    # =====================================================================
    # TABLE 3: TRANSACTIONS (500K records) - Financial transaction data
    # =====================================================================
    print("📊 Creating TRANSACTIONS table (500,000 records)...")

    transactions_data = []
    for i in range(500000):
        transaction = {
            'transaction_id': i + 1,
            'transaction_uuid': f"txn-{random.randint(100000, 999999)}-{random.randint(1000, 9999)}",
            'customer_id': random.randint(1, 150000),
            'account_id': random.randint(1, 200000),
            'card_id': random.randint(1, 300000),
            'merchant_id': random.randint(1, 10000),
            'transaction_type': random.choice(['Purchase', 'Refund', 'Transfer', 'Withdrawal', 'Deposit', 'Fee']),
            'transaction_category': random.choice(['Groceries', 'Gas', 'Restaurant', 'Shopping', 'Entertainment', 'Bills', 'Healthcare']),
            'amount': round(random.uniform(-1000, 5000), 2),
            'currency': random.choice(['USD', 'EUR', 'GBP', 'CAD']),
            'exchange_rate': round(random.uniform(0.8, 1.2), 4) if random.random() < 0.1 else 1.0,
            'amount_usd': None,  # Will calculate
            'fee_amount': round(random.uniform(0, 50), 2) if random.random() < 0.3 else 0,
            'tax_amount': round(random.uniform(0, 100), 2) if random.random() < 0.4 else 0,
            'tip_amount': round(random.uniform(0, 50), 2) if random.random() < 0.2 else 0,
            'transaction_date': pd.Timestamp('2020-01-01') + pd.Timedelta(days=random.randint(0, 1460)),
            'transaction_time': f"{random.randint(0, 23):02d}:{random.randint(0, 59):02d}:{random.randint(0, 59):02d}",
            'posted_date': None,  # Will calculate
            'settlement_date': None,  # Will calculate
            'authorization_code': f"AUTH{random.randint(100000, 999999)}",
            'reference_number': f"REF{random.randint(10000000, 99999999)}",
            'batch_id': f"BATCH{random.randint(1000, 9999)}",
            'terminal_id': f"TERM{random.randint(10000, 99999)}",
            'merchant_name': f"Merchant_{random.randint(1, 1000)}",
            'merchant_category_code': random.randint(1000, 9999),
            'merchant_address': f"{random.randint(1, 999)} Commerce St",
            'merchant_city': random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston']),
            'merchant_state': random.choice(['NY', 'CA', 'IL', 'TX']),
            'merchant_zip': f"{random.randint(10000, 99999)}",
            'merchant_country': 'USA',
            'payment_method': random.choice(['Credit Card', 'Debit Card', 'ACH', 'Wire Transfer', 'Check', 'Cash']),
            'card_type': random.choice(['Visa', 'Mastercard', 'Amex', 'Discover']) if random.random() < 0.8 else None,
            'card_last_four': f"{random.randint(1000, 9999)}",
            'card_present': random.choice([True, False]),
            'online_transaction': random.choice([True, False]),
            'mobile_transaction': random.choice([True, False]),
            'contactless_payment': random.choice([True, False]),
            'recurring_transaction': random.choice([True, False]),
            'transaction_status': random.choice(['Completed', 'Pending', 'Failed', 'Cancelled', 'Disputed']),
            'failure_reason': random.choice(['Insufficient Funds', 'Card Declined', 'Expired Card', None]) if random.random() < 0.1 else None,
            'fraud_score': random.randint(1, 100),
            'fraud_flag': random.choice([True, False]),
            'risk_level': random.choice(['Low', 'Medium', 'High']),
            'ip_address': f"{random.randint(192, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}",
            'user_agent': random.choice(['Chrome/91.0', 'Firefox/89.0', 'Safari/14.1', 'Edge/91.0']),
            'device_fingerprint': ''.join(random.choices(string.ascii_letters + string.digits, k=32)),
            'location_lat': round(random.uniform(25.0, 49.0), 6),
            'location_long': round(random.uniform(-125.0, -66.0), 6),
            'description': f"Transaction description {i+1}",
            'notes': f"Transaction notes {i+1}" if random.random() < 0.2 else None,
            'reconciled': random.choice([True, False]),
            'reconciled_date': None,  # Will calculate for reconciled transactions
            'created_by': 'system',
            'created_date': pd.Timestamp('2020-01-01') + pd.Timedelta(days=random.randint(0, 1460)),
            'updated_date': pd.Timestamp('2024-01-01') + pd.Timedelta(days=random.randint(-30, 0))
        }

        # Calculate USD amount
        transaction['amount_usd'] = round(transaction['amount'] * transaction['exchange_rate'], 2)

        # Calculate posted and settlement dates
        transaction['posted_date'] = transaction['transaction_date'] + pd.Timedelta(days=random.randint(0, 2))
        transaction['settlement_date'] = transaction['posted_date'] + pd.Timedelta(days=random.randint(1, 3))

        # Calculate reconciled date for reconciled transactions
        if transaction['reconciled']:
            transaction['reconciled_date'] = transaction['settlement_date'] + pd.Timedelta(days=random.randint(1, 30))

        transactions_data.append(transaction)

    transactions = pd.DataFrame(transactions_data)

    # =====================================================================
    # TABLE 4: MEDICAL_RECORDS (75K records) - Highly sensitive medical PII
    # =====================================================================
    print("📊 Creating MEDICAL_RECORDS table (75,000 records)...")

    medical_data = []
    for i in range(75000):
        patient_id = random.randint(1, 150000)  # Link to customers

        medical_record = {
            'record_id': i + 1,
            'patient_id': patient_id,
            'medical_record_number': f"MRN{i+1:08d}",
            'patient_ssn': generate_ssn(),
            'insurance_id': f"INS{random.randint(100000000, 999999999)}",
            'policy_number': f"POL{random.randint(10000000, 99999999)}",
            'group_number': f"GRP{random.randint(1000, 9999)}",
            'primary_care_physician': f"Dr. {random.choice(['Smith', 'Johnson', 'Williams', 'Brown', 'Davis'])}",
            'physician_npi': f"{random.randint(1000000000, 9999999999)}",
            'facility_name': f"Medical Center {random.randint(1, 100)}",
            'facility_address': f"{random.randint(1, 999)} Health St",
            'facility_city': random.choice(['Boston', 'Houston', 'Phoenix', 'Chicago']),
            'facility_state': random.choice(['MA', 'TX', 'AZ', 'IL']),
            'facility_zip': f"{random.randint(10000, 99999)}",
            'visit_date': pd.Timestamp('2020-01-01') + pd.Timedelta(days=random.randint(0, 1460)),
            'admission_date': pd.Timestamp('2020-01-01') + pd.Timedelta(days=random.randint(0, 1460)),
            'discharge_date': None,  # Will calculate for some records
            'visit_type': random.choice(['Routine Checkup', 'Emergency', 'Surgery', 'Consultation', 'Follow-up']),
            'chief_complaint': random.choice(['Chest pain', 'Headache', 'Fever', 'Back pain', 'Shortness of breath']),
            'diagnosis_primary': random.choice(['Hypertension', 'Diabetes', 'Anxiety', 'Depression', 'Arthritis']),
            'diagnosis_secondary': random.choice(['High cholesterol', 'Obesity', 'Sleep apnea', 'Allergies', None]),
            'icd10_primary': random.choice(['I10', 'E11.9', 'F41.9', 'F32.9', 'M19.90']),
            'icd10_secondary': random.choice(['E78.5', 'E66.9', 'G47.33', 'T78.40XA', None]),
            'procedure_performed': random.choice(['Blood test', 'X-ray', 'MRI', 'CT scan', 'Surgery', None]),
            'procedure_code': random.choice(['80053', '73610', '70551', '74150', '64721', None]),
            'medications_prescribed': random.choice(['Lisinopril', 'Metformin', 'Alprazolam', 'Sertraline', 'Ibuprofen']),
            'medication_dosage': random.choice(['5mg daily', '500mg twice daily', '0.5mg as needed', '50mg daily', '200mg as needed']),
            'allergies': random.choice(['Penicillin', 'Shellfish', 'Latex', 'Peanuts', 'None known']),
            'vital_signs_bp_systolic': random.randint(90, 180),
            'vital_signs_bp_diastolic': random.randint(60, 120),
            'vital_signs_heart_rate': random.randint(60, 120),
            'vital_signs_temperature': round(random.uniform(96.0, 104.0), 1),
            'vital_signs_weight': random.randint(100, 300),
            'vital_signs_height': random.randint(60, 80),
            'vital_signs_bmi': None,  # Will calculate
            'lab_results_glucose': random.randint(70, 200),
            'lab_results_cholesterol': random.randint(150, 300),
            'lab_results_hdl': random.randint(30, 80),
            'lab_results_ldl': random.randint(70, 200),
            'lab_results_triglycerides': random.randint(50, 400),
            'lab_results_hemoglobin': round(random.uniform(10.0, 18.0), 1),
            'lab_results_hematocrit': round(random.uniform(30.0, 55.0), 1),
            'family_history_diabetes': random.choice([True, False]),
            'family_history_heart_disease': random.choice([True, False]),
            'family_history_cancer': random.choice([True, False]),
            'social_history_smoking': random.choice(['Never', 'Former', 'Current']),
            'social_history_alcohol': random.choice(['None', 'Occasional', 'Regular', 'Heavy']),
            'social_history_drugs': random.choice(['None', 'Former', 'Current']),
            'emergency_contact_name': f"{random.choice(['Alice', 'Bob', 'Carol'])} Emergency",
            'emergency_contact_phone': generate_phone(),
            'emergency_contact_relationship': random.choice(['Spouse', 'Parent', 'Sibling', 'Child']),
            'insurance_company': random.choice(['Blue Cross', 'Aetna', 'Cigna', 'UnitedHealth', 'Humana']),
            'insurance_phone': generate_phone(),
            'copay_amount': random.choice([10, 20, 30, 40, 50]),
            'deductible_amount': random.choice([500, 1000, 2500, 5000]),
            'out_of_pocket_max': random.choice([3000, 5000, 7500, 10000]),
            'notes': f"Medical notes for record {i+1}" if random.random() < 0.3 else None,
            'hipaa_authorization': random.choice([True, False]),
            'research_consent': random.choice([True, False]),
            'data_sharing_consent': random.choice([True, False]),
            'created_by': f"provider_{random.randint(1, 100)}",
            'created_date': pd.Timestamp('2020-01-01') + pd.Timedelta(days=random.randint(0, 1460)),
            'updated_date': pd.Timestamp('2024-01-01') + pd.Timedelta(days=random.randint(-30, 0))
        }

        # Calculate BMI
        height_inches = medical_record['vital_signs_height']
        weight_lbs = medical_record['vital_signs_weight']
        medical_record['vital_signs_bmi'] = round((weight_lbs / (height_inches ** 2)) * 703, 1)

        # Calculate discharge date for some records
        if random.random() < 0.3:
            medical_record['discharge_date'] = medical_record['admission_date'] + pd.Timedelta(days=random.randint(1, 10))

        medical_data.append(medical_record)

    medical_records = pd.DataFrame(medical_data)

    # =====================================================================
    # TABLE 5: FINANCIAL_ACCOUNTS (200K records) - Banking/financial PII
    # =====================================================================
    print("📊 Creating FINANCIAL_ACCOUNTS table (200,000 records)...")

    accounts_data = []
    for i in range(200000):
        account = {
            'account_id': i + 1,
            'account_number': f"{random.randint(1000000000, 9999999999)}",
            'routing_number': f"0{random.randint(10000000, 99999999)}",
            'customer_id': random.randint(1, 150000),
            'account_type': random.choice(['Checking', 'Savings', 'Credit Card', 'Investment', 'Loan', 'Mortgage']),
            'account_subtype': random.choice(['Personal', 'Business', 'Joint', 'Trust', 'Student']),
            'account_name': f"Account_{i+1}",
            'product_code': f"PROD{random.randint(1000, 9999)}",
            'interest_rate': round(random.uniform(0.01, 5.0), 3),
            'balance_current': round(random.uniform(-50000, 500000), 2),
            'balance_available': None,  # Will calculate
            'balance_pending': round(random.uniform(0, 5000), 2),
            'credit_limit': random.randint(1000, 50000) if random.random() < 0.4 else None,
            'minimum_balance': random.choice([0, 100, 500, 1000, 2500]),
            'overdraft_limit': random.randint(0, 1000) if random.random() < 0.3 else 0,
            'monthly_fee': random.choice([0, 5, 10, 15, 25]),
            'transaction_limit_daily': random.randint(500, 10000),
            'withdrawal_limit_daily': random.randint(300, 3000),
            'account_holder_1_name': f"Account Holder {random.randint(1, 150000)}",
            'account_holder_1_ssn': generate_ssn(),
            'account_holder_1_role': random.choice(['Primary', 'Joint', 'Authorized User']),
            'account_holder_2_name': f"Joint Holder {random.randint(1, 150000)}" if random.random() < 0.2 else None,
            'account_holder_2_ssn': generate_ssn() if random.random() < 0.2 else None,
            'account_holder_2_role': 'Joint' if random.random() < 0.2 else None,
            'beneficiary_1_name': f"Beneficiary {random.randint(1, 10000)}" if random.random() < 0.3 else None,
            'beneficiary_1_ssn': generate_ssn() if random.random() < 0.3 else None,
            'beneficiary_1_percentage': random.randint(25, 100) if random.random() < 0.3 else None,
            'beneficiary_2_name': f"Beneficiary {random.randint(1, 10000)}" if random.random() < 0.15 else None,
            'beneficiary_2_ssn': generate_ssn() if random.random() < 0.15 else None,
            'beneficiary_2_percentage': random.randint(25, 75) if random.random() < 0.15 else None,
            'branch_code': f"BR{random.randint(1000, 9999)}",
            'branch_name': f"Branch {random.randint(1, 500)}",
            'branch_address': f"{random.randint(1, 999)} Bank St",
            'branch_city': random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston']),
            'branch_state': random.choice(['NY', 'CA', 'IL', 'TX']),
            'branch_zip': f"{random.randint(10000, 99999)}",
            'account_manager': f"Manager_{random.randint(1, 1000)}",
            'account_manager_phone': generate_phone(),
            'account_manager_email': f"manager{random.randint(1, 1000)}@bank.com",
            'opened_date': pd.Timestamp('2010-01-01') + pd.Timedelta(days=random.randint(0, 5110)),
            'closed_date': None,  # Some accounts closed
            'account_status': random.choice(['Active', 'Inactive', 'Closed', 'Frozen', 'Suspended']),
            'status_reason': random.choice(['Good Standing', 'Dormant', 'Customer Request', 'Compliance', None]),
            'last_transaction_date': pd.Timestamp('2024-01-01') + pd.Timedelta(days=random.randint(-90, 0)),
            'statement_frequency': random.choice(['Monthly', 'Quarterly', 'Annual']),
            'statement_delivery': random.choice(['Electronic', 'Mail', 'Both']),
            'electronic_statements': random.choice([True, False]),
            'mobile_banking': random.choice([True, False]),
            'online_banking': random.choice([True, False]),
            'debit_card_issued': random.choice([True, False]),
            'debit_card_number': generate_credit_card() if random.random() < 0.7 else None,
            'debit_card_expiry': pd.Timestamp('2025-01-01') + pd.DateOffset(years=random.randint(1, 5)) if random.random() < 0.7 else None,
            'checks_ordered': random.randint(0, 10),
            'direct_deposits': random.randint(0, 5),
            'automatic_payments': random.randint(0, 8),
            'overdraft_protection': random.choice([True, False]),
            'fraud_monitoring': random.choice([True, False]),
            'travel_notifications': random.choice([True, False]),
            'tax_reporting_1099': random.choice([True, False]),
            'escheatment_eligible': random.choice([True, False]),
            'kyc_completed': random.choice([True, False]),
            'kyc_date': pd.Timestamp('2020-01-01') + pd.Timedelta(days=random.randint(0, 1460)),
            'aml_review_date': pd.Timestamp('2023-01-01') + pd.Timedelta(days=random.randint(0, 365)),
            'risk_rating': random.choice(['Low', 'Medium', 'High']),
            'created_by': f"banker_{random.randint(1, 500)}",
            'created_date': pd.Timestamp('2010-01-01') + pd.Timedelta(days=random.randint(0, 5110)),
            'updated_date': pd.Timestamp('2024-01-01') + pd.Timedelta(days=random.randint(-30, 0))
        }

        # Calculate available balance
        account['balance_available'] = account['balance_current'] - account['balance_pending']

        # Set closed date for closed accounts
        if account['account_status'] == 'Closed':
            account['closed_date'] = account['opened_date'] + pd.Timedelta(days=random.randint(30, 1825))

        accounts_data.append(account)

    financial_accounts = pd.DataFrame(accounts_data)

    # =====================================================================
    # TABLES 6-15: Supporting tables with many-to-many relationships
    # =====================================================================

    # TABLE 6: PRODUCTS (10K records)
    print("📊 Creating supporting tables...")
    products = pd.DataFrame({
        'product_id': range(1, 10001),
        'product_code': [f"PROD{i:06d}" for i in range(1, 10001)],
        'product_name': [f"Product_{i}" for i in range(1, 10001)],
        'category': np.random.choice(['Electronics', 'Clothing', 'Home', 'Sports', 'Books'], 10000),
        'subcategory': np.random.choice(['Phones', 'Laptops', 'Shirts', 'Furniture', 'Fiction'], 10000),
        'brand': np.random.choice(['BrandA', 'BrandB', 'BrandC', 'BrandD'], 10000),
        'price': np.round(np.random.uniform(5, 2000), 2),
        'cost': np.round(np.random.uniform(2, 1000), 2),
        'weight': np.round(np.random.uniform(0.1, 50), 2),
        'dimensions': [f"{random.randint(1, 100)}x{random.randint(1, 100)}x{random.randint(1, 100)}" for _ in range(10000)],
        'sku': [f"SKU{i:08d}" for i in range(1, 10001)],
        'upc': [f"{random.randint(100000000000, 999999999999)}" for _ in range(10000)],
        'manufacturer': [f"Manufacturer_{random.randint(1, 500)}" for _ in range(10000)],
        'supplier_id': np.random.randint(1, 1000, 10000),
        'inventory_count': np.random.randint(0, 1000, 10000),
        'reorder_point': np.random.randint(10, 100, 10000),
        'discontinued': np.random.choice([True, False], 10000, p=[0.1, 0.9])
    })

    # TABLE 7: ORDERS (300K records)
    orders = pd.DataFrame({
        'order_id': range(1, 300001),
        'customer_id': np.random.choice(range(1, 150001), 300000),
        'order_number': [f"ORD{i:08d}" for i in range(1, 300001)],
        'order_date': pd.date_range('2020-01-01', periods=300000, freq='3min'),
        'ship_date': pd.date_range('2020-01-01', periods=300000, freq='3min') + pd.Timedelta(days=2),
        'delivery_date': pd.date_range('2020-01-01', periods=300000, freq='3min') + pd.Timedelta(days=5),
        'order_status': np.random.choice(['Pending', 'Processing', 'Shipped', 'Delivered', 'Cancelled'], 300000),
        'payment_status': np.random.choice(['Pending', 'Paid', 'Failed', 'Refunded'], 300000),
        'shipping_method': np.random.choice(['Standard', 'Express', 'Overnight', 'Ground'], 300000),
        'shipping_cost': np.round(np.random.uniform(0, 50), 2),
        'tax_amount': np.round(np.random.uniform(0, 100), 2),
        'discount_amount': np.round(np.random.uniform(0, 50), 2),
        'total_amount': np.round(np.random.uniform(10, 1000), 2),
        'billing_address_id': np.random.randint(1, 100000, 300000),
        'shipping_address_id': np.random.randint(1, 100000, 300000),
        'tracking_number': [f"TRK{random.randint(100000000, 999999999)}" for _ in range(300000)],
        'carrier': np.random.choice(['UPS', 'FedEx', 'USPS', 'DHL'], 300000),
        'notes': [f"Order notes {i}" if random.random() < 0.2 else None for i in range(300000)]
    })

    # TABLE 8: ORDER_ITEMS (800K records) - Many-to-many: Orders × Products
    order_items = pd.DataFrame({
        'order_item_id': range(1, 800001),
        'order_id': np.random.choice(range(1, 300001), 800000),
        'product_id': np.random.choice(range(1, 10001), 800000),
        'quantity': np.random.randint(1, 10, 800000),
        'unit_price': np.round(np.random.uniform(5, 500), 2),
        'line_total': np.round(np.random.uniform(5, 2000), 2),
        'discount_applied': np.round(np.random.uniform(0, 100), 2),
        'tax_rate': np.round(np.random.uniform(0, 0.15), 4),
        'gift_wrap': np.random.choice([True, False], 800000, p=[0.2, 0.8]),
        'personalization': [f"Custom text {i}" if random.random() < 0.1 else None for i in range(800000)],
        'serial_number': [f"SN{random.randint(100000, 999999)}" if random.random() < 0.3 else None for _ in range(800000)],
        'warranty_months': np.random.choice([0, 12, 24, 36], 800000),
        'return_eligible': np.random.choice([True, False], 800000, p=[0.9, 0.1]),
        'return_deadline': pd.date_range('2020-01-01', periods=800000, freq='2min') + pd.Timedelta(days=30)
    })

    # TABLE 9: DEPARTMENTS (100 records)
    departments = pd.DataFrame({
        'department_id': range(1, 101),
        'department_name': [f"Department_{i}" for i in range(1, 101)],
        'department_code': [f"DEPT{i:03d}" for i in range(1, 101)],
        'manager_id': np.random.choice(range(1, 1001), 100),
        'budget_annual': np.random.randint(100000, 10000000, 100),
        'cost_center': [f"CC{random.randint(1000, 9999)}" for _ in range(100)],
        'location': np.random.choice(['Building A', 'Building B', 'Building C', 'Remote'], 100),
        'floor_number': np.random.randint(1, 20, 100),
        'phone_number': [generate_phone() for _ in range(100)],
        'email': [f"dept{i}@company.com" for i in range(1, 101)],
        'headcount_target': np.random.randint(5, 200, 100),
        'headcount_actual': np.random.randint(3, 180, 100),
        'established_date': pd.date_range('2000-01-01', '2020-01-01', periods=100),
        'status': np.random.choice(['Active', 'Inactive', 'Restructuring'], 100, p=[0.8, 0.1, 0.1])
    })

    # TABLE 10: PROJECTS (15K records)
    projects = pd.DataFrame({
        'project_id': range(1, 15001),
        'project_name': [f"Project_{i}" for i in range(1, 15001)],
        'project_code': [f"PRJ{i:05d}" for i in range(1, 15001)],
        'department_id': np.random.choice(range(1, 101), 15000),
        'project_manager_id': np.random.choice(range(1, 25001), 15000),
        'sponsor_id': np.random.choice(range(1, 25001), 15000),
        'project_type': np.random.choice(['Development', 'Research', 'Implementation', 'Maintenance'], 15000),
        'priority': np.random.choice(['Low', 'Medium', 'High', 'Critical'], 15000),
        'status': np.random.choice(['Planning', 'Active', 'On Hold', 'Completed', 'Cancelled'], 15000),
        'budget_allocated': np.random.randint(10000, 5000000, 15000),
        'budget_spent': np.random.randint(0, 4000000, 15000),
        'start_date': pd.date_range('2022-01-01', '2024-01-01', periods=15000),
        'planned_end_date': pd.date_range('2023-01-01', '2025-01-01', periods=15000),
        'actual_end_date': [pd.Timestamp('2024-01-01') + pd.Timedelta(days=random.randint(0, 365)) if random.random() < 0.4 else None for _ in range(15000)],
        'completion_percentage': np.random.randint(0, 100, 15000),
        'risk_level': np.random.choice(['Low', 'Medium', 'High'], 15000),
        'stakeholder_count': np.random.randint(3, 50, 15000),
        'deliverables_count': np.random.randint(1, 20, 15000),
        'milestones_total': np.random.randint(3, 15, 15000),
        'milestones_completed': np.random.randint(0, 15, 15000)
    })

    # TABLE 11: EMPLOYEE_PROJECTS (50K records) - Many-to-many: Employees × Projects
    employee_projects = pd.DataFrame({
        'assignment_id': range(1, 50001),
        'employee_id': np.random.choice(range(1, 25001), 50000),
        'project_id': np.random.choice(range(1, 15001), 50000),
        'role': np.random.choice(['Developer', 'Analyst', 'Tester', 'Designer', 'Lead', 'Consultant'], 50000),
        'allocation_percentage': np.random.randint(10, 100, 50000),
        'hourly_rate': np.round(np.random.uniform(25, 200), 2),
        'start_date': pd.date_range('2022-01-01', '2024-01-01', periods=50000),
        'end_date': [pd.Timestamp('2024-01-01') + pd.Timedelta(days=random.randint(0, 730)) if random.random() < 0.3 else None for _ in range(50000)],
        'hours_logged': np.random.randint(0, 2000, 50000),
        'performance_rating': np.random.choice(['Excellent', 'Good', 'Satisfactory', 'Needs Improvement'], 50000),
        'billable': np.random.choice([True, False], 50000, p=[0.8, 0.2]),
        'overtime_eligible': np.random.choice([True, False], 50000),
        'access_level': np.random.choice(['Read', 'Write', 'Admin'], 50000),
        'certifications_required': np.random.choice(['PMP', 'Agile', 'Security', 'None'], 50000),
        'training_completed': np.random.choice([True, False], 50000),
        'contract_type': np.random.choice(['Full-time', 'Part-time', 'Contract', 'Consultant'], 50000)
    })

    # TABLE 12: SKILLS (2K records)
    skills = pd.DataFrame({
        'skill_id': range(1, 2001),
        'skill_name': np.random.choice([
            'Python', 'Java', 'JavaScript', 'SQL', 'React', 'Angular', 'Node.js', 'AWS', 'Azure',
            'Project Management', 'Data Analysis', 'Machine Learning', 'Cybersecurity', 'DevOps'
        ] * 150, 2000),
        'skill_category': np.random.choice(['Technical', 'Soft Skills', 'Management', 'Domain'], 2000),
        'skill_level': np.random.choice(['Beginner', 'Intermediate', 'Advanced', 'Expert'], 2000),
        'certification_available': np.random.choice([True, False], 2000),
        'market_demand': np.random.choice(['Low', 'Medium', 'High', 'Very High'], 2000),
        'average_salary_impact': np.random.randint(-5000, 25000, 2000),
        'learning_hours_required': np.random.randint(10, 500, 2000),
        'obsolescence_risk': np.random.choice(['Low', 'Medium', 'High'], 2000),
        'remote_work_compatible': np.random.choice([True, False], 2000, p=[0.7, 0.3]),
        'industry_specific': np.random.choice([True, False], 2000, p=[0.3, 0.7])
    })

    # TABLE 13: EMPLOYEE_SKILLS (120K records) - Many-to-many: Employees × Skills
    employee_skills = pd.DataFrame({
        'emp_skill_id': range(1, 120001),
        'employee_id': np.random.choice(range(1, 25001), 120000),
        'skill_id': np.random.choice(range(1, 2001), 120000),
        'proficiency_level': np.random.choice(['Beginner', 'Intermediate', 'Advanced', 'Expert'], 120000),
        'years_experience': np.random.randint(0, 20, 120000),
        'last_used_date': pd.date_range('2020-01-01', '2024-08-01', periods=120000),
        'certified': np.random.choice([True, False], 120000, p=[0.2, 0.8]),
        'certification_date': [pd.Timestamp('2020-01-01') + pd.Timedelta(days=random.randint(0, 1460)) if random.random() < 0.2 else None for _ in range(120000)],
        'certification_expiry': [pd.Timestamp('2025-01-01') + pd.Timedelta(days=random.randint(0, 1095)) if random.random() < 0.2 else None for _ in range(120000)],
        'skill_source': np.random.choice(['Work', 'Education', 'Self-taught', 'Training'], 120000),
        'endorsed_by_count': np.random.randint(0, 50, 120000),
        'projects_used_count': np.random.randint(0, 20, 120000),
        'training_hours': np.random.randint(0, 200, 120000),
        'assessment_score': np.random.randint(0, 100, 120000),
        'assessment_date': pd.date_range('2023-01-01', '2024-08-01', periods=120000),
        'skill_priority': np.random.choice(['Critical', 'Important', 'Nice to Have'], 120000),
        'development_plan': np.random.choice([True, False], 120000, p=[0.3, 0.7])
    })

    # TABLE 14: VENDORS (5K records)
    vendors = pd.DataFrame({
        'vendor_id': range(1, 5001),
        'vendor_name': [f"Vendor_{i}" for i in range(1, 5001)],
        'vendor_code': [f"VEN{i:05d}" for i in range(1, 5001)],
        'tax_id': [f"{random.randint(10, 99)}-{random.randint(1000000, 9999999)}" for _ in range(5000)],
        'duns_number': [f"{random.randint(100000000, 999999999)}" for _ in range(5000)],
        'business_type': np.random.choice(['Corporation', 'LLC', 'Partnership', 'Sole Proprietorship'], 5000),
        'industry': np.random.choice(['Technology', 'Manufacturing', 'Services', 'Retail', 'Healthcare'], 5000),
        'contact_person': [f"Contact_{i}" for i in range(1, 5001)],
        'contact_title': np.random.choice(['Sales Manager', 'Account Manager', 'VP Sales', 'Director'], 5000),
        'contact_email': [f"contact{i}@vendor{i}.com" for i in range(1, 5001)],
        'contact_phone': [generate_phone() for _ in range(5000)],
        'address_street': [f"{random.randint(1, 9999)} Business Blvd" for _ in range(5000)],
        'address_city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], 5000),
        'address_state': np.random.choice(['NY', 'CA', 'IL', 'TX', 'AZ'], 5000),
        'address_zip': [f"{random.randint(10000, 99999)}" for _ in range(5000)],
        'website': [f"www.vendor{i}.com" for i in range(1, 5001)],
        'established_year': np.random.randint(1950, 2020, 5000),
        'employee_count': np.random.randint(1, 10000, 5000),
        'annual_revenue': np.random.randint(100000, 100000000, 5000),
        'credit_rating': np.random.choice(['AAA', 'AA', 'A', 'BBB', 'BB', 'B'], 5000),
        'payment_terms': np.random.choice(['Net 30', 'Net 60', '2/10 Net 30', 'Due on Receipt'], 5000),
        'currency': np.random.choice(['USD', 'EUR', 'CAD', 'GBP'], 5000, p=[0.7, 0.15, 0.1, 0.05]),
        'vendor_status': np.random.choice(['Active', 'Inactive', 'Suspended', 'Under Review'], 5000),
        'diversity_certification': np.random.choice(['MBE', 'WBE', 'DBE', 'SDVOSB', 'None'], 5000),
        'insurance_liability': np.random.randint(1000000, 10000000, 5000),
        'insurance_expiry': pd.date_range('2024-01-01', '2026-01-01', periods=5000),
        'w9_on_file': np.random.choice([True, False], 5000, p=[0.8, 0.2]),
        'background_check_date': pd.date_range('2022-01-01', '2024-01-01', periods=5000),
        'onboarding_date': pd.date_range('2020-01-01', '2024-01-01', periods=5000),
        'last_audit_date': pd.date_range('2023-01-01', '2024-08-01', periods=5000),
        'contract_expiry_date': pd.date_range('2024-06-01', '2027-01-01', periods=5000),
        'spend_ytd': np.random.randint(0, 5000000, 5000),
        'spend_last_year': np.random.randint(0, 10000000, 5000),
        'invoice_count_ytd': np.random.randint(0, 500, 5000),
        'average_payment_days': np.random.randint(15, 90, 5000),
        'quality_rating': np.random.randint(1, 5, 5000),
        'delivery_rating': np.random.randint(1, 5, 5000),
        'service_rating': np.random.randint(1, 5, 5000),
        'preferred_vendor': np.random.choice([True, False], 5000, p=[0.2, 0.8]),
        'strategic_vendor': np.random.choice([True, False], 5000, p=[0.1, 0.9]),
        'risk_level': np.random.choice(['Low', 'Medium', 'High'], 5000),
        'compliance_score': np.random.randint(60, 100, 5000),
        'sustainability_score': np.random.randint(1, 100, 5000),
        'notes': [f"Vendor notes for {i}" if random.random() < 0.3 else None for i in range(1, 5001)],
        'created_by': [f"buyer_{random.randint(1, 100)}" for _ in range(5000)],
        'created_date': pd.date_range('2015-01-01', '2024-01-01', periods=5000),
        'updated_date': pd.date_range('2024-01-01', '2024-08-01', periods=5000)
    })

    # TABLE 15: PURCHASE_ORDERS (80K records) - Many-to-many: Employees × Vendors
    purchase_orders = pd.DataFrame({
        'po_id': range(1, 80001),
        'po_number': [f"PO{i:08d}" for i in range(1, 80001)],
        'vendor_id': np.random.choice(range(1, 5001), 80000),
        'requestor_id': np.random.choice(range(1, 25001), 80000),
        'approver_id': np.random.choice(range(1, 25001), 80000),
        'buyer_id': np.random.choice(range(1, 25001), 80000),
        'department_id': np.random.choice(range(1, 101), 80000),
        'project_id': np.random.choice(range(1, 15001), 80000),
        'po_type': np.random.choice(['Standard', 'Blanket', 'Contract', 'Emergency', 'Services'], 80000),
        'po_status': np.random.choice(['Draft', 'Pending Approval', 'Approved', 'Issued', 'Received', 'Closed', 'Cancelled'], 80000),
        'priority': np.random.choice(['Low', 'Normal', 'High', 'Urgent'], 80000),
        'requisition_number': [f"REQ{random.randint(100000, 999999)}" for _ in range(80000)],
        'contract_number': [f"CNT{random.randint(100000, 999999)}" if random.random() < 0.3 else None for _ in range(80000)],
        'currency': np.random.choice(['USD', 'EUR', 'CAD', 'GBP'], 80000, p=[0.8, 0.1, 0.05, 0.05]),
        'exchange_rate': np.round(np.random.uniform(0.8, 1.5), 4),
        'subtotal': np.round(np.random.uniform(100, 100000), 2),
        'tax_amount': np.round(np.random.uniform(0, 5000), 2),
        'shipping_amount': np.round(np.random.uniform(0, 500), 2),
        'discount_amount': np.round(np.random.uniform(0, 2000), 2),
        'total_amount': np.round(np.random.uniform(100, 105000), 2),
        'budget_code': [f"BUD{random.randint(1000, 9999)}" for _ in range(80000)],
        'cost_center': [f"CC{random.randint(1000, 9999)}" for _ in range(80000)],
        'gl_account': [f"GL{random.randint(100000, 999999)}" for _ in range(80000)],
        'payment_terms': np.random.choice(['Net 30', 'Net 60', '2/10 Net 30', 'Due on Receipt'], 80000),
        'delivery_method': np.random.choice(['Standard Shipping', 'Express', 'Pickup', 'Drop Ship'], 80000),
        'ship_to_address': [f"{random.randint(1, 999)} Delivery St" for _ in range(80000)],
        'ship_to_city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston'], 80000),
        'ship_to_state': np.random.choice(['NY', 'CA', 'IL', 'TX'], 80000),
        'ship_to_zip': [f"{random.randint(10000, 99999)}" for _ in range(80000)],
        'bill_to_address': [f"{random.randint(1, 999)} Billing Ave" for _ in range(80000)],
        'bill_to_city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston'], 80000),
        'bill_to_state': np.random.choice(['NY', 'CA', 'IL', 'TX'], 80000),
        'bill_to_zip': [f"{random.randint(10000, 99999)}" for _ in range(80000)],
        'requested_delivery_date': pd.date_range('2024-01-01', '2025-01-01', periods=80000),
        'promised_delivery_date': pd.date_range('2024-01-01', '2025-01-01', periods=80000),
        'actual_delivery_date': [pd.Timestamp('2024-01-01') + pd.Timedelta(days=random.randint(0, 365)) if random.random() < 0.6 else None for _ in range(80000)],
        'po_date': pd.date_range('2023-01-01', '2024-08-01', periods=80000),
        'approved_date': [pd.Timestamp('2023-01-01') + pd.Timedelta(days=random.randint(0, 600)) if random.random() < 0.8 else None for _ in range(80000)],
        'issued_date': [pd.Timestamp('2023-01-01') + pd.Timedelta(days=random.randint(0, 600)) if random.random() < 0.7 else None for _ in range(80000)],
        'closed_date': [pd.Timestamp('2024-01-01') + pd.Timedelta(days=random.randint(0, 240)) if random.random() < 0.4 else None for _ in range(80000)],
        'special_instructions': [f"Special delivery instructions {i}" if random.random() < 0.2 else None for i in range(80000)],
        'terms_conditions': np.random.choice(['Standard Terms', 'Custom Terms', 'Master Agreement'], 80000),
        'warranty_period_months': np.random.choice([0, 12, 24, 36], 80000),
        'service_level_agreement': np.random.choice(['Standard SLA', 'Premium SLA', 'Custom SLA', None], 80000),
        'quality_requirements': np.random.choice(['ISO 9001', 'Six Sigma', 'Custom Quality', 'Standard'], 80000),
        'environmental_requirements': np.random.choice(['RoHS Compliant', 'REACH Compliant', 'Green Certified', None], 80000),
        'security_requirements': np.random.choice(['Standard', 'Enhanced', 'Top Secret', None], 80000),
        'insurance_required': np.random.choice([True, False], 80000, p=[0.3, 0.7]),
        'background_check_required': np.random.choice([True, False], 80000, p=[0.2, 0.8]),
        'site_access_required': np.random.choice([True, False], 80000, p=[0.4, 0.6]),
        'training_required': np.random.choice([True, False], 80000, p=[0.3, 0.7]),
        'milestone_payments': np.random.choice([True, False], 80000, p=[0.2, 0.8]),
        'retention_percentage': np.round(np.random.uniform(0, 0.1), 3),
        'escalation_clause': np.random.choice([True, False], 80000, p=[0.1, 0.9]),
        'cancellation_clause': np.random.choice([True, False], 80000, p=[0.8, 0.2]),
        'force_majeure_clause': np.random.choice([True, False], 80000, p=[0.9, 0.1]),
        'dispute_resolution': np.random.choice(['Arbitration', 'Litigation', 'Mediation'], 80000),
        'governing_law': np.random.choice(['New York', 'Delaware', 'California', 'Texas'], 80000),
        'confidentiality_agreement': np.random.choice([True, False], 80000, p=[0.6, 0.4]),
        'intellectual_property_clause': np.random.choice([True, False], 80000, p=[0.4, 0.6]),
        'data_protection_clause': np.random.choice([True, False], 80000, p=[0.7, 0.3]),
        'audit_rights': np.random.choice([True, False], 80000, p=[0.3, 0.7]),
        'right_to_audit': np.random.choice([True, False], 80000, p=[0.3, 0.7]),
        'performance_metrics': np.random.choice(['Delivery Time', 'Quality Score', 'Cost Savings', 'Customer Satisfaction'], 80000),
        'kpi_targets': [f"KPI target {random.randint(1, 100)}%" for _ in range(80000)],
        'penalty_clauses': np.random.choice([True, False], 80000, p=[0.4, 0.6]),
        'bonus_clauses': np.random.choice([True, False], 80000, p=[0.2, 0.8]),
        'revision_number': np.random.randint(0, 5, 80000),
        'amendment_count': np.random.randint(0, 3, 80000),
        'emergency_contact_name': [f"Emergency Contact {random.randint(1, 1000)}" for _ in range(80000)],
        'emergency_contact_phone': [generate_phone() for _ in range(80000)],
        'backup_vendor_id': np.random.choice(range(1, 5001), 80000),
        'supplier_diversity_spend': np.random.choice([True, False], 80000, p=[0.2, 0.8]),
        'sustainability_goals': np.random.choice([True, False], 80000, p=[0.3, 0.7]),
        'carbon_footprint_tracking': np.random.choice([True, False], 80000, p=[0.1, 0.9]),
        'social_responsibility_clause': np.random.choice([True, False], 80000, p=[0.4, 0.6]),
        'created_by': [f"buyer_{random.randint(1, 100)}" for _ in range(80000)],
        'created_date': pd.date_range('2023-01-01', '2024-08-01', periods=80000),
        'updated_date': pd.date_range('2024-01-01', '2024-08-01', periods=80000),
        'workflow_status': np.random.choice(['Not Started', 'In Progress', 'Pending Review', 'Approved', 'Rejected'], 80000),
        'approval_level': np.random.randint(1, 5, 80000),
        'digital_signature': np.random.choice([True, False], 80000, p=[0.8, 0.2]),
        'electronic_delivery': np.random.choice([True, False], 80000, p=[0.7, 0.3]),
        'xml_format': np.random.choice([True, False], 80000, p=[0.5, 0.5]),
        'edi_enabled': np.random.choice([True, False], 80000, p=[0.3, 0.7])
    })

    # =====================================================================
    # COMPILE ALL TABLES
    # =====================================================================

    tables = {
        'customers': customers,
        'employees': employees,
        'transactions': transactions,
        'medical_records': medical_records,
        'financial_accounts': financial_accounts,
        'products': products,
        'orders': orders,
        'order_items': order_items,
        'departments': departments,
        'projects': projects,
        'employee_projects': employee_projects,
        'skills': skills,
        'employee_skills': employee_skills,
        'vendors': vendors,
        'purchase_orders': purchase_orders
    }

    # Print comprehensive summary
    print(f"\n🎉 LARGE-SCALE PII DATASET CREATION COMPLETED!")
    print(f"=" * 80)

    total_rows = sum(len(df) for df in tables.values())
    total_columns = sum(len(df.columns) for df in tables.values())

    print(f"📊 DATASET OVERVIEW:")
    print(f"  Total Tables: {len(tables)}")
    print(f"  Total Rows: {total_rows:,}")
    print(f"  Total Columns: {total_columns}")
    print(f"  Average Columns per Table: {total_columns // len(tables)}")

    print(f"\n📋 TABLE BREAKDOWN:")
    for table_name, df in tables.items():
        size_category = "🔥 LARGE" if len(df) >= 100000 else "📊 MEDIUM" if len(df) >= 10000 else "📝 SMALL"
        print(f"  {table_name:<20}: {len(df):>8,} rows × {len(df.columns):>2} cols {size_category}")

    print(f"\n🔒 PII DATA TYPES INCLUDED:")
    print(f"  • Personal Information: Names, DOB, SSN, Address, Phone, Email")
    print(f"  • Financial Data: Account numbers, Credit cards, Banking info, Transactions")
    print(f"  • Medical Records: Diagnoses, Medications, Lab results, Insurance")
    print(f"  • Employment Data: Salaries, Performance, Background checks")
    print(f"  • Biometric Data: Height, Weight, Vital signs")
    print(f"  • Behavioral Data: Transaction patterns, Login history")

    print(f"\n🔗 RELATIONSHIP STRUCTURE:")
    print(f"  • Many-to-Many Tables: 6 (Order Items, Employee Projects, Employee Skills, etc.)")
    print(f"  • Tables with 100K+ Records: 5 (Customers, Transactions, Financial Accounts, etc.)")
    print(f"  • Cross-references: 25+ foreign key relationships")
    print(f"  • Recursive Relationships: Employee management, Project dependencies")

    print(f"\n⚠️  IMPORTANT DISCLAIMERS:")
    print(f"  • ALL PII DATA IS SYNTHETICALLY GENERATED")
    print(f"  • NO REAL PERSONAL INFORMATION IS INCLUDED")
    print(f"  • FOR TESTING AND DEVELOPMENT PURPOSES ONLY")
    print(f"  • SUITABLE FOR DATA ANONYMIZATION/SDV TESTING")

    return tables

In [None]:
def setup_large_dataset_relationships():
    """
    Provide the relationship setup for the large PII dataset
    """
    print("\n" + "="*80)
    print("LARGE DATASET RELATIONSHIP MAPPING")
    print("="*80)

    relationships = {
        'hierarchical': [
            # Core business hierarchy
            ('orders', 'customer_id', 'customers', 'customer_id'),
            ('transactions', 'customer_id', 'customers', 'customer_id'),
            ('transactions', 'account_id', 'financial_accounts', 'account_id'),
            ('financial_accounts', 'customer_id', 'customers', 'customer_id'),
            ('medical_records', 'patient_id', 'customers', 'customer_id'),

            # Employee/organizational hierarchy
            ('employees', 'department_id', 'departments', 'department_id'),
            ('projects', 'department_id', 'departments', 'department_id'),
            ('projects', 'project_manager_id', 'employees', 'employee_id'),
            ('purchase_orders', 'requestor_id', 'employees', 'employee_id'),
            ('purchase_orders', 'approver_id', 'employees', 'employee_id'),
            ('purchase_orders', 'buyer_id', 'employees', 'employee_id'),
            ('purchase_orders', 'department_id', 'departments', 'department_id'),
            ('purchase_orders', 'vendor_id', 'vendors', 'vendor_id'),
        ],

        'many_to_many': [
            # Products and Orders
            ('order_items', 'order_id', 'orders', 'order_id'),
            ('order_items', 'product_id', 'products', 'product_id'),

            # Employee Projects
            ('employee_projects', 'employee_id', 'employees', 'employee_id'),
            ('employee_projects', 'project_id', 'projects', 'project_id'),

            # Employee Skills
            ('employee_skills', 'employee_id', 'employees', 'employee_id'),
            ('employee_skills', 'skill_id', 'skills', 'skill_id'),

            # Purchase Orders (complex many-to-many)
            ('purchase_orders', 'project_id', 'projects', 'project_id'),
        ],

        'recursive': [
            ('employees', 'manager_id', 'employee_id'),
            ('departments', 'manager_id', 'employee_id'),
        ]
    }

    print("\n🏗️ HIERARCHICAL RELATIONSHIPS:")
    for child_table, child_col, parent_table, parent_col in relationships['hierarchical']:
        print(f"  {parent_table}.{parent_col} → {child_table}.{child_col}")

    print(f"\n🔗 MANY-TO-MANY RELATIONSHIPS:")
    for child_table, child_col, parent_table, parent_col in relationships['many_to_many']:
        print(f"  {parent_table}.{parent_col} ↔ {child_table}.{child_col}")

    print(f"\n🔄 RECURSIVE RELATIONSHIPS:")
    for table_name, foreign_key_col, primary_key_col in relationships['recursive']:
        print(f"  {table_name}.{primary_key_col} → {table_name}.{foreign_key_col}")

    print(f"\n📊 COMPLEXITY METRICS:")
    total_relationships = len(relationships['hierarchical']) + len(relationships['many_to_many']) + len(relationships['recursive'])
    print(f"  Total Relationships: {total_relationships}")
    print(f"  Large Tables (100K+ rows): 5")
    print(f"  Medium Tables (10K-100K rows): 5")
    print(f"  Small Tables (<10K rows): 5")
    print(f"  PII Sensitivity: HIGH (Medical, Financial, Personal)")

    return relationships

def create_synthesizer_setup_code():
    """
    Generate the code needed to set up the synthesizer with all relationships
    """

    setup_code = '''
# =====================================================================
# COMPLETE SYNTHESIZER SETUP FOR LARGE PII DATASET
# =====================================================================

def setup_large_pii_synthesizer():
    """Setup synthesizer with all 15 tables and relationships"""

    # Step 1: Create the large dataset
    print("Creating large PII dataset...")
    enterprise_data = create_large_pii_dataset()

    # Step 2: Initialize synthesizer
    synthesizer = RecursiveMultiTableSynthesizer()

    # Step 3: Add all tables with primary keys
    print("Adding tables...")
    table_configs = [
        ('customers', 'customer_id'),
        ('employees', 'employee_id'),
        ('transactions', 'transaction_id'),
        ('medical_records', 'record_id'),
        ('financial_accounts', 'account_id'),
        ('products', 'product_id'),
        ('orders', 'order_id'),
        ('order_items', 'order_item_id'),
        ('departments', 'department_id'),
        ('projects', 'project_id'),
        ('employee_projects', 'assignment_id'),
        ('skills', 'skill_id'),
        ('employee_skills', 'emp_skill_id'),
        ('vendors', 'vendor_id'),
        ('purchase_orders', 'po_id')
    ]

    for table_name, primary_key in table_configs:
        synthesizer.add_table_data(table_name, enterprise_data[table_name], primary_key)

    # Step 4: Add hierarchical relationships
    print("Adding hierarchical relationships...")
    hierarchical_rels = [
        ('orders', 'customer_id', 'customers', 'customer_id'),
        ('transactions', 'customer_id', 'customers', 'customer_id'),
        ('transactions', 'account_id', 'financial_accounts', 'account_id'),
        ('financial_accounts', 'customer_id', 'customers', 'customer_id'),
        ('medical_records', 'patient_id', 'customers', 'customer_id'),
        ('employees', 'department_id', 'departments', 'department_id'),
        ('projects', 'department_id', 'departments', 'department_id'),
        ('projects', 'project_manager_id', 'employees', 'employee_id'),
        ('purchase_orders', 'requestor_id', 'employees', 'employee_id'),
        ('purchase_orders', 'approver_id', 'employees', 'employee_id'),
        ('purchase_orders', 'buyer_id', 'employees', 'employee_id'),
        ('purchase_orders', 'department_id', 'departments', 'department_id'),
        ('purchase_orders', 'vendor_id', 'vendors', 'vendor_id'),
    ]

    for child_table, child_col, parent_table, parent_col in hierarchical_rels:
        synthesizer.add_foreign_key_relationship(child_table, child_col, parent_table, parent_col)

    # Step 5: Add many-to-many relationships
    print("Adding many-to-many relationships...")
    many_to_many_rels = [
        ('order_items', 'order_id', 'orders', 'order_id'),
        ('order_items', 'product_id', 'products', 'product_id'),
        ('employee_projects', 'employee_id', 'employees', 'employee_id'),
        ('employee_projects', 'project_id', 'projects', 'project_id'),
        ('employee_skills', 'employee_id', 'employees', 'employee_id'),
        ('employee_skills', 'skill_id', 'skills', 'skill_id'),
        ('purchase_orders', 'project_id', 'projects', 'project_id'),
    ]

    for child_table, child_col, parent_table, parent_col in many_to_many_rels:
        synthesizer.add_foreign_key_relationship(child_table, child_col, parent_table, parent_col)

    # Step 6: Add recursive relationships
    print("Adding recursive relationships...")
    synthesizer.add_self_referencing_relationship('employees', 'manager_id', 'employee_id')

    # Note: departments.manager_id -> employees.employee_id is cross-reference, not recursive
    synthesizer.add_foreign_key_relationship('departments', 'manager_id', 'employees', 'employee_id')

    print(f"✅ Setup complete!")
    print(f"📊 Tables: {len(synthesizer.real_data)}")
    print(f"🔗 Relationships: {len(synthesizer.relationships)}")

    return synthesizer, enterprise_data

# Usage:
# synthesizer, data = setup_large_pii_synthesizer()
# synthesizer.fit()  # This will take 15-30 minutes
# synthetic_data = synthesizer.generate_synthetic_data(scale=0.3)  # Start with small scale
'''

    return setup_code

In [None]:
if __name__ == "__main__":
    # Create the large dataset
    large_dataset = create_large_pii_dataset()

    # Show relationship mapping
    setup_large_dataset_relationships()

    # Display setup code
    print("\n" + "="*80)
    print("SYNTHESIZER SETUP CODE")
    print("="*80)
    setup_code = create_synthesizer_setup_code()
    print(setup_code)

    print("\n🚀 READY FOR SYNTHESIS!")
    print("⚠️  Note: Training may take 15-30 minutes due to dataset size")
    print("💡 Tip: Start with scale=0.1-0.3 for initial testing")

🚀 Creating Large-Scale PII Test Dataset...
📊 Creating CUSTOMERS table (150,000 records)...
📊 Creating EMPLOYEES table (25,000 records)...
📊 Creating TRANSACTIONS table (500,000 records)...
📊 Creating MEDICAL_RECORDS table (75,000 records)...
📊 Creating FINANCIAL_ACCOUNTS table (200,000 records)...
📊 Creating supporting tables...

🎉 LARGE-SCALE PII DATASET CREATION COMPLETED!
📊 DATASET OVERVIEW:
  Total Tables: 15
  Total Rows: 2,332,100
  Total Columns: 572
  Average Columns per Table: 38

📋 TABLE BREAKDOWN:
  customers           :  150,000 rows × 59 cols 🔥 LARGE
  employees           :   25,000 rows × 69 cols 📊 MEDIUM
  transactions        :  500,000 rows × 55 cols 🔥 LARGE
  medical_records     :   75,000 rows × 63 cols 📊 MEDIUM
  financial_accounts  :  200,000 rows × 67 cols 🔥 LARGE
  products            :   10,000 rows × 17 cols 📊 MEDIUM
  orders              :  300,000 rows × 18 cols 🔥 LARGE
  order_items         :  800,000 rows × 14 cols 🔥 LARGE
  departments         :      100 ro

In [None]:
metadata_real = Metadata.detect_from_dataframes(data=tables)

In [None]:
metadata_real

In [None]:
# =====================================================================
# COMPLETE SYNTHESIZER SETUP FOR LARGE PII DATASET
# =====================================================================

def setup_large_pii_synthesizer():
    """Setup synthesizer with all 15 tables and relationships"""

    # Step 1: Create the large dataset
    print("Creating large PII dataset...")
    enterprise_data = create_large_pii_dataset()

    # Step 2: Initialize synthesizer
    synthesizer = RecursiveMultiTableSynthesizer()

    # Step 3: Add all tables with primary keys
    print("Adding tables...")
    table_configs = [
        ('customers', 'customer_id'),
        ('employees', 'employee_id'),
        ('transactions', 'transaction_id'),
        ('medical_records', 'record_id'),
        ('financial_accounts', 'account_id'),
        ('products', 'product_id'),
        ('orders', 'order_id'),
        ('order_items', 'order_item_id'),
        ('departments', 'department_id'),
        ('projects', 'project_id'),
        ('employee_projects', 'assignment_id'),
        ('skills', 'skill_id'),
        ('employee_skills', 'emp_skill_id'),
        ('vendors', 'vendor_id'),
        ('purchase_orders', 'po_id')
    ]

    for table_name, primary_key in table_configs:
        synthesizer.add_table_data(table_name, enterprise_data[table_name], primary_key)

    # Step 4: Add hierarchical relationships
    print("Adding hierarchical relationships...")
    hierarchical_rels = [
        ('orders', 'customer_id', 'customers', 'customer_id'),
        ('transactions', 'customer_id', 'customers', 'customer_id'),
        ('transactions', 'account_id', 'financial_accounts', 'account_id'),
        ('financial_accounts', 'customer_id', 'customers', 'customer_id'),
        ('medical_records', 'patient_id', 'customers', 'customer_id'),
        ('employees', 'department_id', 'departments', 'department_id'),
        ('projects', 'department_id', 'departments', 'department_id'),
        ('projects', 'project_manager_id', 'employees', 'employee_id'),
        ('purchase_orders', 'requestor_id', 'employees', 'employee_id'),
        ('purchase_orders', 'approver_id', 'employees', 'employee_id'),
        ('purchase_orders', 'buyer_id', 'employees', 'employee_id'),
        ('purchase_orders', 'department_id', 'departments', 'department_id'),
        ('purchase_orders', 'vendor_id', 'vendors', 'vendor_id'),
    ]

    for child_table, child_col, parent_table, parent_col in hierarchical_rels:
        synthesizer.add_foreign_key_relationship(child_table, child_col, parent_table, parent_col)

    # Step 5: Add many-to-many relationships
    print("Adding many-to-many relationships...")
    many_to_many_rels = [
        ('order_items', 'order_id', 'orders', 'order_id'),
        ('order_items', 'product_id', 'products', 'product_id'),
        ('employee_projects', 'employee_id', 'employees', 'employee_id'),
        ('employee_projects', 'project_id', 'projects', 'project_id'),
        ('employee_skills', 'employee_id', 'employees', 'employee_id'),
        ('employee_skills', 'skill_id', 'skills', 'skill_id'),
        ('purchase_orders', 'project_id', 'projects', 'project_id'),
    ]

    for child_table, child_col, parent_table, parent_col in many_to_many_rels:
        synthesizer.add_foreign_key_relationship(child_table, child_col, parent_table, parent_col)

    # Step 6: Add recursive relationships
    print("Adding recursive relationships...")
    synthesizer.add_self_referencing_relationship('employees', 'manager_id', 'employee_id')

    # Note: departments.manager_id -> employees.employee_id is cross-reference, not recursive
    synthesizer.add_foreign_key_relationship('departments', 'manager_id', 'employees', 'employee_id')

    print(f"✅ Setup complete!")
    print(f"📊 Tables: {len(synthesizer.real_data)}")
    print(f"🔗 Relationships: {len(synthesizer.relationships)}")

    return synthesizer, enterprise_data

# Usage:
# synthesizer, data = setup_large_pii_synthesizer()
# synthesizer.fit()  # This will take 15-30 minutes
# synthetic_data = synthesizer.generate_synthetic_data(scale=0.3)  # Start with small scale


# 🚀 READY FOR SYNTHESIS!
#⚠ ️️  Note: Training may take 15-30 minutes due to dataset size
# 💡 Tip: Start with scale=0.1-0.3 for initial testing

In [None]:
# Usage:
synthesizer, data = setup_large_pii_synthesizer()


In [None]:
synthesizer.fit()  # This will take 15-30 minutes



Metadata Summary:
Tables: ['customers', 'employees', 'transactions', 'medical_records', 'financial_accounts', 'products', 'orders', 'order_items', 'departments', 'projects', 'employee_projects', 'skills', 'employee_skills', 'vendors', 'purchase_orders']
Relationships: 0
✓ Metadata validation successful!
Initializing gaussian_copula synthesizer...
✓ Synthesizer initialized successfully!
Training multi-table synthesizer...


Preprocess Tables: 100%|██████████| 15/15 [15:35<00:00, 62.34s/it]



Learning relationships:



Modeling Tables: 100%|██████████| 15/15 [22:16<00:00, 89.11s/it]

✓ Training completed successfully!





In [None]:
synthetic_data = synthesizer.generate_synthetic_data(scale=0.3)  # Start with small scale

Generating synthetic data...
✓ Synthetic data generation completed!
  - employees: 7500 rows, 69 columns
  - employee_skills: 36000 rows, 17 columns
  - products: 3000 rows, 17 columns
  - customers: 45000 rows, 59 columns
  - vendors: 1500 rows, 47 columns
  - medical_records: 22500 rows, 63 columns
  - skills: 600 rows, 11 columns
  - financial_accounts: 60000 rows, 67 columns
  - orders: 90000 rows, 18 columns
  - order_items: 240000 rows, 14 columns
  - departments: 30 rows, 14 columns
  - employee_projects: 15000 rows, 16 columns
  - purchase_orders: 24000 rows, 85 columns
  - projects: 4500 rows, 20 columns
  - transactions: 150000 rows, 55 columns


In [None]:
employees_syn= synthetic_data['employees']
customers_syn= synthetic_data['customers']
transactions_syn= synthetic_data['transactions']
medical_records_syn= synthetic_data['medical_records']
financial_accounts_syn= synthetic_data['financial_accounts']
products_syn= synthetic_data['products']
orders_syn= synthetic_data['orders']
order_items_syn= synthetic_data['order_items']
departments_syn= synthetic_data['departments']
projects_syn= synthetic_data['projects']
employee_projects_syn= synthetic_data['employee_projects']
skills_syn=synthetic_data['skills']
employee_skills_syn= synthetic_data['employee_skills']
vendors_syn= synthetic_data['vendors']
purchase_orders_syn= synthetic_data['purchase_orders']


In [None]:


data_s = {
    'employees': employees_syn,
    'customers': customers_syn,
    'transactions': transactions_syn,
    'medical_records': medical_records_syn,
    'financial_accounts': financial_accounts_syn,
    'products': products_syn,
    'orders': orders_syn,
    'order_items': order_items_syn,
    'departments': departments_syn,
    'projects': projects_syn,
    'employee_projects': employee_projects_syn,
    'skills': skills_syn,
    'employee_skills': employee_skills_syn,
    'vendors': vendors_syn,
    'purchase_orders': purchase_orders_syn
}



In [None]:
metadata_s

{
    "tables": {
        "employees": {
            "primary_key": "employee_id",
            "columns": {
                "employee_id": {
                    "sdtype": "id"
                },
                "employee_number": {
                    "sdtype": "categorical"
                },
                "badge_id": {
                    "sdtype": "id"
                },
                "first_name": {
                    "pii": true,
                    "sdtype": "first_name"
                },
                "last_name": {
                    "pii": true,
                    "sdtype": "last_name"
                },
                "full_name": {
                    "sdtype": "categorical"
                },
                "preferred_name": {
                    "sdtype": "categorical"
                },
                "middle_name": {
                    "sdtype": "categorical"
                },
                "ssn": {
                    "pii": true,
                    "s

In [None]:
metadata_s = Metadata.detect_from_dataframes(data=data_s)

NameError: name 'Metadata' is not defined