# Multi-Table Recursive Synthetic Data Generation - Jupyter Cells

## 1: Install Required Dependencies

In [None]:
1: Install Required Dependencies

In [None]:
pip install sdv

In [None]:
# Install required packages
!pip install sdv pandas numpy networkx scikit-learn

# Import warnings to suppress unnecessary output
import warnings
warnings.filterwarnings('ignore')
print("✓ Dependencies installed and warnings suppressed")

 ## 2: Import Libraries

In [4]:
import pandas as pd
import numpy as np
from sdv.metadata import Metadata
from sdv.multi_table import HMASynthesizer
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer
import networkx as nx
from typing import Dict, List, Tuple, Optional

print("✓ All libraries imported successfully")

✓ All libraries imported successfully


 ## 3: Define the RecursiveMultiTableSynthesizer Class (Part 1)

In [5]:
class RecursiveMultiTableSynthesizer:
    """
    A comprehensive multi-table synthetic data generator that handles
    recursive relationships and complex table dependencies using SDV.
    """

    def __init__(self, synthesizer_type='gaussian_copula'):
        """
        Initialize the recursive multi-table synthesizer.

        Args:
            synthesizer_type (str): Type of synthesizer ('gaussian_copula' or 'ctgan')
        """
        self.synthesizer_type = synthesizer_type
        self.metadata = Metadata()
        self.synthesizer = None
        self.table_dependencies = {}
        self.dependency_graph = nx.DiGraph()
        self.real_data = {}
        self.synthetic_data = {}
        self.relationships = []

    def add_table_data(self, table_name: str, data: pd.DataFrame,
                      primary_key: Optional[str] = None):
        """
        Add a table to the multi-table structure.

        Args:
            table_name (str): Name of the table
            data (pd.DataFrame): The actual data
            primary_key (str): Primary key column name
        """
        self.real_data[table_name] = data.copy()

        # Add table to metadata using the correct method
        self.metadata.detect_table_from_dataframe(table_name, data)

        # Set primary key if provided
        if primary_key and primary_key in data.columns:
            try:
                self.metadata.update_column(table_name, primary_key, sdtype='id')
                self.metadata.set_primary_key(table_name, primary_key)
                print(f"✓ Added table '{table_name}' with primary key '{primary_key}'")
            except Exception as e:
                print(f"Warning setting primary key for {table_name}: {e}")
        else:
            print(f"✓ Added table '{table_name}' (no primary key specified)")

print("✓ Class initialization methods defined")

✓ Class initialization methods defined


## Cell 4: Define Relationship Methods (Part 2)

In [6]:
def add_foreign_key_relationship(self, child_table: str, child_column: str,
                               parent_table: str, parent_column: str):
    """
    Add a foreign key relationship between tables.
    """
    try:
        # Ensure both tables exist
        if child_table not in self.real_data:
            raise ValueError(f"Child table '{child_table}' not found")
        if parent_table not in self.real_data:
            raise ValueError(f"Parent table '{parent_table}' not found")

        # Verify the columns exist
        if child_column not in self.real_data[child_table].columns:
            raise ValueError(f"Column '{child_column}' not found in table '{child_table}'")
        if parent_column not in self.real_data[parent_table].columns:
            raise ValueError(f"Column '{parent_column}' not found in table '{parent_table}'")

        # Mark foreign key column as 'id' type
        self.metadata.update_column(child_table, child_column, sdtype='id')

        # Add relationship to metadata
        self.metadata.add_relationship(
            parent_table_name=parent_table,
            child_table_name=child_table,
            parent_primary_key=parent_column,
            child_foreign_key=child_column
        )

        # Track for dependency graph
        self.dependency_graph.add_edge(parent_table, child_table)

        # Store relationship info
        if child_table not in self.table_dependencies:
            self.table_dependencies[child_table] = {}
        self.table_dependencies[child_table][child_column] = parent_table

        self.relationships.append({
            'parent_table': parent_table,
            'parent_column': parent_column,
            'child_table': child_table,
            'child_column': child_column
        })

        print(f"✓ Added relationship: {parent_table}.{parent_column} -> {child_table}.{child_column}")

    except Exception as e:
        print(f"✗ Error adding relationship: {e}")
        print(f"  Parent table columns: {list(self.real_data[parent_table].columns) if parent_table in self.real_data else 'Table not found'}")
        print(f"  Child table columns: {list(self.real_data[child_table].columns) if child_table in self.real_data else 'Table not found'}")

def add_self_referencing_relationship(self, table_name: str,
                                    foreign_key_col: str,
                                    primary_key_col: str):
    """
    Add a self-referencing (recursive) relationship within a table.
    """
    try:
        if table_name not in self.real_data:
            raise ValueError(f"Table '{table_name}' not found")

        # Update the foreign key column to be 'id' type
        self.metadata.update_column(table_name, foreign_key_col, sdtype='id')

        # Add self-referencing relationship
        self.metadata.add_relationship(
            parent_table_name=table_name,
            child_table_name=table_name,
            parent_primary_key=primary_key_col,
            child_foreign_key=foreign_key_col
        )

        # Add self-loop to dependency graph
        self.dependency_graph.add_edge(table_name, table_name)

        print(f"✓ Added self-referencing relationship in '{table_name}': {primary_key_col} -> {foreign_key_col}")

    except Exception as e:
        print(f"✗ Error adding self-referencing relationship: {e}")

# Add methods to the class
RecursiveMultiTableSynthesizer.add_foreign_key_relationship = add_foreign_key_relationship
RecursiveMultiTableSynthesizer.add_self_referencing_relationship = add_self_referencing_relationship

print("✓ Relationship methods added to class")

✓ Relationship methods added to class


## Cell 5: Define Helper Methods (Part 3)

In [7]:
def _detect_recursive_relationships(self):
    """
    Detect and handle recursive relationships in the table structure.
    """
    recursive_tables = []

    # Find self-loops (recursive relationships)
    for table in self.dependency_graph.nodes():
        if self.dependency_graph.has_edge(table, table):
            recursive_tables.append(table)

    # Find cycles in dependency graph
    try:
        cycles = list(nx.simple_cycles(self.dependency_graph))
        if cycles:
            print(f"Detected circular dependencies: {cycles}")

    except Exception as e:
        print(f"Error detecting cycles: {e}")

    return recursive_tables

def _get_synthesis_order(self) -> List[str]:
    """
    Determine the order for synthesizing tables based on dependencies.
    """
    try:
        # Create a copy of the graph without self-loops for topological sort
        temp_graph = self.dependency_graph.copy()
        self_loops = list(nx.selfloop_edges(temp_graph))
        temp_graph.remove_edges_from(self_loops)

        # Topological sort to get dependency order
        synthesis_order = list(nx.topological_sort(temp_graph))

        # Add any tables not in the dependency graph
        all_tables = set(self.real_data.keys())
        ordered_tables = set(synthesis_order)
        remaining_tables = all_tables - ordered_tables
        synthesis_order.extend(list(remaining_tables))

        return synthesis_order

    except nx.NetworkXError:
        # If there are cycles, use a heuristic approach
        print("Circular dependencies detected. Using heuristic ordering.")
        return list(self.real_data.keys())

# Add helper methods to the class
RecursiveMultiTableSynthesizer._detect_recursive_relationships = _detect_recursive_relationships
RecursiveMultiTableSynthesizer._get_synthesis_order = _get_synthesis_order

print("✓ Helper methods added to class")

✓ Helper methods added to class


## Cell 6: Define Synthesis Methods (Part 4)

In [8]:
def prepare_for_synthesis(self):
    """
    Prepare the data and metadata for synthesis.
    """
    # Detect recursive relationships
    recursive_tables = self._detect_recursive_relationships()
    if recursive_tables:
        print(f"Detected recursive relationships in tables: {recursive_tables}")

    print("\nMetadata Summary:")
    try:
        # Print metadata info
        metadata_dict = self.metadata.to_dict()
        print(f"Tables: {list(metadata_dict.get('tables', {}).keys())}")
        relationships = metadata_dict.get('relationships', [])
        print(f"Relationships: {len(relationships)}")
        for rel in relationships:
            print(f"  - {rel}")
    except Exception as e:
        print(f"Could not display metadata: {e}")

    # Validate metadata
    try:
        self.metadata.validate()
        print("✓ Metadata validation successful!")
    except Exception as e:
        print(f"✗ Metadata validation error: {e}")
        raise

    # Initialize synthesizer
    try:
        print(f"Initializing {self.synthesizer_type} synthesizer...")
        self.synthesizer = HMASynthesizer(metadata=self.metadata)
        print("✓ Synthesizer initialized successfully!")

    except Exception as e:
        print(f"✗ Error initializing synthesizer: {e}")
        raise

def fit(self):
    """
    Train the multi-table synthesizer on the real data.
    """
    if not self.synthesizer:
        self.prepare_for_synthesis()

    print("Training multi-table synthesizer...")
    try:
        self.synthesizer.fit(self.real_data)
        print("✓ Training completed successfully!")
    except Exception as e:
        print(f"✗ Training error: {e}")
        raise

# Add synthesis methods to the class
RecursiveMultiTableSynthesizer.prepare_for_synthesis = prepare_for_synthesis
RecursiveMultiTableSynthesizer.fit = fit

print("✓ Synthesis preparation methods added to class")

✓ Synthesis preparation methods added to class


## 7: Define Generation and Validation Methods (Part 5)

In [9]:
def generate_synthetic_data(self, num_rows: Optional[Dict[str, int]] = None,
                          scale: float = 1.0) -> Dict[str, pd.DataFrame]:
    """
    Generate synthetic data for all tables.
    """
    if not self.synthesizer:
        raise ValueError("Synthesizer not trained. Call fit() first.")

    print(f"Generating synthetic data...")

    try:
        if num_rows:
            # Calculate average scale from num_rows
            total_original = sum(len(df) for df in self.real_data.values())
            total_requested = sum(num_rows.get(table, len(df)) for table, df in self.real_data.items())
            calculated_scale = total_requested / total_original if total_original > 0 else 1.0
            print(f"Converting num_rows to scale factor: {calculated_scale:.2f}")
            self.synthetic_data = self.synthesizer.sample(scale=calculated_scale)

            # Then trim tables to requested sizes
            for table_name, requested_rows in num_rows.items():
                if table_name in self.synthetic_data:
                    current_rows = len(self.synthetic_data[table_name])
                    if current_rows > requested_rows:
                        self.synthetic_data[table_name] = self.synthetic_data[table_name].head(requested_rows)

        else:
            self.synthetic_data = self.synthesizer.sample(scale=scale)

        print("✓ Synthetic data generation completed!")

        # Print summary
        for table_name, df in self.synthetic_data.items():
            print(f"  - {table_name}: {df.shape[0]} rows, {df.shape[1]} columns")

        return self.synthetic_data

    except Exception as e:
        print(f"✗ Generation error: {e}")
        # Try with default parameters
        try:
            print("Retrying with scale=1.0...")
            self.synthetic_data = self.synthesizer.sample(scale=1.0)
            print("✓ Fallback generation successful!")
            return self.synthetic_data
        except Exception as e2:
            print(f"✗ Fallback also failed: {e2}")
            raise

def validate_relationships(self) -> Dict[str, bool]:
    """
    Validate that relationships are maintained in synthetic data.
    """
    validation_results = {}

    if not self.synthetic_data:
        print("No synthetic data to validate.")
        return validation_results

    print("\nValidating relationships...")

    for rel in self.relationships:
        parent_table = rel['parent_table']
        parent_col = rel['parent_column']
        child_table = rel['child_table']
        child_col = rel['child_column']

        if parent_table not in self.synthetic_data or child_table not in self.synthetic_data:
            continue

        parent_df = self.synthetic_data[parent_table]
        child_df = self.synthetic_data[child_table]

        if parent_col in parent_df.columns and child_col in child_df.columns:
            # Get valid parent values (excluding None/NaN)
            parent_values = set(parent_df[parent_col].dropna().values)
            child_fk_values = set(child_df[child_col].dropna().values)

            # Check referential integrity
            invalid_refs = child_fk_values - parent_values
            is_valid = len(invalid_refs) == 0

            relationship_name = f"{parent_table}.{parent_col} -> {child_table}.{child_col}"
            validation_results[relationship_name] = is_valid

            status = "✓ Valid" if is_valid else f"✗ Invalid ({len(invalid_refs)} bad refs)"
            print(f"  {relationship_name}: {status}")

    return validation_results

def get_summary_statistics(self) -> Dict:
    """
    Get summary statistics comparing real and synthetic data.
    """
    summary = {
        'real_data': {},
        'synthetic_data': {},
        'comparison': {}
    }

    for table_name in self.real_data.keys():
        real_df = self.real_data[table_name]
        summary['real_data'][table_name] = {
            'shape': real_df.shape,
            'numeric_columns': len(real_df.select_dtypes(include=[np.number]).columns),
            'categorical_columns': len(real_df.select_dtypes(include=['object']).columns)
        }

        if table_name in self.synthetic_data:
            synthetic_df = self.synthetic_data[table_name]
            summary['synthetic_data'][table_name] = {
                'shape': synthetic_df.shape,
                'numeric_columns': len(synthetic_df.select_dtypes(include=[np.number]).columns),
                'categorical_columns': len(synthetic_df.select_dtypes(include=['object']).columns)
            }

            # Compare basic statistics
            summary['comparison'][table_name] = {
                'row_ratio': synthetic_df.shape[0] / real_df.shape[0],
                'column_match': synthetic_df.shape[1] == real_df.shape[1]
            }

    return summary

# Add final methods to the class
RecursiveMultiTableSynthesizer.generate_synthetic_data = generate_synthetic_data
RecursiveMultiTableSynthesizer.validate_relationships = validate_relationships
RecursiveMultiTableSynthesizer.get_summary_statistics = get_summary_statistics

print("✓ Generation and validation methods added to class")
print("✓ RecursiveMultiTableSynthesizer class is now complete!")

✓ Generation and validation methods added to class
✓ RecursiveMultiTableSynthesizer class is now complete!


## Cell 8: Create Sample Data Function

In [10]:
def create_comprehensive_sample_data():
    """
    Create comprehensive sample data with 5 levels of relationships and recursive structures.
    """
    np.random.seed(42)  # For reproducible results

    # LEVEL 1: Companies (Root table)
    companies = pd.DataFrame({
        'company_id': range(1, 21),  # 20 companies
        'company_name': [f'Company_{i}' for i in range(1, 21)],
        'industry': np.random.choice(['Tech', 'Finance', 'Healthcare', 'Retail'], 20),
        'founded_year': np.random.randint(1990, 2020, 20),
        'headquarters': np.random.choice(['NYC', 'SF', 'LA', 'Chicago', 'Boston'], 20)
    })

    # LEVEL 2: Departments (depends on companies)
    departments = pd.DataFrame({
        'dept_id': range(1, 101),  # 100 departments
        'company_id': np.random.choice(companies['company_id'], 100),
        'dept_name': np.random.choice(['Engineering', 'Sales', 'Marketing', 'HR', 'Finance'], 100),
        'budget': np.round(np.random.uniform(50000, 2000000, 100), 2),
        'manager_name': [f'Manager_{i}' for i in range(1, 101)]
    })

    # LEVEL 3: Employees (depends on departments)
    employees = pd.DataFrame({
        'employee_id': range(1, 501),  # 500 employees
        'dept_id': np.random.choice(departments['dept_id'], 500),
        'first_name': [f'FirstName_{i}' for i in range(1, 501)],
        'last_name': [f'LastName_{i}' for i in range(1, 501)],
        'salary': np.round(np.random.uniform(40000, 150000, 500), 2),
        'hire_date': pd.date_range('2020-01-01', periods=500, freq='D')[:500],
        'position': np.random.choice(['Junior', 'Senior', 'Lead', 'Manager'], 500),
        # Self-referencing: some employees report to other employees
        'reports_to': [None] * 400 + list(np.random.choice(range(1, 401), 100))
    })

    # LEVEL 4: Projects (depends on employees as project managers)
    projects = pd.DataFrame({
        'project_id': range(1, 201),  # 200 projects
        'manager_id': np.random.choice(employees['employee_id'], 200),
        'project_name': [f'Project_{i}' for i in range(1, 201)],
        'start_date': pd.date_range('2023-01-01', periods=200, freq='3D')[:200],
        'budget': np.round(np.random.uniform(10000, 500000, 200), 2),
        'status': np.random.choice(['Planning', 'In Progress', 'Completed', 'On Hold'], 200),
        'priority': np.random.choice(['Low', 'Medium', 'High', 'Critical'], 200)
    })

    # LEVEL 5: Tasks (depends on projects)
    tasks = pd.DataFrame({
        'task_id': range(1, 1001),  # 1000 tasks
        'project_id': np.random.choice(projects['project_id'], 1000),
        'assigned_to': np.random.choice(employees['employee_id'], 1000),
        'task_name': [f'Task_{i}' for i in range(1, 1001)],
        'description': [f'Task description {i}' for i in range(1, 1001)],
        'estimated_hours': np.random.randint(1, 40, 1000),
        'actual_hours': np.random.randint(1, 50, 1000),
        'status': np.random.choice(['Todo', 'In Progress', 'Review', 'Done'], 1000),
        # Self-referencing: some tasks depend on other tasks
        'depends_on_task': [None] * 800 + list(np.random.choice(range(1, 801), 200))
    })

    # ADDITIONAL: Employee Skills (Many-to-Many relationship simulation)
    skills = pd.DataFrame({
        'skill_id': range(1, 51),  # 50 different skills
        'skill_name': [f'Skill_{i}' for i in range(1, 51)],
        'category': np.random.choice(['Technical', 'Management', 'Communication', 'Domain'], 50),
        'difficulty_level': np.random.choice(['Beginner', 'Intermediate', 'Advanced', 'Expert'], 50)
    })

    # Employee-Skills mapping (represents many-to-many)
    employee_skills = pd.DataFrame({
        'emp_skill_id': range(1, 1501),  # 1500 skill assignments
        'employee_id': np.random.choice(employees['employee_id'], 1500),
        'skill_id': np.random.choice(skills['skill_id'], 1500),
        'proficiency': np.random.choice(['Beginner', 'Intermediate', 'Advanced', 'Expert'], 1500),
        'years_experience': np.random.randint(1, 10, 1500),
        'certified': np.random.choice([True, False], 1500)
    })

    # Project Teams (Many-to-Many: Projects to Employees)
    project_teams = pd.DataFrame({
        'team_id': range(1, 801),  # 800 team assignments
        'project_id': np.random.choice(projects['project_id'], 800),
        'employee_id': np.random.choice(employees['employee_id'], 800),
        'role': np.random.choice(['Developer', 'Tester', 'Analyst', 'Designer'], 800),
        'allocation_percent': np.random.randint(25, 100, 800),
        'start_date': pd.date_range('2023-01-01', periods=800, freq='2D')[:800]
    })

    return {
        'companies': companies,
        'departments': departments,
        'employees': employees,
        'projects': projects,
        'tasks': tasks,
        'skills': skills,
        'employee_skills': employee_skills,
        'project_teams': project_teams
    }

print("✓ Sample data creation function defined")

✓ Sample data creation function defined


## Cell 9: Create Sample Data and Initialize Synthesizer

In [11]:
print("=== COMPREHENSIVE MULTI-TABLE RECURSIVE SYNTHETIC DATA GENERATION ===\n")

# Create comprehensive sample data with 5 levels of relationships
data_tables = create_comprehensive_sample_data()
print("✓ Comprehensive sample data created with 5 levels of relationships")

# Show data summary
print("\n=== DATA SUMMARY ===")
for table_name, df in data_tables.items():
    print(f"{table_name}: {df.shape[0]} rows, {df.shape[1]} columns")

# Initialize synthesizer
synthesizer = RecursiveMultiTableSynthesizer()
print("✓ Synthesizer initialized")

=== COMPREHENSIVE MULTI-TABLE RECURSIVE SYNTHETIC DATA GENERATION ===

✓ Comprehensive sample data created with 5 levels of relationships

=== DATA SUMMARY ===
companies: 20 rows, 5 columns
departments: 100 rows, 5 columns
employees: 500 rows, 8 columns
projects: 200 rows, 7 columns
tasks: 1000 rows, 9 columns
skills: 50 rows, 4 columns
employee_skills: 1500 rows, 6 columns
project_teams: 800 rows, 6 columns
✓ Synthesizer initialized


## Cell 10: Add Tables to Synthesizer

In [12]:
# Add tables in dependency order (parents first)
print("\n=== ADDING TABLES ===")
synthesizer.add_table_data('companies', data_tables['companies'], primary_key='company_id')
synthesizer.add_table_data('departments', data_tables['departments'], primary_key='dept_id')
synthesizer.add_table_data('employees', data_tables['employees'], primary_key='employee_id')
synthesizer.add_table_data('projects', data_tables['projects'], primary_key='project_id')
synthesizer.add_table_data('tasks', data_tables['tasks'], primary_key='task_id')
synthesizer.add_table_data('skills', data_tables['skills'], primary_key='skill_id')
synthesizer.add_table_data('employee_skills', data_tables['employee_skills'], primary_key='emp_skill_id')
synthesizer.add_table_data('project_teams', data_tables['project_teams'], primary_key='team_id')


=== ADDING TABLES ===


## Cell 11: Define Relationships

In [13]:
# Add relationships - LEVEL 1 to 5 hierarchy
print("\n=== ADDING RELATIONSHIPS ===")
print("Level 1->2: Companies to Departments")
synthesizer.add_foreign_key_relationship('departments', 'company_id', 'companies', 'company_id')

print("Level 2->3: Departments to Employees")
synthesizer.add_foreign_key_relationship('employees', 'dept_id', 'departments', 'dept_id')

print("Level 3->4: Employees to Projects (as managers)")
synthesizer.add_foreign_key_relationship('projects', 'manager_id', 'employees', 'employee_id')

print("Level 4->5: Projects to Tasks")
synthesizer.add_foreign_key_relationship('tasks', 'project_id', 'projects', 'project_id')

print("Additional: Tasks assigned to Employees")
synthesizer.add_foreign_key_relationship('tasks', 'assigned_to', 'employees', 'employee_id')

# Many-to-Many relationships
print("Many-to-Many: Employee Skills")
synthesizer.add_foreign_key_relationship('employee_skills', 'employee_id', 'employees', 'employee_id')
synthesizer.add_foreign_key_relationship('employee_skills', 'skill_id', 'skills', 'skill_id')

print("Many-to-Many: Project Teams")
synthesizer.add_foreign_key_relationship('project_teams', 'project_id', 'projects', 'project_id')
synthesizer.add_foreign_key_relationship('project_teams', 'employee_id', 'employees', 'employee_id')

# Recursive/Self-referencing relationships
print("\nRecursive Relationships:")
synthesizer.add_self_referencing_relationship('employees', 'reports_to', 'employee_id')
synthesizer.add_self_referencing_relationship('tasks', 'depends_on_task', 'task_id')


=== ADDING RELATIONSHIPS ===
Level 1->2: Companies to Departments
✗ Error adding relationship: Unknown table name ('company_id').
  Parent table columns: ['company_id', 'company_name', 'industry', 'founded_year', 'headquarters']
  Child table columns: ['dept_id', 'company_id', 'dept_name', 'budget', 'manager_name']
Level 2->3: Departments to Employees
✗ Error adding relationship: Unknown table name ('dept_id').
  Parent table columns: ['dept_id', 'company_id', 'dept_name', 'budget', 'manager_name']
  Child table columns: ['employee_id', 'dept_id', 'first_name', 'last_name', 'salary', 'hire_date', 'position', 'reports_to']
Level 3->4: Employees to Projects (as managers)
✗ Error adding relationship: Unknown table name ('manager_id').
  Parent table columns: ['employee_id', 'dept_id', 'first_name', 'last_name', 'salary', 'hire_date', 'position', 'reports_to']
  Child table columns: ['project_id', 'manager_id', 'project_name', 'start_date', 'budget', 'status', 'priority']
Level 4->5: Proj

## Cell 12: Train the Synthesizer

In [14]:
# Train the synthesizer
print("\n" + "="*60)
print("TRAINING SYNTHESIZER")
print("="*60)
synthesizer.fit()


TRAINING SYNTHESIZER

Metadata Summary:
Tables: ['companies', 'departments', 'employees', 'projects', 'tasks', 'skills', 'employee_skills', 'project_teams']
Relationships: 0
✓ Metadata validation successful!
Initializing gaussian_copula synthesizer...
✓ Synthesizer initialized successfully!
Training multi-table synthesizer...


Preprocess Tables: 100%|██████████| 8/8 [00:02<00:00,  2.68it/s]



Learning relationships:



Modeling Tables: 100%|██████████| 8/8 [00:02<00:00,  2.72it/s]

✓ Training completed successfully!





## Cell 13: Generate Synthetic Data

In [15]:
# Generate synthetic data with custom scaling
print("\n" + "="*60)
print("GENERATING SYNTHETIC DATA")
print("="*60)

# Generate different scales for different table types
custom_rows = {
    'companies': 15,      # Fewer companies
    'departments': 80,    # Fewer departments
    'employees': 400,     # Scale down employees
    'projects': 150,      # Scale down projects
    'tasks': 800,         # Scale down tasks
    'skills': 40,         # Fewer skills
    'employee_skills': 1200,  # Scale down skill assignments
    'project_teams': 600      # Scale down team assignments
}

synthetic_data = synthesizer.generate_synthetic_data(num_rows=custom_rows)


GENERATING SYNTHETIC DATA
Generating synthetic data...
Converting num_rows to scale factor: 0.79
✓ Synthetic data generation completed!
  - project_teams: 600 rows, 6 columns
  - employees: 394 rows, 8 columns
  - employee_skills: 1182 rows, 6 columns
  - companies: 15 rows, 5 columns
  - skills: 39 rows, 4 columns
  - departments: 79 rows, 5 columns
  - tasks: 788 rows, 9 columns
  - projects: 150 rows, 7 columns


## Cell 14: Display Results and Validate

In [16]:
# Show sample results from each level
print("\n" + "="*60)
print("=== SAMPLE SYNTHETIC DATA BY LEVEL ===")

level_tables = [
    ('Level 1 - Companies', 'companies'),
    ('Level 2 - Departments', 'departments'),
    ('Level 3 - Employees', 'employees'),
    ('Level 4 - Projects', 'projects'),
    ('Level 5 - Tasks', 'tasks')
]

for level_name, table_name in level_tables:
    if table_name in synthetic_data:
        print(f"\n{level_name.upper()}:")
        df = synthetic_data[table_name]
        print(f"Shape: {df.shape}")
        print(df.head(2))


=== SAMPLE SYNTHETIC DATA BY LEVEL ===

LEVEL 1 - COMPANIES:
Shape: (15, 5)
   company_id company_name    industry  founded_year headquarters
0     4780961   Company_18  Healthcare          2003          NYC
1      834739   Company_18      Retail          2009          NYC

LEVEL 2 - DEPARTMENTS:
Shape: (79, 5)
    dept_id  company_id  dept_name      budget manager_name
0  16183965           7  Marketing  1658565.38   Manager_15
1  10585028          15  Marketing  1494141.55   Manager_35

LEVEL 3 - EMPLOYEES:
Shape: (394, 8)
   employee_id  dept_id first_name    last_name     salary  hire_date  \
0      8586549        4      Keith  Fitzpatrick   53015.65 2021-03-05   
1     13964787       90    Matthew        Evans  114789.39 2020-05-02   

  position  reports_to  
0   Senior       192.0  
1   Senior         NaN  

LEVEL 4 - PROJECTS:
Shape: (150, 7)
   project_id  manager_id project_name start_date     budget       status  \
0    15000355          51  Project_155 2024-04-04  146838.3

## Cell 15: Show Recursive Relationships and Final Summary

In [17]:
# Show recursive relationship examples
print("\n=== RECURSIVE RELATIONSHIPS EXAMPLES ===")

# Employee hierarchy
emp_df = synthetic_data['employees']
reporting_relationships = emp_df[emp_df['reports_to'].notna()][['employee_id', 'reports_to']].head(5)
if not reporting_relationships.empty:
    print("\nEmployee Reporting Structure:")
    print(reporting_relationships)

# Task dependencies
task_df = synthetic_data['tasks']
task_dependencies = task_df[task_df['depends_on_task'].notna()][['task_id', 'depends_on_task']].head(5)
if not task_dependencies.empty:
    print("\nTask Dependencies:")
    print(task_dependencies)

# Validate all relationships
print("\n" + "="*60)
print("VALIDATING RELATIONSHIPS")
print("="*60)
validation = synthesizer.validate_relationships()

# Get comprehensive summary
print("\n=== COMPREHENSIVE SUMMARY ===")
summary = synthesizer.get_summary_statistics()
print("Data Generation Ratios:")
for table_name, comparison in summary['comparison'].items():
    ratio = comparison['row_ratio']
    print(f"  {table_name:15}: {ratio:.2f}x original size")

print(f"\n✓ 5-Level Multi-table Recursive Synthesis Completed!")
print(f"✓ Generated data for {len(synthetic_data)} interconnected tables")
print(f"✓ Maintained {len(validation)} relationship constraints")

# Show relationship validation summary
valid_relationships = sum(1 for v in validation.values() if v)
total_relationships = len(validation)
print(f"✓ Relationship integrity: {valid_relationships}/{total_relationships} valid")

if valid_relationships == total_relationships:
    print("🎉 All relationships maintained perfectly!")
elif valid_relationships > total_relationships * 0.8:
    print("✨ Most relationships maintained well!")
else:
    print("⚠️  Some relationship issues detected - check validation details above")


=== RECURSIVE RELATIONSHIPS EXAMPLES ===

Employee Reporting Structure:
   employee_id  reports_to
0      8586549       192.0
3     16140471       234.0
4      1244123       183.0
6       681809       214.0
9     11851889       251.0

Task Dependencies:
     task_id  depends_on_task
3   10923461            471.0
4    8546174            450.0
7    3872162            472.0
10   2812097            166.0
13   7051891            247.0

VALIDATING RELATIONSHIPS

Validating relationships...

=== COMPREHENSIVE SUMMARY ===
Data Generation Ratios:
  companies      : 0.75x original size
  departments    : 0.79x original size
  employees      : 0.79x original size
  projects       : 0.75x original size
  tasks          : 0.79x original size
  skills         : 0.78x original size
  employee_skills: 0.79x original size
  project_teams  : 0.75x original size

✓ 5-Level Multi-table Recursive Synthesis Completed!
✓ Generated data for 8 interconnected tables
✓ Maintained 0 relationship constraints
✓ Re