# Synthetic Data Generation Notebook

This notebook demonstrates how to generate synthetic datasets for the Azure ML Fabric Demo predictive analytics MVP.

## Objectives
- Generate realistic manufacturing production data
- Create time-series data with seasonal patterns
- Simulate equipment telemetry with controlled variations
- Inject controlled anomalies for model validation

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import datetime, timedelta
import random
from scipy import stats
import warnings
# Ignore every warning from here on so the notebook output stays uncluttered.
# Note that this also hides deprecation notices from the imported libraries.
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility. Both generators are seeded because the
# cells below draw from NumPy's global RNG (np.random.*) as well as from the
# standard-library `random` module.
np.random.seed(42)
random.seed(42)

## 1. Manufacturing Production Data Generation

In [None]:
def generate_manufacturing_data(num_records=1000, start_date='2024-01-01'):
    """Generate synthetic hourly manufacturing production records.

    One record is produced per hour, starting at midnight of ``start_date``.
    Each record picks a random production line and operator, derives the
    shift from the hour of day, and draws output, defect rate and machine
    efficiency with simple correlations between them.

    Args:
        num_records: Number of hourly records to generate.
        start_date: First day of the series, formatted ``YYYY-MM-DD``.

    Returns:
        A list of dicts with the keys ``timestamp``, ``production_line``,
        ``output_quantity``, ``defect_rate``, ``machine_efficiency``,
        ``operator_id`` and ``shift``.
    """
    start = datetime.strptime(start_date, '%Y-%m-%d')

    # Loop-invariant lookup tables.
    line_base_output = {
        'LINE_A': 1200,
        'LINE_B': 1000,
        'LINE_C': 1350,
        'LINE_D': 900,
    }
    production_lines = list(line_base_output)
    operators = [f'OP{i:03d}' for i in range(1, 21)]  # 20 operators
    # Indexed by ``hour // 8``: 00-07 is the night shift, 08-15 the day shift
    # and 16-23 the evening shift.
    shifts = ['NIGHT', 'DAY', 'EVENING']
    shift_multipliers = {'DAY': 1.0, 'EVENING': 0.95, 'NIGHT': 0.9}

    base_defect_rate = 0.02
    base_efficiency = 0.92

    data = []
    for i in range(num_records):
        # Time progression (hourly data)
        timestamp = start + timedelta(hours=i)

        # Line-specific base performance
        line = random.choice(production_lines)
        base_output = line_base_output[line]

        # Shift and time-based variations
        shift = shifts[timestamp.hour // 8]  # 8-hour shifts
        shift_multiplier = shift_multipliers[shift]

        # Random variation of +/-20% around the shift-adjusted base output
        output_quantity = int(base_output * shift_multiplier * (0.8 + 0.4 * np.random.random()))

        # Correlated defect rate (higher output often means higher defects)
        if output_quantity > base_output * 1.1:
            defect_rate = base_defect_rate * (1 + np.random.exponential(0.5))
        else:
            defect_rate = base_defect_rate * (0.5 + 0.5 * np.random.random())
        defect_rate = min(defect_rate, 0.1)  # Cap at 10%

        # Machine efficiency (inversely related to defect rate), kept in [0.75, 1.0]
        efficiency_factor = 1 - (defect_rate - base_defect_rate) * 2
        machine_efficiency = base_efficiency * efficiency_factor * (0.9 + 0.2 * np.random.random())
        machine_efficiency = max(0.75, min(1.0, machine_efficiency))

        data.append({
            'timestamp': timestamp.isoformat() + 'Z',
            'production_line': line,
            'output_quantity': output_quantity,
            'defect_rate': round(defect_rate, 4),
            'machine_efficiency': round(machine_efficiency, 3),
            'operator_id': random.choice(operators),
            'shift': shift
        })

    return data

# Build the demo manufacturing dataset and preview its first three rows
manufacturing_data = generate_manufacturing_data(500)
print(f"Generated {len(manufacturing_data)} manufacturing records")
print("Sample records:")
for sample in manufacturing_data[:3]:
    print(json.dumps(sample, indent=2))

## 2. Time-Series Data with Seasonal Patterns

In [None]:
def generate_seasonal_timeseries(days=400, start_date='2024-01-01'):
    """Build a daily series made of trend, seasonality and Gaussian noise.

    Args:
        days: Number of consecutive daily points to produce.
        start_date: First day of the series, formatted ``YYYY-MM-DD``.

    Returns:
        A list with one dict per day holding ``date``, ``value``,
        ``seasonal_component``, ``trend`` and ``day_of_year``.
    """
    origin = datetime.strptime(start_date, '%Y-%m-%d')

    def build_point(offset):
        """Return the record for the day ``offset`` days after the origin."""
        current = origin + timedelta(days=offset)
        ordinal_day = current.timetuple().tm_yday

        # Seasonality: an annual sine keyed on the calendar day plus a weekly
        # sine keyed on the offset from the start date (not the weekday).
        annual = 20 * np.sin(2 * np.pi * ordinal_day / 365)
        weekly = 5 * np.sin(2 * np.pi * (offset % 7) / 7)
        seasonal = annual + weekly

        # Slowly accelerating upward drift from a base level of 100.
        drift = 100 + (offset * 0.05) + (0.001 * offset**1.1)

        # One Gaussian draw (sigma = 3) per day.
        jitter = np.random.normal(0, 3)

        return {
            'date': current.strftime('%Y-%m-%d'),
            'value': round(drift + seasonal + jitter, 2),
            'seasonal_component': round(seasonal, 2),
            'trend': round(drift, 2),
            'day_of_year': ordinal_day
        }

    return [build_point(offset) for offset in range(days)]

# Build the seasonal dataset (450 daily points)
timeseries_data = generate_seasonal_timeseries(450)
print(f"Generated {len(timeseries_data)} time-series records")

# Plot the first 100 days: the observed value next to its seasonal part
df_temp = pd.DataFrame(timeseries_data[:100])
plt.figure(figsize=(12, 4))
for column, legend_label in (
    ('value', 'Total Value'),
    ('seasonal_component', 'Seasonal Component'),
):
    plt.plot(df_temp['date'], df_temp[column], label=legend_label)
plt.title('Time-series with Seasonal Pattern (First 100 days)')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()

## 3. Equipment Telemetry Data Generation

In [None]:
def generate_equipment_telemetry(num_records=200, equipment_types=None):
    """Generate synthetic equipment telemetry sampled every 15 minutes.

    Three units are created per equipment type. Each record picks one unit at
    random, draws temperature, vibration and pressure uniformly from that
    type's ranges, and derives a status from the readings. The series starts
    seven days before the current UTC time.

    Args:
        num_records: Number of telemetry records to generate.
        equipment_types: Optional mapping of type name to a dict with the
            keys ``temp_range``, ``vib_range`` and ``press_range`` (each a
            ``(min, max)`` pair). Defaults to PUMP, MOTOR, COMPRESSOR and
            GENERATOR profiles.

    Returns:
        A list of dicts with the keys ``equipment_id``, ``timestamp``,
        ``temperature``, ``vibration``, ``pressure``, ``flow_rate``,
        ``status`` and ``runtime_hours``.
    """
    # Local import: the file-level import only brings in datetime and timedelta.
    from datetime import timezone

    if equipment_types is None:
        equipment_types = {
            'PUMP': {'temp_range': (70, 85), 'vib_range': (1.5, 3.0), 'press_range': (10, 20)},
            'MOTOR': {'temp_range': (65, 80), 'vib_range': (1.0, 2.5), 'press_range': (5, 15)},
            'COMPRESSOR': {'temp_range': (75, 90), 'vib_range': (2.0, 4.0), 'press_range': (20, 35)},
            'GENERATOR': {'temp_range': (60, 75), 'vib_range': (1.2, 2.8), 'press_range': (8, 18)}
        }

    # Timestamps are emitted with a 'Z' suffix, so take the clock in UTC and
    # drop tzinfo to keep the "<isoformat>Z" output shape.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    start_time = now_utc - timedelta(days=7)  # Last week's data

    # 3 instances of each type
    equipment_instances = [
        (f'{eq_type}_{unit:03d}', params)
        for eq_type, params in equipment_types.items()
        for unit in range(1, 4)
    ]

    # One starting runtime per unit, drawn once, so runtime_hours accumulates
    # for a given unit instead of jumping on every record.
    base_runtime = {
        equipment_id: random.randint(500, 3000)
        for equipment_id, _ in equipment_instances
    }

    data = []
    for i in range(num_records):
        # Select random equipment
        equipment_id, params = random.choice(equipment_instances)

        # Time progression (15-minute intervals)
        timestamp = start_time + timedelta(minutes=15 * i)

        # Generate base readings within normal ranges
        temp_min, temp_max = params['temp_range']
        vib_min, vib_max = params['vib_range']
        press_min, press_max = params['press_range']

        temperature = np.random.uniform(temp_min, temp_max)
        vibration = np.random.uniform(vib_min, vib_max)
        pressure = np.random.uniform(press_min, press_max)

        # Add some correlation between metrics
        if temperature > (temp_min + temp_max) / 2:
            vibration *= 1.2  # Higher temp = more vibration

        # Determine status based on readings; CRITICAL overrides WARNING
        status = 'NORMAL'
        if temperature > temp_max * 0.9 or vibration > vib_max * 0.9:
            status = 'WARNING'
        if temperature > temp_max or vibration > vib_max:
            status = 'CRITICAL'

        # Flow rate (only for pumps and compressors)
        if 'PUMP' in equipment_id or 'COMPRESSOR' in equipment_id:
            flow_rate = np.random.uniform(100, 250)
        else:
            flow_rate = 0

        # Runtime hours (cumulative): unit baseline plus elapsed series time
        runtime_hours = base_runtime[equipment_id] + (i * 0.25)  # 15 minutes = 0.25 hours

        data.append({
            'equipment_id': equipment_id,
            'timestamp': timestamp.isoformat() + 'Z',
            'temperature': round(temperature, 1),
            'vibration': round(vibration, 1),
            'pressure': round(pressure, 1),
            'flow_rate': round(flow_rate, 1),
            'status': status,
            'runtime_hours': round(runtime_hours, 2)
        })

    return data

# Build the telemetry dataset (150 readings)
telemetry_data = generate_equipment_telemetry(150)
print(f"Generated {len(telemetry_data)} telemetry records")

# Report which equipment units actually appear in the sample
equipment_ids = []
for reading in telemetry_data:
    equipment_ids.append(reading['equipment_id'])
unique_equipment = list(set(equipment_ids))
unit_total = len(unique_equipment)
print(f"Unique equipment: {unit_total}")
print(f"Equipment types: {sorted(unique_equipment)}")

## 4. Controlled Anomaly Generation

In [None]:
def generate_controlled_anomalies(num_anomalies=50, num_normal=30):
    """Generate labelled anomaly and normal readings for model validation.

    Anomalous readings are drawn on the faulty side of a per-type threshold
    (above it for spikes, below it for drops) and then adjusted by severity:
    HIGH moves the value further from the threshold, LOW moves it 10% closer
    while keeping it on the faulty side. Timestamps fall at random minutes
    within the three days before the current UTC time.

    Args:
        num_anomalies: Number of records with ``is_anomaly`` set to True.
        num_normal: Number of records with ``is_anomaly`` set to False.

    Returns:
        A list of dicts sorted by ``timestamp``, each with the keys
        ``timestamp``, ``equipment_id``, ``anomaly_type``, ``severity``,
        ``is_anomaly``, ``value``, ``threshold`` and ``description``.
    """
    # Local import: the file-level import only brings in datetime and timedelta.
    from datetime import timezone

    # Timestamps are emitted with a 'Z' suffix, so take the clock in UTC and
    # drop tzinfo to keep the "<isoformat>Z" output shape.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    start_time = now_utc - timedelta(days=3)
    window_minutes = 4320  # 3 days
    equipment_ids = ['PUMP_001', 'MOTOR_002', 'COMPRESSOR_003', 'GENERATOR_004']

    # (low, high, threshold) of the raw anomalous draw for each anomaly type.
    anomaly_profiles = {
        'TEMPERATURE_SPIKE': (90, 120, 85.0),    # High temperature
        'VIBRATION_ANOMALY': (4.5, 8.0, 4.0),    # High vibration
        'PRESSURE_DROP': (2, 8, 10.0),           # Low pressure
        'EFFICIENCY_DROP': (0.3, 0.7, 0.85),     # Low efficiency
        'FLOW_IRREGULARITY': (50, 80, 100.0),    # Low flow
    }
    anomaly_types = list(anomaly_profiles)
    spike_types = {'TEMPERATURE_SPIKE', 'VIBRATION_ANOMALY'}
    severity_levels = ['LOW', 'MEDIUM', 'HIGH']

    data = []

    # Generate anomalies
    for _ in range(num_anomalies):
        timestamp = start_time + timedelta(minutes=random.randint(0, window_minutes))
        equipment_id = random.choice(equipment_ids)
        anomaly_type = random.choice(anomaly_types)
        severity = random.choice(severity_levels)

        low, high, threshold = anomaly_profiles[anomaly_type]
        value = random.uniform(low, high)

        # Adjust severity
        if severity == 'HIGH':
            # Spikes go higher, drops go lower: further from the threshold.
            value *= 1.3 if anomaly_type in spike_types else 0.7
        elif severity == 'LOW':
            # Shrink the deviation from the threshold by 10%. Scaling the raw
            # value by 0.9 instead would make drops more severe and could put
            # a temperature spike below its own threshold.
            value = threshold + (value - threshold) * 0.9

        data.append({
            'timestamp': timestamp.isoformat() + 'Z',
            'equipment_id': equipment_id,
            'anomaly_type': anomaly_type,
            'severity': severity,
            'is_anomaly': True,
            'value': round(value, 2),
            'threshold': threshold,
            'description': f'{anomaly_type.replace("_", " ").title()} detected with {severity.lower()} severity'
        })

    # Generate normal readings
    for _ in range(num_normal):
        timestamp = start_time + timedelta(minutes=random.randint(0, window_minutes))
        equipment_id = random.choice(equipment_ids)

        # Normal operating values
        value = random.uniform(70, 85)  # Normal range
        threshold = 85.0

        data.append({
            'timestamp': timestamp.isoformat() + 'Z',
            'equipment_id': equipment_id,
            'anomaly_type': 'NORMAL_OPERATION',
            'severity': 'NONE',
            'is_anomaly': False,
            'value': round(value, 2),
            'threshold': threshold,
            'description': 'Equipment operating within normal parameters'
        })

    # Sort by timestamp (all strings share one format, so text order is time order)
    data.sort(key=lambda x: x['timestamp'])
    return data

# Build the labelled anomaly dataset (40 anomalies, 20 normal readings)
anomaly_data = generate_controlled_anomalies(40, 20)
print(f"Generated {len(anomaly_data)} anomaly records")

# Split the total into anomalous and normal counts
flagged = [entry for entry in anomaly_data if entry['is_anomaly']]
anomaly_count = len(flagged)
normal_count = len(anomaly_data) - anomaly_count
print(f"Anomalies: {anomaly_count}, Normal: {normal_count}")

# Tally records per severity label, in order of first appearance
severity_dist = {}
for entry in anomaly_data:
    level = entry['severity']
    if level in severity_dist:
        severity_dist[level] += 1
    else:
        severity_dist[level] = 1
print(f"Severity distribution: {severity_dist}")

## 5. Save Generated Data

In [None]:
# Save all generated datasets
import os

# Make sure the output folders are present
for folder in ('../data/synthetic', '../data/processed'):
    os.makedirs(folder, exist_ok=True)

# Write each dataset to its own JSON file under ../data/synthetic
datasets = {
    'manufacturing_data.json': manufacturing_data,
    'timeseries_seasonal.json': timeseries_data,
    'equipment_telemetry.json': telemetry_data,
    'anomalies_controlled.json': anomaly_data,
}
for filename, records in datasets.items():
    with open('../data/synthetic/' + filename, 'w') as f:
        json.dump(records, f, indent=2)

print("All synthetic datasets saved successfully!")
print("Files created:")
for filename in datasets:
    print(f"- {filename}")

## 6. Data Summary and Statistics

In [None]:
# Print a per-dataset overview of everything generated above
print("=== SYNTHETIC DATA GENERATION SUMMARY ===")

# 1. Manufacturing: size, covered time span, distinct lines
first_stamp = manufacturing_data[0]['timestamp']
last_stamp = manufacturing_data[-1]['timestamp']
distinct_lines = {row['production_line'] for row in manufacturing_data}
print("\n1. Manufacturing Data:")
print(f"   - Records: {len(manufacturing_data)}")
print(f"   - Date range: {first_stamp} to {last_stamp}")
print(f"   - Production lines: {len(distinct_lines)}")

# 2. Time series: size, covered dates, spread of the seasonal component
seasonal_values = [row['seasonal_component'] for row in timeseries_data]
first_day = timeseries_data[0]['date']
last_day = timeseries_data[-1]['date']
print("\n2. Time-series Data:")
print(f"   - Records: {len(timeseries_data)}")
print(f"   - Date range: {first_day} to {last_day}")
print(f"   - Seasonal range: {min(seasonal_values):.1f} to {max(seasonal_values):.1f}")

# 3. Telemetry: size, distinct units, count per status label
distinct_units = {row['equipment_id'] for row in telemetry_data}
print("\n3. Equipment Telemetry:")
print(f"   - Records: {len(telemetry_data)}")
print(f"   - Unique equipment: {len(distinct_units)}")
status_dist = {}
for row in telemetry_data:
    label = row['status']
    if label in status_dist:
        status_dist[label] += 1
    else:
        status_dist[label] = 1
print(f"   - Status distribution: {status_dist}")

# 4. Anomalies: total plus the anomalous / normal split
anomalous_total = len([row for row in anomaly_data if row['is_anomaly']])
normal_total = len([row for row in anomaly_data if not row['is_anomaly']])
print("\n4. Controlled Anomalies:")
print(f"   - Total records: {len(anomaly_data)}")
print(f"   - Anomalies: {anomalous_total}")
print(f"   - Normal: {normal_total}")

print("\n=== DATA READY FOR ML PIPELINE ===")