## Generate energy production dataset

In [None]:
import numpy as np
import pandas as pd

# Build a synthetic energy-production dataset in which the target depends on
# four base features through several deliberately non-linear effects.
n_samples = 1000
np.random.seed(42)  # fixed seed: every run produces the same dataset

# Base features, each drawn uniformly over its own range.
# NOTE: the order of the random draws in this cell defines the data; do not
# reorder them.
time_of_day = np.random.uniform(0, 24, n_samples)
temperature = np.random.uniform(10, 35, n_samples)
cloud_cover = np.random.uniform(0, 100, n_samples)
wind_speed = np.random.uniform(0, 30, n_samples)

# Daily cycle: a sine with period 24 plus a faster sine with period 12.
day_phase = time_of_day * np.pi / 12
daily_main = 7 * np.sin(day_phase)
daily_harmonic = 3 * np.sin(time_of_day * np.pi / 6)

# Temperature: quadratic in the distance from 25; the contribution is
# positive at or below 25 and negative above it.
temp_sign = np.where(temperature > 25, -1, 1)
temp_effect = 0.3 * (temperature - 25) ** 2 * temp_sign

# Cloud cover: piecewise-constant bonus (10 below 30, 5 in [30, 70), else 0).
clear_sky_bonus = np.where(cloud_cover < 30, 10, 0)
partial_cloud_bonus = np.where(
    (cloud_cover >= 30) & (cloud_cover < 70), 5, 0
)

# Wind speed: a single sine term over the 0-30 range.
wind_effect = 5 * np.sin(wind_speed * np.pi / 15)

# Interaction terms: temperature x wind and cloud cover x time of day.
temp_wind_interaction = 0.2 * temperature * np.sin(wind_speed * np.pi / 10)
cloud_time_interaction = -0.1 * cloud_cover * np.cos(day_phase)

# Gaussian noise (mean 0, standard deviation 2).
noise = np.random.normal(0, 2, n_samples)

# Terms are added left to right in a fixed order so the floating-point
# result is reproducible bit for bit.
energy_production = (
    daily_main
    + daily_harmonic
    + temp_effect
    + clear_sky_bonus
    + partial_cloud_bonus
    + wind_effect
    + temp_wind_interaction
    + cloud_time_interaction
    + noise
)

# Perturb 5% of the samples (distinct indices) by a uniform shift in [-10, 10).
n_outliers = int(0.05 * n_samples)
outlier_idx = np.random.choice(n_samples, size=n_outliers, replace=False)
outlier_shift = np.random.uniform(-10, 10, size=len(outlier_idx))
energy_production[outlier_idx] += outlier_shift

# Clip at zero so the target never goes negative.
energy_production = np.maximum(energy_production, 0)

# Assemble features and target into a single table.
data = pd.DataFrame(
    dict(
        time_of_day=time_of_day,
        temperature=temperature,
        cloud_cover=cloud_cover,
        wind_speed=wind_speed,
        energy_production=energy_production,
    )
)

# Saving is disabled by default; uncomment to write the dataset to disk.
# data.to_csv('energy_production_dataset.csv')

## Generate non-linear dataset

In [None]:
import numpy as np
import pandas as pd

# Build a synthetic regression dataset whose target mixes many non-linear
# effects of three features, save it to CSV, and print diagnostics.

# Fixed seed so the generated data is reproducible.
np.random.seed(42)

# Number of rows to generate.
n = 1000

# Three independent features, each uniform on [-5, 5).
# NOTE: the order of the random draws in this cell defines the data; do not
# reorder them.
X1 = np.random.uniform(-5, 5, n)
X2 = np.random.uniform(-5, 5, n)
X3 = np.random.uniform(-5, 5, n)

# Individual contributions to the target.
polynomial = 3 * X1**2 - 0.5 * X1**3           # quadratic minus cubic in X1
sine_wave = 5 * np.sin(X2)                      # sine of X2
cosine_wave = 3 * np.cos(X2 * 2)                # cosine at twice the frequency
gaussian_bump = 2 * np.exp(-X3**2)              # bell-shaped bump around X3 = 0
interaction = 1.5 * X1 * X2 * np.sin(X3)        # three-way interaction
step = 4 * np.where(X2 > 0, 1, -1)              # +4 when X2 > 0, otherwise -4
logistic = 5 * (1 / (1 + np.exp(-X3)))          # sigmoid of X3 scaled to (0, 5)
product_wave = 3 * np.sin(X1 * X2)              # sine of the X1 * X2 product
threshold = 2 * np.maximum(X3, 0)**2            # quadratic only for X3 > 0
noise = np.random.normal(0, 0.5, n)             # Gaussian noise, std 0.5

# Terms are added left to right in a fixed order so the floating-point
# result is reproducible bit for bit.
y = (
    polynomial
    + sine_wave
    + cosine_wave
    + gaussian_bump
    + interaction
    + step
    + logistic
    + product_wave
    + threshold
    + noise
)

# Collect the features and the target in one table.
data = pd.DataFrame(dict(X1=X1, X2=X2, X3=X3, target=y))

# Persist the dataset without the index column.
data.to_csv('nonlinear_dataset.csv', index=False)

# Pearson correlations: linear association between every pair of columns.
pearson_matrix = data.corr()
print("\nCorrelation matrix:")
print(pearson_matrix)

# Rank-based correlation of each column with the target.
spearman_with_target = data.corr(method='spearman')['target']
print("\nSpearman correlation (captures monotonic relationships):")
print(spearman_with_target)

# Per-column descriptive statistics.
summary = data.describe()
print("\nSummary statistics:")
print(summary)