In [2]:
import math
import zlib

import numpy as np
import pandas as pd

In [7]:
def generate_variable_data(name, var_type, n, seed):
    """Draw ``n`` samples for one independent variable.

    The global NumPy random state is re-seeded with ``seed`` before anything
    else happens, so the draw is repeatable for a given ``seed`` and the
    global stream is left in a known state afterwards (also when the type is
    rejected).

    Args:
        name: Variable name; only used in the error message.
        var_type: ``'continuous'`` for standard-normal samples or
            ``'discrete'`` for values picked uniformly from ``{0, 1}``.
        n: Number of samples to draw.
        seed: Seed passed to ``np.random.seed``.

    Returns:
        A NumPy array holding the ``n`` samples.

    Raises:
        ValueError: If ``var_type`` is neither ``'continuous'`` nor
            ``'discrete'``.
    """
    np.random.seed(seed)

    wants_normal = var_type == 'continuous'
    if wants_normal:
        return np.random.normal(loc=0, scale=1, size=n)

    wants_binary = var_type == 'discrete'
    if wants_binary:
        return np.random.choice([0, 1], size=n)

    raise ValueError(f"Unsupported type for variable {name}: {var_type}")
    
def apply_dependency(inputs, dep_type, noise_std=0.1):
    """Combine input variables into one noisy dependent variable.

    All arithmetic is done in float64. Deriving the accumulator dtype from
    the first input (as ``np.zeros_like`` / ``np.ones_like`` would) makes it
    an integer array whenever that input is a discrete 0/1 variable, and the
    in-place ``+=`` / ``*=`` with float terms then either fails with a
    casting error or drops the fractional part.

    Args:
        inputs: Sequence of equal-length array-likes (NumPy arrays or pandas
            Series). ``'conditional'`` needs at least two entries.
        dep_type: How the inputs are combined:
            * ``'linear'``: weighted sum, the i-th input (0-based) has
              weight ``i + 1``.
            * ``'nonlinear'``: sum of ``sin(x) + log(|x| + 1)`` over inputs.
            * ``'multiplicative'``: element-wise product of all inputs.
            * ``'conditional'``: where ``inputs[0] > 0`` take ``inputs[1]``
              plus Gaussian noise (std ``noise_std``), elsewhere a
              standard-normal draw.
        noise_std: Standard deviation of the Gaussian noise added to the
            combined result (and used inside the ``'conditional'`` branch).

    Returns:
        A float NumPy array with the combined values plus noise.

    Raises:
        ValueError: If ``dep_type`` is not one of the supported kinds.
        IndexError: If ``inputs`` is empty, or has fewer than two entries
            for ``'conditional'``.
    """
    # Normalise every input to a float ndarray up front so integer-coded
    # (discrete) inputs can never dictate the dtype of the result.
    arrays = [np.asarray(inp, dtype=float) for inp in inputs]
    first = arrays[0]

    if dep_type == 'linear':
        data = np.zeros(first.shape, dtype=float)
        for i, arr in enumerate(arrays):
            data += (i + 1) * arr
    elif dep_type == 'nonlinear':
        data = np.zeros(first.shape, dtype=float)
        for arr in arrays:
            data += np.sin(arr) + np.log(np.abs(arr) + 1)
    elif dep_type == 'multiplicative':
        data = np.ones(first.shape, dtype=float)
        for arr in arrays:
            data *= arr
    elif dep_type == 'conditional':
        cond = first
        base = arrays[1]
        # Both branches are evaluated for every element; the order of the two
        # random draws is kept so seeded runs stay reproducible.
        data = np.where(cond > 0, base + np.random.normal(0, noise_std, len(cond)),
                        np.random.normal(0, 1, len(cond)))
    else:
        raise ValueError(f"Unsupported dependency type: {dep_type}")

    return data + np.random.normal(0, noise_std, len(data))

def build_synthetic_data(var: dict, dependencies: dict, n_samples=1_000_000, seed=42):
    """Generate a DataFrame of independent and dependent synthetic variables.

    Variables listed in ``var`` but not in ``dependencies`` are sampled
    directly; every variable in ``dependencies`` is derived from previously
    generated columns via ``apply_dependency``.

    Args:
        var: Maps each variable name to ``'continuous'`` or ``'discrete'``.
        dependencies: Maps a dependent variable name to a dict with keys
            ``'depends_on'`` (list of input variable names) and ``'type'``
            (dependency kind understood by ``apply_dependency``).
        n_samples: Number of rows to generate.
        seed: Base seed; each independent variable gets ``seed`` plus a
            stable per-name offset in ``[0, 1000)``.

    Returns:
        A ``pandas.DataFrame`` with one column per generated variable:
        independent variables first (in ``var`` order), then dependent ones
        in the order they could be resolved.

    Raises:
        KeyError: If a dependent variable can never be resolved because an
            input is missing or the dependencies are circular, or if a
            dependent variable is not declared in ``var``.
    """
    np.random.seed(seed)
    df = pd.DataFrame()

    # Independent variables.
    for var_name, var_type in var.items():
        if var_name not in dependencies:
            # The built-in hash() of a str is salted per interpreter process,
            # which made the per-variable seed (and the whole dataset) differ
            # between kernel restarts. CRC32 of the name is stable.
            name_offset = zlib.crc32(var_name.encode("utf-8")) % 1000
            df[var_name] = generate_variable_data(var_name, var_type, n_samples, seed + name_offset)

    # Dependent variables. Entries whose inputs are not available yet are
    # deferred to a later pass, so the dict does not have to be written in
    # dependency order. Specs that are already ordered are processed in
    # exactly the given order in the first pass.
    pending = list(dependencies)
    while pending:
        deferred = []
        for var_name in pending:
            spec = dependencies[var_name]
            if any(source not in df.columns for source in spec['depends_on']):
                deferred.append(var_name)
                continue

            inputs = [df[source] for source in spec['depends_on']]
            raw_output = apply_dependency(inputs, spec['type'])

            if var[var_name] == 'discrete':
                # Binarise at the median so the two classes are balanced.
                df[var_name] = (raw_output > np.median(raw_output)).astype(int)
            else:
                df[var_name] = raw_output

        if len(deferred) == len(pending):
            # No progress in a full pass: unknown input or a cycle.
            raise KeyError(
                f"Cannot resolve dependencies for {deferred}: "
                "inputs are missing or circular"
            )
        pending = deferred

    return df

In [23]:
# Kind of every variable in the synthetic dataset: X* are sampled
# independently, Y* are derived from other columns (see `dependencies`).
var = dict(
    X1='continuous',
    X2='discrete',
    X3='continuous',
    X4='discrete',
    Y1='continuous',
    Y2='discrete',
    Y3='continuous',
    Y4='discrete',
)

# For each derived variable: which columns feed it and how they are combined.
# Listed so that every input already exists when its dependent is built.
dependencies = dict(
    Y1=dict(depends_on=['X1', 'X2'], type='nonlinear'),
    Y2=dict(depends_on=['X3', 'Y1'], type='conditional'),
    Y3=dict(depends_on=['Y1', 'Y2'], type='multiplicative'),
    Y4=dict(depends_on=['Y2', 'X4'], type='linear'),
)

# Build the 5M-row dataset with the default seed.
df = build_synthetic_data(var=var, dependencies=dependencies, n_samples=5_000_000)

# Persist as Parquet (without the index) for efficient downstream processing.
df.to_parquet(path="synthetic_data.parquet", index=False)

print("✅ Dataset created with shape:", df.shape)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/Aaqib/Library/Python/3.11/lib/python/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/homebrew/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/Aaqib/Library/Python/3.11/lib/python/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/Users/A

AttributeError: _ARRAY_API not found

✅ Dataset created with shape: (5000000, 8)


In [24]:
# Sanity check: reload the Parquet file written by the previous cell and show
# the number of non-null values per column. The path must match the one passed
# to `df.to_parquet` ("synthetic_data.parquet"); no cell in this notebook
# writes 'synthetic_data_new.parquet', so reading that name depended on a
# stale file.
pd.read_parquet('synthetic_data.parquet').count()


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/Aaqib/Library/Python/3.11/lib/python/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/homebrew/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/Aaqib/Library/Python/3.11/lib/python/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/Users/A

AttributeError: _ARRAY_API not found

X1    5000000
X2    5000000
X3    5000000
X4    5000000
Y1    5000000
Y2    5000000
Y3    5000000
Y4    5000000
dtype: int64