In [None]:
# Papermill parameters cell.
# The assignments below are only defaults: when the notebook is executed
# through Papermill, injected values take precedence over them.
output_dir: str = 'outputs'    # root directory for everything this notebook writes
dataset: str = 'synthetic'     # identifier of the dataset to load
random_state: int = 42         # seed shared by data generation and the train/test split
test_size: float = 0.2         # value passed to train_test_split as test_size

In [None]:
import json
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Echo the effective run parameters so they are visible in the executed
# notebook's output (one line per parameter).
print(
    "Loading data with parameters:",
    f"  - dataset: {dataset}",
    f"  - test_size: {test_size}",
    f"  - random_state: {random_state}",
    sep="\n",
)

In [None]:
# Generate synthetic regression data
#
# This cell always builds its data with make_regression; the `dataset`
# parameter does not select a different source. Warn when a run was
# parameterized with another value so the mismatch is visible in the executed
# notebook instead of being silently ignored.
if dataset != 'synthetic':
    warnings.warn(
        f"dataset={dataset!r} is not supported by this notebook; "
        "generating synthetic regression data instead."
    )

n_samples = 500
n_features = 10

# random_state makes the generated data reproducible across runs;
# noise is the standard deviation of the Gaussian noise added to the target.
X, y = make_regression(
    n_samples=n_samples,
    n_features=n_features,
    noise=20,
    random_state=random_state
)

# Create DataFrame: one 'feature_<i>' column per generated feature, plus the
# regression target in a 'target' column.
feature_names = [f'feature_{i}' for i in range(n_features)]
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

# Quick look at the generated data: shape, first rows, summary statistics.
print(f"\nDataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nBasic statistics:")
print(df.describe())

In [None]:
# Separate the target column from the feature columns, then hold out a test
# set. test_size and random_state come from the notebook parameters.
y = df['target']
X = df.drop(columns='target')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

# Report the shape of each of the four resulting pieces.
print(
    f"Train set shape: {X_train.shape}",
    f"Test set shape: {X_test.shape}",
    f"Train target shape: {y_train.shape}",
    f"Test target shape: {y_test.shape}",
    sep="\n",
)

In [None]:
# Save data for next steps
#
# Everything is written under <output_dir>/data. os.makedirs creates missing
# intermediate directories, so a single call also creates output_dir itself.
data_dir = os.path.join(output_dir, 'data')
os.makedirs(data_dir, exist_ok=True)

# Save as CSV (row index omitted). The target files are written with an
# explicit 'target' column header.
X_train.to_csv(os.path.join(data_dir, 'X_train.csv'), index=False)
X_test.to_csv(os.path.join(data_dir, 'X_test.csv'), index=False)
y_train.to_csv(os.path.join(data_dir, 'y_train.csv'), index=False, header=['target'])
y_test.to_csv(os.path.join(data_dir, 'y_test.csv'), index=False, header=['target'])

# Save metadata: dataset dimensions plus every run parameter, so the saved
# files can be traced back to the configuration that produced them.
metadata = {
    'n_samples': n_samples,
    'n_features': n_features,
    'n_train': len(X_train),
    'n_test': len(X_test),
    'feature_names': feature_names,
    'test_size': test_size,
    'random_state': random_state,
    'dataset': dataset
}

# Explicit encoding keeps the JSON file independent of the platform default.
with open(os.path.join(data_dir, 'metadata.json'), 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

print(f"\nData saved to {data_dir}")
print("Files created:")
print("  - X_train.csv")
print("  - X_test.csv")
print("  - y_train.csv")
print("  - y_test.csv")
print("  - metadata.json")