In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global generator so every dataset produced below is reproducible.
np.random.seed(42)

# NOTE: the target y is constructed later and is the same function of X for every dataset.

# Dataset dimensions shared by all toy datasets
n_obs = 10000  # rows (observations) per dataset
n_features = 10  # columns (features) per dataset

# 1) Features drawn independently from a standard normal distribution
normal_data = np.random.normal(loc=0, scale=1, size=(n_obs, n_features))
df_normal = pd.DataFrame(
    normal_data,
    columns=[f"feature_{idx}" for idx in range(1, n_features + 1)],  # feature_1 ... feature_<n_features>
)
df_normal.to_csv("../toy_datasets/dataset_normal.csv", index=False)
print("Normal dataset created: dataset_normal.csv")

# 2) Extremely multimodal features
def generate_multimodal(n, modes, p):
    """Generate data from an equal-weight mixture of normal distributions.

    Every feature (column) is sampled independently: each observation first
    picks one of the mixture components uniformly at random, then draws a
    value from that component's normal distribution.

    Parameters
    ----------
    n : int
        Number of observations (rows).
    modes : sequence of (mean, std) tuples
        Parameters of the mixture components.
    p : int
        Number of features (columns).

    Returns
    -------
    numpy.ndarray
        Float array of shape (n, p).

    Notes
    -----
    Uses NumPy's global random state (``np.random``), so results depend on
    ``np.random.seed``. Draws happen column by column, component choices
    before values, so the global generator is consumed in a fixed order.
    """
    # Component parameters as arrays so they can be gathered per observation.
    mode_means = np.array([mode[0] for mode in modes], dtype=float)
    mode_stds = np.array([mode[1] for mode in modes], dtype=float)

    data = np.zeros((n, p))
    for i in range(p):
        # Randomly choose a mode for each observation
        mode_choices = np.random.choice(len(modes), size=n)
        # One vectorized draw per column: element k uses the parameters of the
        # mode chosen for observation k (replaces n scalar Python-level calls).
        data[:, i] = np.random.normal(loc=mode_means[mode_choices],
                                      scale=mode_stds[mode_choices])
    return data

# Mixture components as (mean, std): four unit-variance modes centred at
# -10, -3, 3 and 10 give each feature an "extremely multimodal" distribution.
modes = [(-10, 1), (-3, 1), (3, 1), (10, 1)]
multimodal_data = generate_multimodal(n=n_obs, modes=modes, p=n_features)
df_multimodal = pd.DataFrame(
    multimodal_data,
    columns=[f"feature_{idx}" for idx in range(1, n_features + 1)],  # feature_1 ... feature_<n_features>
)
df_multimodal.to_csv("../toy_datasets/dataset_multimodal.csv", index=False)
print("Multimodal dataset created: dataset_multimodal.csv")

# 3) Outlier-contaminated features: a small share of each column holds huge values
# The base data is standard normal...
skewed_data = np.random.normal(loc=0, scale=1, size=(n_obs, n_features))
# ...and in every feature, 1% of the rows (rounded down, but never fewer than one)
# are blown up into outliers.
n_outliers = max(1, int(n_obs * 0.01))
for col in range(n_features):
    # Pick distinct rows for this feature; each feature gets its own random subset
    outlier_indices = np.random.choice(n_obs, n_outliers, replace=False)
    # Scale the selected values by a factor of 100 (their sign is kept)
    skewed_data[outlier_indices, col] = 100 * skewed_data[outlier_indices, col]
df_skewed = pd.DataFrame(
    skewed_data,
    columns=[f"feature_{idx}" for idx in range(1, n_features + 1)],  # feature_1 ... feature_<n_features>
)
df_skewed.to_csv("../toy_datasets/dataset_skewed.csv", index=False)
print("Skewed dataset created: dataset_skewed.csv")

# 4) Uniformly distributed features
# Every value is drawn independently from the uniform distribution on [-10, 10).
uniform_data = np.random.uniform(low=-10, high=10, size=(n_obs, n_features))
df_uniform = pd.DataFrame(uniform_data, columns=[f"feature_{i+1}" for i in range(n_features)])
# Save alongside the normal, multimodal and skewed datasets in ../toy_datasets/
# (previously this one file was written to the current working directory).
df_uniform.to_csv("../toy_datasets/dataset_uniform.csv", index=False)
print("Uniform dataset created: dataset_uniform.csv")


Normal dataset created: dataset_normal.csv
Multimodal dataset created: dataset_multimodal.csv
Skewed dataset created: dataset_skewed.csv
Uniform dataset created: dataset_uniform.csv


In [None]:
from lightgbm import LGBMRegressor
from src import DataBinner
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint, loguniform, ttest_rel
from sklearn.pipeline import make_pipeline


# Search space for the randomized hyper-parameter search over LightGBM.
# Keys carry the "lgbmregressor__" prefix so they address the LGBMRegressor
# step of a pipeline built with make_pipeline.
# NOTE(review): LightGBM documents that `subsample` only takes effect when
# `subsample_freq` > 0 - confirm whether bagging is meant to be active here.
param_dist_lgbm = dict(
    lgbmregressor__n_estimators=randint(20, 150),          # integers in [20, 150)
    lgbmregressor__learning_rate=loguniform(0.001, 0.5),   # log-uniform on [0.001, 0.5]
    lgbmregressor__num_leaves=randint(8, 64),              # integers in [8, 64)
    lgbmregressor__subsample=uniform(0.5, 0.5),            # uniform on [0.5, 1.0] (loc=0.5, scale=0.5)
    lgbmregressor__colsample_bytree=uniform(0.5, 0.5),     # uniform on [0.5, 1.0] (loc=0.5, scale=0.5)
)

# Binning strategies under comparison, in evaluation order; "quantile" is the baseline.
BINNING_METHODS = ("quantile", "kmeans", "linspace")
N_BINS = 255          # bins requested from DataBinner for every method
N_SEARCH_ITER = 30    # hyper-parameter candidates per randomized search
N_SPLITS = 10         # number of train/test splits (random_state 0..N_SPLITS-1)


def _binned_lgbm_test_mse(method, X_train, X_test, y_train, y_test, random_state):
    """Tune a DataBinner + LightGBM pipeline and return its test-set MSE.

    Parameters
    ----------
    method : str
        Value passed as ``DataBinner(method=...)``.
    X_train, X_test, y_train, y_test : array-like
        One train/test split of the data.
    random_state : int
        Seed for the randomized hyper-parameter search.

    Returns
    -------
    float
        Mean squared error of the refitted best pipeline on the test split.
    """
    pipeline = make_pipeline(
        DataBinner(method=method, n_bins=N_BINS),
        LGBMRegressor(verbosity=-1, n_jobs=1, random_state=42),
    )
    search = RandomizedSearchCV(
        pipeline,
        param_distributions=param_dist_lgbm,
        n_iter=N_SEARCH_ITER,
        cv=5,
        n_jobs=-1,
        random_state=random_state,
    )
    search.fit(X_train, y_train)
    return mean_squared_error(y_test, search.predict(X_test))


for dataset_name, dataset in zip(["normal", "multimodal", "skewed", "uniform"],
                                 [df_normal, df_multimodal, df_skewed, df_uniform]):
    print(f"---- {dataset_name} dataset ----")
    X = np.array(dataset)
    # Target: sum of all features plus standard-normal noise (same rule for every dataset).
    # The noise length follows X itself rather than the global n_obs.
    y = np.sum(X, axis=1) + np.random.normal(loc=0, scale=1, size=X.shape[0])

    # One list of test MSEs per method; the individual names are kept so the
    # per-method results stay available after the loop.
    errors_quantile, errors_kmeans, errors_linspace = [], [], []
    errors_by_method = {
        "quantile": errors_quantile,
        "kmeans": errors_kmeans,
        "linspace": errors_linspace,
    }

    for random_state in range(N_SPLITS):
        print(f"Random state: {random_state}")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
        # All methods see the same split and the same search seed, so the
        # per-split errors are paired across methods.
        for method in BINNING_METHODS:
            errors_by_method[method].append(
                _binned_lgbm_test_mse(method, X_train, X_test, y_train, y_test, random_state)
            )

    for method in BINNING_METHODS:
        print(f"Mean squared error ({method}): {np.mean(errors_by_method[method])}")

    # Compare each alternative against the quantile baseline: relative MSE
    # reduction (positive = better than quantile) and a paired t-test over splits.
    baseline_mse = np.mean(errors_quantile)
    for method in ("kmeans", "linspace"):
        method_errors = errors_by_method[method]
        print(f"Relative {method} improvement: {100*(baseline_mse - np.mean(method_errors))/baseline_mse}%")
        print(f"p-value {method}: {ttest_rel(errors_quantile, method_errors).pvalue}")



---- normal dataset ----
Random state: 0




house\_sales & *** & *** & *** & *** &  &  \\
sulfur &  &  &  &  &  &  \\
medical\_charges & *** & *** & *** & *** & *** & *** \\
MiamiHousing2016 &  &  & * &  &  &  \\
