# Datasets Analysis Tables

In [1]:
import numpy as np
import pandas as pd
import os

from configs.config import DATASETS
import openml

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, QuantileTransformer

In [None]:
# Build one summary row per OpenML task registered under this source key.
source = "openml_ctr23"
table_data = []

datasets_to_run = DATASETS.get(source, {})

for dataset_key, dataset_info_dict in datasets_to_run.items():
    # Fall back to the task id when the config entry carries no display name.
    dataset_name = dataset_info_dict.get('name', dataset_key)

    # OpenML usage reference: https://docs.openml.org/intro/
    task = openml.tasks.get_task(int(dataset_key))
    dataset = task.get_dataset()
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=task.target_name
    )

    # Split sizes are taken from fold 0 of the task's predefined split.
    train_indices, test_indices = task.get_train_test_split_indices(fold=0)
    train_size, test_size = len(train_indices), len(test_indices)

    num_input_features = X.shape[1]
    num_output_features = 1 if y.ndim == 1 else y.shape[1]

    # categorical_indicator holds one boolean per input column.
    num_categorical_features = sum(categorical_indicator)

    # Missing-value statistics over the full feature matrix.
    missing_values_per_feature = X.isnull().sum()
    num_input_features_with_missing = (missing_values_per_feature > 0).sum()
    total_missing_cells_in_X = missing_values_per_feature.sum()

    total_cells_in_X = X.size
    if total_cells_in_X > 0:
        percent_missing_overall = (total_missing_cells_in_X / total_cells_in_X) * 100
    else:
        # Empty feature matrix: report 0 rather than dividing by zero.
        percent_missing_overall = 0

    summary_row = {
        "ID": int(dataset_key),
        "Name": dataset_name,
        "Train Size": train_size,
        "Test Size": test_size,
        "Input Features": num_input_features,
        "Output Features": num_output_features,
        "Categorical Input Features": num_categorical_features,
        "Input Features with Missing Values": num_input_features_with_missing,
        "% Missing Values": f"{percent_missing_overall:.2f}%",
    }
    table_data.append(summary_row)

df_openml = pd.DataFrame(table_data)
# Optional export: df_openml.to_excel("openml_benchmark.xlsx", index=False)
print(df_openml.to_string())
    

        ID                           Name  Train Size  Test Size  Input Features  Output Features  Categorical Input Features  Input Features with Missing Values % Missing Values
0   361251                 grid_stability        9000       1000              12                1                           0                                   0            0.00%
1   361252              video_transcoding       61905       6879              18                1                           2                                   0            0.00%
2   361253                    wave_energy       64800       7200              48                1                           0                                   0            0.00%
3   361254                         sarcos       44039       4894              21                1                           0                                   0            0.00%
4   361255             california_housing       18576       2064               8                1        

In [None]:
def load_uci_data_segment(filepath_data,
                          filepath_index_columns,
                          filepath_index_rows,
                          data_delimiter=None,
                          index_columns_delimiter=None,
                          index_rows_delimiter=None):
    """
    Load a row/column subset of a UCI data matrix.

    The data file is read as a numeric matrix; the two index files list the
    zero-based positions of the rows and columns to keep.
    Mimics the behavior of the provided UCIDataSet._load method.

    Args:
        filepath_data: Path to the text file holding the full data matrix.
        filepath_index_columns: Path to the text file of column positions.
        filepath_index_rows: Path to the text file of row positions.
        data_delimiter: Delimiter passed to ``np.loadtxt`` for the data file
            (``None`` means any whitespace).
        index_columns_delimiter: Delimiter for the column index file.
        index_rows_delimiter: Delimiter for the row index file.

    Returns:
        A ``pd.DataFrame`` with the selected rows and columns, in the order
        given by the index files.
    """
    # ndmin=2 keeps the matrix two-dimensional even when the file has a single
    # row (or a single column); otherwise np.loadtxt would return a 1-D array
    # and the DataFrame would treat a lone row as a column.
    data_full = np.loadtxt(filepath_data, delimiter=data_delimiter, ndmin=2)
    df_full = pd.DataFrame(data_full)

    # ndmin=1 guarantees a 1-D index array even if the file holds one value.
    index_columns = np.loadtxt(filepath_index_columns, dtype=np.int32,
                               delimiter=index_columns_delimiter, ndmin=1)
    index_columns = index_columns.reshape(-1)

    index_rows = np.loadtxt(filepath_index_rows, dtype=np.int32,
                            delimiter=index_rows_delimiter, ndmin=1)
    index_rows = index_rows.reshape(-1)

    # Positional selection of the requested rows and columns.
    return df_full.iloc[index_rows, index_columns]

def get_uci_scalers():
    """
    Build fresh, unfitted scalers for UCI features and targets.

    Returns a ``(feature_scaler, target_scaler)`` tuple, following the setup
    described as coming from ResFlowDataModule:
      * features: quantile transform to a normal distribution, then
        standardization;
      * target: min-max scaling into [-1, 1].
    """
    target_scaler = MinMaxScaler(feature_range=(-1, 1))

    feature_steps = [
        ("quantile", QuantileTransformer(output_distribution="normal")),
        ("standarize", StandardScaler()),
    ]
    feature_scaler = Pipeline(feature_steps)

    return feature_scaler, target_scaler

def fit_and_transform_data(X_train_raw: pd.DataFrame,
                             y_train_raw: pd.DataFrame,
                             X_test_raw: pd.DataFrame,
                             y_test_raw: pd.DataFrame):
    """
    Scale a train/test split with scalers fitted on the training part only.

    The scalers come from ``get_uci_scalers()``. Both are fitted on the
    training arrays and then applied to the training and test arrays.

    Returns:
        A 6-tuple ``(X_train, y_train, X_test, y_test, feature_scaler,
        target_scaler)`` where the first four entries are the transformed
        arrays and the last two are the fitted scalers.
    """
    def _as_column(values):
        # The scalers expect 2-D input; promote a flat target to one column.
        return values.reshape(-1, 1) if values.ndim == 1 else values

    feature_scaler, target_scaler = get_uci_scalers()

    # scikit-learn operates on plain NumPy arrays.
    features_train = X_train_raw.to_numpy()
    targets_train = _as_column(y_train_raw.to_numpy())
    features_test = X_test_raw.to_numpy()
    targets_test = _as_column(y_test_raw.to_numpy())

    # Fit on training data only so no test information reaches the scalers.
    feature_scaler.fit(features_train)
    target_scaler.fit(targets_train)

    X_train_processed = feature_scaler.transform(features_train)
    y_train_processed = target_scaler.transform(targets_train)
    X_test_processed = feature_scaler.transform(features_test)
    y_test_processed = target_scaler.transform(targets_test)

    return (
        X_train_processed,
        y_train_processed,
        X_test_processed,
        y_test_processed,
        feature_scaler,
        target_scaler,
    )

In [13]:
# Build one summary row per UCI dataset, using the pre-split index files of a
# single fold stored under downloaded_datasets/UCI/<dataset_key>/.
source = "uci"
table_data = []

datasets_to_run = DATASETS.get(source, {})
fold = 0

for dataset_key, dataset_info_dict in datasets_to_run.items():

    # Fall back to the config key when the entry carries no display name.
    dataset_name = dataset_info_dict.get('name', dataset_key)

    current_dataset_path = os.path.join("downloaded_datasets/UCI", dataset_key)

    # Define file paths
    fp_data = os.path.join(current_dataset_path, "data.txt")
    fp_index_features = os.path.join(current_dataset_path, "index_features.txt")
    fp_index_target = os.path.join(current_dataset_path, "index_target.txt")
    fp_index_train_rows = os.path.join(current_dataset_path, f"index_train_{fold}.txt")
    fp_index_test_rows = os.path.join(current_dataset_path, f"index_test_{fold}.txt")

    required_files_info = {
        "Data File": fp_data, "Feature Index": fp_index_features,
        "Target Index": fp_index_target, "Train Row Index": fp_index_train_rows,
        "Test Row Index": fp_index_test_rows
    }

    # Report every missing file, then skip this dataset instead of letting the
    # loader fail and abort the remaining datasets.
    missing_files = {
        name: path
        for name, path in required_files_info.items()
        if not os.path.exists(path)
    }
    if missing_files:
        for name, path in missing_files.items():
            print(f"  ERROR: {name} not found at {path}")
        continue

    # Load raw data segments (features/target x train/test rows)
    x_train_raw = load_uci_data_segment(fp_data, fp_index_features, fp_index_train_rows)
    y_train_raw = load_uci_data_segment(fp_data, fp_index_target, fp_index_train_rows)
    x_test_raw = load_uci_data_segment(fp_data, fp_index_features, fp_index_test_rows)
    y_test_raw = load_uci_data_segment(fp_data, fp_index_target, fp_index_test_rows)

    # Get characteristics from RAW loaded data
    train_size = x_train_raw.shape[0]
    test_size = x_test_raw.shape[0]
    num_input_features = x_train_raw.shape[1]

    num_output_features = 1 if y_train_raw.ndim == 1 else y_train_raw.shape[1]

    # NOTE(review): the frames are built from a numeric np.loadtxt array, so
    # this dtype-based count is expected to be 0 for every dataset - confirm
    # whether categorical columns are encoded numerically in data.txt.
    num_categorical = x_train_raw.select_dtypes(include=['object', 'category']).shape[1]

    # We combine X_train and X_test for this fold to get a view of missingness in the features
    x_combined_fold_raw = pd.concat([x_train_raw, x_test_raw], axis=0, ignore_index=True)

    missing_per_feature_combined = x_combined_fold_raw.isnull().sum()
    num_features_with_any_missing = (missing_per_feature_combined > 0).sum()
    total_missing_values_combined = missing_per_feature_combined.sum()

    # Total number of cells in the combined X data
    total_cells_combined = x_combined_fold_raw.size
    if total_cells_combined > 0:
        percent_missing_combined = (total_missing_values_combined / total_cells_combined) * 100
    else:
        # Empty feature matrix: report 0 rather than dividing by zero.
        percent_missing_combined = 0

    table_data.append({
        "ID": dataset_key,
        "Name": dataset_name,
        "Train Size": train_size,
        "Test Size": test_size,
        "Input Features": num_input_features,
        "Output Features": num_output_features,
        "Categorical Input Features": num_categorical,
        "Input Features w/ Missing": num_features_with_any_missing,
        "% Missing": f"{percent_missing_combined:.2f}%"
    })


df_uci = pd.DataFrame(table_data)


print(df_uci.to_string())
# Optional export: df_uci.to_excel("uci_benchmark.xlsx", index=False)
print("\nPreprocessing steps identified from ResFlowDataModule (applied after loading):")
print("  Features: 1. QuantileTransformer(output_distribution='normal') -> 2. StandardScaler()")
print("  Target:   1. MinMaxScaler(feature_range=(-1, 1))")

                           ID                                                      Name  Train Size  Test Size  Input Features  Output Features  Categorical Input Features  Input Features w/ Missing % Missing
0                    concrete                             Concrete Compressive Strength         927        103               8                1                           0                          0     0.00%
1                      energy                                         Energy Efficiency         691         77               8                1                           0                          0     0.00%
2                      kin8nm                                            Kinematics 8nm        7373        819               8                1                           0                          0     0.00%
3      naval-propulsion-plant    Condition Based Maintenance of Naval Propulsion Plants       10741       1193              16                1                     

# Final Table

In [17]:
# Print both summary tables, each under a dashed banner line.
banner_rule = "-" * 50

print(banner_rule, "UCI", banner_rule)
print(df_uci.to_string())

print("\n\n", banner_rule, "OpenML CTR 23", banner_rule)
print(df_openml.to_string())

-------------------------------------------------- UCI --------------------------------------------------
                           ID                                                      Name  Train Size  Test Size  Input Features  Output Features  Categorical Input Features  Input Features w/ Missing % Missing
0                    concrete                             Concrete Compressive Strength         927        103               8                1                           0                          0     0.00%
1                      energy                                         Energy Efficiency         691         77               8                1                           0                          0     0.00%
2                      kin8nm                                            Kinematics 8nm        7373        819               8                1                           0                          0     0.00%
3      naval-propulsion-plant    Condition Based Maintenan

In [19]:
from utils.data_loader import load_preprocessed_data
from configs.config import DATASETS

# Smoke-test the preprocessed DataLoaders for every dataset of one source.
source = "uci"
print_info = True
train_model_flag = True

# For looping through all datasets in the source
datasets_to_run = DATASETS.get(source, {})

# Set to True to restrict the run to a single dataset for a quick check.
test = False
if test:
    # The UCI key for Wine Quality is "wine-quality-red" (see the loader's
    # log output); the previous key "wine" never matched any dataset.
    test_dataset_key = "wine-quality-red"
    test_dataset_info = datasets_to_run.get(test_dataset_key)
    if test_dataset_info:
        datasets_to_run = {test_dataset_key: test_dataset_info}
    else:
        print("Could not find a default dataset for testing. Please check DATASETS structure.")
        datasets_to_run = {}

for dataset_key, dataset_info_dict in datasets_to_run.items():
    dataset_display_name = dataset_info_dict.get('name', dataset_key)
    print(f"\n--- Processing dataset: {dataset_key} ({dataset_display_name}) ---")

    train_loader, val_loader, test_loader, dataset_name = \
            load_preprocessed_data(source, dataset_key, batch_size=64)

    # Assumes the loader wraps a tensor dataset whose first tensor is the
    # feature matrix (rows = samples, columns = features).
    num_numerical_features = train_loader.dataset.tensors[0].shape[1]
    print(f"Number of numerical features: {num_numerical_features}")
    print(f"Train loader batches: {len(train_loader)}, Val loader batches: {len(val_loader)}, Test loader batches: {len(test_loader)}")


22:28:29 - INFO: fetching Concrete Compressive Strength (concrete) locally.
22:28:29 - INFO: pre-processing concrete with feature/taget scaling of [-1,1].
22:28:29 - INFO: Data split for concrete: Train X: (741, 8), Train Y: (741, 1), Validation X: (186, 8), Validation Y: (186, 1)
22:28:29 - INFO: PyTorch DataLoaders created for UCI dataset concrete.
22:28:29 - INFO: fetching Energy Efficiency (energy) locally.
22:28:29 - INFO: pre-processing energy with feature/taget scaling of [-1,1].
22:28:29 - INFO: Data split for energy: Train X: (552, 8), Train Y: (552, 1), Validation X: (139, 8), Validation Y: (139, 1)
22:28:29 - INFO: PyTorch DataLoaders created for UCI dataset energy.
22:28:29 - INFO: fetching Kinematics 8nm (kin8nm) locally.
22:28:29 - INFO: pre-processing kin8nm with feature/taget scaling of [-1,1].



--- Processing dataset: concrete (Concrete Compressive Strength) ---
Number of numerical features: 8
Train loader batches: 12, Val loader batches: 3, Test loader batches: 2

--- Processing dataset: energy (Energy Efficiency) ---
Number of numerical features: 8
Train loader batches: 9, Val loader batches: 3, Test loader batches: 2

--- Processing dataset: kin8nm (Kinematics 8nm) ---


22:28:30 - INFO: Data split for kin8nm: Train X: (5898, 8), Train Y: (5898, 1), Validation X: (1475, 8), Validation Y: (1475, 1)
22:28:30 - INFO: PyTorch DataLoaders created for UCI dataset kin8nm.
22:28:30 - INFO: fetching Condition Based Maintenance of Naval Propulsion Plants (naval-propulsion-plant) locally.
22:28:30 - INFO: pre-processing naval-propulsion-plant with feature/taget scaling of [-1,1].


Number of numerical features: 8
Train loader batches: 93, Val loader batches: 24, Test loader batches: 13

--- Processing dataset: naval-propulsion-plant (Condition Based Maintenance of Naval Propulsion Plants) ---


22:28:30 - INFO: Data split for naval-propulsion-plant: Train X: (8592, 16), Train Y: (8592, 1), Validation X: (2149, 16), Validation Y: (2149, 1)
22:28:30 - INFO: PyTorch DataLoaders created for UCI dataset naval-propulsion-plant.
22:28:30 - INFO: fetching Combined Cycle Power Plant (power-plant) locally.
22:28:30 - INFO: pre-processing power-plant with feature/taget scaling of [-1,1].
22:28:30 - INFO: Data split for power-plant: Train X: (6888, 4), Train Y: (6888, 1), Validation X: (1723, 4), Validation Y: (1723, 1)
22:28:30 - INFO: PyTorch DataLoaders created for UCI dataset power-plant.
22:28:30 - INFO: fetching Physicochemical Properties of Protein Tertiary Structure (protein-tertiary-structure) locally.
22:28:30 - INFO: pre-processing protein-tertiary-structure with feature/taget scaling of [-1,1].


Number of numerical features: 16
Train loader batches: 135, Val loader batches: 34, Test loader batches: 19

--- Processing dataset: power-plant (Combined Cycle Power Plant) ---
Number of numerical features: 4
Train loader batches: 108, Val loader batches: 27, Test loader batches: 15

--- Processing dataset: protein-tertiary-structure (Physicochemical Properties of Protein Tertiary Structure) ---


22:28:31 - INFO: Data split for protein-tertiary-structure: Train X: (32925, 9), Train Y: (32925, 1), Validation X: (8232, 9), Validation Y: (8232, 1)
22:28:31 - INFO: PyTorch DataLoaders created for UCI dataset protein-tertiary-structure.
22:28:31 - INFO: fetching Wine Quality (wine-quality-red) locally.
22:28:31 - INFO: pre-processing wine-quality-red with feature/taget scaling of [-1,1].
22:28:31 - INFO: Data split for wine-quality-red: Train X: (1151, 11), Train Y: (1151, 1), Validation X: (288, 11), Validation Y: (288, 1)
22:28:31 - INFO: PyTorch DataLoaders created for UCI dataset wine-quality-red.
22:28:31 - INFO: fetching Yacht Hydrodynamics (yacht) locally.
22:28:31 - INFO: pre-processing yacht with feature/taget scaling of [-1,1].
22:28:31 - INFO: Data split for yacht: Train X: (221, 6), Train Y: (221, 1), Validation X: (56, 6), Validation Y: (56, 1)
22:28:31 - INFO: PyTorch DataLoaders created for UCI dataset yacht.


Number of numerical features: 9
Train loader batches: 515, Val loader batches: 129, Test loader batches: 72

--- Processing dataset: wine-quality-red (Wine Quality) ---
Number of numerical features: 11
Train loader batches: 18, Val loader batches: 5, Test loader batches: 3

--- Processing dataset: yacht (Yacht Hydrodynamics) ---
Number of numerical features: 6
Train loader batches: 4, Val loader batches: 1, Test loader batches: 1
