In [1]:
# standard import
import pandas as pd
import numpy as np

# sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils import resample

# miscilaneous models
import openml
from ucimlrepo import fetch_ucirepo
import os

# import warnings
# warnings.simplefilter(action="ignore", category=FutureWarning)
# warnings.filterwarnings("ignore", category=DeprecationWarning)

# Data


In [2]:
def process_data(X, y, cat_feat, num_feat):
    # one-hot encode categorical features
    X_processed = pd.get_dummies(X, columns=cat_feat, drop_first=True, dtype=int)

    # save categorical features after one-hot encoding
    cat_feat_dummy = X_processed.drop(columns=num_feat).columns.to_numpy()

    # get feature importance
    # random forest
    imp_model_rf = RandomForestRegressor(
        min_samples_leaf=5, max_features=0.33, n_estimators=100, random_state=777
    )
    feat_imp_rf = imp_model_rf.fit(X_processed, y).feature_importances_

    imp_df = pd.DataFrame(
        {
            "feature": X_processed.columns,
            "importance_rf": feat_imp_rf,
        }
    ).sort_values("importance_rf", ascending=False)

    X_binned = X_processed[num_feat].apply(
        lambda c: pd.qcut(c, q=4, duplicates="drop"), axis=0
    )
    X_binned = pd.concat([X_binned, X_processed[cat_feat_dummy]], axis=1)

    subgroup_dict = {
        "num_feat": num_feat,
        "cat_feat": cat_feat_dummy,
        "importance": imp_df,
        "binned_df": X_binned,
    }

    return X_processed, y, subgroup_dict

# Parkinsons

[Parkinsons Telemonitoring](https://archive.ics.uci.edu/dataset/189/parkinsons+telemonitoring)

#### Dataset Information:

This dataset is composed of a range of biomedical voice measurements from 42 people with early-stage Parkinson's disease recruited to a six-month trial of a telemonitoring device for remote symptom progression monitoring. The recordings were automatically captured in the patient's homes.

#### Introductory Paper:

[Accurate Telemonitoring of Parkinson's Disease Progression by Noninvasive Speech Tests](https://www.semanticscholar.org/paper/Accurate-Telemonitoring-of-Parkinson's-Disease-by-Tsanas-Little/1fdf33b6d8b1bdb38866ba824c1dcaecdfb6bdd6)

By A. Tsanas, Max A. Little, P. McSharry, L. Ramig, 2009


In [3]:
data = "data_parkinsons"
# fetch dataset
parkinsons_telemonitoring = fetch_ucirepo(id=189)

X_orig = parkinsons_telemonitoring.data.features.drop(columns="test_time")
y = parkinsons_telemonitoring.data.targets.total_UPDRS.to_numpy()

# save categorical and numerical features
cat_feat_parkinsons = ["sex"]
num_feat_parkinsons = X_orig.drop(columns=cat_feat_parkinsons).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y, cat_feat_parkinsons, num_feat_parkinsons)
bin_df = subgroups["binned_df"]

new_bin_df = bin_df.copy(deep=True)
new_bin_df["DFA"] = (X.DFA <= 0.68).astype(int)
subgroups["new_binned_df"] = new_bin_df

# if not os.path.exists(f"data/{data}"):
#     os.mkdir(f"data/{data}")

# bin_df_path = f"data/{data}/bin_df.csv"
# X_path = f"data/{data}/X.csv"
# y_path = f"data/{data}/y.csv"

# # Save dataframes or arrays
# bin_df.to_csv(bin_df_path, index=False)
# X.to_csv(X_path, index=False)
# np.savetxt(y_path, y, delimiter=',')

In [5]:
# get column types in X
col_types = X.dtypes
col_types

age                int64
Jitter(%)        float64
Jitter(Abs)      float64
Jitter:RAP       float64
Jitter:PPQ5      float64
Jitter:DDP       float64
Shimmer          float64
Shimmer(dB)      float64
Shimmer:APQ3     float64
Shimmer:APQ5     float64
Shimmer:APQ11    float64
Shimmer:DDA      float64
NHR              float64
HNR              float64
RPDE             float64
DFA              float64
PPE              float64
sex_1              int64
dtype: object

In [10]:
# read in the data
from os.path import join as oj
dir_data = "data/data_parkinsons"
X = pd.read_csv(oj(dir_data, "X.csv"))
# X = np.loadtxt(oj(dir_data, "X.csv"), delimiter=",")
# y = np.loadtxt(oj(dir_data, "y.csv"), delimiter=",")
X.to_numpy()

array([[7.2000e+01, 6.6200e-03, 3.3800e-05, ..., 5.4842e-01, 1.6006e-01,
        0.0000e+00],
       [7.2000e+01, 3.0000e-03, 1.6800e-05, ..., 5.6477e-01, 1.0810e-01,
        0.0000e+00],
       [7.2000e+01, 4.8100e-03, 2.4600e-05, ..., 5.4405e-01, 2.1014e-01,
        0.0000e+00],
       ...,
       [6.1000e+01, 3.4900e-03, 2.4700e-05, ..., 5.7888e-01, 1.4157e-01,
        0.0000e+00],
       [6.1000e+01, 2.8100e-03, 2.0300e-05, ..., 5.6327e-01, 1.4204e-01,
        0.0000e+00],
       [6.1000e+01, 2.8200e-03, 2.1100e-05, ..., 5.7077e-01, 1.5336e-01,
        0.0000e+00]])

In [13]:
y = pd.read_csv(oj(dir_data, "y.csv")).to_numpy().flatten()
y

array([34.894, 35.389, 35.81 , ..., 32.495, 32.007, 31.513])

In [17]:
y = pd.read_csv(oj(dir_data, "y.csv"), header=None)
y

Unnamed: 0,0
0,34.398
1,34.894
2,35.389
3,35.810
4,36.375
...,...
5870,33.485
5871,32.988
5872,32.495
5873,32.007


In [14]:
X.shape

(5875, 18)

In [15]:
y.shape

(5874,)

# Airfoil

[Airfoil Self-Noise - UCI](https://archive.ics.uci.edu/dataset/291/airfoil+self+noise)

#### Dataset Information:

NASA data set, obtained from a series of aerodynamic and acoustic tests of two and three-dimensional airfoil blade sections conducted in an anechoic wind tunnel.


In [66]:
data = "data_airfoil"
# fetch dataset
airfoil_self_noise = fetch_ucirepo(id=291)

# data (as pandas dataframes)
X_orig = airfoil_self_noise.data.features
y = airfoil_self_noise.data.targets["scaled-sound-pressure"].to_numpy()

# save categorical and numerical features
cat_feat_airfoil = []
num_feat_airfoil = X_orig.drop(columns=cat_feat_airfoil).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y, cat_feat_airfoil, num_feat_airfoil)
bin_df = subgroups["binned_df"]

if not os.path.exists(f"../data/{data}"):
    os.mkdir(f"../data/{data}")

bin_df_path = f"../data/{data}/bin_df.csv"
X_path = f"../data/{data}/X.csv"
y_path = f"../data/{data}/y.csv"

# Save dataframes or arrays
bin_df.to_csv(bin_df_path, index=False)
X.to_csv(X_path, index=False)
np.savetxt(y_path, y, delimiter=',')

# California Housing

[California Housing Prices](https://www.kaggle.com/datasets/camnugent/california-housing-prices/data)

#### Dataset Information:

This is the dataset used in the second chapter of Aurélien Géron's recent book 'Hands-On Machine learning with Scikit-Learn and TensorFlow'. It serves as an excellent introduction to implementing machine learning algorithms because it requires rudimentary data cleaning, has an easily understandable list of variables and sits at an optimal size between being to toyish and too cumbersome.

The data contains information from the 1990 California census. So although it may not help you with predicting current housing prices like the Zillow Zestimate dataset, it does provide an accessible introductory dataset for teaching people about the basics of machine learning.


In [21]:
data = "data_cal_housing"

# fetch data
housing = pd.read_csv("../data/data_cal_housing/cal_housing.data", delimiter=",", names = ["lon", "lat", "med_age", "total_rooms", "total_beds", "population", "households", "med_income", "med_price"])

# data
X_orig = housing.drop(columns="med_price")
y = housing.med_price.to_numpy()

X_orig, y = resample(X_orig, y, replace=False, n_samples=5000, random_state=777)
X_orig = X_orig.reset_index(drop=True)

# save categorical and numerical features
cat_feat_ca_housing = []
num_feat_ca_housing = X_orig.drop(columns = cat_feat_ca_housing).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y, cat_feat_ca_housing, num_feat_ca_housing)
bin_df = subgroups["binned_df"]

if not os.path.exists(f"../data/{data}"):
    os.mkdir(f"../data/{data}")

bin_df_path = f"../data/{data}/bin_df.csv"
X_path = f"../data/{data}/X.csv"
y_path = f"../data/{data}/y.csv"

# Save dataframes or arrays
bin_df.to_csv(bin_df_path, index=False)
X.to_csv(X_path, index=False)
np.savetxt(y_path, y, delimiter=',')


In [15]:
# get california housing from imodels
from imodels import get_clean_dataset
X, y, colnames = get_clean_dataset("california_housing", data_source="sklearn")

fetching california_housing from sklearn


In [16]:
X = pd.DataFrame(X, columns=colnames)

In [19]:
# write to csv
X.to_csv("data/data_california/X.csv", index=False)
np.savetxt("data/data_california/y.csv", y, delimiter=',')

# Computer

[cpu_act - OpenML](https://www.openml.org/search?type=data&status=active&id=197&sort=runs)

#### Dataset Information:

The Computer Activity databases are a collection of computer systems activity measures. The data was collected from a Sun Sparcstation 20/712 with 128 Mbytes of memory running in a multi-user university department. Users would typically be doing a large variety of tasks ranging from accessing the internet, editing files or running very cpu-bound programs. The data was collected continuously on two separate occasions. On both occassions, system activity was gathered every 5 seconds. The final dataset is taken from both occasions with equal numbers of observations coming from each collection epoch.


In [16]:
data = "data_computer"

# fetch dataset
computer = openml.datasets.get_dataset(197)

# data
X_orig, y, cat_ind, col_names = computer.get_data(
    target=computer.default_target_attribute, dataset_format="dataframe"
)
y = y.to_numpy()

# save categorical and numerical features
cat_feat_computer = []
num_feat_computer = X_orig.drop(columns=cat_feat_computer).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y, cat_feat_computer, num_feat_computer)
bin_df = subgroups["binned_df"]

if not os.path.exists(f"../data/{data}"):
    os.mkdir(f"../data/{data}")

bin_df_path = f"../data/{data}/bin_df.csv"
X_path = f"../data/{data}/X.csv"
y_path = f"../data/{data}/y.csv"

# Save dataframes or arrays
bin_df.to_csv(bin_df_path, index=False)
X.to_csv(X_path, index=False)
np.savetxt(y_path, y, delimiter=',')

# Concrete

[Concrete Compressive Strength - UCI](https://archive.ics.uci.edu/dataset/165/concrete+compressive+strength)

#### Dataset Information:

Concrete is the most important material in civil engineering. The concrete compressive strength is a highly nonlinear function of age and ingredients.


In [68]:
data = "data_concrete"

# fetch dataset
concrete_compressive_strength = fetch_ucirepo(id=165)

# data (as pandas dataframes)
X_orig = concrete_compressive_strength.data.features
y = concrete_compressive_strength.data.targets[
    "Concrete compressive strength"
].to_numpy()

# save categorical and numerical features
cat_feat_concrete = []
num_feat_concrete = X_orig.drop(columns=cat_feat_concrete).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y, cat_feat_concrete, num_feat_concrete)
bin_df = subgroups["binned_df"]

if not os.path.exists(f"../data/{data}"):
    os.mkdir(f"../data/{data}")

bin_df_path = f"../data/{data}/bin_df.csv"
X_path = f"../data/{data}/X.csv"
y_path = f"../data/{data}/y.csv"

# Save dataframes or arrays
bin_df.to_csv(bin_df_path, index=False)
X.to_csv(X_path, index=False)
np.savetxt(y_path, y, delimiter=',')

# Powerplant

[Combined Cycle Power Plant - UCI](https://archive.ics.uci.edu/dataset/294/combined+cycle+power+plant)

#### Dataset Information:

The dataset contains 9568 data points collected from a Combined Cycle Power Plant over 6 years (2006-2011), when the plant was set to work with full load.


In [69]:
data = "data_powerplant"

# fetch dataset
combined_cycle_power_plant = fetch_ucirepo(id=294)

# data (as pandas dataframes)
X_orig = combined_cycle_power_plant.data.features
y = combined_cycle_power_plant.data.targets.PE.to_numpy()

# save categorical and numerical features
cat_feat_powerplant = []
num_feat_powerplant = X_orig.drop(columns=cat_feat_powerplant).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y, cat_feat_powerplant, num_feat_powerplant)
bin_df = subgroups["binned_df"]

if not os.path.exists(f"../data/{data}"):
    os.mkdir(f"../data/{data}")

bin_df_path = f"../data/{data}/bin_df.csv"
X_path = f"../data/{data}/X.csv"
y_path = f"../data/{data}/y.csv"

# Save dataframes or arrays
bin_df.to_csv(bin_df_path, index=False)
X.to_csv(X_path, index=False)
np.savetxt(y_path, y, delimiter=',')

# Miami Housing

#### Dataset Information:

The dataset contains information on 13,932 single-family homes sold in Miami .


In [5]:
data = "data_miami_housing"

# fetch dataset
miami_housing = openml.datasets.get_dataset(43093)

# data
X_orig, y, cat_ind, col_names = miami_housing.get_data(
    target=miami_housing.default_target_attribute, dataset_format="dataframe"
)
X_orig, y = resample(X_orig, y, replace=False, n_samples=5000, random_state=777)
X_orig = X_orig.drop(columns="PARCELNO").reset_index(drop=True)
y = y.to_numpy()

# log-transform target
y = np.log(y)

# save categorical and numerical features
cat_feat_miami_housing = ["avno60plus", "month_sold", "structure_quality"]
num_feat_miami_housing = X_orig.drop(columns=cat_feat_miami_housing).columns.to_numpy()

X, y, subgroups = process_data(
    X_orig, y, cat_feat_miami_housing, num_feat_miami_housing
)
bin_df = subgroups["binned_df"]

# if not os.path.exists(f"../data/{data}"):
#     os.mkdir(f"../data/{data}")

# bin_df_path = f"../data/{data}/bin_df.csv"
# X_path = f"../data/{data}/X.csv"
# y_path = f"../data/{data}/y.csv"

# # Save dataframes or arrays
# bin_df.to_csv(bin_df_path, index=False)
# X.to_csv(X_path, index=False)
# np.savetxt(y_path, y, delimiter=',')


  miami_housing = openml.datasets.get_dataset(43093)


In [None]:
# # write to csv
# X.to_csv("data/data_miami/X.csv", index=False)
# np.savetxt("data/data_miami/y.csv", y, delimiter=",")

# Insurance

[Insurance - pycaret](https://github.com/pycaret/pycaret/blob/master/datasets/insurance.csv)


In [8]:
data = "data_insurance"

# fetch data
insurance = pd.read_csv(
    "https://raw.githubusercontent.com/pycaret/datasets/main/data/common/insurance.csv"
)

# data
X_orig = insurance.drop(columns="charges")
y = insurance.charges.to_numpy()

# save categorical and numerical features
cat_feat_insurance = ["sex", "smoker", "region"]
num_feat_insurance = X_orig.drop(columns=cat_feat_insurance).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y, cat_feat_insurance, num_feat_insurance)
bin_df = subgroups["binned_df"]

if not os.path.exists(f"../data/{data}"):
    os.mkdir(f"../data/{data}")

bin_df_path = f"../data/{data}/bin_df.csv"
X_path = f"../data/{data}/X.csv"
y_path = f"../data/{data}/y.csv"

# Save dataframes or arrays
bin_df.to_csv(bin_df_path, index=False)
X.to_csv(X_path, index=False)
np.savetxt(y_path, y, delimiter=',')

# QSAR

[QSAR - openML](https://www.openml.org/search?type=data&sort=runs&id=33368)

#### Dataset Information

This dataset contains QSAR data (from ChEMBL version 17) showing activity values (unit is pseudo-pCI50) of several compounds on drug target ChEMBL_ID: CHEMBL205 (TID: 15), and it has 3666 rows and 72 features (not including molecule IDs and class feature: molecule_id and pXC50). The features represent Molecular Descriptors which were generated from SMILES strings. Missing value imputation was applied to this dataset (By choosing the Median). Feature selection was also applied.


In [73]:
data = "data_qsar"

# fetch data
qsar = openml.tasks.get_task(360932)

# data
X_orig, y = qsar.get_X_and_y(dataset_format="dataframe")
y = y.to_numpy()

# select features with variance
k = 500
X_array = X_orig.to_numpy()
top_k_columns = X_orig.columns[np.argsort(np.var(X_array, axis=0))[-k:]]
X_orig = X_orig[top_k_columns]

X_orig, y = resample(X_orig, y, replace=False, n_samples=5000, random_state=777)

# save categorical and numerical features
cat_feat_qsar = X_orig.columns.tolist()
num_feat_qsar = X_orig.drop(columns=cat_feat_qsar).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y, cat_feat_qsar, num_feat_qsar)
bin_df = subgroups["binned_df"]

if not os.path.exists(f"../data/{data}"):
    os.mkdir(f"../data/{data}")

bin_df_path = f"../data/{data}/bin_df.csv"
X_path = f"../data/{data}/X.csv"
y_path = f"../data/{data}/y.csv"

# Save dataframes or arrays
bin_df.to_csv(bin_df_path, index=False)
X.to_csv(X_path, index=False)
np.savetxt(y_path, y, delimiter=',')

# Allstate

[Allstate Insurance - openML](https://api.openml.org/d/45064)


In [74]:
data = "data_allstate"

# fetch dataset
allstate = openml.datasets.get_dataset(42571)

# data
X_orig, y, cat_ind, col_names = allstate.get_data(
    target=allstate.default_target_attribute, dataset_format="dataframe"
)
y = y.to_numpy()

X_orig, y = resample(X_orig, y, replace=False, n_samples=5000, random_state=777)

# save categorical and numerical features
cat_feat_allstate = np.array(col_names)[cat_ind].tolist()
num_feat_allstate = X_orig.drop(columns=cat_feat_allstate).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y, cat_feat_allstate, num_feat_allstate)
bin_df = subgroups["binned_df"]

if not os.path.exists(f"../data/{data}"):
    os.mkdir(f"../data/{data}")

bin_df_path = f"../data/{data}/bin_df.csv"
X_path = f"../data/{data}/X.csv"
y_path = f"../data/{data}/y.csv"

# Save dataframes or arrays
bin_df.to_csv(bin_df_path, index=False)
X.to_csv(X_path, index=False)
np.savetxt(y_path, y, delimiter=',')


# Mercedes

[Mercedes_Benz_Greener_Manufacturing - OpenML](https://www.openml.org/search?type=data&sort=version&status=any&order=asc&exact_name=Mercedes_Benz_Greener_Manufacturing)

#### Dataset Information

Datasets provide training data for machine learning models. OpenML datasets are uniformly formatted and come with rich meta-data to allow automated processing. You can sort or filter them by a range of different properties.


In [75]:
data = "data_mercedes"

# fetch dataset
mercedes = openml.datasets.get_dataset(42570)

# data
X_orig, y, cat_ind, col_names = mercedes.get_data(
    target=mercedes.default_target_attribute, dataset_format="dataframe"
)
y = y.to_numpy()

# save categorical and numerical features
cat_feat_mercedes = X_orig.columns.tolist()
num_feat_mercedes = X_orig.drop(columns=cat_feat_mercedes).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y, cat_feat_mercedes, num_feat_mercedes)
bin_df = subgroups["binned_df"]

if not os.path.exists(f"../data/{data}"):
    os.mkdir(f"../data/{data}")

bin_df_path = f"../data/{data}/bin_df.csv"
X_path = f"../data/{data}/X.csv"
y_path = f"../data/{data}/y.csv"

# Save dataframes or arrays
bin_df.to_csv(bin_df_path, index=False)
X.to_csv(X_path, index=False)
np.savetxt(y_path, y, delimiter=',')

# Transaction


In [76]:
data = "data_transaction"

# fetch dataset
transaction = openml.datasets.get_dataset(42572)

# data
X_orig, y, cat_ind, col_names = transaction.get_data(
    target=transaction.default_target_attribute, dataset_format="dataframe"
)

# select features with variance
k = 500
X_variance = X_orig.var()
top_k_columns = X_variance.sort_values(ascending=False).head(k).index
X_orig = X_orig[top_k_columns]

y = y.to_numpy()

# save categorical and numerical features
cat_feat_transaction = []
num_feat_transaction = X_orig.drop(columns=cat_feat_transaction).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y, cat_feat_transaction, num_feat_transaction)
bin_df = subgroups["binned_df"]

if not os.path.exists(f"../data/{data}"):
    os.mkdir(f"../data/{data}")

bin_df_path = f"../data/{data}/bin_df.csv"
X_path = f"../data/{data}/X.csv"
y_path = f"../data/{data}/y.csv"

# Save dataframes or arrays
bin_df.to_csv(bin_df_path, index=False)
X.to_csv(X_path, index=False)
np.savetxt(y_path, y, delimiter=',')

# Energy Efficiency

[Energy Efficiency - UCI](https://archive.ics.uci.edu/dataset/242/energy+efficiency)

#### Dataset Information:

This study looked into assessing the heating load and cooling load requirements of buildings (that is, energy efficiency) as a function of building parameters.


In [6]:
data = "data_energy_efficiency"
energy_efficiency = pd.read_csv("../data/data_energy_efficiency/energy_efficiency.csv")

energy_efficiency.columns = [
    'Relative_Compactness',
    'Surface_Area',
    'Wall_Area',
    'Roof_Area',
    'Overall_Height',
    'Orientation',
    'Glazing_Area',
    'Glazing_Area_Distribution',
    'Heating_Load',
    'Cooling_Load'
]

X_orig = energy_efficiency.drop(columns=['Heating_Load', 'Cooling_Load'])
y_1 = energy_efficiency[['Heating_Load']].values.ravel()
y_2 = energy_efficiency[['Cooling_Load']].values.ravel()
# save categorical and numerical features
cat_feat = ['Orientation']
num_feat = X_orig.drop(columns=cat_feat).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y_1, cat_feat, num_feat)
bin_df = subgroups["binned_df"]

if not os.path.exists(f"../data/{data}"):
    os.mkdir(f"../data/{data}")

bin_df_path = f"../data/{data}/bin_df.csv"
X_path = f"../data/{data}/X.csv"
y_path = f"../data/{data}/y.csv"

# Save dataframes or arrays
bin_df.to_csv(bin_df_path, index=False)
X.to_csv(X_path, index=False)
np.savetxt(y_path, y, delimiter=',')

# kin8nm

[kin8nm - OpenML](https://www.openml.org/search?type=data&sort=runs&id=189&status=active)

#### Dataset Information:

This is data set is concerned with the forward kinematics of an 8 link robot arm. Among the existing variants of this data set we have used the variant 8nm, which is known to be highly non-linear and medium noisy.

Original source: DELVE repository of data. Source: collection of regression datasets by Luis Torgo (ltorgo@ncc.up.pt) at http://www.ncc.up.pt/~ltorgo/Regression/DataSets.html Characteristics: 8192 cases, 9 attributes (0 nominal, 9 continuous).


In [7]:
data = "data_kin8nm"
kin8nm = openml.datasets.get_dataset(189)
X_orig, y, _, _ = kin8nm.get_data(target=kin8nm.default_target_attribute)

cat_feat = []
num_feat = X_orig.drop(columns=cat_feat).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y, cat_feat, num_feat)
bin_df = subgroups["binned_df"]

if not os.path.exists(f"../data/{data}"):
    os.mkdir(f"../data/{data}")

bin_df_path = f"../data/{data}/bin_df.csv"
X_path = f"../data/{data}/X.csv"
y_path = f"../data/{data}/y.csv"

# Save dataframes or arrays
bin_df.to_csv(bin_df_path, index=False)
X.to_csv(X_path, index=False)
np.savetxt(y_path, y, delimiter=',')

# Protein Structure

[Protein Structure - UCI](https://archive.ics.uci.edu/dataset/265/physicochemical+properties+of+protein+tertiary+structure)

#### Dataset Information:

This is a data set of Physicochemical Properties of Protein Tertiary Structure. The data set is taken from CASP 5-9. There are 45730 decoys and size varying from 0 to 21 armstrong.


In [8]:
data = "data_protein_structure"
protein_structure = pd.read_csv("../data/data_protein_structure/protein_structure.csv")

column_names = ['RMSD', 'F1_Total_surface_area', 'F2_Non_polar_exposed_area', 
                    'F3_Fractional_area_of_exposed_non_polar_residue', 
                    'F4_Fractional_area_of_exposed_non_polar_part_of_residue',
                    'F5_Molecular_mass_weighted_exposed_area', 
                    'F6_Average_deviation_from_standard_exposed_area_of_residue',
                    'F7_Euclidean_distance', 'F8_Secondary_structure_penalty', 
                    'F9_Spatial_Distribution_constraints']

protein_structure.columns = column_names

X_orig = protein_structure.drop(columns=['RMSD'])
y = protein_structure[['RMSD']].values.ravel()

cat_feat = []
num_feat = X_orig.drop(columns=cat_feat).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y, cat_feat, num_feat)
bin_df = subgroups["binned_df"]
new_bin_df = bin_df.copy(deep=True)
new_bin_df["DFA"] = (X.F6_Average_deviation_from_standard_exposed_area_of_residue <= 181.87).astype(int)
subgroups["new_binned_df"] = new_bin_df

bin_df_path = f"../data/{data}/bin_df.csv"
X_path = f"../data/{data}/X.csv"
y_path = f"../data/{data}/y.csv"

# Save dataframes or arrays
bin_df.to_csv(bin_df_path, index=False)
X.to_csv(X_path, index=False)
np.savetxt(y_path, y, delimiter=',')

# Naval Propulsion

[Naval Propulsion - Kaggle](https://archive.ics.uci.edu/dataset/316/condition+based+maintenance+of+naval+propulsion+plants)

#### Dataset Information:

Data have been generated from a sophisticated simulator of a Gas Turbines (GT), mounted on a Frigate characterized by a COmbined Diesel eLectric And Gas (CODLAG) propulsion plant type.


In [9]:
data = 'data_naval_propulsion'

column_names = ['Lever_position', 'Ship_speed', 'Gas_Turbine_shaft_torque', 
                'Gas_Turbine_rate_of_revolutions', 'Gas_Generator_rate_of_revolutions',
                'Starboard_Propeller_Torque', 'Port_Propeller_Torque', 
                'HP_Turbine_exit_temperature', 'GT_Compressor_inlet_air_temperature', 
                'GT_Compressor_outlet_air_temperature', 'HP_Turbine_exit_pressure',
                'GT_Compressor_inlet_air_pressure', 'GT_Compressor_outlet_air_pressure', 
                'Gas_Turbine_exhaust_gas_pressure', 'Turbine_Injection_Control',
                'Fuel_flow', 'GT_Compressor_decay_state_coefficient', 'GT_Turbine_decay_state_coefficient']

naval_propulsion = pd.read_csv("../data/data_naval_propulsion/naval_propulsion.txt", delim_whitespace=True, header=None, names=column_names)

X_orig = naval_propulsion.drop(columns=['Fuel_flow'])
y = naval_propulsion[['Fuel_flow']].values.ravel()
# save categorical and numerical features
cat_feat = ['Ship_speed']
num_feat = X_orig.drop(columns=cat_feat).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y, cat_feat, num_feat)
bin_df = subgroups["binned_df"]

if not os.path.exists(f"../data/{data}"):
    os.mkdir(f"../data/{data}")

bin_df_path = f"../data/{data}/bin_df.csv"
X_path = f"../data/{data}/X.csv"
y_path = f"../data/{data}/y.csv"

# Save dataframes or arrays
bin_df.to_csv(bin_df_path, index=False)
X.to_csv(X_path, index=False)
np.savetxt(y_path, y, delimiter=',')


# Superconductor

[superconduct - OpenML](https://openml.org/search?type=data&sort=version&status=any&order=asc&exact_name=superconduct&id=44148)

#### Dataset Information:

Dataset used in the tabular data benchmark https://github.com/LeoGrin/tabular-benchmark, transformed in the same way. This dataset belongs to the "regression on numerical features" benchmark. Original description:

The data contains information on 21263 superconductors. The first 81 columns contain extracted features and the 82nd column contains the critical temperature which is used as the target variable. The original data from which the features were extracted comes from http://supercon.nims.go.jp/index_en.html, which is public.


In [17]:
data = "data_superconductor"

# fetch dataset
superconductor = openml.datasets.get_dataset(44148)

data
X_orig, y, cat_ind, col_names = superconductor.get_data(target=superconductor.default_target_attribute, dataset_format="dataframe")
y = y.to_numpy()

# save categorical and numerical features
cat_feat_superconductor = []
num_feat_superconductor = X_orig.drop(columns = cat_feat_superconductor).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y, cat_feat_superconductor, num_feat_superconductor)
bin_df = subgroups["binned_df"]

if not os.path.exists(f"../data/{data}"):
    os.mkdir(f"../data/{data}")

bin_df_path = f"../data/{data}/bin_df.csv"
X_path = f"../data/{data}/X.csv"
y_path = f"../data/{data}/y.csv"

# Save dataframes or arrays
bin_df.to_csv(bin_df_path, index=False)
X.to_csv(X_path, index=False)
np.savetxt(y_path, y, delimiter=',')



# Diamond

[Diamond - OpenML](https://openml.org/search?type=data&sort=version&status=any&order=asc&exact_name=diamonds&id=42225)

#### Dataset Information:

This classic dataset contains the prices and other attributes of almost 54,000 diamonds. It's a great dataset for beginners learning to work with data analysis and visualization.


In [20]:
data = "data_diamond"

# fetch dataset
diamond = openml.datasets.get_dataset(42225)

# data
X_orig, y, cat_ind, col_names = diamond.get_data(target=diamond.default_target_attribute, dataset_format="dataframe")
y = y.to_numpy()

# subsample data
X_orig, y = resample(X_orig, y, replace=False, n_samples=7000, random_state=777)
X_orig = X_orig.reset_index(drop=True)

# save categorical and numerical features
cat_feat_diamond = np.array(col_names)[cat_ind].tolist()
num_feat_diamond = X_orig.drop(columns = cat_feat_diamond).columns.to_numpy()

X, y, subgroups = process_data(X_orig, y, cat_feat_diamond, num_feat_diamond)
bin_df = subgroups["binned_df"]

# if not os.path.exists(f"../data/{data}"):
#     os.mkdir(f"../data/{data}")

# bin_df_path = f"../data/{data}/bin_df.csv"
# X_path = f"../data/{data}/X.csv"
# y_path = f"../data/{data}/y.csv"

# # Save dataframes or arrays
# bin_df.to_csv(bin_df_path, index=False)
# X.to_csv(X_path, index=False)
# np.savetxt(y_path, y, delimiter=',')

  diamond = openml.datasets.get_dataset(42225)


In [21]:
# write to csv
X.to_csv("data/data_diamond/X.csv", index=False)
np.savetxt("data/data_diamond/y.csv", y, delimiter=',')