In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torchvision import datasets, transforms
import numpy as np
# Use the first CUDA GPU when torch reports one as available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
class TorchDataset(torch.utils.data.Dataset):

        def __init__(self, *data, **options):
            
            n_data = len(data)
            if n_data == 0:
                raise ValueError("At least one set required as input")

            self.data = data
            means = options.pop('means', None)
            stds = options.pop('stds', None)
            self.transform = options.pop('transform', None)
            self.test = options.pop('test', False)
            
            if options:
                raise TypeError("Invalid parameters passed: %s" % str(options))
            
            if means is not None:
                assert stds is not None, "must specify both <means> and <stds>"

                self.normalize = lambda data: [(d - m) / s for d, m, s in zip(data, means, stds)]

            else:
                self.normalize = lambda data: data

        def __len__(self):
            return len(self.data[0])

        def __getitem__(self, idx):
            data = self.normalize([s[idx] for s in self.data])
            if self.transform:

                if self.test:
                    data = sum([[self.transform.test_transform(d)] * 2 for d in data], [])
                else:
                    data = sum([self.transform(d) for d in data], [])
                
            return data

In [None]:
#@title Synthetic data
def set_npseed(seed):
    """Seed NumPy's global random number generator with ``seed``."""
    np.random.seed(seed=seed)


def set_torchseed(seed):
    """Seed torch's CPU and CUDA generators and switch cuDNN to deterministic mode."""
    # Same call order as before: CPU generator, current CUDA device, all CUDA devices.
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


#classification data

def data_gen_decision_tree(num_data=1000, dim=2, seed=0, w_list=None, b_list=None,vals=None, num_levels=2, threshold=None):
    """Generate points on the unit sphere labelled by a complete oblique decision tree.

    The tree has ``2**num_levels - 1`` internal nodes and ``2**num_levels``
    leaves, numbered in heap order (root is 0, children of node ``k`` are
    ``2k+1`` on the left and ``2k+2`` on the right). A point ``x`` at internal
    node ``k`` goes right when ``w_list[k] @ x + b_list[k] > 0`` and left
    otherwise.

    Args:
        num_data: number of points drawn before pruning.
        dim: dimensionality of each point.
        seed: seed passed to ``np.random.seed``.
        w_list: ``(num_internal_nodes, dim)`` hyperplane normals. When None,
            random unit-norm normals are drawn and ``b_list`` is replaced by
            zeros (any ``b_list`` that was passed is ignored in that case).
        b_list: ``(num_internal_nodes,)`` hyperplane offsets; zeros when None.
        vals: one value per node (internal nodes first, then leaves); only the
            leaf entries are used as labels. When None, nodes get the parity of
            their index and internal nodes are marked with -99.
        num_levels: depth of the tree; must be at least 1.
        threshold: points whose smallest ``|w @ x + b|`` over all internal
            nodes is not strictly greater than this value are dropped. When
            None, a module-level ``threshold`` variable is used if one exists
            (the function previously required that global), otherwise 0.0.

    Returns:
        ``((data_x_pruned, labels_pruned), (w_list, b_list, vals), stats)``
        where ``stats[k]`` is the number of kept points that pass through node
        ``k``.

    Raises:
        ValueError: ``num_levels`` is smaller than 1.
    """
    if num_levels < 1:
        raise ValueError("num_levels must be at least 1")

    np.random.seed(seed)

    num_internal_nodes = 2**num_levels - 1
    num_leaf_nodes = 2**num_levels
    num_nodes = num_internal_nodes + num_leaf_nodes

    if threshold is None:
        # Backward compatibility: earlier versions read a notebook-level global.
        threshold = globals().get('threshold', 0.0)

    if vals is None:
        # Label each node by the parity of its index; internal nodes carry no label.
        vals = np.arange(0, num_nodes, 1, dtype=np.int32) % 2
        vals[:num_internal_nodes] = -99

    if w_list is None:
        w_list = np.random.standard_normal((num_internal_nodes, dim))
        w_list = w_list / np.linalg.norm(w_list, axis=1)[:, None]  # unit-norm normals
        b_list = np.zeros((num_internal_nodes))
    else:
        w_list = np.asarray(w_list)
        # A caller may supply normals only; default the offsets to zero.
        b_list = np.zeros(num_internal_nodes) if b_list is None else np.asarray(b_list)

    # Points are drawn from an isotropic Gaussian and projected onto the unit sphere.
    data_x = np.random.standard_normal((num_data, dim))
    data_x /= np.sqrt(np.sum(data_x**2, axis=1, keepdims=True))

    # relevant_stats[i, k] = w_k @ x_i + b_k. The bias is applied exactly once,
    # so routing, pruning and the returned (w_list, b_list) all agree.
    relevant_stats = data_x @ w_list.T + b_list
    curr_index = np.zeros(shape=(num_data), dtype=int)
    row_ids = np.arange(num_data)

    for _ in range(num_levels):
        # Integer indexing instead of np.choose, which caps the number of choices.
        decision_variable = relevant_stats[row_ids, curr_index]
        # Heap-order step: right child (2k+2) if w@x+b > 0, else left child (2k+1).
        curr_index = (curr_index + 1) * 2 - (1 - (decision_variable > 0))

    # Distance-like margin to the closest hyperplane of any internal node.
    bound_dist = np.min(np.abs(relevant_stats), axis=1)
    labels = np.asarray(vals)[curr_index]

    keep = bound_dist > threshold
    data_x_pruned = data_x[keep]
    labels_pruned = labels[keep]

    # +1 / -1 side of every internal hyperplane for the kept points.
    node_side = np.sign(data_x_pruned @ w_list.T + b_list)
    nodes_active = np.zeros((len(data_x_pruned), num_nodes), dtype=np.int32)
    nodes_active[:, 0] = 1  # every kept point passes through the root

    for node in range(1, num_nodes):
        parent = (node - 1) // 2
        is_right_child = (node - parent * 2 - 1) == 1
        if is_right_child:
            took_branch = node_side[:, parent] > 0
        else:
            took_branch = node_side[:, parent] < 0
        # A node is active only if its parent is active and the branch was taken.
        nodes_active[:, node] = nodes_active[:, parent] * took_branch

    stats = nodes_active.sum(axis=0)
    return ((data_x_pruned, labels_pruned), (w_list, b_list, vals), stats)

class Dataset_syn:
    """Container bundling the synthetic train/validation/test splits.

    For ``dataset == "syn"`` the splits are read from the module-level names
    ``train_data``, ``train_data_labels``, ``vali_data``, ``vali_data_labels``,
    ``test_data`` and ``test_data_labels``, which must already exist when the
    object is created. For any other ``dataset`` value no split attributes are
    set; only ``data_path`` and ``dataset`` are stored.
    """

    def __init__(self, dataset, data_path='./DATA'):
        if dataset == "syn":
            # These names are looked up as globals at call time.
            self.X_train, self.y_train = train_data, train_data_labels
            self.X_valid, self.y_valid = vali_data, vali_data_labels
            self.X_test, self.y_test = test_data, test_data_labels
        self.data_path, self.dataset = data_path, dataset

In [None]:
'''
# Use the data in DLGN and its variations from https://arxiv.org/abs/2010.04627
'''
#Imports for Latent Tree data

import random
import requests
import os
import torch
from tqdm import tqdm
import numpy as np
import gzip
import shutil
import tarfile
import bz2
import pandas as pd
import gzip
import shutil
import warnings

from pathlib import Path
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_swiss_roll
from sklearn.preprocessing import QuantileTransformer

from category_encoders import LeaveOneOutEncoder
from category_encoders.ordinal import OrdinalEncoder
import os
import zipfile
import shutil
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import NearestCentroid
from scipy.io import arff
# !pip install numpy==1.22.0  # Install a compatible version of NumPy
# !pip install scipy  # Install or update SciPy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
import copy
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances
from mpl_toolkits.axes_grid1 import make_axes_locatable
from itertools import product as cartesian_prod
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import pairwise_distances

from sklearn import tree
from sklearn import cluster, mixture


# Print NumPy floating-point values with a precision of 4.
np.set_printoptions(precision=4)


#@title Importing Packages
import os
import random
import pandas as pd

import torchvision
import torchvision.transforms as transforms

import time
import sys

# Re-select the compute device with the same expression used in the first cell.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)



def preprocess_data_adult(data_path):
    """Load the Adult CSV at ``data_path`` and return a fully numeric DataFrame.

    Rows containing the missing-value marker ``" ?"`` are dropped, the
    categorical columns are label-encoded, and ``income`` becomes 1 for
    ``" >50K"`` and 0 for anything else.
    """
    column_names = [
        "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
        "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
        "hours-per-week", "native-country", "income"
    ]
    categorical_columns = [
        "workclass", "education", "marital-status", "occupation",
        "relationship", "race", "sex", "native-country"
    ]

    # The file has no header row, so the column names are supplied here.
    df = pd.read_csv(data_path, names=column_names, na_values=[" ?"])
    df = df.dropna()

    # Each categorical column gets its own freshly fitted encoder.
    for name in categorical_columns:
        df[name] = LabelEncoder().fit_transform(df[name])

    df["income"] = df["income"].map(lambda label: 1 if label == " >50K" else 0)
    return df

def preprocess_data_bank_marketing(data):
    """Label-encode every object-dtype column of ``data`` and return ``data``.

    The DataFrame is modified in place; the returned object is the same
    DataFrame that was passed in.
    """
    object_columns = data.select_dtypes(include=['object']).columns
    for name in object_columns:
        # Each column gets its own freshly fitted encoder.
        data[name] = LabelEncoder().fit_transform(data[name])
    return data

def preprocess_data_credit_card_defaults(data):
    """One-hot encode the categorical columns and standardize the numeric ones.

    ``SEX``, ``EDUCATION`` and ``MARRIAGE`` are expanded with
    ``pd.get_dummies(drop_first=True)``; the listed numeric columns are then
    replaced by their ``StandardScaler`` transform. A new DataFrame is
    returned.
    """
    numeric_columns = [
        "LIMIT_BAL", "AGE",
        "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6",
        "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6",
        "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6",
    ]

    encoded = pd.get_dummies(data, columns=["SEX", "EDUCATION", "MARRIAGE"], drop_first=True)
    encoded[numeric_columns] = StandardScaler().fit_transform(encoded[numeric_columns])
    return encoded



def fetch_ADULT(data_dir="./ADULT_DATA"):
    """Download the Adult archive and return standardized train/valid/test splits.

    The archive is fetched into ``data_dir``, ``adult.data`` is preprocessed
    with ``preprocess_data_adult``, features are standardized, and the rows
    are split 60/20/20. ``data_dir`` is deleted before returning.

    Note: the scaler is fitted on all rows before the split is made.

    Returns:
        dict with float32 ``X_train``/``X_valid``/``X_test`` arrays and int64
        0/1 ``y_train``/``y_valid``/``y_test`` arrays.
    """
    print("---------------------ADULT--------------------------------------")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    archive_path = os.path.join(data_dir, "adult.zip")
    urllib.request.urlretrieve("https://archive.ics.uci.edu/static/public/2/adult.zip", archive_path)
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(data_dir)

    # Only the training file of the archive is used; all three splits come from it.
    frame = preprocess_data_adult(os.path.join(data_dir, "adult.data"))

    features = StandardScaler().fit_transform(frame.drop("income", axis=1))
    target = frame["income"]

    X_train, X_rest, y_train, y_rest = train_test_split(features, target, test_size=0.4, random_state=42)
    # Here the first half of the remainder is the test set, the second the validation set.
    X_test, X_val, y_test, y_val = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42)

    def _to_binary(series):
        return (series.values.reshape(-1) == 1).astype('int64')

    y_train, y_test, y_val = _to_binary(y_train), _to_binary(y_test), _to_binary(y_val)

    # Clean up: the archive first, then the whole working directory.
    os.remove(archive_path)
    shutil.rmtree(data_dir)

    return dict(
        X_train=X_train.astype('float32'), y_train=y_train,
        X_valid=X_val.astype('float32'), y_valid=y_val,
        X_test=X_test.astype('float32'), y_test=y_test,
    )

def fetch_bank_marketing(data_dir="./BANK"):
    """Download the Bank Marketing archive and return standardized train/valid/test splits.

    The outer archive contains ``bank-additional.zip``, which is extracted as
    well; ``bank-additional/bank-additional-full.csv`` is label-encoded with
    ``preprocess_data_bank_marketing``, standardized and split 60/20/20.
    ``data_dir`` is deleted before returning.

    Note: the scaler is fitted on all rows before the split is made.

    Returns:
        dict with float32 ``X_*`` arrays and int64 0/1 ``y_*`` arrays.
    """
    print("---------------------BANK--------------------------------------")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    outer_archive = os.path.join(data_dir, "bank_marketing.zip")
    urllib.request.urlretrieve(
        "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip", outer_archive
    )

    # Two-stage extraction: the download, then the nested archive it contains.
    inner_archive = os.path.join(data_dir, "bank-additional.zip")
    for archive_path in (outer_archive, inner_archive):
        with zipfile.ZipFile(archive_path, "r") as archive:
            archive.extractall(data_dir)

    csv_path = os.path.join(os.path.join(data_dir, "bank-additional"), "bank-additional-full.csv")
    frame = preprocess_data_bank_marketing(pd.read_csv(csv_path, sep=';'))

    features = StandardScaler().fit_transform(frame.drop("y", axis=1))
    target = frame["y"]

    X_train, X_rest, y_train, y_rest = train_test_split(features, target, test_size=0.4, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42)

    def _to_binary(series):
        return (series.values.reshape(-1) == 1).astype('int64')

    y_train, y_test, y_val = _to_binary(y_train), _to_binary(y_test), _to_binary(y_val)

    # Clean up: the archive first, then the whole working directory.
    os.remove(outer_archive)
    shutil.rmtree(data_dir)

    return dict(
        X_train=X_train.astype('float32'), y_train=y_train,
        X_test=X_test.astype('float32'), y_test=y_test,
        X_valid=X_val.astype('float32'), y_valid=y_val,
    )

def fetch_credit_card_defaults(data_dir="./CREDIT"):
    """Download the credit-card-default archive and return standardized splits.

    The ``.xls`` workbook is read with ``pd.read_excel`` (first row skipped),
    preprocessed with ``preprocess_data_credit_card_defaults``, standardized
    and split 60/20/20. ``data_dir`` is deleted before returning.

    Reading ``.xls`` files requires the ``xlrd`` package. If it is missing it
    is installed with pip through the running interpreter; this replaces the
    former ``!pip install xlrd`` shell escape, which is only valid inside
    IPython and ran pip on every call.

    Note: the scaler is fitted on all rows before the split is made.

    Returns:
        dict with float32 ``X_*`` arrays and int64 0/1 ``y_*`` arrays.
    """
    import importlib
    import importlib.util
    import subprocess
    import sys

    print("---------------------CREDIT--------------------------------------")

    if importlib.util.find_spec("xlrd") is None:
        # Best effort, like the shell escape it replaces: a failed install is
        # not fatal here; pd.read_excel reports the missing engine below.
        subprocess.run([sys.executable, "-m", "pip", "install", "xlrd"], check=False)
        importlib.invalidate_caches()

    # Create the data directory if it doesn't exist
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # URL of the dataset zip file
    url = "https://archive.ics.uci.edu/static/public/350/default+of+credit+card+clients.zip"
    zip_file_path = os.path.join(data_dir, "credit_card_defaults.zip")

    # Download the zip file
    urllib.request.urlretrieve(url, zip_file_path)

    # Extract the zip file
    with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
        zip_ref.extractall(data_dir)

    # Read the dataset; the workbook sits directly in data_dir after extraction.
    data = pd.read_excel(os.path.join(data_dir, "default of credit card clients.xls"), skiprows=1)

    # Preprocess the data
    data = preprocess_data_credit_card_defaults(data)

    # Split the data into train, validation, and test sets
    X = data.drop("default payment next month", axis=1)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = data["default payment next month"]
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # Labels become int64 arrays holding 1 where the original value equals 1, else 0.
    y_train = (y_train.values.reshape(-1) == 1).astype('int64')
    y_test = (y_test.values.reshape(-1) == 1).astype('int64')
    y_val = (y_val.values.reshape(-1) == 1).astype('int64')

    # Remove the zip file
    os.remove(zip_file_path)

    # Remove the extracted directory and its contents
    shutil.rmtree(data_dir)

    return dict(
        X_train=X_train.astype('float32'), y_train=y_train, X_valid=X_val.astype('float32'), y_valid=y_val , X_test=X_test.astype('float32'), y_test=y_test
    )


def fetch_gamma_telescope(data_dir="./TELESCOPE"):
    """Download the MAGIC gamma telescope archive and return standardized splits.

    ``magic04.data`` is read without a header, the class letters are mapped to
    ``g -> 1`` and ``h -> 0``, features are standardized and the rows are
    split 60/20/20. ``data_dir`` is deleted before returning.

    Note: the scaler is fitted on all rows before the split is made.

    Returns:
        dict with float32 ``X_*`` arrays and int64 0/1 ``y_*`` arrays.
    """
    print("---------------------TELESCOPE--------------------------------------")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    archive_path = os.path.join(data_dir, "magic_gamma_telescope.zip")
    urllib.request.urlretrieve(
        "https://archive.ics.uci.edu/static/public/159/magic+gamma+telescope.zip", archive_path
    )
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(data_dir)

    column_names = [
        "fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long",
        "fM3Trans", "fAlpha", "fDist", "class"
    ]
    frame = pd.read_csv(os.path.join(data_dir, "magic04.data"), header=None, names=column_names)
    frame["class"] = frame["class"].map({"g": 1, "h": 0})

    features = StandardScaler().fit_transform(frame.drop("class", axis=1))
    target = frame["class"]

    X_train, X_rest, y_train, y_rest = train_test_split(features, target, test_size=0.4, random_state=42)
    X_valid, X_test, y_valid, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42)

    def _to_binary(series):
        return (series.values.reshape(-1) == 1).astype('int64')

    splits = {
        "X_train": X_train.astype('float32'), "y_train": _to_binary(y_train),
        "X_valid": X_valid.astype('float32'), "y_valid": _to_binary(y_valid),
        "X_test": X_test.astype('float32'), "y_test": _to_binary(y_test),
    }

    # Clean up: the archive first, then the whole working directory.
    os.remove(archive_path)
    shutil.rmtree(data_dir)

    return splits

def fetch_rice_dataset(data_dir="./RICE"):
    """Download the Rice (Cammeo and Osmancik) archive and return standardized splits.

    The ARFF file is loaded with ``scipy.io.arff``, the class is mapped to
    ``Cammeo -> 1`` and ``Osmancik -> 0``, features are standardized and the
    rows are split 60/20/20. ``data_dir`` is deleted before returning.

    The debug ``print`` of the complete DataFrame that used to run on every
    call has been removed.

    Note: the scaler is fitted on all rows before the split is made.

    Returns:
        dict with float32 ``X_*`` arrays and int64 0/1 ``y_*`` arrays.
    """
    print("---------------------RICE--------------------------------------")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # URL of the dataset zip file
    url = "https://archive.ics.uci.edu/static/public/545/rice+cammeo+and+osmancik.zip"
    zip_file_path = os.path.join(data_dir, "rice_dataset.zip")

    # Download the zip file
    urllib.request.urlretrieve(url, zip_file_path)

    # Extract the zip file
    with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
        zip_ref.extractall(data_dir)

    # Load the ARFF file using SciPy
    arff_file_name = os.path.join(data_dir, "Rice_Cammeo_Osmancik.arff")
    data, meta = arff.loadarff(arff_file_name)

    df = pd.DataFrame(data)
    # The class values are byte strings here, hence the bytes keys.
    df["Class"] = df["Class"].map({b'Cammeo': 1, b'Osmancik': 0})

    # Split the data into features (X) and target (y)
    X = df.drop("Class", axis=1)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = df["Class"]

    # Split the data into train, test, and validation sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    y_train = (y_train.values.reshape(-1) == 1).astype('int64')
    y_test = (y_test.values.reshape(-1) == 1).astype('int64')
    y_valid = (y_valid.values.reshape(-1) == 1).astype('int64')

    # Create a dictionary to store the data splits
    data_splits = {
        "X_train": X_train.astype('float32'), "y_train": y_train,
        "X_valid": X_valid.astype('float32'), "y_valid": y_valid,
        "X_test": X_test.astype('float32'), "y_test": y_test
    }

    # Remove the zip file
    os.remove(zip_file_path)

    # Remove the extracted directory and its contents
    shutil.rmtree(data_dir)

    return data_splits

def fetch_german_credit_data(data_dir="./GERMAN"):
    """Download the Statlog German credit archive and return standardized splits.

    ``german.data`` is read as space-separated values, the class is mapped to
    ``1 -> 1`` and ``2 -> 0``, missing values are forward-filled, categorical
    columns are one-hot encoded, features are standardized and the rows are
    split 60/20/20. ``data_dir`` is deleted before returning.

    Note: the scaler is fitted on all rows before the split is made.

    Returns:
        dict with float32 ``X_*`` arrays and int64 0/1 ``y_*`` arrays.
    """
    print("---------------------GERMAN--------------------------------------")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # URL of the dataset zip file
    url = "http://archive.ics.uci.edu/static/public/144/statlog+german+credit+data.zip"
    zip_file_path = os.path.join(data_dir, "german_credit_data.zip")

    # Download the zip file
    urllib.request.urlretrieve(url, zip_file_path)

    # Extract the zip file
    with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
        zip_ref.extractall(data_dir)

    # Load the data from CSV
    data_path = os.path.join(data_dir, "german.data")

    columns = [
        "checking_account_status", "duration_months", "credit_history", "purpose",
        "credit_amount", "savings_account_bonds", "employment", "installment_rate",
        "personal_status_sex", "other_debtors_guarantors", "present_residence",
        "property", "age", "other_installment_plans", "housing", "existing_credits",
        "job", "num_dependents", "own_telephone", "foreign_worker", "class"
    ]
    data = pd.read_csv(data_path, sep=' ', header=None, names=columns)

    # Convert the class labels to binary format (1 = Good, 2 = Bad)
    data["class"] = data["class"].map({1: 1, 2: 0})

    # Forward-fill missing values. DataFrame.ffill() replaces the deprecated
    # fillna(method='ffill') spelling and produces the same result.
    data = data.ffill()

    # Convert categorical variables to dummy variables
    categorical_columns = [
        "checking_account_status", "credit_history", "purpose", "savings_account_bonds",
        "employment", "personal_status_sex", "other_debtors_guarantors", "property",
        "other_installment_plans", "housing", "job", "own_telephone", "foreign_worker"
    ]
    data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

    # Split the data into features (X) and target (y)
    X = data.drop("class", axis=1)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = data["class"]

    # Split the data into train, test, and validation sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    y_train = (y_train.values.reshape(-1) == 1).astype('int64')
    y_test = (y_test.values.reshape(-1) == 1).astype('int64')
    y_valid = (y_valid.values.reshape(-1) == 1).astype('int64')

    # Create a dictionary to store the data splits
    data_splits = {
        "X_train": X_train.astype('float32'), "y_train": y_train,
        "X_valid": X_valid.astype('float32'), "y_valid": y_valid,
        "X_test": X_test.astype('float32'), "y_test": y_test
    }

    # Remove the zip file
    os.remove(zip_file_path)

    # Remove the extracted directory and its contents
    shutil.rmtree(data_dir)

    return data_splits

def fetch_spambase_dataset(data_dir="./SPAM"):
    """Download the Spambase archive and return standardized train/valid/test splits.

    ``spambase.data`` is read without a header as 57 feature columns
    (``f0`` .. ``f56``) plus the ``spam`` target; features are standardized
    and the rows are split 60/20/20. ``data_dir`` is deleted before returning.

    Note: the scaler is fitted on all rows before the split is made.

    Returns:
        dict with float32 ``X_*`` arrays and int64 0/1 ``y_*`` arrays.
    """
    print("---------------------SPAM--------------------------------------")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    archive_path = os.path.join(data_dir, "spambase.zip")
    urllib.request.urlretrieve("http://archive.ics.uci.edu/static/public/94/spambase.zip", archive_path)
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(data_dir)

    column_names = [f"f{i}" for i in range(57)] + ["spam"]
    frame = pd.read_csv(os.path.join(data_dir, "spambase.data"), header=None, names=column_names)

    features = StandardScaler().fit_transform(frame.drop("spam", axis=1))
    target = frame["spam"]

    X_train, X_rest, y_train, y_rest = train_test_split(features, target, test_size=0.4, random_state=42)
    X_valid, X_test, y_valid, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42)

    def _to_binary(series):
        return (series.values.reshape(-1) == 1).astype('int64')

    splits = {
        "X_train": X_train.astype('float32'), "y_train": _to_binary(y_train),
        "X_valid": X_valid.astype('float32'), "y_valid": _to_binary(y_valid),
        "X_test": X_test.astype('float32'), "y_test": _to_binary(y_test),
    }

    # Clean up: the archive first, then the whole working directory.
    os.remove(archive_path)
    shutil.rmtree(data_dir)

    return splits

def fetch_accelerometer_gyro_dataset(data_dir="./GYRO"):
    """Download the accelerometer/gyro archive and return standardized splits.

    The CSV is loaded, ``timestamp`` is replaced by its category codes,
    ``Activity`` is the target, features are standardized and the rows are
    split 60/20/20. ``data_dir`` is deleted before returning.

    Note: the scaler is fitted on all rows before the split is made.

    Returns:
        dict with float32 ``X_*`` arrays and int64 ``y_*`` arrays holding 1
        where ``Activity`` equals 1 and 0 elsewhere.
    """
    print("---------------------GYRO--------------------------------------")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    archive_path = os.path.join(data_dir, "accelerometer_gyro_dataset.zip")
    urllib.request.urlretrieve(
        "https://archive.ics.uci.edu/static/public/755/accelerometer+gyro+mobile+phone+dataset.zip",
        archive_path,
    )
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(data_dir)

    frame = pd.read_csv(os.path.join(data_dir, "accelerometer_gyro_mobile_phone_dataset.csv"))

    # Turn the timestamp column into integer category codes so it can be scaled.
    frame["timestamp"] = frame["timestamp"].astype("category").cat.codes

    features = StandardScaler().fit_transform(frame.drop("Activity", axis=1))
    target = frame["Activity"]

    X_train, X_rest, y_train, y_rest = train_test_split(features, target, test_size=0.4, random_state=42)
    X_valid, X_test, y_valid, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42)

    def _to_binary(series):
        return (series.values.reshape(-1) == 1).astype('int64')

    splits = {
        "X_train": X_train.astype('float32'), "y_train": _to_binary(y_train),
        "X_valid": X_valid.astype('float32'), "y_valid": _to_binary(y_valid),
        "X_test": X_test.astype('float32'), "y_test": _to_binary(y_test),
    }

    # Clean up: the archive first, then the whole working directory.
    os.remove(archive_path)
    shutil.rmtree(data_dir)

    return splits

def fetch_swarm_behaviour(data_dir="./SWARM"):
    """Download the Swarm Behaviour archive and return standardized splits.

    ``Swarm Behavior Data/Grouped.csv`` is loaded with ``Class`` as the
    target; features are standardized and the rows are split 60/20/20.
    ``data_dir`` is deleted before returning.

    Note: the scaler is fitted on all rows before the split is made.

    Returns:
        dict with float32 ``X_*`` arrays and int64 0/1 ``y_*`` arrays.
    """
    print("---------------------SWARM--------------------------------------")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    archive_path = os.path.join(data_dir, "swarm_behaviour.zip")
    urllib.request.urlretrieve(
        "https://archive.ics.uci.edu/static/public/524/swarm+behaviour.zip", archive_path
    )
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(data_dir)

    frame = pd.read_csv(os.path.join(data_dir, "Swarm Behavior Data/Grouped.csv"))

    features = StandardScaler().fit_transform(frame.drop("Class", axis=1))
    target = frame["Class"]

    X_train, X_rest, y_train, y_rest = train_test_split(features, target, test_size=0.4, random_state=42)
    X_valid, X_test, y_valid, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42)

    def _to_binary(series):
        return (series.values.reshape(-1) == 1).astype('int64')

    splits = {
        "X_train": X_train.astype('float32'), "y_train": _to_binary(y_train),
        "X_valid": X_valid.astype('float32'), "y_valid": _to_binary(y_valid),
        "X_test": X_test.astype('float32'), "y_test": _to_binary(y_test),
    }

    # Clean up: the archive first, then the whole working directory.
    os.remove(archive_path)
    shutil.rmtree(data_dir)
    return splits


def fetch_openml_credit_data(data_dir="./OpenML_Credit"):
    """Download the OpenML credit ARFF file and return standardized splits.

    The last column of the file is treated as the target and cast to int;
    the remaining columns are standardized. 70% of the rows form the training
    set; of the remaining 30%, 70% become the test set and 30% the validation
    set. ``data_dir`` is deleted before returning.

    Note: the scaler is fitted on all rows before the split is made.

    Returns:
        dict with float32 ``X_*`` arrays and int64 0/1 ``y_*`` arrays.
    """
    print("---------------------OpenML_Credit DATASET--------------------------------------")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    arff_path = os.path.join(data_dir, "credit.arff")
    urllib.request.urlretrieve("https://api.openml.org/data/v1/download/22103185/credit.arff", arff_path)

    records, _meta = arff.loadarff(arff_path)
    frame = pd.DataFrame(records)

    # The target is whatever column comes last in the file.
    target_name = frame.columns[-1]
    frame[target_name] = frame[target_name].astype(int)

    features = StandardScaler().fit_transform(frame.drop(target_name, axis=1))
    target = frame[target_name]

    X_train, X_rest, y_train, y_rest = train_test_split(features, target, test_size=0.3, random_state=42)
    # The larger part of the remainder is the test set, the smaller the validation set.
    X_test, X_val, y_test, y_val = train_test_split(X_rest, y_rest, test_size=0.3, random_state=42)

    def _to_binary(series):
        return (series.values.reshape(-1) == 1).astype('int64')

    y_train, y_test, y_val = _to_binary(y_train), _to_binary(y_test), _to_binary(y_val)

    # Clean up: the downloaded file first, then the whole working directory.
    os.remove(arff_path)
    shutil.rmtree(data_dir)

    return dict(
        X_train=X_train.astype('float32'), y_train=y_train,
        X_valid=X_val.astype('float32'), y_valid=y_val,
        X_test=X_test.astype('float32'), y_test=y_test,
    )


def fetch_openml_electricity_data(data_dir="./OpenML_Electricity"):
    print("---------------------OpenML_Electricity DATASET--------------------------------------")
    # Create the data directory if it doesn't exist
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    data_url = "https://api.openml.org/data/v1/download/22103245/electricity.arff"
    # Download the ARFF file
    arff_file_path = os.path.join(data_dir, "electricity.arff")
    urllib.request.urlretrieve(data_url, arff_file_path)

    # Load ARFF file into DataFrame
    data, meta = arff.loadarff(arff_file_path)
    df = pd.DataFrame(data)
    # Convert target variable to int
    last_column = df.columns[-1]

    df[last_column] = df[last_column].map({b'DOWN': 0, b'UP': 1})
    
#     print("df",df)

    # Split the data into train, validation, and test sets
    X = df.drop(last_column, axis=1)  # Assuming "SeriousDlqin2yrs" is the target variable
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = df[last_column]

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)

#     y_train = y_train.astype('int64')
#     y_test = y_test.astype('int64')
#     y_val = y_val.astype('int64')

    y_train = (y_train.values.reshape(-1) == 1).astype('int64')
    y_test = (y_test.values.reshape(-1) == 1).astype('int64')
    y_val = (y_val.values.reshape(-1) == 1).astype('int64')

    # Remove the ARFF file
    os.remove(arff_file_path)

    # Remove the data directory
    shutil.rmtree(data_dir)

    return dict(
        X_train=X_train.astype('float32'), y_train=y_train,
        X_valid=X_val.astype('float32'), y_valid=y_val,
        X_test=X_test.astype('float32'), y_test=y_test
    )


def fetch_openml_covertype_data(data_dir="./OpenML_Covertype"):
    print("---------------------OpenML_Covertype DATASET--------------------------------------")
    # Create the data directory if it doesn't exist
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    data_url = "https://api.openml.org/data/v1/download/22103246/covertype.arff"
    # Download the ARFF file
    arff_file_path = os.path.join(data_dir, "covertype.arff")
    urllib.request.urlretrieve(data_url, arff_file_path)

    # Load ARFF file into DataFrame
    data, meta = arff.loadarff(arff_file_path)
    df = pd.DataFrame(data)
    # Convert target variable to int
    last_column = df.columns[-1]

    df[last_column] = df[last_column].astype(int)
    
#     print("df",df)

    # Split the data into train, validation, and test sets
    X = df.drop(last_column, axis=1)  # Assuming "SeriousDlqin2yrs" is the target variable
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = df[last_column]

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)

#     y_train = y_train.astype('int64')
#     y_test = y_test.astype('int64')
#     y_val = y_val.astype('int64')

    y_train = (y_train.values.reshape(-1) == 1).astype('int64')
    y_test = (y_test.values.reshape(-1) == 1).astype('int64')
    y_val = (y_val.values.reshape(-1) == 1).astype('int64')

    # Remove the ARFF file
    os.remove(arff_file_path)

    # Remove the data directory
    shutil.rmtree(data_dir)

    return dict(
        X_train=X_train.astype('float32'), y_train=y_train,
        X_valid=X_val.astype('float32'), y_valid=y_val,
        X_test=X_test.astype('float32'), y_test=y_test
    )


def fetch_openml_pol_data(data_dir="./OpenML_Pol"):
    print("---------------------OpenML_Pol DATASET--------------------------------------")
    # Create the data directory if it doesn't exist
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    data_url = "https://api.openml.org/data/v1/download/22103247/pol.arff"
    # Download the ARFF file
    arff_file_path = os.path.join(data_dir, "pol.arff")
    urllib.request.urlretrieve(data_url, arff_file_path)

    # Load ARFF file into DataFrame
    data, meta = arff.loadarff(arff_file_path)
    df = pd.DataFrame(data)
    # Convert target variable to int
    last_column = df.columns[-1]

#     print("df",df)
    
    df[last_column] = df[last_column].map({b'N':0,b'P':1})
    
    

    # Split the data into train, validation, and test sets
    X = df.drop(last_column, axis=1)  # Assuming "SeriousDlqin2yrs" is the target variable
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = df[last_column]

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)

#     y_train = y_train.astype('int64')
#     y_test = y_test.astype('int64')
#     y_val = y_val.astype('int64')

    y_train = (y_train.values.reshape(-1) == 1).astype('int64')
    y_test = (y_test.values.reshape(-1) == 1).astype('int64')
    y_val = (y_val.values.reshape(-1) == 1).astype('int64')

    # Remove the ARFF file
    os.remove(arff_file_path)

    # Remove the data directory
    shutil.rmtree(data_dir)

    return dict(
        X_train=X_train.astype('float32'), y_train=y_train,
        X_valid=X_val.astype('float32'), y_valid=y_val,
        X_test=X_test.astype('float32'), y_test=y_test
    )


def fetch_openml_house_16H_data(data_dir="./OpenML_House_16H"):
    print("---------------------OpenML_House_16H DATASET--------------------------------------")
    # Create the data directory if it doesn't exist
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    data_url = "https://api.openml.org/data/v1/download/22103248/house_16H.arff"
    # Download the ARFF file
    arff_file_path = os.path.join(data_dir, "house_16H.arff")
    urllib.request.urlretrieve(data_url, arff_file_path)

    # Load ARFF file into DataFrame
    data, meta = arff.loadarff(arff_file_path)
    df = pd.DataFrame(data)
    # Convert target variable to int
    last_column = df.columns[-1]

#     print("df",df)
    df[last_column] = df[last_column].map({b'N':0,b'P':1})
    
    # Split the data into train, validation, and test sets
    X = df.drop(last_column, axis=1)  # Assuming "SeriousDlqin2yrs" is the target variable
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = df[last_column]

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)

#     y_train = y_train.astype('int64')
#     y_test = y_test.astype('int64')
#     y_val = y_val.astype('int64')

    y_train = (y_train.values.reshape(-1) == 1).astype('int64')
    y_test = (y_test.values.reshape(-1) == 1).astype('int64')
    y_val = (y_val.values.reshape(-1) == 1).astype('int64')

    # Remove the ARFF file
    os.remove(arff_file_path)

    # Remove the data directory
    shutil.rmtree(data_dir)

    return dict(
        X_train=X_train.astype('float32'), y_train=y_train,
        X_valid=X_val.astype('float32'), y_valid=y_val,
        X_test=X_test.astype('float32'), y_test=y_test
    )


def fetch_openml_MiniBooNE_data(data_dir="./OpenML_MiniBooNE"):
    print("---------------------OpenML_MiniBooNE DATASET--------------------------------------")
    # Create the data directory if it doesn't exist
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    data_url = "https://api.openml.org/data/v1/download/22103253/MiniBooNE.arff"
    # Download the ARFF file
    arff_file_path = os.path.join(data_dir, "MiniBooNE.arff")
    urllib.request.urlretrieve(data_url, arff_file_path)

    # Load ARFF file into DataFrame
    data, meta = arff.loadarff(arff_file_path)
    df = pd.DataFrame(data)
    # Convert target variable to int
    last_column = df.columns[-1]

#     print("df",df)
    
    df[last_column] = df[last_column].map({b'False':0,b'True':1})

    # Split the data into train, validation, and test sets
    X = df.drop(last_column, axis=1)  # Assuming "SeriousDlqin2yrs" is the target variable
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = df[last_column]

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)

#     y_train = y_train.astype('int64')
#     y_test = y_test.astype('int64')
#     y_val = y_val.astype('int64')

    y_train = (y_train.values.reshape(-1) == 1).astype('int64')
    y_test = (y_test.values.reshape(-1) == 1).astype('int64')
    y_val = (y_val.values.reshape(-1) == 1).astype('int64')

    # Remove the ARFF file
    os.remove(arff_file_path)

    # Remove the data directory
    shutil.rmtree(data_dir)

    return dict(
        X_train=X_train.astype('float32'), y_train=y_train,
        X_valid=X_val.astype('float32'), y_valid=y_val,
        X_test=X_test.astype('float32'), y_test=y_test
    )


def fetch_openml_eye_movements_data(data_dir="./OpenML_Eye_movements"):
    print("---------------------OpenML_Eye_movements DATASET--------------------------------------")
    # Create the data directory if it doesn't exist
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    data_url = "https://api.openml.org/data/v1/download/22103255/eye_movements.arff"
    # Download the ARFF file
    arff_file_path = os.path.join(data_dir, "eye_movements.arff")
    urllib.request.urlretrieve(data_url, arff_file_path)

    # Load ARFF file into DataFrame
    data, meta = arff.loadarff(arff_file_path)
    df = pd.DataFrame(data)
    # Convert target variable to int
    last_column = df.columns[-1]

#     print("df",df)
    df[last_column] = df[last_column].astype(int)

    # Split the data into train, validation, and test sets
    X = df.drop(last_column, axis=1)  # Assuming "SeriousDlqin2yrs" is the target variable
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = df[last_column]

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)

#     y_train = y_train.astype('int64')
#     y_test = y_test.astype('int64')
#     y_val = y_val.astype('int64')

    y_train = (y_train.values.reshape(-1) == 1).astype('int64')
    y_test = (y_test.values.reshape(-1) == 1).astype('int64')
    y_val = (y_val.values.reshape(-1) == 1).astype('int64')

    # Remove the ARFF file
    os.remove(arff_file_path)

    # Remove the data directory
    shutil.rmtree(data_dir)

    return dict(
        X_train=X_train.astype('float32'), y_train=y_train,
        X_valid=X_val.astype('float32'), y_valid=y_val,
        X_test=X_test.astype('float32'), y_test=y_test
    )


def fetch_openml_Diabetes130US_data(data_dir="./OpenML_Diabetes130US"):
    print("---------------------OpenML_Diabetes130US DATASET--------------------------------------")
    # Create the data directory if it doesn't exist
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    data_url = "https://api.openml.org/data/v1/download/22111908/Diabetes130US.arff"
    # Download the ARFF file
    arff_file_path = os.path.join(data_dir, "Diabetes130US.arff")
    urllib.request.urlretrieve(data_url, arff_file_path)

    # Load ARFF file into DataFrame
    data, meta = arff.loadarff(arff_file_path)
    df = pd.DataFrame(data)
    # Convert target variable to int
    last_column = df.columns[-1]
#     print("df",df)
    df[last_column] = df[last_column].astype(int)
    

    # Split the data into train, validation, and test sets
    X = df.drop(last_column, axis=1)  # Assuming "SeriousDlqin2yrs" is the target variable
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = df[last_column]

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)

#     y_train = y_train.astype('int64')
#     y_test = y_test.astype('int64')
#     y_val = y_val.astype('int64')

    y_train = (y_train.values.reshape(-1) == 1).astype('int64')
    y_test = (y_test.values.reshape(-1) == 1).astype('int64')
    y_val = (y_val.values.reshape(-1) == 1).astype('int64')

    # Remove the ARFF file
    os.remove(arff_file_path)

    # Remove the data directory
    shutil.rmtree(data_dir)

    return dict(
        X_train=X_train.astype('float32'), y_train=y_train,
        X_valid=X_val.astype('float32'), y_valid=y_val,
        X_test=X_test.astype('float32'), y_test=y_test
    )


def fetch_openml_jannis_data(data_dir="./OpenML_Jannis"):
    print("---------------------OpenML_Jannis DATASET--------------------------------------")
    # Create the data directory if it doesn't exist
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    data_url = "https://api.openml.org/data/v1/download/22111907/jannis.arff"
    # Download the ARFF file
    arff_file_path = os.path.join(data_dir, "jannis.arff")
    urllib.request.urlretrieve(data_url, arff_file_path)

    # Load ARFF file into DataFrame
    data, meta = arff.loadarff(arff_file_path)
    df = pd.DataFrame(data)
    # Convert target variable to int
    last_column = df.columns[-1]
#     print("df",df)

    df[last_column] = df[last_column].astype(int)


    # Split the data into train, validation, and test sets
    X = df.drop(last_column, axis=1)  # Assuming "SeriousDlqin2yrs" is the target variable
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = df[last_column]

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)

#     y_train = y_train.astype('int64')
#     y_test = y_test.astype('int64')
#     y_val = y_val.astype('int64')

    y_train = (y_train.values.reshape(-1) == 1).astype('int64')
    y_test = (y_test.values.reshape(-1) == 1).astype('int64')
    y_val = (y_val.values.reshape(-1) == 1).astype('int64')

    # Remove the ARFF file
    os.remove(arff_file_path)

    # Remove the data directory
    shutil.rmtree(data_dir)

    return dict(
        X_train=X_train.astype('float32'), y_train=y_train,
        X_valid=X_val.astype('float32'), y_valid=y_val,
        X_test=X_test.astype('float32'), y_test=y_test
    )


def fetch_openml_Bioresponse_data(data_dir="./OpenML_Bioresponse"):
    print("---------------------OpenML_Bioresponse DATASET--------------------------------------")
    # Create the data directory if it doesn't exist
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    data_url = "https://api.openml.org/data/v1/download/22111905/Bioresponse.arff"
    # Download the ARFF file
    arff_file_path = os.path.join(data_dir, "Bioresponse.arff")
    urllib.request.urlretrieve(data_url, arff_file_path)

    # Load ARFF file into DataFrame
    data, meta = arff.loadarff(arff_file_path)
    df = pd.DataFrame(data)
    # Convert target variable to int
    last_column = df.columns[-1]
#     print("df",df)

    df[last_column] = df[last_column].astype(int)

    # Split the data into train, validation, and test sets
    X = df.drop(last_column, axis=1)  # Assuming "SeriousDlqin2yrs" is the target variable
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = df[last_column]

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)

#     y_train = y_train.astype('int64')
#     y_test = y_test.astype('int64')
#     y_val = y_val.astype('int64')

    y_train = (y_train.values.reshape(-1) == 1).astype('int64')
    y_test = (y_test.values.reshape(-1) == 1).astype('int64')
    y_val = (y_val.values.reshape(-1) == 1).astype('int64')

    # Remove the ARFF file
    os.remove(arff_file_path)

    # Remove the data directory
    shutil.rmtree(data_dir)

    return dict(
        X_train=X_train.astype('float32'), y_train=y_train,
        X_valid=X_val.astype('float32'), y_valid=y_val,
        X_test=X_test.astype('float32'), y_test=y_test
    )


def fetch_openml_california_data(data_dir="./OpenML_California"):
    print("---------------------OpenML_California DATASET--------------------------------------")
    # Create the data directory if it doesn't exist
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    data_url = "https://api.openml.org/data/v1/download/22111914/california.arff"
    # Download the ARFF file
    arff_file_path = os.path.join(data_dir, "california.arff")
    urllib.request.urlretrieve(data_url, arff_file_path)

    # Load ARFF file into DataFrame
    data, meta = arff.loadarff(arff_file_path)
    df = pd.DataFrame(data)
    # Convert target variable to int
    last_column = df.columns[-1]
#     print("df",df)

    df[last_column] = df[last_column].astype(int)

    # Split the data into train, validation, and test sets
    X = df.drop(last_column, axis=1)  # Assuming "SeriousDlqin2yrs" is the target variable
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = df[last_column]

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)

#     y_train = y_train.astype('int64')
#     y_test = y_test.astype('int64')
#     y_val = y_val.astype('int64')

    y_train = (y_train.values.reshape(-1) == 1).astype('int64')
    y_test = (y_test.values.reshape(-1) == 1).astype('int64')
    y_val = (y_val.values.reshape(-1) == 1).astype('int64')

    # Remove the ARFF file
    os.remove(arff_file_path)

    # Remove the data directory
    shutil.rmtree(data_dir)

    return dict(
        X_train=X_train.astype('float32'), y_train=y_train,
        X_valid=X_val.astype('float32'), y_valid=y_val,
        X_test=X_test.astype('float32'), y_test=y_test
    )


def fetch_openml_heloc_data(data_dir="./OpenML_Heloc"):
    print("---------------------OpenML_Heloc DATASET--------------------------------------")
    # Create the data directory if it doesn't exist
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    data_url = "https://api.openml.org/data/v1/download/22111912/heloc.arff"
    # Download the ARFF file
    arff_file_path = os.path.join(data_dir, "heloc.arff")
    urllib.request.urlretrieve(data_url, arff_file_path)

    # Load ARFF file into DataFrame
    data, meta = arff.loadarff(arff_file_path)
    df = pd.DataFrame(data)
    # Convert target variable to int
    last_column = df.columns[-1]
#     print("df",df)

    df[last_column] = df[last_column].astype(int)

    # Split the data into train, validation, and test sets
    X = df.drop(last_column, axis=1)  # Assuming "SeriousDlqin2yrs" is the target variable
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = df[last_column]

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)

#     y_train = y_train.astype('int64')
#     y_test = y_test.astype('int64')
#     y_val = y_val.astype('int64')

    y_train = (y_train.values.reshape(-1) == 1).astype('int64')
    y_test = (y_test.values.reshape(-1) == 1).astype('int64')
    y_val = (y_val.values.reshape(-1) == 1).astype('int64')

    # Remove the ARFF file
    os.remove(arff_file_path)

    # Remove the data directory
    shutil.rmtree(data_dir)

    return dict(
        X_train=X_train.astype('float32'), y_train=y_train,
        X_valid=X_val.astype('float32'), y_valid=y_val,
        X_test=X_test.astype('float32'), y_test=y_test
    )

#**class Dataset:**

# Registry mapping a dataset name to the loader function that fetches it.
# Dataset.__init__ looks names up here and calls the loader with the target
# directory as first positional argument.
REAL_DATASETS = {}

# Loaders that are currently disabled (kept for reference):
#     'A9A': fetch_A9A, 'EPSILON': fetch_EPSILON, 'PROTEIN': fetch_PROTEIN,
#     'YEAR': fetch_YEAR, 'MICROSOFT': fetch_MICROSOFT, 'YAHOO': fetch_YAHOO,
#     'CLICK': fetch_CLICK, 'GLASS': fetch_GLASS, 'COVTYPE': fetch_COVTYPE,
#     'ALOI': fetch_ALOI, 'DIGITS': fetch_DIGITS, 'MUSH': fetch_MUSHROOMS,
#     'TTT': fetch_TICTACTOE, 'HIGGS': fetch_HIGGS

####### 10 latest UCI datasets ########
REAL_DATASETS.update(
    ADULT=fetch_ADULT,
    bank_marketing=fetch_bank_marketing,
    credit_card_defaults=fetch_credit_card_defaults,
    gamma_telescope=fetch_gamma_telescope,
    rice_dataset=fetch_rice_dataset,
    german_credit_data=fetch_german_credit_data,
    spambase_dataset=fetch_spambase_dataset,
    accelerometer_gyro_dataset=fetch_accelerometer_gyro_dataset,
    swarm_behaviour=fetch_swarm_behaviour,
)

######## OpenML Tabular Datasets ##########
REAL_DATASETS.update(
    OpenML_Credit=fetch_openml_credit_data,
    OpenML_Electricity=fetch_openml_electricity_data,
    OpenML_Covertype=fetch_openml_covertype_data,
    OpenML_Pol=fetch_openml_pol_data,
    OpenML_House_16H=fetch_openml_house_16H_data,
    OpenML_MiniBooNE=fetch_openml_MiniBooNE_data,
    OpenML_Eye_movements=fetch_openml_eye_movements_data,
    OpenML_Diabetes130US=fetch_openml_Diabetes130US_data,
    OpenML_Jannis=fetch_openml_jannis_data,
    OpenML_Bioresponse=fetch_openml_Bioresponse_data,
    OpenML_California=fetch_openml_california_data,
    OpenML_Heloc=fetch_openml_heloc_data,
)

# Names of synthetic datasets that Dataset generates through toy_dataset(distr=name).
TOY_DATASETS = ['xor', 'reg-xor', 'swissroll']

class Dataset:
    def __init__(self, dataset, data_path='./DATA', normalize=False, normalize_target=False, quantile_transform=False, quantile_noise=1e-3, in_features=None, out_features=None, flatten=False, **kwargs):
        """
        Dataset is a dataclass that contains all training and evaluation data required for an experiment
        :param dataset: a pre-defined dataset name (a key of REAL_DATASETS or an entry of TOY_DATASETS).
            Real datasets are loaded through {data_path}/{dataset}. A name found in neither
            registry loads nothing: only `data_path` and `dataset` are set on the instance.
        :param data_path: a shared data folder path where the dataset is stored (or will be downloaded into)
        :param normalize: standardize features by removing the mean and scaling to unit variance
            (statistics are computed on the training split, per index of axis 1)
        :param normalize_target: standardize the targets with the mean/std of `y_train`
        :param quantile_transform: whether to transform the feature distributions into normals, using a quantile transform
        :param quantile_noise: magnitude of the quantile noise
        :param in_features: which features to use as inputs
        :param out_features: which features to reconstruct as output
        :param flatten: whether flattening instances to vectors
        :param kwargs: forwarded to the dataset loader (train size, test size or other params)
        """

        if dataset in REAL_DATASETS:
            data_dict = REAL_DATASETS[dataset](Path(data_path) / dataset, **kwargs)

            self.X_train = data_dict['X_train']
            self.y_train = data_dict['y_train']
            self.X_valid = data_dict['X_valid']
            self.y_valid = data_dict['y_valid']
            self.X_test = data_dict['X_test']
            self.y_test = data_dict['y_test']

            if flatten:
                self.X_train = self.X_train.reshape(len(self.X_train), -1)
                self.X_valid = self.X_valid.reshape(len(self.X_valid), -1)
                self.X_test = self.X_test.reshape(len(self.X_test), -1)

            if normalize:

                print("Normalize dataset")
                # Reduce over the sample axis and every axis after axis 1, leaving
                # one statistic per index of axis 1 (feature / channel).
                axis = tuple([0] + [i + 2 for i in range(self.X_train.ndim - 2)])
                self.mean = np.mean(self.X_train, axis=axis, dtype=np.float32)
                self.std = np.std(self.X_train, axis=axis, dtype=np.float32)

                # if constants, set std to 1
                self.std[self.std == 0.] = 1.

                if dataset not in ['ALOI']:
                    # Align the statistics with axis 1 for broadcasting. For 2-D inputs
                    # this is the same as the plain (F,) broadcast; for N-D inputs a
                    # bare (C,) vector would be matched against the LAST axis instead.
                    stat_shape = (1, -1) + (1,) * (self.X_train.ndim - 2)
                    mean = self.mean.reshape(stat_shape)
                    std = self.std.reshape(stat_shape)
                    self.X_train = (self.X_train - mean) / std
                    self.X_valid = (self.X_valid - mean) / std
                    self.X_test = (self.X_test - mean) / std

            if quantile_transform:
                # Fit the transformer on a noised copy of the training data so the
                # original training features are transformed, not the noisy ones.
                quantile_train = np.copy(self.X_train)
                if quantile_noise:
                    stds = np.std(quantile_train, axis=0, keepdims=True)
                    noise_std = quantile_noise / np.maximum(stds, quantile_noise)
                    quantile_train += noise_std * np.random.randn(*quantile_train.shape)

                qt = QuantileTransformer(output_distribution='normal').fit(quantile_train)
                self.X_train = qt.transform(self.X_train)
                self.X_valid = qt.transform(self.X_valid)
                self.X_test = qt.transform(self.X_test)

            if normalize_target:

                print("Normalize target value")
                self.mean_y = np.mean(self.y_train, axis=0, dtype=np.float32)
                self.std_y = np.std(self.y_train, axis=0, dtype=np.float32)

                # if constants, set std to 1
                if np.ndim(self.std_y) == 0:
                    # 1-D targets: a single scalar statistic.
                    if self.std_y == 0.:
                        self.std_y = 1.
                else:
                    # Multi-column targets: one statistic per column; a bare
                    # `if std == 0` would be ambiguous for an array.
                    self.std_y[self.std_y == 0.] = 1.

                self.y_train = (self.y_train - self.mean_y) / self.std_y
                self.y_valid = (self.y_valid - self.mean_y) / self.std_y
                self.y_test = (self.y_test - self.mean_y) / self.std_y

            if in_features is not None:
                self.X_train_in, self.X_valid_in, self.X_test_in = self.X_train[:, in_features], self.X_valid[:, in_features], self.X_test[:, in_features]

            if out_features is not None:
                self.X_train_out, self.X_valid_out, self.X_test_out = self.X_train[:, out_features], self.X_valid[:, out_features], self.X_test[:, out_features]

        elif dataset in TOY_DATASETS:
            data_dict = toy_dataset(distr=dataset, **kwargs)

            self.X = data_dict['X']
            self.Y = data_dict['Y']
            if 'labels' in data_dict:
                self.labels = data_dict['labels']

        self.data_path = data_path
        self.dataset = dataset

class TorchDataset(torch.utils.data.Dataset):

    def __init__(self, *data, **options):
        
        n_data = len(data)
        if n_data == 0:
            raise ValueError("At least one set required as input")

        self.data = data
        means = options.pop('means', None)
        stds = options.pop('stds', None)
        self.transform = options.pop('transform', None)
        self.test = options.pop('test', False)
        
        if options:
            raise TypeError("Invalid parameters passed: %s" % str(options))
         
        if means is not None:
            assert stds is not None, "must specify both <means> and <stds>"

            self.normalize = lambda data: [(d - m) / s for d, m, s in zip(data, means, stds)]

        else:
            self.normalize = lambda data: data

    def __len__(self):
        return len(self.data[0])

    def __getitem__(self, idx):
        data = self.normalize([s[idx] for s in self.data])
        if self.transform:

            if self.test:
                data = sum([[self.transform.test_transform(d)] * 2 for d in data], [])
            else:
                data = sum([self.transform(d) for d in data], [])
            
        return data

**TAO**

In [None]:
import random
from copy import deepcopy
from queue import deque

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin
from sklearn.linear_model import LinearRegression
from sklearn.metrics import get_scorer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_text
from sklearn.utils import check_X_y

import pandas as pd
from sklearn.base import ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array
from sklearn.utils.multiclass import check_classification_targets
import scipy.sparse

def check_fit_arguments(model, X, y, feature_names):
    """Validate fit inputs and record feature/class metadata on ``model``.

    Sets ``model.classes_`` (classifiers only), ``model.feature_names_`` and
    ``model.n_features_in_``; returns ``(X, y, feature_names_)`` where ``X`` and
    ``y`` have passed ``check_X_y`` and ``y`` is cast to float.
    """
    if isinstance(model, ClassifierMixin):
        # Encode labels as indices into the sorted unique classes (deals with str inputs).
        classes, encoded = np.unique(y, return_inverse=True)
        model.classes_ = classes
        y = encoded
        check_classification_targets(y)

    if feature_names is not None:
        model.feature_names_ = feature_names
    elif isinstance(X, pd.DataFrame):
        model.feature_names_ = X.columns
    else:
        # Fall back to generated names X0, X1, ... using the column count.
        n_columns = len(X[0]) if isinstance(X, list) else X.shape[1]
        model.feature_names_ = ['X' + str(i) for i in range(n_columns)]

    dense_X = X.toarray() if scipy.sparse.issparse(X) else X
    X, y = check_X_y(dense_X, y)

    _n_rows, n_features = X.shape
    model.n_features_in_ = n_features
    assert len(model.feature_names_) == model.n_features_in_, 'feature_names should be same size as X.shape[1]'

    return X, y.astype(float), model.feature_names_

def check_fit_X(X):
    """Densify ``X`` if it is a scipy sparse matrix, then validate it with ``check_array``."""
    dense_X = X.toarray() if scipy.sparse.issparse(X) else X
    return check_array(dense_X)


class TaoTree(BaseEstimator):
    """Post-hoc refinement of a fitted decision tree with TAO-style node updates.

    ``fit`` grows a scikit-learn CART tree (when ``model_type == 'CART'``) and
    then sweeps over its decision nodes from the bottom up. For every decision
    node a one-feature model is trained to send each point to the child whose
    subtree predicts it better; the node's (feature, threshold) is replaced
    only if the overall ``update_scoring`` score strictly improves.

    The prediction task is chosen by the sklearn mixin of the concrete class
    (see ``TaoTreeClassifier`` / ``TaoTreeRegressor``).
    """

    # NOTE(review): the dict defaults below are shared between instances. They
    # are only ever unpacked (never mutated) inside this class, and are kept
    # as-is so that ``get_params()`` keeps reporting the same defaults.
    def __init__(self, model_type: str = 'CART',
                 n_iters: int = 20,
                 model_args: dict = {'max_leaf_nodes': 15},
                 randomize_tree=False,
                 update_scoring='accuracy',
                 min_node_samples_tao=3,
                 min_leaf_samples_tao=2,
                 node_model='stump',
                 node_model_args: dict = {},
                 reg_param: float = 1e-3,
                 weight_errors: bool = False,
                 verbose: int = 0,
                 ):
        """TAO: Alternating optimization of decision trees, with application to learning sparse oblique trees (Neurips 2018)
        https://proceedings.neurips.cc/paper/2018/hash/185c29dc24325934ee377cfda20e414c-Abstract.html
        Note: this implementation learns single-feature splits rather than oblique trees.

        Currently supports
        - given a CART tree, posthoc improve it with TAO
            - also works with HSTreeCV

        Todo
        - update bottom to top otherwise input points don't get updated
        - update leaf nodes
        - support regression
        - support FIGS
        - support error-weighting
        - support oblique trees
            - support generic models at decision node
            - support pruning (e.g. if weights -> 0, then remove a node)
        - support classifiers in leaves

        Parameters
        ----------

        model_type: str
            'CART' or 'FIGS' (only 'CART' creates a model in ``fit``)

        n_iters
            Maximum number of TAO sweeps; stops early when a sweep changes nothing

        model_args
            Keyword arguments passed to the initial CART tree

        randomize_tree
            Whether to randomize the splits of the initial tree before running TAO

        update_scoring
            Scorer name passed to ``sklearn.metrics.get_scorer``; a node update
            is kept only if this score strictly increases

        min_node_samples_tao: int
            Minimum number of samples in a node to apply tao

        min_leaf_samples_tao: int
            Minimum number of samples in a leaf to refit its value (regression only)

        node_model: str
            'stump' or 'linear'

        node_model_args
            Keyword arguments passed to every node-wise model

        reg_param
            Regularization parameter for node-wise linear model (if node_model is 'linear').
            Stored only; not read by the code in this class.

        weight_errors
            If True, weight each point by how much its error differs between
            the left and the right subtree

        verbose: int
            Verbosity level
        """
        super().__init__()
        self.model_type = model_type
        self.n_iters = n_iters
        self.model_args = model_args
        self.randomize_tree = randomize_tree
        self.update_scoring = update_scoring
        self.min_node_samples_tao = min_node_samples_tao
        self.min_leaf_samples_tao = min_leaf_samples_tao
        self.node_model = node_model
        self.node_model_args = node_model_args
        self.reg_param = reg_param
        self.weight_errors = weight_errors
        self.verbose = verbose
        self._init_prediction_task()  # decides between regressor and classifier

    def _init_prediction_task(self):
        """Set ``self.prediction_task``; the base implementation selects 'classification'.

        Subclasses may override this to change the task label.
        """
        self.prediction_task = 'classification'

    def fit(self, X, y=None, feature_names=None, sample_weight=None):
        """Fit the initial tree and refine it with up to ``n_iters`` TAO sweeps.

        Params
        ------
        X: array-like of shape (n_samples, n_features)
        y: array-like of shape (n_samples,)
        feature_names: list of str, default=None
            Names for the columns of X (generated when None).
        sample_weight: array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
            Splits that would create child nodes with net zero or negative weight
            are ignored while searching for a split in each node.

        Returns
        -------
        self

        Raises
        ------
        Warning
            Always, when the estimator is a ``RegressorMixin`` (regression is
            deliberately blocked because it is untested).
        """
        # check_fit_arguments already runs check_X_y, casts y to float and
        # stores feature_names_ / classes_ on self, so nothing is re-validated.
        X, y, feature_names = check_fit_arguments(self, X, y, feature_names)
        if isinstance(self, RegressorMixin):
            raise Warning('TAO Regression is not yet tested')

        if self.model_type == 'CART':
            if isinstance(self, ClassifierMixin):
                self.model = DecisionTreeClassifier(**self.model_args)
            elif isinstance(self, RegressorMixin):
                self.model = DecisionTreeRegressor(**self.model_args)
            self.model.fit(X, y, sample_weight=sample_weight)
            if self.verbose > 1:
                print(export_text(self.model))

        if self.randomize_tree:
            self._randomize_splits(X)
        if self.verbose:
            print('starting score', self.model.score(X, y))

        for _ in range(self.n_iters):
            num_updates = self._tao_iter_cart(X, y, self.model.tree_, sample_weight=sample_weight)
            if num_updates == 0:  # converged: a full sweep accepted no update
                break

        return self

    def _randomize_splits(self, X):
        """Shuffle the split features among decision nodes and reset thresholds to medians.

        Only decision nodes take part: leaves carry a negative sentinel in
        ``tree_.feature`` and shuffling it into a decision node would make that
        node index an invalid column.
        """
        tree = self.model.tree_
        feature = tree.feature
        threshold = tree.threshold
        decision_nodes = np.flatnonzero(tree.children_left != tree.children_right)
        feature[decision_nodes] = np.random.permutation(feature[decision_nodes])
        for node_id in decision_nodes:  # split on feature medians
            threshold[node_id] = np.median(X[:, feature[node_id]])

    def _make_node_model(self):
        """Create an unfitted one-feature model for a decision node.

        Raises ValueError for an unsupported ``node_model`` and TypeError when
        the estimator is neither a regressor nor a classifier.
        """
        if self.node_model not in ('linear', 'stump'):
            raise ValueError("node_model must be 'stump' or 'linear', got %r" % (self.node_model,))
        use_linear = self.node_model == 'linear'
        # regressor is checked first: it took precedence in the original code
        if isinstance(self, RegressorMixin):
            if use_linear:
                return LinearRegression(**self.node_model_args)
            return DecisionTreeRegressor(max_depth=1, **self.node_model_args)
        if isinstance(self, ClassifierMixin):
            if use_linear:
                return LogisticRegression(**self.node_model_args)
            return DecisionTreeClassifier(max_depth=1, **self.node_model_args)
        raise TypeError('TaoTree must be combined with ClassifierMixin or RegressorMixin')

    def _split_threshold(self, fitted_node_model):
        """Return the scalar threshold implied by a fitted node model, or None.

        None means the model does not define a usable split (zero slope or a
        non-finite boundary for the linear model).
        """
        if self.node_model == 'stump':
            return float(fitted_node_model.tree_.threshold[0])
        # coef_/intercept_ are arrays for LogisticRegression and (partly)
        # scalars for LinearRegression; ravel() handles both uniformly.
        slope = float(np.ravel(fitted_node_model.coef_)[0])
        if slope == 0:
            return None
        boundary = -float(np.ravel(fitted_node_model.intercept_)[0]) / slope
        # NOTE(review): a negative slope means the model prefers "right" for
        # small values, which a `x <= threshold -> left` split cannot express;
        # such updates are only kept if they still improve the global score.
        return boundary if np.isfinite(boundary) else None

    def _tao_iter_cart(self, X, y, tree, X_score=None, y_score=None, sample_weight=None):
        """Run one bottom-up TAO sweep over ``tree``, editing it in place.

        Params
        ------
        X: array-like of shape (n_samples, n_features)
            The input samples.
        y: array-like of shape (n_samples,)
            The target values.
        tree: DecisionTreeClassifier.tree_ or DecisionTreeRegressor.tree_
            The tree to be post-hoc improved (expected to be ``self.model.tree_``,
            because candidate updates are scored through ``self.model``).
        X_score, y_score: optional
            Data used to accept / reject an update; default to ``X`` / ``y``.
        sample_weight: array-like of shape (n_samples,), default=None
            Per-sample weights; never modified.

        Returns
        -------
        num_updates: int
            Number of decision nodes whose split was replaced.
        """
        # Tree properties (writable arrays backed by the tree's own storage)
        children_left = tree.children_left
        children_right = tree.children_right
        feature = tree.feature
        threshold = tree.threshold
        value = tree.value

        if X_score is None:
            X_score = X
        if y_score is None:
            y_score = y
        scorer = get_scorer(self.update_scoring)
        is_regressor = isinstance(self, RegressorMixin)
        is_classifier = isinstance(self, ClassifierMixin)
        weights = None if sample_weight is None else np.asarray(sample_weight, dtype=float)

        # For each node, store the path to that node, e.g. if node 3 is the
        # left child of node 1 which is the right child of node 0 then we get
        # (3, [(0, 'R'), (1, 'L')]).
        nodes_with_paths = []
        queue = deque([(0, [])])
        while queue:
            node_id, path = queue.popleft()
            nodes_with_paths.append((node_id, path))
            if children_left[node_id] != children_right[node_id]:  # decision node
                queue.append((children_left[node_id], path + [(node_id, 'L')]))
                queue.append((children_right[node_id], path + [(node_id, 'R')]))

        def rows_reaching(path):
            """Index (slice or boolean mask over X) of the rows routed along ``path``."""
            if not path:
                return slice(None)  # root: every row, without copying
            mask = np.ones(X.shape[0], dtype=bool)
            for parent_id, direction in path:
                goes_left = X[:, feature[parent_id]] <= threshold[parent_id]
                mask &= goes_left if direction == 'L' else ~goes_left
            return mask

        def leaf_output(leaf_id):
            """Prediction stored in a leaf: its value (regression) or majority class index."""
            if is_regressor:
                return float(np.ravel(value[leaf_id])[0])
            if is_classifier:
                return np.argmax(value[leaf_id])  # note value stores counts for each class
            raise TypeError('TaoTree must be combined with ClassifierMixin or RegressorMixin')

        def predict_from_node(X_sub, start_id):
            """Predictions for the rows of ``X_sub`` when routing starts at ``start_id``."""
            preds = np.zeros(X_sub.shape[0])
            pending = [(start_id, np.arange(X_sub.shape[0]))]
            while pending:
                current_id, rows = pending.pop()
                if children_left[current_id] == children_right[current_id]:
                    preds[rows] = leaf_output(current_id)
                    continue
                goes_left = X_sub[rows, feature[current_id]] <= threshold[current_id]
                pending.append((children_left[current_id], rows[goes_left]))
                pending.append((children_right[current_id], rows[~goes_left]))
            return preds

        num_updates = 0

        # Reversing BFS queue presents nodes bottom -> top one level at a time
        for node_id, path in reversed(nodes_with_paths):
            # Leaves: only regressors refit the stored value ####################
            if children_left[node_id] == children_right[node_id]:
                if is_regressor:
                    y_leaf = y[rows_reaching(path)]
                    if y_leaf.shape[0] >= self.min_leaf_samples_tao:
                        value[node_id] = np.mean(y_leaf)
                continue

            # Points being input to this decision node ##########################
            in_node = rows_reaching(path)
            X_node, y_node = X[in_node], y[in_node]
            if X_node.shape[0] < self.min_node_samples_tao:
                continue

            # Outputs for these points if they go left or right #################
            y_node_left = predict_from_node(X_node, children_left[node_id])
            y_node_right = predict_from_node(X_node, children_right[node_id])
            if node_id == 0:  # root node
                model_preds = self.model.predict(X_node)
                assert np.all(np.logical_or(model_preds == y_node_left,
                                            model_preds == y_node_right)), \
                    'actual predictions should match either predict_from_node left or right'

            # Prediction target: go left (0) / right (1) when advantageous.
            # TAO paper binarizes these (predict 0 or 1 depending on which is better)
            error_left = np.abs(y_node - y_node_left)
            error_right = np.abs(y_node - y_node_right)

            # screen out rows where going left/right has no effect
            relevant = error_left != error_right
            if relevant.sum() <= 1:  # nothing to change
                if self.verbose:
                    print('no errors to change')
                continue
            y_node_target = (error_right < error_left)[relevant].astype(int)

            if self.node_model == 'linear' and np.unique(y_node_target).size < 2:
                # Every relevant point prefers the same child: a linear model
                # has no boundary to learn (LogisticRegression would raise).
                continue

            # Optionally weight rows by the difference in error. A new array is
            # built so the caller's sample_weight is never modified in place.
            node_weights = np.ones(y_node.size) if weights is None else weights[in_node]
            if self.weight_errors:
                node_weights = node_weights * np.abs(error_right - error_left)
            node_weights = node_weights[relevant]
            X_relevant = X_node[relevant]

            # Fit a 1-variable binary model per feature, keep the best scoring one
            # Note: this could be customized (e.g. for sparse oblique trees)
            best_score = -np.inf
            best_feat_num = None
            best_threshold = None
            for feat_num in range(X.shape[1]):
                node_model = self._make_node_model()
                column = X_relevant[:, feat_num: feat_num + 1]
                node_model.fit(column, y_node_target, sample_weight=node_weights)
                score = node_model.score(column, y_node_target, sample_weight=node_weights)
                if score > best_score:
                    candidate_threshold = self._split_threshold(node_model)
                    if candidate_threshold is None:
                        continue
                    best_score = score
                    best_feat_num = feat_num
                    best_threshold = candidate_threshold
            if best_feat_num is None:  # no feature produced a usable split
                continue

            # Try the new feature / threshold; keep it only on strict improvement
            old_feat_num = feature[node_id]
            old_threshold = threshold[node_id]
            old_score = scorer(self.model, X_score, y_score)

            feature[node_id] = best_feat_num
            threshold[node_id] = best_threshold
            new_score = scorer(self.model, X_score, y_score)

            # debugging
            if self.verbose > 1:
                if old_score == new_score:
                    print('\tno change', best_feat_num, old_feat_num)
                print(f'\tscore_total {old_score:0.4f} -> {new_score:0.4f}')
            if old_score >= new_score:
                feature[node_id] = old_feat_num
                threshold[node_id] = old_threshold
            else:
                # (Track if any updates were necessary)
                num_updates += 1
                if self.verbose > 0:
                    print(f'Improved score from {old_score:0.4f} to {new_score:0.4f}')

        return num_updates

    def predict(self, X):
        """Predict with the underlying (refined) tree model."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Class probabilities from the underlying (refined) tree model."""
        return self.model.predict_proba(X)

    def score(self, X, y):
        """Default score of the underlying tree model on ``(X, y)``."""
        return self.model.score(X, y)


class TaoTreeRegressor(TaoTree, RegressorMixin):
    """TaoTree combined with sklearn's RegressorMixin (selects the regression branches).

    Note: ``TaoTree.fit`` currently raises ``Warning('TAO Regression is not yet tested')``
    for any RegressorMixin instance, so fitting this class aborts.
    """
    pass

class TaoTreeClassifier(TaoTree, ClassifierMixin):
    """TaoTree combined with sklearn's ClassifierMixin (selects the classification branches).

    With ``model_type == 'CART'``, ``fit`` starts from a ``DecisionTreeClassifier``.
    """
    pass


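**Edit the cell below to choose which data to run on: set `DATA_NAME` to `"syn"` for the synthetic data or to `"Tabular"` for the tabular datasets. Hyperparameters can be changed in the experiment cell that follows it.**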

In [None]:
DATA_NAME = "syn"  # "syn" runs the synthetic experiment; use "Tabular" to run on tabular datasets

In [None]:
def _fit_and_report_tao(data):
    """Fit a TaoTreeClassifier on ``data`` and print data shapes and train/test accuracy.

    ``data`` must expose ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Returns the fitted model.
    """
    model = TaoTreeClassifier(randomize_tree=False, weight_errors=False,
                              node_model='linear', model_args={'max_depth': 5},
                              verbose=1)
    print(data.X_train.shape)
    print(data.y_train.shape)

    model.fit(data.X_train, data.y_train)
    print('Train acc', np.mean(model.predict(data.X_train) == data.y_train))
    print('Test acc', np.mean(model.predict(data.X_test) == data.y_test))
    return model


if DATA_NAME == "syn":
    np.random.seed(13)
    random.seed(13)
    seed=365
    num_levels=4
    threshold = 0 #data seperation distance

    # One synthetic problem per entry: input dimensionality and number of points.
    data_configs = [
        {"input_dim": 20, "num_data": 40000},
        {"input_dim": 100, "num_data": 60000},
        {"input_dim": 500, "num_data": 100000}
    ]

    for config in data_configs:
        input_dim = config["input_dim"]
        num_data = config["num_data"]

        print(f"Running code for input_dim={input_dim}, num_data={num_data}")

        ((data_x, labels), (w_list, b_list, vals), stats) = data_gen_decision_tree(
                                                    dim=input_dim, seed=seed, num_levels=num_levels,
                                                    num_data=num_data)
        seed_set=seed
        w_list_old = np.array(w_list)
        b_list_old = np.array(b_list)
        # class balance of the generated labels
        print(sum(labels==1))
        print(sum(labels==0))
        print("Seed= ",seed_set)

        # 50% / 25% / 25% train / validation / test split, in generation order.
        # NOTE(review): these module-level names are not used again in this
        # cell; presumably Dataset_syn reads them -- confirm before renaming.
        num_data = len(data_x)
        num_train= num_data//2
        num_vali = num_data//4
        num_test = num_data//4
        train_data = data_x[:num_train,:]
        train_data_labels = labels[:num_train]

        vali_data = data_x[num_train:num_train+num_vali,:]
        vali_data_labels = labels[num_train:num_train+num_vali]

        test_data = data_x[num_train+num_vali :,:]
        test_data_labels = labels[num_train+num_vali :]

        # DATA_NAME is already "syn" in this branch, so it is passed through unchanged.
        data = Dataset_syn(DATA_NAME)
        m = _fit_and_report_tao(data)

else:
    # NOTE(review): this rebinds DATA_NAME from the config string to a list of
    # dataset names; kept because cells outside this view may rely on it.
#     DATA_NAME=["ADULT","bank_marketing","credit_card_defaults","gamma_telescope","rice_dataset","german_credit_data","spambase_dataset","accelerometer_gyro_dataset","swarm_behaviour"]#,"HIGGS"]
    DATA_NAME=["OpenML_Credit","OpenML_Electricity","OpenML_Pol","OpenML_House_16H","OpenML_MiniBooNE","OpenML_Eye_movements","OpenML_Diabetes130US","OpenML_Jannis","OpenML_Bioresponse","OpenML_California","OpenML_Heloc","OpenML_Covertype","bank_marketing","credit_card_defaults","gamma_telescope","rice_dataset","german_credit_data","spambase_dataset","accelerometer_gyro_dataset","swarm_behaviour"]#,"HIGGS"]

#     DATA_NAME=["ADULT"]
    for data_name in DATA_NAME:
        data = Dataset(data_name)
        print("=========================================================================================")
        m = _fit_and_report_tao(data)
