**Import**

In [None]:
#@title Imports
%reset -f
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
import copy
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from itertools import product as cartesian_prod
from sklearn.metrics import pairwise_distances

from sklearn import tree
from sklearn import cluster, mixture
import zipfile
import shutil
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestCentroid
from scipy.io import arff


np.set_printoptions(precision=4)


#@title Importing Packages
import os
import random
from copy import deepcopy
import torchvision
import torchvision.transforms as transforms

**Device**

In [None]:
# Compute device: the first CUDA GPU when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

**Synthetic Dataset**

In [None]:
#Synthetic Dataset
def set_npseed(seed):
    """Seed NumPy's global (legacy) random number generator with ``seed``."""
    np.random.seed(seed=seed)


def set_torchseed(seed):
    """Seed PyTorch's CPU and CUDA generators and request deterministic cuDNN.

    Args:
        seed: Integer seed handed to ``torch.manual_seed``,
            ``torch.cuda.manual_seed`` and ``torch.cuda.manual_seed_all``.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic cuDNN kernels on, cuDNN auto-tuner (benchmark mode) off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


#classification data

def data_gen_decision_tree(num_data=1000, dim=2, seed=0, w_list=None, b_list=None, vals=None,
                           num_levels=2, threshold=None):
    """Generate unit-norm points labelled by a complete oblique decision tree.

    The tree has ``2**num_levels - 1`` internal nodes and ``2**num_levels``
    leaves, numbered in heap order: node ``i`` has left child ``2*i + 1`` and
    right child ``2*i + 2``. A point ``x`` at internal node ``i`` goes right
    when ``w_list[i] @ x + b_list[i] > 0`` and left otherwise. The label of a
    point is ``vals`` at the index of the leaf it reaches.

    Args:
        num_data: Number of points drawn before pruning.
        dim: Input dimension.
        seed: Seed for NumPy's global random generator.
        w_list: Array of shape ``(num_internal_nodes, dim)`` with one
            hyperplane normal per internal node. Random unit vectors are
            drawn when ``None``.
        b_list: Array of shape ``(num_internal_nodes,)`` with one bias per
            internal node. Zeros when ``None``.
        vals: Array indexed by node number over the whole tree (internal
            nodes followed by leaves); only the leaf entries are read. When
            ``None``, leaves get ``node_index % 2`` and internal nodes ``-99``.
        num_levels: Depth of the tree (number of routing decisions); must be
            at least 1.
        threshold: Points whose smallest ``|w @ x + b|`` over all internal
            nodes is not strictly greater than this value are dropped. When
            ``None`` the module-level variable ``threshold`` is used if it
            exists (the notebook's original behaviour), otherwise ``0``.

    Returns:
        ``((data_x, labels), (w_list, b_list, vals), stats)`` where ``data_x``
        and ``labels`` are the pruned points and their labels, and ``stats[i]``
        is the number of pruned points that pass through node ``i``.
    """
    # Same effect as the notebook's set_npseed helper; inlined so this
    # function does not depend on other cells.
    np.random.seed(seed)

    num_internal_nodes = 2**num_levels - 1
    num_leaf_nodes = 2**num_levels
    num_nodes = num_internal_nodes + num_leaf_nodes

    if threshold is None:
        # Backward compatibility: fall back to the notebook-level setting.
        threshold = globals().get("threshold", 0.0)

    if vals is None:
        # Alternate 0/1 by node number; internal nodes carry a sentinel
        # because only leaf values are ever used as labels.
        vals = np.arange(num_nodes, dtype=np.int32) % 2
        vals[:num_internal_nodes] = -99
    else:
        vals = np.asarray(vals)

    if w_list is None:
        w_list = np.random.standard_normal((num_internal_nodes, dim))
        w_list = w_list / np.linalg.norm(w_list, axis=1)[:, None]  # unit-norm normals
    else:
        w_list = np.asarray(w_list)

    if b_list is None:
        b_list = np.zeros(num_internal_nodes)
    else:
        b_list = np.asarray(b_list)

    # Points drawn from a standard normal and projected onto the unit sphere.
    data_x = np.random.standard_normal((num_data, dim))
    data_x /= np.sqrt(np.sum(data_x**2, axis=1, keepdims=True))

    # relevant_stats[n, i] = w_i . x_n + b_i for every internal node i.
    # The bias is applied exactly once, here.
    relevant_stats = data_x @ w_list.T + b_list

    # Route every point from the root (node 0) down to a leaf.
    curr_index = np.zeros(num_data, dtype=int)
    rows = np.arange(num_data)
    for _ in range(num_levels):
        # Plain fancy indexing: unlike np.choose it has no cap on node count.
        decision_variable = relevant_stats[rows, curr_index]
        curr_index = 2 * curr_index + 1 + (decision_variable > 0)

    labels = vals[curr_index]

    # Drop points lying within `threshold` of any internal node's hyperplane.
    bound_dist = np.min(np.abs(relevant_stats), axis=1)
    keep = bound_dist > threshold
    data_x_pruned = data_x[keep]
    labels_pruned = labels[keep]
    node_signs = np.sign(relevant_stats[keep])

    # nodes_active[n, i] == 1 iff pruned point n passes through node i.
    nodes_active = np.zeros((len(data_x_pruned), num_nodes), dtype=np.int32)
    nodes_active[:, 0] = 1  # every point starts at the root
    for node in range(1, num_nodes):
        parent = (node - 1) // 2
        if node == 2 * parent + 2:
            took_branch = node_signs[:, parent] > 0  # right child
        else:
            took_branch = node_signs[:, parent] < 0  # left child
        nodes_active[:, node] = nodes_active[:, parent] * took_branch

    stats = nodes_active.sum(axis=0)
    return ((data_x_pruned, labels_pruned), (w_list, b_list, vals), stats)

In [None]:
def evaluate_algorithm():
    """Grid-search several scikit-learn classifiers and report their accuracy.

    Reads the module-level arrays ``train_data``, ``train_data_labels``,
    ``test_data`` and ``test_data_labels``; they must be defined before this
    function is called. Each classifier is tuned with 3-fold cross-validation
    on the training split only, and the best configuration (refit on the full
    training split by ``GridSearchCV``) is scored on the test split.

    Returns:
        pandas.DataFrame with one row per algorithm and the columns
        ``"Algorithm"``, ``"Best Val Accuracy"`` (mean cross-validation score
        of the best configuration) and ``"Test Accuracy"``. The table is also
        printed.
    """
    # Algorithms to evaluate
    algorithms = [
        ("SVM", SVC()),
        ("Relu Neural Networks", MLPClassifier(
            solver='adam',
            hidden_layer_sizes=(100, 50, 25),
            activation='relu',
            learning_rate_init=0.001,
            max_iter=1000,
            random_state=42)),
        ("Decision Trees", DecisionTreeClassifier()),
        ("Random Forest", RandomForestClassifier())
    ]

    # Hyperparameter grids keyed by algorithm name. Grids for algorithms that
    # are not in `algorithms` are kept so they can be enabled by adding the
    # estimator to the list above.
    param_grids = {
        "Logistic Regression": {
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'max_iter': [100, 1000]
        },
        "SVM": {
            # `degree` is only read by the 'poly' kernel, so it is not swept
            # here; add it together with 'poly' if that kernel is wanted.
            'C': [0.1, 0.5, 1, 2, 5],
            'kernel': ['linear', 'rbf', 'sigmoid'],
        },
        "Naive Bayes": {
            'var_smoothing': [1e-08, 1e-07, 1e-05, 1e-04, 1e-02]
        },
        "K-NN": {
            'n_neighbors': [1, 3, 5, 7, 9, 15],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
        },
        "Relu Neural Networks": {
            'hidden_layer_sizes': [(100, 100, 100), (500, 500, 500), (500, 500, 500, 500), (1000, 1000, 1000, 1000)],
            'learning_rate_init': [0.001, 0.005, 0.0001],
            'max_iter': [200, 500, 1000],
        },
        "Decision Trees": {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_depth': [3, 4, 5, 10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2']
        },
        "Random Forest": {
            'n_estimators': [50, 100],
            'criterion': ['gini', 'entropy'],
            'max_depth': [3, 4, 5, 10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            # 'auto' was an alias of 'sqrt' for classifiers and is rejected by
            # scikit-learn >= 1.3, so it is not listed.
            'max_features': ['sqrt', 'log2']
        },
        "Nearest Centroid": {
            'metric': ['euclidean']
        },
    }

    results = []

    for algo_name, algo in algorithms:
        print(f"Running {algo_name}...")

        param_grid = param_grids.get(algo_name)
        if param_grid is None:
            # No dedicated grid: search over the single point given by the
            # estimator's current parameters (excluding 'priors'). Each value
            # is wrapped in a list because GridSearchCV expects sequences.
            param_grid = {k: [v] for k, v in algo.get_params().items() if k != 'priors'}

        clf = GridSearchCV(algo, param_grid, cv=3, verbose=1, n_jobs=-1)
        clf.fit(train_data, train_data_labels)

        best_acc = clf.best_score_
        best_params = clf.best_params_
        print(best_params)

        # With the default refit=True, GridSearchCV has already retrained the
        # best configuration on the whole training split; predict() uses that
        # model, so no second manual fit is needed.
        y_pred = clf.predict(test_data)
        test_acc = accuracy_score(test_data_labels, y_pred)
        print(test_acc)

        results.append({"Algorithm": algo_name, "Best Val Accuracy": best_acc, "Test Accuracy": test_acc})

    results_df = pd.DataFrame(results)

    print("\nResults Table:")
    print(results_df)
    return results_df

**Change this cell to run different configs**

In [None]:
# Edit the values in this cell to train on different synthetic data with different models.

# --- Dataset characteristics ---
seed = 365
num_levels = 4
threshold = 0  # data separation distance

# --- Training configuration ---
optimizer_name = 'Adam'
modep = 'pwc'
output_dim = 1
num_epoch = 1500  # number of epochs to run

x_epoch = 1500
saved_epochs = list(range(0, 1501, 10))
weight_decay = 0.0
no_of_batches = 10  # [1,10,100]

input_dim = 20    # synthetic data input dimension
num_data = 40000  # total data points requested from the generator

print(f"Running code for input_dim={input_dim}, num_data={num_data}")

# Generate the labelled points together with the tree that produced them.
((data_x, labels), (w_list, b_list, vals), stats) = data_gen_decision_tree(
    num_data=num_data,
    dim=input_dim,
    seed=seed,
    num_levels=num_levels,
)
seed_set = seed
w_list_old = np.array(w_list)
b_list_old = np.array(b_list)

# Class balance of the generated labels.
print(np.count_nonzero(labels == 1))
print(np.count_nonzero(labels == 0))
print("Seed= ", seed_set)

# Split in generation order: first half train, next quarter validation,
# everything after that test. num_data becomes the number of points returned.
num_data = len(data_x)
num_train = num_data // 2
num_vali = num_data // 4
num_test = num_data // 4

train_data, vali_data, test_data = np.split(
    data_x, [num_train, num_train + num_vali])
train_data_labels, vali_data_labels, test_data_labels = np.split(
    labels, [num_train, num_train + num_vali])

evaluate_algorithm()