In [None]:
#@title Imports
%reset -f
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
import copy
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from itertools import product as cartesian_prod
from sklearn.metrics import pairwise_distances
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import cluster, mixture
import zipfile
import shutil
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestCentroid
from scipy.io import arff
from sklearn.metrics import make_scorer
np.set_printoptions(precision=4)


#@title Importing Packages
import os
import random
from copy import deepcopy
import torchvision
import torchvision.transforms as transforms

In [None]:
# Pick the compute device once for the notebook: the first CUDA GPU when
# PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

In [None]:
#@title Synthetic data
def set_npseed(seed):
    """Seed NumPy's global random number generator with ``seed``."""
    np.random.seed(seed=seed)


def set_torchseed(seed):
    """Seed PyTorch's CPU and CUDA generators and request deterministic cuDNN.

    The same ``seed`` is handed to the CPU generator, the current CUDA device
    and all CUDA devices; cuDNN is switched to deterministic mode with
    benchmarking turned off.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


#classification data

def data_gen_decision_tree(num_data=1000, dim=2, seed=0, w_list=None, b_list=None, vals=None, num_levels=2,
                           threshold=None):
    """Generate unit-norm points labelled by a complete oblique decision tree.

    The tree has ``2**num_levels - 1`` internal nodes and ``2**num_levels``
    leaves, numbered in heap order (children of node ``k`` are ``2k+1`` and
    ``2k+2``). At internal node ``k`` a point ``x`` goes right when
    ``w_list[k] @ x + b_list[k] > 0`` and left otherwise.

    Parameters
    ----------
    num_data : number of points drawn before pruning.
    dim : dimensionality of the points.
    seed : seed passed to ``np.random.seed`` before anything is sampled.
    w_list : array of shape (num_internal_nodes, dim) with one hyperplane
        normal per internal node. Sampled as random unit vectors when None.
    b_list : array of shape (num_internal_nodes,) with one bias per internal
        node. Defaults to zeros.
    vals : array with one entry per node (internal nodes first, then leaves);
        only the leaf entries are used as labels. By default internal nodes
        hold -99 and leaf ``k`` holds ``k % 2``.
    num_levels : depth of the tree (number of decisions per point).
    threshold : points whose smallest ``|w @ x + b|`` over all internal nodes
        is not strictly greater than this value are dropped. When None, the
        module-level ``threshold`` is used if it exists, otherwise 0.

    Returns
    -------
    ((data_x_pruned, labels_pruned), (w_list, b_list, vals), stats) where
    ``stats[k]`` is the number of retained points that reach node ``k``.
    """
    np.random.seed(seed)

    num_internal_nodes = 2**num_levels - 1
    num_leaf_nodes = 2**num_levels
    num_nodes = num_internal_nodes + num_leaf_nodes

    if threshold is None:
        # Backward compatible: the margin used to be read from a notebook global.
        threshold = globals().get("threshold", 0)

    if vals is None:
        # Label by node-number parity; internal nodes get a -99 sentinel
        # because only leaf values are ever used as labels.
        vals = np.arange(0, num_nodes, 1, dtype=np.int32) % 2
        vals[:num_internal_nodes] = -99
    else:
        vals = np.asarray(vals)

    if w_list is None:
        w_list = np.random.standard_normal((num_internal_nodes, dim))
        w_list = w_list / np.linalg.norm(w_list, axis=1)[:, None]  # unit-norm normals
    else:
        w_list = np.asarray(w_list)
    if b_list is None:
        # Also covers a caller that supplies w_list without b_list.
        b_list = np.zeros(num_internal_nodes)
    else:
        b_list = np.asarray(b_list)

    # Points are drawn from a standard normal and projected onto the unit sphere.
    data_x = np.random.standard_normal((num_data, dim))
    data_x /= np.sqrt(np.sum(data_x**2, axis=1, keepdims=True))

    # node_scores[i, k] = w_k @ x_i + b_k. The bias is added exactly once, so
    # the traversal and the pruning below use the same hyperplanes.
    node_scores = data_x @ w_list.T + b_list

    # Walk every point from the root (index 0) down to a leaf.
    curr_index = np.zeros(num_data, dtype=int)
    rows = np.arange(num_data)
    for _ in range(num_levels):
        # Plain integer indexing has no limit on the number of nodes
        # (np.choose caps the number of choices).
        decision_variable = node_scores[rows, curr_index]
        # Left child is 2k+1, right child is 2k+2.
        curr_index = 2 * curr_index + 1 + (decision_variable > 0)

    labels = vals[curr_index]

    # Drop points that are too close to any internal node's hyperplane.
    bound_dist = np.min(np.abs(node_scores), axis=1)
    keep = bound_dist > threshold
    data_x_pruned = data_x[keep]
    labels_pruned = labels[keep]
    side = np.sign(node_scores[keep])  # +1 / -1 side of every hyperplane

    # nodes_active[i, k] == 1 iff retained point i passes through node k.
    nodes_active = np.zeros((len(data_x_pruned), num_nodes), dtype=np.int32)
    nodes_active[:, 0] = 1
    for node in range(1, num_nodes):
        parent = (node - 1) // 2
        is_right_child = node - 2 * parent - 1  # 0 = left child, 1 = right child
        if is_right_child:
            reached = side[:, parent] > 0
        else:
            reached = side[:, parent] < 0
        nodes_active[:, node] = nodes_active[:, parent] * reached

    stats = nodes_active.sum(axis=0)
    return ((data_x_pruned, labels_pruned), (w_list, b_list, vals), stats)

In [None]:
class DLGN_FC(nn.Module):
    """Fully connected DLGN: a gating network that modulates a value network.

    Both networks are stacks of bias-free ``nn.Linear`` layers. For every
    hidden layer the value activation is multiplied element-wise by
    ``sigmoid(beta * gate_score)`` of the matching gating layer; the final
    value layer has no gate.

    Parameters
    ----------
    input_dim, output_dim : sizes of the input and of the final value layer.
    num_hidden_nodes : sequence with the width of each hidden layer
        (None or empty means no hidden layer).
    beta : multiplier applied to the gate scores before the sigmoid.
    dlgn_mode : with ``'dlgn_sf'`` every gating layer reads the raw input;
        with any other value the gating layers are stacked on each other.
    mode : with ``'pwc'`` the value network is fed a tensor of ones shaped
        like the input; with any other value it is fed the input itself.
    """

    def __init__(self, input_dim=None, output_dim=None, num_hidden_nodes=None, beta=30, dlgn_mode='dlgn_sf', mode='pwc'):
        super(DLGN_FC, self).__init__()
        # Avoid a shared mutable default and accept any sequence of widths.
        num_hidden_nodes = [] if num_hidden_nodes is None else list(num_hidden_nodes)
        self.num_hidden_layers = len(num_hidden_nodes)
        self.beta = beta  # Soft gating parameter
        self.dlgn_mode = dlgn_mode
        self.mode = mode
        self.num_nodes = [input_dim] + num_hidden_nodes + [output_dim]
        self.gating_layers = nn.ModuleList()
        self.value_layers = nn.ModuleList()

        # Layers are created in the order gate_0, value_0, gate_1, value_1, ...
        # so that weight initialisation consumes the RNG in a fixed order.
        for i in range(self.num_hidden_layers + 1):
            if i != self.num_hidden_layers:
                if self.dlgn_mode == 'dlgn_sf':
                    gate_in = self.num_nodes[0]
                else:
                    gate_in = self.num_nodes[i]
                self.gating_layers.append(nn.Linear(gate_in, self.num_nodes[i + 1], bias=False))
            self.value_layers.append(nn.Linear(self.num_nodes[i], self.num_nodes[i + 1], bias=False))

    def set_parameters_with_mask(self, to_copy, parameter_masks):
        """Copy parameters from ``to_copy`` into this network, in place.

        ``self`` and ``to_copy`` must be DLGN_FC objects with the same
        architecture. ``parameter_masks`` maps parameter names to tensors of
        the parameter's shape: where a mask is > 0 the entry is taken from
        ``to_copy``, elsewhere this network's value is kept. A parameter
        without a mask is copied entirely.
        """
        own_params = dict(self.named_parameters())
        with torch.no_grad():
            for name, source in to_copy.named_parameters():
                target = own_params[name]
                source = source.detach().to(target.device)
                if name in parameter_masks:
                    selected = (parameter_masks[name] > 0).to(target.device)
                    target[selected] = source[selected]
                else:
                    # Write into the parameter itself; rebinding a local
                    # name would leave this network unchanged.
                    target.copy_(source)

    def return_gating_functions(self):
        """Return one effective gating weight matrix per hidden layer.

        Each matrix maps the raw input to that layer's gate scores: in
        ``'dlgn_sf'`` mode it is the layer's own weight, otherwise it is the
        product of the gating weights up to and including that layer. The
        tensors are detached copies; the layers have no biases.
        """
        effective_weights = []
        for i in range(self.num_hidden_layers):
            curr_weight = self.gating_layers[i].weight.detach().clone()
            if self.dlgn_mode == 'dlgn_sf' or i == 0:
                effective_weights.append(curr_weight)
            else:
                effective_weights.append(torch.matmul(curr_weight, effective_weights[-1]))
        return effective_weights

    def forward(self, x):
        """Run the network on ``x``.

        Returns ``(values, gate_scores)``: ``values`` is a list of length
        ``num_hidden_layers + 2`` (value-network input, gated hidden
        activations, final output) and ``gate_scores`` is a list of length
        ``num_hidden_layers + 1`` (the input followed by each layer's raw
        gate scores).
        """
        # Follow whichever device the parameters live on instead of assuming cuda:0.
        device = next(self.parameters()).device
        x = x.to(device)
        gate_scores = [x]

        if self.mode == 'pwc':
            values = [torch.ones(x.shape, device=device)]
        else:
            values = [x]

        for i in range(self.num_hidden_layers):
            gate_input = x if self.dlgn_mode == 'dlgn_sf' else gate_scores[-1]
            gate_scores.append(self.gating_layers[i](gate_input))
            curr_gate_on_off = torch.sigmoid(self.beta * gate_scores[-1])
            values.append(self.value_layers[i](values[-1]) * curr_gate_on_off)
        values.append(self.value_layers[self.num_hidden_layers](values[-1]))
        return values, gate_scores

In [None]:
#@title Train DLGN model
def train_dlgn (DLGN_obj, train_data_curr,vali_data_curr,test_data_curr,
                train_labels_curr,test_labels_curr,vali_labels_curr,num_epoch=1,
                parameter_mask=None):
    """Train a DLGN with Adam and return the checkpoint with the fewest validation errors.

    The scalar network output ``v`` is turned into the two-class logits
    ``(-v, v)`` and trained with cross-entropy against 0/1 labels.

    Reads the notebook globals ``seed``, ``lr``, ``no_of_batches`` and
    ``saved_epochs`` and calls the module-level ``set_torchseed``. The number
    of epochs is ``saved_epochs[-1] + 1``; ``num_epoch``, ``test_data_curr``
    and ``test_labels_curr`` are accepted for interface compatibility but are
    not used.

    Parameters
    ----------
    DLGN_obj : the initial network; it is moved to the training device and
        updated in place.
    train_data_curr, vali_data_curr : array-likes of training / validation inputs.
    train_labels_curr, vali_labels_curr : matching 0/1 labels.
    parameter_mask : dict mapping parameter names to tensors that multiply the
        gradient of that parameter before each optimiser step. A parameter
        whose name is missing is treated as all ones (updated normally).

    Returns
    -------
    (train_losses, DLGN_obj_return, DLGN_obj_store, losses, debug_models):
    ``DLGN_obj_return`` is a CPU copy of the best checkpoint (the initial
    network if validation error never improves) and ``losses`` holds the
    full-training-set loss after every epoch. The other three entries are
    always empty lists.
    """
    print("train_data_curr inside train_dlgn:",train_data_curr.shape)
    set_torchseed(seed)

    if parameter_mask is None:
        parameter_mask = {}

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    DLGN_obj.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(DLGN_obj.parameters(), lr=lr)

    # Convert once and keep everything on the training device.
    train_data_torch = torch.Tensor(train_data_curr).to(device)
    vali_data_torch = torch.Tensor(vali_data_curr).to(device)
    train_labels_torch = torch.tensor(train_labels_curr, dtype=torch.int64).reshape(-1).to(device)
    vali_labels_torch = torch.tensor(vali_labels_curr, dtype=torch.int64).reshape(-1).to(device)

    # Device-side masks, built once without touching the caller's dict.
    device_masks = {
        name: parameter_mask[name].to(device)
        for name, _ in DLGN_obj.named_parameters()
        if name in parameter_mask
    }

    num_batches = no_of_batches
    num_train = len(train_data_curr)
    batch_size = max(1, num_train // num_batches)  # never a zero range() step

    losses = []
    train_losses = []
    DLGN_obj_store = []
    debug_models = []
    best_vali_error = len(vali_labels_curr)
    # Fallback so there is always something to return, even if no epoch
    # brings the validation error below its maximum.
    DLGN_obj_return = deepcopy(DLGN_obj)

    for epoch in tqdm(range(saved_epochs[-1]+1)):  # loop over the dataset multiple times
        # Only full batches are used; a trailing partial batch is skipped.
        for batch_start in range(0, num_train - batch_size + 1, batch_size):
            batch_end = batch_start + batch_size
            optimizer.zero_grad()
            inputs = train_data_torch[batch_start:batch_end]
            targets = train_labels_torch[batch_start:batch_end]
            values, _ = DLGN_obj(inputs)
            outputs = torch.cat((-1*values[-1], values[-1]), dim=1)
            loss = criterion(outputs, targets)
            loss.backward()
            for name, param in DLGN_obj.named_parameters():
                if name in device_masks and param.grad is not None:
                    param.grad *= device_masks[name]
            optimizer.step()

        # Evaluation only: no autograd graph is needed here.
        with torch.no_grad():
            train_values, _ = DLGN_obj(train_data_torch)
            train_preds = train_values[-1]
            train_loss = criterion(torch.cat((-1*train_preds, train_preds), dim=1), train_labels_torch)
            losses.append(train_loss.cpu().detach().clone().numpy())

            vali_values, _ = DLGN_obj(vali_data_torch)
            vali_logits = torch.cat((-1*vali_values[-1], vali_values[-1]), dim=1)
            vali_preds = torch.argmax(vali_logits, dim=1)
            vali_error = torch.sum(vali_labels_torch != vali_preds).item()

        if vali_error < best_vali_error:
            DLGN_obj_return = deepcopy(DLGN_obj)
            best_vali_error = vali_error

    DLGN_obj_return.to(torch.device('cpu'))
    return train_losses, DLGN_obj_return, DLGN_obj_store, losses, debug_models

**DLGN Decision Tree**

In [None]:
class Node:
    ''' A single node of the decision tree.

    Decision nodes carry the splitting hyperplane (`feature_vals`,
    `feature_bias`), their two children and the information gain of the split;
    leaf nodes only carry `value`. Every field defaults to None.
    '''

    def __init__(self, feature_vals=None,feature_bias=None, left=None, right=None, info_gain=None, value=None):
        # Fields used by decision nodes.
        self.feature_vals, self.feature_bias = feature_vals, feature_bias
        self.left, self.right = left, right
        self.info_gain = info_gain

        # Field used by leaf nodes.
        self.value = value
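    # 2. Sign canonicalisation of the weight matrix (e.g. 1500x500): look at the
    #    first coordinate (index 0) of every row; if it is negative, multiply the
    #    entire row by -1, then run the experiments on the result. The original
    #    note says that data in the "4th coordinate" then become "1st coordinate"
    #    (presumably quadrants - TODO confirm),
    #    so every hyperplane is represented by +w rather than -w and the
    #    clustering of centres only ever happens on the positive representatives.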
def process_array(arr):
    """Negate, in place, every row of ``arr`` whose first entry is negative.

    The input array itself is modified and returned.
    """
    negative_first = arr[:, 0] < 0
    arr[negative_first] = arr[negative_first] * -1
    return arr
    
class DecisionTreeClassifierDT():
    def __init__(self, min_samples_split=2, max_depth=2):
        ''' constructor '''

        # initialize the root of the tree 
        self.root = None

        # stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def build_tree(self, dataset,dataset_val,eps,min_samples,num_hidden_nodes, curr_depth=0,curr_node=0):
        ''' Recursively grow the subtree for `dataset` and return its root Node.

        `dataset` holds the features with the label in the last column.
        `curr_node` is the heap index of the node being built (children of
        node k are 2k+1 and 2k+2). A split is attempted only while the node
        has at least `min_samples_split` samples and `curr_depth` does not
        exceed `max_depth`; it is kept only if its information gain is
        positive. Otherwise the node becomes a leaf.
        '''
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        progress = (
            ("curr_depth:", curr_depth),
            ("curr_node:", curr_node),
            ("num_samples:", num_samples),
            ("num_features:", num_features),
        )
        for caption, number in progress:
            print(caption, number)

        may_split = num_samples>=self.min_samples_split and curr_depth<=self.max_depth
        if may_split:
            best_split = self.get_best_split(dataset,dataset_val,eps,min_samples,curr_depth,curr_node,num_hidden_nodes)
            gain = best_split["info_gain"]
            print("best_info_gain:",gain)
            if gain>0:
                # Arguments shared by both recursive calls.
                shared = (dataset_val, eps, min_samples, num_hidden_nodes, curr_depth+1)
                left_subtree = self.build_tree(best_split["dataset_left"], *shared, 2*curr_node+1)
                right_subtree = self.build_tree(best_split["dataset_right"], *shared, 2*curr_node+2)
                return Node(best_split["feature_vals"], best_split["feature_bias"],
                            left_subtree, right_subtree, gain)

        # No split was made: this node is a leaf.
        leaf_value = self.calculate_leaf_value(Y)
        print("leaf_value:",leaf_value)
        return Node(value=leaf_value)

    def get_best_split(self, dataset,dataset_val,eps,min_samples,curr_depth,curr_node,num_hidden_nodes):
        ''' function to find the best split '''

        # dictionary to store the best split
        best_split = {}
        max_info_gain = -float("inf")

        X_train, Y_train = dataset[:,:-1], dataset[:,-1]
        X_valid,Y_valid = dataset_val[:,:-1],dataset_val[:,-1]
        print("X_train:",X_train.shape)
#         train_data_dt, vali_data_dt, train_data_labels_dt, vali_data_labels_dt = train_test_split(X, Y, test_size=.1, random_state=41)
        set_npseed(seed)
        set_torchseed(seed)
        ##DLGN clustering of wts

        set_torchseed(6675)
        DLGN_init= DLGN_FC(input_dim=input_dim, output_dim=1, num_hidden_nodes=num_hidden_nodes, beta=beta, dlgn_mode=dlgn_mode)
        train_parameter_masks=dict()
        for name,parameter in DLGN_init.named_parameters():
            if "val" in name:
                train_parameter_masks[name]=torch.ones_like(parameter) #*0.001 # Updating all value network layers
            if "gat" in name:
                train_parameter_masks[name]=torch.ones_like(parameter)


                # train_parameter_masks[name][:num_neurons_set] *= 0.
            train_parameter_masks[name].to(device)

        set_torchseed(5000)
        train_losses, DLGN_obj_final, DLGN_obj_store, losses , debug_models= train_dlgn(train_data_curr=X_train, vali_data_curr=X_valid,test_data_curr=X_valid, train_labels_curr=Y_train,
                                                    test_labels_curr=Y_valid,vali_labels_curr=Y_valid,DLGN_obj=deepcopy(DLGN_init),parameter_mask=train_parameter_masks)
        
#         print("Plotting train_losses",train_losses)
#         plt.plot(train_losses)
#         print("Plotting losses",losses)
#         plt.plot(losses)
        torch.cuda.empty_cache() 

        w_list = np.concatenate((w_list_old,-w_list_old),axis=0)
        effective_weights = DLGN_obj_final.return_gating_functions()
        wts_list=[] #DLGN model effective wts
        for layer in range(len(effective_weights)):
#             if layer!=0: #and layer!=1 and layer!=2:
            wts =  np.array(effective_weights[layer].cpu().data.detach().numpy())
#             wts /= np.linalg.norm(wts, axis=1)[:,None]
            wts_list.append(wts)
        wts_list = np.concatenate(wts_list)
        
        ### Sorting based on NORMS of the vectors
        
        # Calculate norms of each vector
        norms = np.linalg.norm(wts_list, axis=1)
#         print ("norms:",norms)

        # Sort indices based on norms in descending order
        sorted_indices = np.argsort(norms)[::-1]
        print("sorted_indices:",norms[sorted_indices][:40])

        # Take the best 100 vectors
        best_100_vectors_descending = wts_list[sorted_indices[:]]
        # Calculate norms of each vector
        norms = np.linalg.norm(best_100_vectors_descending, axis=1)

        # Normalize each vector by dividing it by its norm
        wts_list = best_100_vectors_descending / norms[:, np.newaxis]

        # Now, best_100_vectors contains the top 100 vectors sorted by their norms

#         norms_dt_hplane = np.linalg.norm(w_list, axis=1)
#         print("norms_dt_hplane:",norms_dt_hplane)
        
        
        wts_list = process_array(wts_list)
        print("wts_list shape:",wts_list.shape)
        
        pd1 = pairwise_distances(w_list,wts_list)
        print("Pairwise Distance of all top k hyperplanes to each labelling func hyperplane \n ")
        print("pd1:",pd1)

        min_dis = np.inf
        eps_best = 0
        min_samples_best = 0
        label_max_clus_best=0
        if curr_node ==0:
            eps = .2
            min_samples = 6
        if curr_node ==1:
            eps = .3
            min_samples = 6
        if curr_node ==2:
            eps = .3
            min_samples = 6
#         if curr_node ==3:
#             eps = .3
#             min_samples = 15
#         if curr_node ==5:
#             eps = .3
#             min_samples = 16
#         if curr_node ==13:
#             eps = .3
#             min_samples = 7
        
        core_samples, dbscan_labels = cluster.dbscan(wts_list, eps=eps, metric='euclidean', min_samples=min_samples)
        num_clusters = max(dbscan_labels)+1
        if num_clusters>0:
            print("eps:",eps,"min_samples:",min_samples)
            cluster_centres = []
            cluster_no = []
            for i in range(num_clusters):
                cluster_centres.append(wts_list[dbscan_labels==i].mean(axis=0))
                cluster_no.append(len(wts_list[dbscan_labels==i]))
            cluster_centres = np.array(cluster_centres)
            label_max_clus = np.argmax(cluster_no)
            feature_vals = cluster_centres[label_max_clus].reshape(1,-1)
            feature_bias = 0
            
            print("cluster_centres_size:",len(cluster_centres))
            print("DT data hyperplane shape",w_list.shape)
            pd2 = pairwise_distances(w_list,cluster_centres)
            print("Pairwise Distance of all halfspace cluster centres to each labelling func hyperplane \n ")
            print("pd2:",pd2)
            print("No of wts in each cluster:",cluster_no)
            print("cluster_no[label_max_clus]:",cluster_no[label_max_clus])
            print("label_max_clus:",label_max_clus)
            
            pd3 = pairwise_distances(w_list,feature_vals)
            print("Distance of closest halfspace cluster to each labelling func hyperplane \n", pd3.min(axis=1)[:])
            
             # get current split
            dataset_left, dataset_right = self.split(dataset, feature_vals,feature_bias)
            # check if childs are not null
            print("dataset_left:",len(dataset_left))
            print("dataset_right:",len(dataset_right))
            # check if childs are not null
            
            if len(dataset_left)>0 and len(dataset_right)>0:
                y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                print("no of 0s in left:",np.count_nonzero(left_y == 0))
                print("no of 1s in left:",np.count_nonzero(left_y == 1))
                print("no of 0s in right:",np.count_nonzero(right_y == 0))
                print("no of 1s in right:",np.count_nonzero(right_y == 1))
                # compute information gain
                curr_info_gain = self.information_gain(y, left_y, right_y, "gini")
                print("curr_info_gain:",curr_info_gain)
                # update the best split if needed

                best_split["feature_vals"] = feature_vals
                best_split["feature_bias"] = feature_bias
                best_split["dataset_left"] = dataset_left
                best_split["dataset_right"] = dataset_right
                best_split["info_gain"] = curr_info_gain
                
                max_info_gain = curr_info_gain
                eps_best = eps
                min_samples_best = min_samples
            else:
                best_split["feature_vals"] = 0
                best_split["feature_bias"] = 0
                best_split["dataset_left"] = 0
                best_split["dataset_right"] = 0
                best_split["info_gain"] = 0
                    
        else:
            # dictionary to store the best split
            print("============LAST LEVEL===========================")
           
           
#             # Create a logistic regression model
            model = LogisticRegression()
            # Check the number of unique classes
            num_classes_y = len(np.unique(Y_train))
            if num_classes_y<2:
                if Y_train[-1]==0:
                    Y_train[-1]=1
                else:
                    Y_train[-1]=0
            # Fit the model to the data
            model.fit(X_train, Y_train)

            # Get the model coefficients (weights)
            weights = model.coef_
            bias = model.intercept_
            weights = weights / np.linalg.norm(weights, axis=1)[:, None]
            print("Model Weights:")
            print(weights)
            
            feature_vals=weights[0]
            feature_bias=0
            dataset_left, dataset_right = self.split(dataset, feature_vals,feature_bias)
            print("dataset_left:",len(dataset_left))
            print("dataset_right:",len(dataset_right))
            # check if childs are not null
            if len(dataset_left)>0 and len(dataset_right)>0:
                y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                print("no of 0s in left:",np.count_nonzero(left_y == 0))
                print("no of 1s in left:",np.count_nonzero(left_y == 1))
                print("no of 0s in right:",np.count_nonzero(right_y == 0))
                print("no of 1s in right:",np.count_nonzero(right_y == 1))
                # compute information gain
                curr_info_gain = self.information_gain(y, left_y, right_y, "gini")
                print("curr_info_gain:",curr_info_gain)
                # update the best split if needed
                
                best_split["feature_vals"] = feature_vals
                best_split["feature_bias"] = feature_bias
                best_split["dataset_left"] = dataset_left
                best_split["dataset_right"] = dataset_right
                best_split["info_gain"] = curr_info_gain
                max_info_gain = curr_info_gain
            else:
                best_split["feature_vals"] = 0
                best_split["feature_bias"] = 0
                best_split["dataset_left"] = 0
                best_split["dataset_right"] = 0
                best_split["info_gain"] = 0
        print("best_split_info_gain:",best_split["info_gain"])
        return best_split

        # return best split
        return best_split

    def split(self, dataset, feature_vals,feature_bias):
        ''' function to split the data '''
        dataset_left = np.array([row for row in dataset if (row[:-1]@feature_vals.T<0)])
        dataset_right = np.array([row for row in dataset if (row[:-1]@feature_vals.T>=0)])
        return dataset_left, dataset_right

    def information_gain(self, parent, l_child, r_child, mode="entropy"):
        ''' function to compute information gain '''

        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        if mode=="gini":
            gain = self.gini_index(parent) - (weight_l*self.gini_index(l_child) + weight_r*self.gini_index(r_child))
        else:
            gain = self.entropy(parent) - (weight_l*self.entropy(l_child) + weight_r*self.entropy(r_child))
        return gain

    def entropy(self, y):
        ''' function to compute entropy '''

        class_labels = np.unique(y)
        entropy = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            entropy += -p_cls * np.log2(p_cls)
        return entropy

    def gini_index(self, y):
        ''' function to compute gini index '''

        class_labels = np.unique(y)
        gini = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            gini += p_cls**2
        return 1 - gini

    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''

        Y = list(Y)
        return max(Y, key=Y.count)

    def print_tree(self, tree=None, indent=" "):
        ''' function to print the tree '''

        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)

    def fit(self, X, Y,X_val,Y_val,eps,min_samples,num_hidden_nodes):
        ''' Build the tree from training data and store its root in `self.root`.

        The labels are appended to the features as a last column (so `Y` and
        `Y_val` must be 2-D column arrays) before build_tree is called on the
        training and validation tables.
        '''

        def with_labels(features, labels):
            return np.concatenate((features, labels), axis=1)

        train_table = with_labels(X, Y)
        vali_table = with_labels(X_val, Y_val)
        self.root = self.build_tree(train_table,vali_table,eps,min_samples,num_hidden_nodes)

    def predict(self, X):
        ''' function to predict new dataset '''

        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions

    def make_prediction(self, x, tree):
        ''' function to predict a single data point '''

        if tree.value!=None: return tree.value
        feature_vals = tree.feature_vals
        feature_bias = tree.feature_bias
        if x@feature_vals.T<0:#+feature_bias<0:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)


In [None]:
# Experiment configuration. Several of these names are read as globals by the
# functions defined above: `seed`, `threshold`, `saved_epochs`, `no_of_batches`.
seed=365
num_levels=4   # depth of the synthetic labelling tree
threshold = 0  # data separation distance: minimum |w.x + b| a sample must keep from every hyperplane

# NOTE(review): optimizer_name, modep, output_dim and weight_decay are not
# referenced by the code visible in this part of the notebook - confirm
# whether they are still needed.
optimizer_name ='Adam'
modep='pwc'
output_dim=1
num_epoch=256

# Epoch checkpoints at ten evenly spaced steps. train_dlgn runs
# saved_epochs[-1] + 1 epochs, i.e. 251 (not 256) with the values here.
# max(1, ...) keeps the step valid when num_epoch < 10.
saved_epochs = list(range(0,num_epoch+1,max(1, num_epoch//10)))
weight_decay=0.0
no_of_batches=10 #[1,10,100]

# One synthetic dataset per entry: input dimension and number of samples drawn.
data_configs = [
    {"input_dim": 20, "num_data": 40000},
    {"input_dim": 100, "num_data": 60000},
    {"input_dim": 500, "num_data": 100000}
]

# Code block to run for each dictionary
for config in data_configs:
    input_dim = config["input_dim"]
    num_data = config["num_data"]

    
    
    ((data_x, labels), (w_list, b_list, vals), stats) = data_gen_decision_tree(
                                                dim=input_dim, seed=seed, num_levels=num_levels,
                                                num_data=num_data)
    seed_set=seed
    w_list_old = np.array(w_list)
    b_list_old = np.array(b_list)
    print(sum(labels==1))
    print(sum(labels==0))
#     print(labels.shape)
#     print(vals)
#     print(stats)
    print("Seed= ",seed_set)
    num_data = len(data_x)
    num_train= num_data//2
    num_vali = num_data//4
    num_test = num_data//4
    train_data = data_x[:num_train,:]
    train_data_labels = labels[:num_train]

    vali_data = data_x[num_train:num_train+num_vali,:]
    vali_data_labels = labels[num_train:num_train+num_vali]

    test_data = data_x[num_train+num_vali :,:]
    test_data_labels = labels[num_train+num_vali :]
    
    
    
    #DLGN DT code
    print("DLGN Decision Tree =================================================")

#             num_epoch=256
    dlgn_mode = 'dlgn'
    beta=3
    lr= 0.001
    X = train_data
    print(X.shape)
    Y = train_data_labels.reshape(-1,1)
    print(Y.shape)
    X_val = vali_data
    print(X_val.shape)
    Y_val = vali_data_labels.reshape(-1,1)
    print(Y_val.shape)

    eps_list=[0.3]
    min_samples_list=[5]
    num_hidden_nodes=[500,500,500]
    max_depth_list = [3]
    results_list = []
    for eps in eps_list:
        print("================= EPS: ",eps,"=================================")
        for min_samples in min_samples_list:
            print("================= min_samples: ",min_samples,"=================================")
            for max_depth in max_depth_list:
                print("================= max_depth: ",max_depth,"=================================")
                classifier = DecisionTreeClassifierDT(min_samples_split=int(0.01*X.shape[0]), max_depth=max_depth)
                classifier.fit(X,Y,X_val,Y_val,eps,min_samples,num_hidden_nodes)

                Y_pred = classifier.predict(test_data)
                Test_accuracy = accuracy_score(test_data_labels.reshape(-1,1), Y_pred)
                print("Test Accuracy:", Test_accuracy)


                # Append the result to the list
                results_list.append({
                    "eps": eps,
                    "min_samples": min_samples,
                    "max_depth": max_depth,
                    "Test Accuracy": round(Test_accuracy, 5)
                })
                
    # Create Pandas DataFrames
    df_dlgn = pd.DataFrame(results_list)

    # Set pandas display options to show more rows and columns
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    print(f"Running code for input_dim={input_dim}, num_data={num_data}")
    # Print the full tables
    print("\nDLGN MODE: dlgn\n")
    print(df_dlgn)

    #DLGN DT code ends
    


#     print("---" * 30)
#     # Create lists to store results for each dlgn_mode and beta
#     results_list_dlgn = []
#     results_list_dlgn_sf = []
    
#     for dlgn_mode in ('dlgn','dlgn_sf'):
#         print(f"\n{'='*30} DLGN MODE: {dlgn_mode} {'='*30}\n")
#         for beta in (3,10,30):
#             print(f"\n{'='*20} BETA: {beta} {'='*20}\n")
#             # Create lists to store results for the current dlgn_mode and beta
#             results_list = []
#             for lr in (0.001,0.01,0.02):
#                 print("LEARNING RATE ====================", lr)

              
#                 #DLGN DT code
#                 print("DLGN Decision Tree =================================================")

#     #             num_epoch=256
#                 X = train_data
#                 print(X.shape)
#                 Y = train_data_labels.reshape(-1,1)
#                 print(Y.shape)

#                 eps=[0.2]
#                 min_samples=[5]
#                 num_hidden_nodes=[500,500,500,500]
#                 classifier = DecisionTreeClassifierDT(min_samples_split=1, max_depth=3)
#                 classifier.fit(X,Y,eps,min_samples,num_hidden_nodes)

#                 Y_pred = classifier.predict(test_data) 
#                 print(accuracy_score(test_data_labels, Y_pred))
#                 #DLGN DT code ENDs
           
#                 print("========================================================================")
#                 layer_configs = [
#                     {"num_hidden_layers": 3, "num_hidden_nodes": [10,10,10]},
#                     {"num_hidden_layers": 3, "num_hidden_nodes": [20,20,20]},
#                     {"num_hidden_layers": 3, "num_hidden_nodes": [50,50,50]},
#                     {"num_hidden_layers": 4, "num_hidden_nodes": [10,10,10,10]},
#                     {"num_hidden_layers": 4, "num_hidden_nodes": [20,20,20,20]},
#                     {"num_hidden_layers": 4, "num_hidden_nodes": [50,50,50,50]},
#                     {"num_hidden_layers": 5, "num_hidden_nodes": [10,10,10,10,10]},
#                     {"num_hidden_layers": 5, "num_hidden_nodes": [20,20,20,20,20]},
#                     {"num_hidden_layers": 5, "num_hidden_nodes": [50,50,50,50,50]}
                    
                    
# #                     {"num_hidden_layers": 3, "num_hidden_nodes": [100,100,100]},
# #                     {"num_hidden_layers": 3, "num_hidden_nodes": [500,500,500]},
# #                     {"num_hidden_layers": 4, "num_hidden_nodes": [500,500,500,500]},
# #                     {"num_hidden_layers": 4, "num_hidden_nodes": [100,100,100,100]},
# #                     {"num_hidden_layers": 5, "num_hidden_nodes": [100,100,100,100,100]},
# #                     {"num_hidden_layers": 5, "num_hidden_nodes": [250,250,250,250,250]},
# #                     {"num_hidden_layers": 5, "num_hidden_nodes": [500,500,500,500,500]},
# #                     {"num_hidden_layers": 6, "num_hidden_nodes": [100,100,100,100,100,100]},
# #                     {"num_hidden_layers": 6, "num_hidden_nodes": [500,500,500,500,500,500]}
#                 ]
#                 for layer_config in layer_configs:
#                     num_hidden_layers = layer_config["num_hidden_layers"]
#                     num_hidden_nodes = layer_config["num_hidden_nodes"]

#                     print(f"Running code for num_hidden_layers={num_hidden_layers}, num_hidden_nodes={num_hidden_nodes}")

#                     max_no_of_nodes=max(num_hidden_nodes)

#                     set_torchseed(6675)
#                     DLGN_init= DLGN_FC(input_dim=input_dim, output_dim=1, num_hidden_nodes=num_hidden_nodes, beta=beta, dlgn_mode=dlgn_mode)
#     #                     for name,parameter in DLGN_init.named_parameters():
#     #                         print(name)
#     #                         print(parameter.shape)

#                     train_parameter_masks=dict()
#                     for name,parameter in DLGN_init.named_parameters():
#                         if "val" in name:
#                             train_parameter_masks[name]=torch.ones_like(parameter) #*0.001 # Updating all value network layers
#                         if "gat" in name:
#                             train_parameter_masks[name]=torch.ones_like(parameter)


#                             # train_parameter_masks[name][:num_neurons_set] *= 0.
#                         train_parameter_masks[name].to(device)

#                     set_torchseed(5000)
#                     train_losses, DLGN_obj_final, DLGN_obj_store, losses , debug_models= train_dlgn(train_data_curr=train_data,
#                                                                 vali_data_curr=vali_data,
#                                                                 test_data_curr=test_data,
#                                                                 train_labels_curr=train_data_labels,
#                                                                 vali_labels_curr=vali_data_labels,
#                                                                 test_labels_curr=test_data_labels,
#                                                                 DLGN_obj=deepcopy(DLGN_init),
#                                                                 parameter_mask=train_parameter_masks)


#                     torch.cuda.empty_cache() 
#                     losses=np.array(losses)


#                     test_outputs_values, test_outputs_gate_scores =DLGN_obj_final(torch.Tensor(test_data))
#                     test_preds = test_outputs_values[-1]
#                     test_preds = test_preds.detach().numpy()
#                     Test_error=np.sum(test_data_labels != (np.sign(test_preds[:,0])+1)//2)
#                     Num_test_data=len(test_data_labels)
#                     print("Test_error=",Test_error)
#                     print("Num_test_data=",Num_test_data)
#                     test_accuracy = 1-Test_error/Num_test_data
#                     print("Test Accuracy=",test_accuracy)
#                     # Append the result to the list
#                     results_list.append({
#                         "Beta": beta,
#                         "LR": lr,
#                         "Hidden Layers": layer_config["num_hidden_layers"],
#                         "Hidden Nodes": str(layer_config["num_hidden_nodes"]),
#                         "Test Accuracy": round(test_accuracy, 5)
#                     })
                    
# # Append the results for the current dlgn_mode and beta to the overall list
#             if dlgn_mode == 'dlgn':
#                 results_list_dlgn.extend(results_list)
#             elif dlgn_mode == 'dlgn_sf':
#                 results_list_dlgn_sf.extend(results_list)
                
#     # Create Pandas DataFrames
#     df_dlgn = pd.DataFrame(results_list_dlgn)
#     df_dlgn_sf = pd.DataFrame(results_list_dlgn_sf)

#     # Set pandas display options to show more rows and columns
#     pd.set_option('display.max_rows', None)
#     pd.set_option('display.max_columns', None)
#     print(f"Running code for input_dim={input_dim}, num_data={num_data}")
#     # Print the full tables
#     print("\nDLGN MODE: dlgn\n")
#     print(df_dlgn)

#     print("\nDLGN MODE: dlgn_sf\n")
#     print(df_dlgn_sf)
# '''