# CART Implementation

### Import Packages

In [1]:
import math
import pandas as pd
import numpy as np
import requests
from io import StringIO
from gurobipy import *
from sklearn import tree
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from ucimlrepo import fetch_ucirepo as fetc
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler


### Import Data and Check Data Status (From UCIMLREPO)

In [2]:
rice = fetc(id=545)
X = rice.data.features 
y = rice.data.targets
df = pd.DataFrame(X, columns=rice.data.feature_names).sample(n=800, random_state=42)
df['target'] = y
df = df.reset_index(drop=True)

#Change non numeric columns to numeric
column_names = ["Area", "Perimeter", "Major_Axis_Length", "Minor_Axis_Length","Eccentricity", "Convex_Area", "Extent", "target"]
non_numeric_cols = df.drop(['target'],axis=1).select_dtypes(exclude=[np.number]).columns
if not non_numeric_cols.empty:
    for col in non_numeric_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")
for col in column_names[:len(column_names)-1]:
    df[col] = StandardScaler().fit_transform(df[[col]])
    
for col in column_names[:len(column_names)-1]:
    df[col] = MinMaxScaler().fit_transform(df[[col]])

#Convert target column to numeric
le = LabelEncoder()
df['target'] = le.fit_transform(df['target'])
        
df.info()
#df.isnull().sum()
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Area               800 non-null    float64
 1   Perimeter          800 non-null    float64
 2   Major_Axis_Length  800 non-null    float64
 3   Minor_Axis_Length  800 non-null    float64
 4   Eccentricity       800 non-null    float64
 5   Convex_Area        800 non-null    float64
 6   Extent             800 non-null    float64
 7   target             800 non-null    int64  
dtypes: float64(7), int64(1)
memory usage: 50.1 KB
       Area  Perimeter  Major_Axis_Length  Minor_Axis_Length  Eccentricity  \
0  0.507313   0.504348           0.447994           0.650306      0.528006   
1  0.503786   0.378801           0.347530           0.688925      0.401748   
2  0.551395   0.446065           0.372872           0.747982      0.363165   
3  0.574525   0.579105           0.643198           0

### Import Data and Check Data Status (From TXT)

In [32]:
df = pd.read_csv("c:\\Users\\zhuoq\\Downloads\\AMS515 Project\\Loan Application Classification matrix_train.txt", delim_whitespace=True).drop(['const'], axis=1)
df.rename(columns={'scenario_benchmark': 'target'}, inplace=True)

column_names = df.columns.tolist()
non_numeric_cols = df.drop(['target'],axis=1).select_dtypes(exclude=[np.number]).columns
if not non_numeric_cols.empty:
    for col in non_numeric_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")
for col in column_names[:len(column_names)-1]:
    df[col] = StandardScaler().fit_transform(df[[col]])

#normalize the data    
for col in column_names[:len(column_names)-1]:
    df[col] = MinMaxScaler().fit_transform(df[[col]])

#Convert target column to numeric
le = LabelEncoder()
df['target'] = le.fit_transform(df['target'])

df = df.sample(n=4000, random_state=42)
df = df.reset_index(drop=True)

### Split Data into Training and Testing Sets

In [33]:
X = df.drop(['target'], axis=1)
y = df['target']
print("Total number of Class: " + str(len(np.unique(y))) + " which are: " + str(np.unique(y)))
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

df_train = pd.DataFrame(x_train, columns=X.columns)
df_train['target'] = y_train
df_train = df_train.reset_index(drop=True)


Total number of Class: 2 which are: [0 1]


### Decision Tree

In [34]:
def clf(x_train, y_train, x_test, y_test,K):
    clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=K, random_state=42)
    clf_gini.fit(x_train, y_train)
    y_pred_gini = clf_gini.predict(x_test)
    y_pred_gini_train = clf_gini.predict(x_train)
    print('Testing Set Accuracy Score: ', accuracy_score(y_test, y_pred_gini), 'Training Set Accuracy Score:',accuracy_score(y_train, y_pred_gini_train))

clf(x_train, y_train, x_test, y_test,2)

Testing Set Accuracy Score:  0.89125 Training Set Accuracy Score: 0.8971875


### Tree Structure: Will be Used for Warmstart of OCT

In [35]:
clf = DecisionTreeClassifier(criterion='gini', max_depth=2, random_state=42)
clf.fit(x_train, y_train)

"""
The following is for warm start in OCT
"""
feature = clf.tree_.feature
threshold = clf.tree_.threshold
feature_indices = []
threshold_indices = []
feature_indices_leaf =[]
threshold_indices_leaf = []
for i in range(len(feature)):
    if feature[i] != -2 and feature[i+1] != -2:
        feature_indices.append(int(feature[i]))
        threshold_indices.append(float(threshold[i]))
    elif feature[i] != -2 and feature[i+1] == -2:
        feature_indices_leaf.append(int(feature[i]))
        threshold_indices_leaf.append(float(threshold[i]))
    else:
        continue
feature_indices.extend(feature_indices_leaf)
threshold_indices.extend(threshold_indices_leaf)
print("Feature Indices: ", feature_indices, "Threshold Indices:", threshold_indices)
#leaf nodes
#branch nodes
print(np.sum(clf.tree_.feature == -2))
internal_nodes = np.sum(clf.tree_.feature != -2)
total_nodes = clf.tree_.node_count
print(f"Internal nodes: {internal_nodes}, Total nodes: {total_nodes}")
        

Feature Indices:  [1, 0, 0] Threshold Indices: [0.4952651560306549, 0.5, 0.4496598541736603]
4
Internal nodes: 3, Total nodes: 7


In [36]:
total_nodes = clf.tree_.node_count
leaf_nodes = round(total_nodes / 2)
branch_nodes = total_nodes // 2
print(f"Leaf nodes: {leaf_nodes}, Branch nodes: {branch_nodes}")

Leaf nodes: 4, Branch nodes: 3


# OCT Implementation

### Import Packages

In [9]:
import gurobipy as gp
from gurobipy import GRB

In [10]:
m = gp.Model('OCT')

Set parameter Username
Set parameter LicenseID to value 2643923
Academic license - for non-commercial use only - expires 2026-03-28


### Construct Constraints

In [None]:
"""
training data (X,Y)=(x_i,y_i), i=1,...,n; x_i in R^p and normalized to [0,1]^p, y_i in {1,...,K}
-n = # obervations
-p = # features
-K = # class label

-p(t) = parent nodes of node t
-A_L(t) = {left_branch ancestors of t} = {t if tmod2=0 and t/2 recursively gives the set}
-A_R(t) = {right_branch ancestors of t} = {t if (t-1)mod2=0 and (t-1)/2 recursively gives the set}


-D = max depth
-T = 2^(D+1)-1 = max # nodes
-TB = branch nodes = left nodes = {1,...,floor(T/2)} applies split is ax<b 
-TL = leaf = {floor(T/2)+1,...,T} make class prediction
-?a_t in R^p
-?b_t in R
-p = # features

-d_t = 1{node t applies a split}
-sum a_jt = d_t, j=1,...p, t in TB
-0 <= b_t <= d_t, t in TB
-a_jt in {0,1}, j=1,...p, t in TB


-d_t <= d_p(t), t in TB/{1}

-z_it = 1{x_i in node t}
-l_t = 1{leaf t contains any point}
-z_it <= l_t, t in TB
-sum(z_it) >= N_min*l_t for i=1,...,n, t in TB
-sum(z_it)=1 for t in TB, i=1,...,n

-x_j^i = ith largest value in feature j
-epsilon_j = min{x_j^(i+1)-x_j^i, i=1,...,n}
-epsilon_max = max{epsilon_j} wrt j
-epsilon  = {epsilon_1,...,epsilon_p}
-a_m(x_t + epsilon) <= b_m +(1+epsilon_max)(1-z_it), i=1,...,n, for all t in TB, for all m in A_L(t)
-a_m*x_i >= b_m - (1-z_it), i=1,...,n, for all t in TB, for all m in A_R(t)

-Y_ik = {1 if y_i=k, -1 otherwise}, k=1,...,K, i=1,...n
N_kt = 0.5*sum((1+Y_ik)z_it) for i=1,...,n; k=1,...K, t in TL
N_t = sum(z_it) for i=1,...,n; t in TL
c_t = argmax{N_kt} wrt k
c_kt = 1{c_t = k}
sum(c_kt) = l_t wrt k

L_t = N_t - max{N_kt} wrt k = min{N_t - N_kt} wrt k
L_t >= N_t - N_kt - n(1-c_kt), k=1,...K, t in TL
L_t <= N_t - N_kt + n*c_kt, k=1,...K, t in TL
L_t >= 0

L^ = baseline accuracy = #{most popular class}/n

objective: min (1/L^)sum(L_t) for t in TL + alpha*sum(d_t) for t in TB

"""

'\ntraining data (X,Y)=(x_i,y_i), i=1,...,n; x_i in R^p and normalized to [0,1]^p, y_i in {1,...,K}\n-n = # obervations\n-p = # features\n-K = # class label\n\n-p(t) = parent nodes of node t\n-A_L(t) = {left_branch ancestors of t} = {t if tmod2=0 and t/2 recursively gives the set}\n-A_R(t) = {right_branch ancestors of t} = {t if (t-1)mod2=0 and (t-1)/2 recursively gives the set}\n\n\n-D = max depth\n-T = 2^(D+1)-1 = max # nodes\n-TB = branch nodes = left nodes = {1,...,floor(T/2)} applies split is ax<b \n-TL = leaf = {floor(T/2)+1,...,T} make class prediction\n-?a_t in R^p\n-?b_t in R\n-p = # features\n\n-d_t = 1{node t applies a split}\n-sum a_jt = d_t, j=1,...p, t in TB\n-0 <= b_t <= d_t, t in TB\n-a_jt in {0,1}, j=1,...p, t in TB\n\n\n-d_t <= d_p(t), t in TB/{1}\n\n-z_it = 1{x_i in node t}\n-l_t = 1{leaf t contains any point}\n-z_it <= l_t, t in TB\n-sum(z_it) >= N_min*l_t for i=1,...,n, t in TB\n-sum(z_it)=1 for t in TB, i=1,...,n\n\n-x_j^i = ith largest value in feature j\n-epsilo

### Predetermined Variables

In [37]:

n= df_train.shape[0] # number of observations
p = df_train.drop(['target'],axis=1).shape[1] # number of features
K = len(np.unique(df_train['target'])) # number of classes
D = 2 # max depth
T = 2**(D+1)-1 # max number of nodes
L_hat = df_train['target'].value_counts().max()/n # baseline accuracy, predicting the most popular class for the dataset
N_min = math.floor(n*0.05)
X_train = df_train.drop(['target'],axis=1)

#Epsilon
epsilon=[]
for j in range(p):
    x_j = df_train.iloc[:,j].tolist()
    x_j.sort()  
    e=[]
    for i in range(n-1):
        if x_j[i+1]!=x_j[i]:
            e.append(x_j[i+1] - x_j[i])
    epsilon.append(min(e))
epsilon_max = max(epsilon)

print("epsilon: " + str(epsilon))
print("epsilon_max: " + str(epsilon_max))

#Y Matrix
Y = np.zeros([n,K], dtype = int) - 1 # Y_ik
Y[df_train.index,  y_train.tolist()] = 1  #based on the sample code, the [x,y] x is the features index, y is the class index

epsilon: [0.0013605442176869431, 0.0018939393939392257]
epsilon_max: 0.0018939393939392257


### Predetermined Sets (Note that index starts from 1)

In [38]:
left_ancestors = []
right_ancestors = []
for t in range(1,T+1):
    la_t =[]
    ra_t =[]
    tau=t
    while tau>1:
        pt = tau//2
        if tau % 2 == 0: #if t is even, then its parent is a left ancestor, else is a right ancestor
            la_t.append(pt)
        else:
            ra_t.append(pt)
        tau = pt
    la_t.sort() 
    ra_t.sort()
    left_ancestors.append(la_t)
    right_ancestors.append(ra_t)

TB = list(range(1,math.floor((T+1)/2)))  #Branch nodes
TL = list(range(math.floor((T+1)/2),T+1)) #Leaf nodes

print("Branch nodes: ", TB)
print("Leaf nodes: ", TL)

print("left_ancestors: " + str(left_ancestors))
print("right_ancestors: " + str(right_ancestors))

Branch nodes:  [1, 2, 3]
Leaf nodes:  [4, 5, 6, 7]
left_ancestors: [[], [1], [], [1, 2], [1], [3], []]
right_ancestors: [[], [], [1], [], [2], [1], [1, 3]]


### Variables

In [39]:
a = m.addVars(p,TB, vtype=GRB.BINARY, name="a_t") #dim |TB|xp
b = m.addVars(TB, vtype=GRB.CONTINUOUS, lb = 0, ub = 1, name="b_t") #dim |TB|
d = m.addVars(TB, vtype=GRB.BINARY, name="d_t") #dim |TB|
z = m.addVars(n, TL, vtype=GRB.BINARY, name="z") #dim nx|TL|
l = m.addVars(TL, vtype=GRB.BINARY, name="l_t") #dim |TL|
Nk = m.addVars(K, TL, vtype=GRB.INTEGER,name="N_kt") #dim Kx|TL|
N = m.addVars(TL, vtype=GRB.INTEGER, name="N_t") #dim |TL|
ck = m.addVars(K, TL, vtype=GRB.BINARY, name="c_kt")
L = m.addVars(TL, name="L_t") 

### Warm Start

In [40]:
# warm start using the results of CART algorithm
for i in TB:
    a[feature_indices[i-1], i].start = 1
    b[i].start = threshold_indices[i-1]

### Constraints

In [41]:
for t in TB:
    m.addConstr(a.sum("*",t) == d[t], name="sum_constraint_of_ajt") # sum of ajt = dt
    m.addConstr(b[t] <= d[t], name="bt_constraint_dt")     # bt <= dt
    m.addConstr(d[t] == 1, name="dt_constraint_d(t)")  

for t in TB[1:]:
    m.addConstr(d[t] <= d[t//2], name="dt_constraint_dp(t)") # dt <= dp(t)    

for i in range(n):
    m.addConstr(z.sum(i,"*") == 1, name="sum_of_zi(t)_constraint_1")  # sum sum of zit = 1
    
for t in TL:
    m.addConstr(z.sum("*",t) >= N_min*l[t], name="sum_of_zt_constraint_Nmin_lt") # sum of zit >= Nmin*lt
    for i in range(n):
        m.addConstr(z[i, t] <= l[t]) # zit <= lt
    
    for k in range(K):
        m.addConstr(Nk[k,t] == 1/2 * gp.quicksum(z[i,t] * (Y[i,k] + 1) for i in range(n))) #Nkt = 1/2(sum of (1+Yik)*zit #may need to be corrected

    m.addConstr(N[t] == z.sum("*",t))  # Nt = sum of zit

    m.addConstr(l[t] == ck.sum("*",t)) # sum of ckt = lt
    
    m.addConstr(l[t] == 1, name="dt_constraint_l(t)")

#m.addConstr(N.sum("*") == n, name="sum_of_N(t)_constraint") # sum of zit <= n*lt
    
for t in TL:
    l_ancestors = left_ancestors[t - 1]  # cache the list
    if l_ancestors:
        for la in l_ancestors:
            for i in range(n):
                xi = X_train.iloc[i]  # cache row once
                m.addConstr(gp.quicksum(a[j, la] * (xi[j] + epsilon[j]) for j in range(p)) <= b[la] + (1 + epsilon_max) * (1 - z[i, t]),
                    name=f"split_l_{la}_{i}_{t}"
                )
                
    r_ancestors = right_ancestors[t - 1]
    if r_ancestors:
        for r in r_ancestors:
            for i in range(n):
                xi = X_train.iloc[i]  # cache the row once
                m.addConstr(
                    gp.quicksum(a[j, r] * xi[j] for j in range(p)) >= b[r] - (1 - z[i, t]),
                    name=f"split_r_{r}_{i}_{t}"
                )
        
for t in TL:
    m.addConstr(L[t] >= 0, name="Lt_constraint3") #Lt ≥ 0
    for k in range(K):
        m.addConstr(L[t] >= N[t] - Nk[k,t] - n*(1-ck[k,t]), name="Lt_constraint1") #Lt ≤ Nt − Nkt + n(1-ckt) 
        m.addConstr(L[t] <= N[t] - Nk[k,t] + n*ck[k,t], name="Lt_constraint2")  #Lt ≤ Nt − Nkt + n*ckt 


In [42]:
m.update()
m.setObjective(L.sum('*') / L_hat + 0.5*gp.quicksum(d[t] for t in TB), GRB.MINIMIZE) #minimize L + alpha*sum(d_t) for t in TB

### Objective OCT

In [None]:
m.Params.timelimit = 3000
m.optimize()

Set parameter TimeLimit to value 3000
Gurobi Optimizer version 12.0.1 build v12.0.1rc0 (win64 - Windows 11.0 (22631.2))

CPU model: Intel(R) Core(TM) Ultra 9 185H, instruction set [SSE2|AVX|AVX2]
Thread count: 16 physical cores, 22 logical processors, using up to 22 threads

Non-default parameters:
TimeLimit  3000

Optimize a model with 145765 rows, 44920 columns and 627546 nonzeros
Model fingerprint: 0x268815b9
Variable types: 21 continuous, 44899 integer (44863 binary)
Coefficient statistics:
  Matrix range     [2e-03, 4e+03]
  Objective range  [5e-01, 1e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 4e+03]

User MIP start did not produce a new incumbent solution
MIP start from previous solve produced solution with objective 426.757 (2.61s)
MIP start from previous solve produced solution with objective 424.407 (3.02s)
Processing MIP start from previous solve: 0 nodes explored in subMIP, total elapsed time 6s
Processing MIP start from previous solve: 0 nodes explore

KeyboardInterrupt: 

Exception ignored in: 'gurobipy._core.logcallbackstub'
Traceback (most recent call last):
  File "C:\Users\zhuoq\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ipykernel\iostream.py", line 655, in write
    def write(self, string: str) -> Optional[int]:  # type:ignore[override]

KeyboardInterrupt: 


     0     0    1.50000    0  648  373.89354    1.50000   100%     -  155s
     0     0    1.50000    0  914  373.89354    1.50000   100%     -  157s
     0     0    1.50000    0  643  373.89354    1.50000   100%     -  157s
     0     0    1.50000    0  448  373.89354    1.50000   100%     -  164s
     0     0    1.50000    0  372  373.89354    1.50000   100%     -  167s


### Obtain the Tree Structure (OCT)

In [None]:
"""
Tree Structure: 

Variables a, b, z, N_k and c=argmax{N_kt} are the decisive variables for the tree structure. 

Take a point x_i, if <a_1,x_i> <= b_1, then x_i is down to node 2, otherwise down to node 3;
suppose x_i gets to node 2, then if <a_2,x_i> <= b_2, then x_i will be in node 4, otherwise node 5.

Such process continues until x_i reaches a leaf node.

For each leaf node t, we then count the number of points in each class k, which is N_kt.

max{N_kt} represents the most popular class, which is c_t, in leaf node t, but not every point in 
such leaf is of class c_t.

If L_t = 0, then every point in leaf node t is indeed of class c_t, and the tree is pure. 

So L_t is the number of misclassified points in leaf node t.

Since every point can only be in one of the leaf nodes, the sum of L_t is the total number of
misclassified points in the tree.

For a testing set, we create a tree with the splitting criterion based on a and b; then N_kt and L_t are what
need to be calculated to access the accuracy of the tree on the testing set. The number of splits in each level
of the testing tree is the number of observations times the number of branch nodes. 

The problem is then how to perform the splits efficiently? 
"""

'\nTree Structure: \n\nVariables a, b, z, N_k and c=argmax{N_kt} are the decisive variables for the tree structure. \n\nTake a point x_i, if <a_1,x_i> <= b_1, then x_i is down to node 2, otherwise down to node 3;\nsuppose x_i gets to node 2, then if <a_2,x_i> <= b_2, then x_i will be in node 4, otherwise node 5.\n\nSuch process continues until x_i reaches a leaf node.\n\nFor each leaf node t, we then count the number of points in each class k, which is N_kt.\n\nmax{N_kt} represents the most popular class, which is c_t, in leaf node t, but not every point in \nsuch leaf is of class c_t.\n\nIf L_t = 0, then every point in leaf node t is indeed of class c_t, and the tree is pure. \n\nSo L_t is the number of misclassified points in leaf node t.\n\nSince every point can only be in one of the leaf nodes, the sum of L_t is the total number of\nmisclassified points in the tree.\n\nFor a testing set, we create a tree with the splitting criterion based on a and b; then N_kt and L_t are what\nnee

### Tree Structure Alternative 1

Split Criterion

In [29]:
a_matrix = np.zeros((p, len(TB))) #a[i,t]
threshold_oct = np.zeros(len(TB)) #b[i]
points_leaf = np.zeros(len(TL)) #N[i]

for i in TB:
    threshold_oct[i-1] = b[i].X #threshold 
    
for i in range(p):
    for t_idx, t in enumerate(TB):  # convert TB to indexable order
        a_matrix[i, t_idx] = a[i, t].X
        
for i in range(len(TL)):
    points_leaf[i-1] = N[i+2**D].X #N[i]

print("a_matrix(a): " + str(a_matrix))
print("threshold_oct(b): " + str(threshold_oct))
print("points_leaf(N): " + str(points_leaf))
print(x_train.shape[0])
print(sum(points_leaf)) 

a_matrix(a): [[0. 1. 0.]
 [1. 0. 1.]]
threshold_oct(b): [0.34090909 0.46530612 0.36174242]
points_leaf(N): [ 256.  208. 2366. 1170.]
4000
4000.0


Train Set Accuracy

In [30]:
Leaf_classes = [[] for _ in range(2**D)]
for i in range(x_train.shape[0]):
    j =1
    while j < 2**D:
        x_i = x_train.iloc[i].to_numpy()
        if np.dot(a_matrix[:, j-1],x_i)<threshold_oct[j-1]:
            j = 2*j
        else:
            j = 2*j + 1
    Leaf_classes[j-2**D].append(y_train.tolist()[i])

nonaccurate = 0
for leaf in Leaf_classes:
    if leaf:
        predicted_class = max(set(leaf), key=leaf.count)
        misclassified = len(leaf) - leaf.count(predicted_class)
        nonaccurate += misclassified

print("Total number of misclassified points: " + str(nonaccurate))
print("Train Set Accuracy: " + str((x_train.shape[0] - nonaccurate) / x_train.shape[0]))

def oct_accuracy(x,y,a,b,D):
    Leaf_classes = [[] for _ in range(2**D)]
    for i in range(x.shape[0]):
        j =1
        while j < 2**D:
            x_i = x.iloc[i].to_numpy()
            if np.dot(a[:, j-1],x_i)<b[j-1]:
                j = 2*j
            else:
                j = 2*j + 1
        Leaf_classes[j-2**D].append(y.tolist()[i])
    
    nonaccurate = 0
    for leaf in Leaf_classes:
        if leaf:
            predicted_class = max(set(leaf), key=leaf.count)
            misclassified = len(leaf) - leaf.count(predicted_class)
            nonaccurate += misclassified

    return (x.shape[0] - nonaccurate) / x.shape[0]

print("Train Set Accuracy: " + str(oct_accuracy(x_train,y_train,a_matrix,threshold_oct,D)))
print("Test Set Accuracy: " + str(oct_accuracy(x_test,y_test,a_matrix,threshold_oct,D)))   


Total number of misclassified points: 571
Train Set Accuracy: 0.85725
Train Set Accuracy: 0.85725
Test Set Accuracy: 0.855


Test Set Accuracy

In [31]:

Leaf_classes = [[] for _ in range(2**D)]
for i in range(x_test.shape[0]):
    j =1
    while j < 2**D:
        x_i = x_test.iloc[i].to_numpy()
        if np.dot(a_matrix[:, j-1],x_i)<threshold_oct[j-1]:
            j = 2*j
        else:
            j = 2*j + 1
    Leaf_classes[j-2**D].append(y_test.tolist()[i])

nonaccurate = 0
for leaf in Leaf_classes:
    predicted_class = max(set(leaf), key=leaf.count)
    misclassified = len(leaf) - leaf.count(predicted_class)
    nonaccurate += misclassified

print("Total number of misclassified points: " + str(nonaccurate))
print("Test Set Accuracy: " + str((x_test.shape[0] - nonaccurate) / x_test.shape[0]))



Total number of misclassified points: 145
Test Set Accuracy: 0.855


### Tree Structure Alternative 2

In [None]:
a_matrix = np.zeros((p, len(TB))) #a[i,t]
threshold_oct = np.zeros(len(TB)) #b[i]
split_position = np.zeros(len(TB)) #d[i]

for i in TB:
    split_position[i-1] = d[i].X #split or not

for i in TB:
    threshold_oct[i-1] = b[i].X #threshold 
    
for i in range(p):
    for t_idx, t in enumerate(TB):  # convert TB to indexable order
        a_matrix[i, t_idx] = a[i, t].X

print("a_matrix(a): " + str(a_matrix))
print("threshold_oct(b): " + str(threshold_oct))
print("split_position(d): " + str(split_position))

a_matrix(a): [[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
threshold_oct(b): [0.67305103 0.58484714 0.43549974]
split_position(d): [1. 1. 1.]


In [None]:
x_test

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent
361,0.746643,0.724797,0.745690,0.697520,0.674321,0.742259,0.829650
73,0.291516,0.230560,0.242685,0.421303,0.575092,0.287238,0.332955
374,0.302174,0.257664,0.249048,0.431348,0.570109,0.304707,0.249723
155,0.549670,0.418815,0.286837,0.806927,0.186790,0.540795,0.411516
104,0.640695,0.643961,0.557220,0.732178,0.521176,0.643096,0.631189
...,...,...,...,...,...,...,...
266,0.371456,0.406843,0.443293,0.394480,0.737510,0.374686,0.267356
23,0.370923,0.319834,0.264266,0.542346,0.468112,0.373954,0.679757
222,0.541356,0.459264,0.363284,0.724679,0.363708,0.536820,0.518902
261,0.470049,0.465351,0.394775,0.600983,0.519104,0.486088,0.313494


In [None]:
#Covert x_test to array
X_test = x_test.astype(float).values

In [None]:
#define node
class TreeNode:
    def __init__(self, node_id, a=None, b=None, is_leaf=False):
        self.node_id = node_id
        self.a = a  
        self.b = b
        self.left = None
        self.right = None
        self.is_leaf = is_leaf 
#build the tree
def build_tree_from_solution(TB, a_matrix, threshold_oct, split_position):
    nodes = {}
    #set up nodes 
    for t_idx, t in enumerate(TB):
        is_split = split_position[t_idx] > 0.5  #d[i]=1 > 0.5 if d[i]=0 < 0.5 so we take 0.5 as a measure value
        a_vec = a_matrix[:, t_idx]   #ex. a1 = [-0. -0. -0. -0. -0. -0. -0.]
        b_val = threshold_oct[t_idx] #ex. b1 =0.42257857
        node = TreeNode(node_id=t, a=a_vec, b=b_val, is_leaf=not is_split)
        nodes[t] = node
    #connect nodes
    for t in TB:
        if split_position[TB.index(t)] > 0.5:
            nodes[t].left = nodes.get(2 * t, TreeNode(node_id=2*t, is_leaf=True))  
            nodes[t].right = nodes.get(2 * t + 1, TreeNode(node_id=2*t+1, is_leaf=True))
            nodes[2 * t] = nodes[t].left 
            nodes[2 * t + 1] = nodes[t].right

    return nodes[1]     #back to top

In [None]:
root = build_tree_from_solution(TB, a_matrix, threshold_oct, split_position)
print("Root node ID:", root.node_id)

Root node ID: 1


In [None]:
def predict_node(x_i, node):
    while not node.is_leaf:              #if it does not reach to leaf node, will be assigned to the next node until leaf node
        if np.dot(node.a, x_i) <= node.b:  # <a,x_i>   <= b
            node = node.left
        else:
            node = node.right            # <a,x_i>   > b
    return node.node_id                  #return which leaf node the x_i belongs to 

def assign_toleaf(X_data, root):
    leaf_assignments = {}
    for i, x_i in enumerate(X_data):
        leaf_id = predict_node(x_i, root)
        if leaf_id not in leaf_assignments:
            leaf_assignments[leaf_id] = []
        leaf_assignments[leaf_id].append(i)
    return leaf_assignments

In [None]:
leaf_assignments = assign_toleaf(X_test, root)
print(leaf_assignments)

{7: [0, 10, 13, 18, 21, 24, 27, 28, 37, 43, 48, 51, 53, 55, 60, 64, 70, 71, 73, 86, 88, 99, 109, 115, 117, 121, 122, 128, 139, 149], 4: [1, 2, 3, 5, 6, 7, 8, 9, 11, 14, 17, 22, 23, 25, 30, 31, 32, 33, 34, 35, 36, 39, 40, 41, 45, 46, 49, 54, 56, 57, 59, 61, 62, 63, 66, 67, 68, 72, 74, 75, 78, 81, 82, 83, 85, 87, 89, 90, 93, 94, 95, 96, 97, 98, 101, 103, 105, 106, 108, 110, 112, 113, 114, 116, 118, 119, 123, 124, 126, 127, 129, 130, 131, 133, 134, 138, 140, 141, 142, 143, 144, 146, 147, 148], 5: [4, 12, 15, 16, 20, 26, 29, 38, 42, 44, 47, 50, 58, 65, 69, 76, 77, 80, 84, 91, 92, 100, 102, 104, 107, 111, 120, 125, 132, 135, 136, 137], 6: [19, 52, 79, 145]}


In [None]:
#Accuracy
from collections import defaultdict
def accuracy(leaf_assignments, Y_data):
    Lt = 0
    Nt = len(Y_data) #number of points in the nodes (total points)
    for leaf_id, x_i in leaf_assignments.items():  #leaf node 10:[1,2,3], 10 is leaf node_id, x_i = {1,2,3}
        N_kt = defaultdict(int) 
        for i in x_i:
            N_kt[Y_data.iloc[i]] += 1 #count the number of points in each class
        L_t = len(x_i) - max(N_kt.values()) #missclasscifaction: Lt = Nt - max{Nkt}
        Lt += L_t
        print(f"Leaf {leaf_id}: class counts = {dict(N_kt)}, misclassified = {L_t}")

    accuracy = (Nt - Lt) / Nt  
    return accuracy

In [None]:
accuracy = accuracy(leaf_assignments, y_test)
print("Accuracy Result(OCT): " + str(accuracy))

Leaf 7: class counts = {np.int64(0): 30}, misclassified = 0
Leaf 4: class counts = {np.int64(1): 78, np.int64(0): 6}, misclassified = 6
Leaf 5: class counts = {np.int64(0): 31, np.int64(1): 1}, misclassified = 1
Leaf 6: class counts = {np.int64(0): 2, np.int64(1): 2}, misclassified = 2
Accuracy Result(OCT): 0.94


# OCT-H

Preliminary 

In [None]:
m_h = gp.Model('OCT-H')
mu = 0.005  #Base on the paper mu is a sufficiently small constant
alpha = 0.5
#M = 2 

Variables: Introduce s and a_hat as New Variables for OCT-H

In [None]:
a = m_h.addVars(p,TB, vtype=GRB.BINARY, name="a_t") #dim px|TB|  ???? 0/1 or -1/1
a_hat = m_h.addVars(p, TB, vtype=GRB.CONTINUOUS, lb=0, ub=1, name="a_hat") #dim px|TB|
b = m_h.addVars(TB, vtype=GRB.CONTINUOUS, lb = 0, ub = 1, name="b_t") #dim |TB|
d = m_h.addVars(TB, vtype=GRB.BINARY, name="d_t") #dim |TB|
s = m_h.addVars(p,TB, vtype=GRB.BINARY, name="s") #dim px|TB|
z = m_h.addVars(n, TL, vtype=GRB.BINARY, name="z") #dim nx|TL|
l = m_h.addVars(TL, vtype=GRB.BINARY, name="l_t") #dim |TL|
Nk = m_h.addVars(K, TL, vtype=GRB.INTEGER,name="N_kt") #dim Kx|TL|
N = m_h.addVars(TL, vtype=GRB.INTEGER, name="N_t") #dim |TL|
ck = m_h.addVars(K, TL, vtype=GRB.BINARY, name="c_kt")
L = m_h.addVars(TL, name="L_t") 

Warmstart

In [None]:
# warm start using the results of CART algorithm
for i in TB:
    a[feature_indices[i-1], i].start = 1
    b[i].start = threshold_indices[i-1]

### Variables and Constraints (OCT-H)

In [None]:
for t in TB:
    m_h.addConstr(d[t] == 1, name="dt_constraint_d(t)") 
    m_h.addConstr(b[t] <= d[t], name="bt_constraint1_dt")     # bt <= dt
    m_h.addConstr(-d[t] <= b[t], name="bt_constraint2_dt")    #-dt <= bt
    
    m_h.addConstr(s.sum("*",t) >= d[t], name="sum_constraint_of_sjt")  #sum of sjt >= dt
    for i in range(p):
        m_h.addConstr(s[i,t] <= d[t], name="s_constraint1_dt")     # sjt <= dt
        m_h.addConstr(-s[i,t] <= a[i,t], name="a_constraint1_-st")  # -sjt <= ajt
        m_h.addConstr(a[i,t] <= s[i,t], name="a_constraint2_st")   # ajt <= sjt  

    m_h.addConstr(a_hat.sum("*",t) <= d[t], name="sum_constraint_of_a_hat") #sum of a_hat_jt >= dt
    for i in range(p):
        m_h.addConstr(a_hat[i,t] >= -a[i,t], name="a_hat_constraint_-a") #a_hat >= -ajt 
        m_h.addConstr(a_hat[i,t] >= a[i,t], name="a_hat_constraint_a")   #a_hat >= ajt  

for t in TB[1:]:
    m_h.addConstr(d[t] <= d[t//2], name="dt_constraint_dp(t)") # dt <= dp(t)   

for i in range(n):
    m_h.addConstr(z.sum(i,"*") == 1, name="sum_of_zi(t)_constraint_1")  # sum sum of zit = 1
    
for t in TL:
    m_h.addConstr(z.sum("*",t) >= N_min*l[t], name="sum_of_zt_constraint_Nmin_lt") # sum of zit >= Nmin*lt
    
    for i in range(n):
        m_h.addConstr(z[i, t] <= l[t]) # zit <= lt
  
    for k in range(K):
        m_h.addConstr(Nk[k,t] == 1/2 * gp.quicksum(z[i,t] * (Y[i,k] + 1) for i in range(n))) #Nkt = 1/2(sum of (1+Yik)*zit #may need to be corrected

    m_h.addConstr(N[t] == z.sum("*",t))  # Nt = sum of zit
    m_h.addConstr(l[t] == ck.sum("*",t)) # sum of ckt = lt
    m_h.addConstr(l[t] == 1, name="dt_constraint_l(t)")
 
for t in TL:
    l_ancestors = left_ancestors[t - 1]  # cache the list
    if l_ancestors:
        for la in l_ancestors:
            for i in range(n):
                xi = X.iloc[i]  # cache row once
                m_h.addConstr(gp.quicksum(a[j,la] * xi[j] for j in range(p)) + mu <= b[la] + (2 + mu) * (1-z[i, t]),
                    name=f"split_l_{la}_{i}_{t}"
                )
                
    r_ancestors = right_ancestors[t - 1]
    if r_ancestors:
        for ra in r_ancestors:
            for i in range(n):
                xi = X.iloc[i]  # cache the row once
                m_h.addConstr(
                    gp.quicksum(a[j, ra]*xi[j] for j in range(p)) >= b[ra]-2*(1-z[i, t]),
                    name=f"split_r_{ra}_{i}_{t}"
                )
         
for t in TL:
    m_h.addConstr(L[t] >= 0, name="Lt_constraint3") #Lt ≥ 0
    for k in range(K):
        m_h.addConstr(L[t] >= N[t] - Nk[k,t] - n*(1-ck[k,t]), name="Lt_constraint1") #Lt ≤ Nt − Nkt + n(1-ckt) 
        m_h.addConstr(L[t] <= N[t] - Nk[k,t] + n*ck[k,t], name="Lt_constraint2")  #Lt ≤ Nt − Nkt + n*ckt 

Objective OCT-H and Solution

In [None]:
m_h.update()
m_h.setObjective(L.sum('*') / L_hat + alpha*gp.quicksum(gp.quicksum(s[i,t] for i in range(p)) for t in TB), GRB.MINIMIZE) #minimize m_h.setObjective(gp.quicksum(L[t]/L_hat for t in TL) + alpha*gp.quicksum(gp.quicksum(s[i,t] for i in range(p)) for t in TB), GRB.MINIMIZE) L + alpha*sum(sum(sjt)) for j in p, t in TB
m.Params.timelimit = 600
m_h.optimize()

Set parameter TimeLimit to value 600
Gurobi Optimizer version 12.0.1 build v12.0.1rc0 (win64 - Windows 11.0 (22631.2))

CPU model: Intel(R) Core(TM) Ultra 9 185H, instruction set [SSE2|AVX|AVX2]
Thread count: 16 physical cores, 22 logical processors, using up to 22 threads

Optimize a model with 4370 rows, 1497 columns and 32561 nonzeros
Model fingerprint: 0x4585df75
Variable types: 28 continuous, 1469 integer (1457 binary)
Coefficient statistics:
  Matrix range     [1e-02, 4e+02]
  Objective range  [5e-01, 2e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 4e+02]

User MIP start produced solution with objective 27.2895 (0.02s)
User MIP start produced solution with objective 25.4474 (0.02s)
Loaded user MIP start with objective 25.4474

Presolve removed 1517 rows and 36 columns
Presolve time: 0.04s
Presolved: 2853 rows, 1461 columns, 26728 nonzeros
Variable types: 3 continuous, 1458 integer (1446 binary)

Root relaxation: objective 1.500000e+00, 137 iterations, 0.01 sec