# Cross Validation of OCT and Comparison with CART, XGBoost and Random Forest

### Some Notes on Implementing the OCT Model

In [13]:
''' 
Notes:
  
import_data_uci() imports data from UCIMLREPO and organize it as a dataframe, it takes the id number of the dataset from UCIMLREPO as an argument. 

cross_valid_set(df = import_data_uci(id#), # of desired observations) splits the dataset into 4 training and testing sets; for large datasets of >5000 observations or >5 features, 
it will be better to sample the dataset to smaller size by setting the second argument to the desired size, otherwise it will take a long time to run. This function will split the input df into 4 training and testing sets. 

OCT = oct(df, depth of tree, complexity parameter alpha, run time limit, warmstart for a, warmstart for b) is the fit function of the OCT model; where warmstart for a and b are obtained from CART: warmstart_cart(fitted CART model)).

OCT.compute_accuracy(X,y) returns the accuracy of the model on the test set.

OCT.plot_tree_structure() plots the tree structure of the model. However Graphviz must be installed and added to the system path.

You can also test out the OCT-H model:
OCTH = oct_h(df, depth of tree, complexity parameter alpha, run time limit) is the fit function of the OCT-H model; but note that at this stage the model takes warmstart of a as a matrix 
and the warmstart generated from CART is not effective and not compatible with the model. Furthermore the run time for any dataset >200 observations is very long.

OCTH.compute_accuracy(X,y) returns the accuracy of the model on the test set.

OCTH.plot_tree_structure() plots the tree structure of the model. However Graphviz must be installed and added to the system path.
'''


' \nNotes:\n  \nimport_data_uci() imports data from UCIMLREPO and organize it as a dataframe, it takes the id number of the dataset from UCIMLREPO as an argument. \n\ncross_valid_set(df = import_data_uci(id#), # of desired observations) splits the dataset into 4 training and testing sets; for large datasets of >5000 observations or >5 features, \nit will be better to sample the dataset to smaller size by setting the second argument to the desired size, otherwise it will take a long time to run. This function will split the input df into 4 training and testing sets. \n\nOCT = oct(df, depth of tree, complexity parameter alpha, run time limit, warmstart for a, warmstart for b) is the fit function of the OCT model; where warmstart for a and b are obtained from CART: warmstart_cart(fitted CART model)).\n\nOCT.compute_accuracy(X,y) returns the accuracy of the model on the test set.\n\nOCT.plot_tree_structure() plots the tree structure of the model. However Graphviz must be installed and adde

### Import Packages

In [14]:
import math
import pandas as pd
import numpy as np
import requests
from io import StringIO
from gurobipy import *
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
import warnings
warnings.filterwarnings("ignore")
from ucimlrepo import fetch_ucirepo as fetc
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import gurobipy as gp
from gurobipy import GRB
from sklearn.linear_model import LogisticRegression
from optimal_classification_tree1 import oct
from optimal_classification_tree1 import oct_h
from graphviz import Digraph
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier


### Data Process Functions For Cross Validation
(for Data Imported from UCIMLREPO)

In [15]:
def import_data_uci(data_id):
    dt = fetc(id = data_id)
    X = dt.data.features 
    y = dt.data.targets 
    df = pd.DataFrame(X, columns=dt.data.feature_names)
    df = df.dropna()
    target_name = 'target'
    df[target_name] = y
    df = df.reset_index(drop=True)
    
    #Change non numeric columns to numeric
    column_names = df.columns.tolist()
    non_numeric_cols = df.drop([target_name],axis=1).select_dtypes(exclude=[np.number]).columns
    if not non_numeric_cols.empty:
        for col in non_numeric_cols:
            df[col] = pd.to_numeric(df[col], errors="coerce")
    for col in column_names[:len(column_names)-1]:
        df[col] = StandardScaler().fit_transform(df[[col]])
    
    for col in column_names[:len(column_names)-1]:
        df[col] = MinMaxScaler().fit_transform(df[[col]])
        
    #Convert target column to numeric
    le = LabelEncoder()
    df[target_name] = le.fit_transform(df[target_name])
    
    return df

def cross_valid_set(df,observations=None):
    if observations is not None:
        df = df.sample(n=observations, random_state=42)
    df = df.reset_index(drop=True)
    
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle
    folds = np.array_split(df, 4)
    s1_in = np.concatenate([folds[i] for i in [0, 1, 2]], axis=0)
    s1_out = folds[3]
    s2_in = np.concatenate([folds[i] for i in [1, 2, 3]], axis=0)
    s2_out = folds[0]
    s3_in = np.concatenate([folds[i] for i in [0, 2, 3]], axis=0)
    s3_out = folds[1]
    s4_in = np.concatenate([folds[i] for i in [0, 1, 3]], axis=0)
    s4_out = folds[2]

    s1_in = pd.DataFrame(s1_in)
    s1_out = pd.DataFrame(s1_out)
    s2_in = pd.DataFrame(s2_in)
    s2_out = pd.DataFrame(s2_out)
    s3_in = pd.DataFrame(s3_in)
    s3_out = pd.DataFrame(s3_out)
    s4_in = pd.DataFrame(s4_in)
    s4_out = pd.DataFrame(s4_out)
    
    s1_out = s1_out.reset_index(drop=True)
    s2_out = s2_out.reset_index(drop=True)
    s3_out = s3_out.reset_index(drop=True)
    s4_out = s4_out.reset_index(drop=True)
    
    s1_in.columns = df.columns.tolist()
    s1_out.columns = df.columns.tolist()
    s2_in.columns = df.columns.tolist()
    s2_out.columns = df.columns.tolist()
    s3_in.columns = df.columns.tolist()
    s3_out.columns = df.columns.tolist()
    s4_in.columns = df.columns.tolist()
    s4_out.columns = df.columns.tolist()

    return s1_in,s1_out,s2_in,s2_out,s3_in,s3_out,s4_in,s4_out

df = import_data_uci(53)
print(f'number of observations: {len(df)}')
print(f'number of features: {len(df.columns)-1}')
print(f'number of classes: {len(df.target.unique())}')
print(df)

number of observations: 150
number of features: 4
number of classes: 3
     sepal length  sepal width  petal length  petal width  target
0        0.222222     0.625000      0.067797     0.041667       0
1        0.166667     0.416667      0.067797     0.041667       0
2        0.111111     0.500000      0.050847     0.041667       0
3        0.083333     0.458333      0.084746     0.041667       0
4        0.194444     0.666667      0.067797     0.041667       0
..            ...          ...           ...          ...     ...
145      0.666667     0.416667      0.711864     0.916667       2
146      0.555556     0.208333      0.677966     0.750000       2
147      0.611111     0.416667      0.711864     0.791667       2
148      0.527778     0.583333      0.745763     0.916667       2
149      0.444444     0.416667      0.694915     0.708333       2

[150 rows x 5 columns]


### Import Data and Check Data Status (From TXT)

In [16]:
'''
df = pd.read_csv("c:\\Users\\zhuoq\\Downloads\\AMS515 Project\\Loan Application Classification matrix_train.txt", delim_whitespace=True).drop(['const'], axis=1)
df.rename(columns={'scenario_benchmark': 'target'}, inplace=True)

column_names = df.columns.tolist()
non_numeric_cols = df.drop(['target'],axis=1).select_dtypes(exclude=[np.number]).columns
if not non_numeric_cols.empty:
    for col in non_numeric_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")
for col in column_names[:len(column_names)-1]:
    df[col] = StandardScaler().fit_transform(df[[col]])

#normalize the data    
for col in column_names[:len(column_names)-1]:
    df[col] = MinMaxScaler().fit_transform(df[[col]])

#Convert target column to numeric
le = LabelEncoder()
df['target'] = le.fit_transform(df['target'])
'''

'\ndf = pd.read_csv("c:\\Users\\zhuoq\\Downloads\\AMS515 Project\\Loan Application Classification matrix_train.txt", delim_whitespace=True).drop([\'const\'], axis=1)\ndf.rename(columns={\'scenario_benchmark\': \'target\'}, inplace=True)\n\ncolumn_names = df.columns.tolist()\nnon_numeric_cols = df.drop([\'target\'],axis=1).select_dtypes(exclude=[np.number]).columns\nif not non_numeric_cols.empty:\n    for col in non_numeric_cols:\n        df[col] = pd.to_numeric(df[col], errors="coerce")\nfor col in column_names[:len(column_names)-1]:\n    df[col] = StandardScaler().fit_transform(df[[col]])\n\n#normalize the data    \nfor col in column_names[:len(column_names)-1]:\n    df[col] = MinMaxScaler().fit_transform(df[[col]])\n\n#Convert target column to numeric\nle = LabelEncoder()\ndf[\'target\'] = le.fit_transform(df[\'target\'])\n'

### Test-Train Splits

In [17]:
cvs = cross_valid_set(df)
df_train1 = cvs[0]
df_test1 = cvs[1]

df_train2 = cvs[2]
df_test2 = cvs[3]

df_train3 = cvs[4]
df_test3 = cvs[5]

df_train4 = cvs[6]
df_test4 = cvs[7]

#set 1
x_train1 = df_train1.drop(['target'], axis=1)
y_train1 = df_train1['target'].astype('int')
x_test1 = df_test1.drop(['target'], axis=1)
y_test1 = df_test1['target'].astype('int')

#set 2
x_train2 = df_train2.drop(['target'], axis=1)
y_train2 = df_train2['target'].astype('int')
x_test2 = df_test2.drop(['target'], axis=1)
y_test2 = df_test2['target'].astype('int')

#set 3
x_train3 = df_train3.drop(['target'], axis=1)
y_train3 = df_train3['target'].astype('int')
x_test3 = df_test3.drop(['target'], axis=1)
y_test3 = df_test3['target'].astype('int')

#set 4
x_train4 = df_train4.drop(['target'], axis=1)
y_train4 = df_train4['target'].astype('int')
x_test4 = df_test4.drop(['target'], axis=1)
y_test4 = df_test4['target'].astype('int')

#Depth
D = 2
#Number of Classes
k = len(df.target.unique())

### Predefined Functions for CART Implementation

In [18]:
def warmstart_cart(cart):
    feature = cart.tree_.feature
    threshold = cart.tree_.threshold
    feature_indices = []
    threshold_indices = []
    for i in range(len(feature)):
        if feature[i] != -2:
            feature_indices.append(int(feature[i]))
            threshold_indices.append(float(threshold[i]))
    return feature_indices, threshold_indices

def cart_accuracy(x_train, y_train, x_test, y_test,cart_tree):
    cart_tree.fit(x_train, y_train)
    y_pred_gini = cart_tree.predict(x_test)
    y_pred_gini_train = cart_tree.predict(x_train)
    return accuracy_score(y_train, y_pred_gini_train),accuracy_score(y_test, y_pred_gini)

### Cross Validation Set 1

In [19]:
#CART
clf = DecisionTreeClassifier(criterion='gini', max_depth=D, random_state=42)

cart_in_ac1 = cart_accuracy(x_train1,y_train1,x_test1,y_test1,clf)[0]
cart_out_ac1 = cart_accuracy(x_train1,y_train1,x_test1,y_test1,clf)[1]
print(f'CART in-sample accuracy for set 1: {cart_in_ac1}')
print(f'CART out-sample accuracy for set 1: {cart_out_ac1}')

clf.fit(x_train1, y_train1) #change this for testing!!!!!!!!
"""
Tree Structure: Warm start for OCT
"""
feature_indices = warmstart_cart(clf)[0]
threshold_indices = warmstart_cart(clf)[1]

# OCT 
OCT = oct(df_train1,D,0.5,7200)
OCT_ac_in1 = OCT.compute_accuracy(x_train1,y_train1)
OCT_ac_out1 = OCT.compute_accuracy(x_test1,y_test1)
print(f'OCT in-sample accuracy for set 1: {OCT_ac_in1}')
print(f'OCT out-sample accuracy for set 1: {OCT_ac_out1}')

CART in-sample accuracy for set 1: 0.9646017699115044
CART out-sample accuracy for set 1: 0.9459459459459459
Set parameter TimeLimit to value 7200
Gurobi Optimizer version 12.0.1 build v12.0.1rc0 (win64 - Windows 11.0 (26100.2))

CPU model: Intel(R) Core(TM) i5-9400 CPU @ 2.90GHz, instruction set [SSE2|AVX|AVX2]
Thread count: 6 physical cores, 6 logical processors, using up to 6 threads

Non-default parameters:
TimeLimit  7200

Optimize a model with 1536 rows, 506 columns and 8272 nonzeros
Model fingerprint: 0xb2766e98
Variable types: 7 continuous, 499 integer (483 binary)
Coefficient statistics:
  Matrix range     [1e-05, 1e+02]
  Objective range  [5e-01, 3e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 1e+02]
Presolve removed 472 rows and 7 columns
Presolve time: 0.03s
Presolved: 1064 rows, 499 columns, 6444 nonzeros
Variable types: 3 continuous, 496 integer (476 binary)

Root relaxation: objective 1.500000e+00, 229 iterations, 0.01 seconds (0.00 work units)

    N

### Cross Validation Set 2

In [20]:
#CART
clf = DecisionTreeClassifier(criterion='gini', max_depth=D, random_state=42)

cart_in_ac2 = cart_accuracy(x_train2,y_train2,x_test2,y_test2,clf)[0]
cart_out_ac2 = cart_accuracy(x_train2,y_train2,x_test2,y_test2,clf)[1]
print(f'CART in-sample accuracy for set 2: {cart_in_ac2}')
print(f'CART out-sample accuracy for set 2: {cart_out_ac2}')

clf.fit(x_train2, y_train2)

"""
Tree Structure: Warm start for OCT
"""
feature_indices = warmstart_cart(clf)[0]
threshold_indices = warmstart_cart(clf)[1]

# OCT 
OCT = oct(df_train2,D,0.5,7200,feature_indices,threshold_indices,2,0.5,1,2)
OCT_ac_in2 = OCT.compute_accuracy(x_train2,y_train2)
OCT_ac_out2 = OCT.compute_accuracy(x_test2,y_test2)
print(f'OCT in-sample accuracy for set 2: {OCT_ac_in2}')
print(f'OCT out-sample accuracy for set 2: {OCT_ac_out2}')

CART in-sample accuracy for set 2: 0.9464285714285714
CART out-sample accuracy for set 2: 0.9736842105263158
Set parameter TimeLimit to value 7200
Set parameter Cuts to value 2
Set parameter Heuristics to value 0.5
Set parameter MIPFocus to value 1
Set parameter Presolve to value 2
Gurobi Optimizer version 12.0.1 build v12.0.1rc0 (win64 - Windows 11.0 (26100.2))

CPU model: Intel(R) Core(TM) i5-9400 CPU @ 2.90GHz, instruction set [SSE2|AVX|AVX2]
Thread count: 6 physical cores, 6 logical processors, using up to 6 threads

Non-default parameters:
TimeLimit  7200
Heuristics  0.5
MIPFocus  1
Cuts  2
Presolve  2

Optimize a model with 1523 rows, 502 columns and 8208 nonzeros
Model fingerprint: 0x23b12e77
Variable types: 7 continuous, 495 integer (479 binary)
Coefficient statistics:
  Matrix range     [1e-05, 1e+02]
  Objective range  [5e-01, 3e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 1e+02]

User MIP start did not produce a new incumbent solution

Presolve removed 4

### Cross Validation Set 3

In [21]:
#CART
clf = DecisionTreeClassifier(criterion='gini', max_depth=D, random_state=42)

cart_in_ac3 = cart_accuracy(x_train3,y_train3,x_test3,y_test3,clf)[0]
cart_out_ac3 = cart_accuracy(x_train3,y_train3,x_test3,y_test3,clf)[1]
print(f'CART in-sample accuracy for set 3: {cart_in_ac3}')
print(f'CART out-sample accuracy for set 3: {cart_out_ac3}')

clf.fit(x_train3, y_train3)

"""
Tree Structure: Warm start for OCT
"""
feature_indices = warmstart_cart(clf)[0]
threshold_indices = warmstart_cart(clf)[1]

# OCT 
OCT = oct(df_train3,D,0.5,7200,feature_indices,threshold_indices,2,0.5,1,2)
OCT_ac_in3 = OCT.compute_accuracy(x_train3,y_train3)
OCT_ac_out3 = OCT.compute_accuracy(x_test3,y_test3)
print(f'OCT in-sample accuracy for set 3: {OCT_ac_in3}')
print(f'OCT out-sample accuracy for set 3: {OCT_ac_out3}')

CART in-sample accuracy for set 3: 0.9553571428571429
CART out-sample accuracy for set 3: 0.9736842105263158
Set parameter TimeLimit to value 7200
Set parameter Cuts to value 2
Set parameter Heuristics to value 0.5
Set parameter MIPFocus to value 1
Set parameter Presolve to value 2
Gurobi Optimizer version 12.0.1 build v12.0.1rc0 (win64 - Windows 11.0 (26100.2))

CPU model: Intel(R) Core(TM) i5-9400 CPU @ 2.90GHz, instruction set [SSE2|AVX|AVX2]
Thread count: 6 physical cores, 6 logical processors, using up to 6 threads

Non-default parameters:
TimeLimit  7200
Heuristics  0.5
MIPFocus  1
Cuts  2
Presolve  2

Optimize a model with 1523 rows, 502 columns and 8204 nonzeros
Model fingerprint: 0x94193814
Variable types: 7 continuous, 495 integer (479 binary)
Coefficient statistics:
  Matrix range     [1e-05, 1e+02]
  Objective range  [5e-01, 3e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 1e+02]

User MIP start did not produce a new incumbent solution

Presolve removed 4

### Cross Validation Set 4

In [22]:
#CART
clf = DecisionTreeClassifier(criterion='gini', max_depth=D, random_state=42)

cart_ac = cart_accuracy(x_train4,y_train4,x_test4,y_test4,clf)
cart_in_ac4 = cart_ac[0]
cart_out_ac4 = cart_ac[1]
print(f'CART in-sample accuracy for set 4: {cart_in_ac4}')
print(f'CART out-sample accuracy for set 4: {cart_out_ac4}')

clf.fit(x_train4, y_train4) #change this for testing!!!!!!!!

"""
Tree Structure: Warm start for OCT
"""
feature_indices = warmstart_cart(clf)[0]
threshold_indices = warmstart_cart(clf)[1]

# OCT 
OCT = oct(df_train4,D,0.5,7200,feature_indices,threshold_indices,2,0.5,1,2)
OCT_ac_in4 = OCT.compute_accuracy(x_train4,y_train4)
OCT_ac_out4 = OCT.compute_accuracy(x_test4,y_test4)
print(f'OCT in-sample accuracy for set 4: {OCT_ac_in4}')
print(f'OCT out-sample accuracy for set 4: {OCT_ac_out4}')

CART in-sample accuracy for set 4: 0.9823008849557522
CART out-sample accuracy for set 4: 0.8918918918918919
Set parameter TimeLimit to value 7200
Set parameter Cuts to value 2
Set parameter Heuristics to value 0.5
Set parameter MIPFocus to value 1
Set parameter Presolve to value 2
Gurobi Optimizer version 12.0.1 build v12.0.1rc0 (win64 - Windows 11.0 (26100.2))

CPU model: Intel(R) Core(TM) i5-9400 CPU @ 2.90GHz, instruction set [SSE2|AVX|AVX2]
Thread count: 6 physical cores, 6 logical processors, using up to 6 threads

Non-default parameters:
TimeLimit  7200
Heuristics  0.5
MIPFocus  1
Cuts  2
Presolve  2

Optimize a model with 1536 rows, 506 columns and 8280 nonzeros
Model fingerprint: 0xfa432f02
Variable types: 7 continuous, 499 integer (483 binary)
Coefficient statistics:
  Matrix range     [1e-05, 1e+02]
  Objective range  [5e-01, 3e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 1e+02]

User MIP start did not produce a new incumbent solution

Presolve removed 4

# XGBoost

In [23]:
xgboost_in_ac = []
xgboost_out_ac = []
for i in range(1, 5):  
    x_train = eval(f'x_train{i}')
    y_train = eval(f'y_train{i}')
    x_test = eval(f'x_test{i}')
    y_test = eval(f'y_test{i}')
    
    if k >= 3:
        model = xgb.XGBClassifier(objective='multi:softmax',num_class=k,use_label_encoder=False, max_depth=D)
    else:
        model = xgb.XGBClassifier(objective='binary:logistic',use_label_encoder=False, max_depth=D)
        
    model.fit(x_train, y_train)

    y_fit = model.predict(x_train)
    y_pred = model.predict(x_test)

    acc_train = accuracy_score(y_train, y_fit)
    acc_test = accuracy_score(y_test, y_pred)

    xgboost_in_ac.append(acc_train)
    xgboost_out_ac.append(acc_test)
    
    print(f'XGBoost in-sample accuracy for set {i}: {acc_train}')
    print(f'XGBoost out-sample accuracy for set {i}: {acc_test}')

XGBoost in-sample accuracy for set 1: 1.0
XGBoost out-sample accuracy for set 1: 0.9459459459459459
XGBoost in-sample accuracy for set 2: 1.0
XGBoost out-sample accuracy for set 2: 1.0
XGBoost in-sample accuracy for set 3: 1.0
XGBoost out-sample accuracy for set 3: 0.9473684210526315
XGBoost in-sample accuracy for set 4: 1.0
XGBoost out-sample accuracy for set 4: 0.8918918918918919


# Random Forest

In [24]:
rf_in_ac = []
rf_out_ac = []
for i in range(1, 5): 
    x_train = eval(f'x_train{i}')
    y_train = eval(f'y_train{i}')
    x_test = eval(f'x_test{i}')
    y_test = eval(f'y_test{i}')

    rf = RandomForestClassifier(n_estimators=100, random_state=42,max_depth=D) #build 100 decision trees
    rf.fit(x_train, y_train)

    y_fit = rf.predict(x_train)
    y_pred = rf.predict(x_test)

    acc_train = accuracy_score(y_train, y_fit)
    acc_test = accuracy_score(y_test, y_pred)
    
    rf_in_ac.append(acc_train)
    rf_out_ac.append(acc_test)

    print(f"=== Dataset {i} ===")
    print("Random Forest Train Accuracy:", acc_train)
    print("Random Forest Test Accuracy:", acc_test)
    print()

=== Dataset 1 ===
Random Forest Train Accuracy: 0.9734513274336283
Random Forest Test Accuracy: 0.9459459459459459

=== Dataset 2 ===
Random Forest Train Accuracy: 0.9464285714285714
Random Forest Test Accuracy: 1.0

=== Dataset 3 ===
Random Forest Train Accuracy: 0.9553571428571429
Random Forest Test Accuracy: 0.9736842105263158

=== Dataset 4 ===
Random Forest Train Accuracy: 0.9823008849557522
Random Forest Test Accuracy: 0.8648648648648649



# OCT-H

In [25]:
'''
from optimal_classification_tree1 import oct_h
OCTH1 = oct_h(df_train1,D,0.5,7200)
octh_in_acc1 = OCTH1.compute_accuracy(x_train1,y_train1)
octh_out_acc1 = OCTH1.compute_accuracy(x_test1,y_test1)

OCTH2 = oct_h(df_train2,D,0.5,7200)
octh_in_acc2 = OCTH2.compute_accuracy(x_train2,y_train2)
octh_out_acc2 = OCTH2.compute_accuracy(x_test2,y_test2)

OCTH3 = oct_h(df_train3,D,0.5,7200)
octh_in_acc3 = OCTH3.compute_accuracy(x_train3,y_train3)
octh_out_acc3 = OCTH3.compute_accuracy(x_test3,y_test3)

OCTH4 = oct_h(df_train4,D,0.5,7200)
octh_in_acc4 = OCTH4.compute_accuracy(x_train4,y_train4)
octh_out_acc4 = OCTH4.compute_accuracy(x_test4,y_test4)


print(f'OCT-H in-sample accuracy for set 1: {octh_in_acc1}')
print(f'OCT-H out-sample accuracy for set 1: {octh_out_acc1}')
print(f'OCT-H in-sample accuracy for set 2: {octh_in_acc2}')
print(f'OCT-H out-sample accuracy for set 2: {octh_out_acc2}')
print(f'OCT-H in-sample accuracy for set 3: {octh_in_acc3}')
print(f'OCT-H out-sample accuracy for set 3: {octh_out_acc3}')
print(f'OCT-H in-sample accuracy for set 4: {octh_in_acc4}')
print(f'OCT-H out-sample accuracy for set 4: {octh_out_acc4}')

print(f'Average in-sample accuracy for OCT-H: {(octh_in_acc1+octh_in_acc2+octh_in_acc3+octh_in_acc4)/4}')
print(f'Average out-sample accuracy for OCT-H: {(octh_out_acc1+octh_out_acc2+octh_out_acc3+octh_out_acc4)/4}')
'''

"\nfrom optimal_classification_tree1 import oct_h\nOCTH1 = oct_h(df_train1,D,0.5,7200)\nocth_in_acc1 = OCTH1.compute_accuracy(x_train1,y_train1)\nocth_out_acc1 = OCTH1.compute_accuracy(x_test1,y_test1)\n\nOCTH2 = oct_h(df_train2,D,0.5,7200)\nocth_in_acc2 = OCTH2.compute_accuracy(x_train2,y_train2)\nocth_out_acc2 = OCTH2.compute_accuracy(x_test2,y_test2)\n\nOCTH3 = oct_h(df_train3,D,0.5,7200)\nocth_in_acc3 = OCTH3.compute_accuracy(x_train3,y_train3)\nocth_out_acc3 = OCTH3.compute_accuracy(x_test3,y_test3)\n\nOCTH4 = oct_h(df_train4,D,0.5,7200)\nocth_in_acc4 = OCTH4.compute_accuracy(x_train4,y_train4)\nocth_out_acc4 = OCTH4.compute_accuracy(x_test4,y_test4)\n\n\nprint(f'OCT-H in-sample accuracy for set 1: {octh_in_acc1}')\nprint(f'OCT-H out-sample accuracy for set 1: {octh_out_acc1}')\nprint(f'OCT-H in-sample accuracy for set 2: {octh_in_acc2}')\nprint(f'OCT-H out-sample accuracy for set 2: {octh_out_acc2}')\nprint(f'OCT-H in-sample accuracy for set 3: {octh_in_acc3}')\nprint(f'OCT-H out

# Average In-Sample and Out-Sample Accuracy for CART, OCT, XGBoost, Random Forest

In [26]:
avg_in_ac_CART = (cart_in_ac1 + cart_in_ac2 + cart_in_ac3 + cart_in_ac4) / 4
avg_out_ac_CART = (cart_out_ac1 + cart_out_ac2 + cart_out_ac3 + cart_out_ac4) / 4

avg_in_ac_OCT = (OCT_ac_in1 + OCT_ac_in2 + OCT_ac_in3 + OCT_ac_in4) / 4
avg_out_ac_OCT = (OCT_ac_out1 + OCT_ac_out2 + OCT_ac_out3 + OCT_ac_out4) / 4
'''
avg_in_ac_OCTH = (octh_in_acc1 + octh_in_acc2 + octh_in_acc3 + octh_in_acc4) / 4
avg_out_ac_OCTH = (octh_out_acc1 + octh_out_acc2 + octh_out_acc3 + octh_out_acc4) / 4
'''
avg_in_ac_XGBoost = (xgboost_in_ac[0] + xgboost_in_ac[1] + xgboost_in_ac[2] + xgboost_in_ac[3]) / 4
avg_out_ac_XGBoost = (xgboost_out_ac[0] + xgboost_out_ac[1] + xgboost_out_ac[2] + xgboost_out_ac[3]) / 4

avg_in_ac_RF = (rf_in_ac[0] + rf_in_ac[1] + rf_in_ac[2] + rf_in_ac[3]) / 4
avg_out_ac_RF = (rf_out_ac[0] + rf_out_ac[1] + rf_out_ac[2] + rf_out_ac[3]) / 4

print(f'Average in-sample accuracy for CART: {avg_in_ac_CART}')
print(f'Average out-sample accuracy for CART: {avg_out_ac_CART}')

print(f'Average in-sample accuracy for OCT: {avg_in_ac_OCT}')
print(f'Average out-sample accuracy for OCT: {avg_out_ac_OCT}')
'''
print(f'Average in-sample accuracy for OCTH: {avg_in_ac_OCTH}')
print(f'Average out-sample accuracy for OCTH: {avg_out_ac_OCTH}')
'''
print(f'Average in-sample accuracy for XGBoost: {avg_in_ac_XGBoost}')
print(f'Average out-sample accuracy for XGBoost: {avg_out_ac_XGBoost}')

print(f'Average in-sample accuracy for Random Forest: {avg_in_ac_RF}')
print(f'Average out-sample accuracy for Random Forest: {avg_out_ac_RF}')

Average in-sample accuracy for CART: 0.9621720922882427
Average out-sample accuracy for CART: 0.9463015647226174
Average in-sample accuracy for OCT: 0.9621720922882427
Average out-sample accuracy for OCT: 0.9461237553342817
Average in-sample accuracy for XGBoost: 1.0
Average out-sample accuracy for XGBoost: 0.9463015647226174
Average in-sample accuracy for Random Forest: 0.9643844816687737
Average out-sample accuracy for Random Forest: 0.9461237553342817
