In [1]:
import pandas as pd
import numpy as np
import plotly_express as px

from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from bayesian_decision_tree.classification import BinaryClassificationNode
# from bayesian_decision_tree.classification import MultiClassificationNode
# from bayesian_decision_tree.regression import RegressionNode

# Seed NumPy's global random generator once, up front, so that code drawing
# from it is repeatable when the notebook is run top to bottom on a fresh kernel.
SEED = 11
np.random.seed(SEED)

# Experiment 1 - Bayesian Decision Trees exploration

Let's compare the performance of Bayesian decision trees with that of standard decision trees and Random Forests. In [the paper](https://arxiv.org/abs/1901.03214), the authors claim performance similar to that of Random Forests.

I will use [the official implementation](https://github.com/UBS-IB/bayesian_tree/) in Python to perform the experiments.

---

## Data set

I will use the Breast Cancer Wisconsin data set, which is small enough to keep the running time of the experiment short:

In [2]:
# Load the Breast Cancer Wisconsin data set and pull out the feature matrix
# and the label vector.
cancer_data = load_breast_cancer()
X = cancer_data.data
y = cancer_data.target

In [3]:
# Show the dimensions of the feature matrix and of the label vector
# (the last expression of the cell is what gets displayed).
tuple(arr.shape for arr in (X, y))

((569, 30), (569,))

In [4]:
# Hold out 30% of the samples for evaluation. The split has its own fixed
# random_state, independent of the global NumPy seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

---

## Bayesian Decision Tree model

### Model settings
The implementation lets us choose the prior probability of each class and the prior probability of splitting a node into sub-nodes.

In [5]:
# Class prior handed to the Bayesian tree: one entry per class, both equal,
# so neither class is favoured a priori.
prior = np.ones(2, dtype=int)

In [6]:
# Root node of the Bayesian decision tree.
#   prior           - class prior defined in the previous cell
#   partition_prior - prior probability of splitting a node into sub-nodes
bdt_model = BinaryClassificationNode(
    prior=prior,
    partition_prior=0.8,
)

### Training the model

In [7]:
# Third argument passed to bdt_model.fit in the next cell.
# NOTE(review): presumably controls how the prior changes with tree depth
# (0 = no change) — confirm against the bayesian_decision_tree documentation.
delta = 0

In [8]:
# Grow the Bayesian tree on the training split only; delta is passed
# positionally as the third argument.
bdt_model.fit(X_train, y_train, delta)

### Exploring the model

 * Is the obtained model interpretable?  
 * What is its performance?

In [9]:
# Printing the root node renders the whole fitted tree as text: each inner
# node shows the split feature index and threshold, each leaf shows the
# predicted class y and the class probabilities p(y).
print(bdt_model)

x7=0.05128
 ┣ <0.05128: x20=16.83
 ┃ ┣ <16.83: y=1, p(y)=[ 0.02586207  0.97413793]
 ┃ ┗ ≥16.83: y=0, p(y)=[ 0.52941176  0.47058824]
 ┗ ≥0.05128: x22=114.44999999999999
    ┣ <114.44999999999999: x21=25.655
    ┃ ┣ <25.655: y=1, p(y)=[ 0.14285714  0.85714286]
    ┃ ┗ ≥25.655: y=0, p(y)=[ 0.95454545  0.04545455]
    ┗ ≥114.44999999999999: y=0, p(y)=[ 0.99137931  0.00862069]


The model is as interpretable as a standard decision tree.

In [10]:
# Evaluate the Bayesian tree on the held-out split: accuracy is the share of
# test samples whose predicted label equals the true label.
bdt_predictions = bdt_model.predict(X_test)
bdt_acc = (bdt_predictions == y_test).sum() / len(X_test)
print(f"Bayesian Decision Tree Accuracy = {np.round(bdt_acc, 3)}")

Bayesian Decision Tree Accuracy = 0.965


## Decision Tree Model

In [11]:
# Baseline 1: a single scikit-learn decision tree with default hyper-parameters.
# NOTE(review): no random_state is passed, so the fit presumably draws from the
# global NumPy generator seeded in the first cell and depends on cell execution
# order — confirm, and consider passing random_state explicitly.
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

# Score on the held-out split, rounded to 3 decimals for display.
# The result was previously stored as `df_acc`, a typo for `dt_acc` (compare
# `bdt_acc` and `rf_acc`); the old name is kept as an alias so that any code
# still reading it keeps working.
dt_acc = np.round(dt_model.score(X_test, y_test), decimals=3)
df_acc = dt_acc
print("Decision Tree Accuracy = {}".format(dt_acc))

Decision Tree Accuracy = 0.936


## Random Forest Model

In [12]:
# Baseline 2: a small random forest of 10 trees, fitted on the training split.
rf_model = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)

# Score on the held-out split, rounded to 3 decimals for display.
rf_acc = np.round(rf_model.score(X_test, y_test), 3)
print(f"Random Forest Accuracy = {rf_acc}")

Random Forest Accuracy = 0.959


---

# Conclusions

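On this single train/test split, the Bayesian decision tree (accuracy 0.965) performs slightly better than both the standard decision tree (0.936) and the Random Forest (0.959). It is also fully interpretable, so these results are promising.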

## Next steps
Experiments with larger and more complex data sets should be done.