<center><h1  style="color:white; background-color:#000000; border-radius: 0px; padding:25px;"> Rules extraction </h1></center>

This notebook illustrates rule extraction from decision trees and from tree ensembles (random forests and gradient boosting).

In [None]:

import os
import sys
sys.path.insert(1, os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
import time

import numpy as np

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, accuracy_score,roc_auc_score

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from pysirus.models.extractors import SirusDTreeClassifier,SirusRFClassifier,SirusGBClassifier

In [None]:
np.zeros((1,2))

In [None]:
from sklearn import tree

## Load data :

In [None]:
# Load the iris dataset and hold out a test split (default test fraction, fixed seed).
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [None]:
X

In [None]:
y

## Single tree : 

In [None]:
# Baseline: plain scikit-learn decision tree (depth capped at 10, fixed seed).
clf_dtree = DecisionTreeClassifier(
    max_depth=10,
    random_state=0,
)
clf_dtree.fit(X_train, y_train)

# Hard labels feed the accuracy; class probabilities feed the PR / ROC AUC scores.
y_pred_dtree = clf_dtree.predict(X_test)
y_pred_proba_dtree = clf_dtree.predict_proba(X_test)

print('PR AUC :', average_precision_score(y_test, y_pred_proba_dtree))
print(
    'ROC AUC :',
    roc_auc_score(y_test, y_pred_proba_dtree, average='micro', multi_class='ovr'),
)
print('Accuracy :', accuracy_score(y_test, y_pred_dtree))

In [None]:
## DecisionTree rules extraction
# Shallow tree (max_depth=2) from pysirus, evaluated with the same three metrics as the baseline.
# NOTE(review): `quantile` is a pysirus fit option; presumably the number of quantile
# bins used for the split thresholds. Confirm against the pysirus documentation.
STree = SirusDTreeClassifier(
    max_depth=2,
    random_state=0,
)
STree.fit(X_train, y_train, quantile=10)

y_pred_sirus = STree.predict(X_test)
y_pred_proba_sirus = STree.predict_proba(X_test)

print('PR AUC :', average_precision_score(y_test, y_pred_proba_sirus))
print(
    'ROC AUC :',
    roc_auc_score(y_test, y_pred_proba_sirus, average='micro', multi_class='ovr'),
)
print('Accuracy :', accuracy_score(y_test, y_pred_sirus))

In [None]:
STree.feature_names_in_ = ['sepal length','sepal width','petal length','petal width']
STree.print_rules(max_rules=3)

In [None]:
STree.show_rules(max_rules=16)

In [None]:
STree.n_features_in_

## Random forest : 

In [None]:
# Usual RandomForestClassifier
# Baseline forest (trees capped at depth 10, fixed seed); the fit is timed so it can be
# compared with the rule-extraction models below.
clf_rf = RandomForestClassifier(max_depth=10, random_state=0)
# perf_counter() is monotonic and high-resolution, so the measured duration cannot be
# distorted by system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
clf_rf.fit(X_train, y_train)
end = time.perf_counter()
# NOTE: the *_dtree names are reused here to hold the forest's predictions.
y_pred_dtree = clf_rf.predict(X_test)
y_pred_proba_dtree = clf_rf.predict_proba(X_test)
print('PR AUC :', average_precision_score(y_test, y_pred_proba_dtree))
print('ROC AUC :', roc_auc_score(y_test, y_pred_proba_dtree, average='micro', multi_class='ovr'))
print('Accuracy :', accuracy_score(y_test, y_pred_dtree))
time_2 = end - start
print('Fitting time = ', time_2, 's')

In [None]:
## RandomForestClassifier rules extraction
# Variant evaluated with to_add_probas_outside_rules=True at prediction time.
# NOTE(review): quantile / batch_size_post_treatment / p0 are pysirus fit options whose
# exact semantics are not visible in this notebook. Confirm against the pysirus docs.
RFSirus = SirusRFClassifier(max_depth=2, random_state=0, splitter="quantile")
# perf_counter() is monotonic and high-resolution, so the measured duration cannot be
# distorted by system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
RFSirus.fit(X_train, y_train, quantile=10, batch_size_post_treatment=50, p0=0.01)
end = time.perf_counter()
y_pred_sirus = RFSirus.predict(X_test, to_add_probas_outside_rules=True)
y_pred_proba_sirus = RFSirus.predict_proba(X_test, to_add_probas_outside_rules=True)
print('PR AUC :', average_precision_score(y_test, y_pred_proba_sirus))
print('ROC AUC :', roc_auc_score(y_test, y_pred_proba_sirus, average='micro', multi_class='ovr'))
print('Accuracy :', accuracy_score(y_test, y_pred_sirus))
time_2 = end - start
print('Fitting time = ', time_2, 's')

In [None]:
## RandomForestClassifier rules extraction
# Same model and fit settings as the previous cell, but predictions are made with
# to_add_probas_outside_rules=False so the two prediction modes can be compared.
RFSirus = SirusRFClassifier(max_depth=2, random_state=0, splitter="quantile")
# perf_counter() is monotonic and high-resolution, so the measured duration cannot be
# distorted by system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
RFSirus.fit(X_train, y_train, quantile=10, batch_size_post_treatment=50, p0=0.01)
end = time.perf_counter()
y_pred_sirus = RFSirus.predict(X_test, to_add_probas_outside_rules=False)
y_pred_proba_sirus = RFSirus.predict_proba(X_test, to_add_probas_outside_rules=False)
print('PR AUC :', average_precision_score(y_test, y_pred_proba_sirus))
print('ROC AUC :', roc_auc_score(y_test, y_pred_proba_sirus, average='micro', multi_class='ovr'))
print('Accuracy :', accuracy_score(y_test, y_pred_sirus))
time_2 = end - start
print('Fitting time = ', time_2, 's')

In [None]:
y_pred_proba_sirus

In [None]:
y_pred_proba_sirus.sum(axis=1)

In [None]:
RFSirus.feature_names_in_ = ['sepal length','sepal width','petal length','petal width']

In [None]:
RFSirus.show_rules(max_rules=10)

In [None]:
from pysirus.models.optimizer import train_optimal_extractor_p0
from sklearn.metrics import roc_auc_score

In [None]:
def my_roc(y_true, y_pred_probas):
    """Score predicted class probabilities with a micro-averaged, one-vs-rest ROC AUC.

    Args:
        y_true: True labels, forwarded unchanged to ``roc_auc_score``.
        y_pred_probas: Predicted probabilities, forwarded unchanged to ``roc_auc_score``.

    Returns:
        The value returned by ``roc_auc_score`` with ``average='micro'`` and
        ``multi_class='ovr'``.
    """
    auc_options = {'average': 'micro', 'multi_class': 'ovr'}
    return roc_auc_score(y_true, y_pred_probas, **auc_options)
    

In [None]:
# Tune p0 for the RFSirus extractor over 15 evenly spaced values in [0.01, 0.05],
# scoring with my_roc on predicted probabilities.
# Note that the full X / y are passed here, not the train split.
# NOTE(review): presumably each grid value is cross-validated (5 splits, 1 repeat) and the
# best refitted model is returned. Confirm against pysirus.models.optimizer.
final_sirus_model = train_optimal_extractor_p0(
    clf=RFSirus,
    X_train=X,
    y_train=y,
    quantile=20,
    scoring=my_roc,
    scoring_on_probas=True,
    p0_exploration_grid=np.linspace(0.01, 0.05, 15),
    n_cv_splits=5,
    n_cv_repeats=1,
)

## Gradient boosting :

In [None]:
# Usual GradientBoostingClassifier
# Baseline boosting model (100 estimators of depth 2, fixed seed); the fit is timed so it
# can be compared with the rule-extraction models below.
GB = GradientBoostingClassifier(n_estimators=100, max_depth=2, random_state=19)
# perf_counter() is monotonic and high-resolution, so the measured duration cannot be
# distorted by system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
GB.fit(X_train, y_train)
end = time.perf_counter()
# NOTE: the *_dtree names are reused here to hold the boosting model's predictions.
y_pred_dtree = GB.predict(X_test)
y_pred_proba_dtree = GB.predict_proba(X_test)
print('PR AUC :', average_precision_score(y_test, y_pred_proba_dtree))
print('ROC AUC :', roc_auc_score(y_test, y_pred_proba_dtree, average='micro', multi_class='ovr'))
print('Accuracy :', accuracy_score(y_test, y_pred_dtree))
time_2 = end - start
print('Fitting time = ', time_2, 's')

In [None]:
# GradientBoostingClassifier rules extraction
# Variant fitted with p0=0.05 and evaluated with to_add_probas_outside_rules=True.
# NOTE(review): p0 / quantile are pysirus fit options whose exact semantics are not
# visible in this notebook. Confirm against the pysirus docs.
GBSirus = SirusGBClassifier(n_estimators=100, max_depth=2, random_state=19, splitter="quantile")
# perf_counter() is monotonic and high-resolution, so the measured duration cannot be
# distorted by system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
GBSirus.fit(X_train, y_train, p0=0.05, quantile=20)
end = time.perf_counter()
# NOTE: the *_dtree names are reused here to hold the GBSirus predictions.
y_pred_dtree = GBSirus.predict(X_test, to_add_probas_outside_rules=True)
y_pred_proba_dtree = GBSirus.predict_proba(X_test, to_add_probas_outside_rules=True)
print('PR AUC :', average_precision_score(y_test, y_pred_proba_dtree))
print('ROC AUC :', roc_auc_score(y_test, y_pred_proba_dtree, average='micro', multi_class='ovr'))
print('Accuracy :', accuracy_score(y_test, y_pred_dtree))
time_2 = end - start
print('Fitting time = ', time_2, 's')

In [None]:
# GradientBoostingClassifier rules extraction
# Variant fitted with p0=0.01 and evaluated with to_add_probas_outside_rules=False.
# NOTE(review): p0 / quantile are pysirus fit options whose exact semantics are not
# visible in this notebook. Confirm against the pysirus docs.
GBSirus = SirusGBClassifier(n_estimators=100, max_depth=2, random_state=19, splitter="quantile")
# perf_counter() is monotonic and high-resolution, so the measured duration cannot be
# distorted by system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
GBSirus.fit(X_train, y_train, p0=0.01, quantile=20)
end = time.perf_counter()
y_pred_gbsirus = GBSirus.predict(X_test, to_add_probas_outside_rules=False)
y_pred_proba_gbsirus = GBSirus.predict_proba(X_test, to_add_probas_outside_rules=False)
print('PR AUC :', average_precision_score(y_test, y_pred_proba_gbsirus))
print('ROC AUC :', roc_auc_score(y_test, y_pred_proba_gbsirus, average='micro', multi_class='ovr'))
print('Accuracy :', accuracy_score(y_test, y_pred_gbsirus))
time_2 = end - start
print('Fitting time = ', time_2, 's')

In [None]:
GBSirus.feature_names_in_ = ['sepal length','sepal width','petal length','petal width']
GBSirus.show_rules(max_rules=16)

In [None]:
GBSirus.print_rules(max_rules=10)

# Tests : 

## Experiments :

In [None]:
iris.feature_names

In [None]:
y

In [None]:
# Binarise the target: 1 for original class 0, 0 for original classes 1 and 2.
# The labels are derived from iris.target into a NEW array instead of being rewritten in
# place: `y` was the very same array as iris.target, so in-place masked writes corrupted
# iris.target and flipped the labels every time this cell was re-run. This version is
# idempotent and leaves iris.target with its three classes.
y = (iris.target == 0).astype(iris.target.dtype)
y

In [None]:
y.sum()

## Experiments 2 :

In [None]:
## RandomForestClassifier rules extraction
RFSirus = SirusRFClassifier(n_estimators=1,max_features=4,max_depth=2,bootstrap=False, random_state=19,splitter="quantile")
start = time.time()
RFSirus.fit(X,y,quantile=10,p0=0.01)
end = time.time()
time_2 = end-start
print('Fitting time = ',time_2 ,'s')
RFSirus.feature_names_in_ = ['sepal length','sepal width','petal length','petal width']
RFSirus.show_rules(max_rules=16)
tree.plot_tree(RFSirus.estimators_[0])

In [None]:
def implies(single_rule_a, single_rule_b):
    """
    Check if single_rule_a implies single_rule_b.

    An implication is only reported when both rules test the same variable and point
    in the same direction:

    * an 'L' rule implies another 'L' rule when its threshold is lower or equal;
    * any other rule implies an 'R' rule when its threshold is greater or equal.

    NOTE(review): 'L' / 'R' presumably stand for the left (value below the threshold)
    and right (value above the threshold) branch of a split. Confirm against the
    rule encoding used by the extractor.

    Args:
        single_rule_a (tuple): First single rule in the form (var, thr, dir).
        single_rule_b (tuple): Second single rule in the form (var, thr, dir).

    Returns:
        bool: True when single_rule_a implies single_rule_b, False otherwise.
    """
    # Rules on different variables never imply one another.
    if single_rule_a[0] != single_rule_b[0]:
        return False
    if single_rule_a[2] == 'L':
        # Same-direction check first, so thresholds are only compared when relevant.
        return single_rule_b[2] == 'L' and single_rule_a[1] <= single_rule_b[1]
    return single_rule_b[2] == 'R' and single_rule_a[1] >= single_rule_b[1]

In [None]:
implies(single_rule_a=(2, 1.7000000476837158, 'L'),single_rule_b=(2, 1.7000000476837158, 'L'))

In [None]:
implies(single_rule_a=(2, 1.7000000476837158, 'L'),single_rule_b=(3, 0.4000000059604645, 'L'))

In [None]:
## RandomForestClassifier rules extraction
# 1000 bootstrapped trees, max_features=4, seed 0, fitted on the full X / y (no split).
RFSirus = SirusRFClassifier(
    n_estimators=1000,
    max_features=4,
    max_depth=2,
    bootstrap=True,
    random_state=0,
    splitter="quantile",
)
start = time.time()
RFSirus.fit(X, y, quantile=10, p0=0.01)
end = time.time()
time_2 = end - start
print('Fitting time = ', time_2, 's')
RFSirus.feature_names_in_ = [
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
]
RFSirus.show_rules(max_rules=16)

## Random forest : 

In [None]:
## RandomForestClassifier rules extraction
# 1000 trees without bootstrap, max_features=4, seed 1, fitted on the full X / y (no split).
RFSirus = SirusRFClassifier(
    n_estimators=1000,
    max_features=4,
    max_depth=2,
    bootstrap=False,
    random_state=1,
    splitter="quantile",
)
start = time.time()
RFSirus.fit(X, y, quantile=10, p0=0.01)
end = time.time()
time_2 = end - start
print('Fitting time = ', time_2, 's')

In [None]:
RFSirus.feature_names_in_ = ['sepal length','sepal width','petal length','petal width']
RFSirus.show_rules(max_rules=16)

In [None]:
tree.plot_tree(RFSirus.estimators_[0])

In [None]:
tree.plot_tree(RFSirus.estimators_[800])

In [None]:
## RandomForestClassifier rules extraction
# One tree, no bootstrap, max_features=4, seed 0, fitted on the full X / y (no split).
# The extracted rules are shown next to the plot of that single tree.
RFSirus = SirusRFClassifier(
    n_estimators=1,
    max_features=4,
    max_depth=2,
    bootstrap=False,
    random_state=0,
    splitter="quantile",
)
start = time.time()
RFSirus.fit(X, y, quantile=10, batch_size_post_treatment=50, p0=0.01)
end = time.time()
time_2 = end - start
print('Fitting time = ', time_2, 's')
RFSirus.feature_names_in_ = ['sepal length', 'sepal width', 'petal length', 'petal width']
RFSirus.show_rules(max_rules=16)
# Plot the tree once: the original called plot_tree twice in a row on the same estimator,
# which only redrew the identical tree on the same axes.
tree.plot_tree(RFSirus.estimators_[0])

In [None]:
## RandomForestClassifier rules extraction
# One tree, no bootstrap, max_features=4, seed 19, fitted on the full X / y (no split).
# Its plot can be compared with the plain RandomForestClassifier built with the same seed.
RFSirus = SirusRFClassifier(
    n_estimators=1,
    max_features=4,
    max_depth=2,
    bootstrap=False,
    random_state=19,
    splitter="quantile",
)
start = time.time()
RFSirus.fit(X, y, quantile=10, p0=0.01)
end = time.time()
time_2 = end - start
print('Fitting time = ', time_2, 's')
RFSirus.feature_names_in_ = [
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
]
RFSirus.show_rules(max_rules=16)
tree.plot_tree(RFSirus.estimators_[0])

In [None]:
## RandomForestClassifier
# Plain scikit-learn forest reduced to one non-bootstrapped tree (seed 19), fitted on the
# full X / y, then plotted.
RF = RandomForestClassifier(
    n_estimators=1,
    max_features=4,
    max_depth=2,
    bootstrap=False,
    random_state=19,
)
start = time.time()
RF.fit(X, y)
end = time.time()
time_2 = end - start
print('Fitting time = ', time_2, 's')
tree.plot_tree(RF.estimators_[0])

In [None]:
## RandomForestClassifier
# Plain scikit-learn forest reduced to one non-bootstrapped tree (seed 0), fitted on the
# full X / y, then plotted.
RF = RandomForestClassifier(
    n_estimators=1,
    max_features=4,
    max_depth=2,
    bootstrap=False,
    random_state=0,
)
start = time.time()
RF.fit(X, y)
end = time.time()
time_2 = end - start
print('Fitting time = ', time_2, 's')
tree.plot_tree(RF.estimators_[0])

In [None]:
## RandomForestClassifier rules extraction
# 1000 bootstrapped trees, max_features=2, seed 0, fitted on the full X / y (no split).
RFSirus = SirusRFClassifier(
    n_estimators=1000,
    max_features=2,
    max_depth=2,
    bootstrap=True,
    random_state=0,
    splitter="quantile",
)
start = time.time()
RFSirus.fit(X, y, quantile=10, p0=0.01)
end = time.time()
time_2 = end - start
print('Fitting time = ', time_2, 's')
RFSirus.feature_names_in_ = [
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
]
RFSirus.show_rules(max_rules=16)

In [None]:
## RandomForestClassifier rules extraction
# Same settings as the seed-0 run with max_features=2, but with seed 19.
RFSirus = SirusRFClassifier(
    n_estimators=1000,
    max_features=2,
    max_depth=2,
    bootstrap=True,
    random_state=19,
    splitter="quantile",
)
start = time.time()
RFSirus.fit(X, y, quantile=10, p0=0.01)
end = time.time()
time_2 = end - start
print('Fitting time = ', time_2, 's')
RFSirus.feature_names_in_ = [
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
]
RFSirus.show_rules(max_rules=16)

## optimizing p0 :

In [None]:
## RandomForestClassifier rules extraction
# Reference fit at p0=0.01 (1000 bootstrapped trees, max_features=4, seed 1) on the full
# X / y; this RFSirus object is the one handed to the p0 optimiser below.
RFSirus = SirusRFClassifier(
    n_estimators=1000,
    max_features=4,
    max_depth=2,
    bootstrap=True,
    random_state=1,
    splitter="quantile",
)
start = time.time()
RFSirus.fit(X, y, quantile=10, p0=0.01)
end = time.time()
time_2 = end - start
print('Fitting time = ', time_2, 's')
RFSirus.feature_names_in_ = [
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
]
RFSirus.show_rules(max_rules=16)

In [None]:
from pysirus.models.optimizer import train_optimal_extractor_p0
from sklearn.metrics import roc_auc_score

In [None]:
# Tune p0 for the RFSirus extractor over 15 evenly spaced values in [0.01, 0.05], using
# 5 CV splits and 5 repeats and the plain roc_auc_score as scoring function.
# NOTE(review): unlike the earlier call that used my_roc, neither `quantile` nor
# `scoring_on_probas` is passed here. Confirm that the defaults give roc_auc_score the
# kind of predictions (labels vs. probabilities) that is intended.
final_sirus_model = train_optimal_extractor_p0(
    clf=RFSirus,
    X_train=X,
    y_train=y,
    scoring=roc_auc_score,
    p0_exploration_grid=np.linspace(0.01, 0.05, 15),
    n_cv_splits=5,
    n_cv_repeats=5,
)

In [None]:
final_sirus_model

In [None]:
final_sirus_model.feature_names_in_ = ['sepal length','sepal width','petal length','petal width']
final_sirus_model.show_rules(max_rules=16)

## Gradient boosting : 

In [None]:
## SirusGBClassifier rules extraction
# 100 boosting stages of depth 2 (seed 19), fitted on the full X / y (no split).
GBSirus = SirusGBClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=19,
    splitter="quantile",
)
start = time.time()
GBSirus.fit(X, y, quantile=10, p0=0.01)
end = time.time()
time_2 = end - start
print('Fitting time = ', time_2, 's')
# Feature names are assigned and the rules printed in the next cell.

In [None]:
GBSirus.feature_names_in_ = ['sepal length','sepal width','petal length','petal width']
GBSirus.print_rules(max_rules=4)

In [None]:
tree.plot_tree(GBSirus.estimators_[0,0])

In [None]:
tree.plot_tree(GBSirus.estimators_[1,0])

In [None]:
tree.plot_tree(GBSirus.estimators_[2,0])

In [None]:
tree.plot_tree(GBSirus.estimators_[3,0])

In [None]:
tree.plot_tree(GBSirus.estimators_[4,0])

In [None]:
tree.plot_tree(GBSirus.estimators_[99,0])