In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [15]:
seed = 2022
X = pd.read_csv('data/lucas0_train.csv')
y = X.Lung_cancer.values
X.drop(['Lung_cancer'], axis=1, inplace=True)
# X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)

In [16]:
X.head()

Unnamed: 0,Smoking,Yellow_Fingers,Anxiety,Peer_Pressure,Genetics,Attention_Disorder,Born_an_Even_Day,Car_Accident,Fatigue,Allergy,Coughing
0,0,0,1,0,0,1,0,1,0,1,0
1,0,1,0,0,0,0,1,0,1,0,1
2,1,1,1,0,1,1,1,1,1,1,1
3,0,0,0,1,0,0,1,0,0,0,0
4,1,1,1,0,0,1,1,1,1,0,0


In [17]:
X_train.shape

(1600, 11)

# Fit Global Sufficient Rules

In [38]:
from acv_explainers import ACXplainer
from sklearn.metrics import roc_auc_score, accuracy_score

# It has the same params as a Random Forest, and it should be tuned to maximize the performance.  
acv_xplainer = ACXplainer(classifier=True, n_estimators=50, max_depth=6, mtry=X.shape[1])
acv_xplainer.fit(X_train, y_train)

In [39]:
roc_auc_score(y_test, acv_xplainer.predict_proba(X_test)[:, 1])

0.9078624787286205

### 1- Compute Sufficient Explanations

In [40]:
sdp_importance, sdp_index, size, sdp = acv_xplainer.importance_ddp_rf(X_train, y_train.astype(np.double),
                                                                      X_train, y_train.astype(np.double), stop=False, 
                                                                      pi_level=0.9)

100%|██████████████████████████████████████████| 50/50 [00:00<00:00, 253.92it/s]
100%|████████████████████████████████████████| 10/10 [1:09:54<00:00, 419.45s/it]


In [52]:
np.argmax(sdp >= 0.90)

2

In [64]:
for i in range(sdp.shape[0]):
    if sdp[i]>=0.9:
        print(list(X_train.columns[sdp_index[i, :size[i]]]))

['Coughing', 'Genetics', 'Fatigue', 'Anxiety']
['Coughing', 'Fatigue']
['Fatigue', 'Car_Accident', 'Peer_Pressure']
['Coughing', 'Smoking', 'Allergy', 'Genetics', 'Anxiety']
['Allergy', 'Fatigue']
['Coughing', 'Allergy', 'Anxiety', 'Peer_Pressure']
['Coughing', 'Genetics']
['Allergy']
['Allergy']
['Coughing', 'Allergy']
['Coughing']
['Coughing']
['Coughing', 'Smoking', 'Genetics', 'Anxiety', 'Attention_Disorder']
['Allergy', 'Fatigue', 'Car_Accident']
['Allergy']
['Attention_Disorder']
['Allergy', 'Genetics', 'Fatigue', 'Car_Accident', 'Born_an_Even_Day', 'Attention_Disorder', 'Peer_Pressure']
['Born_an_Even_Day']
['Fatigue']
['Coughing', 'Genetics', 'Fatigue', 'Anxiety', 'Born_an_Even_Day']
['Genetics']
['Coughing', 'Allergy', 'Genetics', 'Anxiety']
['Allergy']
['Coughing']
['Allergy']
['Allergy', 'Fatigue']
['Allergy']
['Car_Accident']
['Coughing', 'Genetics', 'Fatigue', 'Anxiety', 'Born_an_Even_Day']
['Allergy']
['Smoking', 'Allergy', 'Anxiety']
['Allergy', 'Fatigue']
['Smoking', 'G

In [53]:
sdp[2]

0.9029749696055255

In [54]:
sdp_index[2]

array([10,  4,  8,  2, -1, -1, -1, -1, -1, -1, -1])

In [55]:
S_star[2]

[10, 4, 8, 2]

In [62]:
for a in S_star[2]:
    print(list(X_train.columns[[1, 2, 3]]))

['Yellow_Fingers', 'Anxiety', 'Peer_Pressure']
['Yellow_Fingers', 'Anxiety', 'Peer_Pressure']
['Yellow_Fingers', 'Anxiety', 'Peer_Pressure']
['Yellow_Fingers', 'Anxiety', 'Peer_Pressure']


In [50]:
X_train.columns[4]

'Genetics'

In [48]:
from acv_explainers.utils import get_active_null_coalition_list

S_star, N_star = get_active_null_coalition_list(sdp_index, size)

### 2- Compute Sufficient Rules

In [9]:
sdp, rules, sdp_all, rules_data, w = acv_xplainer.compute_sdp_maxrules(X_train, y_train.astype(np.double),
                                                         X_train, y_train.astype(np.double), S_star, verbose=True)

100%|█████████████████████████████████████████| 455/455 [00:33<00:00, 13.69it/s]


### 3- Compute Global Sufficient Rules (G-SR)

In [10]:
acv_xplainer.fit_global_rules(X_train, y_train, rules, S_star)

#### Rules stats

- Number of rules

In [11]:
acv_xplainer.rules.shape

(93, 30, 2)

- Rules accuracy

In [12]:
print(acv_xplainer.rules_acc[:10])

[1.0, 0.9949494949494949, 0.9772727272727273, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


- Rules Coverage

In [13]:
print(acv_xplainer.rules_coverage[:10])

[0.12087912087912088, 0.4351648351648352, 0.0967032967032967, 0.006593406593406593, 0.002197802197802198, 0.004395604395604396, 0.2813186813186813, 0.14725274725274726, 0.07692307692307693, 0.3912087912087912]


### Global model prediction 

In [14]:
y_test_pred = acv_xplainer.predict_proba_global_rules(X_test.values, min_acc=0.9)

### 4-  Compute the coverage and the precision of G-SR

In [15]:
y_o = []
y_r = []
for i, ya in enumerate(y_test_pred[0]):
    if ya != None:
        y_o.append(ya)
        y_r.append(y_test.values[i])
        
y_o = np.array(y_o, dtype=int)

print('Accuracy = {} --- Test Coverage = {}'.format(accuracy_score(y_r, y_o), len(y_r)/X_test.shape[0]))


Accuracy = 0.9523809523809523 --- Test Coverage = 0.9210526315789473


In [19]:
# # Test webApp

# import acv_app
# import os

# # compile the ACXplainer
# acv_app.compile_ACXplainers(acv_xplainer, X_train, y_train, X_test, y_test, path=os.getcwd())

# # Launch the webApp
# acv_app.run_webapp(pickle_path=os.getcwd())

# Baseline models

In [19]:
from imodels import BoostedRulesClassifier, BayesianRuleListClassifier, GreedyRuleListClassifier, SkopeRulesClassifier # see more models below
from imodels import SLIMRegressor, RuleFitRegressor, RuleFitClassifier

# Rule Fit

In [20]:
rf = RuleFitClassifier()  # initialize a model
rf.fit(X_train, y_train)  

RuleFitClassifier()

In [21]:
accuracy_score(y_test, rf.predict(X_test))

0.9473684210526315

# Skoped Rule

In [26]:
rf = SkopeRulesClassifier(n_estimators=100, precision_min=0.9)  # initialize a model
rf.fit(X_train, y_train)  

accuracy_score(y_test, rf.predict(X_test))

0.9210526315789473

# Decision Tree

In [27]:
from sklearn.tree import DecisionTreeClassifier

In [28]:
rf = DecisionTreeClassifier()  # initialize a model
rf.fit(X_train, y_train)  

accuracy_score(y_test, rf.predict(X_test))

0.9473684210526315

# Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
rf = RandomForestClassifier()  # initialize a model
rf.fit(X_train, y_train)  

accuracy_score(y_test, rf.predict(X_test))

0.9736842105263158