In [1]:
%load_ext autoreload
%autoreload 2
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from lark import Lark
import time 
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from IPython.display import Markdown, display
# Render the heading of the next section as rich Markdown output.
section_heading = Markdown("# Prepare dataset")
display(section_heading)

# Prepare dataset

In [2]:
from data.objects import zoo

dataObj = zoo.Zoo()
df = dataObj.get_df()

# Binarise the target: every label listed in `target_class` becomes the
# positive class (1); every other label becomes the negative class (0).
# NOTE(review): the mapping enumerates 1..n_unique, i.e. it assumes the raw
# labels are the consecutive integers starting at 1 -- confirm for new datasets.
target_class = [1,2,3]
_temp = {
    label: int(label in target_class)
    for label in range(1, len(df[dataObj.target].unique()) + 1)
}
print(_temp)
df[dataObj.target] = df[dataObj.target].map(_temp)


# declaration of classifier, X and y
X = df.drop([dataObj.target], axis=1)
y = df[dataObj.target]

# Split dataset into training set and test set: 90% training and 10% test.
# The seed is fixed so that "Restart Kernel -> Run All" reproduces the same
# split (and therefore the same accuracies further down).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)


print(df.head())
display(Markdown("# Train the blackbox"))

-number of samples: (before dropping nan rows) 101
-number of samples: (after dropping nan rows) 101
{1: 1, 2: 1, 3: 1, 4: 0, 5: 0, 6: 0, 7: 0}
   hair  feathers  eggs  milk  airborne  aquatic  predator  toothed  backbone  \
0     1         0     0     1         0        0         1        1         1   
1     1         0     0     1         0        0         0        1         1   
2     0         0     1     0         0        1         1        1         1   
3     1         0     0     1         0        0         1        1         1   
4     1         0     0     1         0        0         1        1         1   

   breathes  venomous  fins  legs  tail  domestic  catsize  class_type  
0         1         0     0   0.5     0         0        1           1  
1         1         0     0   0.5     1         0        1           1  
2         0         0     1   0.0     1         0        0           0  
3         1         0     0   0.5     0         0        1           1  
4   

# Train the blackbox

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.metrics import roc_auc_score

# Three candidate black boxes. All of them are seeded so that re-running the
# notebook yields the same models (the MLP was already seeded with 1).
clf_rf = RandomForestClassifier(n_estimators=100, random_state=1)
clf_mlp = MLPClassifier(random_state=1, max_iter=300)

# Each model is fitted exactly once; the MLP used to be fitted at construction
# and then a second time here, which only doubled the training cost.
clf_rf.fit(X_train, y_train)
clf_mlp.fit(X_train, y_train)


# Tune the depth of the decision tree by cross-validated grid search and keep
# the best estimator as the decision-tree black box.
param_grid = {'max_depth': np.arange(3, 10)}
grid_tree = GridSearchCV(tree.DecisionTreeClassifier(random_state=1), param_grid)
grid_tree.fit(X_train, y_train)
# Probability of the positive class on the held-out set, scored with ROC AUC.
tree_preds = grid_tree.predict_proba(X_test)[:, 1]
tree_performance = roc_auc_score(y_test, tree_preds)
clf_dt = grid_tree.best_estimator_



print("Accuracy decision tree:",metrics.accuracy_score(y_test, clf_dt.predict(X_test)))
print("Accuracy random forest:",metrics.accuracy_score(y_test, clf_rf.predict(X_test)))
print("Accuracy neural netwrk:",metrics.accuracy_score(y_test, clf_mlp.predict(X_test)))




Accuracy decision tree: 0.9090909090909091
Accuracy random forest: 1.0
Accuracy neural netwrk: 1.0


In [4]:
from sklearn.tree import _tree
def tree_to_code( tree, feature_names):
    """Render a fitted decision tree as the source text of a Python function.

    Parameters
    ----------
    tree : fitted estimator exposing ``tree_`` (and optionally ``classes_``
        and ``n_outputs_``), e.g. a scikit-learn ``DecisionTreeClassifier``.
    feature_names : list of str
        Name of each input column, indexed by the feature ids stored in
        ``tree.tree_.feature``.

    Returns
    -------
    str
        A ``def tree(<features>):`` header, a blank line, and one nested
        ``if <feature> <= <threshold>: ... else: ...`` per split, with a
        ``return <label>`` at every leaf. Every line ends with a newline.

    Notes
    -----
    A leaf returns the class *label* of the majority class (looked up in
    ``tree.classes_``), not the position of that class inside ``tree_.value``.
    The two differ whenever the labels seen during fitting are not exactly
    0..n-1, for instance when the tree was fitted on a single class. If
    ``classes_`` is missing or the tree has several outputs, the argmax index
    is emitted unchanged.
    """
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    # Only a single-output classifier has a flat classes_ array to index into.
    class_labels = getattr(tree, "classes_", None)
    if getattr(tree, "n_outputs_", 1) != 1:
        class_labels = None

    def leaf_label(node):
        """Return the label predicted at leaf ``node`` as a plain Python value."""
        winner = np.argmax(tree_.value[node][0])
        label = winner if class_labels is None else class_labels[winner]
        # Unwrap numpy scalars so they print as 1 / True / 'name'.
        return label.item() if hasattr(label, "item") else label

    # Header followed by one empty line, exactly as the printed function expects.
    lines = ["def tree({}):".format(", ".join(feature_names)), ""]

    def recurse(node, depth):
        """Append the source lines of the subtree rooted at ``node``."""
        indent = "    " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            # Internal node: left child holds the samples with value <= threshold.
            name = feature_name[node]
            threshold = tree_.threshold[node]
            lines.append("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            lines.append("{}else:".format(indent))
            recurse(tree_.children_right[node], depth + 1)
        else:
            lines.append("{}return {!r}".format(indent, leaf_label(node)))

    recurse(0, 1)
    return "\n".join(lines) + "\n"


    
# Show the tuned decision tree as Python-like source, then open the next section.
dt_feature_names = X_train.columns.to_list()
print(tree_to_code(clf_dt, dt_feature_names))
display(Markdown("## test the random generator"))

def tree(hair, feathers, eggs, milk, airborne, aquatic, predator, toothed, backbone, breathes, venomous, fins, legs, tail, domestic, catsize):

    if backbone <= 0.5:
        return 0
    else:
        if breathes <= 0.5:
            if venomous <= 0.5:
                return 0
            else:
                return 1
        else:
            if aquatic <= 0.5:
                return 1
            else:
                if legs <= 0.375:
                    return 1
                else:
                    if milk <= 0.5:
                        return 0
                    else:
                        return 1



## test the random generator

In [5]:
from data.objects import helper_functions

# Draw a few synthetic samples and print what each black box predicts for
# them, one labelled line per model.
blackboxes = (
    ("random forest :", clf_rf),
    ("neural network:", clf_mlp),
    ("decision tree :", clf_dt),
)

for i in range(5):
    x = helper_functions.random_generator(X, dataObj.attribute_type)
    print()
    print(x)
    for caption, blackbox in blackboxes:
        print(caption, blackbox.predict([x]))


[1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0.5163022407704372, 1, 1, 1]
random forest : [1]
neural network: [1]
decision tree : [0]

[0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0.28221343812383093, 1, 1, 1]
random forest : [1]
neural network: [1]
decision tree : [0]

[1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0.2956122184396659, 1, 0, 0]
random forest : [1]
neural network: [1]
decision tree : [1]

[1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0.7422331909417709, 0, 1, 0]
random forest : [0]
neural network: [0]
decision tree : [0]

[1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0.24904321486171244, 0, 0, 0]
random forest : [1]
neural network: [1]
decision tree : [0]


In [6]:
# A bare `tree_to_code(clf_dt, ...)` call used to precede this line. Because it
# was not the last expression of the cell its result was silently discarded,
# so it has been dropped; the tree source is printed where tree_to_code is defined.
display(Markdown("# Design the query"))

# Design the query

In [16]:
import sys

# Make the modules under trustable_explanation/ importable. `sys` is imported
# here because no earlier cell does so, and the guard keeps the path from
# being appended again every time the cell is re-run.
_explanation_dir = "trustable_explanation/"
if _explanation_dir not in sys.path:
    sys.path.append(_explanation_dir)
import query
import operator
from blackbox_dt import DecisionTree


# our query is a halfspace and conjunction of the following
# (each entry maps a feature name to an (operator, value) constraint)
dict_query = {}
# dict_query['backbone'] = (operator.eq, 1)
# dict_query['breathes'] = (operator.eq, 0)
# dict_query['aquatic'] = (operator.eq, 0)
# dict_query['tail'] = (operator.eq, 0)
dict_query['venomous'] = (operator.eq, 1)



# We define a query specialized for decision trees
bb_dt = DecisionTree(features=X_train.columns.tolist(), halfspace=dict_query)
print(bb_dt)

# The query has no model of its own; it is evaluated through bb_dt's predicate.
q = query.Query(model = None, prediction_function = bb_dt.predict_function_query)


from blackbox import BlackBox
# The black box being explained is the tuned decision tree.
bb = BlackBox(clf_dt, clf_dt.predict)
display(Markdown("# Teacher"))

Query is -->
	venomous = 1



# Teacher

In [17]:
from teacher import Teacher
from learner import Learner
from sygus_if import SyGuS_IF

# Alternative SyGuS-based learner, kept for reference:
# sgf = SyGuS_IF(feature_names=dataObj.attributes, feature_data_type=dataObj.attribute_type, function_return_type= "Bool")
# l = Learner(model = sgf, prediction_function = sgf.predict_z3, train_function = sgf.fit, X = [], y=[] )

# The learner is an untuned decision tree that starts without training data.
dt_classifier = tree.DecisionTreeClassifier()
l = Learner(model = dt_classifier, prediction_function = dt_classifier.predict, train_function = dt_classifier.fit, X = [], y=[] )


t = Teacher(max_iterations=100,epsilon=0.05, delta=0.05, timeout=40)
_teach_start = time.time()
l, flag = t.teach(blackbox = bb, learner = l, query = q, random_example_generator = helper_functions.random_generator, params_generator = (X_train,dataObj.attribute_type), verbose=False)
# Stop the clock right away so the reported duration covers the teaching loop
# only, not the rendering and printing below.
_teach_seconds = time.time() - _teach_start

# Render the learned tree once; it is both printed and written to the log.
explanation_code = tree_to_code(l.model,X_train.columns.to_list())


display(Markdown("### Result"))
# print("Learned explanation =>", l.model._function_snippet)
print(str(bb_dt))
print("Learned explanation =>", explanation_code, "\n\n")
print("-is learning complete?", flag)
print("-it took", _teach_seconds, "seconds")


# Fidelity check on the held-out examples: the learner should answer exactly
# like "black box AND query" on every example.
cnt = 0
for example in X_test.values.tolist():

    blackbox_verdict = bb.classify_example(example)
    learner_verdict = l.classify_example(example)
    query_verdict = q.classify_example(example)

    if(learner_verdict == (blackbox_verdict and query_verdict)):
        cnt += 1
print("correct: ", cnt, "out of ", len(y_test), "examples. Percentage: ", cnt/len(y_test))


# Append this run to the log; the context manager closes the file even if a
# write fails part-way through.
with open("data/output/log.txt", "a") as f:
    f.write("=================================================\n")
    f.write(str(bb_dt))
    f.write("\n\nExplanation is -->\n")
    # f.write(l.model._function_snippet)
    f.write(explanation_code)
    f.write("\n")
    f.write("\n-did teacher terminate? " + str(flag) + "\n\n")




Learning complete
-total examples checked: 1792


### Result

Query is -->
	venomous = 1

Learned explanation => def tree(hair, feathers, eggs, milk, airborne, aquatic, predator, toothed, backbone, breathes, venomous, fins, legs, tail, domestic, catsize):

    if backbone <= 0.5:
        return 0
    else:
        if venomous <= 0.5:
            return 0
        else:
            if milk <= 0.5:
                if legs <= 0.3846162110567093:
                    return 1
                else:
                    if breathes <= 0.5:
                        return 1
                    else:
                        if aquatic <= 0.5:
                            return 1
                        else:
                            return 0
            else:
                return 1
 


-is learning complete? True
-it took 0.42040443420410156 seconds
correct:  11 out of  11 examples. Percentage:  1.0


## Learning on the Zoo dataset using SyGuS

In [11]:
from data.objects import zoo
from sygus_if import SyGuS_IF

dataObj = zoo.Zoo()
df = dataObj.get_df()

# Binarise the target: the chosen class becomes 1, every other class 0.
target_class = 1
n_classes = len(df[dataObj.target].unique())
_temp = {
    label: (1 if label == target_class else 0)
    for label in range(1, n_classes + 1)
}
df[dataObj.target] = df[dataObj.target].map(_temp)

# Learner first, then the feature matrix and the label list.
sgf = SyGuS_IF(
    feature_names=dataObj.attributes,
    feature_data_type=dataObj.attribute_type,
    function_return_type= "Bool",
)
X = df.drop([dataObj.target], axis=1)
y = df[dataObj.target].tolist()

# Fit on the whole dataset and show the synthesised function.
sgf.fit(X,y)
print(sgf._function_snippet)

# Score and time both prediction paths on the training data:
# first predict_z3, then predict.
for run_prediction in (lambda: sgf.predict_z3(X), lambda: sgf.predict(X, y)):
    start_ = time.time()
    y_pred_test = run_prediction()
    print("Accuracy:",metrics.accuracy_score(y, y_pred_test))
    print(time.time() - start_)

-number of samples: (before dropping nan rows) 101
-number of samples: (after dropping nan rows) 101
 milk
Accuracy: 1.0
1.5516462326049805
Accuracy: 1.0
1.3945815563201904
