In [15]:
%load_ext autoreload
%autoreload 2
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from lark import Lark
import time 
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# prepare dataset

In [17]:
from data.objects import zoo

dataObj = zoo.Zoo()
df = dataObj.get_df()

# Binarise the target: every label listed in `target_class` becomes the
# positive class (1), every other label becomes 0.
target_class = [1, 2, 3]
# Build the mapping from the labels actually present in the column rather than
# assuming they are exactly 1..n, so no label can silently map to NaN.
_temp = {
    label: int(label in target_class)
    for label in sorted(df[dataObj.target].unique().tolist())
}
print(_temp)
df[dataObj.target] = df[dataObj.target].map(_temp)


# Feature matrix (all columns except the target) and the binarised target.
X = df.drop([dataObj.target], axis=1)
y = df[dataObj.target]

# Hold out 10% of the rows for testing (90% training). The seed is fixed so
# that re-running the notebook reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)


df.head()

-number of samples: (before dropping nan rows) 101
-number of samples: (after dropping nan rows) 101
{1: 1, 2: 1, 3: 1, 4: 0, 5: 0, 6: 0, 7: 0}


Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,1,0,0,1,0,0,1,1,1,1,0,0,0.5,0,0,1,1
1,1,0,0,1,0,0,0,1,1,1,0,0,0.5,1,0,1,1
2,0,0,1,0,0,1,1,1,1,0,0,1,0.0,1,0,0,0
3,1,0,0,1,0,0,1,1,1,1,0,0,0.5,0,0,1,1
4,1,0,0,1,0,0,1,1,1,1,0,0,0.5,1,0,1,1


# train a black-box

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.metrics import roc_auc_score

# Candidate black-box models. Seeds are fixed so a fresh "Restart & Run All"
# yields the same fitted models.
clf_rf = RandomForestClassifier(n_estimators=100, random_state=1)
clf_mlp = MLPClassifier(random_state=1, max_iter=300)

# Fit each model exactly once.
clf_rf.fit(X_train, y_train)
clf_mlp.fit(X_train, y_train)


# Decision tree: choose max_depth in 3..9 with a cross-validated grid search.
param_grid = {'max_depth': np.arange(3, 10)}
grid_tree = GridSearchCV(tree.DecisionTreeClassifier(random_state=1), param_grid)
grid_tree.fit(X_train, y_train)
# ROC AUC of the tuned tree on the held-out set (computed but not printed here).
tree_preds = grid_tree.predict_proba(X_test)[:, 1]
tree_performance = roc_auc_score(y_test, tree_preds)
clf_dt = grid_tree.best_estimator_



print("Accuracy decision tree:",metrics.accuracy_score(y_test, clf_dt.predict(X_test)))
print("Accuracy random forest:",metrics.accuracy_score(y_test, clf_rf.predict(X_test)))
print("Accuracy neural netwrk:",metrics.accuracy_score(y_test, clf_mlp.predict(X_test)))




Accuracy decision tree: 1.0
Accuracy random forest: 1.0
Accuracy neural netwrk: 1.0


In [19]:
from sklearn.tree import _tree
def tree_to_code(tree, feature_names):
    """Print a fitted decision tree as nested Python-style ``if``/``else`` code.

    Parameters
    ----------
    tree : fitted tree estimator
        Must expose ``tree_`` (with ``feature``, ``threshold``,
        ``children_left``, ``children_right`` and ``value``). When it also
        exposes ``classes_`` (single-output classifier), leaves are printed
        with the actual class label instead of the positional argmax index.
    feature_names : sequence of str
        Feature names, indexed by the feature indices stored in the tree.

    Returns
    -------
    None
        The rendered code is written to stdout.
    """
    tree_ = tree.tree_
    # Leaf nodes carry the TREE_UNDEFINED sentinel instead of a feature index.
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    # The argmax over a leaf's value row is a *position*; translate it to the
    # estimator's class label when one is available. For multi-output trees
    # ``classes_`` is a list per output, so fall back to the raw index there.
    classes = getattr(tree, "classes_", None)
    if getattr(tree, "n_outputs_", 1) != 1:
        classes = None

    print("\nLearned tree -->\n")
    print("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            # Internal node: emit the split and descend into both children.
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            # Leaf node: emit the majority class.
            winner = int(np.argmax(tree_.value[node][0]))
            label = winner if classes is None else classes[winner]
            print("{}return {}".format(indent, label))

    # Start at the root, indented one level below the printed ``def`` line.
    recurse(0, 1)

    
# Render the tuned decision tree using the training column names.
tree_to_code(tree=clf_dt, feature_names=X_train.columns.tolist())


Learned tree -->

def tree(hair, feathers, eggs, milk, airborne, aquatic, predator, toothed, backbone, breathes, venomous, fins, legs, tail, domestic, catsize):
  if backbone <= 0.5:
    return 0
  else:  # if backbone > 0.5
    if breathes <= 0.5:
      if eggs <= 0.5:
        return 1
      else:  # if eggs > 0.5
        return 0
    else:  # if breathes > 0.5
      if tail <= 0.5:
        if eggs <= 0.5:
          return 1
        else:  # if eggs > 0.5
          return 0
      else:  # if tail > 0.5
        if aquatic <= 0.5:
          return 1
        else:  # if aquatic > 0.5
          if legs <= 0.375:
            return 1
          else:  # if legs > 0.375
            if hair <= 0.5:
              return 0
            else:  # if hair > 0.5
              return 1


# create a random generator

In [20]:

import random
def random_generator(X):
    """Return one random example as a list with a 0/1 entry per column of ``X``.

    Only ``X.columns`` is read (to determine how many values to draw); each
    value is drawn independently with ``random.randint(0, 1)``.
    """
    return [random.randint(0, 1) for _ in range(len(X.columns))]

# Compare the three models on a handful of random 0/1 examples.
for _ in range(5):
    x = random_generator(X)
    # The models were fitted on a DataFrame, so predict on a one-row frame
    # carrying the same column names; passing a bare list makes recent
    # scikit-learn versions warn about missing feature names.
    x_frame = pd.DataFrame([x], columns=X.columns)
    print()
    print(x)
    print("random forest :", clf_rf.predict(x_frame))
    print("neural network:", clf_mlp.predict(x_frame))
    print("decision tree :", clf_dt.predict(x_frame))


[0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1]
random forest : [1]
neural network: [1]
decision tree : [0]

[0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1]
random forest : [0]
neural network: [1]
decision tree : [0]

[1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0]
random forest : [1]
neural network: [1]
decision tree : [1]

[0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1]
random forest : [1]
neural network: [1]
decision tree : [0]

[1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0]
random forest : [0]
neural network: [1]
decision tree : [0]


# implementation of the query

In [21]:
# Show the learned tree again so it can be read alongside the query below.
tree_to_code(tree=clf_dt, feature_names=X_train.columns.tolist())


Learned tree -->

def tree(hair, feathers, eggs, milk, airborne, aquatic, predator, toothed, backbone, breathes, venomous, fins, legs, tail, domestic, catsize):
  if backbone <= 0.5:
    return 0
  else:  # if backbone > 0.5
    if breathes <= 0.5:
      if eggs <= 0.5:
        return 1
      else:  # if eggs > 0.5
        return 0
    else:  # if breathes > 0.5
      if tail <= 0.5:
        if eggs <= 0.5:
          return 1
        else:  # if eggs > 0.5
          return 0
      else:  # if tail > 0.5
        if aquatic <= 0.5:
          return 1
        else:  # if aquatic > 0.5
          if legs <= 0.375:
            return 1
          else:  # if legs > 0.375
            if hair <= 0.5:
              return 0
            else:  # if hair > 0.5
              return 1


In [26]:
import query
import numpy as np

# Column name -> positional index in an example row (training column order).
feature_to_index = {
    feature: position
    for position, feature in enumerate(X_train.columns.tolist())
}

def predict_function_query(x):
    """Query predicate: True exactly when the example's ``tail`` value equals 0.

    ``x`` is indexed positionally; the position of ``tail`` is looked up in the
    module-level ``feature_to_index`` mapping.
    """
    tail_position = feature_to_index['tail']
    return bool(x[tail_position] == 0)

from blackbox import BlackBox

# Wrap the hand-written predicate as a query object; no underlying model is
# supplied, only the prediction function.
q = query.Query(model=None, prediction_function=predict_function_query)

# Wrap the tuned decision tree together with its predict method as the
# black box to be explained.
bb = BlackBox(clf_dt, clf_dt.predict)

# implementation of teacher

In [28]:
from teacher import Teacher
from learner import Learner
from sygus_if import SyGuS_IF

# Learner backed by a SyGuS model over the dataset's attributes; it starts
# with empty training data (X=[] / y=[]).
sgf = SyGuS_IF(
    feature_names=dataObj.attributes,
    feature_data_type=dataObj.attribute_type,
    function_return_type="Bool",
)
l = Learner(
    model=sgf,
    prediction_function=sgf.predict_z3,
    train_function=sgf.fit,
    X=[],
    y=[],
)

# NOTE(review): epsilon/delta look like PAC-style error/confidence bounds --
# confirm against teacher.py.
t = Teacher(max_iterations=15, epsilon=0.05, delta=0.05)
new_l, flag = t.teach(
    blackbox=bb,
    learner=l,
    query=q,
    random_example_generator=random_generator,
    params_generator=X_train,
)


# Report the learned formula and the flag returned by teach().
print("\n\n--------------------------------%%%%%%-------------------------------")
print("Learned explanation =>", new_l.model._function_snippet)
print("Is learning complete?", flag)




# Count held-out examples on which the learner's verdict equals the
# combination (black-box verdict AND query verdict).
cnt = 0
for example in X_test.values.tolist():

    # All three verdicts are requested for every example.
    blackbox_verdict = bb.classify_example(example)
    learner_verdict = new_l.classify_example(example)
    query_verdict = q.classify_example(example)

    # NOTE(review): `and` returns one of its operands, so this comparison
    # assumes the verdicts are plain booleans -- confirm in blackbox/query.
    if(learner_verdict == (blackbox_verdict and query_verdict)):
        cnt += 1
print("correct: ", cnt, "out of ", len(y_test), "examples. Percentage: ", cnt/len(y_test))




-iteration: 1
-could not find True counterexample. Only found False

-iteration: 2
-found True counterexample

-iteration: 3
-found False counterexample

-iteration: 4
-found True counterexample

-iteration: 5
-found False counterexample

-iteration: 6
-found True counterexample

-iteration: 7
-found False counterexample

-iteration: 8
-could not find True counterexample. Only found False

-iteration: 9
-found True counterexample

-iteration: 10
-found False counterexample

-iteration: 11
-could not find True counterexample. Only found False

-iteration: 12
-could not find True counterexample. Only found False

-iteration: 13
-no counterexample returned

Learning complete
-total examples checked: 1011


--------------------------------%%%%%%-------------------------------
Learned explanation =>  (and backbone (not (or eggs tail)))
Is learning complete? True
correct:  11 out of  11 examples. Percentage:  1.0


## Learning on the Zoo dataset using SyGuS

In [15]:
from data.objects import zoo
from sygus_if import SyGuS_IF

dataObj = zoo.Zoo()
df = dataObj.get_df()

# Binarise the target: the single label `target_class` becomes 1, every other
# label becomes 0.
target_class = 1
# Build the mapping from the labels actually present in the column rather than
# assuming they are exactly 1..n, so no label can silently map to NaN.
_temp = {
    label: int(label == target_class)
    for label in sorted(df[dataObj.target].unique().tolist())
}
df[dataObj.target] = df[dataObj.target].map(_temp)

# Features (every column except the target) and the target as a plain list.
X = df.drop([dataObj.target], axis=1)
y = df[dataObj.target].tolist()

# SyGuS-based model over the dataset's attributes, returning a Bool.
sgf = SyGuS_IF(
    feature_names=dataObj.attributes,
    feature_data_type=dataObj.attribute_type,
    function_return_type="Bool",
)

# Fit on the full dataset and show the synthesised function.
sgf.fit(X, y)
print(sgf._function_snippet)


def _report_accuracy_and_time(predict, *predict_args):
    """Run ``predict(*predict_args)``; print its accuracy against ``y`` and the elapsed seconds.

    The elapsed time covers the prediction call plus the accuracy computation
    and its print. Returns the predictions.
    """
    # perf_counter is a monotonic, high-resolution clock meant for durations.
    started = time.perf_counter()
    predictions = predict(*predict_args)
    print("Accuracy:",metrics.accuracy_score(y, predictions))
    print(time.perf_counter() - started)
    return predictions


# NOTE: X / y are the same data that sgf.fit was called with in this cell, so
# both figures are accuracies on the fitting data, not on held-out data.
y_pred_test = _report_accuracy_and_time(sgf.predict_z3, X)
y_pred_test = _report_accuracy_and_time(sgf.predict, X, y)

-number of samples: (before dropping nan rows) 101
-number of samples: (after dropping nan rows) 101
 milk
Accuracy: 1.0
1.7959465980529785
Accuracy: 1.0
2.0942163467407227
