In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE

from pyexplainer.pyexplainer_pyexplainer import PyExplainer
import matplotlib.pyplot as plt

import os, pickle, time
# from datetime import datetime

from my_util import *

In [2]:
data_path = './dataset/'
result_dir = './eval_result/'
dump_dataframe_dir = './dump_df/'
pyExp_dir = './pyExplainer_obj/'
other_object_dir = './other_object/'
proj_name = 'openstack' # ['openstack','qt']



if not os.path.exists(result_dir):
    os.makedirs(result_dir)
    
if not os.path.exists(dump_dataframe_dir):
    os.makedirs(dump_dataframe_dir)
    
if not os.path.exists(pyExp_dir):
    os.makedirs(pyExp_dir)
    
if not os.path.exists(other_object_dir):
    os.makedirs(other_object_dir)

## Prepare data

In [3]:
x_train, x_test, y_train, y_test = prepare_data(proj_name, mode = 'all')

In [4]:
# # print(y_train)
# for col in x_test.columns:
#     print(col,len(x_test[col].unique()))

col = list(x_test.columns)
print(col)
print(col.index('self'))

['la', 'ld', 'nf', 'nd', 'ns', 'ent', 'nrev', 'rtime', 'hcmt', 'self', 'ndev', 'age', 'nuc', 'app', 'aexp', 'rexp', 'arexp', 'rrexp', 'asexp', 'rsexp', 'asawr', 'rsawr']
9


## Train global model

In [5]:
smt = SMOTE(k_neighbors=5, random_state=42, n_jobs=24)
# enn = EditedNearestNeighbours(n_neighbors=5, n_jobs=24)
# smt_tmk = SMOTETomek(smote = smt, random_state=0)
# smt_enn = SMOTEENN(smote=smt, enn=enn, random_state=0)

new_x_train, new_y_train = smt.fit_resample(x_train, y_train)

def train_global_model(x_train,y_train):
    global_model = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=24)
    global_model.fit(x_train, y_train)

    pickle.dump(global_model, open(proj_name+'_global_model.pkl','wb'))
    print('train global model finished')
    
train_black_box = True

if train_black_box:
    train_global_model(new_x_train, new_y_train)

train global model finished


## Obtain correctly predicted defective commits

In [6]:
global_model = pickle.load(open(proj_name+'_global_model.pkl','rb'))
# x_test, y_test = prepare_data(proj_name, mode = 'test')

pred = global_model.predict(x_test)
defective_prob = global_model.predict_proba(x_test)[:,1]

prediction_df = x_test.copy()
prediction_df['pred'] = pred
prediction_df['defective_prob'] = defective_prob
prediction_df['defect'] = y_test

correctly_predict_df = prediction_df[(prediction_df['pred']==1) & (prediction_df['defect']==1)]

In [7]:
print(len(correctly_predict_df))
prediction_df.to_csv(dump_dataframe_dir+proj_name+'_prediction_result.csv')
correctly_predict_df.to_csv(dump_dataframe_dir+proj_name+'_correctly_predict_as_defective.csv')


228


## Training local model

Note: this step includes instance generation

In [8]:
global_model = pickle.load(open(proj_name+'_global_model.pkl','rb'))

load_prediction_from_file = True
class_label = ['clean', 'defect']

if load_prediction_from_file:
    correctly_predict_df = pd.read_csv(dump_dataframe_dir+proj_name+'_correctly_predict_as_defective.csv')
    correctly_predict_df = correctly_predict_df.set_index('Unnamed: 0')
    
dep = 'defect'
indep = correctly_predict_df.columns[:22]

print(correctly_predict_df.columns)
print(len(correctly_predict_df.columns))
print(indep)

Index(['la', 'ld', 'nf', 'nd', 'ns', 'ent', 'nrev', 'rtime', 'hcmt', 'self',
       'ndev', 'age', 'nuc', 'app', 'aexp', 'rexp', 'arexp', 'rrexp', 'asexp',
       'rsexp', 'asawr', 'rsawr', 'pred', 'defective_prob', 'defect'],
      dtype='object')
25
Index(['la', 'ld', 'nf', 'nd', 'ns', 'ent', 'nrev', 'rtime', 'hcmt', 'self',
       'ndev', 'age', 'nuc', 'app', 'aexp', 'rexp', 'arexp', 'rrexp', 'asexp',
       'rsexp', 'asawr', 'rsawr'],
      dtype='object')


In [9]:
# print(correctly_predict_df)


In [10]:
pyExp = PyExplainer(x_train,
            y_train,
            indep,
            dep,
            class_label,
            blackbox_model = global_model,
            categorical_features = ['self'])

In [11]:
feature_df = correctly_predict_df.loc[:, indep]
test_label = correctly_predict_df.loc[:, dep]
problem_index = [] # store index that cannot build pyExplainer

In [12]:
def create_pyExplainer_obj(search_function, feature_df, test_label, explainer='LRR'):
    problem_index = []
    time_spent = []
    
    for i in range(0,len(feature_df)):
        X_explain = feature_df.iloc[[i]]
        y_explain = test_label.iloc[[i]]

        row_index = str(X_explain.index[0])

        start = time.time()
        try:
            pyExp_obj = pyExp.explain(X_explain,
                                       y_explain,
                                       search_function = search_function, 
                                       top_k = 1000,
                                       max_rules=2000, 
                                       max_iter =None, 
                                       cv=5,
                                       explainer=explainer,
                                       debug = False)
            synt_pred = pyExp_obj['synthetic_predictions']
            
            print('{}: found {} defect from total {}'.format(row_index, str(np.sum(synt_pred)), 
                                                         str(len(synt_pred))))
            pickle.dump(pyExp_obj, open(pyExp_dir+proj_name+'_'+explainer+'_'+search_function+'_'+row_index+'.pkl','wb'))
        
        except Exception as e:
            problem_index.append(row_index)
            print('-'*100)
            print(e)
            print('found total {} problematic commit'.format(str(len(problem_index))))
            print('-'*100)
            
#         break

        end = time.time()

        time_spent.append(str(end-start))
#     print(row_index)
#     break
    
    
    return time_spent, problem_index

In [25]:
# time_spent_rand, problem_index_rand = create_pyExplainer_obj('randompertubation', feature_df, test_label,'LRR')
# pickle.dump(time_spent_rand, open(other_object_dir+proj_name+'_train_time_LRR_randompertubation.pkl','wb'))
# pickle.dump(problem_index_rand, open(other_object_dir+proj_name+'_problem_index_LRR_randompertubation.pkl','wb'))

# time_spent_rand, problem_index_rand = create_pyExplainer_obj('randompertubation', feature_df, test_label,'rulefit')
# pickle.dump(time_spent_rand, open(other_object_dir+proj_name+'_train_time_rulefit_randompertubation.pkl','wb'))
# pickle.dump(problem_index_rand, open(other_object_dir+proj_name+'_problem_index_rulefit_randompertubation.pkl','wb'))

9703: found 63 defect from total 1000
23348: found 31 defect from total 1000
25135: found 175 defect from total 1000
15814: found 207 defect from total 1000
----------------------------------------------------------------------------------------------------
y contains 1 class after sample_weight trimmed classes with zero weights, while a minimum of 2 classes are required.
found total 1 problematic commit
----------------------------------------------------------------------------------------------------
1650: found 208 defect from total 1000
14991: found 104 defect from total 1000
19600: found 254 defect from total 1000




868: found 1 defect from total 1000
----------------------------------------------------------------------------------------------------
y contains 1 class after sample_weight trimmed classes with zero weights, while a minimum of 2 classes are required.
found total 2 problematic commit
----------------------------------------------------------------------------------------------------
4619: found 153 defect from total 1000
16345: found 56 defect from total 1000
3339: found 199 defect from total 1000
3867: found 107 defect from total 1000
----------------------------------------------------------------------------------------------------
y contains 1 class after sample_weight trimmed classes with zero weights, while a minimum of 2 classes are required.
found total 3 problematic commit
----------------------------------------------------------------------------------------------------
25262: found 154 defect from total 1000
17550: found 211 defect from total 1000
18685: found 185 defect 



8188: found 1 defect from total 1000
11408: found 550 defect from total 1000
2536: found 423 defect from total 1000
----------------------------------------------------------------------------------------------------
y contains 1 class after sample_weight trimmed classes with zero weights, while a minimum of 2 classes are required.
found total 16 problematic commit
----------------------------------------------------------------------------------------------------
18365: found 294 defect from total 1000
19921: found 49 defect from total 1000
12346: found 402 defect from total 1000
15155: found 125 defect from total 1000
4711: found 369 defect from total 1000
11758: found 369 defect from total 1000
3151: found 185 defect from total 1000
25670: found 104 defect from total 1000
6246: found 331 defect from total 1000
11057: found 154 defect from total 1000
----------------------------------------------------------------------------------------------------
y contains 1 class after sample_we

20091: found 294 defect from total 1000
----------------------------------------------------------------------------------------------------
y contains 1 class after sample_weight trimmed classes with zero weights, while a minimum of 2 classes are required.
found total 30 problematic commit
----------------------------------------------------------------------------------------------------
4053: found 420 defect from total 1000
24864: found 173 defect from total 1000
17036: found 421 defect from total 1000
15545: found 5 defect from total 1000
16203: found 161 defect from total 1000
----------------------------------------------------------------------------------------------------
y contains 1 class after sample_weight trimmed classes with zero weights, while a minimum of 2 classes are required.
found total 31 problematic commit
----------------------------------------------------------------------------------------------------
---------------------------------------------------------

In [None]:
# time_spent_ci, problem_index_ci = create_pyExplainer_obj('crossoverinterpolation', feature_df, test_label)
# pickle.dump(time_spent_ci, open(other_object_dir+proj_name+'_train_time_LRR_crossoverinterpolation.pkl','wb'))
# pickle.dump(problem_index_ci, open(other_object_dir+proj_name+'_problem_index_LRR_crossoverinterpolation.pkl','wb'))

# time_spent_ci, problem_index_ci = create_pyExplainer_obj('crossoverinterpolation', feature_df, test_label,'rulefit')
# pickle.dump(time_spent_ci, open(other_object_dir+proj_name+'_train_time_rulefit_crossoverinterpolation.pkl','wb'))
# pickle.dump(problem_index_ci, open(other_object_dir+proj_name+'_problem_index_rulefit_crossoverinterpolation.pkl','wb'))

9703: found 1419 defect from total 2117
23348: found 1129 defect from total 2240
25135: found 1179 defect from total 2240
15814: found 1182 defect from total 2137
1826: found 1469 defect from total 2156
1650: found 1265 defect from total 2240
14991: found 1183 defect from total 2234
19600: found 1343 defect from total 2163
868: found 1153 defect from total 2200
18392: found 1328 defect from total 2101
4619: found 1252 defect from total 2153


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


16345: found 1135 defect from total 2240
3339: found 1378 defect from total 2196
3867: found 1194 defect from total 2240
361: found 1342 defect from total 2160
25262: found 1205 defect from total 2197
17550: found 1140 defect from total 2240
18685: found 1399 defect from total 2132
10085: found 1279 defect from total 2208
3138: found 919 defect from total 2238
24429: found 1501 defect from total 2111
22437: found 1209 defect from total 2240
23962: found 1529 defect from total 2131
3948: found 1533 defect from total 2129
16194: found 1239 defect from total 2155
22153: found 1547 defect from total 2106
9607: found 1393 defect from total 2109
16968: found 1161 defect from total 2240
15059: found 1176 defect from total 2240
19968: found 1455 defect from total 2101
12374: found 1223 defect from total 2240
5745: found 1277 defect from total 2223
2428: found 1471 defect from total 2113
3149: found 1282 defect from total 2240
17073: found 931 defect from total 2240
23154: found 1085 defect fro

## Just for testing

In [16]:
# display(feature_df)
# for c in feature_df:
#     print(c)

# for k in range(0,1):
#     print(k)

In [17]:
# test feature binarizer
# from pyexplainer.features import *
# fb = FeatureBinarizer(negations=True)
# fb.fit(x_train)
# display(fb.transform(feature_df))

In [18]:
# search_function='randompertubation'
# i = 3

# X_explain = feature_df.iloc[[i]]
# y_explain = test_label.iloc[[i]]

# row_index = str(X_explain.index[0])

# start = time.time()

# pyExp_obj = pyExp.explain(X_explain,
#                            y_explain,
#                            search_function = search_function, 
#                            top_k = 1000,
#                            max_rules=2000, 
#                            max_iter =None, 
#                            cv=5,
#                            explainer='rulefit',
#                            debug = False)

In [19]:
print(pyExp_obj.keys())

dict_keys(['synthetic_data', 'synthetic_predictions', 'X_explain', 'y_explain', 'indep', 'dep', 'local_model', 'top_k_positive_rules', 'top_k_negative_rules'])


In [24]:
# display(pyExp_obj['synthetic_data_fb'])

In [23]:
# local_model = pyExp_obj['local_model']
# print(local_model.predict(X_explain.values))
# print('------------------Explanation from local model-------------------------')
# print(local_model.explain())

[False]


In [11]:
# display(pyExp_obj['X_explain'])

In [12]:
# print(local_model.predict(pyExp_obj['X_explain_fb']))

### sample code for RQ1

In [26]:
# synthetic_instances = pyExp_obj['synthetic_data']
# sample_instance = pyExp_obj['X_explain']

In [23]:
# def agg_list(val):
#     return np.mean(val), np.median(val), np.max(val)

# cos_sim = cosine_similarity(sample_instance.values.reshape(1,-1), synthetic_instances.values)[0]
# euclid_dist = euclidean_distances(sample_instance.values.reshape(1,-1), synthetic_instances.values)[0]

### sample code for RQ2

In [35]:
# '''get prediction from global model then compare with result obtained from model (but how to compare??)'''

# local_model = pyExp_obj['local_model']
# '''In case pyExp uses logistic rule regression'''
# fb_sample_instance = pyExp.feature_binarizer.transform(sample_instance)
# # print(pyExp.feature_binarizer.transform(sample_instance))
# local_pred = local_model.predict(fb_sample_instance)
# local_pred_prob = local_model.predict_proba(fb_sample_instance)

# '''In case pyExp uses RuleFit'''
# # local_pred = local_model.predict(sample_instance)
# # local_pred_prob = local_model.predict_proba(sample_instance)

# print(local_pred, local_pred_prob)

[False] [0.06562197]


In [13]:
# test rulefit
search_function='randompertubation'
print('------------------Prediction from local model-------------------------')
for i in [3,5,7,20,50,100,83,25,163,127]:
    X_explain = feature_df.iloc[[i]]
    y_explain = test_label.iloc[[i]]

    row_index = str(X_explain.index[0])

    start = time.time()

    try:
        pyExp_obj = pyExp.explain(X_explain,
                                   y_explain,
                                   search_function = search_function, 
                                   top_k = 1000,
                                   max_rules=2000, 
                                   max_iter =None, 
                                   cv=5,
                                   explainer='LRR',
                                   debug = False)
        end = time.time()
#         print('time spent to train LRR:',str(end-start),'secs')
        
        local_model = pyExp_obj['local_model']
        print(local_model.explain())
        
#         print(global_model.predict_proba(X_explain)[:,1], local_model.predict_proba(pyExp.scaler.transform(X_explain.values))[:,1])
#         print(local_model.explain())
        print('-'*100)
    except:
        print('-'*100)
        print('there is only 1 class in the generated instances')
        print('-'*100)


------------------Prediction from local model-------------------------




           rule coefficient
0   (intercept)    0.926494
1  nrev <= 1.00  -11.265911
2    nd <= 1.00   -3.340535
----------------------------------------------------------------------------------------------------




            rule coefficient
0    (intercept)   -4.822969
1   nrev <= 1.00  -21.790036
2    ent <= 0.90    11.04842
3  asawr <= 0.16    2.480303
4    nuc <= 3.00    2.480303
----------------------------------------------------------------------------------------------------




           rule coefficient
0   (intercept)   -9.127939
1  nrev <= 2.00   12.185013
2  nrev <= 1.00   -7.495421
3   ent <= 0.63   -7.495421
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
there is only 1 class in the generated instances
----------------------------------------------------------------------------------------------------




            rule coefficient
0    (intercept)    0.721183
1   nrev <= 1.00   -8.482928
2  asawr <= 0.05   -7.399822
----------------------------------------------------------------------------------------------------
          rule coefficient
0  (intercept)   -3.748032
----------------------------------------------------------------------------------------------------




           rule coefficient
0   (intercept)   -9.149672
1  nrev <= 2.00   14.773528
2  nrev <= 1.00    -9.91744
3   ent <= 0.00   -8.247625
----------------------------------------------------------------------------------------------------




            rule coefficient
0    (intercept)  -10.434791
1   nrev <= 1.00  -15.862358
2  asawr <= 0.09   15.725896
----------------------------------------------------------------------------------------------------
          rule coefficient
0  (intercept)   -2.933625
----------------------------------------------------------------------------------------------------
            rule coefficient
0    (intercept)    1.111113
1     ld <= 0.00  -11.000013
2   nrev <= 1.00   -2.505401
3  asawr <= 0.00   -0.249322
4  asawr <= 0.00   -0.249322
----------------------------------------------------------------------------------------------------




In [23]:
# test rulefit
search_function='crossoverinterpolation'
print('------------------Prediction from local model-------------------------')
for i in [3,5,7,20,50,100,83,25,163,127]:
    X_explain = feature_df.iloc[[i]]
    y_explain = test_label.iloc[[i]]

    row_index = str(X_explain.index[0])

    start = time.time()

    start = time.time()
    try:
        pyExp_obj = pyExp.explain(X_explain,
                                   y_explain,
                                   search_function = search_function, 
                                   top_k = 1000,
                                   max_rules=2000, 
                                   max_iter =None, 
                                   cv=5,
                                   explainer='rulefit',
                                   debug = False)
        end = time.time()
#         print('time spent to train LRR:',str(end-start),'secs')
        
        local_model = pyExp_obj['local_model']
        
        print(global_model.predict_proba(X_explain)[:,1], local_model.predict_proba(X_explain.values)[:,1])
#         print(local_model.explain())
        print('-'*100)
    except:
        print('-'*100)
        print('there is only 1 class in the generated instances')
        print('-'*100)


------------------Prediction from local model-------------------------
[0.84] [0.96695341]
----------------------------------------------------------------------------------------------------
[0.73] [0.49372837]
----------------------------------------------------------------------------------------------------
[0.83] [0.99548978]
----------------------------------------------------------------------------------------------------
[0.67] [0.93530316]
----------------------------------------------------------------------------------------------------
[0.6] [0.94639136]
----------------------------------------------------------------------------------------------------
[0.66] [0.94685406]
----------------------------------------------------------------------------------------------------
[0.62] [0.77372941]
----------------------------------------------------------------------------------------------------
[0.75] [0.91593334]
---------------------------------------------------------------

In [14]:
import warnings
warnings.simplefilter("ignore")

In [30]:
# sample LRR when used with random perturbation

search_function='randompertubation'

for i in [3,5,7,20,50,100,83,25,163,127]:
    X_explain = feature_df.iloc[[i]]
    y_explain = test_label.iloc[[i]]

    row_index = str(X_explain.index[0])

    start = time.time()

    start = time.time()
    try:
        pyExp_obj = pyExp.explain(X_explain,
                                   y_explain,
                                   search_function = search_function, 
                                   top_k = 1000,
                                   max_rules=2000, 
                                   max_iter =None, 
                                   cv=5,
                                   explainer='LRR',
                                   debug = False)
        end = time.time()
        print('time spent to train LRR:',str(end-start),'secs')

#         local_model = pyExp_obj['local_model']
#         print('------------------Explanation from local model-------------------------')
#         print(local_model.explain())
        print('-'*100)
    except:
        print('-'*100)
        print('there is only 1 class in the generated instances')
        print('-'*100)
    

time spent to train LRR: 1.4275202751159668 secs
----------------------------------------------------------------------------------------------------
time spent to train LRR: 1.738269329071045 secs
----------------------------------------------------------------------------------------------------
time spent to train LRR: 1.0873517990112305 secs
----------------------------------------------------------------------------------------------------
time spent to train LRR: 0.9002327919006348 secs
----------------------------------------------------------------------------------------------------
time spent to train LRR: 1.3496229648590088 secs
----------------------------------------------------------------------------------------------------


IndexError: positional indexers are out-of-bounds

In [31]:
# sample LRR when used with crossover interpolation

search_function='crossoverinterpolation'

for i in [3,5,7,20,50,100,83,25,163,127]:
    X_explain = feature_df.iloc[[i]]
    y_explain = test_label.iloc[[i]]

    row_index = str(X_explain.index[0])

    start = time.time()

    start = time.time()
    pyExp_obj = pyExp.explain(X_explain,
                               y_explain,
                               search_function = search_function, 
                               top_k = 1000,
                               max_rules=2000, 
                               max_iter =None, 
                               cv=5,
                               explainer='LRR',
                               debug = False)
    end = time.time()
    print('time spent to train LRR:',str(end-start),'secs')
    
#     local_model = pyExp_obj['local_model']
#     print('------------------Explanation from local model-------------------------')
#     print(local_model.explain())
    print('-'*100)
    
    

time spent to train LRR: 28.87514853477478 secs
----------------------------------------------------------------------------------------------------
time spent to train LRR: 26.213839769363403 secs
----------------------------------------------------------------------------------------------------
time spent to train LRR: 25.021041870117188 secs
----------------------------------------------------------------------------------------------------
time spent to train LRR: 29.421844244003296 secs
----------------------------------------------------------------------------------------------------
time spent to train LRR: 26.97343945503235 secs
----------------------------------------------------------------------------------------------------


IndexError: positional indexers are out-of-bounds

In [19]:
print(pyExp_obj.keys())

dict_keys(['synthetic_data', 'synthetic_predictions', 'X_explain', 'y_explain', 'X_explain_fb', 'indep', 'dep', 'local_model'])


In [20]:
# display(pyExp_obj['synthetic_data'].columns)
# print(feature_df.index)
local_model = pyExp_obj['local_model']

In [21]:
print(local_model.explain( maxCoeffs=None))
# print(local_model.z)

          rule coefficient
0  (intercept)     1.21719
1  la <= 44.00    -2.10262
2  la <= 76.00   -0.561556


In [22]:
generated_instance = pyExp_obj['synthetic_data']
print(generated_instance.columns)
display(generated_instance)

MultiIndex([(   'la', '<=',                 1.0),
            (   'la', '<=',                 2.0),
            (   'la', '<=',                 5.0),
            (   'la', '<=',                 9.0),
            (   'la', '<=',                17.0),
            (   'la', '<=',                27.0),
            (   'la', '<=',                44.0),
            (   'la', '<=',                76.0),
            (   'la', '<=',               163.0),
            (   'ld', '<=',                 0.0),
            ...
            ('asawr', '<=',   0.433364602876798),
            ('rsawr', '<=', 0.18055330452007923),
            ('rsawr', '<=',  0.2564102564102564),
            ('rsawr', '<=',  0.3196254791765793),
            ('rsawr', '<=',  0.3754889178617992),
            ('rsawr', '<=',  0.4287529047714299),
            ('rsawr', '<=',  0.4816326530612245),
            ('rsawr', '<=',  0.5758975125536251),
            ('rsawr', '<=',  0.7078384798099763),
            ('rsawr', '<=',  0.848

feature,la,la,la,la,la,la,la,la,la,ld,...,asawr,rsawr,rsawr,rsawr,rsawr,rsawr,rsawr,rsawr,rsawr,rsawr
operation,<=,<=,<=,<=,<=,<=,<=,<=,<=,<=,...,<=,<=,<=,<=,<=,<=,<=,<=,<=,<=
value,1.000000,2.000000,5.000000,9.000000,17.000000,27.000000,44.000000,76.000000,163.000000,0.000000,...,0.433365,0.180553,0.256410,0.319625,0.375489,0.428753,0.481633,0.575898,0.707838,0.848712
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
1,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,1,1
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,1
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
4,0,0,0,0,0,1,1,1,1,0,...,1,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2116,1,1,1,1,1,1,1,1,1,0,...,1,0,0,0,0,0,1,1,1,1
2117,0,0,0,0,0,0,0,0,1,0,...,1,1,1,1,1,1,1,1,1,1
2118,0,0,0,0,0,0,1,1,1,1,...,1,0,0,0,0,0,0,0,1,1
2119,1,1,1,1,1,1,1,1,1,0,...,1,0,0,0,0,0,0,0,0,1


In [41]:
print(generated_instance.loc[:, (generated_instance.columns.get_level_values(0)=='la') & 
                            (generated_instance.columns.get_level_values(1)=='<=') &
                            (generated_instance.columns.get_level_values(2)==44.0)  ])

feature     la
operation   <=
value     44.0
0            0
1            0
2            0
3            0
4            1
...        ...
2116         1
2117         0
2118         1
2119         1
2120         0

[2121 rows x 1 columns]


In [23]:
# # search_function = 'crossoverinterpolation' # 'randompertubation' or 'crossoverinterpolation'
# search_functions = ['randompertubation', 'crossoverinterpolation']

# for i in range(0,len(feature_df)):
#     X_explain = feature_df.iloc[[i]]
#     y_explain = test_label.iloc[[i]]
    
#     row_index = str(X_explain.index[0])
    
#     try:
#         pyExp_obj = pyExp.explain(X_explain,
#                                    y_explain,
#                                    search_function = search_function, 
#                                    top_k = 1000, 
#                                    max_rules=2000, 
#                                    max_iter =None, 
#                                    cv=5,
#                                    debug = False)
#         pickle.dump(pyExp_obj, open(pyExp_dir+search_function+'_'+row_index+'.pkl','wb'))
        
#         synt_pred = pyExp_obj['synthetic_predictions']
#         print('{}: found {} defect from total {}'.format(row_index, str(np.sum(synt_pred)), 
#                                                          str(len(synt_pred))))
# #         print('finished', row_index)
#     except:
#         problem_index.append(row_index)
# #     print(row_index)
#     break

In [24]:
# explain_index = 13
# X_explain = feature_df.iloc[[explain_index]]
# X_explain

In [25]:
# y_explain = test_label.iloc[[explain_index]]
# y_explain

In [26]:
# search_function = 'crossoverinterpolation' # 'randompertubation' or 'crossoverinterpolation''
# start = time.time()
# create_pyExp_rule_obj = pyExp.explain(X_explain,
#                                y_explain,
#                                search_function = search_function, 
#                                top_k = 1000, 
#                                max_rules=2000, 
#                                max_iter =None, 
#                                cv=5,
#                                debug = False)

# end = time.time()

In [27]:
# print('time spent {}'.format(str(end-start)))
# pickle.dump(create_pyExp_rule_obj, open(pyExp_dir+search_function+'_'+str(explain_index)+'.pkl','wb'))

In [28]:
# display(create_pyExp_rule_obj['synthetic_data'])

In [29]:
# # print(create_pyExp_rule_obj['synthetic_predictions'])
# # print(np.sum(create_pyExp_rule_obj['synthetic_predictions']))
# display(create_pyExp_rule_obj.keys())
# print(create_pyExp_rule_obj['synthetic_predictions'])