In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE

from pyexplainer.pyexplainer_pyexplainer import PyExplainer
import matplotlib.pyplot as plt

import os, pickle, time
from datetime import datetime

 # import from files copied from AIX360 github

ModuleNotFoundError: No module named 'cvxpy'

In [19]:
# --- Paths and run configuration --------------------------------------------
# All locations are relative to the notebook's working directory.
data_path = './dataset/'              # input: <project>_metrics.csv files
result_dir = './eval_result/'
dump_dataframe_dir = './dump_df/'     # prediction tables written as CSV
pyExp_dir = './pyExplainer_obj/'      # one pickled explanation per explained commit
other_object_dir = './other_object/'  # timing lists and failed-row lists
proj_name = 'openstack' # ['openstack','qt']

# Create every output directory up front. exist_ok=True keeps the cell
# idempotent and avoids the check-then-create race of
# `if not os.path.exists(d): os.makedirs(d)`.
for _out_dir in (result_dir, dump_dataframe_dir, pyExp_dir, other_object_dir):
    os.makedirs(_out_dir, exist_ok=True)

In [20]:
def load_change_metrics_df(cur_proj, data_dir=None):
    """Load a project's change metrics and return (features, defect labels).

    Reads ``<data_dir>/<cur_proj>_metrics.csv``, keeps the commits whose
    ``author_date`` (epoch seconds) lies inside the project's study period,
    orders them chronologically and splits them into a feature frame and a
    boolean defect label.

    Parameters
    ----------
    cur_proj : str
        Either ``'qt'`` or ``'openstack'``.
    data_dir : str, optional
        Directory holding the CSV. Defaults to the module-level ``data_path``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Feature columns (bookkeeping columns removed, NaN replaced by 0) and
        the matching ``bugcount > 0`` labels. Both keep the original CSV row
        index and share the same chronological order.

    Raises
    ------
    ValueError
        If ``cur_proj`` is not a known project.
    """
    # Inclusive [start, end] study window per project, in epoch seconds.
    study_period = {
        'qt': (1308350292, 1395090476),
        'openstack': (1322599384, 1393590700),
    }
    if cur_proj not in study_period:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `start`.
        raise ValueError('unknown project {!r}; expected one of {}'.format(
            cur_proj, sorted(study_period)))
    start, end = study_period[cur_proj]

    if data_dir is None:
        data_dir = data_path

    change_metrics = pd.read_csv(os.path.join(data_dir, cur_proj + '_metrics.csv'))

    # .copy() so the column assignments below act on an independent frame
    # instead of a slice of the raw table (avoids SettingWithCopyWarning).
    in_period = change_metrics['author_date'].between(start, end)
    change_metrics = change_metrics[in_period].copy()

    # Encode the boolean 'self' flag as 0/1; anything not equal to True
    # (False, NaN, other text) becomes 0.
    change_metrics['self'] = change_metrics['self'].eq(True).astype('int64')
    change_metrics['defect'] = change_metrics['bugcount'] > 0

    # Sort on the raw timestamps: this is the chronological order and, unlike
    # a round trip through datetime.fromtimestamp, does not depend on the
    # machine's local time zone. A stable sort keeps ties in file order.
    change_metrics = change_metrics.sort_values(by='author_date', kind='stable')

    bug_label = change_metrics['defect']

    change_metrics = change_metrics.drop(
        ['author_date', 'commit_id', 'bugcount', 'fixcount', 'revd', 'tcmt',
         'oexp', 'orexp', 'osexp', 'osawr', 'defect'],
        axis=1)
    change_metrics = change_metrics.fillna(value=0)

    return change_metrics, bug_label

def split_train_test_data(feature_df, label, percent_split = 70):
    """Cut features and labels positionally into a leading and a trailing part.

    The first ``percent_split`` percent of the rows (rounded down) form the
    training part and the remaining rows the test part. Rows are not
    shuffled, so the incoming order decides which rows land where.

    Returns ``(x_train, x_test, y_train, y_test)``.
    """
    cut = int(len(feature_df) * (percent_split / 100))
    head, tail = slice(None, cut), slice(cut, None)

    return (feature_df.iloc[head], feature_df.iloc[tail],
            label.iloc[head], label.iloc[tail])

## Prepare data

In [21]:
def prepare_data(proj_name, mode = 'all'):
    """Load a project's change metrics and return the requested split.

    Parameters
    ----------
    proj_name : str
        Project passed on to ``load_change_metrics_df``.
    mode : {'train', 'test', 'all'}
        ``'train'`` returns ``(x_train, y_train)``, ``'test'`` returns
        ``(x_test, y_test)`` and ``'all'`` returns
        ``(x_train, x_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the accepted values. The earlier version
        printed a message and returned None, which only surfaced later as an
        unpacking error at the call site.
    """
    if mode not in ('train', 'test', 'all'):
        raise ValueError(
            'this function accepts "train","test","all" mode only (got {!r})'.format(mode))

    change_metrics, bug_label = load_change_metrics_df(proj_name) # ['openstack','qt']

    # First 70% of the rows for training, the rest for testing.
    x_train, x_test, y_train, y_test = split_train_test_data(change_metrics, bug_label, percent_split = 70)

    if mode == 'train':
        return x_train, y_train
    if mode == 'test':
        return x_test, y_test
    return x_train, x_test, y_train, y_test
    
# Build the 70/30 split once; x_train / x_test / y_train / y_test are the
# notebook-wide names that the later cells read.
x_train, x_test, y_train, y_test = prepare_data(proj_name, mode = 'all')

In [22]:
# Feature names in model-input order, followed by the position of the
# 'self' column within that order.
col = x_test.columns.tolist()
print(col)
print(col.index('self'))

['la', 'ld', 'nf', 'nd', 'ns', 'ent', 'nrev', 'rtime', 'hcmt', 'self', 'ndev', 'age', 'nuc', 'app', 'aexp', 'rexp', 'arexp', 'rrexp', 'asexp', 'rsexp', 'asawr', 'rsawr']
9


## Train global model

In [23]:
# Resample the training split with SMOTE (fixed seed for repeatability).
# Only x_train / y_train are resampled here; x_test / y_test are not touched.
# NOTE(review): newer imbalanced-learn releases appear to deprecate SMOTE's
# n_jobs argument — confirm against the installed version.
smt = SMOTE(k_neighbors=5, random_state=42, n_jobs=24)
# Alternative resamplers that were tried and left disabled:
# enn = EditedNearestNeighbours(n_neighbors=5, n_jobs=24)
# smt_tmk = SMOTETomek(smote = smt, random_state=0)
# smt_enn = SMOTEENN(smote=smt, enn=enn, random_state=0)

new_x_train, new_y_train = smt.fit_resample(x_train, y_train)

def train_global_model(x_train,y_train):
    """Fit the black-box random forest and pickle it to disk.

    The fitted model is written to ``<proj_name>_global_model.pkl`` in the
    current working directory (``proj_name`` is the notebook-level setting).
    Nothing is returned; later cells reload the model from that file.
    """
    global_model = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=24)
    global_model.fit(x_train, y_train)

    # Context manager so the file is flushed and closed even if pickling
    # fails; a bare open() inside pickle.dump() left the handle to the
    # garbage collector.
    with open(proj_name+'_global_model.pkl','wb') as model_file:
        pickle.dump(global_model, model_file)
    print('train global model finished')

# Set to False to skip retraining and reuse the pickle from a previous run.
train_black_box = True

if train_black_box:
    train_global_model(new_x_train, new_y_train)

train global model finished


## Obtain correctly predicted defective commits

In [24]:
# Reload the persisted model so this cell also works when training was
# skipped (train_black_box = False). The pickle is produced by this notebook;
# never unpickle a file from an untrusted source.
with open(proj_name+'_global_model.pkl','rb') as model_file:
    global_model = pickle.load(model_file)

pred = global_model.predict(x_test)
# Column 1 of predict_proba is the probability of the second class
# (the defective class for boolean labels).
defective_prob = global_model.predict_proba(x_test)[:,1]

# Test features plus the prediction, its probability and the true label.
prediction_df = x_test.copy()
prediction_df['pred'] = pred
prediction_df['defective_prob'] = defective_prob
prediction_df['defect'] = y_test  # aligned on the shared row index

# Commits that are defective and were predicted as defective.
correctly_predict_df = prediction_df[(prediction_df['pred']==1) & (prediction_df['defect']==1)]

In [25]:
# Number of correctly predicted defective commits, then persist both tables
# (row index included) so later cells can restart from the CSV files.
print(len(correctly_predict_df))
for _frame, _suffix in ((prediction_df, '_prediction_result.csv'),
                        (correctly_predict_df, '_correctly_predict_as_defective.csv')):
    _frame.to_csv(dump_dataframe_dir+proj_name+_suffix)


228


## Training local model

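Note: this step also generates the synthetic instances for every explained commit (each call to `pyExp.explain` returns them alongside the local model).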

In [26]:
# Reload the black-box model; the context manager closes the file handle
# that a bare open() inside pickle.load() used to leak.
with open(proj_name+'_global_model.pkl','rb') as model_file:
    global_model = pickle.load(model_file)

load_prediction_from_file = True
class_label = ['clean', 'defect']

if load_prediction_from_file:
    correctly_predict_df = pd.read_csv(dump_dataframe_dir+proj_name+'_correctly_predict_as_defective.csv')
    # to_csv wrote the row index as an unnamed first column; restore it so
    # rows keep their original commit positions as labels.
    correctly_predict_df = correctly_predict_df.set_index('Unnamed: 0')

dep = 'defect'
# Feature columns = everything except the three columns appended by the
# prediction cell. Equivalent to the former `columns[:22]`, but it does not
# silently break if the number of features changes.
indep = correctly_predict_df.columns.drop(['pred', 'defective_prob', dep])

print(correctly_predict_df.columns)
print(len(correctly_predict_df.columns))
print(indep)

Index(['la', 'ld', 'nf', 'nd', 'ns', 'ent', 'nrev', 'rtime', 'hcmt', 'self',
       'ndev', 'age', 'nuc', 'app', 'aexp', 'rexp', 'arexp', 'rrexp', 'asexp',
       'rsexp', 'asawr', 'rsawr', 'pred', 'defective_prob', 'defect'],
      dtype='object')
25
Index(['la', 'ld', 'nf', 'nd', 'ns', 'ent', 'nrev', 'rtime', 'hcmt', 'self',
       'ndev', 'age', 'nuc', 'app', 'aexp', 'rexp', 'arexp', 'rrexp', 'asexp',
       'rsexp', 'asawr', 'rsawr'],
      dtype='object')


In [27]:
# print(correctly_predict_df)


In [28]:
# Build the explainer around the black-box model. It is given the original
# training split (x_train / y_train), not the SMOTE-resampled
# new_x_train / new_y_train that the random forest was fitted on.
# 'self' is the only feature declared categorical.
# NOTE(review): argument order follows the positional call below; confirm it
# against the installed PyExplainer signature.
pyExp = PyExplainer(x_train,
            y_train,
            indep,
            dep,
            class_label,
            blackbox_model = global_model,
            categorical_features = ['self'])

In [29]:
# Inputs for the explanation loop: features and true labels of the correctly
# predicted defective commits.
feature_df = correctly_predict_df[indep]
test_label = correctly_predict_df[dep]
# Row labels for which no explainer could be built.
problem_index = list()

In [30]:
def create_pyExplainer_obj(search_function, feature_df, test_label, explainer='LRR'):
    """Explain every row of ``feature_df`` and pickle each explanation.

    For each row, ``pyExp.explain`` is called with the given
    ``search_function`` and ``explainer``. The returned object is written to
    ``<pyExp_dir><proj_name>_<explainer>_<search_function>_<row label>.pkl``.
    A row whose explanation (or pickling) raises is reported on stdout and
    recorded; processing continues with the next row.

    Relies on the notebook-level ``pyExp``, ``pyExp_dir`` and ``proj_name``.

    Parameters
    ----------
    search_function : str
        Instance-generation strategy forwarded to ``pyExp.explain``.
    feature_df : pandas.DataFrame
        One row per commit to explain.
    test_label : pandas.Series
        Labels aligned positionally with ``feature_df``.
    explainer : str
        Local model type forwarded to ``pyExp.explain``; also part of the
        output file name.

    Returns
    -------
    (list of str, list of str)
        Wall-clock seconds spent on each row (stringified, failed rows
        included) and the index labels of the rows that failed.
    """
    problem_index = []
    time_spent = []

    for i in range(len(feature_df)):
        # Double brackets keep a one-row DataFrame / Series, not a scalar row.
        X_explain = feature_df.iloc[[i]]
        y_explain = test_label.iloc[[i]]

        row_index = str(X_explain.index[0])

        start = time.time()
        try:
            pyExp_obj = pyExp.explain(X_explain,
                                       y_explain,
                                       search_function = search_function,
                                       top_k = 1000,
                                       max_rules=2000,
                                       max_iter =None,
                                       cv=5,
                                       explainer=explainer,
                                       debug = False)
            synt_pred = pyExp_obj['synthetic_predictions']

            print('{}: found {} defect from total {}'.format(row_index, str(np.sum(synt_pred)),
                                                         str(len(synt_pred))))

            out_path = pyExp_dir+proj_name+'_'+explainer+'_'+search_function+'_'+row_index+'.pkl'
            # Close the handle deterministically: this loop writes one file
            # per row, so leaked handles would pile up.
            with open(out_path, 'wb') as out_file:
                pickle.dump(pyExp_obj, out_file)

        except Exception as e:
            # Best effort by design: log, remember the row, keep going.
            problem_index.append(row_index)
            print('-'*100)
            print(e)
            print('found total {} problematic commit'.format(str(len(problem_index))))
            print('-'*100)

        # Timed for failed rows as well, so the list lines up with feature_df.
        time_spent.append(str(time.time() - start))

    return time_spent, problem_index

In [15]:
# Explain every correctly predicted defective commit with random perturbation,
# then persist the per-row timings and the rows that could not be explained.
time_spent_rand, problem_index_rand = create_pyExplainer_obj('randompertubation', feature_df, test_label,'LRR')

# Context managers close the files deterministically.
with open(other_object_dir+proj_name+'_train_time_LRR_randompertubation.pkl','wb') as out_file:
    pickle.dump(time_spent_rand, out_file)
with open(other_object_dir+proj_name+'_problem_index_LRR_randompertubation.pkl','wb') as out_file:
    pickle.dump(problem_index_rand, out_file)

30882: found 764 defect from total 1000




12299: found 711 defect from total 1000
8228: found 664 defect from total 1000




19970: found 732 defect from total 1000




6301: found 570 defect from total 1000
4568: found 576 defect from total 1000
8645: found 579 defect from total 1000




27290: found 714 defect from total 1000




896: found 645 defect from total 1000




26003: found 331 defect from total 1000




9978: found 260 defect from total 1000




5991: found 444 defect from total 1000




979: found 636 defect from total 1000
22918: found 163 defect from total 1000
22835: found 463 defect from total 1000




7485: found 518 defect from total 1000




1761: found 541 defect from total 1000




27395: found 542 defect from total 1000




11696: found 567 defect from total 1000
23815: found 473 defect from total 1000




7082: found 391 defect from total 1000
31839: found 819 defect from total 1000




30682: found 453 defect from total 1000
1938: found 585 defect from total 1000




20578: found 742 defect from total 1000




5374: found 712 defect from total 1000




19855: found 677 defect from total 1000
22111: found 712 defect from total 1000
3380: found 350 defect from total 1000




10688: found 250 defect from total 1000




27116: found 455 defect from total 1000




31231: found 693 defect from total 1000




1910: found 639 defect from total 1000
28320: found 648 defect from total 1000




17951: found 520 defect from total 1000




18669: found 417 defect from total 1000




3435: found 428 defect from total 1000




16525: found 547 defect from total 1000
176: found 493 defect from total 1000




31678: found 717 defect from total 1000




13064: found 366 defect from total 1000
21433: found 646 defect from total 1000




17023: found 270 defect from total 1000




18936: found 642 defect from total 1000




18345: found 542 defect from total 1000
16830: found 516 defect from total 1000




7386: found 612 defect from total 1000




29746: found 777 defect from total 1000




26870: found 816 defect from total 1000




10546: found 603 defect from total 1000




11463: found 792 defect from total 1000




19434: found 467 defect from total 1000




18044: found 569 defect from total 1000




11191: found 698 defect from total 1000
30097: found 674 defect from total 1000
20058: found 504 defect from total 1000




10022: found 339 defect from total 1000




31897: found 398 defect from total 1000




29550: found 736 defect from total 1000




1258: found 678 defect from total 1000




20677: found 676 defect from total 1000
6449: found 350 defect from total 1000




3860: found 617 defect from total 1000




14552: found 380 defect from total 1000




18781: found 458 defect from total 1000




19245: found 281 defect from total 1000




26787: found 190 defect from total 1000
25797: found 713 defect from total 1000




3057: found 759 defect from total 1000




32045: found 731 defect from total 1000




12161: found 383 defect from total 1000




15813: found 666 defect from total 1000




19744: found 229 defect from total 1000




3944: found 614 defect from total 1000
19226: found 662 defect from total 1000




In [16]:
# Same run with crossover/interpolation as the instance generator
# (explainer defaults to 'LRR').
time_spent_ci, problem_index_ci = create_pyExplainer_obj('crossoverinterpolation', feature_df, test_label)

# Context managers close the files deterministically.
with open(other_object_dir+proj_name+'_train_time_LRR_crossoverinterpolation.pkl','wb') as out_file:
    pickle.dump(time_spent_ci, out_file)
with open(other_object_dir+proj_name+'_problem_index_LRR_crossoverinterpolation.pkl','wb') as out_file:
    pickle.dump(problem_index_ci, out_file)

30882: found 981 defect from total 2222
12299: found 1069 defect from total 2240
8228: found 1152 defect from total 2240
19970: found 1058 defect from total 2240
6301: found 808 defect from total 2240
----------------------------------------------------------------------------------------------------
Cannot convert non-finite values (NA or inf) to integer
found total 1 problematic commit
----------------------------------------------------------------------------------------------------
8645: found 976 defect from total 2240
27290: found 948 defect from total 2240
896: found 855 defect from total 2240
26003: found 959 defect from total 2240
9978: found 848 defect from total 2240
5991: found 1248 defect from total 2236
979: found 986 defect from total 2240
22918: found 1256 defect from total 2117
22835: found 1242 defect from total 2164
7485: found 1324 defect from total 2138
1761: found 1250 defect from total 2240
27395: found 1288 defect from total 2147
11696: found 1142 defect from t

## Just for testing

In [19]:
# Scratch cell: a one-iteration loop that prints its counter.
for k in range(1):
    print(k)

0


In [20]:
# test feature binarizer
# from pyexplainer.features import *
# fb = FeatureBinarizer(negations=True)
# fb.fit(x_train)
# display(fb.transform(feature_df))

In [31]:
# Explain a single commit (positional row 10) with random perturbation so the
# returned object can be inspected in the cells below.
search_function='randompertubation'
i = 10

# Double brackets keep a one-row DataFrame / Series.
X_explain = feature_df.iloc[[i]]
y_explain = test_label.iloc[[i]]

row_index = str(X_explain.index[0])

# Timer start (the cell previously assigned this twice in a row).
start = time.time()

pyExp_obj = pyExp.explain(X_explain,
                           y_explain,
                           search_function = search_function,
                           top_k = 1000,
                           max_rules=2000,
                           max_iter =None,
                           cv=5,
                           explainer='LRR',
                           debug = False)

In [32]:
# List the artefacts bundled in the object returned by pyExp.explain().
print(pyExp_obj.keys())

dict_keys(['synthetic_data', 'synthetic_data_fb', 'synthetic_predictions', 'X_explain', 'y_explain', 'X_explain_fb', 'indep', 'dep', 'local_model'])


In [33]:
# Synthetic instances in binarized form: one 0/1 column per
# (feature, operation, value) threshold.
display(pyExp_obj['synthetic_data_fb'])

feature,la,la,la,la,la,la,la,la,la,ld,...,asawr,rsawr,rsawr,rsawr,rsawr,rsawr,rsawr,rsawr,rsawr,rsawr
operation,<=,<=,<=,<=,<=,<=,<=,<=,<=,<=,...,<=,<=,<=,<=,<=,<=,<=,<=,<=,<=
value,1.000000,2.000000,5.000000,9.000000,17.000000,27.000000,44.000000,76.000000,163.000000,0.000000,...,0.433365,0.180553,0.256410,0.319625,0.375489,0.428753,0.481633,0.575898,0.707838,0.848712
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,0,...,1,0,0,0,1,1,1,1,1,1
2,0,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,1,1,1,1,1,1,1,1,0,...,1,0,0,0,1,1,1,1,1,1
996,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
997,1,1,1,1,1,1,1,1,1,0,...,1,1,1,1,1,1,1,1,1,1
998,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


### sample code for RQ1

In [26]:
# Synthetic instances returned by explain(), and the explained instance itself.
synthetic_instances = pyExp_obj['synthetic_data']
sample_instance = pyExp_obj['X_explain']

In [23]:
def agg_list(val):
    """Summarise a collection of numbers as the tuple (mean, median, max)."""
    summarisers = (np.mean, np.median, np.max)
    return tuple(summarise(val) for summarise in summarisers)

# Similarity / distance between the explained instance and every synthetic
# instance. The explained instance is reshaped to a single row, so each call
# yields a one-row matrix; [0] takes that row as a 1-D array.
cos_sim = cosine_similarity(sample_instance.values.reshape(1,-1), synthetic_instances.values)[0]
euclid_dist = euclidean_distances(sample_instance.values.reshape(1,-1), synthetic_instances.values)[0]

### sample code for RQ2

In [35]:
# RQ2 sketch: obtain the local model's prediction for the explained instance.
# The bare triple-quoted strings below are no-op expression statements that
# serve as section notes.
'''get prediction from global model then compare with result obtained from model (but how to compare??)'''

local_model = pyExp_obj['local_model']
'''In case pyExp uses logistic rule regression'''
# The LRR local model is given the binarized form of the instance (output of
# pyExp.feature_binarizer), not the raw feature values.
fb_sample_instance = pyExp.feature_binarizer.transform(sample_instance)
# print(pyExp.feature_binarizer.transform(sample_instance))
local_pred = local_model.predict(fb_sample_instance)
local_pred_prob = local_model.predict_proba(fb_sample_instance)

'''In case pyExp uses RuleFit'''
# The RuleFit variant would take the raw instance instead:
# local_pred = local_model.predict(sample_instance)
# local_pred_prob = local_model.predict_proba(sample_instance)

print(local_pred, local_pred_prob)

[False] [0.06562197]


In [44]:
# sample LRR when used with random perturbation

search_function='randompertubation'

# Positional rows of feature_df chosen as samples.
for i in [3,5,7,20,50,100,83,25,163,127]:
    X_explain = feature_df.iloc[[i]]
    y_explain = test_label.iloc[[i]]

    row_index = str(X_explain.index[0])

    start = time.time()
    try:
        pyExp_obj = pyExp.explain(X_explain,
                                   y_explain,
                                   search_function = search_function,
                                   top_k = 1000,
                                   max_rules=2000,
                                   max_iter =None,
                                   cv=5,
                                   explainer='LRR',
                                   debug = False)
        end = time.time()
        print('time spent to train LRR:',str(end-start),'secs')

        local_model = pyExp_obj['local_model']
        print('------------------Explanation from local model-------------------------')
        print(local_model.explain())
        print('-'*100)
    except Exception as e:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt still stops the cell, and report the real error
        # instead of assuming the cause was a single-class sample.
        print('-'*100)
        print('failed to build a local model for commit {}: {}'.format(row_index, e))
        print('-'*100)
    

building LRR model




time spent to train LRR: 0.3639814853668213 secs
------------------Explanation from local model-------------------------
           rule coefficient
0   (intercept)    0.926495
1  nrev <= 1.00    -11.2576
2    nd <= 1.00    -3.34054
----------------------------------------------------------------------------------------------------
building LRR model




time spent to train LRR: 0.39341306686401367 secs
------------------Explanation from local model-------------------------
            rule coefficient
0    (intercept)    -4.82297
1   nrev <= 1.00    -21.7674
2    ent <= 0.90      11.037
3  asawr <= 0.16     2.48033
4    nuc <= 3.00     2.48033
----------------------------------------------------------------------------------------------------
building LRR model




time spent to train LRR: 0.4327559471130371 secs
------------------Explanation from local model-------------------------
           rule coefficient
0   (intercept)    -9.12203
1  nrev <= 2.00     12.1779
2  nrev <= 1.00    -7.51396
3   ent <= 0.63    -7.51396
----------------------------------------------------------------------------------------------------
building LRR model
----------------------------------------------------------------------------------------------------
there is only 1 class in the generated instances
----------------------------------------------------------------------------------------------------
building LRR model




time spent to train LRR: 0.4454691410064697 secs
------------------Explanation from local model-------------------------
            rule coefficient
0    (intercept)    0.721185
1   nrev <= 1.00    -8.48336
2  asawr <= 0.05    -7.43149
----------------------------------------------------------------------------------------------------
building LRR model
time spent to train LRR: 0.15939950942993164 secs
------------------Explanation from local model-------------------------
          rule coefficient
0  (intercept)    -3.74222
----------------------------------------------------------------------------------------------------
building LRR model




time spent to train LRR: 0.511441707611084 secs
------------------Explanation from local model-------------------------
           rule coefficient
0   (intercept)    -9.11008
1  nrev <= 2.00      14.726
2  nrev <= 1.00    -9.90251
3   ent <= 0.00    -8.33679
----------------------------------------------------------------------------------------------------
building LRR model




time spent to train LRR: 0.4734790325164795 secs
------------------Explanation from local model-------------------------
            rule coefficient
0    (intercept)    -10.4348
1   nrev <= 1.00    -15.8551
2  asawr <= 0.09     15.7259
----------------------------------------------------------------------------------------------------
building LRR model
time spent to train LRR: 0.14858174324035645 secs
------------------Explanation from local model-------------------------
          rule coefficient
0  (intercept)    -2.90924
----------------------------------------------------------------------------------------------------
building LRR model
time spent to train LRR: 0.5116839408874512 secs
------------------Explanation from local model-------------------------
            rule coefficient
0    (intercept)     1.11111
1     ld <= 0.00    -11.0162
2   nrev <= 1.00     -2.5054
3  asawr <= 0.00   -0.249276
4  asawr <= 0.00   -0.249276
----------------------------------------------------



In [17]:
# Needs pyExp_obj from an earlier explain() cell; run on a fresh kernel it
# raises NameError (see the recorded output).
print(pyExp_obj.keys())

NameError: name 'pyExp_obj' is not defined

In [42]:
# sample LRR when used with crossover interpolation

search_function='crossoverinterpolation'

# Positional rows of feature_df chosen as samples (same rows as the
# random-perturbation sample cell).
for i in [3,5,7,20,50,100,83,25,163,127]:
    X_explain = feature_df.iloc[[i]]
    y_explain = test_label.iloc[[i]]

    row_index = str(X_explain.index[0])

    start = time.time()
    try:
        pyExp_obj = pyExp.explain(X_explain,
                                   y_explain,
                                   search_function = search_function,
                                   top_k = 1000,
                                   max_rules=2000,
                                   max_iter =None,
                                   cv=5,
                                   explainer='LRR',
                                   debug = False)
        end = time.time()
        print('time spent to train LRR:',str(end-start),'secs')

        local_model = pyExp_obj['local_model']
        print('------------------Explanation from local model-------------------------')
        print(local_model.explain())
        print('-'*100)
    except Exception as e:
        # This search function can fail for individual commits (the full run
        # above logged such a case); report the row and continue so one bad
        # sample does not abort the rest.
        print('-'*100)
        print('failed to build a local model for commit {}: {}'.format(row_index, e))
        print('-'*100)
    
    

building LRR model
time spent to train LRR: 2.72156023979187 secs
------------------Explanation from local model-------------------------
          rule coefficient
0  (intercept)    0.849159
1  la <= 27.00    -1.38287
2  la <= 44.00   -0.769765
----------------------------------------------------------------------------------------------------
building LRR model
time spent to train LRR: 2.8035364151000977 secs
------------------Explanation from local model-------------------------
          rule coefficient
0  (intercept)    0.676163
1  la <= 44.00     -1.1893
----------------------------------------------------------------------------------------------------
building LRR model
time spent to train LRR: 2.8284826278686523 secs
------------------Explanation from local model-------------------------
          rule coefficient
0  (intercept)     1.26522
1  la <= 44.00    -1.39631
2  la <= 76.00   -0.731139
-----------------------------------------------------------------------------------

In [19]:
# Keys of whatever explanation object pyExp_obj currently holds.
print(pyExp_obj.keys())

dict_keys(['synthetic_data', 'synthetic_predictions', 'X_explain', 'y_explain', 'X_explain_fb', 'indep', 'dep', 'local_model'])


In [20]:
# display(pyExp_obj['synthetic_data'].columns)
# print(feature_df.index)
# Local model stored in whatever explanation object pyExp_obj currently holds.
local_model = pyExp_obj['local_model']

In [21]:
# Print the local model's rule / coefficient table.
# NOTE(review): maxCoeffs=None presumably lifts any limit on the number of
# rules listed — confirm against the local model's explain() documentation.
print(local_model.explain( maxCoeffs=None))
# print(local_model.z)

          rule coefficient
0  (intercept)     1.21719
1  la <= 44.00    -2.10262
2  la <= 76.00   -0.561556


In [22]:
# Inspect the synthetic instances of the current explanation object. As the
# recorded output shows, the columns are a three-level MultiIndex of
# (feature, operation, value).
generated_instance = pyExp_obj['synthetic_data']
print(generated_instance.columns)
display(generated_instance)

MultiIndex([(   'la', '<=',                 1.0),
            (   'la', '<=',                 2.0),
            (   'la', '<=',                 5.0),
            (   'la', '<=',                 9.0),
            (   'la', '<=',                17.0),
            (   'la', '<=',                27.0),
            (   'la', '<=',                44.0),
            (   'la', '<=',                76.0),
            (   'la', '<=',               163.0),
            (   'ld', '<=',                 0.0),
            ...
            ('asawr', '<=',   0.433364602876798),
            ('rsawr', '<=', 0.18055330452007923),
            ('rsawr', '<=',  0.2564102564102564),
            ('rsawr', '<=',  0.3196254791765793),
            ('rsawr', '<=',  0.3754889178617992),
            ('rsawr', '<=',  0.4287529047714299),
            ('rsawr', '<=',  0.4816326530612245),
            ('rsawr', '<=',  0.5758975125536251),
            ('rsawr', '<=',  0.7078384798099763),
            ('rsawr', '<=',  0.848

feature,la,la,la,la,la,la,la,la,la,ld,...,asawr,rsawr,rsawr,rsawr,rsawr,rsawr,rsawr,rsawr,rsawr,rsawr
operation,<=,<=,<=,<=,<=,<=,<=,<=,<=,<=,...,<=,<=,<=,<=,<=,<=,<=,<=,<=,<=
value,1.000000,2.000000,5.000000,9.000000,17.000000,27.000000,44.000000,76.000000,163.000000,0.000000,...,0.433365,0.180553,0.256410,0.319625,0.375489,0.428753,0.481633,0.575898,0.707838,0.848712
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
1,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,1,1
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,1
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
4,0,0,0,0,0,1,1,1,1,0,...,1,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2116,1,1,1,1,1,1,1,1,1,0,...,1,0,0,0,0,0,1,1,1,1
2117,0,0,0,0,0,0,0,0,1,0,...,1,1,1,1,1,1,1,1,1,1
2118,0,0,0,0,0,0,1,1,1,1,...,1,0,0,0,0,0,0,0,1,1
2119,1,1,1,1,1,1,1,1,1,0,...,1,0,0,0,0,0,0,0,0,1


In [41]:
# Select the single binarized column for the rule "la <= 44.0" by matching
# all three column-index levels (feature, operation, value).
print(generated_instance.loc[:, (generated_instance.columns.get_level_values(0)=='la') & 
                            (generated_instance.columns.get_level_values(1)=='<=') &
                            (generated_instance.columns.get_level_values(2)==44.0)  ])

feature     la
operation   <=
value     44.0
0            0
1            0
2            0
3            0
4            1
...        ...
2116         1
2117         0
2118         1
2119         1
2120         0

[2121 rows x 1 columns]


In [23]:
# # search_function = 'crossoverinterpolation' # 'randompertubation' or 'crossoverinterpolation'
# search_functions = ['randompertubation', 'crossoverinterpolation']

# for i in range(0,len(feature_df)):
#     X_explain = feature_df.iloc[[i]]
#     y_explain = test_label.iloc[[i]]
    
#     row_index = str(X_explain.index[0])
    
#     try:
#         pyExp_obj = pyExp.explain(X_explain,
#                                    y_explain,
#                                    search_function = search_function, 
#                                    top_k = 1000, 
#                                    max_rules=2000, 
#                                    max_iter =None, 
#                                    cv=5,
#                                    debug = False)
#         pickle.dump(pyExp_obj, open(pyExp_dir+search_function+'_'+row_index+'.pkl','wb'))
        
#         synt_pred = pyExp_obj['synthetic_predictions']
#         print('{}: found {} defect from total {}'.format(row_index, str(np.sum(synt_pred)), 
#                                                          str(len(synt_pred))))
# #         print('finished', row_index)
#     except:
#         problem_index.append(row_index)
# #     print(row_index)
#     break

In [24]:
# explain_index = 13
# X_explain = feature_df.iloc[[explain_index]]
# X_explain

In [25]:
# y_explain = test_label.iloc[[explain_index]]
# y_explain

In [26]:
# search_function = 'crossoverinterpolation' # 'randompertubation' or 'crossoverinterpolation''
# start = time.time()
# create_pyExp_rule_obj = pyExp.explain(X_explain,
#                                y_explain,
#                                search_function = search_function, 
#                                top_k = 1000, 
#                                max_rules=2000, 
#                                max_iter =None, 
#                                cv=5,
#                                debug = False)

# end = time.time()

In [27]:
# print('time spent {}'.format(str(end-start)))
# pickle.dump(create_pyExp_rule_obj, open(pyExp_dir+search_function+'_'+str(explain_index)+'.pkl','wb'))

In [28]:
# display(create_pyExp_rule_obj['synthetic_data'])

In [29]:
# # print(create_pyExp_rule_obj['synthetic_predictions'])
# # print(np.sum(create_pyExp_rule_obj['synthetic_predictions']))
# display(create_pyExp_rule_obj.keys())
# print(create_pyExp_rule_obj['synthetic_predictions'])