In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.metrics import roc_auc_score, f1_score, matthews_corrcoef, balanced_accuracy_score, auc
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

from my_util import *
from lime.lime.lime_tabular import LimeTabularExplainer


import sys, os,  pickle, time


# sys.path.append(os.path.abspath('../'))
# from pyexplainer.pyexplainer_pyexplainer import PyExplainer

# from datetime import datetime

from IPython.display import display

from multiprocessing import Pool

import warnings
warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'lime.lime'

In [2]:
# Input data location and the output folders this notebook writes into.
data_path = './dataset/'
result_dir = './eval_result/'
dump_dataframe_dir = './prediction_result/'
exp_dir = './explainer_object/'

# proj_name = 'openstack' # ['openstack','qt']

# Create every output folder that does not exist yet (data_path is only read,
# so it is not created here).
for _output_dir in (result_dir, dump_dataframe_dir, exp_dir):
    if not os.path.exists(_output_dir):
        os.makedirs(_output_dir)
del _output_dir
    

In [11]:
def train_global_model(proj_name, x_train,y_train, global_model_name = 'RF'):
    """Fit a global (black-box) model on SMOTE-resampled training data and pickle it.

    Parameters
    ----------
    proj_name : str
        Project name; used as the prefix of the output file name.
    x_train, y_train
        Training features and labels; they are passed to ``SMOTE.fit_resample``
        and the resampled result is what the model is fitted on.
    global_model_name : str, default 'RF'
        'RF' (RandomForestClassifier) or 'LR' (LogisticRegression),
        case-insensitive.

    Returns
    -------
    None
        The fitted model is written to ``<proj_name>_<MODEL>_global_model.pkl``
        (relative to the current working directory). If the model name is not
        supported, a message is printed and nothing is trained or written.
    """
    global_model_name = global_model_name.upper()
    if global_model_name not in ['RF','LR']:
        print('wrong global model name. the global model name must be RF or LR')
        return

    # Resample the training set with SMOTE before fitting.
    smt = SMOTE(k_neighbors=5, random_state=42, n_jobs=24)
    new_x_train, new_y_train = smt.fit_resample(x_train, y_train)

    if global_model_name == 'RF':
        global_model = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=24)
    else:  # 'LR' is the only other name that passes the check above
        global_model = LogisticRegression(random_state=0, n_jobs=24)

    global_model.fit(new_x_train, new_y_train)

    # Use a context manager so the file is flushed and closed as soon as the
    # model is written, instead of whenever the handle gets garbage-collected.
    model_path = proj_name+'_'+global_model_name+'_global_model.pkl'
    with open(model_path, 'wb') as model_file:
        pickle.dump(global_model, model_file)
    
# Optional switch to (re)train the black-box models directly from this cell.
# It is off by default; train_global_model_runner (defined in a later cell)
# performs the same training.
train_black_box = False

if train_black_box:
    # train_global_model expects the project name as its first argument, and
    # the training split has to be loaded here because no earlier cell defines
    # x_train / y_train.
    for proj_name in ['openstack', 'qt']:
        x_train, x_test, y_train, y_test = prepare_data(proj_name, mode = 'all')
        train_global_model(proj_name, x_train, y_train,'RF')
        train_global_model(proj_name, x_train, y_train,'LR')

In [25]:
def get_correctly_predicted_defective_commit_indices(proj_name, global_model_name, x_test, y_test):
    """Return the index labels of test rows that the global model correctly predicts as defective.

    A row counts as correctly predicted when both the model prediction and the
    true label equal 1.

    The result is cached on disk: on the first call for a given project/model
    the pickled global model is loaded, the whole test set is scored, and two
    CSV files are written under ``dump_dataframe_dir`` (all predictions, and
    the correctly-predicted defective subset). When both files already exist
    the subset is read back from disk and ``x_test`` / ``y_test`` are not used.

    Parameters
    ----------
    proj_name : str
        Project name; prefix of the model pickle and of the CSV file names.
    global_model_name : str
        'RF' or 'LR', case-insensitive.
    x_test : pandas.DataFrame
        Test features (only used when the cache files are missing).
    y_test
        True labels for ``x_test`` (only used when the cache files are missing).

    Returns
    -------
    pandas.Index or None
        Index of the correctly-predicted defective rows, or None (after
        printing a message) when the model name is not supported.
    """
    global_model_name = global_model_name.upper()
    if global_model_name not in ['RF','LR']:
        print('wrong global model name. the global model name must be RF or LR')
        return

    file_prefix = dump_dataframe_dir+proj_name+'_'+global_model_name
    prediction_df_dir = file_prefix+'_prediction_result.csv'
    correctly_predict_df_dir = file_prefix+'_correctly_predict_as_defective.csv'

    if os.path.exists(prediction_df_dir) and os.path.exists(correctly_predict_df_dir):
        # Cached result. to_csv() below writes the frame's index as the first
        # column, so restore it by position: this works whatever the index was
        # called (a hard-coded 'commit_id' lookup fails for unnamed indexes).
        correctly_predict_df = pd.read_csv(correctly_predict_df_dir, index_col=0)
        return correctly_predict_df.index

    # NOTE: pickle.load executes code embedded in the file; only load model
    # files produced by this notebook.
    with open(proj_name+'_'+global_model_name+'_global_model.pkl','rb') as model_file:
        global_model = pickle.load(model_file)

    pred = global_model.predict(x_test)
    defective_prob = global_model.predict_proba(x_test)[:,1]

    prediction_df = x_test.copy()
    prediction_df['pred'] = pred
    prediction_df['defective_prob'] = defective_prob
    prediction_df['defect'] = y_test

    # keep only the true positives: predicted defective and actually defective
    is_true_positive = (prediction_df['pred']==1) & (prediction_df['defect']==1)
    correctly_predict_df = prediction_df[is_true_positive]

    prediction_df.to_csv(prediction_df_dir)
    correctly_predict_df.to_csv(correctly_predict_df_dir)

    return correctly_predict_df.index

In [27]:
def create_every_explainer(proj_name, global_model_name, x_train, x_test, y_train, y_test, df_indices):
    """Build a PyExplainer and a LIME explanation for each selected test row and pickle them.

    For every label in ``df_indices`` one file
    ``<exp_dir><proj_name>_<MODEL>_all_explainer_<label>.pkl`` is written. It
    holds a dict with two entries, ``'pyExplainer'`` and ``'LIME'``; each is a
    dict that carries (among other things) the local model under
    ``'local_model'`` and the row label under ``'commit_id'``.

    Parameters
    ----------
    proj_name : str
        Project name; prefix of the model pickle and of the output file names.
    global_model_name : str
        'RF' or 'LR', case-insensitive.
    x_train, y_train
        Training data handed to both explainers.
    x_test, y_test
        Test features / labels, indexed by the labels found in ``df_indices``.
    df_indices : iterable
        Index labels of the test rows to explain (any iterable, including a set).

    Returns
    -------
    None
        Prints a message and returns early when the model name is not supported.
    """
    global_model_name = global_model_name.upper()
    if global_model_name not in ['RF','LR']:
        print('wrong global model name. the global model name must be RF or LR')
        return

    # NOTE: pickle.load executes code embedded in the file; only load model
    # files produced by this notebook.
    with open(proj_name+'_'+global_model_name+'_global_model.pkl','rb') as model_file:
        global_model = pickle.load(model_file)

    indep = x_test.columns
    dep = 'defect'
    class_label = ['clean', 'defect']

    # for our approach
    pyExp = PyExplainer(x_train, y_train, indep, dep, global_model, class_label)

    # for baseline
    # note: 6 is index of 'self' feature
    lime_explainer = LimeTabularExplainer(x_train.values, categorical_features=[6],
                                          feature_names=indep, class_names=class_label,
                                          random_state=0)

    # .loc needs a list-like indexer; callers pass sets, which pandas >= 2.0
    # rejects, so materialise the labels as a list first.
    row_labels = list(df_indices)
    feature_df = x_test.loc[row_labels]
    test_label = y_test.loc[row_labels]

    for i in range(0,len(feature_df)):
        # one-row DataFrame / Series for PyExplainer
        X_explain = feature_df.iloc[[i]]
        y_explain = test_label.iloc[[i]]

        row_index = str(X_explain.index[0])

        pyExp_obj = pyExp.explain(X_explain,
                                  y_explain,
                                  search_function = 'CrossoverInterpolation',
                                  top_k = 15,
                                  max_rules=2000,
                                  max_iter = None,
                                  cv=5,
                                  debug = False)
        pyExp_obj['commit_id'] = row_index

        # because I don't want to change key name in another evaluation file
        pyExp_obj['local_model'] = pyExp_obj['local_rulefit_model']
        del pyExp_obj['local_rulefit_model']

        X_explain = feature_df.iloc[i] # to prevent error in LIME
        # NOTE(review): five values are unpacked here, so this presumably relies
        # on the customised LIME imported at the top of the notebook rather than
        # the stock package -- confirm.
        (exp, synt_inst, synt_inst_for_local_model,
         selected_feature_indices, local_model) = lime_explainer.explain_instance(
            X_explain,
            global_model.predict_proba,
            num_samples=5000)

        # NOTE(review): two keys below are misspelled ('lobal', 'indeces'). They
        # are stored in the pickle, so they are kept as-is; presumably other
        # evaluation code reads them -- confirm before renaming.
        lime_obj = {}
        lime_obj['rule'] = exp
        lime_obj['synthetic_instance_for_global_model'] = synt_inst
        lime_obj['synthetic_instance_for_lobal_model'] = synt_inst_for_local_model
        lime_obj['local_model'] = local_model
        lime_obj['selected_feature_indeces'] = selected_feature_indices
        lime_obj['commit_id'] = row_index

        all_explainer = {'pyExplainer':pyExp_obj, 'LIME': lime_obj}
        # context manager: flush and close each output file right away
        explainer_path = exp_dir+proj_name+'_'+global_model_name+'_all_explainer_'+row_index+'.pkl'
        with open(explainer_path,'wb') as explainer_file:
            pickle.dump(all_explainer, explainer_file)

        print('finished',row_index)


In [23]:
def train_global_model_runner(proj_name):
    """Load the data of ``proj_name`` and train its RF and then its LR global model.

    Only the training split returned by ``prepare_data`` is used; a progress
    line is printed after each model has been trained.
    """
    x_train, _, y_train, _ = prepare_data(proj_name, mode = 'all')

    # (model name, message printed once that model has been trained)
    training_plan = (
        ('RF', 'train RF of {} finished'),
        ('LR', 'train LR of {} finished'),
    )
    for model_name, done_message in training_plan:
        train_global_model(proj_name, x_train, y_train, model_name)
        print(done_message.format(proj_name))
    
def train_explainer(proj_name):
    """Create and pickle the explainers for every test row of ``proj_name``, for both global models.

    For each model the rows that were correctly predicted as defective are
    processed first (RF, then LR), followed by all remaining test rows (RF,
    then LR).

    The row labels are handed to ``create_every_explainer`` as ordered lists
    rather than sets: sets are not accepted as ``.loc`` indexers by
    pandas >= 2.0, and their iteration order is not stable between runs.
    """
    x_train, x_test, y_train, y_test = prepare_data(proj_name, mode = 'all')

    # unique test-set labels, in their original order
    all_indices = list(dict.fromkeys(x_test.index))

    correct_by_model = {}
    remaining_by_model = {}
    for model_name in ('RF', 'LR'):
        correct_index = get_correctly_predicted_defective_commit_indices(
            proj_name, model_name.lower(), x_test, y_test)
        # de-duplicate while keeping the order of the returned index
        correct_labels = list(dict.fromkeys(correct_index))
        correct_lookup = set(correct_labels)

        correct_by_model[model_name] = correct_labels
        remaining_by_model[model_name] = [
            label for label in all_indices if label not in correct_lookup
        ]

    # same call order as before: RF/LR correct rows, then RF/LR remaining rows
    for labels_by_model in (correct_by_model, remaining_by_model):
        for model_name in ('RF', 'LR'):
            create_every_explainer(proj_name, model_name, x_train, x_test, y_train, y_test,
                                   labels_by_model[model_name])

In [15]:
# runs fine: train and pickle the RF and LR global models of both projects
for _project in ('openstack', 'qt'):
    train_global_model_runner(_project)

train RF of openstack finished
train LR of openstack finished
train RF of qt finished
train LR of qt finished


In [28]:
# Create and pickle the explainers for the openstack test rows (RF and LR).
# The saved output of this cell ends with a KeyboardInterrupt.
train_explainer('openstack')

--------------------------------------------------------------------------------
finished 34a208d1f3829173815beca81d07b53633a12989
--------------------------------------------------------------------------------
finished 9567c2b6a06aa1e8205f9f30beca63d77500dd1d
--------------------------------------------------------------------------------
finished 1679acd53d1f0c330edf583afe8b347a7304499c
--------------------------------------------------------------------------------
finished d632b66dc8b701ca777af4335b6505b4c4cd7828


KeyboardInterrupt: 

In [29]:
# Create and pickle the explainers for the qt test rows (RF and LR).
# The saved output of this cell ends with a KeyboardInterrupt.
train_explainer('qt')

--------------------------------------------------------------------------------
finished 4e15aa6d7c4f9a03f4ae57b3ba04ade3400cccf1
--------------------------------------------------------------------------------
finished 23bdca417bde716c79168ab372083fd885607123
--------------------------------------------------------------------------------
finished 52fc0a95a109d2e9fa279eeb0284a8178563080b
--------------------------------------------------------------------------------
finished 459c9a2a8840995436e610459216957bc7ebd914
--------------------------------------------------------------------------------
finished 4dbf574b7acb7ae8f852219700afa95f8d568f0e


KeyboardInterrupt: 