In [18]:
import pandas as pd
import numpy as np
import math
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

import platform
from os import listdir
from os.path import isfile, join
from glob import glob
from pathlib import Path
import sys
import os
import copy
import traceback



import matplotlib.pyplot as plt

import SMOTE
import feature_selector
import DE
import CFS
import metrics.abcd

import metrices
import measures

import sys
import traceback
import warnings
warnings.filterwarnings("ignore")

In [19]:
data_source1 = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/1385/converted'
if platform.system() == 'Darwin' or platform.system() == 'Linux':
    _dir = data_source1 + '/'
else:
    _dir = data_source1 + '\\'
projects = [f for f in listdir(_dir) if isfile(join(_dir, f))]

In [20]:
def prepare_data(path):
    df = pd.read_csv(path)
    df = df.drop(labels = ['Host','Vcs','Project','File','PL','IssueTracking'],axis=1)
    df = df.dropna()
    df = df[['TLOC', 'TNF', 'TNC', 'TND', 'LOC', 'CL', 'NStmt', 'NFunc',
       'RCC', 'MNL', 'avg_WMC', 'max_WMC', 'total_WMC', 'avg_DIT', 'max_DIT',
       'total_DIT', 'avg_RFC', 'max_RFC', 'total_RFC', 'avg_NOC', 'max_NOC',
       'total_NOC', 'avg_CBO', 'max_CBO', 'total_CBO', 'avg_DIT.1',
       'max_DIT.1', 'total_DIT.1', 'avg_NIV', 'max_NIV', 'total_NIV',
       'avg_NIM', 'max_NIM', 'total_NIM', 'avg_NOM', 'max_NOM', 'total_NOM',
       'avg_NPBM', 'max_NPBM', 'total_NPBM', 'avg_NPM', 'max_NPM', 'total_NPM',
       'avg_NPRM', 'max_NPRM', 'total_NPRM', 'avg_CC', 'max_CC', 'total_CC',
       'avg_FANIN', 'max_FANIN', 'total_FANIN', 'avg_FANOUT', 'max_FANOUT',
       'total_FANOUT', 'NRev', 'NFix', 'avg_AddedLOC', 'max_AddedLOC',
       'total_AddedLOC', 'avg_DeletedLOC', 'max_DeletedLOC',
       'total_DeletedLOC', 'avg_ModifiedLOC', 'max_ModifiedLOC',
       'total_ModifiedLOC','Buggy']]
    return df

def get_features(df):
    fs = feature_selector.featureSelector()
    df,_feature_nums,features = fs.cfs_bfs(df)
    return df,features

def apply_cfs(df):
    y = df.Buggy.values
    X = df.drop(labels = ['Buggy'],axis = 1)
    X = X.values
    selected_cols = CFS.cfs(X,y)
    cols = df.columns[[selected_cols]].tolist()
    cols.append('Buggy')
    return df[cols],cols
    
def apply_smote(df):
    cols = df.columns
    smt = SMOTE.smote(df)
    df = smt.run()
    df.columns = cols
    return df

def tune_learner(learner, train_X, train_Y, tune_X, tune_Y, goal,loc=None,target_class=None):
    if not target_class:
        target_class = goal
    clf = learner(train_X, train_Y, tune_X, tune_Y, goal,loc)
    tuner = DE.DE_Tune_ML(clf, clf.get_param(), goal, target_class)
    return tuner.Tune()

In [21]:
class DE_Learners(object):
    def __init__(self, clf, train_X, train_Y, test_X, test_Y, goal,loc):
        """

        :param clf: classifier, SVM, etc...
        :param train_X: training data, independent variables.
        :param train_Y: training labels, dependent variables.
        :param predict_X: testing data, independent variables.
        :param predict_Y: testingd labels, dependent variables.
        :param goal: the objective of your tuning, F, recision,....
        """
        self.train_X = train_X
        self.train_Y = train_Y
        self.test_X = test_X
        self.test_Y = test_Y
        self.loc = loc
        self.goal = goal
        self.param_distribution = self.get_param()
        self.learner = clf
        self.confusion = None
        self.params = None
        
    def apply_smote(self,df,neighbours,r,up_to_num,auto):
        cols = df.columns
        smt = SMOTE.smote(df,neighbor = neighbours,r = r,up_to_num=up_to_num,auto=auto)
        df = smt.run()
        df.columns = cols
        return df

    def learn(self,F, **kwargs):
        """
        :param F: a dict, holds all scores, can be used during debugging
        :param kwargs: a dict, all the parameters need to set after tuning.
        :return: F, scores.
        """
        self.scores = {self.goal: [0.0]}
        try:   
            neighbours = kwargs.pop('neighbours')
            r = kwargs.pop('r')
            up_to_num = kwargs.pop('up_to_num')
            self.learner.set_params(**kwargs)
            _df = pd.concat([self.train_X, self.train_Y], axis = 1)
            _df = self.apply_smote(_df,neighbours,r,up_to_num,True)
            y_train = _df.Buggy
            X_train = _df.drop(labels = ['Buggy'],axis = 1)
            predict_result = []
            clf = self.learner.fit(X_train, y_train)
            predict_result = clf.predict(self.test_X)
            self.abcd = metrices.measures(self.test_Y,predict_result,self.loc)
            self.scores = self._Abcd(self.abcd,F)
            self.params = kwargs
        except Exception as e:
            #print(e)
            a = 10
        return self.scores
    
    def _Abcd(self,abcd , F):
        """

        :param predicted: predicted results(labels)
        :param actual: actual results(labels)
        :param F: previously got scores
        :return: updated scores.
        """
        if self.goal in ['f1','precision','recall','g-score','d2h','ifa','pci_20']:
            F['f1'] = [abcd.calculate_f1_score()]
            F['precision'] = [abcd.calculate_precision()]
            F['recall'] = [abcd.calculate_recall()]
            F['g-score'] = [abcd.get_g_score()]
            F['d2h'] = [abcd.calculate_d2h()]
            F['pci_20'] = [abcd.get_pci_20()]
            F['ifa'] = [abcd.get_ifa()]
            return F
        else:
            print("wronging goal")
            return F

    def predict(self,test_X):
        return self.learner.predict(test_X)

In [22]:
class SK_LR(DE_Learners):
    def __init__(self, train_x, train_y, predict_x, predict_y, goal,loc=None):
        clf = LogisticRegression()
        super(SK_LR, self).__init__(clf, train_x, train_y, predict_x, predict_y,goal,loc)

    def get_param(self):
        tunelst = {"penalty": ['l1', 'l2','elasticnet','none'],
                   "multi_class": ['ovr', 'multinomial','auto'],
                   "C": [1.0,200.0],
                   "dual": [True, False],
                   "fit_intercept": [True, False],
                   "intercept_scaling": [1.0,100.0],
                   "class_weight": ["balanced", 'none'],
                   "solver": ['newton-cg','lbfgs','liblinear','sag', 'saga'],
                   "warm_start": [True, False],
                   "max_iter": [100,600],
                   "neighbours": [5,21],
                   "r":[1,6],
                   "up_to_num": [1,4000]}
        return tunelst

In [23]:
final_score = {}
count = 0
for s_project in projects[0:10]:
    try:
        s_path = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/1385/converted/' + s_project
        print(s_project)
        df = prepare_data(s_path)
        if df.shape[0] < 50:
            continue
        else:
            count+=1
        df.reset_index(drop=True,inplace=True)
        d = {'buggy': True, 'clean': False}
        df['Buggy'] = df['Buggy'].map(d)
        #df,selected_cols = apply_cfs(df) # applying cfs above the loop
        y = df.Buggy
        X = df.drop(labels = ['Buggy'],axis = 1)
        kf = StratifiedKFold(n_splits = 5)
        goal = 'f1'
        learner = [SK_LR][0]
        F = {}
        score = {}
        for i in range(5):
            for train_index, tune_index in kf.split(X, y):
                X_train, X_tune = X.iloc[train_index], X.iloc[tune_index]
                y_train, y_tune = y[train_index], y[tune_index]
                _df = pd.concat([X_train,y_train], axis = 1) #applying DE inside loop
                _df,selected_cols = apply_cfs(_df)
                y_train = _df.Buggy
                X_train = _df.drop(labels = ['Buggy'],axis = 1)
                _df_tune = pd.concat([X_tune,y_tune], axis = 1)
                _df_tune_loc = _df_tune.LOC
                _df_tune = _df_tune[selected_cols]
                y_tune = _df_tune.Buggy
                X_tune = _df_tune.drop(labels = ['Buggy'],axis = 1)
                params, evaluation = tune_learner(learner, X_train, y_train,  X_tune,y_tune, goal,_df_tune_loc)  
                destination_projects = copy.deepcopy(projects)
                destination_projects.remove(s_project)
                for d_project in destination_projects:
                    try:
                        d_path = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/1385/converted/' + d_project
                        test_df = prepare_data(d_path)
                        if dest_df.shape[0] < 50:
                            continue
                        test_df.reset_index(drop=True,inplace=True)
                        d = {'buggy': True, 'clean': False}
                        test_df['Buggy'] = test_df['Buggy'].map(d)
                        df_test_loc = test_df.LOC
                        test_df = test_df[selected_cols]
                        test_y = test_df.Buggy
                        test_X = test_df.drop(labels = ['Buggy'],axis = 1)
                        clf = learner(X_train, y_train,  test_X,test_y, goal,df_test_loc)
                        F = clf.learn(F,**params)
                        _F = copy.deepcopy(F)
                        if 'f1' not in score.keys():
                            score[d_project] = _F
                        else:
                            score[d_project]['f1'].append(F['f1'][0])
                            score[d_project]['precision'].append(F['precision'][0])
                            score[d_project]['recall'].append(F['recall'][0])
                            score[d_project]['g-score'].append(F['g-score'][0])
                            score[d_project]['d2h'].append(F['d2h'][0])
                            score[d_project]['pci_20'].append(F['pci_20'][0])
                            score[d_project]['ifa'].append(F['ifa'][0])
                            score[d_project]['pd'].append(F['pd'][0])
                            score[d_project]['pf'].append(F['pf'][0])
                    except Exception as e:
                        print("Exception at test",s_project,d_project,e)
                        continue
            final_score[s_project] = score 
    except Exception as e:
        print("Exception at train",s_project,d_project,e)
        continue
        
        

bzbyte.csv
Exception at test bzbyte.csv pde.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv eclipse.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv lucene.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv equinox.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv mylyn.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv pde.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv eclipse.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv lucene.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv equinox.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv mylyn.csv "['Host' 'Vcs' 'File' 'PL'

Exception at test bzbyte.csv pde.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv eclipse.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv lucene.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv equinox.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv mylyn.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv pde.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv eclipse.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv lucene.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv equinox.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test bzbyte.csv mylyn.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTrac

Exception at test freedom-erp.csv equinox.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.csv mylyn.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.csv pde.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.csv eclipse.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.csv lucene.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.csv equinox.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.csv mylyn.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.csv pde.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.csv eclipse.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.cs

Exception at test freedom-erp.csv pde.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.csv eclipse.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.csv lucene.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.csv equinox.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.csv mylyn.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.csv pde.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.csv eclipse.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.csv lucene.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.csv equinox.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test freedom-erp.c

Exception at test twostep.csv mylyn.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test twostep.csv pde.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test twostep.csv eclipse.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test twostep.csv lucene.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test twostep.csv equinox.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test twostep.csv mylyn.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test twostep.csv pde.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test twostep.csv eclipse.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test twostep.csv lucene.csv "['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
Exception at test twostep.csv equinox.csv "['Host' 'Vcs' 'File' 'PL' 

In [24]:
final_score

{'bzbyte.csv': {'lcdata.csv': {'f1': [0.33],
   'precision': [0.25],
   'recall': [0.5],
   'g-score': [0.0],
   'd2h': [1.0],
   'pci_20': [87],
   'ifa': [7]},
  'llcon.csv': {'f1': [0.55],
   'precision': [0.49],
   'recall': [0.62],
   'g-score': [0.0],
   'd2h': [1.01],
   'pci_20': [97],
   'ifa': [0]},
  'freedom-erp.csv': {'f1': [0.68],
   'precision': [0.68],
   'recall': [0.68],
   'g-score': [0.72],
   'd2h': [0.51],
   'pci_20': [99],
   'ifa': [8]},
  'gpsee.csv': {'f1': [0.29],
   'precision': [0.45],
   'recall': [0.38],
   'g-score': [0.34],
   'd2h': [0.92],
   'pci_20': [97],
   'ifa': [2]},
  'twostep.csv': {'f1': [0.24],
   'precision': [0.45],
   'recall': [0.37],
   'g-score': [0.17],
   'd2h': [0.96],
   'pci_20': [98],
   'ifa': [1]},
  'tauruss.csv': {'f1': [0.0]},
  'makumba.csv': {'f1': [0.45],
   'precision': [0.59],
   'recall': [0.46],
   'g-score': [0.36],
   'd2h': [0.73],
   'pci_20': [97],
   'ifa': [6]},
  'openi.csv': {'f1': [0.52],
   'precision': [

In [25]:
with open('data/1385_LR_DE_bellwether_100.pkl', 'wb') as handle:
    pickle.dump(final_score, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [23]:
df = pd.read_pickle('data/1385_LR_default_bellwether_100.pkl')
results_f1 = {}
results_precision = {}
results_recall = {}
results_g = {}
results_d2h = {}
results_pci_20 = {}
results_ifa = {}
results_pd = {}
results_pf = {}
for s_project in df.keys():
    if s_project not in results_f1.keys():
        results_f1[s_project] = {}
        results_precision[s_project] = {}
        results_recall[s_project] = {}
        results_g[s_project] = {}
        results_d2h[s_project] = {}
        results_pci_20[s_project] = {}
        results_ifa[s_project] = {}
        results_pd[s_project] = {}
        results_pf[s_project] = {}
    for d_projects in df[s_project].keys():
        results_f1[s_project][d_projects] = np.median(df[s_project][d_projects]['f1'])
        results_precision[s_project][d_projects] = np.median(df[s_project][d_projects]['precision'])
        results_recall[s_project][d_projects] = np.median(df[s_project][d_projects]['recall'])
        results_g[s_project][d_projects] = np.median(df[s_project][d_projects]['g-score'])
        results_d2h[s_project][d_projects] = np.median(df[s_project][d_projects]['d2h'])
        results_pci_20[s_project][d_projects] = np.median(df[s_project][d_projects]['pci_20'])
        results_ifa[s_project][d_projects] = np.median(df[s_project][d_projects]['ifa'])
        results_pd[s_project][d_projects] = np.median(df[s_project][d_projects]['pd'])
        results_pf[s_project][d_projects] = np.median(df[s_project][d_projects]['pf'])

In [24]:
results_f1_df = pd.DataFrame.from_dict(results_f1, orient='index')
results_precision_df = pd.DataFrame.from_dict(results_precision, orient='index')
results_recall_df = pd.DataFrame.from_dict(results_recall, orient='index')
results_g_df = pd.DataFrame.from_dict(results_g, orient='index')
results_d2h_df = pd.DataFrame.from_dict(results_d2h, orient='index')
results_pci_20_df = pd.DataFrame.from_dict(results_pci_20, orient='index')
results_ifa_df = pd.DataFrame.from_dict(results_ifa, orient='index')
results_pd_df = pd.DataFrame.from_dict(results_pd, orient='index')
results_pf_df = pd.DataFrame.from_dict(results_pf, orient='index')

In [27]:
results_f1_df.to_csv('data/1385/100/1385_LR_bellwether_f1.csv')
results_precision_df.to_csv('data/1385/100/1385_LR_bellwether_precision.csv')
results_recall_df.to_csv('data/1385/100/1385_LR_bellwether_recall.csv')
results_g_df.to_csv('data/1385/100/1385_LR_bellwether_g.csv')
results_d2h_df.to_csv('data/1385/100/1385_LR_bellwether_d2h.csv')
results_pci_20_df.to_csv('data/1385/100/1385_LR_bellwether_pci_20.csv')
results_ifa_df.to_csv('data/1385/100/1385_LR_bellwether_ifa.csv')
results_pd_df.to_csv('data/1385/100/1385_LR_bellwether_pd.csv')
results_pf_df.to_csv('data/1385/100/1385_LR_bellwether_pf.csv')

Unnamed: 0,freedom-erp.csv,twostep.csv,iaml.csv,etics.csv,neoengine.csv,nsis.csv,vdsf.csv,monetdb.csv,amygdala.csv,ikvm.csv,...,gnukeda.csv,powerfolder-.csv,jparsec.csv,m-m-m.csv,adonthell.csv,k3d.csv,fmri-dmt.csv,amanda.csv,jaql.csv,bzbyte.csv
adonthell.csv,0.25,0.54,0.26,0.09,0.48,0.01,0.87,0.55,0.36,0.42,...,0.28,0.58,0.22,0.76,,0.79,0.88,0.37,0.46,0.56
amanda.csv,0.33,0.2,0.44,0.23,0.59,0.8,0.02,0.63,0.36,0.5,...,0.56,0.38,0.22,0.76,0.41,0.79,0.2,,0.33,0.71
amygdala.csv,0.43,0.2,0.42,0.09,0.21,0.89,0.01,0.16,,0.25,...,0.39,0.13,0.46,0.56,0.4,0.04,0.01,0.29,0.22,0.66
bloodycore.csv,0.25,0.49,0.26,0.66,0.48,0.01,0.87,0.55,0.31,0.42,...,0.28,0.58,0.46,0.05,0.27,0.79,0.87,0.37,0.46,0.09
bzbyte.csv,0.25,0.2,0.26,0.13,0.48,0.88,0.87,0.55,0.36,0.42,...,0.33,0.43,0.22,0.76,0.4,0.79,0.51,0.37,0.53,
etics.csv,0.43,0.62,0.53,,0.6,0.42,0.1,0.16,0.48,0.25,...,0.4,0.34,0.5,0.28,0.32,0.04,0.54,0.38,0.63,0.62
fleet-simulator.csv,0.25,0.33,0.26,0.5,0.48,0.28,0.87,0.55,0.36,0.42,...,0.28,0.64,0.22,0.74,0.5,0.79,0.65,0.37,0.46,0.52
flylegacy.csv,0.43,0.6,0.58,0.71,0.64,0.55,0.11,0.24,0.65,0.28,...,0.57,0.36,0.35,0.54,0.45,0.39,0.58,0.4,0.35,0.64
fmri-dmt.csv,0.25,0.33,0.31,0.55,0.48,0.13,0.87,0.55,0.34,0.42,...,0.37,0.59,0.25,0.75,0.57,0.79,,0.37,0.44,0.16
freedom-erp.csv,,0.2,0.42,0.09,0.21,0.89,0.01,0.58,0.36,0.24,...,0.39,0.13,0.22,0.76,0.4,0.3,0.01,0.29,0.23,0.66
