In [1]:
import pandas as pd
import numpy as np
import math
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

import platform
from os import listdir
from os.path import isfile, join
from glob import glob
from pathlib import Path
import sys
import os
import copy
import traceback
from pathlib import Path


import matplotlib.pyplot as plt

import SMOTE
import feature_selector
import DE
import CFS
import birch
import metrics.abcd

import metrices
import measures

import sys
import traceback
import warnings
warnings.filterwarnings("ignore")

In [2]:
def prepare_data(path):
    """Load one project's CSV and reduce it to the modelling columns.

    Steps, in this order:
      1. read the CSV at ``path``;
      2. remove the six descriptive columns (Host, Vcs, Project, File, PL,
         IssueTracking);
      3. discard every row that still has a missing value in any remaining
         column (this happens before step 4, so a gap in a column that is not
         kept below still removes the row);
      4. keep the fixed list of metric columns followed by ``Buggy``.

    Returns the resulting DataFrame; the original row index is preserved.
    """
    descriptive_columns = ['Host', 'Vcs', 'Project', 'File', 'PL', 'IssueTracking']
    kept_columns = [
        'TLOC', 'TNF', 'TNC', 'TND', 'LOC', 'CL', 'NStmt', 'NFunc', 'RCC', 'MNL',
        'avg_WMC', 'max_WMC', 'total_WMC',
        'avg_DIT', 'max_DIT', 'total_DIT',
        'avg_RFC', 'max_RFC', 'total_RFC',
        'avg_NOC', 'max_NOC', 'total_NOC',
        'avg_CBO', 'max_CBO', 'total_CBO',
        'avg_DIT.1', 'max_DIT.1', 'total_DIT.1',
        'avg_NIV', 'max_NIV', 'total_NIV',
        'avg_NIM', 'max_NIM', 'total_NIM',
        'avg_NOM', 'max_NOM', 'total_NOM',
        'avg_NPBM', 'max_NPBM', 'total_NPBM',
        'avg_NPM', 'max_NPM', 'total_NPM',
        'avg_NPRM', 'max_NPRM', 'total_NPRM',
        'avg_CC', 'max_CC', 'total_CC',
        'avg_FANIN', 'max_FANIN', 'total_FANIN',
        'avg_FANOUT', 'max_FANOUT', 'total_FANOUT',
        'NRev', 'NFix',
        'avg_AddedLOC', 'max_AddedLOC', 'total_AddedLOC',
        'avg_DeletedLOC', 'max_DeletedLOC', 'total_DeletedLOC',
        'avg_ModifiedLOC', 'max_ModifiedLOC', 'total_ModifiedLOC',
        'Buggy',
    ]
    raw = pd.read_csv(path)
    complete_rows = raw.drop(columns=descriptive_columns).dropna()
    return complete_rows[kept_columns]

def get_features(df):
    """Delegate feature selection to ``feature_selector.featureSelector.cfs_bfs``.

    ``cfs_bfs`` returns three values; the first (the reduced frame) and the
    third (the selected features) are passed back to the caller, the middle
    one is ignored.
    """
    selector = feature_selector.featureSelector()
    reduced_frame, _ignored, chosen_features = selector.cfs_bfs(df)
    return reduced_frame, chosen_features

def apply_cfs(df):
    """Keep only the feature columns selected by ``CFS.cfs`` plus ``Buggy``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Buggy`` column; every other column is treated as a
        candidate feature.

    Returns
    -------
    (pandas.DataFrame, list)
        ``df`` restricted to the selected feature columns followed by
        ``Buggy``, and that same list of column names.

    Notes
    -----
    ``CFS.cfs`` is handed the feature matrix *without* ``Buggy``, so its result
    is resolved against the feature columns (not ``df.columns``); this keeps
    the names correct even when ``Buggy`` is not the last column.
    Assumes ``CFS.cfs`` returns positional column indices (or a boolean mask)
    -- TODO confirm against the CFS module.
    """
    y = df.Buggy.values
    features = df.drop(columns=['Buggy'])
    selected_cols = CFS.cfs(features.values, y)
    # Flatten to 1-D: the former ``columns[[selected_cols]]`` nested-list form
    # is a multi-dimensional index that current NumPy/pandas reject.
    picked = np.asarray(selected_cols).ravel()
    # An empty selection would be a float array, which cannot index; map it to [].
    cols = features.columns[picked].tolist() if picked.size else []
    cols.append('Buggy')
    return df[cols], cols
    
def apply_smote(df):
    """Run the project's ``SMOTE.smote`` on ``df`` and restore the column labels.

    The frame produced by ``smote(...).run()`` gets the column labels of the
    input frame assigned back onto it before being returned.
    """
    original_columns = df.columns
    resampled = SMOTE.smote(df).run()
    resampled.columns = original_columns
    return resampled

def load_data(path,target):
    """Read a CSV and split it into numeric features and the target column.

    Parameters
    ----------
    path : str
        CSV file to read. When it is exactly ``'data/jm1.csv'``, rows whose
        ``uniq_Op`` value contains a ``?`` are removed first.
    target : str
        Name of the label column.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``X`` -- every column except ``target``, converted with
        ``pd.to_numeric`` (which raises on values it cannot parse);
        ``y`` -- the ``target`` column, unconverted.
    """
    df = pd.read_csv(path)
    if path == 'data/jm1.csv':
        # Literal (non-regex) match: avoids the invalid "\?" escape sequence.
        # The astype(str) keeps this working when the column was already
        # parsed as numeric, i.e. when the file contains no "?" at all.
        has_placeholder = df.uniq_Op.astype(str).str.contains("?", regex=False)
        df = df[~has_placeholder]
    y = df[target]
    X = df.drop(columns=target).apply(pd.to_numeric)
    return X,y

# Cluster Driver
def cluster_driver(df,print_tree = True):
    """Fit the project's ``birch`` clusterer on ``df`` and return it with its tree.

    Every column of ``df`` is converted with ``pd.to_numeric`` before fitting.
    The clusterer is created with ``branching_factor=20``. When ``print_tree``
    is true the fitted tree is also printed via the clusterer's own method.

    Returns ``(cluster, cluster_tree, max_depth)``, the last two being what
    ``get_cluster_tree()`` reports.
    """
    numeric_frame = df.apply(pd.to_numeric)
    model = birch.birch(branching_factor=20)
    model.fit(numeric_frame)
    tree, depth = model.get_cluster_tree()
    if print_tree:
        # Method name is spelled this way in the birch module.
        model.show_clutser_tree()
    return model, tree, depth

In [3]:
def _load_labelled_project(project_path):
    """Read one project CSV via prepare_data and map its Buggy labels to booleans."""
    frame = prepare_data(project_path).reset_index(drop=True)
    frame['Buggy'] = frame['Buggy'].map({'buggy': True, 'clean': False})
    return frame


def _fit_bellwether(project_path):
    """Train a bellwether classifier (apply_cfs, apply_smote, LogisticRegression).

    Returns the fitted classifier and the column list chosen by apply_cfs
    (feature columns followed by 'Buggy').
    """
    frame, s_cols = apply_cfs(_load_labelled_project(project_path))
    frame = apply_smote(frame)
    clf = LogisticRegression()
    clf.fit(frame.drop(labels=['Buggy'], axis=1), frame.Buggy)
    return clf, s_cols


def _fit_self_model(project_path):
    """Train a within-project classifier on 90% of the project's rows.

    Returns the fitted classifier and the held-out 10% as one frame
    (feature columns followed by 'Buggy').
    """
    frame = _load_labelled_project(project_path)
    X_train, X_test, y_train, y_test = train_test_split(
        frame.drop(labels=['Buggy'], axis=1), frame.Buggy,
        test_size=0.1, random_state=42)
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    return clf, pd.concat([X_test, y_test], axis=1)


def _score(y_true, y_pred, loc):
    """Collect every score exposed by metrices.measures into a dict keyed by metric name."""
    abcd = metrices.measures(y_true, y_pred, loc)
    return {
        'f1': abcd.calculate_f1_score(),
        'precision': abcd.calculate_precision(),
        'recall': abcd.calculate_recall(),
        'g-score': abcd.get_g_score(),
        'd2h': abcd.calculate_d2h(),
        'pci_20': abcd.get_pci_20(),
        'ifa': abcd.get_ifa(),
        'pd': abcd.get_pd(),
        'pf': abcd.get_pf(),
    }


def get_predicted(cluster_data_loc,metrices_loc,fold,data_location):
    """Compare bellwether models against within-project models for one fold.

    For each hierarchy level (2, 1, 0) every test project is assigned to a
    cluster, the bellwether project recorded for that cluster is looked up in
    ``bellwether_cdom_<level>.csv``, and both the bellwether classifier and the
    project's own classifier are scored on the project's held-out rows.

    Parameters
    ----------
    cluster_data_loc : str
        Directory holding ``train_data.pkl``, ``test_data.pkl`` and the
        ``bellwether_cdom_<level>.csv`` files.
    metrices_loc : str
        Directory holding the per-project metric CSVs. Previously this
        argument was ignored in favour of a hard-coded absolute path.
    fold : str
        Unused; kept so existing callers keep working.
    data_location : str
        Output directory; one ``bellwether_<goal>.csv`` is written per goal.

    Returns
    -------
    dict
        ``results[goal][project]`` is a flat list holding, for each level that
        succeeded, the bellwether score followed by the self-model score.

    Notes
    -----
    A project that fails at some level is reported on stdout and skipped for
    that level only. ``pd.read_pickle`` can execute arbitrary code while
    loading -- only point this at pickle files you trust.
    """
    train_data = pd.read_pickle(cluster_data_loc + '/train_data.pkl')
    cluster, _cluster_tree, _max_depth = cluster_driver(train_data)

    test_data = pd.read_pickle(cluster_data_loc + '/test_data.pkl')
    test_projects = test_data.index.values.tolist()
    goals = ['recall','precision','pf','pci_20','ifa']
    levels = [2,1,0]

    results = {}
    bellwether_tables = {}   # level -> bellwether lookup table
    bellwether_models = {}   # bellwether project -> (classifier, selected columns)
    self_models = {}         # test project -> (classifier, held-out frame)

    for level in levels:
        predicted_cluster = cluster.predict(test_data, level)
        # The project name is bound before the try so the except branch always
        # reports the project that actually failed (never a stale or unbound one).
        for d_project, c_id in zip(test_projects, predicted_cluster):
            try:
                # Read each level's table once, lazily, so a read failure is
                # still reported per project like any other error.
                if level not in bellwether_tables:
                    table = pd.read_csv(cluster_data_loc + '/bellwether_cdom_' + str(level) + '.csv')
                    if level == 1:
                        table = table.rename(columns={'Unnamed: 0': 'id'})
                    bellwether_tables[level] = table
                table = bellwether_tables[level]
                if level == 0:
                    s_project = table.bellwether.values[0]
                else:
                    s_project = table[table['id'] == c_id].bellwether.values[0]

                if s_project not in bellwether_models:
                    bellwether_models[s_project] = _fit_bellwether(
                        os.path.join(metrices_loc, s_project))
                clf_bellwether, s_cols = bellwether_models[s_project]

                if d_project not in self_models:
                    self_models[d_project] = _fit_self_model(
                        os.path.join(metrices_loc, d_project))
                # The held-out frame is cached with the model, so its columns
                # always belong to this project.
                clf_self, held_out = self_models[d_project]

                loc = held_out.LOC
                y_held_out = held_out.Buggy

                bellwether_X = held_out[s_cols].drop(labels=['Buggy'], axis=1)
                F = _score(y_held_out, clf_bellwether.predict(bellwether_X), loc)

                self_X = held_out.drop(labels=['Buggy'], axis=1)
                _F = _score(y_held_out, clf_self.predict(self_X), loc)

                # Recorded only after both scorings succeeded, so the lists stay paired.
                for goal in goals:
                    per_project = results.setdefault(goal, {})
                    per_project.setdefault(d_project, []).extend([F[goal], _F[goal]])
            except Exception as e:
                print(d_project,e)
                continue

    if results:
        os.makedirs(data_location, exist_ok=True)
    for key in results:
        df = pd.DataFrame.from_dict(results[key],orient='index')
        df.to_csv(data_location + '/bellwether_' + key + '.csv')
    return results

In [19]:
# Root of the local bellwether_comminity checkout; edit this single line to run
# the experiment from another location.
PROJECT_ROOT = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity'
N_FOLDS = 20

# Directory with the per-project metric CSVs; the same for every fold, so it is
# built once instead of on each iteration.
metrices_loc = PROJECT_ROOT + '/data/1385/converted'

# NOTE: `results` is overwritten on every iteration, so after the loop it holds
# only the last fold; the per-fold CSVs are written by get_predicted itself.
for i in range(N_FOLDS):
    fold = str(i)
    data_location = PROJECT_ROOT + '/src/data/1385/exp_cdom/fold_' + fold
    cluster_data_loc = PROJECT_ROOT + '/src/data/1385/exp2/2/fold_' + fold
    results = get_predicted(cluster_data_loc,metrices_loc,fold,data_location)

[cluster_id=0] N_children: 7 N_samples: 627
> [cluster_id=1] N_children: 0 N_samples: 1
> [cluster_id=2] N_children: 0 N_samples: 1
> [cluster_id=3] N_children: 5 N_samples: 55
> > [cluster_id=4] N_children: 0 N_samples: 9
> > [cluster_id=5] N_children: 0 N_samples: 9
> > [cluster_id=6] N_children: 0 N_samples: 7
> > [cluster_id=7] N_children: 0 N_samples: 17
> > [cluster_id=8] N_children: 0 N_samples: 13
> [cluster_id=9] N_children: 10 N_samples: 127
> > [cluster_id=10] N_children: 0 N_samples: 3
> > [cluster_id=11] N_children: 0 N_samples: 9
> > [cluster_id=12] N_children: 0 N_samples: 19
> > [cluster_id=13] N_children: 0 N_samples: 15
> > [cluster_id=14] N_children: 0 N_samples: 4
> > [cluster_id=15] N_children: 0 N_samples: 17
> > [cluster_id=16] N_children: 0 N_samples: 15
> > [cluster_id=17] N_children: 0 N_samples: 19
> > [cluster_id=18] N_children: 0 N_samples: 11
> > [cluster_id=19] N_children: 0 N_samples: 15
> [cluster_id=20] N_children: 16 N_samples: 183
> > [cluster_id=21]

theresa.csv index 1 is out of bounds for axis 0 with size 1
knowledge.csv index 1 is out of bounds for axis 0 with size 1
taokgame.csv index 1 is out of bounds for axis 0 with size 1
redshell.csv index 1 is out of bounds for axis 0 with size 1
ontomorphtab.csv index 1 is out of bounds for axis 0 with size 1
theresa.csv index 1 is out of bounds for axis 0 with size 1
knowledge.csv index 1 is out of bounds for axis 0 with size 1
taokgame.csv index 1 is out of bounds for axis 0 with size 1
redshell.csv index 1 is out of bounds for axis 0 with size 1
theresa.csv index 1 is out of bounds for axis 0 with size 1
knowledge.csv index 1 is out of bounds for axis 0 with size 1
taokgame.csv index 1 is out of bounds for axis 0 with size 1
redshell.csv index 1 is out of bounds for axis 0 with size 1
[cluster_id=0] N_children: 8 N_samples: 627
> [cluster_id=1] N_children: 7 N_samples: 79
> > [cluster_id=2] N_children: 0 N_samples: 2
> > [cluster_id=3] N_children: 0 N_samples: 10
> > [cluster_id=4] N_

toped.csv index 1 is out of bounds for axis 0 with size 1
emulemorph.csv index 1 is out of bounds for axis 0 with size 1
plato.csv index 1 is out of bounds for axis 0 with size 1
qtwin.csv index 1 is out of bounds for axis 0 with size 1
[cluster_id=0] N_children: 9 N_samples: 627
> [cluster_id=1] N_children: 8 N_samples: 85
> > [cluster_id=2] N_children: 0 N_samples: 2
> > [cluster_id=3] N_children: 0 N_samples: 7
> > [cluster_id=4] N_children: 0 N_samples: 19
> > [cluster_id=5] N_children: 0 N_samples: 6
> > [cluster_id=6] N_children: 0 N_samples: 19
> > [cluster_id=7] N_children: 0 N_samples: 15
> > [cluster_id=8] N_children: 0 N_samples: 12
> > [cluster_id=9] N_children: 0 N_samples: 5
> [cluster_id=10] N_children: 2 N_samples: 4
> > [cluster_id=11] N_children: 0 N_samples: 1
> > [cluster_id=12] N_children: 0 N_samples: 3
> [cluster_id=13] N_children: 10 N_samples: 88
> > [cluster_id=14] N_children: 0 N_samples: 3
> > [cluster_id=15] N_children: 0 N_samples: 13
> > [cluster_id=16] N

xreal.csv index 1 is out of bounds for axis 0 with size 1
xpontus.csv index 1 is out of bounds for axis 0 with size 1
stemkit.csv index 1 is out of bounds for axis 0 with size 1
elynx.csv index 1 is out of bounds for axis 0 with size 1
alpine.csv index 1 is out of bounds for axis 0 with size 1
micomt.csv index 1 is out of bounds for axis 0 with size 1
avisynth2.csv index 1 is out of bounds for axis 0 with size 1
xreal.csv index 1 is out of bounds for axis 0 with size 1
xpontus.csv index 1 is out of bounds for axis 0 with size 1
stemkit.csv index 1 is out of bounds for axis 0 with size 1
elynx.csv index 1 is out of bounds for axis 0 with size 1
alpine.csv index 1 is out of bounds for axis 0 with size 1
micomt.csv index 1 is out of bounds for axis 0 with size 1
avisynth2.csv index 1 is out of bounds for axis 0 with size 1
xreal.csv index 1 is out of bounds for axis 0 with size 1
xpontus.csv index 1 is out of bounds for axis 0 with size 1
[cluster_id=0] N_children: 9 N_samples: 628
> [clu

jchassis.csv index 1 is out of bounds for axis 0 with size 1
fmri-dmt.csv index 1 is out of bounds for axis 0 with size 1
mecat.csv index 1 is out of bounds for axis 0 with size 1
jcl.csv index 1 is out of bounds for axis 0 with size 1
jas.csv index 1 is out of bounds for axis 0 with size 1
xqilla.csv index 1 is out of bounds for axis 0 with size 1
wxjs.csv index 1 is out of bounds for axis 0 with size 1
glelite.csv index 1 is out of bounds for axis 0 with size 1
jchassis.csv index 1 is out of bounds for axis 0 with size 1
fmri-dmt.csv index 1 is out of bounds for axis 0 with size 1
mecat.csv index 1 is out of bounds for axis 0 with size 1
jcl.csv index 1 is out of bounds for axis 0 with size 1
jas.csv index 1 is out of bounds for axis 0 with size 1
xqilla.csv index 1 is out of bounds for axis 0 with size 1
wxjs.csv index 1 is out of bounds for axis 0 with size 1
glelite.csv index 1 is out of bounds for axis 0 with size 1
jchassis.csv index 1 is out of bounds for axis 0 with size 1
fmr

campsoft.csv This solver needs samples of at least 2 classes in the data, but the data contains only one class: False
gtad.csv index 1 is out of bounds for axis 0 with size 1
elmo.csv index 1 is out of bounds for axis 0 with size 1
plog4u.csv This solver needs samples of at least 2 classes in the data, but the data contains only one class: False
wpdev.csv index 1 is out of bounds for axis 0 with size 1
campsoft.csv This solver needs samples of at least 2 classes in the data, but the data contains only one class: False
gtad.csv index 1 is out of bounds for axis 0 with size 1
plog4u.csv This solver needs samples of at least 2 classes in the data, but the data contains only one class: False
wpdev.csv index 1 is out of bounds for axis 0 with size 1
campsoft.csv This solver needs samples of at least 2 classes in the data, but the data contains only one class: False
gtad.csv index 1 is out of bounds for axis 0 with size 1
plog4u.csv This solver needs samples of at least 2 classes in the data

theresa.csv index 1 is out of bounds for axis 0 with size 1
knowledge.csv index 1 is out of bounds for axis 0 with size 1
taokgame.csv index 1 is out of bounds for axis 0 with size 1
redshell.csv index 1 is out of bounds for axis 0 with size 1
theresa.csv index 1 is out of bounds for axis 0 with size 1
knowledge.csv index 1 is out of bounds for axis 0 with size 1
taokgame.csv index 1 is out of bounds for axis 0 with size 1
redshell.csv index 1 is out of bounds for axis 0 with size 1
theresa.csv index 1 is out of bounds for axis 0 with size 1
knowledge.csv index 1 is out of bounds for axis 0 with size 1
taokgame.csv index 1 is out of bounds for axis 0 with size 1
redshell.csv index 1 is out of bounds for axis 0 with size 1
[cluster_id=0] N_children: 8 N_samples: 627
> [cluster_id=1] N_children: 7 N_samples: 79
> > [cluster_id=2] N_children: 0 N_samples: 2
> > [cluster_id=3] N_children: 0 N_samples: 10
> > [cluster_id=4] N_children: 0 N_samples: 13
> > [cluster_id=5] N_children: 0 N_samp

qtwin.csv index 1 is out of bounds for axis 0 with size 1
[cluster_id=0] N_children: 9 N_samples: 627
> [cluster_id=1] N_children: 8 N_samples: 85
> > [cluster_id=2] N_children: 0 N_samples: 2
> > [cluster_id=3] N_children: 0 N_samples: 7
> > [cluster_id=4] N_children: 0 N_samples: 19
> > [cluster_id=5] N_children: 0 N_samples: 6
> > [cluster_id=6] N_children: 0 N_samples: 19
> > [cluster_id=7] N_children: 0 N_samples: 15
> > [cluster_id=8] N_children: 0 N_samples: 12
> > [cluster_id=9] N_children: 0 N_samples: 5
> [cluster_id=10] N_children: 2 N_samples: 4
> > [cluster_id=11] N_children: 0 N_samples: 1
> > [cluster_id=12] N_children: 0 N_samples: 3
> [cluster_id=13] N_children: 10 N_samples: 88
> > [cluster_id=14] N_children: 0 N_samples: 3
> > [cluster_id=15] N_children: 0 N_samples: 13
> > [cluster_id=16] N_children: 0 N_samples: 11
> > [cluster_id=17] N_children: 0 N_samples: 9
> > [cluster_id=18] N_children: 0 N_samples: 7
> > [cluster_id=19] N_children: 0 N_samples: 3
> > [cluste

elynx.csv index 1 is out of bounds for axis 0 with size 1
alpine.csv index 1 is out of bounds for axis 0 with size 1
micomt.csv index 1 is out of bounds for axis 0 with size 1
avisynth2.csv index 1 is out of bounds for axis 0 with size 1
xreal.csv index 1 is out of bounds for axis 0 with size 1
xpontus.csv index 1 is out of bounds for axis 0 with size 1
stemkit.csv index 1 is out of bounds for axis 0 with size 1
elynx.csv index 1 is out of bounds for axis 0 with size 1
alpine.csv index 1 is out of bounds for axis 0 with size 1
micomt.csv index 1 is out of bounds for axis 0 with size 1
avisynth2.csv index 1 is out of bounds for axis 0 with size 1
xreal.csv index 1 is out of bounds for axis 0 with size 1
xpontus.csv index 1 is out of bounds for axis 0 with size 1
[cluster_id=0] N_children: 9 N_samples: 628
> [cluster_id=1] N_children: 7 N_samples: 75
> > [cluster_id=2] N_children: 0 N_samples: 2
> > [cluster_id=3] N_children: 0 N_samples: 6
> > [cluster_id=4] N_children: 0 N_samples: 20


jchassis.csv index 1 is out of bounds for axis 0 with size 1
fmri-dmt.csv index 1 is out of bounds for axis 0 with size 1
mecat.csv index 1 is out of bounds for axis 0 with size 1
jcl.csv index 1 is out of bounds for axis 0 with size 1
jas.csv index 1 is out of bounds for axis 0 with size 1
xqilla.csv index 1 is out of bounds for axis 0 with size 1
wxjs.csv index 1 is out of bounds for axis 0 with size 1
glelite.csv index 1 is out of bounds for axis 0 with size 1
jchassis.csv index 1 is out of bounds for axis 0 with size 1
fmri-dmt.csv index 1 is out of bounds for axis 0 with size 1
mecat.csv index 1 is out of bounds for axis 0 with size 1
jcl.csv index 1 is out of bounds for axis 0 with size 1
jas.csv index 1 is out of bounds for axis 0 with size 1
xqilla.csv index 1 is out of bounds for axis 0 with size 1
wxjs.csv index 1 is out of bounds for axis 0 with size 1
ambulant.csv index 1 is out of bounds for axis 0 with size 1
tradelink.csv index 1 is out of bounds for axis 0 with size 1
g

In [28]:
# Print one table per top-level key of `results`, with one row per inner key.
for key, per_key_scores in results.items():
    df = pd.DataFrame.from_dict(per_key_scores, orient='index')
    print(df)

                           0     1     2     3     4     5
guineu.csv              0.29  0.83  0.80  0.83  0.80  0.83
rhex.csv                0.00  0.00  0.00  0.00  0.18  0.00
owlib.csv               0.54  0.61  0.69  0.61  0.55  0.61
ktc.csv                 0.69  0.88  0.69  0.88  0.59  0.88
jedit.csv               0.60  0.40  0.53  0.40  0.70  0.40
net-snmp.csv            0.00  0.75  0.62  0.75  0.57  0.75
encog-java.csv          0.20  0.00  0.21  0.00  0.16  0.00
umber.csv               0.67  1.00  0.40  1.00  0.44  1.00
jahshakafx.csv          0.61  0.69  0.53  0.69  0.45  0.69
jahshaka.csv            0.50  0.00  0.33  0.00  0.29  0.00
selenium.csv            0.29  0.44  0.29  0.44  0.59  0.44
paintown.csv            0.36  0.62  0.50  0.62  0.57  0.62
h2database.csv          0.32  0.00  0.36  0.00  0.37  0.00
x4x.csv                 0.89  0.89  0.80  0.89  0.80  0.89
elmo.csv                0.00  0.00  0.00  0.00  0.00  0.00
jmoney.csv              0.57  0.40  0.86  0.40  0.60  0.

In [60]:
# NOTE(review): removes the entry stored under 'werx.csv' from `results` (mutating
# it in place) and keeps it as `results1`; raises KeyError when that key is absent.
# get_predicted above returns a dict keyed by metric name, not by project file, so
# this presumably ran against a different `results` object -- confirm before re-running.
results1 = results.pop('werx.csv')

Unnamed: 0,0,1,2,3,4,5
guineu.csv,[0.71],[0.44],[0.8],[0.5],[0.8],[0.4]
gtad.csv,[0.0],[0.0],[0.0],[0.0],[0.0],[0.0]
rhex.csv,[0.22],[0.4],[0.2],[0.4],[0.0],[0.4]
owlib.csv,[0.58],[0.64],[0.56],[0.59],[0.55],[0.67]
ktc.csv,[0.71],[0.78],[0.65],[0.83],[0.63],[0.83]
jedit.csv,[0.7],[0.62],[0.74],[0.67],[0.59],[0.67]
net-snmp.csv,[0.57],[0.75],[0.0],[0.5],[0.6],[0.73]
encog-java.csv,[0.16],[0.0],[0.17],[0.17],[0.15],[0.15]
umber.csv,[0.5],[0.8],[0.44],[0.8],[0.4],[0.8]
jahshakafx.csv,[0.61],[0.71],[0.62],[0.64],[0.45],[0.69]


In [8]:
# Print the fractions 0.05, 0.1, ..., 1.0 (5-point steps from 5% up to 100%).
PERCENT_STEP = 5
for perc in range(PERCENT_STEP, 100 + PERCENT_STEP, PERCENT_STEP):
    print(perc/100)

0.05
0.1
0.15
0.2
0.25
0.3
0.35
0.4
0.45
0.5
0.55
0.6
0.65
0.7
0.75
0.8
0.85
0.9
0.95
1.0
