In [3]:
import pandas as pd
import numpy as np
import math
import pickle
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

import platform
from os import listdir
from os.path import isfile, join
from glob import glob
from pathlib import Path
import sys
import os
import copy
import traceback
from pathlib import Path


import matplotlib.pyplot as plt

import SMOTE
import feature_selector
import DE
import CFS
import birch
import metrics.abcd

import metrices
import measures

import sys
import traceback
import warnings
warnings.filterwarnings("ignore")

In [4]:
def prepare_data(path):
    df = pd.read_csv(path)
    df = df.drop(labels = ['Host','Vcs','Project','File','PL','IssueTracking'],axis=1)
    df = df.dropna()
    df = df[['TLOC', 'TNF', 'TNC', 'TND', 'LOC', 'CL', 'NStmt', 'NFunc',
       'RCC', 'MNL', 'avg_WMC', 'max_WMC', 'total_WMC', 'avg_DIT', 'max_DIT',
       'total_DIT', 'avg_RFC', 'max_RFC', 'total_RFC', 'avg_NOC', 'max_NOC',
       'total_NOC', 'avg_CBO', 'max_CBO', 'total_CBO', 'avg_DIT.1',
       'max_DIT.1', 'total_DIT.1', 'avg_NIV', 'max_NIV', 'total_NIV',
       'avg_NIM', 'max_NIM', 'total_NIM', 'avg_NOM', 'max_NOM', 'total_NOM',
       'avg_NPBM', 'max_NPBM', 'total_NPBM', 'avg_NPM', 'max_NPM', 'total_NPM',
       'avg_NPRM', 'max_NPRM', 'total_NPRM', 'avg_CC', 'max_CC', 'total_CC',
       'avg_FANIN', 'max_FANIN', 'total_FANIN', 'avg_FANOUT', 'max_FANOUT',
       'total_FANOUT', 'NRev', 'NFix', 'avg_AddedLOC', 'max_AddedLOC',
       'total_AddedLOC', 'avg_DeletedLOC', 'max_DeletedLOC',
       'total_DeletedLOC', 'avg_ModifiedLOC', 'max_ModifiedLOC',
       'total_ModifiedLOC','Buggy']]
    return df

def get_features(df):
    fs = feature_selector.featureSelector()
    df,_feature_nums,features = fs.cfs_bfs(df)
    return df,features

def apply_cfs(df):
    y = df.Buggy.values
    X = df.drop(labels = ['Buggy'],axis = 1)
    X = X.values
    selected_cols = CFS.cfs(X,y)
    cols = df.columns[[selected_cols]].tolist()
    cols.append('Buggy')
    return df[cols],cols
    
def apply_smote(df):
    cols = df.columns
    smt = SMOTE.smote(df)
    df = smt.run()
    df.columns = cols
    return df

def load_data(path,target):
    df = pd.read_csv(path)
    if path == 'data/jm1.csv':
        df = df[~df.uniq_Op.str.contains("\?")]
    y = df[target]
    X = df.drop(labels = target, axis = 1)
    X = X.apply(pd.to_numeric)
    return X,y

# Cluster Driver
def cluster_driver(df,print_tree = True):
    X = df.apply(pd.to_numeric)
    cluster = birch.birch(branching_factor=20)
    #X.set_index('Project Name',inplace=True)
    cluster.fit(X)
    cluster_tree,max_depth = cluster.get_cluster_tree()
    #cluster_tree = cluster.model_adder(cluster_tree)
    if print_tree:
        cluster.show_clutser_tree()
    return cluster,cluster_tree,max_depth

In [15]:
def get_predicted(cluster_data_loc,metrices_loc,fold,data_location,bell0_loc):
    train_data = pd.read_pickle(cluster_data_loc + '/train_data.pkl')
    cluster,cluster_tree,max_depth = cluster_driver(train_data)
    t_df = pd.DataFrame()
    for project in train_data.index.values.tolist():
        _s_path = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/1385/converted/' + project
        s_df = prepare_data(_s_path)
        t_df = pd.concat([t_df,s_df])
    t_df.reset_index(drop=True,inplace=True)
    d = {'buggy': True, 'clean': False}
    t_df['Buggy'] = t_df['Buggy'].map(d)
    t_df, g_s_cols = apply_cfs(t_df)
    t_df = apply_smote(t_df)
    train_y = t_df.Buggy
    train_X = t_df.drop(labels = ['Buggy'],axis = 1)
    clf_global = LogisticRegression()
    clf_global.fit(train_X,train_y)
    test_data = pd.read_pickle('/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/src/data/1385/projects/other_selected_attr.pkl')
    test_data = pd.DataFrame.from_dict(test_data,orient='index')
    test_projects = test_data.index.values.tolist()
    goals = ['recall','precision','pf','pci_20','ifa']
    levels = [2,1,0]
    for _ in range(1):
        results = {}
        bellwether_models = {}
        bellwether0_models = {}
        bellwether0_s_cols = {}
        bellwether_s_cols = {}
        self_model = {}
        self_model_test = {} 
        for level in levels:
            test_data = test_data
            predicted_cluster = cluster.predict(test_data,level)
            #print(level,predicted_cluster)
            for i in range(len(predicted_cluster)):
                try:
                    F = {}
                    _F = {}
                    b_F = {}
                    g_F = {}
                    r_F = {}
                    c_id = predicted_cluster[i]
                    s_project_df = pd.read_csv(cluster_data_loc + '/bellwether_cdom_' + str(level) + '.csv')
                    if level == 1:
                        s_project_df.rename(columns = {'Unnamed: 0':'id'},inplace = True)
                    if level == 0:
                        s_project = s_project_df.bellwether.values[0]
                    else:
                        s_project = s_project_df[s_project_df['id'] == predicted_cluster[i]].bellwether.values[0]
                    if s_project not in bellwether_models.keys():
                        s_path = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/1385/converted/' + s_project
                        df = prepare_data(s_path)
                        df.reset_index(drop=True,inplace=True)
                        d = {'buggy': True, 'clean': False}
                        df['Buggy'] = df['Buggy'].map(d)
                        df, s_cols = apply_cfs(df)
                        bellwether_s_cols[s_project] = s_cols
                        df = apply_smote(df)
                        y = df.Buggy
                        X = df.drop(labels = ['Buggy'],axis = 1)
                        clf_bellwether = LogisticRegression()
                        clf_bellwether.fit(X,y)
                        bellwether_models[s_project] = clf_bellwether
                    else:
                        clf_bellwether = bellwether_models[s_project]
                        s_cols = bellwether_s_cols[s_project]
                        
                    b_s_project_df = pd.read_csv(bell0_loc + '/bellwether_cdom_0.csv')
                    b_s_project = b_s_project_df.bellwether.values[0]
                    if b_s_project not in bellwether0_models.keys():
                        b_s_path = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/1385/converted/' + s_project
                        b_df = prepare_data(b_s_path)
                        b_df.reset_index(drop=True,inplace=True)
                        d = {'buggy': True, 'clean': False}
                        b_df['Buggy'] = b_df['Buggy'].map(d)
                        b_df, b_s_cols = apply_cfs(b_df)
                        bellwether0_s_cols[b_s_project] = b_s_cols
                        b_df = apply_smote(b_df)
                        b_y = b_df.Buggy
                        b_X = b_df.drop(labels = ['Buggy'],axis = 1)
                        b_clf_bellwether = LogisticRegression()
                        b_clf_bellwether.fit(X,y)
                        bellwether0_models[b_s_project] = b_clf_bellwether
                    else:
                        b_clf_bellwether = bellwether0_models[b_s_project]
                        b_s_cols = bellwether0_s_cols[b_s_project]

                    d_project = test_projects[i]
                    kf = StratifiedKFold(n_splits = 3)
                    d_path = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/1385/converted/' + d_project
                    test_df = prepare_data(d_path)
                    test_df.reset_index(drop=True,inplace=True)
                    d = {'buggy': True, 'clean': False}
                    test_df['Buggy'] = test_df['Buggy'].map(d)
                    #test_df, x_s_cols = apply_cfs(test_df)
                    test_y = test_df.Buggy
                    test_X = test_df.drop(labels = ['Buggy'],axis = 1)
#                     y_test = _test_df_1.Buggy
#                     X_test = _test_df_1.drop(labels = ['Buggy'],axis = 1)
                    predicted_bellwether = clf_bellwether.predict(X_test)
                    abcd = metrices.measures(y_test,predicted_bellwether,_df_test_loc)
                    if 'f1' not in F.keys():
                        F['f1'] = []
                        F['precision'] = []
                        F['recall'] = []
                        F['g-score'] = []
                        F['d2h'] = []
                        F['pci_20'] = []
                        F['ifa'] = []
                        F['pd'] = []
                        F['pf'] = []
                    F['f1'].append(abcd.calculate_f1_score())
                    F['precision'].append(abcd.calculate_precision())
                    F['recall'].append(abcd.calculate_recall())
                    F['g-score'].append(abcd.get_g_score())
                    F['d2h'].append(abcd.calculate_d2h())
                    F['pci_20'].append(abcd.get_pci_20())
                    try:
                        F['ifa'].append(abcd.get_ifa_roc())
                    except:
                        F['ifa'].append(0)
                    F['pd'].append(abcd.get_pd())
                    F['pf'].append(abcd.get_pf())

                    for goal in goals:
                        if goal == 'g':
                            _goal = 'g-score'
                        else:
                            _goal = goal
                        if _goal not in results.keys():
                            results[_goal] = {}
                        if d_project not in results[_goal].keys():
                            results[_goal][d_project] = []
                        results[_goal][d_project].append(np.median(F[_goal]))
                except Exception as e:
                    print(d_project,e)
                    continue
        for key in results:
            df = pd.DataFrame.from_dict(results[key],orient='index')
            if not Path(data_location).is_dir():
                os.makedirs(Path(data_location))
            df.to_csv(data_location + '/bellwether_' + key + '.csv')
    return results

In [16]:
for i in range(1):
    fold = str(i)
    data_location = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/src/data/1385/other_projects_new/fold_' + fold
    cluster_data_loc = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/src/data/1385/new_bellwether_pre_re_pf/2/fold_' + fold
    metrices_loc = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/1385/converted'
    bell0_loc = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/src/data/1385/new_bellwether_pre_re_pf_v2/0/fold_' + str(fold)
    results = get_predicted(cluster_data_loc,metrices_loc,fold,data_location,bell0_loc)


[cluster_id=0] N_children: 7 N_samples: 627
> [cluster_id=1] N_children: 0 N_samples: 1
> [cluster_id=2] N_children: 0 N_samples: 1
> [cluster_id=3] N_children: 5 N_samples: 55
> > [cluster_id=4] N_children: 0 N_samples: 9
> > [cluster_id=5] N_children: 0 N_samples: 9
> > [cluster_id=6] N_children: 0 N_samples: 7
> > [cluster_id=7] N_children: 0 N_samples: 17
> > [cluster_id=8] N_children: 0 N_samples: 13
> [cluster_id=9] N_children: 10 N_samples: 127
> > [cluster_id=10] N_children: 0 N_samples: 3
> > [cluster_id=11] N_children: 0 N_samples: 9
> > [cluster_id=12] N_children: 0 N_samples: 19
> > [cluster_id=13] N_children: 0 N_samples: 15
> > [cluster_id=14] N_children: 0 N_samples: 4
> > [cluster_id=15] N_children: 0 N_samples: 17
> > [cluster_id=16] N_children: 0 N_samples: 15
> > [cluster_id=17] N_children: 0 N_samples: 19
> > [cluster_id=18] N_children: 0 N_samples: 11
> > [cluster_id=19] N_children: 0 N_samples: 15
> [cluster_id=20] N_children: 16 N_samples: 183
> > [cluster_id=21]

pernband.csv name 'X_test' is not defined
processing.csv name 'X_test' is not defined
ominos.csv name 'X_test' is not defined
audela.csv name 'X_test' is not defined
j-wings.csv name 'X_test' is not defined
itk-snap.csv name 'X_test' is not defined
spinn3r-client.csv name 'X_test' is not defined
novatk.csv name 'X_test' is not defined
openwebbuilder.csv name 'X_test' is not defined
protomol.csv name 'X_test' is not defined
cupsfilter.csv name 'X_test' is not defined
x10.csv name 'X_test' is not defined
pokersource.csv name 'X_test' is not defined
mlxos.csv name 'X_test' is not defined
rainmeter.csv name 'X_test' is not defined
wxextended.csv name 'X_test' is not defined
amino.csv name 'X_test' is not defined
mev-tm4.csv name 'X_test' is not defined
rkward.csv name 'X_test' is not defined
subsonic.csv name 'X_test' is not defined
libfunutil.csv name 'X_test' is not defined
jvoicexml.csv name 'X_test' is not defined
jmedialibrary.csv name 'X_test' is not defined
minibits.csv name 'X_test

ugaagga.csv name 'X_test' is not defined
dyuproject.csv name 'X_test' is not defined
karaoke-dx.csv name 'X_test' is not defined
anyconfig.csv name 'X_test' is not defined
route-me.csv name 'X_test' is not defined
guifications.csv name 'X_test' is not defined
chopshop-166.csv name 'X_test' is not defined
minime.csv name 'X_test' is not defined
simgear.csv name 'X_test' is not defined
pmd.csv name 'X_test' is not defined
adaptit.csv name 'X_test' is not defined
gdipp.csv name 'X_test' is not defined
jamonapi.csv name 'X_test' is not defined
toop.csv name 'X_test' is not defined
logicalloy.csv name 'X_test' is not defined
openvrml.csv name 'X_test' is not defined
xulplayer.csv name 'X_test' is not defined
systats.csv name 'X_test' is not defined
ganttproject.csv name 'X_test' is not defined
ooccollider.csv name 'X_test' is not defined
opencollada.csv name 'X_test' is not defined
moagg.csv name 'X_test' is not defined
jikesrvm.csv name 'X_test' is not defined
modelling4all.csv name 'X_tes

adr.csv name 'X_test' is not defined
hits.csv name 'X_test' is not defined
mlton.csv name 'X_test' is not defined
yucata.csv name 'X_test' is not defined
metastudio.csv name 'X_test' is not defined
runningbuddy.csv name 'X_test' is not defined
scalaris.csv name 'X_test' is not defined
snes9x-gx.csv name 'X_test' is not defined
oge.csv name 'X_test' is not defined
coldemoplayer.csv name 'X_test' is not defined
actiongame.csv name 'X_test' is not defined
renaissancecore.csv name 'X_test' is not defined
ryuon.csv name 'X_test' is not defined
mojonation.csv name 'X_test' is not defined
python-on-a-chip.csv name 'X_test' is not defined
mnemisis.csv name 'X_test' is not defined
glunatic.csv name 'X_test' is not defined
google-enterprise-connector-manager.csv name 'X_test' is not defined
qore.csv name 'X_test' is not defined
xot.csv name 'X_test' is not defined
wc-mdx-editor.csv name 'X_test' is not defined
virtualphotoorg.csv name 'X_test' is not defined
gwtchismes.csv name 'X_test' is not d

cayambe.csv name 'X_test' is not defined
voot.csv name 'X_test' is not defined
freedots.csv name 'X_test' is not defined
ome.csv name 'X_test' is not defined
eli-project.csv name 'X_test' is not defined
roguelike.csv name 'X_test' is not defined
pernband.csv name 'X_test' is not defined
processing.csv name 'X_test' is not defined
ominos.csv name 'X_test' is not defined
audela.csv name 'X_test' is not defined
j-wings.csv name 'X_test' is not defined
itk-snap.csv name 'X_test' is not defined
spinn3r-client.csv name 'X_test' is not defined
novatk.csv name 'X_test' is not defined
openwebbuilder.csv name 'X_test' is not defined
protomol.csv name 'X_test' is not defined
cupsfilter.csv name 'X_test' is not defined
x10.csv name 'X_test' is not defined
pokersource.csv name 'X_test' is not defined
mlxos.csv name 'X_test' is not defined
rainmeter.csv name 'X_test' is not defined
wxextended.csv name 'X_test' is not defined
amino.csv name 'X_test' is not defined
mev-tm4.csv name 'X_test' is not def

languagetool.csv name 'X_test' is not defined
adeopensite.csv name 'X_test' is not defined
qashweb.csv name 'X_test' is not defined
ugaagga.csv name 'X_test' is not defined
dyuproject.csv name 'X_test' is not defined
karaoke-dx.csv name 'X_test' is not defined
anyconfig.csv name 'X_test' is not defined
route-me.csv name 'X_test' is not defined
guifications.csv name 'X_test' is not defined
chopshop-166.csv name 'X_test' is not defined
minime.csv name 'X_test' is not defined
simgear.csv name 'X_test' is not defined
pmd.csv name 'X_test' is not defined
adaptit.csv name 'X_test' is not defined
gdipp.csv name 'X_test' is not defined
jamonapi.csv name 'X_test' is not defined
toop.csv name 'X_test' is not defined
logicalloy.csv name 'X_test' is not defined
openvrml.csv name 'X_test' is not defined
xulplayer.csv name 'X_test' is not defined
systats.csv name 'X_test' is not defined
ganttproject.csv name 'X_test' is not defined
ooccollider.csv name 'X_test' is not defined
opencollada.csv name 'X

jamocha.csv name 'X_test' is not defined
archive-access.csv name 'X_test' is not defined
freecloth.csv name 'X_test' is not defined
ahuman.csv name 'X_test' is not defined
ejbtool.csv name 'X_test' is not defined
inikah2.csv name 'X_test' is not defined
esup-helpdesk.csv name 'X_test' is not defined
adr.csv name 'X_test' is not defined
hits.csv name 'X_test' is not defined
mlton.csv name 'X_test' is not defined
yucata.csv name 'X_test' is not defined
metastudio.csv name 'X_test' is not defined
runningbuddy.csv name 'X_test' is not defined
scalaris.csv name 'X_test' is not defined
snes9x-gx.csv name 'X_test' is not defined
oge.csv name 'X_test' is not defined
coldemoplayer.csv name 'X_test' is not defined
actiongame.csv name 'X_test' is not defined
renaissancecore.csv name 'X_test' is not defined
ryuon.csv name 'X_test' is not defined
mojonation.csv name 'X_test' is not defined
python-on-a-chip.csv name 'X_test' is not defined
mnemisis.csv name 'X_test' is not defined
glunatic.csv name 

valgrind.csv name 'X_test' is not defined
bigbluebutton.csv name 'X_test' is not defined
webappframework.csv name 'X_test' is not defined
any23.csv name 'X_test' is not defined
apbs.csv name 'X_test' is not defined
pure-lang.csv name 'X_test' is not defined
tuxkart.csv name 'X_test' is not defined
python.csv name 'X_test' is not defined
cayambe.csv name 'X_test' is not defined
voot.csv name 'X_test' is not defined
freedots.csv name 'X_test' is not defined
ome.csv name 'X_test' is not defined
eli-project.csv name 'X_test' is not defined
roguelike.csv name 'X_test' is not defined
pernband.csv name 'X_test' is not defined
processing.csv name 'X_test' is not defined
ominos.csv name 'X_test' is not defined
audela.csv name 'X_test' is not defined
j-wings.csv name 'X_test' is not defined
itk-snap.csv name 'X_test' is not defined
spinn3r-client.csv name 'X_test' is not defined
novatk.csv name 'X_test' is not defined
openwebbuilder.csv name 'X_test' is not defined
protomol.csv name 'X_test' is 

opentnl.csv name 'X_test' is not defined
kitten.csv name 'X_test' is not defined
jtge.csv name 'X_test' is not defined
hsqldb.csv name 'X_test' is not defined
eclipse-imp.csv name 'X_test' is not defined
surprise.csv name 'X_test' is not defined
pastmon.csv name 'X_test' is not defined
ncut.csv name 'X_test' is not defined
grailrtls.csv name 'X_test' is not defined
languagetool.csv name 'X_test' is not defined
adeopensite.csv name 'X_test' is not defined
qashweb.csv name 'X_test' is not defined
ugaagga.csv name 'X_test' is not defined
dyuproject.csv name 'X_test' is not defined
karaoke-dx.csv name 'X_test' is not defined
anyconfig.csv name 'X_test' is not defined
route-me.csv name 'X_test' is not defined
guifications.csv name 'X_test' is not defined
chopshop-166.csv name 'X_test' is not defined
minime.csv name 'X_test' is not defined
simgear.csv name 'X_test' is not defined
pmd.csv name 'X_test' is not defined
adaptit.csv name 'X_test' is not defined
gdipp.csv name 'X_test' is not defi

loserjabber.csv name 'X_test' is not defined
jassda.csv name 'X_test' is not defined
pymol.csv name 'X_test' is not defined
retribengine.csv name 'X_test' is not defined
makumba.csv name 'X_test' is not defined
lladd.csv name 'X_test' is not defined
simal.csv name 'X_test' is not defined
jamocha.csv name 'X_test' is not defined
archive-access.csv name 'X_test' is not defined
freecloth.csv name 'X_test' is not defined
ahuman.csv name 'X_test' is not defined
ejbtool.csv name 'X_test' is not defined
inikah2.csv name 'X_test' is not defined
esup-helpdesk.csv name 'X_test' is not defined
adr.csv name 'X_test' is not defined
hits.csv name 'X_test' is not defined
mlton.csv name 'X_test' is not defined
yucata.csv name 'X_test' is not defined
metastudio.csv name 'X_test' is not defined
runningbuddy.csv name 'X_test' is not defined
scalaris.csv name 'X_test' is not defined
snes9x-gx.csv name 'X_test' is not defined
oge.csv name 'X_test' is not defined
coldemoplayer.csv name 'X_test' is not defin