In [1]:
import pandas as pd
import numpy as np
import math
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

import platform
from os import listdir
from os.path import isfile, join
from glob import glob
from pathlib import Path
import sys
import os
import copy
import traceback



import matplotlib.pyplot as plt

import SMOTE
import feature_selector
import DE
import CFS
import metrics.abcd

import metrices
import measures

import sys
import traceback
import warnings
warnings.filterwarnings("ignore")

In [2]:
data_source1 = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/1385/converted'
if platform.system() == 'Darwin' or platform.system() == 'Linux':
    _dir = data_source1 + '/'
else:
    _dir = data_source1 + '\\'
projects = [f for f in listdir(_dir) if isfile(join(_dir, f))]

In [3]:
def prepare_data(path):
    df = pd.read_csv(path)
    df = df.drop(labels = ['Host','Vcs','Project','File','PL','IssueTracking'],axis=1)
    df = df.dropna()
    df = df[['TLOC', 'TNF', 'TNC', 'TND', 'LOC', 'CL', 'NStmt', 'NFunc',
       'RCC', 'MNL', 'avg_WMC', 'max_WMC', 'total_WMC', 'avg_DIT', 'max_DIT',
       'total_DIT', 'avg_RFC', 'max_RFC', 'total_RFC', 'avg_NOC', 'max_NOC',
       'total_NOC', 'avg_CBO', 'max_CBO', 'total_CBO', 'avg_DIT.1',
       'max_DIT.1', 'total_DIT.1', 'avg_NIV', 'max_NIV', 'total_NIV',
       'avg_NIM', 'max_NIM', 'total_NIM', 'avg_NOM', 'max_NOM', 'total_NOM',
       'avg_NPBM', 'max_NPBM', 'total_NPBM', 'avg_NPM', 'max_NPM', 'total_NPM',
       'avg_NPRM', 'max_NPRM', 'total_NPRM', 'avg_CC', 'max_CC', 'total_CC',
       'avg_FANIN', 'max_FANIN', 'total_FANIN', 'avg_FANOUT', 'max_FANOUT',
       'total_FANOUT', 'NRev', 'NFix', 'avg_AddedLOC', 'max_AddedLOC',
       'total_AddedLOC', 'avg_DeletedLOC', 'max_DeletedLOC',
       'total_DeletedLOC', 'avg_ModifiedLOC', 'max_ModifiedLOC',
       'total_ModifiedLOC','Buggy']]
    return df

def get_features(df):
    fs = feature_selector.featureSelector()
    df,_feature_nums,features = fs.cfs_bfs(df)
    return df,features

def apply_cfs(df):
    y = df.Buggy.values
    X = df.drop(labels = ['Buggy'],axis = 1)
    X = X.values
    selected_cols = CFS.cfs(X,y)
    cols = df.columns[[selected_cols]].tolist()
    cols.append('Buggy')
    return df[cols],cols
    
def apply_smote(df):
    cols = df.columns
    smt = SMOTE.smote(df)
    df = smt.run()
    df.columns = cols
    return df

def tune_learner(learner, train_X, train_Y, tune_X, tune_Y, goal,loc=None,target_class=None):
    if not target_class:
        target_class = goal
    clf = learner(train_X, train_Y, tune_X, tune_Y, goal,loc)
    tuner = DE.DE_Tune_ML(clf, clf.get_param(), goal, target_class)
    return tuner.Tune()

In [5]:
final_score = {}
count = 0
for project in projects:
    try:
        path = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/1385/converted/' + project
        print(project)
        df = prepare_data(path)
        if df.shape[0] < 50:
            continue
        else:
            count+=1
        df.reset_index(drop=True,inplace=True)
        d = {'buggy': True, 'clean': False}
        df['Buggy'] = df['Buggy'].map(d)
        buggy = df[df['Buggy'] == True]
        buggy_percentage = buggy.shape[0]/df.shape[0]
        y = df.Buggy
        X = df.drop(labels = ['Buggy'],axis = 1)
        kf = StratifiedKFold(n_splits = 5)
        goal = 'f1'
        F = {}
        score = {}
        for i in range(5):
            for train_index, tune_index in kf.split(X, y):
                X_train, X_tune = X.iloc[train_index], X.iloc[tune_index]
                y_train, y_tune = y[train_index], y[tune_index]
                _df_tune_loc = X_tune.LOC
                clf = LogisticRegression()
                #clf = SVC()
                clf.fit(X_train,y_train)
                predicted = clf.predict(X_tune)
                abcd = metrices.measures(y_tune,predicted,_df_tune_loc)
                F['f1'] = [abcd.calculate_f1_score()]
                F['precision'] = [abcd.calculate_precision()]
                F['recall'] = [abcd.calculate_recall()]
                F['g-score'] = [abcd.get_g_score()]
                F['d2h'] = [abcd.calculate_d2h()]
                F['pci_20'] = [abcd.get_pci_20()]
                F['ifa'] = [abcd.get_ifa()]
                F['pd'] = [abcd.get_pd()]
                F['pf'] = [abcd.get_pf()]
                _F = copy.deepcopy(F)
                if 'f1' not in score.keys():
                    _F['buggy_prec'] = [buggy_percentage]
                    score = _F
                else:
                    score['f1'].append(F['f1'][0])
                    score['precision'].append(F['precision'][0])
                    score['recall'].append(F['recall'][0])
                    score['g-score'].append(F['g-score'][0])
                    score['d2h'].append(F['d2h'][0])
                    score['pci_20'].append(F['pci_20'][0])
                    score['ifa'].append(F['ifa'][0])
                    score['pd'].append(F['pd'][0])
                    score['pf'].append(F['pf'][0])
                    score['buggy_prec'].append(buggy_percentage)
            final_score[project] = score 
    except Exception as e:
        print(e)
        continue

bzbyte.csv
lcdata.csv
llcon.csv
freedom-erp.csv
gpsee.csv
twostep.csv
tauruss.csv
makumba.csv
openi.csv
pure-lang.csv
roguelike.csv
iaml.csv
wiquery.csv
etics.csv
neoengine.csv
owasp-esapi-java.csv
voot.csv
nsis.csv
vdsf.csv
airhead-research.csv
monetdb.csv
x10.csv
wmii.csv
bungeni-editor.csv
openejb.csv
gogglesmm.csv
workhub2.csv
amygdala.csv
mozzie.csv
ikvm.csv
gild.csv
jicarilla.csv
arcanea-project.csv
vito.csv
middle-man.csv
pernband.csv
aquachat.csv
tgl.csv
guitoo.csv
vrmoo.csv
j4fry.csv
metacosm.csv
mplayer-ce.csv
moviesandbox.csv
uwom.csv
geotools.csv
jrdf.csv
chatforte.csv
ng4j.csv
bloodycore.csv
allacrost.csv
jahshakafx.csv
jstock.csv
nassp.csv
phission.csv
soar.csv
xbplayer.csv
scite-ru.csv
mixxx.csv
toop.csv
mobicents.csv
openwebbuilder.csv
google-secure-data-connector.csv
opencollada.csv
jcache.csv
flylegacy.csv
pde.csv
"['Host' 'Vcs' 'File' 'PL' 'IssueTracking'] not found in axis"
mcmc-jags.csv
mcore3d.csv
gdis.csv
gtad.csv
index 1 is out of bounds for axis 0 with size 1
l

openfrag.csv
zildo.csv
duplicati.csv
This solver needs samples of at least 2 classes in the data, but the data contains only one class: True
olex2.csv
linav.csv
enzo.csv
kvs.csv
vvis.csv
adobe-source.csv
x4x.csv
fakedetector.csv
openmalaria.csv
autofac.csv
neptuner.csv
stajistics.csv
typeset-dictionary.csv
konversation.csv
net-snmp.csv
uface.csv
atlantisos.csv
n2cms.csv
pdfedit.csv
nutz.csv
refdb.csv
vym.csv
cornerstone.csv
nwos.csv
enthusiasm.csv
lynkeos.csv
planeshift.csv
google-caja.csv
phet.csv
maya-work-in-progress.csv
bibletime.csv
locify.csv
firebreath.csv
openss.csv
qpe.csv
smartgwt.csv
eid-applet.csv
nagiosplug.csv
blue-c.csv
jbpm.csv
mvt.csv
ontopia.csv
turbotrader-bos.csv
renaissancecore.csv
appleiigo.csv
muvee-symbolic-expressions.csv
xapian.csv
powermock.csv
keyczar.csv
pal.csv
vrjuggler.csv
woofy.csv
eedt.csv
aime.csv
ipfilter.csv
odyssiportal.csv
barde.csv
libmesh.csv
herostats.csv
smarttag.csv
sauerbraten.csv
super-tux.csv
jgossipforum.csv
languish.csv
aufs.csv
flexpay.

gamestone.csv
nspectre.csv
google-enterprise-connector-sharepoint.csv
jam-daq.csv
badtrinitycore.csv
kftpgrabber.csv
openvrml.csv
qtwin.csv
sync4j.csv
snakeyaml.csv
mevenide.csv
robocode.csv
spheral.csv
red5.csv
digital-delegation.csv
springside.csv
adapdev.csv
ltp.csv
reactivision.csv
ggz.csv
virtualphotoorg.csv
gretl.csv
fido.csv
index 1 is out of bounds for axis 0 with size 1
moagg.csv
ambulant.csv
jmedialibrary.csv
indigente.csv
mlton.csv
logicalloy.csv
nitric.csv
gmod.csv
naxl.csv
yaprm.csv
jtge.csv
mojonation.csv
bulmages.csv
oscarmcmaster.csv
perlqt4.csv
itextsharp.csv
libchipcard.csv
superwaba.csv
proftp.csv
kmatplot.csv
nuttx.csv
nsuml.csv
checker-framework.csv
pocketwit.csv
plato.csv
alpine.csv
bscweasel.csv
requin.csv
mededis.csv
ns-3-dev-def-routing.csv
macjlib.csv
persevere-framework.csv
inkscape.csv
massiv.csv
index 1 is out of bounds for axis 0 with size 1
thrust.csv
fulguro.csv
gusanos.csv
gwtwiki.csv
semanticvectors.csv
popeye-chess.csv
blogentis.csv
wxsvg.csv
cantera.

In [6]:
with open('data/1385/1385_LR_default_1.pkl', 'wb') as handle:
    pickle.dump(final_score, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
df = pd.read_pickle('data/1385/1385_LR_default_1.pkl')
results = []
for project in df.keys():
    results.append([project,np.median(df[project]['f1']),
                   np.median(df[project]['precision']),
                   np.median(df[project]['recall']),
                   np.median(df[project]['g-score']),
                   np.median(df[project]['d2h']),
                   np.median(df[project]['pci_20']),
                   np.median(df[project]['ifa']),
                   np.median(df[project]['pd']),
                   np.median(df[project]['pf']),
                   np.median(df[project]['buggy_prec'])])

In [8]:
results = []
for project in df.keys():
    results.append([project,np.median(df[project]['f1']),
                   np.median(df[project]['precision']),
                   np.median(df[project]['recall']),
                   np.median(df[project]['g-score']),
                   np.median(df[project]['d2h']),
                   np.median(df[project]['pci_20']),
                   np.median(df[project]['ifa']),
                   np.median(df[project]['pd']),
                   np.median(df[project]['pf']),
                   np.median(df[project]['buggy_prec'])])

In [9]:
results_df = pd.DataFrame(results,columns = ['project','f1','precision','recall','g-score','d2h','pci_20','ifa','pd','pf','buggyness'])


In [10]:
results_df.to_csv('data/1385/1385_LR_default_1.csv')

In [None]:
score = np.array([.68,.54,.77,.66,.66,.73,.5,.77,.62,.58,.7,.46,.73,.66,.66,.76,.54,.56,.66,.66,.71,.29,.69,.58,.54])
print(np.median(score))
plt.boxplot(score)