In [1]:
import pandas as pd
import numpy as np
import math
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

import platform
from os import listdir
from os.path import isfile, join
from glob import glob
from pathlib import Path
import sys
import os
import copy
import traceback
import sys
import traceback
import warnings
import timeit



import matplotlib.pyplot as plt

import SMOTE
import feature_selector
import DE
import CFS
import metrics.abcd
import FFT_New as FFT
import metrices
import measures

warnings.filterwarnings("ignore")

In [2]:
def get_projects(data_source):
    #'/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/1385/converted'
    if platform.system() == 'Darwin' or platform.system() == 'Linux':
        _dir = data_source + '/'
    else:
        _dir = data_source + '\\'
    projects = [f for f in listdir(_dir) if isfile(join(_dir, f))]
    return projects

def prepare_data(path):
    df = pd.read_csv(path)
    df = df.drop(labels = ['Host','Vcs','Project','File','PL','IssueTracking'],axis=1)
    df = df.dropna()
    df = df[['TLOC', 'TNF', 'TNC', 'TND', 'LOC', 'CL', 'NStmt', 'NFunc',
       'RCC', 'MNL', 'avg_WMC', 'max_WMC', 'total_WMC', 'avg_DIT', 'max_DIT',
       'total_DIT', 'avg_RFC', 'max_RFC', 'total_RFC', 'avg_NOC', 'max_NOC',
       'total_NOC', 'avg_CBO', 'max_CBO', 'total_CBO', 'avg_DIT.1',
       'max_DIT.1', 'total_DIT.1', 'avg_NIV', 'max_NIV', 'total_NIV',
       'avg_NIM', 'max_NIM', 'total_NIM', 'avg_NOM', 'max_NOM', 'total_NOM',
       'avg_NPBM', 'max_NPBM', 'total_NPBM', 'avg_NPM', 'max_NPM', 'total_NPM',
       'avg_NPRM', 'max_NPRM', 'total_NPRM', 'avg_CC', 'max_CC', 'total_CC',
       'avg_FANIN', 'max_FANIN', 'total_FANIN', 'avg_FANOUT', 'max_FANOUT',
       'total_FANOUT', 'NRev', 'NFix', 'avg_AddedLOC', 'max_AddedLOC',
       'total_AddedLOC', 'avg_DeletedLOC', 'max_DeletedLOC',
       'total_DeletedLOC', 'avg_ModifiedLOC', 'max_ModifiedLOC',
       'total_ModifiedLOC','Buggy']]
    return df

def get_features(df):
    fs = feature_selector.featureSelector()
    df,_feature_nums,features = fs.cfs_bfs(df)
    return df,features

def apply_cfs(df):
    y = df.Buggy.values
    X = df.drop(labels = ['Buggy'],axis = 1)
    X = X.values
    selected_cols = CFS.cfs(X,y)
    cols = df.columns[[selected_cols]].tolist()
    cols.append('Buggy')
    return df[cols],cols
    
def apply_smote(df):
    cols = df.columns
    smt = SMOTE.smote(df)
    df = smt.run()
    df.columns = cols
    return df

def tune_learner(learner, train_X, train_Y, tune_X, tune_Y, goal,loc=None,target_class=None):
    if not target_class:
        target_class = goal
    clf = learner(train_X, train_Y, tune_X, tune_Y, goal,loc)
    tuner = DE.DE_Tune_ML(clf, clf.get_param(), goal, target_class)
    return tuner.Tune()

In [3]:
selected_projects = get_projects('/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/1385/converted')

In [4]:
selected_projects

['bzbyte.csv',
 'lcdata.csv',
 'llcon.csv',
 'freedom-erp.csv',
 'gpsee.csv',
 'twostep.csv',
 'tauruss.csv',
 'makumba.csv',
 'openi.csv',
 'pure-lang.csv',
 'roguelike.csv',
 'iaml.csv',
 'wiquery.csv',
 'etics.csv',
 'neoengine.csv',
 'owasp-esapi-java.csv',
 'voot.csv',
 'nsis.csv',
 'vdsf.csv',
 'airhead-research.csv',
 'monetdb.csv',
 'x10.csv',
 'wmii.csv',
 'bungeni-editor.csv',
 'openejb.csv',
 'gogglesmm.csv',
 'workhub2.csv',
 'amygdala.csv',
 'mozzie.csv',
 'ikvm.csv',
 'gild.csv',
 'jicarilla.csv',
 'arcanea-project.csv',
 'vito.csv',
 'middle-man.csv',
 'pernband.csv',
 'aquachat.csv',
 'tgl.csv',
 'guitoo.csv',
 'vrmoo.csv',
 'j4fry.csv',
 'metacosm.csv',
 'mplayer-ce.csv',
 'moviesandbox.csv',
 'uwom.csv',
 'geotools.csv',
 'jrdf.csv',
 'chatforte.csv',
 'ng4j.csv',
 'bloodycore.csv',
 'allacrost.csv',
 'jahshakafx.csv',
 'jstock.csv',
 'nassp.csv',
 'phission.csv',
 'soar.csv',
 'xbplayer.csv',
 'scite-ru.csv',
 'mixxx.csv',
 'toop.csv',
 'mobicents.csv',
 'openwebbuil

In [5]:
for project in selected_projects:
    start = timeit.default_timer()
    if project != 'logicmail.csv':
        continue
    path = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/1385/converted/' + project
    df = prepare_data(path)
    if df.shape[0] < 50:
        continue
    df.reset_index(drop=True,inplace=True)
    d = {'buggy': True, 'clean': False}
    df['Buggy'] = df['Buggy'].map(d)
    y = df.Buggy
    X = df.drop(labels = ['Buggy'],axis = 1)
    train_X,test_X,train_y,test_y = train_test_split(X, y, test_size=0.33)
    train_df = pd.concat([train_X,train_y], axis = 1)
    test_df = pd.concat([test_X,test_y], axis = 1)
    print(test_df.shape)
    fft = FFT.FFT('cdom','Buggy',5)
    fft.train, fft.test = train_df, test_df
    fft.build_trees()
    fft.eval_trees()
    results = fft.find_best_tree()
    best_structure = fft.structures[fft.best]
    stop = timeit.default_timer()
    print("Model training time: ", stop - start)
    break
    

(18, 67)
0
1
2
3
0
1
2
3
0
1
2
3
0
1
2
3
0
1
2
3
0
1
2
3
0
1
2
3
0
1
2
3
0
1
2
3
0
1
2
3
0
1
2
3
0
1
2
3
0
1
2
3
0
1
2
3
0
1
2
3
0
1
2
3
	----- PERFORMANCES FOR ALL FFTs on Training Data -----
	CLF 	PRE  	REC 	SPE 	FPR 	NPV 	ACC 	F_1 	cdom
	FFT(0)	1.000	1.000	00000	10000	00000	1.000	1.000	[1.0, 1.0, 1]
	FFT(1)	1.000	1.000	1.000	0.000	1.000	1.000	1.000	[1.0, 1.0, 0.0]
	FFT(2)	1.000	1.000	1.000	0.000	1.000	1.000	1.000	[1.0, 1.0, 0.0]
	FFT(3)	1.000	1.000	1.000	0.000	1.000	1.000	1.000	[1.0, 1.0, 0.0]
	FFT(4)	1.000	1.000	1.000	0.000	1.000	1.000	1.000	[1.0, 1.0, 0.0]
	FFT(5)	1.000	1.000	1.000	0.000	1.000	1.000	1.000	[1.0, 1.0, 0.0]
	FFT(6)	1.000	1.000	1.000	0.000	1.000	1.000	1.000	[1.0, 1.0, 0.0]
	FFT(7)	1.000	1.000	1.000	0.000	1.000	1.000	1.000	[1.0, 1.0, 0.0]
	FFT(8)	1.000	0.267	1.000	0.000	0.267	0.421	0.421	[1.0, 0.267, 0.0]
	FFT(9)	1.000	0.267	1.000	0.000	0.267	0.421	0.421	[1.0, 0.267, 0.0]
	FFT(10)	1.000	0.267	1.000	0.000	0.267	0.421	0.421	[1.0, 0.267, 0.0]
	FFT(11)	1.000	0.214	1.000	0.

In [8]:
df = prepare_data('/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/1385/converted/bzbyte.csv')
fft.test = df

In [10]:
x = fft.eval_other_project(1)
print(x)

0
1
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]


In [None]:
fft.test

In [11]:
fft.best

1