In [1]:
import pandas as pd
import numpy as np
import math
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

import platform
from os import listdir
from os.path import isfile, join
from glob import glob
from pathlib import Path
import sys
import os
import copy
import traceback



import matplotlib.pyplot as plt

import SMOTE
import feature_selector
import DE
import CFS
import birch
import metrics.abcd


from multiprocessing import Pool, cpu_count
from threading import Thread
from multiprocessing import Queue

import metrices
import measures

import sys
import traceback
import warnings
# Install a blanket "ignore" filter so no warning is shown for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/scikit-learn — confirm that is intended.
warnings.filterwarnings("ignore")

# Number of CPUs as reported by multiprocessing.cpu_count().
cores = cpu_count()

In [2]:
# Directory that holds one converted CSV file per project.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
data_source1 = '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/1385/converted'

# Joining with an empty last component appends the separator of the running
# OS ('/' on POSIX, '\\' on Windows). The previous platform check appended a
# backslash on every system other than Darwin/Linux, including other POSIX
# systems where that is not a path separator.
_dir = join(data_source1, '')

# Names of the regular files (sub-directories are skipped) found in _dir.
projects = [f for f in listdir(_dir) if isfile(join(_dir, f))]

In [3]:
def prepare_data(path):
    """Read a project CSV and return its metric columns followed by 'Buggy'.

    Processing order matters and is kept as follows:
    1. the identification columns are removed;
    2. every row that still has a missing value in any remaining column
       (including columns that are not selected afterwards) is discarded;
    3. the frame is narrowed to the fixed, ordered metric list plus 'Buggy'.

    Parameters
    ----------
    path : anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The selected columns in the order listed below; the original row
        index is preserved.
    """
    id_columns = ['Host', 'Vcs', 'Project', 'File', 'PL', 'IssueTracking']
    kept_columns = [
        # size
        'TLOC', 'TNF', 'TNC', 'TND', 'LOC', 'CL', 'NStmt', 'NFunc', 'RCC', 'MNL',
        # avg / max / total triples
        'avg_WMC', 'max_WMC', 'total_WMC',
        'avg_DIT', 'max_DIT', 'total_DIT',
        'avg_RFC', 'max_RFC', 'total_RFC',
        'avg_NOC', 'max_NOC', 'total_NOC',
        'avg_CBO', 'max_CBO', 'total_CBO',
        'avg_DIT.1', 'max_DIT.1', 'total_DIT.1',
        'avg_NIV', 'max_NIV', 'total_NIV',
        'avg_NIM', 'max_NIM', 'total_NIM',
        'avg_NOM', 'max_NOM', 'total_NOM',
        'avg_NPBM', 'max_NPBM', 'total_NPBM',
        'avg_NPM', 'max_NPM', 'total_NPM',
        'avg_NPRM', 'max_NPRM', 'total_NPRM',
        'avg_CC', 'max_CC', 'total_CC',
        'avg_FANIN', 'max_FANIN', 'total_FANIN',
        'avg_FANOUT', 'max_FANOUT', 'total_FANOUT',
        # change history
        'NRev', 'NFix',
        'avg_AddedLOC', 'max_AddedLOC', 'total_AddedLOC',
        'avg_DeletedLOC', 'max_DeletedLOC', 'total_DeletedLOC',
        'avg_ModifiedLOC', 'max_ModifiedLOC', 'total_ModifiedLOC',
        # label
        'Buggy',
    ]
    raw = pd.read_csv(path)
    complete_rows = raw.drop(columns=id_columns).dropna()
    return complete_rows[kept_columns]

def get_features(df):
    """Run the project's ``featureSelector.cfs_bfs`` on ``df``.

    ``cfs_bfs`` returns three values; the middle one is discarded here.

    Returns
    -------
    tuple
        ``(first, third)`` elements of the ``cfs_bfs`` result — presumably the
        reduced frame and the selected feature names (verify against
        ``feature_selector``).
    """
    selector = feature_selector.featureSelector()
    reduced_df, _unused_feature_nums, chosen_features = selector.cfs_bfs(df)
    return reduced_df, chosen_features

def apply_cfs(df):
    """Keep only the columns chosen by ``CFS.cfs`` plus the 'Buggy' label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Buggy' column; every other column is passed to
        ``CFS.cfs`` as a feature.

    Returns
    -------
    tuple
        ``(df[cols], cols)`` where ``cols`` is the list of selected feature
        names followed by 'Buggy'.
    """
    y = df.Buggy.values
    features = df.drop(labels=['Buggy'], axis=1)
    selected_cols = CFS.cfs(features.values, y)

    # Flatten to a 1-D integer position array. The previous
    # ``df.columns[[selected_cols]]`` wrapped the positions in an extra list,
    # which current NumPy treats as a 2-D index and pandas rejects.
    # Assumes CFS.cfs returns integer column positions — TODO confirm.
    positions = np.asarray(selected_cols, dtype=int).ravel()

    # The positions refer to the feature matrix handed to CFS, so look the
    # names up in ``features`` rather than ``df``; the two only agree when
    # 'Buggy' happens to be the last column.
    cols = features.columns[positions].tolist()
    cols.append('Buggy')
    return df[cols], cols
    
def apply_smote(df):
    """Run the project's ``SMOTE.smote`` on ``df`` and restore the column labels.

    The frame returned by ``run()`` has its columns overwritten with the
    labels of the input frame before being returned.
    """
    original_columns = df.columns
    resampled = SMOTE.smote(df).run()
    resampled.columns = original_columns
    return resampled

def load_data(path,target):
    """Read a CSV and split it into numeric features and the target column.

    Parameters
    ----------
    path : anything accepted by ``pandas.read_csv``.
    target : str
        Name of the label column.

    Returns
    -------
    tuple
        ``(X, y)`` — ``X`` is every column except ``target`` converted with
        ``pandas.to_numeric``; ``y`` is the untouched ``target`` column.

    Notes
    -----
    For the exact path ``'data/jm1.csv'`` the rows whose ``uniq_Op`` value
    contains a ``?`` placeholder are removed before the conversion.
    """
    df = pd.read_csv(path)
    if path == 'data/jm1.csv':
        # Literal (non-regex) match avoids the invalid "\?" escape sequence.
        # Casting to str keeps this working when the column was parsed as
        # numeric (no placeholder present) or contains missing values.
        has_placeholder = df['uniq_Op'].astype(str).str.contains('?', regex=False)
        df = df[~has_placeholder]
    y = df[target]
    X = df.drop(labels = target, axis = 1)
    X = X.apply(pd.to_numeric)
    return X,y

# Cluster Driver
def cluster_driver(df,print_tree = True):
    """Fit the project's ``birch`` clusterer on ``df`` and return its tree.

    Parameters
    ----------
    df : pandas.DataFrame
        Every column is converted with ``pandas.to_numeric`` before fitting.
    print_tree : bool, default True
        When true, the fitted model's tree is printed through
        ``show_clutser_tree`` (spelling as exposed by the ``birch`` module).

    Returns
    -------
    tuple
        ``(model, tree, depth)`` — the fitted model followed by the two
        values returned by ``get_cluster_tree()``.
    """
    numeric_df = df.apply(pd.to_numeric)
    model = birch.birch(branching_factor=20)
    model.fit(numeric_df)
    tree, depth = model.get_cluster_tree()
    if print_tree:
        model.show_clutser_tree()
    return model, tree, depth

In [4]:
# Load the pickled attribute mapping.
# NOTE(review): unpickling executes arbitrary code — only load files from a trusted source.
attr_dict = pd.read_pickle('data/1385/projects/selected_attr.pkl')

# One row per dictionary key, then cluster the rows; print_tree keeps its
# default, so the cluster tree is printed below the cell.
attr_df = pd.DataFrame.from_dict(data=attr_dict, orient='index')
cluster, cluster_tree, max_depth = cluster_driver(df=attr_df)

[cluster_id=0] N_children: 9 N_samples: 697
> [cluster_id=1] N_children: 9 N_samples: 87
> > [cluster_id=2] N_children: 0 N_samples: 2
> > [cluster_id=3] N_children: 0 N_samples: 7
> > [cluster_id=4] N_children: 0 N_samples: 17
> > [cluster_id=5] N_children: 0 N_samples: 6
> > [cluster_id=6] N_children: 0 N_samples: 18
> > [cluster_id=7] N_children: 0 N_samples: 15
> > [cluster_id=8] N_children: 0 N_samples: 12
> > [cluster_id=9] N_children: 0 N_samples: 6
> > [cluster_id=10] N_children: 0 N_samples: 4
> [cluster_id=11] N_children: 2 N_samples: 4
> > [cluster_id=12] N_children: 0 N_samples: 1
> > [cluster_id=13] N_children: 0 N_samples: 3
> [cluster_id=14] N_children: 10 N_samples: 103
> > [cluster_id=15] N_children: 0 N_samples: 3
> > [cluster_id=16] N_children: 0 N_samples: 4
> > [cluster_id=17] N_children: 0 N_samples: 16
> > [cluster_id=18] N_children: 0 N_samples: 11
> > [cluster_id=19] N_children: 0 N_samples: 19
> > [cluster_id=20] N_children: 0 N_samples: 3
> > [cluster_id=21] 

In [5]:
# Index labels of the attr_df rows that belong to cluster 6 of the tree.
cluster_node = cluster_tree[6]
member_rows = attr_df.iloc[cluster_node.data_points]
selected_projects = list(member_rows.index)

In [47]:
def load_data_t(s_project):
    """Build the training set for one project.

    The project file is loaded and labelled through ``load_data_te`` (which
    previously was duplicated line-for-line here), then reduced with
    ``apply_cfs`` and passed through ``apply_smote``.

    Parameters
    ----------
    s_project : str
        File name of the project CSV inside the converted-data directory.

    Returns
    -------
    tuple
        ``(X, y, s_cols)`` — features, the 'Buggy' column, and the column
        list returned by ``apply_cfs`` (selected features followed by
        'Buggy').
    """
    df = load_data_te(s_project)
    df, s_cols = apply_cfs(df)
    df = apply_smote(df)
    y = df.Buggy
    X = df.drop(labels = ['Buggy'],axis = 1)
    return X,y,s_cols

def load_data_te(s_project, data_dir=None):
    """Load one project's metrics with the 'Buggy' label mapped to booleans.

    Parameters
    ----------
    s_project : str
        File name of the project CSV.
    data_dir : str, optional
        Directory containing the file. Defaults to the notebook-wide ``_dir``
        so the location is configured in one place instead of being repeated
        as a second hard-coded absolute path.

    Returns
    -------
    pandas.DataFrame
        Output of ``prepare_data`` with a fresh 0..n-1 index and 'Buggy'
        mapped ``'buggy' -> True`` / ``'clean' -> False``. Any other label
        becomes NaN (behaviour of ``Series.map`` with a dict).
    """
    if data_dir is None:
        data_dir = _dir
    s_path = os.path.join(data_dir, s_project)

    # reset_index/assign return new frames, so nothing is written into the
    # selection returned by prepare_data.
    df = prepare_data(s_path).reset_index(drop=True)
    d = {'buggy': True, 'clean': False}
    return df.assign(Buggy=df['Buggy'].map(d))

In [52]:
# First model: trained on google-collections.
train_X, train_y, s1_cols = load_data_t('google-collections.csv')
clf1 = LogisticRegression().fit(train_X, train_y)

# Second model: trained on piccolo2d. train_X / train_y are reused, so after
# this cell they hold the piccolo2d training data.
train_X, train_y, s2_cols = load_data_t('piccolo2d.csv')
clf2 = LogisticRegression().fit(train_X, train_y)

# fit() returns the estimator itself, so this shows the same repr as before.
clf2

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [53]:
# Evaluate both trained models, and their combination, on every project of
# the selected cluster.
for project in selected_projects:
    print("Selcted Project==============================================")
    test_df = load_data_te(project)

    # clf1 sees only the columns it was trained on (s1_cols ends with 'Buggy').
    _test_df = test_df[s1_cols]
    _test_y = _test_df.Buggy
    _test_X = _test_df.drop(labels = ['Buggy'],axis = 1)
    prediction1 = clf1.predict(_test_X)
    print("From Clf1----------------------")
    print(classification_report(_test_y, prediction1))

    # Same for clf2 with its own column subset; the labels are identical
    # because both subsets come from the same test_df.
    _test_df = test_df[s2_cols]
    _test_y = _test_df.Buggy
    _test_X = _test_df.drop(labels = ['Buggy'],axis = 1)
    prediction2 = clf2.predict(_test_X)
    print("From Clf2-----------------------")
    print(classification_report(_test_y, prediction2))

    # Combined prediction: a file is reported buggy only when both models
    # agree that it is. The previous if/elif chain resolved every
    # disagreement in favour of clf2 and therefore always reproduced
    # prediction2, making the "mixed" report a copy of the clf2 report.
    # NOTE(review): logical AND follows the formula that was commented out
    # here; switch to `or` if a recall-oriented ensemble was intended.
    prediction = [bool(a) and bool(b) for a, b in zip(prediction1, prediction2)]
    print("From Clf mixed-------------------")
    print(classification_report(_test_y, prediction))

From Clf1----------------------
              precision    recall  f1-score   support

       False       0.12      1.00      0.22         2
        True       1.00      0.82      0.90        80

    accuracy                           0.83        82
   macro avg       0.56      0.91      0.56        82
weighted avg       0.98      0.83      0.89        82

From Clf2-----------------------
              precision    recall  f1-score   support

       False       0.03      1.00      0.05         2
        True       1.00      0.09      0.16        80

    accuracy                           0.11        82
   macro avg       0.51      0.54      0.11        82
weighted avg       0.98      0.11      0.16        82

From Clf mixed-------------------
              precision    recall  f1-score   support

       False       0.03      1.00      0.05         2
        True       1.00      0.09      0.16        80

    accuracy                           0.11        82
   macro avg       0.51      

From Clf1----------------------
              precision    recall  f1-score   support

       False       0.00      0.00      0.00        92
        True       0.08      1.00      0.15         8

    accuracy                           0.08       100
   macro avg       0.04      0.50      0.07       100
weighted avg       0.01      0.08      0.01       100

From Clf2-----------------------
              precision    recall  f1-score   support

       False       0.95      0.76      0.84        92
        True       0.15      0.50      0.24         8

    accuracy                           0.74       100
   macro avg       0.55      0.63      0.54       100
weighted avg       0.88      0.74      0.79       100

From Clf mixed-------------------
              precision    recall  f1-score   support

       False       0.95      0.76      0.84        92
        True       0.15      0.50      0.24         8

    accuracy                           0.74       100
   macro avg       0.55      

From Clf1----------------------
              precision    recall  f1-score   support

       False       0.00      0.00      0.00        10
        True       0.96      1.00      0.98       241

    accuracy                           0.96       251
   macro avg       0.48      0.50      0.49       251
weighted avg       0.92      0.96      0.94       251

From Clf2-----------------------
              precision    recall  f1-score   support

       False       0.06      0.80      0.12        10
        True       0.98      0.51      0.68       241

    accuracy                           0.53       251
   macro avg       0.52      0.66      0.40       251
weighted avg       0.95      0.53      0.65       251

From Clf mixed-------------------
              precision    recall  f1-score   support

       False       0.06      0.80      0.12        10
        True       0.98      0.51      0.68       241

    accuracy                           0.53       251
   macro avg       0.52      

In [34]:
# Worked example of the element-wise AND used to combine two prediction
# lists. x and y were not defined anywhere in the notebook, so this cell
# failed on a fresh kernel; the inputs below cover all four combinations and
# reproduce the recorded output.
x = [True, True, False, False]
y = [True, False, True, False]
z = [a and b for a, b in zip(x, y)]

In [35]:
# Bare expression: shows the combined list as the cell output.
z

[True, False, False, False]