In [1]:
import os, random
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix

random.seed(10)

  from numpy.core.umath_tests import inner1d


In [2]:
os.getcwd()
data_path = "/defects/src/data/Jureczko/collated_data/"
os.chdir(os.getcwd() + data_path)

In [3]:
data = pd.read_csv("ant_merged.csv")
print(data.head())
X = data.loc[:,data.columns!='$<bug']
y = data.loc[:,data.columns=='$<bug']

   $wmc  $dit  $noc  $cbo  $rfc  $lcom  $ca  $ce  $npm    $lcom3  ...    $dam  \
0    11     4     2    14    42     29    2   12     5  0.725000  ...     1.0   
1    14     1     1     8    32     49    4    4    12  0.835165  ...     1.0   
2     3     2     0     1     9      0    0    1     1  0.000000  ...     1.0   
3    12     3     0    12    37     32    0   12    12  0.858586  ...     1.0   
4     6     3     0     4    21      1    0    4     6  0.700000  ...     1.0   

   $moa      $mfa      $cam  $ic  $cbm       $amc  $max_cc  $avg_cc  $<bug  
0     1  0.885057  0.232323    3     4  34.545455        3   1.2727      0  
1     0  0.000000  0.307692    0     0  16.857143        6   1.6429      1  
2     1  0.714286  0.666667    1     1  17.333333        1   0.6667      0  
3     1  0.770833  0.458333    0     0  24.083333        3   1.4167      0  
4     0  0.880952  0.416667    2     2  21.000000        1   0.8333      0  

[5 rows x 21 columns]


In [4]:
print("X dim: ",X.shape)
print("Y dim: ",y.shape)

X dim:  (1692, 20)
Y dim:  (1692, 1)


In [5]:
projList = os.listdir()
projs = [p.split('_')[0] for p in projList]
print("List of projects:",projs)

List of projects: ['ant', 'camel', 'ivy', 'jedit', 'log4j', 'lucene', 'poi', 'velocity', 'xalan', 'xerces']


In [6]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
# print("X_train dim: ", X_train.shape)
# print("X_test dim: ", X_test.shape)
# print("y_train dim: ", y_train.shape)
# print("y_test dim: ", y_test.shape)

In [13]:
tbl = [[0]*len(projs) for p in projs]

for i in range(len(projs)):
    print("\nIteration ",i)
    trainData = pd.read_csv(projList[i])
    #print("train data: ", projList[i], projs[i])
    #print(trainData.head())
    X_train = trainData.loc[:,trainData.columns!='$<bug']
    y_train = trainData.loc[:,trainData.columns=='$<bug']
    for j in range(len(projs)):
        if (i != j):
            testData = pd.read_csv(projList[j])
            #print("test data: ", projList[j], projs[j])
            #print(testData.head())
            X_test = testData.loc[:, testData.columns!='$<bug']
            y_test = testData.loc[:, testData.columns=='$<bug']
            clf = RandomForestClassifier(n_estimators=1000, n_jobs=1)
            clf.fit(X_train, y_train.values.ravel())
            y_pred = clf.predict(X_test)
            cm = confusion_matrix(y_test, y_pred)
            #print("Sum: ",sum(y_pred))
            #print("Confusion Matrix: ",cm)
            #print("True Positive: ", cm[1][1])
            #print("False Positive: ", cm[0][1])
            #print("True Negative: ", cm[0][0])
            #print("False Negative: ", cm[1][0])
            #pd = tp/(tp+fn)
            recall = cm[1][1]/(cm[1][1] + cm[1][0])
            #pf = fp/(fp+tn)
            pf = cm[0][1]/(cm[0][1] + cm[0][0])
            
            g = 2/((1/recall) + (1/(1-pf)))
            
            print("Model trained on ",projs[i] + " " +  projs[j])
            acc = metrics.accuracy_score(y_test, y_pred)
            #print("Project: {}, Accuracy: {}, Precision: {}".format(projs[j],acc,
            #                                                       metrics.precision_score(y_test, y_pred)))
            tbl[i][j]=acc
            #print("Test Project: {}, G-Score {}".format(projs[j], g))
print(tbl)


Iteration  0
Model trained on  ant camel
Model trained on  ant ivy
Model trained on  ant jedit
Model trained on  ant log4j
Model trained on  ant lucene
Model trained on  ant poi
Model trained on  ant velocity
Model trained on  ant xalan
Model trained on  ant xerces

Iteration  1
Model trained on  camel ant
Model trained on  camel ivy
Model trained on  camel jedit
Model trained on  camel log4j
Model trained on  camel lucene
Model trained on  camel poi
Model trained on  camel velocity
Model trained on  camel xalan
Model trained on  camel xerces

Iteration  2
Model trained on  ivy ant
Model trained on  ivy camel
Model trained on  ivy jedit
Model trained on  ivy log4j
Model trained on  ivy lucene
Model trained on  ivy poi
Model trained on  ivy velocity
Model trained on  ivy xalan
Model trained on  ivy xerces

Iteration  3
Model trained on  jedit ant
Model trained on  jedit camel
Model trained on  jedit ivy
Model trained on  jedit log4j
Model trained on  jedit lucene
Model trained on  jedi

In [21]:
df = pd.DataFrame(tbl)
df.columns=projs
df.insert(0,'projects',projs)
print(df)
#print(os.getcwd())
df.to_csv("..//baseline_accuracy.csv",index=False)

   projects       ant     camel       ivy     jedit     log4j    lucene  \
0       ant  0.000000  0.790589  0.809659  0.808462  0.456570  0.492327   
1     camel  0.780142  0.000000  0.813920  0.775300  0.489978  0.487212   
2       ivy  0.806147  0.788434  0.000000  0.819897  0.429844  0.462916   
3     jedit  0.799054  0.801006  0.829545  0.000000  0.423163  0.442455   
4     log4j  0.319740  0.344109  0.259943  0.288736  0.000000  0.599744   
5    lucene  0.433806  0.463721  0.467330  0.438536  0.623608  0.000000   
6       poi  0.591608  0.645474  0.636364  0.551172  0.634744  0.635550   
7  velocity  0.531324  0.452586  0.386364  0.424814  0.585746  0.576726   
8     xalan  0.552009  0.627874  0.529830  0.424242  0.603563  0.595908   
9    xerces  0.400709  0.354526  0.382102  0.363636  0.596882  0.625320   

        poi  velocity     xalan    xerces  
0  0.513788  0.439750  0.504518  0.631163  
1  0.524673  0.463224  0.501807  0.606817  
2  0.496372  0.430360  0.483735  0.613512 

### Training a random forest classifier

In [15]:
clf = RandomForestClassifier(n_estimators=1000, n_jobs=1)
clf.fit(X_train, y_train)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
y_predictions = clf.predict(X_test)

In [17]:
print("Accuracy: ",metrics.accuracy_score(y_test, y_predictions))
print("Precision: ",metrics.precision_score(y_test, y_predictions))
print("Recall: ",metrics.recall_score(y_test, y_predictions))
print("F1-score: ",metrics.f1_score(y_test, y_predictions))

Accuracy:  0.8053097345132744
Precision:  0.5217391304347826
Recall:  0.35294117647058826
F1-score:  0.4210526315789474
