In [1]:
import os, random
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
random.seed(10)

In [2]:
os.getcwd()
data_path = "/defects/src/data/Jureczko/collated_data/"
os.chdir(os.getcwd() + data_path)

In [3]:
data = pd.read_csv("ant_merged.csv")
print(data.head())
X = data.loc[:,data.columns!='$<bug']
y = data.loc[:,data.columns=='$<bug']

   $wmc  $dit  $noc  $cbo  $rfc  $lcom  $ca  $ce  $npm    $lcom3  ...    $dam  \
0    11     4     2    14    42     29    2   12     5  0.725000  ...     1.0   
1    14     1     1     8    32     49    4    4    12  0.835165  ...     1.0   
2     3     2     0     1     9      0    0    1     1  0.000000  ...     1.0   
3    12     3     0    12    37     32    0   12    12  0.858586  ...     1.0   
4     6     3     0     4    21      1    0    4     6  0.700000  ...     1.0   

   $moa      $mfa      $cam  $ic  $cbm       $amc  $max_cc  $avg_cc  $<bug  
0     1  0.885057  0.232323    3     4  34.545455        3   1.2727      0  
1     0  0.000000  0.307692    0     0  16.857143        6   1.6429      1  
2     1  0.714286  0.666667    1     1  17.333333        1   0.6667      0  
3     1  0.770833  0.458333    0     0  24.083333        3   1.4167      0  
4     0  0.880952  0.416667    2     2  21.000000        1   0.8333      0  

[5 rows x 21 columns]


In [4]:
print("X dim: ",X.shape)
print("Y dim: ",y.shape)

X dim:  (1692, 20)
Y dim:  (1692, 1)


In [15]:
projList = os.listdir()
projs = [p.split('_')[0] for p in projList]
print("List of projects:",projs)

List of projects: ['ant', 'camel', 'ivy', 'jedit', 'log4j', 'lucene', 'poi', 'velocity', 'xalan', 'xerces']


In [11]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
# print("X_train dim: ", X_train.shape)
# print("X_test dim: ", X_test.shape)
# print("y_train dim: ", y_train.shape)
# print("y_test dim: ", y_test.shape)

In [26]:
for i in range(len(projs)):
    print("\nIteration ",i)
    trainData = pd.read_csv(projList[i])
    #print("train data: ", projList[i], projs[i])
    #print(trainData.head())
    X_train = trainData.loc[:,trainData.columns!='$<bug']
    y_train = trainData.loc[:,trainData.columns=='$<bug']
    for j in range(len(projs)):
        if (i != j):
            testData = pd.read_csv(projList[j])
            #print("test data: ", projList[j], projs[j])
            #print(testData.head())
            X_test = testData.loc[:, testData.columns!='$<bug']
            y_test = testData.loc[:, testData.columns=='$<bug']
            clf = RandomForestClassifier(n_estimators=1000, n_jobs=1)
            clf.fit(X_train, y_train.values.ravel())
            y_pred = clf.predict(X_test)
            print("Model trained on ",projs[i])
            print("Project: {} Accuracy: {}".format(projs[j],metrics.accuracy_score(y_test, y_pred)))

            


Iteration  0
Model trained on  ant
Project: camel Accuracy: 0.790948275862069
Model trained on  ant
Project: ivy Accuracy: 0.8011363636363636
Model trained on  ant
Project: jedit Accuracy: 0.8107489994282447
Model trained on  ant
Project: log4j Accuracy: 0.45434298440979953
Model trained on  ant
Project: lucene Accuracy: 0.49104859335038364
Model trained on  ant
Project: poi Accuracy: 0.5137880986937591
Model trained on  ant
Project: velocity Accuracy: 0.4397496087636933
Model trained on  ant
Project: xalan Accuracy: 0.5051204819277109
Model trained on  ant
Project: xerces Accuracy: 0.62507608034084

Iteration  1
Model trained on  camel
Project: ant Accuracy: 0.7807328605200946
Model trained on  camel
Project: ivy Accuracy: 0.8082386363636364
Model trained on  camel
Project: jedit Accuracy: 0.7827329902801601
Model trained on  camel
Project: log4j Accuracy: 0.48775055679287305
Model trained on  camel
Project: lucene Accuracy: 0.4859335038363171
Model trained on  camel
Project: poi Acc

### Training a random forest classifier

In [15]:
clf = RandomForestClassifier(n_estimators=1000, n_jobs=1)
clf.fit(X_train, y_train)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
y_predictions = clf.predict(X_test)

In [17]:
print("Accuracy: ",metrics.accuracy_score(y_test, y_predictions))
print("Precision: ",metrics.precision_score(y_test, y_predictions))
print("Recall: ",metrics.recall_score(y_test, y_predictions))
print("F1-score: ",metrics.f1_score(y_test, y_predictions))

Accuracy:  0.8053097345132744
Precision:  0.5217391304347826
Recall:  0.35294117647058826
F1-score:  0.4210526315789474
