## Importing packages

In [2]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

## Loading the file

In [3]:
# Read the Ant 1.4 metrics CSV into a DataFrame, then show its schema
# summary followed by the first ten rows.
csv_path = "ant-1.4.csv"
data = pd.read_csv(csv_path)
data.info()
data.head(n=10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 24 columns):
name       178 non-null object
version    178 non-null float64
name.1     178 non-null object
wmc        178 non-null int64
dit        178 non-null int64
noc        178 non-null int64
cbo        178 non-null int64
rfc        178 non-null int64
lcom       178 non-null int64
ca         178 non-null int64
ce         178 non-null int64
npm        178 non-null int64
lcom3      178 non-null float64
loc        178 non-null int64
dam        178 non-null float64
moa        178 non-null int64
mfa        178 non-null float64
cam        178 non-null float64
ic         178 non-null int64
cbm        178 non-null int64
amc        178 non-null float64
max_cc     178 non-null int64
avg_cc     178 non-null float64
bug        178 non-null int64
dtypes: float64(7), int64(15), object(2)
memory usage: 33.5+ KB


Unnamed: 0,name,version,name.1,wmc,dit,noc,cbo,rfc,lcom,ca,...,dam,moa,mfa,cam,ic,cbm,amc,max_cc,avg_cc,bug
0,ant,1.4,org.apache.tools.ant.taskdefs.PathConvert,14,3,0,8,41,33,1,...,1.0,2,0.74,0.357143,1,1,25.857143,4,1.5714,0
1,ant,1.4,org.apache.tools.ant.taskdefs.Untar,5,3,0,9,43,0,0,...,1.0,0,0.902439,0.533333,2,3,53.6,1,0.8,0
2,ant,1.4,org.apache.tools.ant.taskdefs.PumpStreamHandler,13,1,1,8,20,46,6,...,1.0,0,0.0,0.615385,0,0,7.230769,1,0.7692,0
3,ant,1.4,org.apache.tools.ant.taskdefs.Copydir,8,4,0,7,35,4,0,...,1.0,0,0.917647,0.46875,3,3,31.375,5,1.375,0
4,ant,1.4,org.apache.tools.tar.TarEntry,31,1,0,6,65,243,4,...,1.0,0,0.0,0.182796,1,1,22.612903,4,1.1613,0
5,ant,1.4,org.apache.tools.tar.TarOutputStream,15,3,0,3,37,41,1,...,0.75,1,0.454545,0.311111,1,2,27.333333,1,0.8,0
6,ant,1.4,org.apache.tools.ant.taskdefs.JikesOutputParser,12,1,0,3,21,44,1,...,1.0,1,0.0,0.261905,0,0,10.666667,4,1.25,0
7,ant,1.4,org.apache.tools.ant.taskdefs.compilers.Sj,2,2,0,6,9,1,1,...,0.0,0,0.958333,1.0,0,0,15.5,1,0.5,0
8,ant,1.4,org.apache.tools.ant.taskdefs.Filter,6,3,0,6,16,5,0,...,1.0,0,0.880952,0.5,0,0,14.333333,1,0.8333,0
9,ant,1.4,org.apache.tools.ant.taskdefs.ExecuteOn,15,4,2,18,55,37,2,...,1.0,4,0.846154,0.193333,3,7,47.0,12,2.4667,1


# Data clean-up: Conversion, pruning, split

In [4]:
# Turn the integer 'bug' column into a boolean label:
# True when the value is greater than zero (buggy file), False otherwise.
is_buggy = data["bug"].gt(0)
data["bug"] = is_buggy
data["bug"].head(n=10)

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9     True
Name: bug, dtype: bool

# Checking for nulls

In [5]:
# Flag every row that has at least one missing value, then keep only those
# rows; an empty result means the dataset contains no nulls.
row_has_null = data.isnull().any(axis=1)
data_null = data[row_has_null]
data_null

Unnamed: 0,name,version,name.1,wmc,dit,noc,cbo,rfc,lcom,ca,...,dam,moa,mfa,cam,ic,cbm,amc,max_cc,avg_cc,bug


### Pruning the columns

In [6]:
# `data_corr` is another name for the same DataFrame object (no copy is made).
data_corr = data

# Input variables: every column except the target ('bug') and the
# columns dropped as not relevant ('name', 'version', 'name.1').
# `columns=` is used instead of a positional axis argument, because passing
# `axis` positionally to DataFrame.drop was deprecated and removed in pandas 2.0.
x = data.drop(columns=["bug", "name", "version", "name.1"])

# Output variable: the boolean defect label.
y = data_corr["bug"]

## Importing packages for machine learning

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Split into training and testing data

In [8]:
# Hold out 10% of the rows for testing and train on the remaining 90%.
# The fixed random_state keeps the split reproducible between runs.
split_parts = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=1000,
)
x_train, x_test, y_train, y_test = split_parts
x_train.shape

(160, 20)

In [9]:
# Dimensions (rows, columns) of the held-out test feature set.
x_test.shape

(18, 20)

In [10]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

## Logistic Regression

In [11]:
# Fit a logistic-regression classifier on the training split
# (fit() returns the estimator itself, so the call can be chained).
log = LogisticRegression(random_state=0).fit(x_train, y_train)

# Predict the held-out rows and wrap the result in a one-column DataFrame.
test_predictions = log.predict(x_test)
y_pred = pd.DataFrame(test_predictions)

In [12]:
# Compare actual vs. predicted outcomes side by side.
# `y_pred` is a one-column DataFrame, so `y_pred.values` is 2-D and iterating
# over it yields one-element arrays (rendered as "[False]"). Flattening to a
# 1-D array makes every "Predicted" cell a plain boolean. Positional arrays
# are used on both sides so the rows line up by position, not by index label.
pd.DataFrame({
    'Actual': np.asarray(y_test).ravel(),
    'Predicted': np.asarray(y_pred).ravel(),
})

Unnamed: 0,Actual,Predicted
0,False,[False]
1,False,[False]
2,True,[False]
3,False,[False]
4,False,[False]
5,False,[False]
6,False,[False]
7,False,[False]
8,False,[False]
9,False,[False]


In [13]:
# Score the predictions against the true test labels: overall accuracy
# first, then the confusion matrix (rows = actual, columns = predicted).
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
print(test_accuracy)
print(test_confusion)

0.8888888888888888
[[14  0]
 [ 2  2]]


In [14]:
import pickle

# Serialise the fitted classifier to disk so it can be loaded for deployment.
filename = 'defectModel'

# The context manager closes the file even if pickling raises, which the
# manual open()/close() pair did not guarantee.
# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
with open(filename, 'wb') as outfile:
    pickle.dump(log, outfile)  # save the model

# We saved the trained logistic regression model, which reached about 89 percent accuracy (16 of 18 test samples), for deployment