## Importing packages

In [1]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

## Loading the file

In [2]:
# Read ant-1.4.csv into the DataFrame that the rest of the notebook works on.
data=pd.read_csv("ant-1.4.csv")
# info() prints the column/dtype summary; describe() is the last expression,
# so its summary-statistics table is what the cell displays.
data.info()
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 24 columns):
name       178 non-null object
version    178 non-null float64
name.1     178 non-null object
wmc        178 non-null int64
dit        178 non-null int64
noc        178 non-null int64
cbo        178 non-null int64
rfc        178 non-null int64
lcom       178 non-null int64
ca         178 non-null int64
ce         178 non-null int64
npm        178 non-null int64
lcom3      178 non-null float64
loc        178 non-null int64
dam        178 non-null float64
moa        178 non-null int64
mfa        178 non-null float64
cam        178 non-null float64
ic         178 non-null int64
cbm        178 non-null int64
amc        178 non-null float64
max_cc     178 non-null int64
avg_cc     178 non-null float64
bug        178 non-null int64
dtypes: float64(7), int64(15), object(2)
memory usage: 33.5+ KB


Unnamed: 0,version,wmc,dit,noc,cbo,rfc,lcom,ca,ce,npm,...,dam,moa,mfa,cam,ic,cbm,amc,max_cc,avg_cc,bug
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,...,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,1.4,10.494382,2.269663,0.58427,10.752809,33.837079,74.101124,5.691011,5.449438,8.196629,...,0.701811,0.679775,0.467399,0.488047,0.662921,1.05618,26.664679,4.331461,1.377567,0.264045
std,4.453419e-16,10.988565,1.304179,3.337203,16.645035,30.663683,275.640663,16.494479,4.911675,9.166421,...,0.435954,1.371105,0.415967,0.261298,1.024489,1.965002,24.840011,5.083175,0.993203,0.545085
min,1.4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.4,4.0,1.0,0.0,3.25,12.0,0.0,0.0,2.0,3.0,...,0.0,0.0,0.0,0.291098,0.0,0.0,11.7,1.0,0.8,0.0
50%,1.4,7.0,2.0,0.0,7.0,24.0,5.0,1.0,4.0,5.0,...,1.0,0.0,0.60451,0.444444,0.0,0.0,20.666667,2.5,1.01925,0.0
75%,1.4,14.0,3.0,0.0,11.0,48.0,41.0,4.0,8.0,10.0,...,1.0,1.0,0.880952,0.666667,1.0,1.0,33.67674,5.0,1.6859,0.0
max,1.4,77.0,6.0,40.0,136.0,196.0,2446.0,135.0,28.0,68.0,...,1.0,9.0,1.0,1.0,4.0,11.0,208.166667,35.0,6.087,3.0


## Data clean-up: Conversion, pruning, split

In [3]:
# Turn the "bug" column into a binary label: True when the value is above zero
# (a buggy file), False otherwise. The column is overwritten in place.
data["bug"] = data["bug"].gt(0)

# Peek at the first ten labels.
data["bug"].head(10)

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9     True
Name: bug, dtype: bool

## Checking for nulls

In [4]:
# Keep only the rows that have a missing value in at least one column;
# an empty result means there is nothing to impute or drop.
data_null = data.loc[data.isna().any(axis="columns")]
data_null

Unnamed: 0,name,version,name.1,wmc,dit,noc,cbo,rfc,lcom,ca,...,dam,moa,mfa,cam,ic,cbm,amc,max_cc,avg_cc,bug


### Pruning the columns

In [5]:
# `data_corr` is a second name bound to the same frame (plain assignment, not a copy).
data_corr = data

# Input variables: everything except the target ("bug") and the name/version
# columns, which are not used as model inputs.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated and no longer accepts from version 2.0 onward.
x = data.drop(columns=["bug", "name", "version", "name.1"])

# Output variable: the "bug" column.
y = data_corr["bug"]

## Importing packages for machine learning

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Split into training and testing data

In [7]:
# Hold out 10% of the rows for testing and train on the remaining 90%.
# The fixed random_state keeps the split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=1000,
    test_size=0.1,
)

# Size of the training split.
x_train.shape

(160, 20)

In [8]:
# Size of the held-out test split, as a counterpart to x_train.shape.
x_test.shape

(18, 20)

In [9]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

## Logistic Regression

In [10]:
# Build and train the classifier in one expression: fit() returns the fitted
# estimator itself, so `log` ends up bound to the trained model.
log = LogisticRegression(random_state=0).fit(x_train, y_train)

# Predictions for the held-out rows, wrapped in a one-column DataFrame.
y_pred = pd.DataFrame(log.predict(x_test))

In [11]:
# Side-by-side table of actual vs. predicted labels for the test split.
# y_pred is a one-column DataFrame, so iterating its .values yields one-element
# arrays (shown as "[False]"). np.ravel flattens it to one plain label per row,
# giving the "Predicted" column the same scalar form as "Actual".
pd.DataFrame({
    "Actual": y_test.values,
    "Predicted": np.ravel(y_pred),
})

Unnamed: 0,Actual,Predicted
0,False,[False]
1,False,[False]
2,True,[False]
3,False,[False]
4,False,[False]
5,False,[False]
6,False,[False]
7,False,[False]
8,False,[False]
9,False,[False]


In [12]:
# Fraction of test rows predicted correctly (printed to stdout).
print(accuracy_score(y_true=y_test, y_pred=y_pred))

# Confusion matrix for the same predictions, displayed as the cell's value:
# rows are the actual classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

0.8888888888888888


array([[14,  0],
       [ 2,  2]], dtype=int64)

In [13]:
# 10-fold cross-validation on the training split (cv=10 below)
from sklearn.model_selection import cross_val_score

scores = cross_val_score(log, x_train,y_train, cv=10) # cv=10 -> ten folds, one score per fold
scores

array([0.82352941, 0.70588235, 0.70588235, 0.64705882, 0.5625    ,
       0.625     , 0.8       , 0.8       , 0.73333333, 0.8       ])

In [14]:
# Leave-one-out cross-validation on the training split: each round holds out
# a single row and trains on all the others.
from sklearn import model_selection

loocv = model_selection.LeaveOneOut()
results = model_selection.cross_val_score(
    estimator=log,
    X=x_train,
    y=y_train,
    cv=loocv,
)

# One score per held-out row, so the length matches the training-set size.
results.shape

(160,)