In [25]:
from sklearn import tree
from sklearn.svm import SVC
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from helpers import preview_tree
from copy import copy
import json
import csv
import math
import re
import numpy as np
import matplotlib.pyplot as plt
import random

Let's get the data

In [26]:
# Read the whole history file into memory as a list of rows (lists of strings);
# the first row is the header with the column names.
# newline='' is what the csv module asks for on text-mode file objects, so that
# line breaks embedded inside quoted fields are parsed correctly.
with open('./experiences/history.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    data = list(reader)

Prepare
=======

Let's remove the columns we don't need, convert the feature values to floats, split the rows into train and test sets, and separate the labels from the features.

In [27]:
# Change label
label = '__changed'

# Position of the label column in the header row. It stays 0 both when the
# header does not contain `label` and when `label` is the very first column.
# NOTE(review): those two cases are indistinguishable here - confirm that the
# fallback to column 0 is intended when the name is missing.
header = data[0]
label_index = header.index(label) if label in header else 0

if label_index == 0:
    print('Using the __changed label only...')

# Overwrite column 0 of every data row (the header is skipped) with a
# two-character label: the first character of the current column 0 followed by
# the first character of the chosen label column.
for row in data[1:]:
    row[0] = row[0][0] + row[label_index][0]

Using the __changed label only...


In [28]:
# Remove Columns
def deleteColumns(regex, data):
    """Remove, in place, every column whose header name matches `regex`.

    regex -- pattern handed to re.match, so it is anchored at the start of
             the column name.
    data  -- list of rows; data[0] is the header row holding the column names.

    Column 0 is never removed, even when its name matches. Each row list is
    mutated directly and nothing is returned.
    """
    header = data[0]
    doomed = [pos for pos in range(1, len(header)) if re.match(regex, header[pos])]

    # Delete from the right-most column first so the remaining positions stay valid.
    doomed.reverse()
    for record in data:
        for pos in doomed:
            del record[pos]
        
# Columns that are always dropped before training.
for pattern in ('^_mostChanged.*', '^__filename'):
    deleteColumns(pattern, data)

# Optional removals, kept for experimenting with smaller feature sets.
# deleteColumns('^authorChanges.*', data)
# deleteColumns('^authorChanges\:\:.*', data)
# deleteColumns('.*\:date\+size\-weighted.*', data)
# deleteColumns('.*\:date\-weighted.*', data)
# deleteColumns('.*\:size\-weighted.*', data)

# deleteColumns('^changes$', data)
# deleteColumns('^changes-fixes$', data)
# deleteColumns('^changes-others$', data)

# deleteColumns('.*\:normalized.*', data)
# deleteColumns('.*\:raw.*', data)

# List the remaining feature columns, numbered from 0 (the label column at
# position 0 of the header is left out of the listing).
for position, name in enumerate(data[0][1:]):
    print('%4d -> %s' % (position, name))

   0 -> _bytes
   1 -> _lines
   2 -> changes-fixes:date+size-weighted:normalized
   3 -> changes-fixes:date+size-weighted:raw
   4 -> changes-fixes:date-weighted:normalized
   5 -> changes-fixes:date-weighted:raw
   6 -> changes-fixes:normalized
   7 -> changes-fixes:raw
   8 -> changes-fixes:size-weighted:normalized
   9 -> changes-fixes:size-weighted:raw
  10 -> changes-others:date+size-weighted:normalized
  11 -> changes-others:date+size-weighted:raw
  12 -> changes-others:date-weighted:normalized
  13 -> changes-others:date-weighted:raw
  14 -> changes-others:normalized
  15 -> changes-others:raw
  16 -> changes-others:size-weighted:normalized
  17 -> changes-others:size-weighted:raw
  18 -> changes:date+size-weighted:normalized
  19 -> changes:date+size-weighted:raw
  20 -> changes:date-weighted:normalized
  21 -> changes:date-weighted:raw
  22 -> changes:normalized
  23 -> changes:raw
  24 -> changes:size-weighted:normalized
  25 -> changes:size-weighted:raw


In [29]:
# To Float
# Convert every feature cell of every data row to float, in place.
# The header row and column 0 (the label) are left as they are.
for record in data[1:]:
    for col in range(1, len(record)):
        record[col] = float(record[col])

# Understand the Data

In [None]:
def getColumn(data, name):
    """Return the values of one named column as a list of floats.

    data -- list of rows; data[0] is the header row holding the column names,
            every following row holds that row's values.
    name -- header name of the wanted column. When the header contains the
            name more than once, the first occurrence is used.

    Returns a list with float(row[column]) for every row after the header.

    Raises KeyError when the header has no column called `name`. Previously
    the function fell off the end and returned None in that case, which only
    surfaced later as an obscure failure in the caller.
    Errors raised by float() on non-numeric cells propagate unchanged.
    """
    header = data[0]
    if name not in header:
        raise KeyError('column %r not found in header' % (name,))

    position = header.index(name)
    return [float(row[position]) for row in data[1:]]
        
# Pick two columns to plot against each other and colour the points by label.
X = np.array(getColumn(data, 'changes-fixes:normalized'))
Y = np.array(getColumn(data, '__date'))

# Y = np.random.normal(0, 2, len(X)) # getColumn(data, 'authors')

Z = getColumn(data, '__changed')

# Explicit figure/axes so the plot can carry labels and stands on its own
# when the notebook is skimmed.
fig, ax = plt.subplots()
ax.scatter(X, Y, c=Z, s=100)
ax.set(xlabel='changes-fixes:normalized',
       ylabel='__date',
       title='changes-fixes:normalized vs __date (colour = __changed)')
plt.show()

# Split

In [34]:
# Stratify
# Split the rows by the two-character label stored in column 0: '00' rows are
# treated as clean, '11' rows as buggy. Rows carrying any other value in
# column 0 end up in neither list.
data_clean = [row for row in data if row[0] == '00']
data_buggy = [row for row in data if row[0] == '11']

# Shuffle with a dedicated, seeded generator so the train/test split that is
# sliced from these lists is the same on every run, without touching the
# global `random` state.
shuffle_rng = random.Random(42)
shuffle_rng.shuffle(data_clean)
shuffle_rng.shuffle(data_buggy)

print('%d clean - %d buggy => %d total' % (len(data_clean), len(data_buggy), len(data_clean) + len(data_buggy)))

7165 clean - 1780 buggy => 8945 total


In [None]:
# for i in range(len(data_buggy[0])) : 
#     print(data[0][i+1] + ' => ' + str(np.mean([row[i+1] for row in data_buggy]) - np.mean([row[i+1] for row in data_clean])))

In [35]:
# Train + Test

def slicePercentage(data, fromPerc, toPerc):
    """Return the part of `data` between two fractional positions.

    fromPerc and toPerc are fractions of len(data); each is turned into an
    index by multiplying with the length and rounding down. The result is the
    ordinary slice data[start:stop], so the start is included and the stop is
    excluded.
    """
    total = len(data)
    start = math.floor(total * fromPerc)
    stop = math.floor(total * toPerc)
    return data[start:stop]

def labelIt(data):
    """Separate the label column from the feature columns.

    data -- list of rows whose first element is the label.

    Returns a dict with two parallel lists:
      'labels' -- the first element of every row
      'data'   -- a new list per row holding everything after the first element
    """
    return {
        'data': [row[1:] for row in data],
        'labels': [row[0] for row in data],
    }

# Fraction of the clean rows that goes into training, chosen so that training
# holds 1.6 clean rows for every buggy training row (70% of the buggy rows).
# It is capped at 0.7 because the test set takes its clean rows from 0.7
# onwards: without the cap, a dataset with relatively more buggy rows would
# push the clean training slice past 0.7 and leak test rows into training.
# NOTE: this name shadows the builtin breakpoint(); it is kept because a later
# cell reads it.
breakpoint = min(len(data_buggy) * 0.7 * 1.6 / len(data_clean), 0.7)

# Train: clean rows [0, breakpoint) plus buggy rows [0, 0.7).
train = labelIt(slicePercentage(data_clean, 0, breakpoint) + slicePercentage(data_buggy, 0, 0.7))
# Test: the last 30% of each class.
test = labelIt(slicePercentage(data_clean, 0.7, 1) + slicePercentage(data_buggy, 0.7, 1))

print('%d train - %d test => %d total' % (len(train['data']), len(test['data']), len(test['data']) + len(train['data'])))

3239 train - 2684 test => 5923 total


# Train

In [42]:
# Alternative models that were tried are kept below as commented-out lines.
# clf = tree.DecisionTreeClassifier()
# clf = SVC(C=1.0, class_weight="balanced", probability=True, tol=0.02, verbose=False)
# random_state pins the forest's internal randomness so that a re-run on the
# same training rows produces the same model and the same reported numbers.
clf = RandomForestClassifier(n_estimators=1000, n_jobs=3, class_weight="balanced", criterion='gini', random_state=42)
# clf = AdaBoostClassifier()
clf = clf.fit(train['data'], train['labels'])

In [45]:
# Show the feature values of the first training row (its label is stored
# separately in train['labels']).
train['data'][0]

[4636.0,
 95.0,
 0.03030485363017099,
 4.923737,
 0.166941866587825,
 0.984747,
 0.16666666666666666,
 1.0,
 0.030120481927710843,
 5.0,
 0.03397699801046551,
 96.491225,
 0.06019738008089545,
 1.9693649999999998,
 0.058823529411764705,
 2.0,
 0.03349282296650718,
 98.0,
 0.03370382050643181,
 101.414962,
 0.09686930032845656,
 2.954112,
 0.09523809523809523,
 3.0,
 0.03324468085106383,
 103.0]

In [None]:
# Predict a label for every test row.
predictions = clf.predict(test['data']).tolist()
clean_count = predictions.count('00')
buggy_count = predictions.count('11')
print('Predicting %d clean and %d buggy' % (clean_count, buggy_count))

# Confusion counts, indexed as analysis[trueLabel][prediction]
analysis = {'00': {'00': 0, '11': 0}, '11': {'00': 0, '11': 0}}
for truth, guess in zip(test['labels'], predictions):
    analysis[truth][guess] += 1

print(analysis)

In [43]:
def predictProbs(test_data):
    """Return, for every row, the second class probability reported by `clf`.

    test_data -- rows that still carry their label in position 0; the label is
                 stripped before the rows are handed to the classifier.

    Uses the notebook-level `clf` and returns a plain list with element [1] of
    each row of clf.predict_proba(...).
    NOTE(review): presumably index 1 is the '11' (buggy) class - confirm
    against clf.classes_.
    """
    features = [row[1:] for row in test_data]  # drop the label column
    per_class = clf.predict_proba(features).tolist()
    second_class = []
    for row_probs in per_class:
        second_class.append(row_probs[1])
    return second_class

# Rows that were not used for training: the last 30% of the buggy rows and
# every clean row from `breakpoint` onwards.
buggy_holdout = slicePercentage(data_buggy, 0.7, 1)
clean_holdout = slicePercentage(data_clean, breakpoint, 1)

predict_buggy = predictProbs(buggy_holdout)
predict_clean = predictProbs(clean_holdout)

# Count, mean and standard deviation of the returned probabilities, as percentages.
buggy_stats = (len(predict_buggy), np.mean(predict_buggy)*100, np.std(predict_buggy)*100)
clean_stats = (len(predict_clean), np.mean(predict_clean)*100, np.std(predict_clean)*100)

print('Buggy (%d): mean = %.2f%%; std = %.2f%%' % buggy_stats)
print('Clean (%d): mean = %.2f%%; std = %.2f%%' % clean_stats)

Buggy (534): mean = 56.30%; std = 25.03%
Clean (5172): mean = 25.84%; std = 21.57%


In [None]:
from operator import itemgetter

# Pair every importance reported by the fitted model with its column name.
# Header position 0 is the label, so feature k is named by data[0][k + 1].
importances = [[weight, data[0][pos + 1]] for pos, weight in enumerate(clf.feature_importances_)]

# Most important feature first.
importances.sort(key=itemgetter(0), reverse=True)

for weight, name in importances:
    print('%.3f%% - %s' % (weight * 100, name))

In [None]:
# from sklearn import tree
# from sklearn.externals.six import StringIO
# import graphviz

# def preview_tree(clf, class_names, feature_names) :
#     dot_data = StringIO()
#     tree.export_graphviz(clf, out_file=dot_data, 
#                          rotate=False, rounded=True, filled=True,
#                          class_names=class_names, feature_names=feature_names)
#     dot = graphviz.Source(dot_data.getvalue().replace('digraph Tree {', """digraph Tree {
#          node [ fontname=Arial, fontsize=8];
#     """))
#     dot.render('x', view=True)


In [None]:
# preview_tree(clf.estimators_[0], ['clean', 'buggy'], data[0][1:])