#### TODO
* Precision-Recall http://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html
* Use Git metadata as features

#### Columns
0. id
1. repository_id
2. blamed_commit_id
3. type
4. sha
5. url
6. author_email
7. author_name
8. author_when
9. committer_email
10. committer_name
11. committer_when
12. additions
13. deletions
14. total_changes
15. past_changes
16. future_changes
17. past_different_authors
18. future_different_authors
19. author_contributions_percent
20. message
21. patch
22. hunk_count
23. cve
24. files_changed
25. patch_keywords

In [1]:
import psycopg2
# Sanity check: connect to the local PostgreSQL instance and count the rows of
# export.commits per commit type.
# NOTE(review): neither `conn` nor `cur` is closed in this cell.
conn = psycopg2.connect("dbname=postgres host=localhost port=55432 user=postgres")
cur = conn.cursor()
cur.execute("SELECT type, COUNT(id) FROM export.commits GROUP BY type")
# Last expression of the cell, so the notebook displays the (type, count) rows.
cur.fetchall()


[('blamed_commit', 714L), ('fixing_commit', 1137L), ('other_commit', 349558L)]

In [11]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import cross_validation
import numpy as np
import scipy as sp
import psycopg2
import enum

class Colum(enum.IntEnum):
    """Positional indices of the columns used from a commit row.

    The values follow the column list documented at the top of this
    notebook; because this is an IntEnum, a member can be used directly as a
    NumPy column index (e.g. ``sample[:, Colum.patch]``).
    """
    id = 0      # was 1, which is `repository_id` in the documented column order
    type = 3    # commit type label, compared against 'blamed_commit'
    patch = 21  # patch text of the commit
    
class Data:
    """Small wrapper around a psycopg2 connection that returns query results as arrays."""
    def __init__(self, config):
        """Connect using the libpq-style connection string `config` and open a cursor."""
        # Keep a reference to the connection: the original discarded it, so
        # the connection could never be closed explicitly.
        self.conn = psycopg2.connect(config)
        self.cur = self.conn.cursor()
    def fetchAll(self, sql):
        """Execute `sql` and return every result row wrapped in a NumPy array."""
        self.cur.execute(sql)
        return np.array(self.cur.fetchall())
    def close(self):
        """Release the cursor and the underlying database connection."""
        self.cur.close()
        self.conn.close()

def tfidf(text):
    """Fit a TfidfVectorizer(min_df=1) on `text` and return the dense TF-IDF matrix."""
    # @TODO preprocessor: log number
    model = TfidfVectorizer(min_df=1)
    # fit_transform yields a sparse matrix; callers receive it densified.
    return model.fit_transform(text).toarray()

# Fetch original data: a random sample of up to 50 rows typed 'blamed_commit'
# (vcc) and up to 1000 rows of any other type (ucc).
data = Data("dbname=postgres host=localhost port=55432 user=postgres")
vcc = data.fetchAll("SELECT * FROM export.commits WHERE type  = 'blamed_commit' ORDER BY RANDOM() LIMIT 50")
ucc = data.fetchAll("SELECT * FROM export.commits WHERE type != 'blamed_commit' ORDER BY RANDOM() LIMIT 1000")

# Concatenate once and shuffle the rows in place
sample = np.concatenate([vcc, ucc])
np.random.shuffle(sample)
patches = sample[:, Colum.patch]
labels = sample[:, Colum.type]

# Features: dense TF-IDF matrix of the patch column.
# Target: True for rows whose type is 'blamed_commit'.
x = tfidf(patches)
y = is_vcc = (labels == 'blamed_commit')
# NOTE(review): vcc_paches / ucc_paches are not used again in this cell.
vcc_paches = patches[is_vcc]
ucc_paches = patches[~is_vcc]

# 5-fold cross-validation of a linear SVM; no `scoring` argument is passed.
# NOTE(review): with 50 positives among 1050 rows, predicting "negative" for
# everything already scores about 0.95, so the printed figure may just be the
# majority-class baseline - confirm with precision/recall.
clf = LinearSVC(C=1.0)
scores = cross_validation.cross_val_score(clf, x, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


Accuracy: 0.95 (+/- 0.00)


In [None]:
# Precision-Recall by TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import label_binarize
from sklearn import cross_validation
import numpy as np
import scipy as sp
import psycopg2
import enum

class Colum(enum.IntEnum):
    """Positional indices of the columns used from a commit row.

    The values follow the column list documented at the top of this
    notebook; because this is an IntEnum, a member can be used directly as a
    NumPy column index (e.g. ``sample[:, Colum.patch]``).
    """
    id = 0      # was 1, which is `repository_id` in the documented column order
    type = 3    # commit type label, compared against 'blamed_commit'
    patch = 21  # patch text of the commit
    
class Data:
    """Small wrapper around a psycopg2 connection that returns query results as arrays."""
    def __init__(self, config):
        """Connect using the libpq-style connection string `config` and open a cursor."""
        # Keep a reference to the connection: the original discarded it, so
        # the connection could never be closed explicitly.
        self.conn = psycopg2.connect(config)
        self.cur = self.conn.cursor()
    def fetchAll(self, sql):
        """Execute `sql` and return every result row wrapped in a NumPy array."""
        self.cur.execute(sql)
        return np.array(self.cur.fetchall())
    def close(self):
        """Release the cursor and the underlying database connection."""
        self.cur.close()
        self.conn.close()

def tfidf(text):
    """Return the dense TF-IDF matrix of `text`, using a freshly fitted TfidfVectorizer(min_df=1)."""
    sparse_matrix = TfidfVectorizer(min_df=1).fit_transform(text)
    # Densify so callers get a plain ndarray rather than a sparse matrix.
    dense_matrix = sparse_matrix.toarray()
    return dense_matrix

# Result containers; only key 0 is ever filled in this cell.
precision = dict()
recall = dict()
average_precision = dict()

# Fetch original data: up to 5 random 'blamed_commit' rows (vcc) and up to 100
# random rows of any other type (ucc).
data = Data("dbname=postgres host=localhost port=55432 user=postgres")
vcc = data.fetchAll("SELECT * FROM export.commits WHERE type  = 'blamed_commit' ORDER BY RANDOM() LIMIT 5")
ucc = data.fetchAll("SELECT * FROM export.commits WHERE type != 'blamed_commit' ORDER BY RANDOM() LIMIT 100")

# Concatenate once and shuffle the rows in place
sample = np.concatenate([vcc, ucc])
np.random.shuffle(sample)
patches = sample[:, Colum.patch]
labels = sample[:, Colum.type]

# Features: dense TF-IDF matrix of the patch column.
# Target: True for rows whose type is 'blamed_commit'.
X = tfidf(patches)
y = is_vcc = (labels == 'blamed_commit')

# Split into training and test: half the rows are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

# Run classifier: fit on the training half, then take the continuous decision
# scores (not hard labels) for the held-out half
classifier = LinearSVC(C=1.0)
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute the Precision-Recall curve and the average precision
precision[0], recall[0], _ = precision_recall_curve(y_test, y_score)
average_precision[0] = average_precision_score(y_test, y_score)

# Dumps the full arrays (Python 2 print statement)
print precision, recall, average_precision

# Plot Precision-Recall curve
# NOTE(review): the title labels the number "AUC", but the value shown is
# average_precision_score.
plt.clf()
plt.plot(recall[0], precision[0], label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall example: AUC={0:0.2f}'.format(average_precision[0]))
plt.legend(loc="lower left")
plt.show()

# # Plot Precision-Recall curve for each class
# plt.clf()
# plt.plot(recall[0], precision[0], label='Precision-recall curve of class {0} (area = {1:0.2f})'
#          ''.format(i, average_precision[0]))

# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('Recall')
# plt.ylabel('Precision')
# plt.title('Extension of Precision-Recall curve to multi-class')
# plt.legend(loc="lower right")
# plt.show()

In [2]:
# Precision-Recall by CountVectorizer
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import label_binarize
from sklearn import cross_validation
import numpy as np
import scipy as sp
import psycopg2
import enum

class Colum(enum.IntEnum):
    """Positional indices of the columns used from a commit row.

    The values follow the column list documented at the top of this
    notebook; because this is an IntEnum, a member can be used directly as a
    NumPy column index (e.g. ``sample[:, Colum.patch]``).
    """
    id = 0      # was 1, which is `repository_id` in the documented column order
    type = 3    # commit type label, compared against 'blamed_commit'
    patch = 21  # patch text of the commit
    
class Data:
    """Small wrapper around a psycopg2 connection that returns query results as arrays."""
    def __init__(self, config):
        """Connect using the libpq-style connection string `config` and open a cursor."""
        # Keep a reference to the connection: the original discarded it, so
        # the connection could never be closed explicitly.
        self.conn = psycopg2.connect(config)
        self.cur = self.conn.cursor()
    def fetchAll(self, sql):
        """Execute `sql` and return every result row wrapped in a NumPy array."""
        self.cur.execute(sql)
        return np.array(self.cur.fetchall())
    def close(self):
        """Release the cursor and the underlying database connection."""
        self.cur.close()
        self.conn.close()

def vectorize(text):
    """Fit a CountVectorizer(min_df=1) on `text` and return the dense term-count matrix."""
    counter = CountVectorizer(min_df=1)
    # fit_transform yields a sparse matrix; callers receive it densified.
    return counter.fit_transform(text).toarray()

# Result containers; only key 0 is ever filled in this cell.
precision = dict()
recall = dict()
average_precision = dict()

# Fetch original data: up to 5 random 'blamed_commit' rows (vcc) and up to 100
# random rows of any other type (ucc).
data = Data("dbname=postgres host=localhost port=55432 user=postgres")
vcc = data.fetchAll("SELECT * FROM export.commits WHERE type  = 'blamed_commit' ORDER BY RANDOM() LIMIT 5")
ucc = data.fetchAll("SELECT * FROM export.commits WHERE type != 'blamed_commit' ORDER BY RANDOM() LIMIT 100")

# Concatenate once and shuffle the rows in place
sample = np.concatenate([vcc, ucc])
np.random.shuffle(sample)
patches = sample[:, Colum.patch]
labels = sample[:, Colum.type]

# Features: dense term-count matrix of the patch column.
# Target: True for rows whose type is 'blamed_commit'.
X = vectorize(patches)
y = is_vcc = (labels == 'blamed_commit')

# Split into training and test: half the rows are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

# Run classifier: fit on the training half, then take the continuous decision
# scores (not hard labels) for the held-out half
classifier = LinearSVC(C=1.0)
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute the Precision-Recall curve and the average precision
precision[0], recall[0], _ = precision_recall_curve(y_test, y_score)
average_precision[0] = average_precision_score(y_test, y_score)

# NOTE(review): the print below is disabled, yet the output saved under this
# cell shows printed arrays - that output looks stale.
#print precision, recall, average_precision

# Plot Precision-Recall curve
# NOTE(review): the title labels the number "AUC", but the value shown is
# average_precision_score.
plt.clf()
plt.plot(recall[0], precision[0], label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall example: AUC={0:0.2f}'.format(average_precision[0]))
plt.legend(loc="lower left")
plt.show()


{0: array([ 0.05882353,  0.04      ,  0.04081633,  0.04166667,  0.04255319,
        0.04347826,  0.04444444,  0.04545455,  0.04651163,  0.02380952,
        0.02439024,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ])} {0: array([ 1.        ,  0.66666667,  0.66666667,  0.66666667,  0.66666667,
        0.66666667,  0.66666667,  0.66666667,  0.66666667,  0.33333333,
        0.33333333,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  

In [2]:
# Precision-Recall
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import label_binarize
from sklearn import cross_validation
import numpy as np
import scipy as sp
import psycopg2
import enum
from datetime import datetime
from diff_extractor import extract_lines, extract_added_lines, extract_removed_lines

import re
from unidiff import PatchSet


def _normalize(str):
    return re.sub(r'[ \t]+', ' ', str.value.strip())

def is_added_or_removed(line):
    """Predicate: truthy when `line` is flagged as added or as removed."""
    added = line.is_added
    if added:
        # Hand back the flag itself, exactly as `a or b` would.
        return added
    return line.is_removed
    
class Colum(enum.IntEnum):
    """Positional indices of the columns used from a commit row.

    The values follow the column list documented at the top of this
    notebook; because this is an IntEnum, a member can be used directly as a
    NumPy column index (e.g. ``sample[:, Colum.patch]``).
    """
    id = 0      # was 1, which is `repository_id` in the documented column order
    type = 3    # commit type label, compared against 'blamed_commit'
    patch = 21  # patch text of the commit
    
class Data:
    """Small wrapper around a psycopg2 connection that returns query results as arrays."""
    def __init__(self, config):
        """Connect using the libpq-style connection string `config` and open a cursor."""
        # Keep a reference to the connection: the original discarded it, so
        # the connection could never be closed explicitly.
        self.conn = psycopg2.connect(config)
        self.cur = self.conn.cursor()
    def fetchAll(self, sql):
        """Execute `sql` and return every result row wrapped in a NumPy array."""
        self.cur.execute(sql)
        return np.array(self.cur.fetchall())
    def close(self):
        """Release the cursor and the underlying database connection."""
        self.cur.close()
        self.conn.close()

def vectorize(text):
    """Return the dense term-count matrix of `text` from a freshly fitted CountVectorizer(min_df=1)."""
    sparse_counts = CountVectorizer(min_df=1).fit_transform(text)
    # Densify so callers get a plain ndarray rather than a sparse matrix.
    dense_counts = sparse_counts.toarray()
    return dense_counts

# Result containers; only key 0 is ever filled in this cell.
precision = dict()
recall = dict()
average_precision = dict()

# Fetch original data: up to 5 'blamed_commit' rows (vcc) and up to 100 rows of
# any other type (ucc).
# NOTE(review): there is no ORDER BY here, so which rows LIMIT returns is left
# to the database.
data = Data("dbname=postgres host=localhost port=55432 user=postgres")
vcc = data.fetchAll("SELECT * FROM export.commits WHERE type  = 'blamed_commit' LIMIT 5")
ucc = data.fetchAll("SELECT * FROM export.commits WHERE type != 'blamed_commit' LIMIT 100")

# Concatenate once and shuffle the rows in place
sample = np.concatenate([vcc, ucc])
np.random.shuffle(sample)
patches = sample[:, Colum.patch]
labels = sample[:, Colum.type]

# Each patch is round-tripped through UTF-8 with errors ignored, split into
# lines, and passed to extract_lines with the is_added_or_removed predicate.
# The returned lines are joined with spaces into one document per patch.
# NOTE(review): extract_lines comes from diff_extractor; presumably it keeps
# only the lines accepted by the predicate - confirm against that module.
X = vectorize([" ".join(extract_lines(patch.decode('utf8', 'ignore').encode('utf8', 'ignore').splitlines(), is_added_or_removed)) for patch in patches])

# Target: True for rows whose type is 'blamed_commit'
y = is_vcc = (labels == 'blamed_commit')

# Split into training and test: half the rows are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

# Run classifier: fit on the training half, then take the continuous decision
# scores (not hard labels) for the held-out half
classifier = LinearSVC(C=1.0)
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute the Precision-Recall curve and the average precision
precision[0], recall[0], _ = precision_recall_curve(y_test, y_score)
average_precision[0] = average_precision_score(y_test, y_score)

# Dumps the full arrays (Python 2 print statement)
print precision, recall, average_precision

# Plot Precision-Recall curve and save it to a timestamped file
# NOTE(review): the title labels the number "AUC", but the value shown is
# average_precision_score.
plt.clf()
plt.plot(recall[0], precision[0], label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall example: AUC={0:0.2f}'.format(average_precision[0]))
plt.legend(loc="lower left")
# plt.show()
# NOTE(review): strftime('%s') (epoch seconds) is a platform extension, not a
# directive documented by Python - confirm it works on the target OS.
plt.savefig("figure_%s" % datetime.now().strftime('%s'))


{0: array([ 0.06      ,  0.04081633,  0.04166667,  0.04255319,  0.04347826,
        0.02222222,  0.02272727,  0.02325581,  0.02380952,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ])} {0: array([ 1.        ,  0.66666667,  0.66666667,  0.66666667,  0.66666667,
        0.33333333,  0.33333333,  0.33333333,  0.33333333,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        

In [None]:
# TODO: According to the VCC-Finder paper, combining with Git metrics improves the Precision-Recall score

In [1]:
import numpy as np

# Open the saved archive and list the names of the arrays it contains.
npz = np.load("var/vcc_data.npz")
print npz.files
# npz['arr_0']
# Close the archive explicitly once its contents have been listed.
npz.close()


['arr_0']


In [8]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import label_binarize
from sklearn import cross_validation
import numpy as np
import scipy as sp
import enum
from datetime import datetime
from diff_extractor import extract_lines, extract_added_lines, extract_removed_lines

class Colum(enum.IntEnum):
    """Positional indices of the columns used from a commit row.

    The values follow the column list documented at the top of this
    notebook; because this is an IntEnum, a member can be used directly as a
    NumPy column index (e.g. ``data[:, Colum.patch]``).
    """
    id = 0      # was 1, which is `repository_id` in the documented column order
    type = 3    # commit type label, compared against 'blamed_commit'
    patch = 21  # patch text of the commit
    
def fetch(filename, key = None):
    """Load a single array from an ``.npz`` archive.

    Parameters
    ----------
    filename : path of the ``.npz`` file, handed to ``np.load``.
    key : name of the array to read. When ``None``, the first name listed
        in the archive is used.

    Returns
    -------
    The array stored under ``key``.

    Raises
    ------
    KeyError : ``key`` is not present in the archive.
    IndexError : ``key`` is ``None`` and the archive lists no arrays.
    """
    # The context manager closes the archive even when the lookup raises; the
    # original left the file open in that case.
    with np.load(filename) as npz:
        if key is None:
            key = npz.files[0]
        return npz[key]

def vectorize(text):
    """Build the dense bag-of-words count matrix for `text` with CountVectorizer(min_df=1)."""
    bag_of_words = CountVectorizer(min_df=1)
    term_counts = bag_of_words.fit_transform(text)
    # The sparse result is expanded to a regular ndarray before returning.
    return term_counts.toarray()

# Load the pre-sampled commit rows from disk instead of querying the database.
data = fetch('var/vcc_sample_40x800.npz')

patches = data[:, Colum.patch]
labels = data[:, Colum.type]

# Note: patches are presumably expected to be unicode already
# (i.e. unicode(patch, 'utf-8')) - TODO confirm how the archive was written.
# extract_lines is called here with the line list only (no predicate); its
# default behaviour lives in diff_extractor and is not visible from this cell.
X = vectorize([" ".join(extract_lines(patch.splitlines())) for patch in patches])
# Target: True for rows whose type is 'blamed_commit'
y = is_vcc = (labels == 'blamed_commit')

# Split into training and test: half the rows are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

# Run classifier: fit on the training half, then take the continuous decision
# scores (not hard labels) for the held-out half
classifier = LinearSVC(C=1.0)
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute the Precision-Recall curve and the average precision; only key 0 of
# each container is filled
precision = dict()
recall = dict()
average_precision = dict()
precision[0], recall[0], _ = precision_recall_curve(y_test, y_score)
average_precision[0] = average_precision_score(y_test, y_score)

# Dumps the full arrays (Python 2 print statement)
print precision, recall, average_precision

# Plot Precision-Recall curve
# NOTE(review): the title labels the number "AUC", but the value shown is
# average_precision_score.
plt.clf()
plt.plot(recall[0], precision[0], label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall example: AUC={0:0.2f}'.format(average_precision[0]))
plt.legend(loc="lower left")
# Save BEFORE showing: with most backends plt.show() releases the current
# figure, so a savefig() issued afterwards writes an empty canvas.
# NOTE(review): strftime('%s') (epoch seconds) is a platform extension, not a
# directive documented by Python - confirm it works on the target OS.
plt.savefig("figure_%s" % datetime.now().strftime('%s'))
plt.show()


{0: array([ 0.03381643,  0.031477  ,  0.0315534 ,  0.03163017,  0.03170732,
        0.03178484,  0.03186275,  0.03194103,  0.0320197 ,  0.03209877,
        0.03217822,  0.03225806,  0.03233831,  0.03241895,  0.0325    ,
        0.03258145,  0.03266332,  0.03274559,  0.03282828,  0.03291139,
        0.03299492,  0.03307888,  0.03316327,  0.03324808,  0.03333333,
        0.03341902,  0.03350515,  0.03359173,  0.03367876,  0.03376623,
        0.03385417,  0.03394256,  0.03403141,  0.03412073,  0.03421053,
        0.03430079,  0.03439153,  0.03183024,  0.03191489,  0.032     ,
        0.03208556,  0.03217158,  0.03225806,  0.03234501,  0.03243243,
        0.03252033,  0.0298913 ,  0.02997275,  0.03005464,  0.03013699,
        0.03021978,  0.03030303,  0.03038674,  0.03047091,  0.03055556,
        0.02785515,  0.02793296,  0.0280112 ,  0.02808989,  0.02816901,
        0.02824859,  0.02549575,  0.02556818,  0.02564103,  0.02571429,
        0.02578797,  0.02586207,  0.0259366 ,  0.02601156,  

In [2]:
import sys
# Show the interpreter's default string encoding (the saved output is 'ascii').
print sys.getdefaultencoding()


ascii
