#### TODO
* Precision-Recall http://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html
* Use Git metadata as a feature

#### Columns
0. id
1. repository_id
2. blamed_commit_id
3. type
4. sha
5. url
6. author_email
7. author_name
8. author_when
9. committer_email
10. committer_name
11. committer_when
12. additions
13. deletions
14. total_changes
15. past_changes
16. future_changes
17. past_different_authors
18. future_different_authors
19. author_contributions_percent
20. message
21. patch
22. hunk_count
23. cve
24. files_changed
25. patch_keywords

In [1]:
import psycopg2
# Open a connection to the PostgreSQL server on localhost:55432, database
# "postgres", as user "postgres". Neither the connection nor the cursor is
# closed in this cell.
conn = psycopg2.connect("dbname=postgres host=localhost port=55432 user=postgres")
cur = conn.cursor()
# Count the rows of export.commits per commit type.
cur.execute("SELECT type, COUNT(id) FROM export.commits GROUP BY type")
# Last expression of the cell: the fetched (type, count) rows become the
# cell output.
cur.fetchall()


[('blamed_commit', 714L), ('fixing_commit', 1137L), ('other_commit', 349558L)]

In [48]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import label_binarize
from sklearn import cross_validation
import numpy as np
import scipy as sp
import enum
from datetime import datetime
from diff_extractor import extract_lines, extract_added_lines, extract_removed_lines
from unidiff.errors import UnidiffParseError

class Colum(enum.IntEnum):
    """Column offsets into a row of the exported commit table.

    NOTE(review): the members do not share one indexing convention.
    ``type`` and ``patch`` carry the 0-based positions from the "Columns"
    list and are used directly as array indices. The metric members
    (``additions`` ... ``files_changed``) are one higher than their
    position in that list, and the script subtracts 1 before indexing with
    them. ``id`` is not used by the visible code; the list puts it at
    position 0, so it looks 1-based as well -- confirm before relying on it.
    """
    id = 1
    type = 3  # used as-is: data[:, Colum.type]
    # Metric columns: stored as (list position + 1).
    additions = 13
    deletions = 14
    past_different_authors = 18
    future_different_authors = 19
    author_contributions_percent = 20
    patch = 21  # used as-is: data[:, Colum.patch]
    hunk_count = 23
    files_changed = 25
    
def fetch(filename, key=None):
    """Load one array from a NumPy ``.npz`` archive.

    Args:
        filename: Archive location, handed unchanged to ``np.load``.
        key: Name of the array to read. When ``None``, the first entry
            listed in the archive (``npz.files[0]``) is used.

    Returns:
        The array stored under ``key``.

    Raises:
        KeyError: If ``key`` is not an entry of the archive.
        IndexError: If ``key`` is ``None`` and the archive has no entries.
    """
    # The context manager closes the archive even when the lookup below
    # raises; the previous explicit close() was skipped on any error.
    with np.load(filename) as npz:
        if key is None:
            key = npz.files[0]
        # Indexing reads the array into memory, so the result stays valid
        # after the archive has been closed.
        return npz[key]


# Build the feature matrix X2 = [Git metadata metrics | bag-of-words of patch].
# The print calls use the parenthesised single-argument form so the cell runs
# unchanged on both Python 2 and Python 3.
print('Started loading data')
data = fetch('var/vcc_sample_40x800.npz')
# Full data set (much larger): data = fetch('var/vcc_data.npz')
print('Data loaded #%d' % len(data))

patches = data[:, Colum.patch]
labels = data[:, Colum.type]

# NOTE: patches are expected to be unicode already (e.g. decoded with
# unicode(patch, 'utf-8')); the extracted lines are joined with a unicode
# separator below.
print('Start extracting lines...')
# Pre-filled so that a patch that fails to parse keeps an empty document at
# its own row index and the rows stay aligned with `data`.
cleansed_paches = [u''] * len(patches)
invalid_paches = []

# TODO: Replace every numeric token N by log(N) in each patch.
for index, patch in enumerate(patches):
    try:
        cleansed_paches[index] = u' '.join(extract_lines(patch.splitlines()))
    except UnidiffParseError as e:
        # TODO: Recover these; 445 patches in total fail in vcc_data.npz.
        invalid_paches.append((index, patch, e))

print('Completed extracting lines including #%d invalid patches.' % len(invalid_paches))

vectorizer = CountVectorizer(min_df=1)
vectorized = vectorizer.fit_transform(cleansed_paches)  # csr_matrix
# feature_names = vectorizer.get_feature_names()
# toarray() materialises the sparse counts as a dense ndarray.
X = vectorized.toarray()

# X is now a dense array that is mostly zeros, e.g.:
# [[0 0 0 ..., 0 0 0]
#  [0 0 0 ..., 0 0 0]
#  [0 0 0 ..., 0 0 0]
#  ...,
#  [0 0 0 ..., 0 0 0]
#  [0 0 0 ..., 0 0 0]
#  [0 0 0 ..., 0 0 0]]

# Bind metrics from Git metadata. The metric members of Colum are one higher
# than the array position, hence the "- 1". A real list (not a lazy map
# object) is required for NumPy fancy indexing.
target_metrics = [column - 1 for column in (
    Colum.additions,
    Colum.deletions,
    Colum.past_different_authors,
    Colum.future_different_authors,
    Colum.author_contributions_percent,
    Colum.hunk_count,
    Colum.files_changed,
)]

# Select all metric columns in one indexing operation and put them in front
# of the token counts.
X2 = np.hstack((data[:, target_metrics], X))
# The combined X2 looks like:
# [[3L 66L 7L ..., 0L 0L 0L]
#  [54L 23L 6L ..., 0L 0L 0L]
#  [3L 1L 12L ..., 0L 0L 0L]
#  ...,
#  [1L 2L 1L ..., 0L 0L 0L]
#  [46L 21L 0L ..., 0L 0L 0L]
#  [32L 12L 42L ..., 0L 0L 0L]]

!osascript -e 'display notification "Done" with title "VCC-Note" sound name "Purr"'


Started loading data
Data loaded #840
Start extracting lines...
Completed extracting lines including #0 invalid patches.
[[0.001128002046518 7L 25L ..., 0L 0L 0L]
 [0.0145636640169885 6L 15L ..., 0L 0L 0L]
 [0.0536618754277892 12L 14L ..., 0L 0L 0L]
 ..., 
 [0.00174236030399655 1L 12L ..., 0L 0L 0L]
 [0.310701064654059 0L 188L ..., 0L 0L 0L]
 [0.0208913249347146 42L 50L ..., 0L 0L 0L]]
