In [110]:
!git clone https://github.com/bstri/cs616

Cloning into 'cs616'...
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 5 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (5/5), done.


Import and parse API info

In [0]:
import csv

classToAPI = {}
shortNameToLong = {}

with open('cs616/eclipseAPI.csv') as f:
  reader = csv.reader(f, quotechar="'", doublequote=False, skipinitialspace=True, escapechar='\\')
  for row in reader:
    shortName = row[1].split('.')[-1]
    if shortName in shortNameToLong:
      shortNameToLong[shortName].append(row[1])
    else:
      shortNameToLong[shortName] = [row[1]]
    classToAPI[row[1]] = row[3]

Import and parse bug report info into Pandas DataFrame

In [0]:
import pandas as pd
#import xml.etree.ElementTree as ET

# This is a one-liner for the record books
# brdf = pd.DataFrame([{c.get('name'): ' '.join((c.text or '').split()) for c in el} for el in ET.parse('eclipseBugReports.xml').getroot()[1]])

brdf = pd.read_csv('cs616/updated_brdf.csv')

Define functions for feature computation

In [0]:
from scipy.spatial.distance import cosine
import math
import numpy as np
import pickle
from IPython.core.debugger import set_trace

class FeatureComputer:
    def __init__(self, brdf):
        self.brdf = brdf.sort_values('commit_timestamp', ascending=False)
        self.relevantFiles = brdf.files.str.extractall(r"([^ ][\w ./]+?\.java)") # remove the '/' if you just want the file name, and not the full path
        self.processedBugReports = []

    def computeFeatures(self, brIndex, br, src):
        pastBR = self.pastBR(brIndex, src[0])
        return dict(similarity=self.sim(br[2], src[2]),
                    tsssSimilarity=self.sim(br[2], src[2], method='tsss'),
                    collaborativeFiltering=self.collaborativeFiltering(brIndex, pastBR),
                    classNameSimilarity=self.classNameSim(br, src),
                    bugFixingRecency=self.bugFixingRecency(brIndex, pastBR),
                    bugFixingFrequency=pastBR.shape[0] # problem: as time goes on, files may continue to be fixed, so maybe a density metric would be better
                    )

    def sim(self, r, s, method='cos'): # r and s should be numpy arrays
        r = r.astype('float64')
        s = s.astype('float64')
        cos = cosine(r,s)
        if cos < 0:
            set_trace()
        if method == 'cos':
            return cos
        elif method == 'tsss':
            theta = math.acos(cos) + 0.17 # authors suggest this 10-degree adjustment
            rMag = (r**2).sum()**.5 # for some reason this is faster than np.linalg.norm
            sMag = (s**2).sum()**.5
            TS = rMag * sMag * math.sin(theta) / 2
            MD = abs(rMag - sMag)
            ED = ((r-s)**2).sum()**.5
            SS = (MD + ED)**2 * theta/2
            return TS*SS
        else:
            raise ValueError

    def getVectorFromBugReportIndex(self, rIndex):
        return self.processedBugReports[self.brdf.shape[0] - 1 - rIndex][2]

    def apiEnrich(self, s):
        pass

    def classNameSim(self, br, src):
        cName = src[3].rpartition('/')[-1]
        return len(cName) if cName in br[3] else 0        

    # returns np array of bug report indices
    def pastBR(self, rIndex, srcName):
        # grab all bug reports for which s was fixed before r was reported
        # reports are already sorted by commit timestamp (descending), so this line will get all bug reports submitted before this one
        br = self.relevantFiles.loc[rIndex+1:][self.relevantFiles.loc[rIndex+1:][0] == srcName].index.get_level_values(0).to_numpy()
        mask = []
        for i in range(br.shape[0]): # here we only get the ones that were fixed before this one was reported
            mask.append(self.brdf.at[br[i], 'commit_timestamp'] <= self.brdf.at[rIndex, 'report_timestamp'])
        return br[mask]

    def bugFixingRecency(self, rIndex, br): 
        if br.shape[0] == 0:
            return 0
        mostRecent = br[0]
        # I think the paper truncates month so an integer is used, but I'd like to try this way
        monthDur = (self.brdf.at[rIndex, 'report_timestamp'] - self.brdf.at[mostRecent, 'commit_timestamp'])/60/60/24/30
        return 1/(monthDur+1)

    def collaborativeFiltering(self, rIndex, br):
        # combine all bug reports in br by summing their vector representations
        if br.shape[0] == 0: 
            return 0
        combined = self.getVectorFromBugReportIndex(br[0])
        for i in range(1,br.shape[0]):
            combined += self.getVectorFromBugReportIndex(br[i])
        return self.sim(self.getVectorFromBugReportIndex(rIndex), combined)

    def createTrainingData(self, numReportsProcessed = 250):
        indices = []
        rows = []
        label = 'relevancy'
        for i in range(brdf.shape[0] - 1, brdf.shape[0] - 1 - numReportsProcessed, -1):
            if i%10 == 0:
                print('processing bug report', i)
            irrelevantFiles = []
            with open("drive/My Drive/Colab Files/{}.pickle".format(i), 'rb') as f:
                processedCommit = pickle.load(f, encoding='utf-8')
            br = processedCommit[1][0]
            self.processedBugReports.append(br)
            for j, src in enumerate(processedCommit[1][1:]):
                features = self.computeFeatures(i, br, src) # dict of feat_name -> val
                if src[0] in self.relevantFiles.loc[(i,),0].values:
                    features[label] = 1
                    # rows.append(features)
                    # indices.append((i,src[0]))
                else:
                    features[label] = 0
                rows.append(features)
                indices.append((i,src[0]))
                # else:
                #     irrelevantFiles.append((j+1,self.sim(br[2],src[2])))
            
            # irrelevantFiles.sort(key=lambda t: t[1], reverse=True) # sort irrelevant files by inc. similarity 
            # # but will that produce irrelevant files with higher similarities than relevant?
            # # something to check. If so, perhaps just randomly pick?
            # for j in range(numIrrelevantFiles):
            #     indexInCommit = irrelevantFiles[j][0]
            #     src = processedCommit[1][indexInCommit]
            #     features = self.computeFeatures(i, br, src)
            #     features[label] = 0
            #     rows.append(features)
            #     indices.append((i,src[0]))

        trainingData = pd.DataFrame(data=rows, index=pd.MultiIndex.from_tuples(indices))

        return trainingData

# Make training data

In [13]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# fc = FeatureComputer(brdf)
# trainingData = fc.createTrainingData(numReportsProcessed=250)
# trainingData.to_csv('trainingData250.csv')

processing bug report 6490
processing bug report 6480
processing bug report 6470
processing bug report 6460
processing bug report 6450
processing bug report 6440
processing bug report 6430
processing bug report 6420
processing bug report 6410
processing bug report 6400
processing bug report 6390
processing bug report 6380
processing bug report 6370
processing bug report 6360
processing bug report 6350
processing bug report 6340
processing bug report 6330
processing bug report 6320
processing bug report 6310
processing bug report 6300
processing bug report 6290
processing bug report 6280
processing bug report 6270
processing bug report 6260
processing bug report 6250


# Define machine learning model

In [0]:
import tensorflow as tf
from tensorflow.keras import layers

def makeModel(numFeatures):
    model = tf.keras.Sequential()
    model.add(layers.Dense(10, input_shape=(numFeatures,), activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    model.add(layers.Dense(1, activation='linear'))
    # model.add(layers.Dense(1, activation='sigmoid'))

    return model

# Train model

In [0]:
trainingData = pd.read_csv('cs616/trainingData250.csv', index_col=[0,1])
trainingData.index.set_names(['br', 'src'], inplace=True)

In [0]:
relevantMask = trainingData['relevancy'] == 1
relevants = trainingData[relevantMask]
irrelevants = trainingData[~relevantMask].sort_values(['br','similarity'], ascending=False)

In [158]:
relevants.shape

(440, 7)

In [159]:
irrelevants.shape

(297610, 7)

In [197]:
# grab top 300 irrelevant files per bug report
partialTrainingData = pd.concat([relevants, irrelevants.groupby(level=0).head(300)])
partialTrainingData.pop('classNameSimilarity')
# partialTrainingData = (partialTrainingData - partialTrainingData.min())/(partialTrainingData.max() - partialTrainingData.min())
# validation = partialTrainingData.drop(df.index())
# yTrain = partialTrainingData.pop('relevancy')
partialTrainingData.shape

(75440, 6)

In [198]:
validationSize = 25
rnd = np.random.choice(partialTrainingData.index.get_level_values(0), size=validationSize, replace=False)

XValidation = partialTrainingData.loc[rnd]
yValidation = XValidation.pop('relevancy')
XValidation = (XValidation - XValidation.min())/(XValidation.max() - XValidation.min())

XTrain = partialTrainingData.drop(rnd)
yTrain = XTrain.pop('relevancy')
XTrain = (XTrain - XTrain.min())/(XTrain.max() - XTrain.min())

print(XValidation.shape, XTrain.shape)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


(7558, 5) (67882, 5)


In [199]:
model = makeModel(XTrain.shape[1])
# model.compile(optimizer='Adam', metrics=['accuracy'], loss=custom_loss(recall_weight=0.9, spec_weight=0.1))
opt = tf.keras.optimizers.Adam(learning_rate=.0001)
model.compile(optimizer=opt, loss=tf.keras.losses.MeanSquaredError())
# model.compile(optimizer=opt, metrics=['accuracy'], loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=.2))
model.fit(XTrain.values, yTrain.values, batch_size = 50, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7efe1d5784a8>

# Evaluate Model

In [0]:
# from IPython.core.debugger import set_trace
def getRelevantFileRankings(brIndex):
    # set_trace()
    testBr = trainingData.loc[brIndex]
    yTest = testBr.pop('relevancy')
    testBr.pop('classNameSimilarity')
    pred = model(testBr.values, training=False)
    n = pred.numpy()
    comb = np.column_stack((n.reshape((-1,)), yTest.values))
    s = comb[comb[:,0].argsort()][::-1]
    # s = np.sort(comb, axis=0)[::-1]
    return np.argwhere(s[:,1] == 1).reshape((-1,))

In [0]:
def meanReciprocalRank(rankings):
    rankSum = 0
    for r in rankings:
        rankSum += 1/(r[0]+1)
    return rankSum/len(rankings)

def meanAvgPrecision(rankings, k):
    sumAvgPrec = 0
    for r in rankings:
        sumAvgPrec += (r[r < k].shape[0])/r.shape[0]
    return sumAvgPrec/len(rankings)

def accuracy(rankings, k):
    count = 0
    for r in rankings:
        if (r < k).sum() >= 1:
            count += 1
    return count/len(rankings)

In [0]:
brFileRankings = []
for i in range(brdf.shape[0] - 1, brdf.shape[0] - 1 - 250, -1):
    brFileRankings.append(getRelevantFileRankings(i))

validationFileRankings = []
for i in range(validationSize):
    validationFileRankings.append(getRelevantFileRankings(rnd[i]))

In [201]:
print('MRR on all processed bug reports:', meanReciprocalRank(brFileRankings))
print('MAP@20:', meanAvgPrecision(brFileRankings, 20))
print('MAP@10:', meanAvgPrecision(brFileRankings, 10))
print('MAP@5:', meanAvgPrecision(brFileRankings, 5))
print('Acc@10:', accuracy(brFileRankings, 10))
print('Acc@20:', accuracy(brFileRankings, 20))

print('MRR on 25 withheld bug reports:', meanReciprocalRank(validationFileRankings))
print('MAP@20:', meanAvgPrecision(validationFileRankings, 20))
print('MAP@10:', meanAvgPrecision(validationFileRankings, 10))
print('MAP@5:', meanAvgPrecision(validationFileRankings, 5))
print('Acc@10:', accuracy(validationFileRankings, 10))
print('Acc@20:', accuracy(validationFileRankings, 20))

MRR on all processed bug reports: 0.08022405381432912
MAP@20: 0.19604126984126985
MAP@10: 0.14809841269841265
MAP@5: 0.10179365079365078
Acc@10: 0.208
Acc@20: 0.264
MRR on 25 withheld bug reports: 0.10140956412588333
MAP@20: 0.21733333333333335
MAP@10: 0.17733333333333334
MAP@5: 0.11333333333333334
Acc@10: 0.24
Acc@20: 0.28


In [196]:
validationFileRankings

[array([  6,  52, 197]),
 array([117]),
 array([157, 246]),
 array([117]),
 array([27]),
 array([61]),
 array([25]),
 array([  8, 309, 359]),
 array([1116]),
 array([5]),
 array([  3, 549]),
 array([3]),
 array([ 5, 32, 83]),
 array([474]),
 array([223]),
 array([ 78, 595]),
 array([29]),
 array([46]),
 array([458]),
 array([18]),
 array([  2,   3,  30,  49,  57,  93, 115, 118, 187]),
 array([ 7,  9, 53]),
 array([1116]),
 array([ 21, 384, 441, 442]),
 array([149])]

In [0]:
model.save('reg3Model.h5')

In [203]:
model.summary()

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_48 (Dense)             (None, 10)                60        
_________________________________________________________________
batch_normalization_20 (Batc (None, 10)                40        
_________________________________________________________________
leaky_re_lu_20 (LeakyReLU)   (None, 10)                0         
_________________________________________________________________
dense_49 (Dense)             (None, 10)                110       
_________________________________________________________________
batch_normalization_21 (Batc (None, 10)                40        
_________________________________________________________________
leaky_re_lu_21 (LeakyReLU)   (None, 10)                0         
_________________________________________________________________
dense_50 (Dense)             (None, 1)               

# Java parsing experimentation

In [1]:
# !pip install javalang

Collecting javalang
  Downloading https://files.pythonhosted.org/packages/cb/e0/12344443d66b9a84844171be90112892a371da6db09866741774b8bc0a2f/javalang-0.13.0-py3-none-any.whl
Installing collected packages: javalang
Successfully installed javalang-0.13.0


In [0]:
# import javalang
# with open('sample.java') as f:
#     j = f.read()
# tree = javalang.parse.parse(j)

In [42]:
# with open('sample.java') as f:    
#     lines = f.readlines()

# # javalang.parse.parse_type(lines[35].strip())
# javalang.parse.parse_type('private ITextStore i = 4;')

JavaSyntaxError: ignored

In [0]:
# i = 0
# for path, node in tree.filter(javalang.tree.TypeDeclaration):
#     print(i)
#     print(path, node)
#     i += 1