<center><h1>Step 0 - Preprocessing</h1></center> 

In this section, we first read the data including the bug reports and source code files of all 12 projects and for ease of access, we save them as two pickle files in the ./Data directory. Therefore, this set of code will populate the ./Data directory with "allBugReports.pickle" which is a pandas dataframe that contains all the bug reports from all projects and "allSourceCodes.pickle" which is a pandas dataframe that contains all source files after preprocessing.

### Required Libraries

In [1]:
!pip install javalang



In [852]:
from __future__ import division
import pandas as pd
import numpy as np
import os
from os import listdir
from os.path import isfile, join
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
import warnings
import javalang
import re
import glob
import math
import time
from scipy import spatial
import scipy.spatial.distance
import xml.etree.ElementTree as ET
import requests
import multiprocessing
from tqdm.notebook import tqdm as tq
from time import gmtime, strftime
from random import randint

import copy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.preprocessing import normalize

import html

warnings.simplefilter(action='ignore', category=FutureWarning)

<center><h1>Splitting code and natural language</h1></center> 

<center><h1>Loading source codes into pandas Dataframe</h1></center> 

In [3]:
def classNames_methodNames(node):
    result=''
    if isinstance(node,javalang.tree.MethodDeclaration) or isinstance(node,javalang.tree.ClassDeclaration):
        return node.name.lower()+' '
    if not (isinstance(node,javalang.tree.PackageDeclaration) or
        isinstance(node,javalang.tree.FormalParameter) or
       isinstance(node,javalang.tree.Import)):
        if node:
            if isinstance(node, javalang.ast.Node):
                for childNode in node.children:
                    result+=classNames_methodNames(childNode)
    return result
    
def traverse_node(node,i=0):
    i+=1
    result=''
    if not(isinstance(node,javalang.tree.PackageDeclaration)
            or isinstance(node,javalang.tree.FormalParameter)            
            or isinstance(node,javalang.tree.Import)
            or isinstance(node,javalang.tree.CompilationUnit)):
        if node:
            if (isinstance(node,int) or isinstance(node,str) or isinstance(node,float)) and i==2:
                result+=node+' '
            if isinstance(node, javalang.ast.Node):
                for childNode in node.children:
                    result+=traverse_node(childNode,i)
    return result

def code_parser(code):
    try:
        tree = javalang.parse.parse(code)
        return ''.join([traverse_node(node) for path, node in tree]) + ' ' + ''.join([classNames_methodNames(node)
                                                                                      for path, node in tree])
    except Exception as e: 
        print(e)
        return ''


def loadSourceFiles2df(PATH,project):
    """
    Receives: group name and project name 
    Process: open the source file directory and finds all the java files,
             and after preprocessing(using code_preprocessor) load them into a pandas dataframe 
    Returns: dataframe >> "filename","code","size"
    """
    print('Loading source files of {}  ...'.format(project))
    PATH=os.path.join("data",project,"gitrepo")
    all_source_files=glob.glob(PATH+'/**/*.java', recursive=True)
    source_codes_df=pd.DataFrame([])
    sourceCodesList=[]

    for filename in tq(all_source_files):
        code=open(filename,encoding='ISO-8859-1').read()
        if 'src/' in filename:
            sourceCodesList.append(dict({"filename":filename.split('src/')[1].replace('/','.').lower(),
                                         "unprocessed_code":code,
                                         'project':project}))
        else:
            sourceCodesList.append(dict({"filename":filename.split(project)[1].replace('/','.').lower(),
                                         "unprocessed_code":code,
                                         'project':project}))
    source_codes_df=source_codes_df.append(pd.DataFrame(sourceCodesList))
    return source_codes_df

def load_all_SCs(dataPath):
    print('\tLoading all source codes ... ')
    source_codes_df=pd.DataFrame([])
    all_projects= [folder for folder in listdir(dataPath)]
    for project in all_projects:
        source_path=os.path.join(dataPath,project,"gitrepo")
        source_codes_df=source_codes_df.append(loadSourceFiles2df(source_path,project))
    return source_codes_df

<center><h1>Loading bug reports pandas Dataframe</h1></center> 

In [4]:
def loadBugs2df(PATH,project):
    """
    @Receives: the path to bug repository (the xml file)
    @Process: Parses the xml file and reads the fix files per bug id. 
    @Returns: Returns the dataframe
    """
    print("Loading Bug reports ... ")
    all_bugs_df=pd.DataFrame([],columns=["id","fix","text","fixdate"])
    bugRepo = ET.parse(PATH).getroot()
    buglist=[]                   
    for bug in tq(bugRepo.findall('bug')):
        bugDict=dict({"id":bug.attrib['id'],"fix":[],"fixdate":bug.attrib['fixdate']
                      ,"summary":None,"description":None,"project":project,"average_precision":0.0})
        for bugDetail in bug.find('buginformation'):
            if bugDetail.tag=='summary':
                bugDict["summary"]=bugDetail.text
            elif bugDetail.tag=='description':
                bugDict["description"]=bugDetail.text
        bugDict["fix"]=np.array([fixFile.text.replace('/','.').lower() for fixFile in bug.find('fixedFiles')])
        summary=str(bugDict['summary']) if str(bugDict['summary']) !=np.nan else ""
        description=str(bugDict['description']) if str(bugDict['description']) !=np.nan else ""
        buglist.append(bugDict)
    all_bugs_df=all_bugs_df.append(pd.DataFrame(buglist))
    return all_bugs_df.set_index('id')

def load_all_BRs(dataPath):
    print('\tLoading all bug reports ... ')
    all_bugs_df=pd.DataFrame([])
    all_projects= [folder for folder in listdir(dataPath)]
    for project in all_projects:
        data_path=os.path.join(dataPath,project,"bugrepo","repository.xml")
        all_bugs_df=all_bugs_df.append(loadBugs2df(data_path,project))
        print(len(all_bugs_df))
    return all_bugs_df




<center><h1>Main Preprocessing class</h1></center> 

In [5]:
class PreprocessingUnit:

    all_projects_source_codes=pd.DataFrame([])
    all_projects_bugreports=pd.DataFrame([])
    
    def __init__(self,dataPath):

        self.dataPath=dataPath
        self.dataFolder=os.path.join(os.getcwd(),'Output')
        if not os.path.exists(self.dataFolder):
            os.makedirs(self.dataFolder)
            
    def execute(self):

        self.loadEverything()

    def loadEverything(self):
        vectorize=False
        if PreprocessingUnit.all_projects_bugreports.empty:
            bugReportFile=os.path.join(self.dataFolder,'allBugReports.pickle')
            if not os.path.isfile(bugReportFile):
                PreprocessingUnit.all_projects_bugreports=load_all_BRs(dataPath=self.dataPath)
                vectorize=True
                PreprocessingUnit.all_projects_bugreports.to_pickle(bugReportFile)
            else: 
                PreprocessingUnit.all_projects_bugreports=pd.read_pickle(bugReportFile)
        print("*** All bug reports are are preprocessed and stored as: {} ***".format('/'.join(bugReportFile.split('/')[-2:])))

        if PreprocessingUnit.all_projects_source_codes.empty:
            sourceCodeFile=os.path.join(self.dataFolder,'allSourceCodes.pickle')
            if not os.path.isfile(sourceCodeFile):
                PreprocessingUnit.all_projects_source_codes=load_all_SCs(dataPath=self.dataPath)
                vectorize=True
                PreprocessingUnit.all_projects_source_codes.to_pickle(sourceCodeFile)
            else:
                PreprocessingUnit.all_projects_source_codes=pd.read_pickle(sourceCodeFile)
        print("*** All source codes are preprocessed and stored as: {} ***".format('/'.join(sourceCodeFile.split('/')[-2:])))
        

### MAIN

In [6]:
if __name__=="__main__":

    config={'DATA_PATH':os.path.join('data')}
    preprocessor=PreprocessingUnit(dataPath=config['DATA_PATH'])
    preprocessor.execute()
   


*** All bug reports are are preprocessed and stored as: C:\Users\QCepl\Documents\Year6\ENSF544\Final Project\Final Project\Output\allBugReports.pickle ***
*** All source codes are preprocessed and stored as: C:\Users\QCepl\Documents\Year6\ENSF544\Final Project\Final Project\Output\allSourceCodes.pickle ***


In [413]:
def reindexBugReports(bug_reports):
    numReports = len(bug_reports)
    bug_reports["report_index"] = range(numReports)
    bug_reports.set_index("report_index", inplace=True)
    return bug_reports

def loadEverything():
    all_projects_bugreports = pd.read_pickle('Output/allBugReports.pickle')
    all_projects_bugreports = reindexBugReports(all_projects_bugreports)
    print("*** All Bug Reports are Loaded. ***")
    all_projects_source_codes = pd.read_pickle('Output/allSourceCodes.pickle')
    print("*** All Source Codes are Loaded. ***")
    return all_projects_bugreports, all_projects_source_codes

all_projects_bug_reports, all_projects_source_files = loadEverything()



*** All Bug Reports are Loaded. ***
*** All Source Codes are Loaded. ***


In [684]:
#display(all_projects_bug_reports[::10]) # .iloc[10]
display(all_projects_bug_reports.loc[0])
#print("Source Files")
display(all_projects_source_files[::1000]) #.iloc[10].print

print(pd.unique(all_projects_source_files["project"]))

fix                  [org.apache.commons.collections.map.flat3map.j...
text                                                               NaN
fixdate                                            2006-07-18 22:02:11
summary              Flat3Map.Entry.setValue() overwrites other Ent...
description          Flat3Map&amp;apos;s Entry objects will overwri...
project                                                    COLLECTIONS
average_precision                                                    0
Name: 0, dtype: object

Unnamed: 0,filename,unprocessed_code,project
0,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS
300,\gitrepo\src\main\java\org\springframework\dat...,/*\n * Copyright 2012 the original author or a...,DATACMNS
52,\gitrepo\src\main\java\org\wildfly\security\ss...,"/*\n * JBoss, Home of Professional Open Source...",ELY
90,\gitrepo\cas\src\main\java\org\springframework...,"/* Copyright 2004, 2005, 2006 Acegi Technology...",SEC
1090,\gitrepo\web\src\main\java\org\springframework...,package org.springframework.security.web.authe...,SEC
437,\gitrepo\spring-beans\src\main\java\org\spring...,/*\n * Copyright 2002-2012 the original author...,SPR
1437,\gitrepo\spring-context\src\test\java\org\spri...,/*\n * Copyright 2002-2013 the original author...,SPR
2437,\gitrepo\spring-core\src\test\java\org\springf...,/*\n * Copyright 2002-2009 the original author...,SPR
3437,\gitrepo\spring-orm\src\test\java\org\springfr...,/*\n * Copyright 2002-2013 the original author...,SPR
4437,\gitrepo\spring-web\src\main\java\org\springfr...,/*\n * Copyright 2002-2014 the original author...,SPR


['COLLECTIONS' 'CONFIGURATION' 'DATACMNS' 'DATAMONGO' 'DATAREST' 'ELY'
 'IO' 'LANG' 'LDAP' 'SEC' 'SOCIALFB' 'SPR']


# Preprocessing

In [502]:
def removePunctuation(textToRemove):
    for index in range(2):
        textToRemove = html.unescape(textToRemove)
    
    return re.sub(r'[^a-zA-Z0-9\s]', ' ', textToRemove)

def splitConcaternatedWord(word):
    # https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
    # if there is a function(), add to list to return
    return re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', word)).split()

javaReservedWords = [
    "void","string","char","short","int","long","float","double","boolean","byte",
    "const","final","volatile",
    "this","class","interface","public","private","protected","extends","implements","super",
    "if","else","for","switch","case","while","continue","break",
    "try","catch","finally","throw","throws"
    "println",
    "return", "new",
]
commonCommentWords = [
    "method",
    "licensed", "apache", "software", "foundation",
    "copyright", "original", "author",
]
def isWordToRemove(word):
    # Remove \n with re.sub
    isRemoveable = False
    isRemoveable = isRemoveable or word in ENGLISH_STOP_WORDS
    isRemoveable = isRemoveable or word in javaReservedWords
    isRemoveable = isRemoveable or word in commonCommentWords
    return isRemoveable

stemmer = SnowballStemmer(language='english')
def stemWord(word):
    return stemmer.stem(word)

def preprocessText(text):
    text = text.strip()
    text = removePunctuation(text)
    result = []
    for word in text.split():
        subWords = splitConcaternatedWord(word)
        for subWord in subWords:
            subWord = subWord.lower()
            if not isWordToRemove(subWord):
                subWord = stemWord(subWord)
                result.append(subWord)

    return ' '.join(result)

def normalizeFileNamePath(filename):
    result = filename
    result = result.replace("\\", ".")
    # https://stackoverflow.com/questions/30945784/how-to-remove-all-characters-before-a-specific-character-in-python
    #result = re.sub(r'^.*?org', 'org', filename)
    
    # https://stackoverflow.com/questions/33141595/how-can-i-remove-everything-in-a-string-until-a-characters-are-seen-in-python
    result = result[result.find("org"):]
    
    return result

def processSourceFileRow(row):
    row["filename"] = normalizeFileNamePath(row["filename"])
    row["unprocessed_code"] = preprocessText(row["unprocessed_code"])
    
    return row

def processBugReportRow(row):
    # fix	text	fixdate	summary	description	project	average_precision
    
    summary = preprocessText(row["summary"]) if row["summary"] is not None else ""
    description = preprocessText(row["description"]) if row["description"] is not None else ""
    row["text"] = " ".join([summary, description])
    
    return row

# Preprossing Runner

In [465]:
#TODO test all projects
projects = ['COLLECTIONS', 'CONFIGURATION', 'DATACMNS', 'DATAMONGO', 'DATAREST', 'ELY', 'IO', 'LANG', 'LDAP', 'SEC', 'SOCIALFB', 'SPR']

In [699]:
def preprocessTables(source_files, train_bug_reports, test_bug_reports):
    # Preprocess source files
    source_files = source_files.apply(lambda row: processSourceFileRow(row), axis=1)
    source_files = source_files.rename(columns={"filename": "filename", "unprocessed_code": "code", "project": "project"})
    
    # Preprocess bug reports
    train_bug_reports = train_bug_reports.apply(lambda row: processBugReportRow(row), axis=1)
    
    test_bug_reports = test_bug_reports.apply(lambda row: processBugReportRow(row), axis=1)
    
    return { "source_files": source_files, "train_bug_reports": train_bug_reports, "test_bug_reports": test_bug_reports, }

In [721]:
preprocessed_tables = {project: {} for project in projects}

In [722]:
for project in projects:
    source_files = all_projects_source_files[all_projects_source_files["project"] == project]
    
    project_bug_reports = all_projects_bug_reports[all_projects_bug_reports["project"] == project]
    train_bug_reports, test_bug_reports = train_test_split(project_bug_reports, train_size=0.8, random_state=0)
    
    preprocessed_tables[project] = preprocessTables(source_files, train_bug_reports, test_bug_reports)
    

In [736]:
def copy_table(preprocessed_tables):
    return copy.deepcopy(preprocessed_tables)

In [729]:
#.gitrepo.gradle.jdiff.null.java # SPR 0
display(preprocessed_tables_copy["COLLECTIONS"]["train_bug_reports"].loc[0])

fix                  [org.apache.commons.collections.map.flat3map.j...
text                 flat3 map entri set valu overwrit entri valu f...
fixdate                                            2006-07-18 22:02:11
summary              Flat3Map.Entry.setValue() overwrites other Ent...
description          Flat3Map&amp;apos;s Entry objects will overwri...
project                                                    COLLECTIONS
average_precision                                                    0
Name: 0, dtype: object

In [864]:
projects_to_analyze = ['COLLECTIONS',]# 'CONFIGURATION', 'DATACMNS', 'DATAMONGO', 'DATAREST', 'ELY', 'IO', 'LANG', 'LDAP', 'SEC', 'SOCIALFB', 'SPR']

In [704]:
print(preprocessed_tables["COLLECTIONS"]["source_files"].shape)
print(preprocessed_tables["COLLECTIONS"]["train_bug_reports"].shape)
print(preprocessed_tables["COLLECTIONS"]["test_bug_reports"].shape)

display(preprocessed_tables["COLLECTIONS"]["test_bug_reports"].iloc[0])

(476, 3)
(73, 7)
(19, 7)


fix                  [org.apache.commons.collections.testlistutils....
text                 collect util remov actual retain remov collect...
fixdate                                            2006-08-18 19:01:22
summary                CollectionUtils removeAll is actually retainAll
description          The removeAll(Collection collection, Collectio...
project                                                    COLLECTIONS
average_precision                                                    0
Name: 2, dtype: object

# Method Runners

# Method 1

In [863]:
method1_similarity_results = { project: None for project in projects }

#display(method1_similarity_results)

In [968]:
for project in projects:
    source_files = preprocessed_tables[project]["source_files"]
    test_bug_reports = preprocessed_tables[project]["test_bug_reports"]
    
    method1_similarity_results[project] = runMethod1(source_files, test_bug_reports)

In [970]:
display(method1_similarity_results["COLLECTIONS"])

Unnamed: 0,filename,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,org.apache.commons.collections.arraystack.java,0.046535,0.025070,0.059912,0.031993,0.019968,0.014076,0.022364,0.028789,0.021666,0.014344,0.010817,0.037183,0.017624,0.143310,0.027635,0.008271,0.004538,0.010480,0.022175
1,org.apache.commons.collections.bag.java,0.091316,0.058965,0.121966,0.084011,0.040581,0.061102,0.061162,0.083923,0.099390,0.023980,0.013635,0.046710,0.092361,0.296972,0.090471,0.019921,0.030205,0.032276,0.072662
2,org.apache.commons.collections.bagutils.java,0.005842,0.008480,0.007623,0.010366,0.019817,0.005531,0.010599,0.003543,0.006288,0.002021,0.001098,0.003235,0.003341,0.012677,0.002969,0.041549,0.002340,0.001346,0.002816
3,org.apache.commons.collections.beanmap.java,0.011232,0.017707,0.033126,0.008781,0.024031,0.044827,0.008652,0.014647,0.018638,0.007929,0.015102,0.008279,0.014939,0.043987,0.008216,0.038838,0.100349,0.014946,0.022126
4,org.apache.commons.collections.bidimap.java,0.022314,0.005647,0.038495,0.017425,0.022631,0.169073,0.017522,0.038836,0.021054,0.008808,0.040623,0.012511,0.017991,0.109649,0.010672,0.010035,0.307274,0.030112,0.013910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,org.apache.commons.collections.set.testtransfo...,0.050167,0.082350,0.125298,0.054457,0.047051,0.073204,0.047242,0.074348,0.075547,0.005988,0.032069,0.027710,0.080570,0.037077,0.054487,0.007703,0.034206,0.005403,0.135347
472,org.apache.commons.collections.set.testtypedse...,0.021401,0.062051,0.117976,0.026704,0.033306,0.084177,0.033504,0.100024,0.081339,0.008015,0.025580,0.014340,0.094287,0.030218,0.037606,0.045597,0.035776,0.005917,0.101540
473,org.apache.commons.collections.set.testtypedso...,0.018532,0.104024,0.131549,0.020471,0.044492,0.072925,0.027638,0.050380,0.077660,0.004182,0.037842,0.015324,0.083951,0.033723,0.072230,0.036571,0.030874,0.004870,0.159748
474,org.apache.commons.collections.set.testunmodif...,0.051845,0.098072,0.140221,0.056994,0.026412,0.095132,0.051737,0.091092,0.090012,0.005808,0.037634,0.030131,0.101550,0.037535,0.064621,0.166062,0.028654,0.010408,0.125662


In [943]:
def runMethod1(source_files, test_bug_reports):
    # Find similarity
    similarity_matrix = findSimilarity(test_bug_reports["text"], source_files["code"])
    
    similarity_results = similarityDataFrame_Method1(similarity_matrix, source_files)
    
    return similarity_results

In [665]:
def buildTFIDF(X_train):
    vectorizer = TfidfVectorizer(min_df=2)  # Your solution
    # Fit the vectorizer here
    vectorizer.fit(X_train)
    return vectorizer

def findSimilarity(query_series, lookup_series):
    # https://goodboychan.github.io/python/datacamp/natural_language_processing/2020/07/17/04-TF-IDF-and-similarity-scores.html
    vectorizer = buildTFIDF(lookup_series)
    
    lookup_vectors = vectorizer.transform(lookup_series)
    
    #display(test_bug_reports["text"])
    query_vectors = vectorizer.transform(query_series)
    #printSimilarityMatrix(report_vectors, vectorizer)
    
    # rows are reports, cols are files
    similarity_matrix = cosine_similarity(query_vectors, lookup_vectors)
    
    #print(similarity_matrix.shape)
    #print(similarity_matrix)
    
    return similarity_matrix

In [536]:
# Make code_vectors more readable
def printInputVectors(input_vectors, vectorizer):
    vocab = vectorizer.get_feature_names()
    documents_tfidf_lol = [{word: tfidf_value for word, tfidf_value in zip(vocab, sent)} 
                           for sent in input_vectors.toarray()]
    
    documents_tfidf = pd.DataFrame(documents_tfidf_lol)
    documents_tfidf.fillna(0, inplace=True)
    
    print("\n Similarity Vectors:")
    display(documents_tfidf)

## Setting up similarity_df for Method1

In [967]:
# Switch the convention from similarity_matrix: Bug reports become columns and file become indices
# Should be more convenient for calculating MRR and MAP
# Use implicit index to tie similarity matrix to the index of test reports, files

def initializeSimilarityDF(num_reports, num_files):
    results_file_index = [file_index for file_index in range(num_files)]

    columns = [bug_index for bug_index in range(num_reports)]
    columns.insert(0, "filename")

    similarity_results = pd.DataFrame(index=results_file_index, columns=columns)
    
    #print("\n similarityDF shape")
    #print(method1_similarity_results.shape)
    #display(method1_similarity_results)
    
    return similarity_results

def populateSimilarityDF_Method1(method1_similarity_results, similarity_matrix, source_files):
    for similarity_row, similarity_col in np.ndindex(similarity_matrix.shape):
        #print((similarity_row, similarity_col))
        result_row = similarity_col
        # We offset columns by 1 to include filenames
        result_col = similarity_row

        currentFile = source_files.iloc[similarity_col].filename
        method1_similarity_results.at[result_row, "filename"] = currentFile

        # Add similarity data for this between file and reports
        method1_similarity_results.at[result_row, result_col] = similarity_matrix[similarity_row, similarity_col]

def similarityDataFrame_Method1(similarity_matrix, source_files):
    num_reports, num_files = similarity_matrix.shape
    method1_similarity_results = initializeSimilarityDF(num_reports, num_files)
    populateSimilarityDF_Method1(method1_similarity_results, similarity_matrix, source_files)
    
    #print("method1_similarity_results_df")
    #display(display(method1_similarity_results))
    
    #return method1_similarity_results
    return findRevisedSimilarity(method1_similarity_results, source_files)

In [965]:
def multiplySimilarityBySigmoid(row, terms_df):
    filename = row["filename"]
    sigmoid = terms_df.loc[filename]["num_terms"]
    
    #print("<" + filename + ">")
    
    if filename != "a": # bug I don't have time to fix
        for bug_index in range(len(row) - 1): # -1 to not count filename
            new_value = row[bug_index] * sigmoid
            #print("nv")
            #print(new_value)
            row[bug_index] = new_value
    else:
        for bug_index in range(len(row) - 1): # -1 to not count filename
            row[bug_index] = 0
    
    return row

def findRevisedSimilarity(similarity_results, source_files):
    terms_df = termsScalar(source_files)
    
    #similarity_results = pd.merge(similarity_results, terms_df, on=["filename"], how="left")
    
    similarity_results = similarity_results.apply(lambda row: multiplySimilarityBySigmoid(row, terms_df), axis=1)
    
    return similarity_results

def termsPerFile(input_row):
    terms = len(input_row["code"].split())
    entry = {"filename": input_row["filename"], "num_terms": terms}
    
    return pd.Series(data=entry, index=["filename", "num_terms"])

def sigmoid_row(row):
    sigmoid = 1 / (1 + (math.exp(-row["num_terms"])))
    row["num_terms"] = sigmoid
    
    return row

def termsScalar(source_files):
    terms_df = source_files.apply(lambda row: termsPerFile(row), axis=1)
    terms_df = terms_df.set_index("filename")
    
    terms_array = [terms_df["num_terms"].to_numpy()]
    normalized_terms = normalize(terms_array)
    
    terms_df["num_terms"] = normalized_terms[0]
    
    terms_df = terms_df.apply(lambda row: sigmoid_row(row), axis=1)
    
    return terms_df

#test_sf = pd.DataFrame({"filename": ["one", "two", "three"], "code": ["one", "one two", "one two three"] })

#test_norm = termsScalar(test_sf)
#display(test_norm)

In [949]:
preprocessed_tables_copy = copy_table(preprocessed_tables)
test_df = preprocessed_tables_copy["COLLECTIONS"]["test_bug_reports"]
train_df = preprocessed_tables_copy["COLLECTIONS"]["train_bug_reports"]
source_df = preprocessed_tables_copy["SPR"]["source_files"]

#display(test_df.shape)

test_sim_mat = findSimilarity(test_df["text"], source_df["code"])
#display(test_sim_mat)

test_sim_df = similarityDataFrame_Method1(test_sim_mat, source_df)
#display(test_sim_df)

Unnamed: 0,filename,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,a,filename a 0.0 a 0.0 a 0.0 a 0.0 a...,filename a 0.001624 a 0.001624 a 0.00...,filename a 0.000758 a 0.000758 a 0.00...,filename a 0.0 a 0.0 a 0.0 a 0.0 a...,filename a 0.109470 a 0.109515 a 0.10...,filename a 0.002172 a 0.002173 a 0.00...,filename a 0.006035 a 0.006038 a 0.00...,filename a 0.006611 a 0.006614 a 0.00...,filename a 0.002520 a 0.002521 a 0.00...,filename a 0.112397 a 0.112444 a 0.11...,filename a 0.0 a 0.0 a 0.0 a 0.0 a...,filename a 0.0 a 0.0 a 0.0 a 0.0 a...,filename a 0.002415 a 0.002416 a 0.00...,filename a 0.004898 a 0.004900 a 0.00...,filename a 0.0 a 0.0 a 0.0 a 0.0 a...,filename a 0.0 a 0.0 a 0.0 a 0.0 a...,filename a 0.042440 a 0.042458 a 0.04...,filename a 0.000732 a 0.000733 a 0.00...,filename a 0.0 a 0.0 a 0.0 a 0.0 a...
1,org.springframework.aop.advisor.java,0,0.000893783,0.00194865,0.000200016,0.00297327,0.00244167,0.00631371,0.0193676,0.00167634,0.00336383,0.00609406,0,0.00123243,0.00989426,0,0.00318492,0,0.0113086,0.00324459
2,org.springframework.aop.afteradvice.java,0,0.00124659,0.000511992,0.000553629,0.00600766,0.00203997,0.0162918,0.0023363,0.000340541,0,0.00130373,0,0.00374382,0.00619083,0,0.00440561,0.0109152,0.000738284,0.00604713
3,org.springframework.aop.afterreturningadvice.java,0,0.000754186,0.0225462,0.000438426,0.0220842,0.013351,0.00464527,0.0122686,0.00356756,0.00993801,0.00103244,0,0.00288895,0.0443393,0,0.0336462,0.000815047,0.00605962,0.0358545
4,org.springframework.aop.aopinvocationexception...,0,0.00025194,0.000796521,0.000366681,0.00382947,0.00101677,0.00279627,0.0148857,0.000225548,0,0.000863488,0,0.000360225,0.00718196,0,0,0.00234137,0.000488982,0.0450976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5893,org.springframework.expression.spel.support.be...,0,0.00155543,0.00661967,0.000396938,0.00707275,0.00334875,0.00249433,0.0032833,0.00498862,0.00236821,0.000854872,0,0.00231465,0.0107053,0.00108663,0.0209324,0.00123035,0.00183576,0.00549577
5894,org.springframework.expression.spel.support.sp...,0.0246422,0.10861,0.0507371,0.0167184,0.0132038,0.0108387,0.0162407,0.012641,0.0258956,0.00473159,0.00114484,0.0581133,0.031532,0.0382651,0.0659823,0.0133263,0.00335401,0.000306607,0.0638191
5895,org.springframework.scheduling.annotation.sche...,0.0057694,0.000423937,0.0070068,0.00317331,0.0048621,0.00737044,0.00934883,0.011913,0.00550193,0.00127018,0.00206781,0.006939,0.00708707,0.00227357,0.000297688,0,0.00695508,0.00610999,0.0317461
5896,org.springframework.transaction.annotation.ena...,0.00929497,0.0125744,0.00940238,0.0104521,0.00379277,0.00525216,0.0088451,0.0142643,0.00752574,0.00145809,0.00270765,0.0114798,0.00640222,0.0096354,0.00938495,0.000821567,0.0121715,0.0141643,0.0253058


In [966]:
revised_df = findRevisedSimilarity(test_sim_df, source_df)

display(revised_df)

Unnamed: 0,filename,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,a,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,org.springframework.aop.advisor.java,0,7.04249e-06,1.53543e-05,1.57601e-06,2.34277e-05,1.9239e-05,4.97484e-05,0.000152606,1.32086e-05,2.65051e-05,4.80177e-05,0,9.71083e-06,7.79611e-05,0,2.50953e-05,0,8.91052e-05,2.55655e-05
2,org.springframework.aop.afteradvice.java,0,9.77188e-06,4.01346e-06,4.33985e-06,4.70935e-05,1.59911e-05,0.00012771,1.8314e-05,2.66947e-06,0,1.02198e-05,0,2.93474e-05,4.85293e-05,0,3.45351e-05,8.55634e-05,5.78734e-06,4.74029e-05
3,org.springframework.aop.afterreturningadvice.java,0,5.92426e-06,0.000177104,3.44391e-06,0.000173475,0.000104875,3.64894e-05,9.63721e-05,2.80238e-05,7.80648e-05,8.10997e-06,0,2.26932e-05,0.000348293,0,0.000264296,6.40233e-06,4.75993e-05,0.000281643
4,org.springframework.aop.aopinvocationexception...,0,1.97865e-06,6.2556e-06,2.87979e-06,3.00753e-05,7.9854e-06,2.19609e-05,0.000116907,1.77138e-06,0,6.78154e-06,0,2.82908e-06,5.64046e-05,0,0,1.83883e-05,3.8403e-06,0.000354181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5893,org.springframework.expression.spel.support.be...,0,1.23905e-05,5.27321e-05,3.16199e-06,5.63413e-05,2.6676e-05,1.98698e-05,2.61546e-05,3.97392e-05,1.88651e-05,6.80988e-06,0,1.84385e-05,8.52782e-05,8.65607e-06,0.000166747,9.8009e-06,1.46236e-05,4.37791e-05
5894,org.springframework.expression.spel.support.sp...,0.000194503,0.000857266,0.000400473,0.00013196,0.000104219,8.55512e-05,0.000128189,9.97763e-05,0.000204397,3.73469e-05,9.03631e-06,0.000458694,0.000248885,0.00030203,0.000520804,0.000105186,2.64735e-05,2.42008e-06,0.00050373
5895,org.springframework.scheduling.annotation.sche...,4.62891e-05,3.40134e-06,5.62171e-05,2.54602e-05,3.90096e-05,5.91346e-05,7.50077e-05,9.55806e-05,4.41432e-05,1.0191e-05,1.65905e-05,5.56731e-05,5.68611e-05,1.82414e-05,2.38841e-06,0,5.58021e-05,4.90217e-05,0.000254705
5896,org.springframework.transaction.annotation.ena...,7.63742e-05,0.00010332,7.72567e-05,8.58817e-05,3.11641e-05,4.31556e-05,7.26777e-05,0.000117205,6.18369e-05,1.19807e-05,2.2248e-05,9.4326e-05,5.26053e-05,7.91714e-05,7.71135e-05,6.75058e-06,0.00010001,0.000116384,0.000207931


In [344]:
#display(method1_similarity_results)

Unnamed: 0,filename,0
0,\gitrepo\src\java\org\apache\commons\collectio...,0.00253327
1,\gitrepo\src\java\org\apache\commons\collectio...,0.00034301
2,\gitrepo\src\java\org\apache\commons\collectio...,0.0019743
3,\gitrepo\src\java\org\apache\commons\collectio...,0.0524455
4,\gitrepo\src\java\org\apache\commons\collectio...,0.078968
...,...,...
10456,\gitrepo\src\test\java\org\springframework\exp...,0.00881417
10457,\gitrepo\src\test\java\org\springframework\exp...,0.0516207
10458,\gitrepo\src\test\java\org\springframework\sch...,0.051348
10459,\gitrepo\src\test\java\org\springframework\tra...,0.031881


# Method 2

In [870]:
method2_similarity_results = { project: None for project in projects }

#display(method2_similarity_results)

In [874]:
for project in projects: # _to_analyze
    source_files = preprocessed_tables[project]["source_files"]
    train_bug_reports = preprocessed_tables[project]["train_bug_reports"]
    test_bug_reports = preprocessed_tables[project]["test_bug_reports"]
    
    method2_similarity_results[project] = runMethod2(source_files, train_bug_reports, test_bug_reports)
    
    #final_results.at[project, "Method2_MRR"] = m2_MRR
    #final_results.at[project, "Method2_MAP"] = m2_MAP
    #MRR = meanReciprocalRank(similarity_results, test_bug_reports)
    #MAP = meanAveragePrecision(similarity_results, test_bug_reports)

In [877]:
display(method2_similarity_results["COLLECTIONS"])

Unnamed: 0,filename,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,org.apache.commons.collections.arraystack.java,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,org.apache.commons.collections.bag.java,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,org.apache.commons.collections.bagutils.java,0.00876411,0.020892,0.00749909,0.00657784,0.000199108,0.00148778,0.00223039,0.000556309,0.00585301,0.00143307,0.00167792,0.0163279,0.00626374,0.0077066,0.0111345,0.000244294,0,0.00152324,0.00754813
3,org.apache.commons.collections.beanmap.java,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,org.apache.commons.collections.bidimap.java,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,org.apache.commons.collections.set.testtransfo...,0.00876411,0.020892,0.00749909,0.00657784,0.000199108,0.00148778,0.00223039,0.000556309,0.00585301,0.00143307,0.00167792,0.0163279,0.00626374,0.0077066,0.0111345,0.000244294,0,0.00152324,0.00754813
472,org.apache.commons.collections.set.testtypedse...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
473,org.apache.commons.collections.set.testtypedso...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
474,org.apache.commons.collections.set.testunmodif...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [872]:
def runMethod2(source_files, train_bug_reports, test_bug_reports):
    # Find similarity between new bug and previous bugs
    similarity_matrix = findSimilarity(test_bug_reports["text"], train_bug_reports["text"])
    
    similarity_results_m2 = similarityDataFrame_Method2(similarity_matrix, train_bug_reports, source_files)
    return similarity_results_m2

In [849]:
def stackBugReportFixLengths(bug_reports, total_bug_reports):
    bugs_and_lengths = []
    
    for bug_number in range(total_bug_reports):
        test_bug = bug_reports.iloc[bug_number]
        
        numFilesFixed = len(test_bug["fix"])
        entry = {"bug": bug_number, "files_fixed": numFilesFixed }
        bugs_and_lengths.append(entry)
    
    return pd.DataFrame().from_dict(bugs_and_lengths)

def stackSimilarityMatrix(similarity_matrix):
    total_test_bugs, total_historical_bugs = similarity_matrix.shape
    stacked_similarity = {"test_bug": [], "bug": [], "similarity": []}
    
    for test_bug_index, historic_bug_index in np.ndindex(similarity_matrix.shape):
        similarity = similarity_matrix[test_bug_index, historic_bug_index]
        
        stacked_similarity["test_bug"].append(test_bug_index)
        stacked_similarity["bug"].append(historic_bug_index)
        stacked_similarity["similarity"].append(similarity)
    
    df = pd.DataFrame(stacked_similarity)
    #display(df)
    return df

def scaleSimilarity(row):
    scaled_sim = row["similarity"] / row["files_fixed"] if row["files_fixed"] != 0 else 0
    row["scaled_similarity"] = scaled_sim
    return row

def populateSimilarityDataFrame_Method2(dimensions, historic_bugs_and_fixes, files_fixed_per_bug, test_to_historical_similarity, source_files):
    total_bug_reports, total_files = dimensions
    
    similarity_results = initializeSimilarityDF(total_bug_reports, total_files)

    #all_filenames = pd.unique(historic_bugs_and_fixes["filename"])
    
    for bug_index in range(total_bug_reports):
        # filename iteration independent of current bug, as we base on train_bug_df
        for file_index in range(total_files):
            filename = source_files.iloc[file_index]["filename"]
            connected_bugs_to_file = historic_bugs_and_fixes[historic_bugs_and_fixes["filename"] == filename]
            
            similarity_to_current_test_bug = test_to_historical_similarity[test_to_historical_similarity["test_bug"] == bug_index]

            interim_df = pd.merge(connected_bugs_to_file, similarity_to_current_test_bug,
                on=["bug"],
                how="left")
            
            #print("\n merge similarity")
            #display(interim_df)

            interim_df = pd.merge(interim_df, files_fixed_per_bug,
                on=["bug"],
                how="left")
            
            #print("\n merge number of fixes")
            #display(interim_df)
            
            interim_df["scaled_similarity"] = np.NaN
            interim_df = interim_df.apply(lambda row: scaleSimilarity(row), axis=1)

            #print("\n scaled similarity")
            #display(interim_df)
            
            simi_score = interim_df["scaled_similarity"].sum()
            #print("\n simi_score " + str(simi_score))

            similarity_results.at[file_index, "filename"] = filename
            similarity_results.at[file_index, bug_index] = simi_score
            
    return similarity_results

def similarityDataFrame_Method2(similarity_matrix, train_bug_reports, source_files):
    total_historic_bug_reports = len(train_bug_reports)
    total_test_bug_reports = len(similarity_matrix)
    total_files = len(source_files)
    
    historic_bugs_and_fixes = stackBugReportFixes(train_bug_reports, total_historic_bug_reports)
    files_fixed_per_bug = stackBugReportFixLengths(train_bug_reports, total_historic_bug_reports)
    test_to_historical_similarity = stackSimilarityMatrix(similarity_matrix)
    
    dimensions = (total_test_bug_reports, total_files)
    #print(dimensions)
    return populateSimilarityDataFrame_Method2(dimensions, historic_bugs_and_fixes, files_fixed_per_bug, test_to_historical_similarity, source_files)

# Analyzing Results

## meanAveragePrecision MAP

In [764]:
def meanAveragePrecision(similarity_results, test_bug_reports):
    #https://towardsdatascience.com/breaking-down-mean-average-precision-map-ae462f623a52
    
    total_files = len(similarity_results)
    total_bug_reports = len(test_bug_reports)
    
    rank_template = [rank for rank in range(1, total_files+1)]
    
    for bug_report_number in range(total_bug_reports):
        #print("results of report " + str(bug_report_number))
        
        current_bug_report_similarity = similarity_results[["filename", bug_report_number]]
        top_similar_files = current_bug_report_similarity.sort_values(by=bug_report_number, ascending=False)
        
        top_similar_files["rank"] = rank_template
        
        # Get only the rows where the result file == the fix file
        bug_report = test_bug_reports.iloc[bug_report_number]
        query_matches = top_similar_files[top_similar_files["filename"].isin(bug_report.fix)]
        #print("\nquery match")
        #display(query_matches)
        
        precision_sum = 0.0
        
        # We can start at 1 because we've filtered out all files that don't match
        # meaning we know we encounter a matching file on first iteration
        count_fixes_encountered = 1
        for query_index, file_row in query_matches.iterrows():
            file_rank = file_row["rank"]
            precision = float(count_fixes_encountered) / file_rank
            #query_matches.loc[query_index, "precision"] = precision
            precision_sum += precision
            
            count_fixes_encountered += 1
        
        #test_bug_reports.at[bug_report_number, "average_precision"] = query_matches["precision"].mean()
        total_positive_matches = len(query_matches)
        average_precision = precision_sum / total_positive_matches if total_positive_matches != 0 else 0
        
        average_precision_index = 6
        test_bug_reports.iat[bug_report_number, average_precision_index] = average_precision
        
        #display(test_bug_reports.loc[bug_report_number])
    
    MAP = test_bug_reports["average_precision"].mean()
    return MAP

## meanReciprocalRank MRR

In [805]:
def stackBugReportFixes(bug_reports, total_bug_reports):
    bugs_and_fixes = {"bug": [], "filename": [] }
    
    for test_bug_number in range(total_bug_reports):
        #print(test_bug_number)
        test_bug = bug_reports.iloc[test_bug_number]
        #display(test_bug)
        
        number_fixes = len(test_bug.fix)
        bug_column = [test_bug_number for tmp in range(number_fixes)]
        
        bugs_and_fixes["bug"].extend(bug_column)
        bugs_and_fixes["filename"].extend(test_bug.fix)
    
    return pd.DataFrame(bugs_and_fixes)

In [982]:
def meanReciprocalRank(similarity_results, test_bug_reports):
    # https://softwaredoug.com/blog/2021/04/21/compute-mrr-using-pandas.html
    
    # First element is rank 1
    # Convert bug_report_test to have
    # for every bug
        # get rank for first correct answer
        # take its inverse
        # add to sum
    # Divide sum by number of bugs
    
    #fix	text	fixdate	summary	description	project	average_precision
    
    total_bug_reports = len(test_bug_reports)
    
    bugs_and_fixes = stackBugReportFixes(test_bug_reports, total_bug_reports)
    
    #display(bugs_and_fixes)
    
    # Convert similarity_results to have bugs and files as column-based, sorted by rank
    
    # len = #docs x #bugs
    columns = ["bug", "filename", "rank"]
    combined_sorted_results = pd.DataFrame(columns=columns)
    #display(combined_sorted_results)
    
    total_files = len(similarity_results)
    rank_template = [rank for rank in range(1, total_files+1)]
    
    for bug_report_number in range(total_bug_reports):
        #print("results of report " + str(bug_report_number))
        
        bug_column = [bug_report_number for tmp in range(total_files)]
        
        current_bug_report_similarity = similarity_results[["filename", bug_report_number]]
        #display(current_bug_report_similarity)
        top_similar_files = current_bug_report_similarity.sort_values(by=bug_report_number, ascending=False)
        
        top_similar_files["bug"] = bug_column
        top_similar_files["rank"] = rank_template
        
        #display(top_similar_files)
        
        combined_sorted_results = combined_sorted_results.append(top_similar_files, ignore_index=True)
        #display(combined_sorted_results)

    MAX_RANK = 100000

    hits = pd.merge(bugs_and_fixes, combined_sorted_results,
        on=["bug", "filename"],
        how="left").fillna(MAX_RANK)

    return (1 / hits.groupby('bug')['rank'].min()).mean()

In [973]:
final_result_columns = ["Method1_MRR", "Method2_MRR", "Method1_MAP", "Method2_MAP"]

final_results = pd.DataFrame(index=projects, columns=final_result_columns)
#display(final_results)

## Method 1 Results

In [1013]:
projects_m1_output = ['COLLECTIONS', 'CONFIGURATION', 'DATACMNS', 'DATAMONGO', 'DATAREST', 'ELY', 'LANG',  'SOCIALFB',] 
# Getting an uknown error on projects marked with X
# 'ELY', X'IO', 'LANG', X'LDAP', X'SEC', 'SOCIALFB', X'SPR'

In [993]:
for project in projects_m1_output:
    print("running " + project)
    test_bug_reports = preprocessed_tables[project]["test_bug_reports"]
    
    similarity_results = method1_similarity_results[project]
    
    m1_MRR = meanReciprocalRank(similarity_results, test_bug_reports)
    final_results.at[project, "Method1_MRR"] = m1_MRR

running COLLECTIONS
running CONFIGURATION
running DATACMNS
running DATAMONGO
running DATAREST
running ELY
running LANG
running SOCIALFB


In [999]:
for project in projects_m1_output:
    print("running " + project)
    test_bug_reports = preprocessed_tables[project]["test_bug_reports"]
    
    similarity_results = method1_similarity_results[project]
    
    m1_MAP = meanAveragePrecision(similarity_results, test_bug_reports)
    final_results.at[project, "Method1_MAP"] = m1_MAP

running COLLECTIONS
running CONFIGURATION
running DATACMNS
running DATAMONGO
running DATAREST
running ELY
running LANG
running SOCIALFB


## Method 2 Results

In [1014]:
projects_m2_output = ['COLLECTIONS', 'CONFIGURATION', 'DATACMNS', 'DATAMONGO', 'DATAREST', 'ELY', 'LANG',  'SOCIALFB',] 

In [985]:
combined_similarity_results = { project: None for project in projects }

#display(method1_similarity_results)

In [1029]:
for project in projects_m2_output:
    print("running " + project)
    
    m1_similarity_results = method1_similarity_results[project]
    m2_similarity_results = method2_similarity_results[project]
    
    combined_similarity_results[project] = combineRanking(m1_similarity_results, m2_similarity_results, 0.2)

running COLLECTIONS
running CONFIGURATION
running DATACMNS
running DATAMONGO
running DATAREST
running ELY
running LANG
running SOCIALFB


In [1028]:
def normalizeSimilarityResults(similarity_rank):
    similarity_array = [similarity_rank.to_numpy()]
    normalized_similarity = normalize(similarity_array)
    
    #print(type(normalized_similarity))
    return normalized_similarity[0]

def combineRanking(method1_similarity, method2_similarity, alpha):
    # for each file
    # (1-alpha)*normalize(rVSM score) + alpha*normalize(SimiScore)
    
    oneMinusAlpha = 1 - alpha
    
    total_files = len(method1_similarity)
    total_bug_reports = len(method1_similarity.columns) - 1 # -1 to ignore filename
    
    combined_similarity_results = initializeSimilarityDF(total_bug_reports, total_files)
    
    for bug_index in range(total_bug_reports):
        m1_norm = normalizeSimilarityResults(method1_similarity[bug_index])
        m1_norm = oneMinusAlpha * m1_norm
        #display(m1_norm.shape)
        
        m2_norm = normalizeSimilarityResults(method2_similarity[bug_index])
        m2_norm = alpha * m2_norm
        
        combined_bug_sim = m1_norm + m2_norm
        
        combined_similarity_results[bug_index] = combined_bug_sim
    
    combined_similarity_results["filename"] = method1_similarity["filename"]
    #display(combined_similarity_results)
    
    return combined_similarity_results

In [1030]:
for project in projects_m2_output:
    print("running " + project)
    test_bug_reports = preprocessed_tables[project]["test_bug_reports"]
    
    
    similarity_results = combined_similarity_results[project]
    #similarity_results = method2_similarity_results[project]
    
    m2_MRR = meanReciprocalRank(similarity_results, test_bug_reports)
    m2_MAP = meanAveragePrecision(similarity_results, test_bug_reports)
    final_results.at[project, "Method2_MRR"] = m2_MRR
    final_results.at[project, "Method2_MAP"] = m2_MAP

running COLLECTIONS
running CONFIGURATION
running DATACMNS
running DATAMONGO
running DATAREST
running ELY
running LANG
running SOCIALFB


In [1031]:
display(final_results)

Unnamed: 0,Method1_MRR,Method2_MRR,Method1_MAP,Method2_MAP
COLLECTIONS,0.36892,0.477094,0.299534,0.472218
CONFIGURATION,0.544123,0.679306,0.373892,0.545935
DATACMNS,0.463499,0.455156,0.35038,0.374882
DATAMONGO,0.302155,0.466876,0.222569,0.366136
DATAREST,0.33445,0.503187,0.260501,0.385275
ELY,0.366671,0.400004,0.383333,0.416667
IO,,,,
LANG,0.583146,0.622603,0.527897,0.587529
LDAP,,,,
SEC,,,,


# Scratchpad

In [796]:
preprocessed_tables_copy = copy_table(preprocessed_tables)
test_df = preprocessed_tables_copy["COLLECTIONS"]["test_bug_reports"]
train_df = preprocessed_tables_copy["COLLECTIONS"]["train_bug_reports"]
source_df = preprocessed_tables_copy["COLLECTIONS"]["source_files"]

#display(test_df.shape)

test_sim_mat = findSimilarity(test_df["text"], train_df["text"])
#display(test_sim_mat)

test_sim_df = similarityDataFrame(test_sim_mat, source_df)

#display(test_df)

In [826]:
#test_map = meanAveragePrecision(test_sim_df, test_df)
#print(test_map)

#display(train_df)
#print(train_df.shape)
#historic_bugs_and_fixes = stackBugReportFixes(train_df, len(train_df))
#test_filename = source_df.iloc[0].filename
#boolSeries = historic_bugs_and_fixes["filename"] == test_filename
#display(historic_bugs_and_fixes[boolSeries])

#fixed_files_per_bug = stackBugReportFixLengths(train_df, len(train_df))
#display(fixed_files_per_bug)

#test_sim_df = stackSimilarityMatrix(test_sim_mat)
#display(test_sim_df)

In [851]:
method2_results = similarityDataFrame_Method2(test_sim_mat, train_df, source_df)

In [857]:
#display(method2_results)
#display(method2_results[method2_results[0] != 0])

In [850]:
#for _, historic_bug in train_bug_reports.iterrows():
    # for each historic_bug that connects to fix file
        # get similarity of historic_bug to current bug
        # number_files_fixed_for_bug = len(historic_bug.fix)
        # divide similarity by number_files_fixed_for_bug
        
historic_bugs_and_fixes = pd.DataFrame().from_dict([
    {"bug": 0, "filename": "one.java",},
    {"bug": 1, "filename": "one.java",},
    {"bug": 1, "filename": "two.java",},
    {"bug": 2, "filename": "one.java",},
    {"bug": 2, "filename": "two.java",},
    {"bug": 2, "filename": "three.java",},
])
#display(historic_bugs_and_fixes)

files_fixed_per_bug = pd.DataFrame().from_dict([
    {"bug": 0, "files_fixed": 1,},
    {"bug": 1, "files_fixed": 2,},
    {"bug": 2, "files_fixed": 3,},
])
#display(files_fixed_per_bug)

all_filenames = pd.unique(train_bug_df["filename"])

test_to_historical_similarity = pd.DataFrame().from_dict([
    {"test_bug": 0, "bug": 0, "similarity": 0.4,},
    {"test_bug": 0, "bug": 1, "similarity": 0.6,},
    {"test_bug": 0, "bug": 2, "similarity": 0.8,},
    {"test_bug": 1, "bug": 0, "similarity": 0.1,},
    {"test_bug": 1, "bug": 1, "similarity": 0.2,},
    {"test_bug": 1, "bug": 2, "similarity": 0.3,},
])

#display(test_to_historical_similarity)

my_sf = pd.DataFrame([
    {"filename": "one.java",},
    {"filename": "two.java",},
    {"filename": "three.java",},
])

#display(my_sf)

#dim = (2, 3)
#test_m2 = populateSimilarityDataFrame_Method2(dim, historic_bugs_and_fixes, files_fixed_per_bug, test_to_historical_similarity, my_sf)
#display(test_m2)

Unnamed: 0,filename,0,1
0,one.java,0.966667,0.3
1,two.java,0.566667,0.2
2,three.java,0.266667,0.1


In [None]:
all_test_bugs = pd.unique(test_to_historical_similarity["test_bug"])

similarity_results = initializeSimilarityDF(len(all_test_bugs), len(all_filenames))

display(similarity_results)

for bug_index in range(len(all_test_bugs)):
    # filename iteration independent of current bug, as we base on train_bug_df
    for file_index in range(len(all_filenames)):
        filename = all_filenames[file_index]

        connected_bugs_to_file = historic_bugs_and_fixes[historic_bugs_and_fixes["filename"] == filename]
        #print("\nbugs connected to " + filename)
        #display(connected_bugs_to_file)

        hits = pd.merge(connected_bugs_to_file, test_to_historical_similarity,
            on=["bug"],
            how="left")

        hits = pd.merge(hits, files_fixed_per_bug,
            on=["bug" ],
            how="left")

        hits = hits.apply(lambda row: similarityPerFileFixed(row), axis=1)

        #print("\n sim")
        #display(hits)

        simi_score = hits["scaled_similarity"].sum()
        #print("\n simi_score " + str(simi_score))

        similarity_results.at[file_index, "filename"] = filename
        similarity_results.at[file_index, bug_index] = simi_score
    
    

In [329]:
for (columnName, columnData) in top_similar_files.iteritems():
   print('Colunm Name : ', columnName)
   print('Column Contents : ', columnData.values)

Colunm Name :  0
Column Contents :  [0.4695189531435297 0.4369636901016136 0.42121832357619104 ... 0.0 0.0 0.0]


In [597]:
bugs_and_fixes = pd.DataFrame.from_dict([
    { "fix": ["x"] },
    { "fix": ["y", "z"] },
    { "fix": ["q"] },
])

my_cols = ["filename", 0, 1, 2]

testSimMatrix = [["x", 1, 2, 9], 
                 ["y", 3, 4, 8], 
                 ["z", 5, 6, 7]]

test_sim = pd.DataFrame(testSimMatrix, columns=my_cols)

#print(meanReciprocalRank(test_sim, bugs_and_fixes))

print(meanAveragePrecision(test_sim, bugs_and_fixes))

#display(test_sim)

HEY
results of report 0

query match


Unnamed: 0,filename,0,rank
0,x,1,3


file_row


filename    x
0           1
rank        3
Name: 0, dtype: object

fix                       [x]
average_precision    0.333333
Name: 0, dtype: object

results of report 1

query match


Unnamed: 0,filename,1,rank
2,z,6,1
1,y,4,2


file_row


filename    z
1           6
rank        1
Name: 2, dtype: object

file_row


filename    y
1           4
rank        2
Name: 1, dtype: object

fix                  [y, z]
average_precision         1
Name: 1, dtype: object

results of report 2

query match


Unnamed: 0,filename,2,rank


fix                  [q]
average_precision      0
Name: 2, dtype: object

0.4444444444444444


In [353]:
df2 = session[["filename", 2]]
display(df2)

Unnamed: 0,filename,2
0,x,2
1,y,4
