In [1]:
from typing import Iterable
import pandas as pd
from IPython.core.display import display
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def getRawData():
    bug_report_raw=pd.read_pickle("Output/allBugReports.pickle")
    bug_report_raw = bug_report_raw[bug_report_raw['description'].notna()].reset_index()
    display(bug_report_raw)

    source_code_raw = pd.read_pickle("Output/allSourceCodes.pickle")
    source_code_raw = source_code_raw[source_code_raw['unprocessed_code'].notna()]
    display(source_code_raw)
    return bug_report_raw, source_code_raw

## Cleaning the Data

Snowball Stemmer is used since it is a more powerful stemmer than what is used by the research paper on IRFL.
  
In addition to NLTK's stop words, Python and Java keywords have been removed to provide a better model for analysis. The Python keywords will come from a built-in library called "keyword", while Java language keywords will be extracted from a file provided by the original GitHub source found here: https://github.com/exatoa/Bench4BL/blob/master/scripts/languages.txt

Camel case words such as setValue will be split into two words, "set" and "Value".

All letters will be turned into lowercases so there would not be any distinction between "Value" and "value", for example.

In [2]:
import re, string
import pandas as pd
import numpy as np
import keyword
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem.snowball import SnowballStemmer
import matplotlib.pyplot as plt 
import html

%matplotlib inline

stemmer = SnowballStemmer("english")

java_keywords = None
with open('languages.txt', 'r') as file:
    java_keywords = file.read().split('\n')

# set for o(1) lookup performance
stop_words = set(list(ENGLISH_STOP_WORDS) + keyword.kwlist + java_keywords)

def clean_text(text):
    # This is to remove the double html encodings such as &amp;apos; into simply &apos;.
    # This html encoding is easier to convert into punctuation or special characters for removal.
    text = text.replace("&amp;", '&')
    text = html.unescape(text)

    # split words and return them if it is not in the stop words list
    def getIndividualWords(all_text:str) ->Iterable[str]:
        # this regex splits apart camel case variables.
        # first half of | handles cases which are traditionally camel cased, or are just lower case.
        # second half of | handles cases where its all capital letters, ie a variable named VARIABLE
        # this also implicitly gets rid of any punctuation and any whitespace. This is because the regex
        # skips over any non-letter characters
        for word in re.findall('[A-Z]*[a-z]+|[A-Z]{2,}', all_text):
            word_lower=word.lower()
            if word_lower not in stop_words:
                yield stemmer.stem(word_lower)

    text=' '.join(getIndividualWords(text))
    return text

# print("Before:\n", bug_report_raw["description"][0])
# print("\n\nAfter:\n", clean_text(bug_report_raw["description"][0]))

In [3]:
def getProcessedData():
    bug_report_raw,source_code_raw=getRawData()
    bug_report_raw["processed_description"]=bug_report_raw["description"].map(clean_text)
    bug_report_raw["processed_summary"]=bug_report_raw["summary"].map(clean_text)
    bug_report_raw["processed_all"]=bug_report_raw["processed_description"] + bug_report_raw["processed_summary"]
    processed_bug_report=bug_report_raw[["fix","processed_all","project"]]
    display(bug_report_raw)
    source_code_raw["processed_code"]=source_code_raw["unprocessed_code"].map(clean_text)
    display(source_code_raw)
    processed_source_code=source_code_raw[["filename","processed_code","project"]]
    return processed_bug_report, processed_source_code

bug_report_raw,source_code_raw=getProcessedData()

Unnamed: 0,id,fix,text,fixdate,summary,description,project,average_precision
0,217,[org.apache.commons.collections.map.flat3map.j...,,2006-07-18 22:02:11,Flat3Map.Entry.setValue() overwrites other Ent...,Flat3Map&amp;apos;s Entry objects will overwri...,COLLECTIONS,0.0
1,214,[org.apache.commons.collections.testextendedpr...,,2006-07-18 22:44:33,ExtendedProperties - field include should be n...,"The field ""include"" in ExtendedProperties is c...",COLLECTIONS,0.0
2,222,[org.apache.commons.collections.testlistutils....,,2006-08-18 19:01:22,CollectionUtils removeAll is actually retainAll,"The removeAll(Collection collection, Collectio...",COLLECTIONS,0.0
3,261,[org.apache.commons.collections.map.flat3map.j...,,2007-08-20 14:11:54,Flat3Map.remove() does not return the correct ...,final Flat3Map m = new Flat3Map();\n ...,COLLECTIONS,0.0
4,264,[org.apache.commons.collections.fasttreemap.java],,2007-08-31 09:39:59,FastTreeMap forgets the comparator,In line 359 and 582 of the current 3.2 release...,COLLECTIONS,0.0
...,...,...,...,...,...,...,...,...
1785,692,[org.wildfly.security.auth.realm.legacypropert...,,2016-11-02 09:35:48,Add tests for special chars in LegacyPropertie...,Add tests for issue https://issues.jboss.org/b...,ELY,0.0
1786,691,[org.wildfly.security.auth.realm.legacypropert...,,2016-11-02 09:36:13,Elytron properties-realm is not compatible wit...,When users properties file (e.g. mgmt-users.pr...,ELY,0.0
1787,637,[org.wildfly.security.auth.server.serverauthen...,,2016-11-03 15:03:29,No log messages comming from Elytron - permiss...,Elytron is missing any log messages related to...,ELY,0.0
1788,757,[org.wildfly.security.ssl.sslauthenticationtes...,,2016-11-21 09:24:47,Don&apos;t use String toUpperCase/toLowerCase ...,The String.toUpperCase() and String.toLowerCas...,ELY,0.0


Unnamed: 0,filename,unprocessed_code,project
0,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS
1,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS
2,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS
3,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS
4,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS
...,...,...,...
63,\gitrepo\src\main\java\org\wildfly\security\ut...,"/*\n * JBoss, Home of Professional Open Source...",ELY
64,\gitrepo\src\main\java\org\wildfly\security\_p...,"/*\n * JBoss, Home of Professional Open Source...",ELY
65,\gitrepo\src\test\java\org\wildfly\security\ma...,"/*\n * JBoss, Home of Professional Open Source...",ELY
66,\gitrepo\src\test\java\org\wildfly\security\ma...,"/*\n * JBoss, Home of Professional Open Source...",ELY


Unnamed: 0,id,fix,text,fixdate,summary,description,project,average_precision,processed_description,processed_summary,processed_all
0,217,[org.apache.commons.collections.map.flat3map.j...,,2006-07-18 22:02:11,Flat3Map.Entry.setValue() overwrites other Ent...,Flat3Map&amp;apos;s Entry objects will overwri...,COLLECTIONS,0.0,flat map s entri object overwrit entri s valu ...,flat map entri set valu overwrit entri valu,flat map s entri object overwrit entri s valu ...
1,214,[org.apache.commons.collections.testextendedpr...,,2006-07-18 22:44:33,ExtendedProperties - field include should be n...,"The field ""include"" in ExtendedProperties is c...",COLLECTIONS,0.0,field includ extend properti current instanc s...,extend properti field includ non,field includ extend properti current instanc s...
2,222,[org.apache.commons.collections.testlistutils....,,2006-08-18 19:01:22,CollectionUtils removeAll is actually retainAll,"The removeAll(Collection collection, Collectio...",COLLECTIONS,0.0,remov collect collect collect remov method cal...,collect util remov actual retain,remov collect collect collect remov method cal...
3,261,[org.apache.commons.collections.map.flat3map.j...,,2007-08-20 14:11:54,Flat3Map.remove() does not return the correct ...,final Flat3Map m = new Flat3Map();\n ...,COLLECTIONS,0.0,flat map m flat map m integ integ m integ inte...,flat map remov doe correct valu size,flat map m flat map m integ integ m integ inte...
4,264,[org.apache.commons.collections.fasttreemap.java],,2007-08-31 09:39:59,FastTreeMap forgets the comparator,In line 359 and 582 of the current 3.2 release...,COLLECTIONS,0.0,line current releas replac map tree map map tr...,fast tree map forget compar,line current releas replac map tree map map tr...
...,...,...,...,...,...,...,...,...,...,...,...
1785,692,[org.wildfly.security.auth.realm.legacypropert...,,2016-11-02 09:35:48,Add tests for special chars in LegacyPropertie...,Add tests for issue https://issues.jboss.org/b...,ELY,0.0,add test issu https issu jboss org brows eli h...,add test special char legaci properti secur realm,add test issu https issu jboss org brows eli h...
1786,691,[org.wildfly.security.auth.realm.legacypropert...,,2016-11-02 09:36:13,Elytron properties-realm is not compatible wit...,When users properties file (e.g. mgmt-users.pr...,ELY,0.0,user properti file e g mgmt user properti use ...,elytron properti realm compat legaci user prop...,user properti file e g mgmt user properti use ...
1787,637,[org.wildfly.security.auth.server.serverauthen...,,2016-11-03 15:03:29,No log messages comming from Elytron - permiss...,Elytron is missing any log messages related to...,ELY,0.0,elytron miss log messag relat permiss assign l...,log messag com elytron permiss assign,elytron miss log messag relat permiss assign l...
1788,757,[org.wildfly.security.ssl.sslauthenticationtes...,,2016-11-21 09:24:47,Don&apos;t use String toUpperCase/toLowerCase ...,The String.toUpperCase() and String.toLowerCas...,ELY,0.0,string upper string lower method depend local ...,don t use string upper lower testsuit local pr...,string upper string lower method depend local ...


Unnamed: 0,filename,unprocessed_code,project,processed_code
0,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS,licens apach softwar foundat asf contributor l...
1,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS,licens apach softwar foundat asf contributor l...
2,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS,licens apach softwar foundat asf contributor l...
3,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS,licens apach softwar foundat asf contributor l...
4,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS,licens apach softwar foundat asf contributor l...
...,...,...,...,...
63,\gitrepo\src\main\java\org\wildfly\security\ut...,"/*\n * JBoss, Home of Professional Open Source...",ELY,jboss home profession open sourc copyright red...
64,\gitrepo\src\main\java\org\wildfly\security\_p...,"/*\n * JBoss, Home of Professional Open Source...",ELY,jboss home profession open sourc copyright red...
65,\gitrepo\src\test\java\org\wildfly\security\ma...,"/*\n * JBoss, Home of Professional Open Source...",ELY,jboss home profession open sourc copyright red...
66,\gitrepo\src\test\java\org\wildfly\security\ma...,"/*\n * JBoss, Home of Professional Open Source...",ELY,jboss home profession open sourc copyright red...


In [4]:
from scipy.sparse import lil_matrix

def getLabelsForBugReport():
    source_file_names_dot_format=source_code_raw["filename"].map(lambda fname:"org"+fname.partition("org")[2].replace("\\","."))

    fileToIndex={}
    for idx,value in enumerate(source_file_names_dot_format.values):
        fileToIndex[value]=idx
    num_files=len(source_file_names_dot_format);
    num_bug_reports=len(bug_report_raw["fix"])
    matrix=lil_matrix((num_bug_reports,num_files))
    
    for idx,file_names in enumerate(bug_report_raw["fix"].values):
        for file_name in file_names:
            try:
                matrix[idx,fileToIndex[file_name]]=1
            except KeyError:
                # note, there are many files which were 'fixes' in bug reports, but where not in the tar files given on d2l.
                # we would assume then that the tar files posted on d2l were probably missing some source files (our theory
                # is the source code is from an older version of the software)
                pass
        return matrix

    arrays=bug_report_raw["fix"].map(getSparseArrayOfIndexesFromFixedFiles)
    

labels=getLabelsForBugReport()
labels

<1790x10461 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in List of Lists format>

In [5]:
def method1():
    method1_vectorizer=TfidfVectorizer(lowercase=True,)
    method1_code_vector=method1_vectorizer.fit_transform(source_code_raw["processed_code"].to_numpy())
    query_vector=method1_vectorizer.transform(bug_report_raw["processed_all"].to_numpy())
    display(method1_code_vector)
    display(query_vector)
    return cosine_similarity(query_vector,method1_code_vector)

similarities=method1()
similarities


<10461x11984 sparse matrix of type '<class 'numpy.float64'>'
	with 910922 stored elements in Compressed Sparse Row format>

<1790x11984 sparse matrix of type '<class 'numpy.float64'>'
	with 66752 stored elements in Compressed Sparse Row format>

array([[7.00226962e-02, 7.84133356e-02, 6.30993610e-03, ...,
        7.64040827e-02, 1.00947386e-03, 9.23715463e-03],
       [1.02292219e-02, 2.08101947e-02, 4.00015104e-03, ...,
        2.46271443e-02, 1.45008980e-03, 1.12057143e-03],
       [1.22357255e-01, 2.96698794e-01, 1.85910434e-02, ...,
        8.86713076e-04, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.10141567e-02, 3.18678375e-04, 5.75575703e-03, ...,
        5.34586298e-03, 4.66788826e-03, 2.11818555e-04],
       [4.68278253e-03, 9.42174647e-03, 1.61858952e-03, ...,
        1.02084572e-02, 8.68649786e-04, 6.31521700e-03],
       [0.00000000e+00, 4.90901004e-03, 0.00000000e+00, ...,
        2.81617019e-02, 2.29831749e-03, 5.92015958e-04]])

In [6]:
mrr_score_method1=label_ranking_average_precision_score(labels.todense(),similarities)
mrr_score_method1

0.9994978717744081