In [1]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import re

In [2]:
def loadEverything():
    all_projects_bugreports = pd.read_pickle('Output/allBugReports.pickle')
    print("*** All Bug Reports are Loaded. ***")
    all_projects_source_codes = pd.read_pickle('Output/allSourceCodes.pickle')
    print("*** All Source Codes are Loaded. ***")
    return all_projects_bugreports, all_projects_source_codes

all_projects_bugreports, all_projects_source_codes = loadEverything()

display(all_projects_source_codes.head())
display(all_projects_bugreports.head())

*** All Bug Reports are Loaded. ***
*** All Source Codes are Loaded. ***


Unnamed: 0,filename,unprocessed_code,project
0,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS
1,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS
2,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS
3,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS
4,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS


Unnamed: 0_level_0,fix,text,fixdate,summary,description,project,average_precision
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
217,[org.apache.commons.collections.map.flat3map.j...,,2006-07-18 22:02:11,Flat3Map.Entry.setValue() overwrites other Ent...,Flat3Map&amp;apos;s Entry objects will overwri...,COLLECTIONS,0.0
214,[org.apache.commons.collections.testextendedpr...,,2006-07-18 22:44:33,ExtendedProperties - field include should be n...,"The field ""include"" in ExtendedProperties is c...",COLLECTIONS,0.0
222,[org.apache.commons.collections.testlistutils....,,2006-08-18 19:01:22,CollectionUtils removeAll is actually retainAll,"The removeAll(Collection collection, Collectio...",COLLECTIONS,0.0
261,[org.apache.commons.collections.map.flat3map.j...,,2007-08-20 14:11:54,Flat3Map.remove() does not return the correct ...,final Flat3Map m = new Flat3Map();\n ...,COLLECTIONS,0.0
264,[org.apache.commons.collections.fasttreemap.java],,2007-08-31 09:39:59,FastTreeMap forgets the comparator,In line 359 and 582 of the current 3.2 release...,COLLECTIONS,0.0


#### Removing New Lines

In [3]:
#remove next line characters:
def remove_new_lines(text):
    text = str(text)
    COMBINE_WHITE_SPACE = re.compile(r"(?a:\s+)")
    text = COMBINE_WHITE_SPACE.sub(' ', text)
    return text.replace('*', '').replace('/', '').replace('\\','')
    
# clean up the various white space and remove some *
def clean_new_lines_source_code(df):
    df.unprocessed_code = df.unprocessed_code.apply(remove_new_lines)
    return df

# clean up the description and summary, they will both be used for the query
def clean_new_lines_bug_report(df):
    df.summary = df.summary.apply(remove_new_lines)
    df['description'] = df['description'].astype('|U')
    df.description = df.description.apply(remove_new_lines)
    return df

#### Cleaning file paths

In [4]:
# changes file path to be just the filename + extension for source code files
def clean_sc_file(x):
    file = x.split("\\")
    return ''.join(file[-1:])

# changes file path to be just the filename + extension for bug report fixes 
def clean_bug_file(x):
    fixes = []

    for file in x:
        file = file.split(".")
        file = '.'.join(file[-2:])
        fixes.append(file)
    return fixes


def clean_sc_filepath(df):
    df.filename = df.filename.map(clean_sc_file)
    return df


def clean_bug_filepath(df):
    df['fix'] = df['fix'].map(clean_bug_file)
    return df

#### Cleaning Composite Words

In [5]:
#splitting composite words
#splits using camlecase syntax
def findCompositeWords(s):
    return ' '.join(re.findall('[A-Z][^A-Z]*', s))   


def clean_composite_source_code(df):
    df.unprocessed_code = df.unprocessed_code.apply(findCompositeWords)
    return df

def clean_composite_bug_report(df):
    df.summary = df.summary.apply(findCompositeWords)
    df.description = df.description.apply(findCompositeWords)
    return df

#### Remove fixes that can't be found

In [6]:
# look through the src data frame to find where the fix is. 
def get_fix_indexes(bug, src):
    fix_list = list()
    for fixes in bug["fix"]:
        fix_sub=list()
        for fix in fixes:
            df = src[src["filename"].str.match(fix)]
            if(df.shape[0] != 0):
                fix_sub.append(df.index[0])
            else:
                fix_sub.append(-1)
        fix_list.append(fix_sub)
    # this is a list of the indexes of the file where the fix was located
    return fix_list

In [115]:
def removeFixesNotFound(bug, src):
    bug["fix_indexes"] = get_fix_indexes(bug, src)
    fixes = bug.fix.tolist()
    fix_indexes = bug.fix_indexes.tolist()
    fixes_return = []
    fixes_indexes_return = []
    numFixes = []
    for i in range(len(fixes)):
        fixes_temp = []
        indexes_temp = []
        numFixes.append(len(fix_indexes[i]))
        for l in range(len(fix_indexes[i])):
            if fix_indexes[i][l] != -1:           
                fixes_temp.append(fixes[i][l])
                indexes_temp.append(fix_indexes[i][l])
        if len(fixes_temp) == 0:
            fixes_return.append(np.nan)
            fixes_indexes_return.append(np.nan)
        else:
            fixes_return.append(fixes_temp)
            fixes_indexes_return.append(indexes_temp)
        
#         print(fixes_return)
#         print(fixes_indexes_return)
    bug['numFixes'] = numFixes
    bug['fix'] = fixes_return
    bug['fix_indexes'] = fixes_indexes_return 
    
    return bug

#### Calling cleaning functions

In [8]:
# clean up the unprocessed code column
def clean_source_df(df):
    # clean up the new lines
    df = clean_new_lines_source_code(df)
    # clean up composite words
    df = clean_composite_source_code(df)
    # clean filepaths
    df = clean_sc_filepath(df)
    return df

# add the summary and description together and clean the data
def clean_combine_bug_df(df):
    # clean up new lines
    df = clean_new_lines_bug_report(df)
    # clean composite words
    df = clean_composite_bug_report(df)
    # clean file path
    df = clean_bug_filepath(df)
    # combine summary and descriptions to create query
    df["query"] = df["summary"] + df["description"]
    return df



### Run Cleaning and Setup Functions

In [116]:
all_projects_bugreports = all_projects_bugreports.dropna(axis=0, subset=['fix'], how='all')

#  get clean versions of the dataframes
sc_df = clean_source_df(all_projects_source_codes)
br_df = clean_combine_bug_df(all_projects_bugreports)


# remove fixes that aren't found
br_df = removeFixesNotFound(br_df, sc_df)
br_df = br_df.dropna(axis=0, subset=['fix','fix_indexes'], how='all')


### Save the clean DFs as pickle files to prevent having to clean them again

In [12]:
sc_df.to_pickle("./Output/cleanSource.pickle")
br_df.to_pickle("./Output/cleanBugs.pickle")

#### Combining stop words, keywords and operators

In [14]:
# add the Java key words to the stop words
java_keywords = ["abstract", "assert**","assert", "boolean", "break", "byte", "case", "catch", "char", "const", "continue", "default", "do", "double", "else", "enum", "enum****" "extends", "final", "finally", "for", "goto","goto*", "if", "implements", "import", "instanceof", "int","interface", "long", "native", "new", "package", "private", "protected", "public", "return", "short", "static", "strictfp**","strictfp", "super", "switch", "synchornized", "this", "throw", "throws", "transient", "try", "void", "volatile", "while"]
java_operators = ["+", "-", "*", "/", "%", "+=", "-=", "*=", "/=", "++", "--", "==", "!=", "<", ">", "<=", ">=", ".", "[", "]", "(",")", "!", "~","instanceof", "<<", ">>", ">>>", "&", "^", "|", "&&", "||", "?", ":", "^=", "%=", "<<=", ">>=", ">>>=", "&="]
stop = java_keywords + java_operators
#contains english stop words, java keywords and java operators
STOP_WORDS = ENGLISH_STOP_WORDS.union(stop)

# remove the stem and stop words
# takes in an array of strings returns an array of strings
def stem_stop(text):
    stemmer = PorterStemmer()   #"english"
    text = text.split()
    text = [w for w in text if not w in STOP_WORDS]
    text = list(map(lambda x: stemmer.stem(x), text))
    text = ' '.join(text)
    text = text.strip()
    return text



## Gensim


### Get a Series with all the source code files and all the bug reports

In [15]:
# create a series with all the source code and all the
sc_df.reset_index(drop=True, inplace=True)
training_src = sc_df.iloc[:, 0:3].copy()
training_src.columns = ['filename', 'query', 'project']

In [101]:
training_bugs = br_df[["query", "project"]].copy()
training_bugs['filename'] = 'bug'
training_bugs.reset_index(drop=True, inplace=True)
training_bugs = training_bugs[['filename', 'query','project']]

In [17]:
# combine the two columns to create a single data frame to train the model
training_data = pd.concat([training_src, training_bugs], ignore_index=True)
training_data['query'] = training_data['query'].map(stem_stop)
training_data

Unnamed: 0,filename,query,project
0,arraystack.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
1,bag.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
2,bagutils.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
3,beanmap.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
4,bidimap.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
...,...,...,...
12140,bug,Undertow H T T P S listener offers no cipher ...,ELY
12141,bug,Missing null check in equals() method of Abst...,ELY
12142,bug,No log messages comming from Elytron - group ...,ELY
12143,bug,Elytron introduces S S L T L S protocol const...,ELY


### Now start training the Gensim model

In [19]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# get the tagged documents for the doc2vec model
training_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(list(training_data['query']))]
training_docs

# initialize model
# vector_size = dimensionality of the feature vectors (25, 100, 200)
# window = the max distance bweteeen the predicted word and context words "TUNE"
# alpha = the initial learning rate (says that this will drop as training progresses) start at 0.05? 0.025?
# seed = for reproducibility (MAKE SURE IT WORKS) "TUNE"  [says you need 1 worker for reproducibility]
# min_count = ignore all words with a frequency lower than this
# max_vocab size = limit RAM during vocab building every 1 million words needs 1GB of RAM
# workers = number of worker threads used to train. 
# epochs = number of epochs over the corpus (10-20??)

# time the single run
%timeit -n1 -r1 doc_model = Doc2Vec(training_docs, vector_size=25, window=3, alpha=0.05, min_count=1, seed=42, workers=1, epochs=20)

# build vocabulary

# train model



5min 41s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [20]:
%timeit -n1 -r1 doc_model3 = Doc2Vec(training_docs, vector_size=200, window=3, alpha=0.05, min_count=1, seed=42, workers=1, epochs=20)


6min 25s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [105]:
%timeit -n1 -r1 doc_model5 = Doc2Vec(training_docs, vector_size=200, window=3, alpha=0.05, min_count=1, seed=42, workers=6, epochs=20)


3min 6s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## Currently working with model3

In [29]:
# print the model vocabular
tst_model = Doc2Vec(training_docs, vector_size=200, window=3, alpha=0.05, min_count=1, seed=42, workers=1, epochs=20)
tst_model.save("test_model")

In [68]:
training_data["vector"] =""
for i in range(12145):
    training_data["vector"].iloc[i] = tst_model[i]
training_data

Unnamed: 0,filename,query,project,vector,doc_vector
0,arraystack.java,licens apach softwar foundat A S F) contributo...,COLLECTIONS,"[0.6373648, 0.36343274, -0.21556364, 0.0261671...",
1,bag.java,licens apach softwar foundat A S F) contributo...,COLLECTIONS,"[0.39248475, 0.9095622, -0.31791908, -0.447419...",
2,bagutils.java,licens apach softwar foundat A S F) contributo...,COLLECTIONS,"[-0.3806162, 0.014073659, 0.83630836, 0.206107...",
3,beanmap.java,licens apach softwar foundat A S F) contributo...,COLLECTIONS,"[-0.018915145, -0.20437856, 0.5265517, -0.2119...",
4,bidimap.java,licens apach softwar foundat A S F) contributo...,COLLECTIONS,"[0.4636378, 0.3857351, -0.26127335, -0.1524405...",
...,...,...,...,...,...
12140,bug,undertow H T T P S listen offer cipher suit D ...,ELY,"[0.5873729, -0.26671422, 0.26491788, 0.1084738...",
12141,bug,miss null check equals() method abstract permi...,ELY,"[-0.41595843, -0.3777295, 0.15903723, -0.05837...",
12142,bug,No log messag com elytron group assignmentelyt...,ELY,"[0.10762189, 0.04025764, 0.188332, 0.17249116,...",
12143,bug,elytron introduc S S L T L S protocol constrai...,ELY,"[-0.066642396, -0.86785316, 0.024531504, 0.084...",


In [46]:
print(len(training_data["vector"].iloc[0]))
print(len(training_data["vector"].iloc[1]))
print(len(training_data["vector"].iloc[2]))
print(len(training_data["vector"].iloc[12144]))

200
200
200
200


In [65]:
# testing some of the similarities and the .infer_vector function
from sklearn.metrics.pairwise import cosine_similarity
cosSim = cosine_similarity(training_data["vector"].iloc[0].reshape(1,-1), training_data["vector"].iloc[12144].reshape(1,-1)).flatten()
display(cosSim)
cosSim2 = cosine_similarity(training_data["vector"].iloc[0].reshape(1,-1), training_data["vector"].iloc[12143].reshape(1,-1)).flatten()
display(cosSim2)
tst_vector = tst_model.infer_vector(training_data["query"].iloc[12143].split())


# test to se what the .infer_vector function does
print(training_data["vector"].iloc[12143][:10])

# the test vector comes out different every time
print(tst_vector[:10])
cosSim3 = cosine_similarity(training_data["vector"].iloc[0].reshape(1,-1), tst_vector.reshape(1,-1)).flatten()
cosSim3

array([0.16581872], dtype=float32)

array([-0.07493816], dtype=float32)

[-0.0666424  -0.86785316  0.0245315   0.08457747 -0.78628224  1.4840965
  0.6094659   0.11590197  0.4568097  -0.42331797]
[ 0.14347313 -1.155734    1.0134165  -0.11994062 -0.18458024  0.6229553
 -0.04828192  0.03377128  0.66259235 -0.23978281]


array([-0.04695935], dtype=float32)

### Add the vector column to the orginal data frames (sc_df & br_df)

In [117]:
sc_df["vector"] = training_data.loc[0:sc_df.shape[0], 'vector']

t_bugs = training_data.iloc[sc_df.shape[0]:training_data.shape[0]].copy()
vect_list = t_bugs.vector.tolist()
br_df['vector'] = vect_list


### Group the data again, perform same functions as before

In [129]:
# get a list of the projects 
projects = sc_df.project.unique()

# group the data frames
sc_grouped_df = sc_df.groupby(sc_df.project)
bg_grouped_df = br_df.groupby(br_df.project)

### Run slightly different version of method 2 code to generate similarity scores
- We don't have to train and run a vectorizer. 
- Just have to iterate through the bugs in the project and generate similarity scores between it's vector and the vector for each of the source code files (direct and indirect)
- For each query we want to have an array of similarity scores where each item is for a source code file

### Calculate Direct and Indirect scores

In [124]:
# code for determining the number of terms in a source code file

# min max scaler 
def custom_min_max(arr):
    min_val = np.amin(arr)
    max_val = np.amax(arr)
    f = lambda x: (x - min_val) / (max_val - min_val)
    result = f(arr)
        
    return result

# generate number of terms based off of length of file 
def gen_num_terms(len_arr):
    len_norm = custom_min_max(len_arr)
    f = lambda x: 1 / (1 + np.exp(-1 * x))
    num_terms = f(len_norm)
    return num_terms
    

In [125]:
# calculate the similarity when using revised Vector Space Model
def calculate_rVSM_similarity(vect, src_vect, query_data, num_terms):
    query = vect.transform(query_data)
    cosSim = cosine_similarity(query, src_vect).flatten()
#     result = np.multiply(cosSim, num_terms)
#     return result

    for i in range(len(cosSim)):
        cosSim[i] = cosSim[i] * num_terms[i]
    return cosSim

# generates direct and indirect scores
# source - dataframe for source code files
# query - dataframe for 
def generate_scores_list(source, query):
    direct_scores = []
    indirect_scores = []
    
    # create hash lookup table to decrease search time for filename index.
    lookup_table = dict()
    names = source.filename.tolist()
    for i in range(len(names)):
        lookup_table[names[i]] = i
    
    # used to define the number of terms for each source code file
    source_lengths = source['unprocessed_code'].map(lambda x: len(x.split()))
    
    # get the number of terms for each file
    num_terms_list = gen_num_terms(source_lengths)
    
    # get the DIRECT and INDIRECT similarity scores for the bug reports
    src_code_len = len(source['unprocessed_code'])
    prev_bugs = query["fix"].tolist()
    num_fixes = query["numFixes"].tolist()
    for q in query["query"]:
        # calculate direct similarity and append it to the list
        similarity = calculate_rVSM_similarity(vect, src_vect, [q], num_terms_list)
        direct_scores.append(similarity)   
        
        # calculate indirect similarity and append it to the list
        
        indirect_similarity = calculate_indirect_scores(src_code_len, [q], 
                                                        query, prev_bugs, num_fixes, lookup_table)
        indirect_scores.append(indirect_similarity)

#     print(len(direct_scores))
#     print(len(indirect_scores))
    return direct_scores, indirect_scores

# caclulate the similarity between new bugs and old bugs.
def calculate_indirect_scores(src_len, query_data, query_df, prev_bugs, num_fixes, table):

 
    
    # np array of zeros, update the values as needed.
    sim_scores = np.zeros(src_len)

    bug_vect = vect.transform(query_df["query"])
    
    # CAN'T COMPARE A BUG TO IT'S SELF
    # get similarity between the query and the prev bug query
    bugs_sim = calculate_similarity(bug_vect, query) # one of the entries should be 1
    
    num_bugs = len(prev_bugs)
#     print(num_bugs)
#     print(len(bugs_sim))
    for indx in range(num_bugs):
        
#         print("Index: ", indx)

    # get the number of fixes, used for calculating similarity
        num_fix = num_fixes[indx]
        
        # for each fix find it's index in the source['filename'] column
        for fix_indx in range(num_fix):
#             print("fix index: ", fix_indx)
#             print("Previous Bugs at indx:" , prev_bugs[indx])
#             print("Single bug? " ,prev_bugs[indx][fix_indx] )
            sim_indx = table.get(prev_bugs[indx][fix_indx])
            if(sim_indx):
                if(bugs_sim[indx] == 1):
                    # don't add the similarity values if they are for the same bug
                    pass
                else:
                    sim_scores[sim_indx] = sim_scores[sim_indx] + (bugs_sim[indx]/num_fix)
            else:
                missing_count += 1
    # now we have a list of indirect similarity scores for a single bug and all src code files
#     print("Number missing: ", missing_count)
    return sim_scores

    
    

### Rank the similarity scores and Compute MAP and MRR

In [126]:
# rank all the similarity scores
def rank_sim_scores(scores):
    sim_scores = list()
    
    for score in scores:
        indicies = range(len(score))

        scores_tuple = tuple(zip(score,indicies))
        sorted_tuple = sorted(scores_tuple, reverse = True)

        sim_scores.append(sorted_tuple)
    
    return sim_scores

#Checks the average precision for each bug
def average_precision(fix_indexes,ranked_sim):
    ap_list = list()
    for fixes,ranked_list in zip(fix_indexes,ranked_sim):
        hit_list = list()
        countTrue=0
        for i in range(len(ranked_list)):
            # check if source file is actually where bug is located
            if(ranked_list[i][1] in fixes):
                countTrue+=1
                hit_list.append(countTrue/(i+1))
        if(countTrue != 0):
            ap_list.append(sum(hit_list)/countTrue)
        else:
            ap_list.append(0)
    return ap_list


#reciprocal rank is 1/n, where n is the first position of a source file where the bug is located in the ranked_sim column
def reciprocal_rank(fix_indexes,ranked_sim):
    rr_list = list()
    for fixes,ranked_list in zip(fix_indexes,ranked_sim):
        rr = 0
        for i in range(len(ranked_list)):
            # check if source file is actually where bug is located
            if(ranked_list[i][1] in fixes):
                rr = 1/(i+1)
                break
        rr_list.append(rr)
    return rr_list

# Gets a list containing the rank of all fixes that were found in the ranked similarity list
def get_fix_rank(bug, isCosineSim=True):
    fix_list = list()
    ranked_sim = 'ranked_sim'
    if not isCosineSim:
        ranked_sim = 'ranked_eq7_sim'
    for index, row in bug.iterrows():
        i_list = list()
        for i in range(len(row[ranked_sim])):
            if(row[ranked_sim][i][1] in row['fix_indexes']):
                i_list.append(i+1)
        fix_list.append(i_list)
    return fix_list

### Get metrics into the data frames

In [127]:
# take in the source code df for a project and a single query return scores
import warnings
warnings.filterwarnings("ignore")

def generate_all_scores():
    
    all_bugs = []
    all_src = []
    # iterate through the list of 12 projects
    i = 0
    for proj in projects:
        print("Getting scores for project ",proj,"...")
        # create dataframes for each project
        src_df = sc_grouped_df.get_group(proj)
        bug_df = bg_grouped_df.get_group(proj).copy()
        
        # generate the direct and indirect scores
        direct_scores, indirect_scores = generate_scores_list(src_df, bug_df)
        
        #append direct scores list to the bug dataframe
        bug_df["direct_sim"] = direct_scores # the only way that the matrix is related to the src code 
                                        # is through the index.
            
        #append indirect scores list to bug dataframe
        bug_df["indirect_sim"] = indirect_scores
        
        
        bug_df["fix_indexes"] = get_fix_indexes(bug_df, src_df)

        
      
        # maintain a list of all the dataframes
        all_bugs.append(bug_df)
        all_src.append(src_df)
    # concatenate all the data frames in order    
    all_bug_df = pd.concat(all_bugs, ignore_index=True)
    all_src_df = pd.concat(all_src, ignore_index=True)
    return all_bug_df, all_src_df

In [130]:
bugs, sources = generate_all_scores()

Getting scores for project  COLLECTIONS ...


NameError: name 'vect' is not defined