In [4]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import re

In [21]:
def loadEverything():
    all_projects_bugreports = pd.read_pickle('Output/allBugReports.pickle')
    print("*** All Bug Reports are Loaded. ***")
    all_projects_source_codes = pd.read_pickle('Output/allSourceCodes.pickle')
    print("*** All Source Codes are Loaded. ***")
    return all_projects_bugreports, all_projects_source_codes

all_projects_bugreports, all_projects_source_codes = loadEverything()

display(all_projects_source_codes.head())
display(all_projects_bugreports.head())

*** All Bug Reports are Loaded. ***
*** All Source Codes are Loaded. ***


Unnamed: 0,filename,unprocessed_code,project
0,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS
1,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS
2,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS
3,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS
4,\gitrepo\src\java\org\apache\commons\collectio...,/*\n * Licensed to the Apache Software Founda...,COLLECTIONS


Unnamed: 0_level_0,fix,text,fixdate,summary,description,project,average_precision
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
217,[org.apache.commons.collections.map.flat3map.j...,,2006-07-18 22:02:11,Flat3Map.Entry.setValue() overwrites other Ent...,Flat3Map&amp;apos;s Entry objects will overwri...,COLLECTIONS,0.0
214,[org.apache.commons.collections.testextendedpr...,,2006-07-18 22:44:33,ExtendedProperties - field include should be n...,"The field ""include"" in ExtendedProperties is c...",COLLECTIONS,0.0
222,[org.apache.commons.collections.testlistutils....,,2006-08-18 19:01:22,CollectionUtils removeAll is actually retainAll,"The removeAll(Collection collection, Collectio...",COLLECTIONS,0.0
261,[org.apache.commons.collections.map.flat3map.j...,,2007-08-20 14:11:54,Flat3Map.remove() does not return the correct ...,final Flat3Map m = new Flat3Map();\n ...,COLLECTIONS,0.0
264,[org.apache.commons.collections.fasttreemap.java],,2007-08-31 09:39:59,FastTreeMap forgets the comparator,In line 359 and 582 of the current 3.2 release...,COLLECTIONS,0.0


#### Removing New Lines

In [15]:
#remove next line characters:
def remove_new_lines(text):
    text = str(text)
    COMBINE_WHITE_SPACE = re.compile(r"(?a:\s+)")
    text = COMBINE_WHITE_SPACE.sub(' ', text)
    return text.replace('*', '').replace('/', '').replace('\\','')
    
# clean up the various white space and remove some *
def clean_new_lines_source_code(df):
    df.unprocessed_code = df.unprocessed_code.apply(remove_new_lines)
    return df

# clean up the description and summary, they will both be used for the query
def clean_new_lines_bug_report(df):
    df.summary = df.summary.apply(remove_new_lines)
    df['description'] = df['description'].astype('|U')
    df.description = df.description.apply(remove_new_lines)
    return df

#### Cleaning file paths

In [7]:
# changes file path to be just the filename + extension for source code files
def clean_sc_file(x):
    file = x.split("\\")
    return ''.join(file[-1:])

# changes file path to be just the filename + extension for bug report fixes 
def clean_bug_file(x):
    fixes = []

    for file in x:
        file = file.split(".")
        file = '.'.join(file[-2:])
        fixes.append(file)
    return fixes


def clean_sc_filepath(df):
    df.filename = df.filename.map(clean_sc_file)
    return df


def clean_bug_filepath(df):
    df['fix'] = df['fix'].map(clean_bug_file)
    return df

#### Cleaning Composite Words

In [13]:
#splitting composite words
#splits using camlecase syntax
def findCompositeWords(s):
    return ' '.join(re.findall('[A-Z][^A-Z]*', s))   


def clean_composite_source_code(df):
    df.unprocessed_code = df.unprocessed_code.apply(findCompositeWords)
    return df

def clean_composite_bug_report(df):
    df.summary = df.summary.apply(findCompositeWords)
    df.description = df.description.apply(findCompositeWords)
    return df

#### Remove fixes that can't be found

In [8]:
# look through the src data frame to find where the fix is. 
def get_fix_indexes(bug, src):
    fix_list = list()
    for fixes in bug["fix"]:
        fix_sub=list()
        for fix in fixes:
            df = src[src["filename"].str.match(fix)]
            if(df.shape[0] != 0):
                fix_sub.append(df.index[0])
            else:
                fix_sub.append(-1)
        fix_list.append(fix_sub)
    # this is a list of the indexes of the file where the fix was located
    return fix_list

In [9]:
def removeFixesNotFound(bug, src):
    bug["fix_indexes"] = get_fix_indexes(bug, src)
    fixes = bug.fix.tolist()
    fix_indexes = bug.fix_indexes.tolist()
    fixes_return = []
    fixes_indexes_return = []
    for i in range(len(fixes)):
        fixes_temp = []
        indexes_temp = []
        for l in range(len(fix_indexes[i])):
            if fix_indexes[i][l] != -1:           
                fixes_temp.append(fixes[i][l])
                indexes_temp.append(fix_indexes[i][l])
        if len(fixes_temp) == 0:
            fixes_return.append(np.nan)
            fixes_indexes_return.append(np.nan)
        else:
            fixes_return.append(fixes_temp)
            fixes_indexes_return.append(indexes_temp)
#         print(fixes_return)
#         print(fixes_indexes_return)
    bug['fix'] = fixes_return
    bug['fix_indexes'] = fixes_indexes_return 
    
    return bug

#### Calling cleaning functions

In [11]:
# clean up the unprocessed code column
def clean_source_df(df):
    # clean up the new lines
    df = clean_new_lines_source_code(df)
    # clean up composite words
    df = clean_composite_source_code(df)
    # clean filepaths
    df = clean_sc_filepath(df)
    return df

# add the summary and description together and clean the data
def clean_combine_bug_df(df):
    # clean up new lines
    df = clean_new_lines_bug_report(df)
    # clean composite words
    df = clean_composite_bug_report(df)
    # clean file path
    df = clean_bug_filepath(df)
    # combine summary and descriptions to create query
    df["query"] = df["summary"] + df["description"]
    return df



### Run Cleaning and Setup Functions

In [22]:
all_projects_bugreports = all_projects_bugreports.dropna(axis=0, subset=['fix'], how='all')

#  get clean versions of the dataframes
sc_df = clean_source_df(all_projects_source_codes)
br_df = clean_combine_bug_df(all_projects_bugreports)


# remove fixes that aren't found
br_df = removeFixesNotFound(br_df, sc_df)
br_df = br_df.dropna(axis=0, subset=['fix','fix_indexes'], how='all')


### Save the clean DFs as pickle files to prevent having to clean them again

In [24]:
sc_df.to_pickle("./Output/cleanSource.pickle")
br_df.to_pickle("./Output/cleanBugs.pickle")

#### Combining stop words, keywords and operators

In [10]:
# add the Java key words to the stop words
java_keywords = ["abstract", "assert**","assert", "boolean", "break", "byte", "case", "catch", "char", "const", "continue", "default", "do", "double", "else", "enum", "enum****" "extends", "final", "finally", "for", "goto","goto*", "if", "implements", "import", "instanceof", "int","interface", "long", "native", "new", "package", "private", "protected", "public", "return", "short", "static", "strictfp**","strictfp", "super", "switch", "synchornized", "this", "throw", "throws", "transient", "try", "void", "volatile", "while"]
java_operators = ["+", "-", "*", "/", "%", "+=", "-=", "*=", "/=", "++", "--", "==", "!=", "<", ">", "<=", ">=", ".", "[", "]", "(",")", "!", "~","instanceof", "<<", ">>", ">>>", "&", "^", "|", "&&", "||", "?", ":", "^=", "%=", "<<=", ">>=", ">>>=", "&="]
stop = java_keywords + java_operators
#contains english stop words, java keywords and java operators
STOP_WORDS = ENGLISH_STOP_WORDS.union(stop)

# remove the stem and stop words
# takes in an array of strings returns an array of strings
def stem_stop(text):
    stemmer = PorterStemmer()   #"english"
    text = text.split()
    text = [w for w in text if not w in STOP_WORDS]
    text = list(map(lambda x: stemmer.stem(x), text))
    text = ' '.join(text)
    text = text.strip()
    return text



## Gensim


### Get a Series with all the source code files and all the bug reports

In [88]:
# create a series with all the source code and all the
sc_df.reset_index(drop=True, inplace=True)
training_src = sc_df.iloc[:, 0:3].copy()
training_src.columns = ['filename', 'query', 'project']

In [87]:
training_bugs = br_df[["query", "project"]].copy()
training_bugs['filename'] = 'bug'
training_bugs.reset_index(drop=True, inplace=True)
training_bugs = training_bugs[['filename', 'query','project']]

In [71]:
training_data = pd.concat([training_src, training_bugs], ignore_index=True)
training_data

Unnamed: 0,filename,query,project
0,arraystack.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
1,bag.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
2,bagutils.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
3,beanmap.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
4,bidimap.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
...,...,...,...
12140,bug,Undertow H T T P S listener offers no cipher ...,ELY
12141,bug,Missing null check in equals() method of Abst...,ELY
12142,bug,No log messages comming from Elytron - group ...,ELY
12143,bug,Elytron introduces S S L T L S protocol const...,ELY


### Get rid of stop words and stemming

In [86]:
training_data['query'] = training_data['query'].map(stem_stop)
training_data

Unnamed: 0,filename,query,project
0,arraystack.java,licens apach softwar foundat A S F) contributo...,COLLECTIONS
1,bag.java,licens apach softwar foundat A S F) contributo...,COLLECTIONS
2,bagutils.java,licens apach softwar foundat A S F) contributo...,COLLECTIONS
3,beanmap.java,licens apach softwar foundat A S F) contributo...,COLLECTIONS
4,bidimap.java,licens apach softwar foundat A S F) contributo...,COLLECTIONS
...,...,...,...
12140,bug,undertow H T T P S listen offer cipher suit D ...,ELY
12141,bug,miss null check equals() method abstract permi...,ELY
12142,bug,No log messag com elytron group assignmentelyt...,ELY
12143,bug,elytron introduc S S L T L S protocol constrai...,ELY


### Now start training the Gensim model

In [100]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# get the tagged documents for the doc2vec model
training_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(list(training_data['query']))]
training_docs

# initialize model
# vector_size = dimensionality of the feature vectors (25, 100, 200)
# window = the max distance bweteeen the predicted word and context words "TUNE"
# alpha = the initial learning rate (says that this will drop as training progresses)
# seed = for reproducibility (MAKE SURE IT WORKS) "TUNE"  [says you need 1 worker for reproducibility]
# min_count = ignore all words with a frequency lower than this
# max_vocab size = limit RAM during vocab building every 1 million words needs 1GB of RAM
# workers = number of worker threads used to train. 
# epochs = number of epochs over the corpus (10-20??)

# time the single run
%timeit -n1 -r1 doc_model = Doc2Vec(training_docs, vector_size=25, window=3, alpha=0.05, min_count=1, seed=42, workers=1, epochs=20)

# build vocabulary

# train model



5min 41s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [104]:
%timeit -n1 -r1 doc_model2 = Doc2Vec(training_docs, vector_size=100, window=3, alpha=0.05, min_count=1, seed=42, workers=1, epochs=20)


6min 6s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [102]:
%timeit -n1 -r1 doc_model3 = Doc2Vec(training_docs, vector_size=200, window=3, alpha=0.05, min_count=1, seed=42, workers=1, epochs=20)


6min 34s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [103]:
%timeit -n1 -r1 doc_model4 = Doc2Vec(training_docs, vector_size=200, window=3, alpha=0.05, min_count=1, seed=42, workers=4, epochs=20)


3min 10s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%timeit -n1 -r1 doc_model5 = Doc2Vec(training_docs, vector_size=25, window=3, alpha=0.05, min_count=1, seed=42, workers=1, epochs=20)
