In [187]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import re
import math

In [232]:
def loadEverything():
    all_projects_bugreports = pd.read_pickle('GlobalOutput/allBugReports.pickle')
    print("*** All Bug Reports are Loaded. ***")
    all_projects_source_codes = pd.read_pickle('GlobalOutput/allSourceCodes.pickle')
    print("*** All Source Codes are Loaded. ***")
    return all_projects_bugreports, all_projects_source_codes

all_projects_bugreports, all_projects_source_codes = loadEverything()

display(all_projects_source_codes.head())
display(all_projects_bugreports.head())

*** All Bug Reports are Loaded. ***
*** All Source Codes are Loaded. ***


Unnamed: 0,filename,unprocessed_code,project
0,\gitrepo\camel-core\src\main\java\org\apache\c...,/**\n * Licensed to the Apache Software Founda...,CAMEL
1,\gitrepo\camel-core\src\main\java\org\apache\c...,/**\n * Licensed to the Apache Software Founda...,CAMEL
2,\gitrepo\camel-core\src\main\java\org\apache\c...,/**\n * Licensed to the Apache Software Founda...,CAMEL
3,\gitrepo\camel-core\src\main\java\org\apache\c...,/**\n * Licensed to the Apache Software Founda...,CAMEL
4,\gitrepo\camel-core\src\main\java\org\apache\c...,/**\n * Licensed to the Apache Software Founda...,CAMEL


Unnamed: 0_level_0,fix,text,fixdate,summary,description,project,average_precision
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
72,[org.apache.camel.component.file.fileconfigure...,,2007-07-09 09:00:19,FileConfigureTest can&apos;t pass in Windows box,Because of the File.separator is different bet...,CAMEL,0.0
81,[org.apache.camel.impl.servicesupport.java],,2007-07-30 16:49:10,Stop logic a bit off in ServiceSupport.java,"With the current logic, during stop the servic...",CAMEL,0.0
85,[org.apache.camel.component.vm.vmcomponent.java],,2007-08-03 20:18:56,VM Component should extend Seda not Queue,It appears that the deprecation of the Queue c...,CAMEL,0.0
105,[org.apache.camel.component.file.fileproducer....,,2007-08-14 21:43:05,FileProducer truncates message bodies > 256KB,Thanks to NIO&amp;apos;s awesomely intuitive b...,CAMEL,0.0
103,[org.apache.camel.spring.camelcontextfactorybe...,,2007-08-17 04:42:11,ClassCastException when using GenericApplicati...,\nCaused by: java.lang.ClassCastException:\nor...,CAMEL,0.0


#### Removing New Lines

In [233]:
#remove next line characters:
def remove_new_lines(text):
    text = str(text)
    COMBINE_WHITE_SPACE = re.compile(r"(?a:\s+)")
    text = COMBINE_WHITE_SPACE.sub(' ', text)
    return text.replace('*', '').replace('/', '').replace('\\','')
    
# clean up the various white space and remove some *
def clean_new_lines_source_code(df):
    df.unprocessed_code = df.unprocessed_code.apply(remove_new_lines)
    return df

# clean up the description and summary, they will both be used for the query
def clean_new_lines_bug_report(df):
    df.summary = df.summary.apply(remove_new_lines)
    df['description'] = df['description'].astype('|U')
    df.description = df.description.apply(remove_new_lines)
    return df

#### Cleaning file paths

In [234]:
# changes file path to be just the filename + extension for source code files
def clean_sc_file(x):
    file = x.split("\\")
    return ''.join(file[-1:])

# changes file path to be just the filename + extension for bug report fixes 
def clean_bug_file(x):
    fixes = []

    for file in x:
        file = file.split(".")
        file = '.'.join(file[-2:])
        fixes.append(file)
    return fixes


def clean_sc_filepath(df):
    df.filename = df.filename.map(clean_sc_file)
    return df


def clean_bug_filepath(df):
    df['fix'] = df['fix'].map(clean_bug_file)
    return df

#### Cleaning Composite Words

In [235]:
#splitting composite words
#splits using camlecase syntax
def findCompositeWords(s):
    return ' '.join(re.findall('[A-Z][^A-Z]*', s))   


def clean_composite_source_code(df):
    df.unprocessed_code = df.unprocessed_code.apply(findCompositeWords)
    return df

def clean_composite_bug_report(df):
    df.summary = df.summary.apply(findCompositeWords)
    df.description = df.description.apply(findCompositeWords)
    return df

#### Remove fixes that can't be found

In [236]:
# look through the src data frame to find where the fix is. 
def get_fix_indexes(bug, src):
    fix_list = list()
    for fixes in bug["fix"]:
        fix_sub=list()
        for fix in fixes:
            df = src[src["filename"].str.match(fix)]
            if(df.shape[0] != 0):
                fix_sub.append(df.index[0])
            else:
                fix_sub.append(-1)
        fix_list.append(fix_sub)
    # this is a list of the indexes of the file where the fix was located
    return fix_list

In [237]:
def removeFixesNotFound(bug, src):
    bug["fix_indexes"] = get_fix_indexes(bug, src)
    fixes = bug.fix.tolist()
    fix_indexes = bug.fix_indexes.tolist()
    fixes_return = []
    fixes_indexes_return = []
    numFixes = []
    for i in range(len(fixes)):
        fixes_temp = []
        indexes_temp = []
        numFixes.append(len(fix_indexes[i]))
        for l in range(len(fix_indexes[i])):
            if fix_indexes[i][l] != -1:           
                fixes_temp.append(fixes[i][l])
                indexes_temp.append(fix_indexes[i][l])
        if len(fixes_temp) == 0:
            fixes_return.append(np.nan)
            fixes_indexes_return.append(np.nan)
        else:
            fixes_return.append(fixes_temp)
            fixes_indexes_return.append(indexes_temp)
        
#         print(fixes_return)
#         print(fixes_indexes_return)
    bug['numFixes'] = numFixes
    bug['fix'] = fixes_return
    bug['fix_indexes'] = fixes_indexes_return 
    
    return bug

#### Calling cleaning functions

In [238]:
# clean up the unprocessed code column
def clean_source_df(df):
    # clean up the new lines
    df = clean_new_lines_source_code(df)
    # clean up composite words
    df = clean_composite_source_code(df)
    # clean filepaths
    df = clean_sc_filepath(df)
    return df

# add the summary and description together and clean the data
def clean_combine_bug_df(df):
    # clean up new lines
    df = clean_new_lines_bug_report(df)
    # clean composite words
    df = clean_composite_bug_report(df)
    # clean file path
    df = clean_bug_filepath(df)
    # combine summary and descriptions to create query
    df["query"] = df["summary"] + df["description"]
    return df



### Run Cleaning and Setup Functions

In [239]:
all_projects_bugreports = all_projects_bugreports.dropna(axis=0, subset=['fix'], how='all')

#  get clean versions of the dataframes
sc_df = clean_source_df(all_projects_source_codes)
br_df = clean_combine_bug_df(all_projects_bugreports)


# remove fixes that aren't found
br_df = removeFixesNotFound(br_df, sc_df)
br_df = br_df.dropna(axis=0, subset=['fix','fix_indexes'], how='all')


### Save the clean DFs as pickle files to prevent having to clean them again

In [240]:
sc_df.to_pickle("./GlobalOutput/cleanSource.pickle")
br_df.to_pickle("./GlobalOutput/cleanBugs.pickle")

#### Combining stop words, keywords and operators

In [241]:
# add the Java key words to the stop words
java_keywords = ["abstract", "assert**","assert", "boolean", "break", "byte", "case", "catch", "char", "const", "continue", "default", "do", "double", "else", "enum", "enum****" "extends", "final", "finally", "for", "goto","goto*", "if", "implements", "import", "instanceof", "int","interface", "long", "native", "new", "package", "private", "protected", "public", "return", "short", "static", "strictfp**","strictfp", "super", "switch", "synchornized", "this", "throw", "throws", "transient", "try", "void", "volatile", "while"]
java_operators = ["+", "-", "*", "/", "%", "+=", "-=", "*=", "/=", "++", "--", "==", "!=", "<", ">", "<=", ">=", ".", "[", "]", "(",")", "!", "~","instanceof", "<<", ">>", ">>>", "&", "^", "|", "&&", "||", "?", ":", "^=", "%=", "<<=", ">>=", ">>>=", "&="]
stop = java_keywords + java_operators
#contains english stop words, java keywords and java operators
STOP_WORDS = ENGLISH_STOP_WORDS.union(stop)

# remove the stem and stop words
# takes in an array of strings returns an array of strings
def stem_stop(text):
    stemmer = PorterStemmer()   #"english"
    text = text.split()
    text = [w for w in text if not w in STOP_WORDS]
    text = list(map(lambda x: stemmer.stem(x), text))
    text = ' '.join(text)
    text = text.strip()
    return text



## Gensim


### Get a Series with all the source code files and all the bug reports

In [242]:
# create a series with all the source code and all the
sc_df.reset_index(drop=True, inplace=True)
training_src = sc_df.iloc[:, 0:3].copy()
training_src.columns = ['filename', 'query', 'project']

In [243]:
training_bugs = br_df[["query", "project"]].copy()
training_bugs['filename'] = 'bug'
training_bugs.reset_index(drop=True, inplace=True)
training_bugs = training_bugs[['filename', 'query','project']]

In [244]:
# combine the two columns to create a single data frame to train the model
training_data = pd.concat([training_src, training_bugs], ignore_index=True)
training_data['query'] = training_data['query'].map(stem_stop)
training_data

Unnamed: 0,filename,query,project
0,alreadystoppedexception.java,licens apach softwar foundat A S F) contributo...,CAMEL
1,asynccallback.java,licens apach softwar foundat A S F) contributo...,CAMEL
2,asyncprocessor.java,licens apach softwar foundat A S F) contributo...,CAMEL
3,asyncproducercallback.java,licens apach softwar foundat A S F) contributo...,CAMEL
4,attachments.java,licens apach softwar foundat A S F) contributo...,CAMEL
...,...,...,...
82377,bug,X M L parser load eagerlyw need make use A P I...,WFLY
82378,bug,cluster X M L reader load eagerlyp M] radoslav...,WFLY
82379,bug,cannot commun server port offset properti seti...,WFMP
82380,bug,commands#fail On error default fals unless set...,WFMP


In [245]:
training_data.to_pickle("./GlobalOutput/cleanTrainingData.pickle")

### Now start training the Gensim model

In [255]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from datetime import datetime
# get the tagged documents for the doc2vec model
training_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(list(training_data['query']))]
training_docs

# initialize model
# dm = 0 This will set the model to use DBOW and not DMPV
# vector_size = 300
# window = DBOW = 15, DMPV = 5
# alpha = 0.01
# min_alpha = 0.0001
# seed = for reproducibility (MAKE SURE IT WORKS) "TUNE"  [says you need 1 worker for reproducibility]
# min_count = 1
# workers = number of worker threads used to train. 
# epochs = DBOW = 20, DMPV = 600

# time the single run with 6 workers Started at 2:43

dateTimeObj1 = datetime.now()
print(dateTimeObj1)
doc_model_dbow = Doc2Vec(training_docs, dm=0, vector_size=300, window=15, alpha=0.01, min_alpha=0.0001, min_count=1, seed=42, workers=6, epochs=20)
doc_model_dbow.save("dbow_model")

dateTimeObj2 = datetime.now()
print(dateTimeObj2)

doc_model_pvdm = Doc2Vec(training_docs, dm=1, vector_size=300, window=5, alpha=0.01, min_alpha=0.0001, min_count=1, seed=42, workers=6, epochs=600)
doc_model_pvdm.save("pvdm_model")

dateTimeObj3 = datetime.now()
print(dateTimeObj3)

2020-12-17 16:24:39.344846
2020-12-17 16:41:51.378098


KeyboardInterrupt: 

In [250]:
training_data["vector"] =""
for i in range(82382):
    training_data["vector"].iloc[i] = doc_model[i]
training_data

Unnamed: 0,filename,query,project,vector
0,alreadystoppedexception.java,licens apach softwar foundat A S F) contributo...,CAMEL,"[-0.11593803, 0.3264231, -0.39266086, -0.17832..."
1,asynccallback.java,licens apach softwar foundat A S F) contributo...,CAMEL,"[-0.09054362, 0.2255133, -0.33315194, -0.30768..."
2,asyncprocessor.java,licens apach softwar foundat A S F) contributo...,CAMEL,"[0.14799617, 0.19888866, -0.36354512, -0.49668..."
3,asyncproducercallback.java,licens apach softwar foundat A S F) contributo...,CAMEL,"[0.01663894, 0.16521521, -0.1554454, -0.478559..."
4,attachments.java,licens apach softwar foundat A S F) contributo...,CAMEL,"[-0.072032474, 0.084826775, 0.0063502714, -0.2..."
...,...,...,...,...
82377,bug,X M L parser load eagerlyw need make use A P I...,WFLY,"[-0.004771417, 0.17575903, -0.24528211, -0.152..."
82378,bug,cluster X M L reader load eagerlyp M] radoslav...,WFLY,"[-0.047766443, 0.122389704, -0.3051204, -0.101..."
82379,bug,cannot commun server port offset properti seti...,WFMP,"[0.08153363, 0.35014084, -0.09306045, -0.25451..."
82380,bug,commands#fail On error default fals unless set...,WFMP,"[-0.044513583, 0.27791062, -0.1961836, 0.01690..."


In [252]:
# all of the vectors have 300 dimensions
print(len(training_data["vector"].iloc[0]))
print(len(training_data["vector"].iloc[1]))
print(len(training_data["vector"].iloc[82381]))
print(len(training_data["vector"].iloc[12144]))

300
300
300
300


In [253]:
# testing some of the similarities and the .infer_vector function
from sklearn.metrics.pairwise import cosine_similarity
cosSim = cosine_similarity(training_data["vector"].iloc[0].reshape(1,-1), training_data["vector"].iloc[12144].reshape(1,-1)).flatten()
display(cosSim)
cosSim2 = cosine_similarity(training_data["vector"].iloc[0].reshape(1,-1), training_data["vector"].iloc[12143].reshape(1,-1)).flatten()
display(cosSim2)
tst_vector = tst_model.infer_vector(training_data["query"].iloc[12143].split())


display(training_data["vector"].shape)
lst_vects = training_data["vector"].tolist()
display(len(lst_vects))
print("list")
simList = cosine_similarity(training_data['vector'].iloc[12143].reshape(1,-1), lst_vects).flatten()
display(simList)
print(len(simList))

array([0.1862311], dtype=float32)

array([0.44858396], dtype=float32)

(82382,)

82382

list


array([0.44858396, 0.40365702, 0.41283527, ..., 0.17609774, 0.18451622,
       0.33470625], dtype=float32)

82382


### Add the vector column to the orginal data frames (sc_df & br_df)

In [117]:
sc_df["vector"] = training_data.loc[0:sc_df.shape[0], 'vector']

t_bugs = training_data.iloc[sc_df.shape[0]:training_data.shape[0]].copy()
vect_list = t_bugs.vector.tolist()
br_df['vector'] = vect_list


### Group the data again, perform same functions as before

In [129]:
# get a list of the projects 
projects = sc_df.project.unique()

# group the data frames
sc_grouped_df = sc_df.groupby(sc_df.project)
bg_grouped_df = br_df.groupby(br_df.project)

### Run slightly different version of method 2 code to generate similarity scores
- We don't have to train and run a vectorizer. 
- Just have to iterate through the bugs in the project and generate similarity scores between it's vector and the vector for each of the source code files (direct and indirect)
- For each query we want to have an array of similarity scores where each item is for a source code file

### Calculate Direct and Indirect scores

In [124]:
# code for determining the number of terms in a source code file

# min max scaler 
def custom_min_max(arr):
    min_val = np.amin(arr)
    max_val = np.amax(arr)
    f = lambda x: (x - min_val) / (max_val - min_val)
    result = f(arr)
        
    return result

# generate number of terms based off of length of file 
def gen_num_terms(len_arr):
    len_norm = custom_min_max(len_arr)
    f = lambda x: 1 / (1 + np.exp(-1 * x))
    num_terms = f(len_norm)
    return num_terms
    

In [228]:
# calculate the similarity when using revised Vector Space Model
def calculate_rVSM_similarity(src_vect, query, num_terms):
    result = []
    cosSim = cosine_similarity(query.reshape(1,-1), src_vect).flatten()
  
    for i in range(len(cosSim)):   
        result.append(cosSim[i] * num_terms[i])
    return cosSim

# calculate the similarity when using basic cosine similarity, used in indirect similarity calculation   
def calculate_similarity(src_vect, query):    
    cosSim = cosine_similarity(query.reshape(1,-1), src_vect).flatten()
    return cosSim


# generates direct and indirect scores
# source - dataframe for source code files
# query - dataframe for 
def generate_scores_list(source, query):
    direct_scores = []
    indirect_scores = []
    
    # create hash lookup table to decrease search time for filename index.
    lookup_table = dict()
    names = source.filename.tolist()
    for i in range(len(names)):
        lookup_table[names[i]] = i
    
    # used to define the number of terms for each source code file
    source_lengths = source['unprocessed_code'].map(lambda x: len(x.split())).tolist()
    
    # get the number of terms for each file
    num_terms_list = gen_num_terms(source_lengths)
    
    # get the DIRECT and INDIRECT similarity scores for the bug reports
    src_vects = source['vector'].tolist()
    src_code_len = len(source['unprocessed_code'])
    prev_bug_fixes = query["fix"].tolist()
    num_fixes = query["numFixes"].tolist()
    prev_vects = query['vector'].tolist()
    
    for q in prev_vects:
        # calculate direct similarity and append it to the list
        similarity = calculate_rVSM_similarity(src_vects, q, num_terms_list)
        direct_scores.append(similarity)

        
#         print("Its length: ", len(similarity))
#         print("Min: ", min(similarity))
#         print("Max: ", max(similarity))


        # calculate indirect similarity and append it to the list
        
        # num_fixes is the previous number of fixes, not how many remain in the array
        indirect_similarity = calculate_indirect_scores(src_code_len, q, prev_vects, 
                                                        prev_bug_fixes, num_fixes, lookup_table)
        indirect_scores.append(indirect_similarity)

   
    return direct_scores, indirect_scores

# caclulate the similarity between new bugs and old bugs.
def calculate_indirect_scores(src_len, query_vect, prev_vects, prev_bugs, num_fixes, table):
    sim_1 = 0
    # np array of zeros, indexed to match the source code files.
    sim_scores = np.zeros(src_len)
    
    # CAN'T COMPARE A BUG TO IT'S SELF
    # get similarities between the query and all the prev bug query
    bugs_sim = calculate_similarity(prev_vects, query_vect) # one of the entries should be 1
    
#     print("Min: ", min(bugs_sim))
#     print("Max: ", max(bugs_sim))

    num_bugs = len(prev_bugs)

    # for every bug find where it's fixes were found and update the similarity score 
    #    at that index in the sim_scores add the similarity score
    for indx in range(num_bugs):
        
    # get the number of fixes, used for calculating similarity
        num_fix = num_fixes[indx]
        

#         print("The original number of fixes: ", num_fixes[indx])
#         print("The current number of fixes: ", len(prev_bugs[indx]))
#         print(prev_bugs[indx])


        # for each fix find it's index in the source['filename'] column
        for fix_indx in range(len(prev_bugs[indx])):

            sim_indx = table.get(prev_bugs[indx][fix_indx])
            if(sim_indx):
                if(math.isclose(bugs_sim[indx], 1, abs_tol=0.0001)):
                    # don't add the similarity values if they are for the same bug
                    sim_1 +=1
                    pass
                else:
                    sim_scores[sim_indx] = sim_scores[sim_indx] + (bugs_sim[indx]/num_fix)

    # now we have a list of indirect similarity scores for a single bug and all src code files

    
#     print("The number of similarities = 1: ", sim_1)
#     print("It should be: " , num_fixes)
    return sim_scores

    
    

### Rank the similarity scores and Compute MAP and MRR

In [126]:
# rank all the similarity scores
def rank_sim_scores(scores):
    sim_scores = list()
    
    for score in scores:
        indicies = range(len(score))

        scores_tuple = tuple(zip(score,indicies))
        sorted_tuple = sorted(scores_tuple, reverse = True)

        sim_scores.append(sorted_tuple)
    
    return sim_scores

#Checks the average precision for each bug
def average_precision(fix_indexes,ranked_sim):
    ap_list = list()
    for fixes,ranked_list in zip(fix_indexes,ranked_sim):
        hit_list = list()
        countTrue=0
        for i in range(len(ranked_list)):
            # check if source file is actually where bug is located
            if(ranked_list[i][1] in fixes):
                countTrue+=1
                hit_list.append(countTrue/(i+1))
        if(countTrue != 0):
            ap_list.append(sum(hit_list)/countTrue)
        else:
            ap_list.append(0)
    return ap_list


#reciprocal rank is 1/n, where n is the first position of a source file where the bug is located in the ranked_sim column
def reciprocal_rank(fix_indexes,ranked_sim):
    rr_list = list()
    for fixes,ranked_list in zip(fix_indexes,ranked_sim):
        rr = 0
        for i in range(len(ranked_list)):
            # check if source file is actually where bug is located
            if(ranked_list[i][1] in fixes):
                rr = 1/(i+1)
                break
        rr_list.append(rr)
    return rr_list

# Gets a list containing the rank of all fixes that were found in the ranked similarity list
def get_fix_rank(bug, isCosineSim=True):
    fix_list = list()
    ranked_sim = 'ranked_sim'
    if not isCosineSim:
        ranked_sim = 'ranked_eq7_sim'
    for index, row in bug.iterrows():
        i_list = list()
        for i in range(len(row[ranked_sim])):
            if(row[ranked_sim][i][1] in row['fix_indexes']):
                i_list.append(i+1)
        fix_list.append(i_list)
    return fix_list

### Get metrics into the data frames

In [229]:
# take in the source code df for a project and a single query return scores
import warnings
warnings.filterwarnings("ignore")

projects2 = projects[1:2]

def generate_all_scores():
    
    all_bugs = []
    all_src = []
    # iterate through the list of 12 projects
    for proj in projects:
        print("Getting scores for project ",proj,"...")
        # create dataframes for each project
        src_df = sc_grouped_df.get_group(proj)
        bug_df = bg_grouped_df.get_group(proj).copy()
        
        # generate the direct and indirect scores
        direct_scores, indirect_scores = generate_scores_list(src_df, bug_df)
        
        #append direct scores list to the bug dataframe
        bug_df["direct_sim"] = direct_scores # the only way that the matrix is related to the src code 
                                        # is through the index.
            
        #append indirect scores list to bug dataframe
        bug_df["indirect_sim"] = indirect_scores
        
        
        bug_df["fix_indexes"] = get_fix_indexes(bug_df, src_df)

        
      
        # maintain a list of all the dataframes
        all_bugs.append(bug_df)
        all_src.append(src_df)
    # concatenate all the data frames in order    
    all_bug_df = pd.concat(all_bugs, ignore_index=True)
    all_src_df = pd.concat(all_src, ignore_index=True)
    return all_bug_df, all_src_df

In [230]:
bugs, sources = generate_all_scores()

Getting scores for project  COLLECTIONS ...
Getting scores for project  CONFIGURATION ...
Getting scores for project  IO ...
Getting scores for project  LANG ...
Getting scores for project  DATACMNS ...
Getting scores for project  DATAMONGO ...
Getting scores for project  DATAREST ...
Getting scores for project  LDAP ...
Getting scores for project  SEC ...
Getting scores for project  SOCIALFB ...
Getting scores for project  SPR ...
Getting scores for project  ELY ...


In [231]:
display(bugs)
display(sources)

Unnamed: 0,fix,text,fixdate,summary,description,project,average_precision,query,fix_indexes,numFixes,vector,direct_sim,indirect_sim
0,"[flat3map.java, testflat3map.java]",,2006-07-18 22:02:11,Flat3 Map. Entry.set Value() overwrites oth...,Flat3 Map&amp;apos;s Entry objects will over...,COLLECTIONS,0.0,Flat3 Map. Entry.set Value() overwrites oth...,"[233, 436]",2,"[-0.54710054, 0.8093695, -1.1743436, 0.1350008...","[-0.09654778, -0.07379957, -0.061074868, -0.08...","[0.0, 0.0, 0.01401775038901416, 0.0, 0.0, 0.0,..."
1,"[testextendedproperties.java, extendedproperti...",,2006-07-18 22:44:33,Extended Properties - field include should be...,"The field ""include"" in Extended Properties i...",COLLECTIONS,0.0,Extended Properties - field include should be...,"[292, 22]",2,"[-0.010263926, -0.29718, -0.04093454, -0.13734...","[0.10770507, 0.11548331, 0.121062756, 0.205946...","[0.0, 0.0, 0.0269814923959674, 0.0, 0.0, 0.0, ..."
2,"[testlistutils.java, testcollectionutils.java,...",,2006-08-18 19:01:22,Collection Utils remove All is actually reta...,"The remove All( Collection collection, Coll...",COLLECTIONS,0.0,Collection Utils remove All is actually reta...,"[303, 288, 15]",3,"[0.01646587, -0.16084479, -0.042548787, -0.062...","[0.04446762, 0.14768608, 0.14675924, 0.10587, ...","[0.0, 0.0, 0.02720463365129317, 0.0, 0.0, 0.0,..."
3,"[flat3map.java, testflat3map.java]",,2007-08-20 14:11:54,Flat3 Map.remove() does not return the correc...,Flat3 Map m = new Flat3 Map(); m.put( new ...,COLLECTIONS,0.0,Flat3 Map.remove() does not return the correc...,"[233, 436]",2,"[0.14817916, -0.18698174, 0.00082106725, -0.12...","[0.36616066, 0.06840743, 0.062879205, 0.113124...","[0.0, 0.0, 0.002627203000852418, 0.0, 0.0, 0.0..."
4,[fasttreemap.java],,2007-08-31 09:39:59,Fast Tree Map forgets the comparator,In line 359 and 582 of the current 3.2 release...,COLLECTIONS,0.0,Fast Tree Map forgets the comparatorIn line ...,[27],1,"[-0.13191527, -0.18016613, 0.10219742, -0.1410...","[0.12645435, 0.21623495, 0.23732753, 0.1655188...","[0.0, 0.0, 0.02531724106223614, 0.0, 0.0, 0.0,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,[ciphersuiteselector.java],,2016-06-01 13:52:37,Undertow H T T P S listener offers no cip...,No cipher suites are available for handshake w...,ELY,0.0,Undertow H T T P S listener offers no cip...,[10433],1,"[0.5873729, -0.26671422, 0.26491788, 0.1084738...","[0.23647064, 0.23673111, -0.034009416, 0.29516...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.231..."
1680,[abstractpermission.java],,2016-07-29 15:23:39,Missing null check in equals() method of Abst...,There is missing null check in org.wildfly.sec...,ELY,0.0,Missing null check in equals() method of Abst...,[-1],1,"[-0.41595843, -0.3777295, 0.15903723, -0.05837...","[-0.040133074, -0.0024438687, 0.15373051, 0.02...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.146..."
1681,[elytronmessages.java],,2016-10-12 10:32:51,No log messages comming from Elytron - group ...,Elytron is missing any log messages related to...,ELY,0.0,No log messages comming from Elytron - group ...,[10457],1,"[0.10762189, 0.04025764, 0.188332, 0.17249116,...","[0.2632262, 0.21124604, 0.24380568, 0.19322138...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.199..."
1682,[protocol.java],,2016-10-25 14:39:59,Elytron introduces S S L T L S protocol ...,"L I S T, ""description"" =&gt; "" The enabled...",ELY,0.0,Elytron introduces S S L T L S protocol ...,[10447],1,"[-0.066642396, -0.86785316, 0.024531504, 0.084...","[0.027853105, -0.041121133, -0.23069385, -0.03...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.132..."


Unnamed: 0,filename,unprocessed_code,project,vector
0,arraystack.java,Licensed to the Apache Software Foundation ...,COLLECTIONS,"[0.6373648, 0.36343274, -0.21556364, 0.0261671..."
1,bag.java,Licensed to the Apache Software Foundation ...,COLLECTIONS,"[0.39248475, 0.9095622, -0.31791908, -0.447419..."
2,bagutils.java,Licensed to the Apache Software Foundation ...,COLLECTIONS,"[-0.3806162, 0.014073659, 0.83630836, 0.206107..."
3,beanmap.java,Licensed to the Apache Software Foundation ...,COLLECTIONS,"[-0.018915145, -0.20437856, 0.5265517, -0.2119..."
4,bidimap.java,Licensed to the Apache Software Foundation ...,COLLECTIONS,"[0.4636378, 0.3857351, -0.26127335, -0.1524405..."
...,...,...,...,...
10456,package-info.java,"J Boss, Home of Professional Open Source....",ELY,"[-0.09667181, -0.21859899, -0.253523, -0.35537..."
10457,elytronmessages.java,"J Boss, Home of Professional Open Source....",ELY,"[0.431815, -0.65128237, 0.6918912, -0.583043, ..."
10458,testpermissionactions.java,"J Boss, Home of Professional Open Source....",ELY,"[-0.8184448, 0.2443168, 0.14690226, -0.8504981..."
10459,teststackinspector.java,"J Boss, Home of Professional Open Source....",ELY,"[0.110203765, -0.09885522, -0.17536747, 0.0646..."
