# Phase 1 

### Preprocessing

In [1]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer

### Now load the pickle data into dataframes from the Output folder one directory above

In [32]:
def loadEverything():
    all_projects_bugreports = pd.read_pickle('Output/allBugReports.pickle')
    print("*** All Bug Reports are Loaded. ***")
    all_projects_source_codes = pd.read_pickle('Output/allSourceCodes.pickle')
    print("*** All Source Codes are Loaded. ***")
    return all_projects_bugreports, all_projects_source_codes

all_projects_bugreports, all_projects_source_codes = loadEverything()
display(all_projects_bugreports.iloc[0])
display(all_projects_source_codes.iloc[2])
display(all_projects_bugreports.iloc[0].name)
display(all_projects_source_codes.iloc[2].name)

*** All Bug Reports are Loaded. ***
*** All Source Codes are Loaded. ***


'217'

2

In [6]:
# need to get the file names in the bug report an source code to be the same
display(all_projects_bugreports.iloc[0].fix)
display(all_projects_source_codes.filename)


array(['org.apache.commons.collections.map.flat3map.java',
       'org.apache.commons.collections.map.testflat3map.java'],
      dtype='<U52')

0     \gitrepo\src\java\org\apache\commons\collectio...
1     \gitrepo\src\java\org\apache\commons\collectio...
2     \gitrepo\src\java\org\apache\commons\collectio...
3     \gitrepo\src\java\org\apache\commons\collectio...
4     \gitrepo\src\java\org\apache\commons\collectio...
                            ...                        
63    \gitrepo\src\main\java\org\wildfly\security\ut...
64    \gitrepo\src\main\java\org\wildfly\security\_p...
65    \gitrepo\src\test\java\org\wildfly\security\ma...
66    \gitrepo\src\test\java\org\wildfly\security\ma...
67    \gitrepo\src\test\java\org\wildfly\security\ss...
Name: filename, Length: 10461, dtype: object

## Removing composite varaibles

In [22]:
import re
#remove next line characters:
def remove_new_lines(text):
#     return str(x).replace('\n', '').replace('*', '').replace('/', '').replace('\\','').replace('\t','')
    text = str(text)
    COMBINE_WHITE_SPACE = re.compile(r"(?a:\s+)")
    text = COMBINE_WHITE_SPACE.sub(' ', text)
    return text.replace('*', '').replace('/', '').replace('\\','')
    
# clean up the various white space and remove some *
def clean_new_lines_source_code(df):
    df.unprocessed_code = df.unprocessed_code.apply(remove_new_lines)
    return df

# clean up the description and summary, they will both be used for the query
def clean_new_lines_bug_report(df):
    df.summary = df.summary.apply(remove_new_lines)
    df['description'] = df['description'].astype('|S')
    df.description = df.description.apply(remove_new_lines)
    return df

# need to reformat the source code so it can be compared to the bug reports fix array
def format_sc_filename(x):
  
    x = x.split('apache')
    if len(x) != 2:
        x = x[0].split('springframework')
        if len(x) != 2:
            x = x[0].split('wildfly')
            if len(x) == 2:
                x = 'org.wildfly' + x[1]
        else:
             x = 'org.springframework' + x[1]
    else:
         x = 'org.apache' + x[1]
    if len(x) == 1:
        x = x[0]
   
    x = str(x).replace("\\",".")
    return x

# apply the fixes to the filename 
def clean_sc_filepath(df):
    df.filename = df.filename.apply(format_sc_filename)
    return df



In [8]:
#splitting composite words
def findCompositeWords(s):
    return ' '.join(re.findall('[A-Z][^A-Z]*', s))   


def clean_composite_source_code(df):
    df.unprocessed_code = df.unprocessed_code.apply(findCompositeWords)
    return df

def clean_composite_bug_report(df):
    df.summary = df.summary.apply(findCompositeWords)
    df.description = df.description.apply(findCompositeWords)
    return df


In [9]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# add the Java key words to the stop words
java_keywords = ["abstract", "assert**","assert", "boolean", "break", "byte", "case", "catch", "char", "const", "continue", "default", "do", "double", "else", "enum", "enum****" "extends", "final", "finally", "for", "goto","goto*", "if", "implements", "import", "instanceof", "int","interface", "long", "native", "new", "package", "private", "protected", "public", "return", "short", "static", "strictfp**","strictfp", "super", "switch", "synchornized", "this", "throw", "throws", "transient", "try", "void", "volatile", "while"]
java_operators = ["+", "-", "*", "/", "%", "+=", "-=", "*=", "/=", "++", "--", "==", "!=", "<", ">", "<=", ">=", ".", "[", "]", "(",")", "!", "~","instanceof", "<<", ">>", ">>>", "&", "^", "|", "&&", "||", "?", ":", "^=", "%=", "<<=", ">>=", ">>>=", "&="]
stop = java_keywords + java_operators
STOP_WORDS = ENGLISH_STOP_WORDS.union(stop)
# STOP_WORDS


### Start working on TF-IDF and Cosine similarity calculations

In [24]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# clean up the unprocessed code column
def clean_source_df(df):
    df = clean_new_lines_source_code(df)
    df = clean_composite_source_code(df)
    df = clean_sc_filepath(df)
    return df

# add the summary and description together and clean the data
def clean_combine_bug_df(df):
    df = clean_new_lines_bug_report(df)
    df = clean_composite_bug_report(df)
    df["query"] = df["summary"] + df["description"]
    return df

# get clean versions of the dataframes
sc_df = clean_source_df(all_projects_source_codes)
br_df = clean_bug_df(all_projects_bugreports)


# get a list of the projects 
print(sc_df.project.unique())
projects = sc_df.project.unique()

# group the data frames
sc_grouped_df = sc_df.groupby(sc_df.project)

# example of getting a data frame for a single project
col_df = sc_grouped_df.get_group("COLLECTIONS")
display(col_df)




['COLLECTIONS' 'CONFIGURATION' 'IO' 'LANG' 'DATACMNS' 'DATAMONGO'
 'DATAREST' 'LDAP' 'SEC' 'SOCIALFB' 'SPR' 'ELY']


Unnamed: 0,filename,unprocessed_code,project
0,org.apache.commons.collections.arraystack.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
1,org.apache.commons.collections.bag.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
2,org.apache.commons.collections.bagutils.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
3,org.apache.commons.collections.beanmap.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
4,org.apache.commons.collections.bidimap.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
...,...,...,...
471,org.apache.commons.collections.set.testtransfo...,Licensed to the Apache Software Foundation ...,COLLECTIONS
472,org.apache.commons.collections.set.testtypedse...,Licensed to the Apache Software Foundation ...,COLLECTIONS
473,org.apache.commons.collections.set.testtypedso...,Licensed to the Apache Software Foundation ...,COLLECTIONS
474,org.apache.commons.collections.set.testunmodif...,Licensed to the Apache Software Foundation ...,COLLECTIONS


In [56]:
from sklearn.metrics.pairwise import cosine_similarity


def calculate_similarity(src_vect, query):
    cosSim = cosine_similarity(query, src_vect).flatten()
    return cosSim

# fit the vectorizer and transform data
def get_similarity(vect, src_vect, query_data):
    query = vect.transform(query_data)
    similarity = calculate_similarity(src_vect,query)
     
    return similarity

# remove the stem and stop words
# takes in an array of strings returns an array of strings
def stem_stop(text):
    stemmer = PorterStemmer()   #"english"
    text = text.split()
    text = [w for w in text if not w in STOP_WORDS]
    text = list(map(lambda x: stemmer.stem(x), text))
    text = ' '.join(text)
    text = text.strip()
    return text

# take in the source code df for a project and a signle query return scores
def generate_scores(source, query):
    
    # get a list of strings from the data frames to be vectorized
    source['unprocessed_code'] = source['unprocessed_code'].map(stem_stop)
    query["query"] = query["query"].map(stem_stop)
    query_str = query["query"].iloc[0]
     
    # fit a vectorizer to the data
    vect = TfidfVectorizer(min_df=1).fit(source['unprocessed_code'])
    src_vect = vect.transform(source['unprocessed_code'])
    display(src_vect)
    
    #get the similarity scores for the bug reports
    
    #iterate through the query
    for q in query["query"]:
        similarity = get_similarity(vect, src_vect, [q])
        # some sort of data structure (dict) to collect the queries to add it to the dataframe later
    
    return # the data structure of the similarity scores
    
generate_scores(col_df, br_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


<476x3470 sparse matrix of type '<class 'numpy.float64'>'
	with 62997 stored elements in Compressed Sparse Row format>

array([0.05491227, 0.08307253, 0.00525185, 0.13749   , 0.25435228,
       0.03154885, 0.00409805, 0.03625311, 0.09471012, 0.05533985,
       0.01266685, 0.01520771, 0.00794535, 0.01807229, 0.02798844,
       0.1110047 , 0.02495558, 0.06850777, 0.12213955, 0.40626593,
       0.16665006, 0.01509533, 0.19119782, 0.02706737, 0.01797271,
       0.04131702, 0.24868529, 0.22858314, 0.01121147, 0.05541785,
       0.31931149, 0.0449976 , 0.1419158 , 0.02953132, 0.16821687,
       0.28199789, 0.25851874, 0.26881468, 0.17298281, 0.14359683,
       0.0111001 , 0.1793797 , 0.08785956, 0.07853233, 0.01950166,
       0.03024614, 0.23415322, 0.29455995, 0.01034233, 0.00932091,
       0.2886287 , 0.20109146, 0.02197036, 0.15004874, 0.21342391,
       0.00579909, 0.05282834, 0.04924567, 0.07070813, 0.03696102,
       0.01329643, 0.03427723, 0.22176788, 0.00554332, 0.07351123,
       0.02505379, 0.008805  , 0.04811389, 0.00472125, 0.04227651,
       0.00737919, 0.07936347, 0.00781077, 0.0074546 , 0.06595

- Now we need to run each project through the above code, and all the bugs for that project
- Store the similarity score for each bug in the dataframe
- Figure out index of the top 20
- Figure out index of files where the fix was ( this is in the bug report )
- Calculate MAP and MRR for each

In [45]:
col_df['unprocessed_code'] = col_df['unprocessed_code'].map(stem_stop)
br_df["query"] = br_df["query"].map(stem_stop)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Tired and trying to figure out what I need to do

In [46]:
vect = TfidfVectorizer(min_df=1).fit(col_df['unprocessed_code'])
src_vect = vect.transform(col_df['unprocessed_code'])

In [49]:
query = br_df["query"].iloc[0]
query

'flat3 map. entry.set value() overwrit entri valuesflat3 map&amp;apos; entri object overwrit entry&amp;apos; valu entry.set value() call one. It overwrit entri hand.n i&amp;apos;v look source, statement incorrectli fall through, return like should:n flat3 map.java, line 646-660:n object set value( object value) {n (can remov false) n{n illeg state exception( abstract hash map. S E T V A L U E_ I N V A L I d);n }n object old = value();n (next index) n{n 3: n parent.value3 = value;n 2:n parent.value2 = value;n 1:n parent.value1 = value;n }n old;n }n with code, I set valu item entri set, valu set value.\'"\'\'\'\'\'\'\''

In [52]:
qv = vect.transform(query.split())
qv

<101x3470 sparse matrix of type '<class 'numpy.float64'>'
	with 73 stored elements in Compressed Sparse Row format>

In [53]:
qv = vect.transform([query])
qv

<1x3470 sparse matrix of type '<class 'numpy.float64'>'
	with 41 stored elements in Compressed Sparse Row format>