# Phase 1 

### Preprocessing

In [1]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer

### Now load the pickle data into dataframes from the Output folder one directory above

In [32]:
def loadEverything(output_dir='Output'):
    """Load the pickled bug-report and source-code DataFrames.

    Parameters
    ----------
    output_dir : str, optional
        Directory that contains ``allBugReports.pickle`` and
        ``allSourceCodes.pickle``. Defaults to ``'Output'`` (relative to the
        current working directory), which is the location the notebook used
        originally.

    Returns
    -------
    tuple
        ``(all_projects_bugreports, all_projects_source_codes)`` exactly as
        returned by ``pd.read_pickle``.
    """
    # NOTE: unpickling can execute arbitrary code -- only load pickle files
    # that were produced by a trusted source.
    all_projects_bugreports = pd.read_pickle(os.path.join(output_dir, 'allBugReports.pickle'))
    print("*** All Bug Reports are Loaded. ***")
    all_projects_source_codes = pd.read_pickle(os.path.join(output_dir, 'allSourceCodes.pickle'))
    print("*** All Source Codes are Loaded. ***")
    return all_projects_bugreports, all_projects_source_codes

# Load both frames, then preview one row of each followed by that row's index label.
all_projects_bugreports, all_projects_source_codes = loadEverything()
display(
    all_projects_bugreports.iloc[0],
    all_projects_source_codes.iloc[2],
    all_projects_bugreports.iloc[0].name,
    all_projects_source_codes.iloc[2].name,
)

*** All Bug Reports are Loaded. ***
*** All Source Codes are Loaded. ***


'217'

2

In [66]:
# The file names in the bug reports and in the source code need to be in the same format.

# A bug report's index label (.name) has no connection to the row index of the
# source-code frame: compare the files fixed by the first bug report with the
# source file stored at position 217.
display(
    all_projects_bugreports.iloc[0].fix,
    all_projects_source_codes.iloc[217].filename,
)


array(['org.apache.commons.collections.map.flat3map.java',
       'org.apache.commons.collections.map.testflat3map.java'],
      dtype='<U52')

'org.apache.commons.collections.list.transformedlist.java'

## Removing composite variables

In [22]:
import re
# collapse whitespace (including new lines and tabs) and strip comment punctuation
def remove_new_lines(text):
    """Return ``text`` as a single-line string without ``*``, ``/`` or ``\\``.

    Every run of ASCII whitespace (new lines, tabs, spaces, ...) is collapsed
    into one space, then the characters ``*``, ``/`` and backslash are removed.

    Parameters
    ----------
    text : object
        Value to clean. ``bytes`` values are decoded as UTF-8; anything else
        is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text.
    """
    if isinstance(text, (bytes, bytearray)):
        # str(b'a\nb') would give the repr "b'a\\nb'": the b'' wrapper leaks
        # into the text and the escaped new line turns into a stray "n" once
        # backslashes are stripped below. Decode the bytes instead.
        text = bytes(text).decode('utf-8', errors='replace')
    else:
        text = str(text)
    text = re.sub(r"(?a:\s+)", ' ', text)
    return text.replace('*', '').replace('/', '').replace('\\', '')
    
# normalise white space and strip '*', '/' and '\' in the raw source code
def clean_new_lines_source_code(df):
    """Run ``remove_new_lines`` over the ``unprocessed_code`` column.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    cleaned_code = df["unprocessed_code"].map(remove_new_lines)
    df["unprocessed_code"] = cleaned_code
    return df

# clean up the description and summary, they will both be used for the query
def clean_new_lines_bug_report(df):
    """Run ``remove_new_lines`` over the ``summary`` and ``description`` columns.

    Both columns are overwritten on the frame that was passed in, and that
    same frame is returned.
    """
    df['summary'] = df['summary'].apply(remove_new_lines)
    # Cast to text, not to NumPy bytes ('|S'): a bytes cast raises on
    # non-ASCII descriptions and makes the cleaned text carry a b'...'
    # wrapper with escaped new lines. Missing descriptions become an empty
    # string so they do not add a literal "nan" token to the query.
    df['description'] = df['description'].fillna('').astype(str).apply(remove_new_lines)
    return df

# need to reformat the source code so it can be compared to the bug reports fix array
def format_sc_filename(x):
    """Convert a source-file path into a dotted, package-style name.

    Everything up to and including the vendor name (``apache``,
    ``springframework`` or ``wildfly``, checked in that order) is replaced by
    ``org.<vendor>``, then every backslash is replaced by a dot. A path that
    contains none of the vendor names only gets its backslashes replaced.

    When a vendor name occurs more than once (for example a checkout folder
    named ``wildfly-elytron`` that contains ``org\\wildfly\\...``), the
    occurrence directly preceded by ``org\\`` is used, falling back to the
    first occurrence. Paths with a single occurrence give the same result as
    before; previously a repeated vendor name produced a truncated path or
    the text of a Python list.

    Parameters
    ----------
    x : str
        Backslash-separated file path.

    Returns
    -------
    str
        The dotted file name.
    """
    for vendor in ('apache', 'springframework', 'wildfly'):
        package_root = 'org\\' + vendor
        start = x.find(package_root)
        if start != -1:
            # skip over the "org\" part so `start` points at the vendor name
            start += len(package_root) - len(vendor)
        else:
            start = x.find(vendor)
        if start != -1:
            x = 'org.' + vendor + x[start + len(vendor):]
            break
    return x.replace("\\", ".")

# rewrite every source file path into the dotted form used by the bug reports
def clean_sc_filepath(df):
    """Run ``format_sc_filename`` over the ``filename`` column.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    dotted_names = df["filename"].map(format_sc_filename)
    df["filename"] = dotted_names
    return df



In [8]:
#splitting composite words
def findCompositeWords(s):
    """Split composite (camelCase) words by starting a new piece at every capital.

    The string is cut in front of every uppercase ASCII letter and the pieces
    are joined with a single space, e.g. ``"getUserName"`` becomes
    ``"get User Name"``. Text before the first uppercase letter is kept as the
    first piece; previously it was dropped, so an all-lowercase string was
    reduced to ``""``. Input that starts with an uppercase letter gives the
    same result as before.

    Parameters
    ----------
    s : str
        Text to split.

    Returns
    -------
    str
        The pieces joined with ``' '``.
    """
    # '^[^A-Z]+' captures the leading run that contains no capital letter;
    # '[A-Z][^A-Z]*' captures each capital together with what follows it.
    return ' '.join(re.findall('^[^A-Z]+|[A-Z][^A-Z]*', s))


def clean_composite_source_code(df):
    """Run ``findCompositeWords`` over the ``unprocessed_code`` column.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    split_code = df["unprocessed_code"].map(findCompositeWords)
    df["unprocessed_code"] = split_code
    return df

def clean_composite_bug_report(df):
    """Run ``findCompositeWords`` over the ``summary`` and ``description`` columns.

    Both columns are overwritten on the frame that was passed in, and that
    same frame is returned.
    """
    for column in ("summary", "description"):
        df[column] = df[column].map(findCompositeWords)
    return df


In [9]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# add the Java key words to the stop words
# Fixes over the earlier list: a missing comma used to fuse "enum****" and
# "extends" into one useless entry, "synchronized" was misspelled, and the
# keywords "class" and "float" were absent.
# The entries with trailing asterisks are kept as they were -- presumably
# footnote markers from the table this list was copied from (TODO confirm).
java_keywords = [
    "abstract", "assert**", "assert", "boolean", "break", "byte", "case", "catch",
    "char", "class", "const", "continue", "default", "do", "double", "else",
    "enum", "enum****", "extends", "final", "finally", "float", "for", "goto",
    "goto*", "if", "implements", "import", "instanceof", "int", "interface",
    "long", "native", "new", "package", "private", "protected", "public",
    "return", "short", "static", "strictfp**", "strictfp", "super", "switch",
    "synchronized", "this", "throw", "throws", "transient", "try", "void",
    "volatile", "while",
]
java_operators = [
    "+", "-", "*", "/", "%", "+=", "-=", "*=", "/=", "++", "--", "==", "!=",
    "<", ">", "<=", ">=", ".", "[", "]", "(", ")", "!", "~", "instanceof",
    "<<", ">>", ">>>", "&", "^", "|", "&&", "||", "?", ":", "^=", "%=",
    "<<=", ">>=", ">>>=", "&=",
]
stop = java_keywords + java_operators
# union() accepts any iterable; duplicates such as "instanceof" collapse in the set
STOP_WORDS = ENGLISH_STOP_WORDS.union(stop)
# STOP_WORDS


### Start working on TF-IDF and Cosine similarity calculations

In [61]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# clean up the unprocessed code column and the file names
def clean_source_df(df):
    """Apply every source-code cleaning step, in order, and return the result.

    The steps are ``clean_new_lines_source_code``,
    ``clean_composite_source_code`` and ``clean_sc_filepath``.
    """
    cleaning_steps = (
        clean_new_lines_source_code,
        clean_composite_source_code,
        clean_sc_filepath,
    )
    for step in cleaning_steps:
        df = step(df)
    return df

# add the summary and description together and clean the data
def clean_combine_bug_df(df):
    """Clean the bug reports and build the ``query`` column.

    ``summary`` and ``description`` are cleaned with
    ``clean_new_lines_bug_report`` and ``clean_composite_bug_report``; the
    new ``query`` column holds the summary followed by the description.

    Returns the frame produced by the cleaning helpers with ``query`` added.
    """
    df = clean_new_lines_bug_report(df)
    df = clean_composite_bug_report(df)
    # Join with a space: without it the last word of the summary and the
    # first word of the description fuse into a single token.
    df["query"] = df["summary"] + " " + df["description"]
    return df

# get clean versions of the dataframes
# NOTE: the cleaning helpers overwrite columns on the frames they are given,
# so re-running this cell cleans already-cleaned data again; reload the
# pickles first when re-running.
sc_df = clean_source_df(all_projects_source_codes)
# the helper defined above is clean_combine_bug_df; "clean_bug_df" does not
# exist and raised a NameError on a fresh kernel
br_df = clean_combine_bug_df(all_projects_bugreports)


# get a list of the projects
projects = sc_df.project.unique()
print(projects)

# group the data frames
# TODO ignore index here? Then maintain original index and concat the data frames
sc_grouped_df = sc_df.groupby(sc_df.project)
bg_grouped_df = br_df.groupby(br_df.project)
# example of getting a data frame for a single project
col_df = sc_grouped_df.get_group("COLLECTIONS")
display(col_df)




['COLLECTIONS' 'CONFIGURATION' 'IO' 'LANG' 'DATACMNS' 'DATAMONGO'
 'DATAREST' 'LDAP' 'SEC' 'SOCIALFB' 'SPR' 'ELY']


Unnamed: 0,filename,unprocessed_code,project
0,org.apache.commons.collections.arraystack.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
1,org.apache.commons.collections.bag.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
2,org.apache.commons.collections.bagutils.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
3,org.apache.commons.collections.beanmap.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
4,org.apache.commons.collections.bidimap.java,Licensed to the Apache Software Foundation ...,COLLECTIONS
...,...,...,...
471,org.apache.commons.collections.set.testtransfo...,Licensed to the Apache Software Foundation ...,COLLECTIONS
472,org.apache.commons.collections.set.testtypedse...,Licensed to the Apache Software Foundation ...,COLLECTIONS
473,org.apache.commons.collections.set.testtypedso...,Licensed to the Apache Software Foundation ...,COLLECTIONS
474,org.apache.commons.collections.set.testunmodif...,Licensed to the Apache Software Foundation ...,COLLECTIONS


In [62]:
from sklearn.metrics.pairwise import cosine_similarity


def calculate_similarity(src_vect, query):
    """Return the cosine similarities between ``query`` and ``src_vect``.

    The matrix returned by ``cosine_similarity(query, src_vect)`` is flattened
    into a one-dimensional array before it is returned.
    """
    return cosine_similarity(query, src_vect).flatten()

# transform the query with the (already fitted) vectorizer and score it
def get_similarity(vect, src_vect, query_data):
    """Vectorize ``query_data`` with ``vect`` and compare it with ``src_vect``.

    ``vect`` is only used through ``vect.transform``; it is not fitted here.
    Returns whatever ``calculate_similarity`` returns for the transformed
    query.
    """
    query_vect = vect.transform(query_data)
    return calculate_similarity(src_vect, query_vect)

# remove the stop words and stem what is left
# takes in a string and returns a string
def stem_stop(text):
    """Drop stop words from ``text`` and stem the remaining words.

    The text is split on whitespace, words found in ``STOP_WORDS`` are
    removed, every remaining word is stemmed with NLTK's ``PorterStemmer``
    and the stems are joined back together with single spaces.

    Parameters
    ----------
    text : str
        Whitespace-separated words.

    Returns
    -------
    str
        The stemmed words separated by single spaces.
    """
    stemmer = PorterStemmer()
    # Compare in lower case: the entries in STOP_WORDS are lower case, while
    # the tokens produced by findCompositeWords usually start with a capital
    # ("Public", "Return", "The") and would otherwise slip through.
    words = [w for w in text.split() if w.lower() not in STOP_WORDS]
    return ' '.join(stemmer.stem(w) for w in words).strip()

# take in the source code df and the bug report df of one project, return the scores
def generate_scores_list(source, query):
    """Score every bug-report query of a project against its source files.

    Side effect: ``source['unprocessed_code']`` and ``query['query']`` are
    overwritten with their ``stem_stop`` output on the frames passed in.

    Parameters
    ----------
    source : pandas.DataFrame
        Source files of one project; needs an ``unprocessed_code`` column.
    query : pandas.DataFrame
        Bug reports of the same project; needs a ``query`` column.

    Returns
    -------
    list
        One entry per row of ``query``, in row order. Each entry is the array
        returned by ``get_similarity`` for that query against the TF-IDF
        matrix of ``source`` (one row of that matrix per source row). The
        list is empty when ``query`` has no rows.
    """
    # get the stemmed, stop-word-free strings that will be vectorized
    source['unprocessed_code'] = source['unprocessed_code'].map(stem_stop)
    query["query"] = query["query"].map(stem_stop)

    # fit the vectorizer on the source code and transform it in one pass
    vect = TfidfVectorizer(min_df=1)
    src_vect = vect.fit_transform(source['unprocessed_code'])
    display(src_vect)

    # one similarity array per bug report; the position inside each array is
    # the only link back to the source rows
    return [get_similarity(vect, src_vect, [q]) for q in query["query"]]
    
def generate_all_scores():
    """Compute the similarity scores for every project and stack the results.

    Reads the module-level ``projects``, ``sc_grouped_df`` and
    ``bg_grouped_df``. For each project the bug reports are scored against the
    source files with ``generate_scores_list`` and the scores are stored in a
    new ``sim_vect`` column of that project's bug frame.

    Returns
    -------
    tuple
        ``(all_bug_df, all_src_df)``: the per-project bug frames and source
        frames, each concatenated in project order with a fresh index.
    """
    all_bugs = []
    all_src = []
    # iterate through the list of projects
    for proj in projects:
        # Work on explicit copies of each group: the frames are modified below
        # (and inside generate_scores_list), and writing to the slice returned
        # by get_group triggers pandas' SettingWithCopyWarning.
        src_df = sc_grouped_df.get_group(proj).copy()
        bug_df = bg_grouped_df.get_group(proj).copy()

        # generate the scores list
        scores = generate_scores_list(src_df, bug_df)

        # append scores list to the bug dataframe
        bug_df["sim_vect"] = scores # the only way that the matrix is related to the src code
                                    # is through the index.

        # CALCULATE THE MAP AND MRR HERE WITH A FUNCTION AND ADD IT TO THE BUGS DATAFRAME

        # maintain a list of all the dataframes
        all_bugs.append(bug_df)
        all_src.append(src_df)
    # concatenate all the data frames in order
    all_bug_df = pd.concat(all_bugs, ignore_index=True)
    all_src_df = pd.concat(all_src, ignore_index=True)
    return all_bug_df, all_src_df

## TODO list
- Figure out index of the top 20 in the bug sim_vect
- Figure out index of files where the fix was (this is in the bug report)
- Calculate MAP and MRR for each
- Generate graphs and report on the findings