# Phase 1 

### Preprocessing

In [29]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer

### Now load the pickle data into dataframes from the Output folder one directory above

In [30]:
def loadEverything():
    all_projects_bugreports = pd.read_pickle('../Output/allBugReports.pickle')
    print("*** All Bug Reports are Loaded. ***")
    all_projects_source_codes = pd.read_pickle('../Output/allSourceCodes.pickle')
    print("*** All Source Codes are Loaded. ***")
    return all_projects_bugreports, all_projects_source_codes

all_projects_bugreports, all_projects_source_codes = loadEverything()
display(all_projects_bugreports.iloc[0])
display(all_projects_source_codes.iloc[0].unprocessed_code)



*** All Bug Reports are Loaded. ***
*** All Source Codes are Loaded. ***


fix                  [org.apache.commons.collections.map.flat3map.j...
text                                                               NaN
fixdate                                            2006-07-18 22:02:11
summary              Flat3Map.Entry.setValue() overwrites other Ent...
description          Flat3Map&amp;apos;s Entry objects will overwri...
project                                                    COLLECTIONS
average_precision                                                    0
Name: 217, dtype: object

'/*\n *  Licensed to the Apache Software Foundation (ASF) under one or more\n *  contributor license agreements.  See the NOTICE file distributed with\n *  this work for additional information regarding copyright ownership.\n *  The ASF licenses this file to You under the Apache License, Version 2.0\n *  (the "License"); you may not use this file except in compliance with\n *  the License.  You may obtain a copy of the License at\n *\n *      http://www.apache.org/licenses/LICENSE-2.0\n *\n *  Unless required by applicable law or agreed to in writing, software\n *  distributed under the License is distributed on an "AS IS" BASIS,\n *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n *  See the License for the specific language governing permissions and\n *  limitations under the License.\n */\npackage org.apache.commons.collections;\n\nimport java.util.ArrayList;\nimport java.util.EmptyStackException;\n\n/**\n * An implementation of the {@link java.util.Stack}

## Removing composite varaibles

In [31]:
#remove next line characters:
def remove_new_lines(x):
    return x.replace('\n', '').replace('*', '').replace('/', '').replace('\t','')

def clean_new_lines(df):
    df.unprocessed_code = df.unprocessed_code.apply(remove_new_lines)
    return df

df = clean_new_lines(all_projects_source_codes)
df.iloc[0].unprocessed_code

'   Licensed to the Apache Software Foundation (ASF) under one or more   contributor license agreements.  See the NOTICE file distributed with   this work for additional information regarding copyright ownership.   The ASF licenses this file to You under the Apache License, Version 2.0   (the "License"); you may not use this file except in compliance with   the License.  You may obtain a copy of the License at        http:www.apache.orglicensesLICENSE-2.0    Unless required by applicable law or agreed to in writing, software   distributed under the License is distributed on an "AS IS" BASIS,   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   See the License for the specific language governing permissions and   limitations under the License. package org.apache.commons.collections;import java.util.ArrayList;import java.util.EmptyStackException;  An implementation of the {@link java.util.Stack} API that is based on an  <code>ArrayList<code> instead of a <code>Vec

In [32]:
import re

#splitting composite words
def findCompositeWords(s):
    return ' '.join(re.findall('[A-Z][^A-Z]*', s))   


def clean_name_column(df):
    df.unprocessed_code = df.unprocessed_code.apply(findCompositeWords)
    return df

df = clean_name_column(all_projects_source_codes)
df

Unnamed: 0,filename,unprocessed_code,project
0,\gitrepo\src\java\org\apache\commons\collectio...,Licensed to the Apache Software Foundation ...,COLLECTIONS
1,\gitrepo\src\java\org\apache\commons\collectio...,Licensed to the Apache Software Foundation ...,COLLECTIONS
2,\gitrepo\src\java\org\apache\commons\collectio...,Licensed to the Apache Software Foundation ...,COLLECTIONS
3,\gitrepo\src\java\org\apache\commons\collectio...,Licensed to the Apache Software Foundation ...,COLLECTIONS
4,\gitrepo\src\java\org\apache\commons\collectio...,Licensed to the Apache Software Foundation ...,COLLECTIONS
...,...,...,...
63,\gitrepo\src\main\java\org\wildfly\security\ut...,"J Boss, Home of Professional Open Source. ...",ELY
64,\gitrepo\src\main\java\org\wildfly\security\_p...,"J Boss, Home of Professional Open Source. ...",ELY
65,\gitrepo\src\test\java\org\wildfly\security\ma...,"J Boss, Home of Professional Open Source. ...",ELY
66,\gitrepo\src\test\java\org\wildfly\security\ma...,"J Boss, Home of Professional Open Source. ...",ELY


In [33]:
df.iloc[0].unprocessed_code

'Licensed to the  Apache  Software  Foundation ( A S F) under one or more   contributor license agreements.   See the  N O T I C E file distributed with   this work for additional information regarding copyright ownership.    The  A S F licenses this file to  You under the  Apache  License,  Version 2.0   (the " License"); you may not use this file except in compliance with   the  License.   You may obtain a copy of the  License at        http:www.apache.orglicenses L I C E N S E-2.0     Unless required by applicable law or agreed to in writing, software   distributed under the  License is distributed on an " A S  I S"  B A S I S,    W I T H O U T  W A R R A N T I E S  O R  C O N D I T I O N S  O F  A N Y  K I N D, either express or implied.    See the  License for the specific language governing permissions and   limitations under the  License. package org.apache.commons.collections;import java.util. Array List;import java.util. Empty Stack Exception;   An implementation of the {@link

### put Kelvin's key word cleaning above this

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
java_keywords = ["abstract", "assert**","assert", "boolean", "break", "byte", "case", "catch", "char", "const", "continue", "default", "do", "double", "else", "enum", "enum****" "extends", "final", "finally", "for", "goto","goto*", "if", "implements", "import", "instanceof", "int","interface", "long", "native", "new", "package", "private", "protected", "public", "return", "short", "static", "strictfp**","strictfp", "super", "switch", "synchornized", "this", "throw", "throws", "transient", "try", "void", "volatile", "while"]
stop = ENGLISH_STOP_WORDS.union(java_keywords)
stop


#vect = CountVectorizer(min_df = 5, stop_words = stop, analyzer = 'word')

### Start working on TF-IDF and Cosine similarity calculations

In [43]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# clean up the unprocessed code column
def clean_source_code_list(sc_df):
    sc_df = clean_new_lines(sc_df)
    sc_df = clean_name_column(sc_df)
    return sc_df


sc_df = clean_source_code_list(all_projects_source_codes)

# get a list of the projects and their parent project
print(df.project.unique())
projects = df.project.unique()
commons_projects = projects[0:4]
spring_projects = projects[4:11]
wildfly_projects = projects[11]

# group the data frames
grouped_df = sc_df.groupby(df.project)

# example of getting a single data frame
col_df = grouped_df.get_group("COLLECTIONS")
display(col_df)

config_df = grouped_df.get_group(commons_projects[1])
ely_df = grouped_df.get_group(wildfly_projects)
ldap_df = grouped_df.get_group(spring_projects[3])

print(config_df.iloc[0].filename)
print(col_df.iloc[0].filename)
print(ely_df.iloc[0].filename)
print(ldap_df.iloc[0].filename)



['COLLECTIONS' 'CONFIGURATION' 'IO' 'LANG' 'DATACMNS' 'DATAMONGO'
 'DATAREST' 'LDAP' 'SEC' 'SOCIALFB' 'SPR' 'ELY']


0      Licensed to the            Apache            S...
1      Licensed to the            Apache            S...
2      Licensed to the            Apache            S...
3      Licensed to the            Apache            S...
4      Licensed to the            Apache            S...
                             ...                        
471    Licensed to the            Apache            S...
472    Licensed to the            Apache            S...
473    Licensed to the            Apache            S...
474    Licensed to the            Apache            S...
475    Licensed to the            Apache            S...
Name: unprocessed_code, Length: 476, dtype: object

In [None]:

def gen_parent_df(group, names):
    result = pd.DataFrame
    temp = []
    for name in names:
        temp.append(group.get_group(name))
    
    result = pd.concate(temp, ignore_index = true)    
    return result

# don't know if we are looking at these groups OR each of the projects individually
commons_df = gen_parent_df(grouped_df, commons_projects)
sprint_df = gen_parent_df(grouped_df, spring_projects)
wildfly_df = gen_parent_df(grouped_df, wildfly_projects)




In [None]:
from sklearn.metrics.pairwise import cosine_similarity


def calculate_similarity(source_code, bug):
    cosSim = cosine_similarity(bug, source_code).flatten()
    return cosSim

# fit the vectorizer and transform data
def transform_data(source_code_data, query_data):
    # DO WE SET min_df?  default normalization is l2
    vect = TfidfVectorizer(min_df=1, stop_words="english").fit(source_code_data)
    X = vect.transform(source_code_data)
    y = vect.transform(query_data)
    similarity = calculate_similarity(X,y)
    return X, y, similarity

# remove the stem and stop words
def stem_stop(text,stop_words):
    stemmer =PorterStemmer("english")
    text = text.split()
    text = [w for w in text if not w in stop_words]
    text = list(map(lambda x: stemmer.stem(x), text))
    text = ' '.join(text)
    text = text.strip()

    return text

# first stem and remove stop words from the data frames column

# source_code_df['unprocessed_code'] = source_code_df['unprocessed_code'].map(stem_stop)
# bug_df['unprocessed_code'] = bug_df['unprocessed_code'].map(stem_stop)

# then get the dataframe column you want to pass as source_code_data and as the query_data(bug)


