<center><h1>Step 0 - Preprocessing</h1></center> 

In this section, we read the bug reports and source code files of all 12 projects and, for ease of access, save them as two pickle files in the ./Output directory. Running this notebook therefore populates ./Output with "allBugReports.pickle", a pandas DataFrame that contains the bug reports of all projects, and "allSourceCodes.pickle", a pandas DataFrame that contains the raw (not yet preprocessed) source files of all projects.

### Required Libraries

In [1]:
pip install javalang

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\users\our.000\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.


In [2]:
from __future__ import division
import pandas as pd
import numpy as np
import os
from os import listdir
from os.path import isfile, join
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
import warnings
import javalang
import re
import glob
import math
import time
from scipy import spatial
import scipy.spatial.distance
import xml.etree.ElementTree as ET
import requests
import multiprocessing
from tqdm import tqdm_notebook
from time import gmtime, strftime
from random import randint
warnings.simplefilter(action='ignore', category=FutureWarning)

<center><h1>Splitting code and natural language</h1></center> 

<center><h1>Loading source codes into pandas Dataframe</h1></center> 

In [3]:
def classNames_methodNames(node):
    """Collect lower-cased method and class names reachable from ``node``.

    A method or class declaration contributes its own name followed by a
    single space and is not descended into. Package declarations, formal
    parameters, imports, falsy values and anything that is not a javalang
    AST node contribute nothing. For any other AST node the results for the
    entries of ``node.children`` are concatenated in order.
    """
    named_declarations = (javalang.tree.MethodDeclaration,
                          javalang.tree.ClassDeclaration)
    if isinstance(node, named_declarations):
        return node.name.lower() + ' '

    ignored_kinds = (javalang.tree.PackageDeclaration,
                     javalang.tree.FormalParameter,
                     javalang.tree.Import)
    if isinstance(node, ignored_kinds) or not node:
        return ''
    if not isinstance(node, javalang.ast.Node):
        # Plain values (strings, lists, sets, ...) are never descended into.
        return ''
    return ''.join(classNames_methodNames(child) for child in node.children)
    
def traverse_node(node,i=0):
    """Collect the plain leaf values that sit directly below an AST node.

    ``i`` is the depth counter: it is incremented on entry, so the node the
    caller passes in is at depth 1 and its direct children are at depth 2.
    Only truthy ``int``/``str``/``float`` values found at depth 2 are emitted,
    each followed by a single space. Note that ``bool`` is a subclass of
    ``int``, so a ``True`` flag at depth 2 is emitted as ``'True '``.

    Package declarations, formal parameters, imports, compilation units and
    falsy values contribute nothing and are not descended into.

    Returns the concatenated string (possibly empty).
    """
    i+=1
    skipped_kinds=(javalang.tree.PackageDeclaration,
                   javalang.tree.FormalParameter,
                   javalang.tree.Import,
                   javalang.tree.CompilationUnit)
    if isinstance(node,skipped_kinds) or not node:
        return ''

    result=''
    if isinstance(node,(int,str,float)) and i==2:
        # str() is required: concatenating a numeric leaf with ' ' raised
        # TypeError, which made code_parser discard the whole file.
        result+=str(node)+' '
    if isinstance(node, javalang.ast.Node):
        for childNode in node.children:
            result+=traverse_node(childNode,i)
    return result

def code_parser(code):
    """Turn Java source text into a flat string of tokens.

    The source is parsed with javalang; every node of the resulting tree is
    passed to ``traverse_node`` and to ``classNames_methodNames``. The two
    concatenated results are returned separated by a single space.

    Any exception raised while parsing or walking the tree is printed and an
    empty string is returned instead.
    """
    try:
        nodes = [node for _, node in javalang.parse.parse(code)]
        leaf_tokens = ''.join(map(traverse_node, nodes))
        declared_names = ''.join(map(classNames_methodNames, nodes))
    except Exception as e:
        print(e)
        return ''
    return leaf_tokens + ' ' + declared_names


def loadSourceFiles2df(PATH,group,project):
    """
    Receives: PATH to the project's source checkout, group name and project name
    Process: recursively finds all the java files below PATH and loads their
             raw text into a pandas dataframe (one row per file)
    Returns: dataframe >> "filename","unprocessed_code","project"

    "filename" is the lower-cased, dot-separated part of the path that follows
    the first 'src/' (or, when the path has no 'src/', the first occurrence of
    the project name).
    """
    print('Loading source files of {} from group:{} ...'.format(project,group))
    # Search the directory the caller asked for; it used to be replaced by a
    # hard-coded "data/<group>/<project>/gitrepo" path.
    all_source_files=glob.glob(PATH+'/**/*.java', recursive=True)
    sourceCodesList=[]

    for filename in tqdm_notebook(all_source_files):
        # Context manager so the file handle is closed after reading.
        with open(filename,encoding='ISO-8859-1') as source_file:
            code=source_file.read()
        # glob yields backslash-separated paths on Windows; normalise them so
        # the 'src/' test and the '/' -> '.' conversion work on every platform.
        normalized=filename.replace(os.sep,'/')
        if 'src/' in normalized:
            # maxsplit=1 keeps everything after the first marker even when the
            # marker occurs again deeper in the path.
            relative_name=normalized.split('src/',1)[1]
        elif project in normalized:
            relative_name=normalized.split(project,1)[1]
        else:
            relative_name=os.path.relpath(filename,PATH).replace(os.sep,'/')
        sourceCodesList.append({"filename":relative_name.replace('/','.').lower(),
                                "unprocessed_code":code,
                                'project':project})
    # DataFrame.append was removed in pandas 2.0; build the frame in one go.
    return pd.DataFrame(sourceCodesList)

def load_all_SCs(dataPath):
    """
    Receives: dataPath, a directory laid out as <group>/<project>/gitrepo
    Process: calls loadSourceFiles2df for every project of every group
    Returns: one dataframe holding the rows of all projects
    """
    print('\tLoading all source codes ... ')
    # Collect per-project frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated frame every time.
    # The empty seed frame keeps the result an empty dataframe when there is
    # nothing to load.
    project_frames=[pd.DataFrame([])]
    # Only directories are groups/projects; stray files (e.g. .DS_Store) are skipped.
    all_groups=[folder for folder in listdir(dataPath)
                if os.path.isdir(os.path.join(dataPath,folder))]
    for group in tqdm_notebook(all_groups):
        group_path=os.path.join(dataPath,group)
        all_projects=[folder for folder in listdir(group_path)
                      if os.path.isdir(os.path.join(group_path,folder))]
        for project in all_projects:
            source_path=os.path.join(group_path,project,"gitrepo")
            project_frames.append(loadSourceFiles2df(source_path,group,project))
    return pd.concat(project_frames)

<center><h1>Loading bug reports into pandas Dataframe</h1></center> 

In [4]:
def loadBugs2df(PATH,project):
    """
    @Receives: the path to bug repository (the xml file) and the project name
    @Process: Parses the xml file and reads the summary, description and
              fix files per bug id.
    @Returns: Returns the dataframe, indexed by bug id, with the columns
              "fix","text","fixdate","summary","description","project",
              "average_precision"

    "fix" holds a numpy array of lower-cased, dot-separated file names.
    "text" is not filled here and is left as NaN. A bug without a
    <buginformation> or <fixedFiles> element keeps None / an empty array.
    """
    print("Loading Bug reports ... ")
    # Same column order the frame had when it was built with DataFrame.append.
    columns=["id","fix","text","fixdate","summary","description","project","average_precision"]
    bugRepo = ET.parse(PATH).getroot()
    buglist=[]
    for bug in tqdm_notebook(bugRepo.findall('bug')):
        bugDict={"id":bug.attrib['id'],"fix":[],"fixdate":bug.attrib['fixdate'],
                 "summary":None,"description":None,"project":project,"average_precision":0.0}
        # Compare against None explicitly: an Element with no children is falsy.
        bugInformation=bug.find('buginformation')
        for bugDetail in (bugInformation if bugInformation is not None else ()):
            if bugDetail.tag in ('summary','description'):
                bugDict[bugDetail.tag]=bugDetail.text
        fixedFiles=bug.find('fixedFiles')
        bugDict["fix"]=np.array([fixFile.text.replace('/','.').lower()
                                 for fixFile in (fixedFiles if fixedFiles is not None else ())
                                 if fixFile.text])
        buglist.append(bugDict)
    # DataFrame.append was removed in pandas 2.0; build the frame directly.
    return pd.DataFrame(buglist,columns=columns).set_index('id')

def load_all_BRs(dataPath):
    """
    Receives: dataPath, a directory laid out as
              <group>/<project>/bugrepo/repository.xml
    Process: calls loadBugs2df for every project of every group and prints
             the running total of loaded bug reports after each project
    Returns: one dataframe holding the bug reports of all projects
    """
    print('\tLoading all bug reports ... ')
    # Collect per-project frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated frame every time.
    # The empty seed frame mirrors the frame the chained appends started
    # from and keeps the result an empty dataframe when nothing is loaded.
    project_frames=[pd.DataFrame([])]
    total_bugs=0
    # Only directories are groups/projects; stray files (e.g. .DS_Store) are skipped.
    all_groups=[folder for folder in listdir(dataPath)
                if os.path.isdir(os.path.join(dataPath,folder))]
    for group in tqdm_notebook(all_groups):
        group_path=os.path.join(dataPath,group)
        all_projects=[folder for folder in listdir(group_path)
                      if os.path.isdir(os.path.join(group_path,folder))]
        for project in all_projects:
            data_path=os.path.join(group_path,project,"bugrepo","repository.xml")
            project_bugs=loadBugs2df(data_path,project)
            project_frames.append(project_bugs)
            total_bugs+=len(project_bugs)
            print(total_bugs)
    return pd.concat(project_frames)




<center><h1>Main Preprocessing class</h1></center> 

In [5]:
class PreprocessingUnit:
    """Loads the bug reports and source files of all projects, with caching.

    The two dataframes are kept on the class (shared by every instance) and
    mirrored as pickle files in an "Output" folder under the current working
    directory. Data is only rebuilt from ``dataPath`` when neither the
    in-memory frame nor the pickle file is available.
    """

    # Class-level caches shared by all instances; empty until loaded.
    all_projects_source_codes=pd.DataFrame([])
    all_projects_bugreports=pd.DataFrame([])

    def __init__(self,dataPath):
        """Remember ``dataPath`` and make sure the Output folder exists."""
        self.dataPath=dataPath
        self.dataFolder=os.path.join(os.getcwd(),'Output')
        os.makedirs(self.dataFolder,exist_ok=True)

    def execute(self):
        """Run the preprocessing step (currently just ``loadEverything``)."""
        self.loadEverything()

    @staticmethod
    def _load_or_build(pickleFile,builder):
        """Return the frame stored in ``pickleFile``; build and store it if missing."""
        if os.path.isfile(pickleFile):
            # NOTE(review): unpickling can execute arbitrary code; only load
            # cache files this notebook wrote itself.
            return pd.read_pickle(pickleFile)
        frame=builder()
        frame.to_pickle(pickleFile)
        return frame

    @staticmethod
    def _short_name(path):
        """Return the last two components of ``path`` joined with '/'."""
        # os.path handles both '/' and '\\' separators; splitting on '/' alone
        # printed the whole absolute path on Windows.
        return '/'.join([os.path.basename(os.path.dirname(path)),os.path.basename(path)])

    def loadEverything(self):
        """Fill both class-level frames, from memory, pickle or raw data."""
        # The file names are computed up front: the status messages below need
        # them even when the frames are already in memory (a second call used
        # to fail with UnboundLocalError).
        bugReportFile=os.path.join(self.dataFolder,'allBugReports.pickle')
        if PreprocessingUnit.all_projects_bugreports.empty:
            # The lambda defers the lookup of load_all_BRs until a rebuild is
            # really needed.
            PreprocessingUnit.all_projects_bugreports=self._load_or_build(
                bugReportFile,lambda: load_all_BRs(dataPath=self.dataPath))
        print("*** All bug reports are are preprocessed and stored as: {} ***".format(self._short_name(bugReportFile)))

        sourceCodeFile=os.path.join(self.dataFolder,'allSourceCodes.pickle')
        if PreprocessingUnit.all_projects_source_codes.empty:
            PreprocessingUnit.all_projects_source_codes=self._load_or_build(
                sourceCodeFile,lambda: load_all_SCs(dataPath=self.dataPath))
        print("*** All source codes are preprocessed and stored as: {} ***".format(self._short_name(sourceCodeFile)))
        

### MAIN

In [8]:
if __name__=="__main__":
    # Build (or load from the pickle cache) the bug-report and source-code
    # dataframes for everything found under the local "data" folder.
    config=dict(DATA_PATH=os.path.join('data'))
    preprocessor=PreprocessingUnit(dataPath=config['DATA_PATH'])
    preprocessor.execute()
   


	Loading all bug reports ... 


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for group in tqdm_notebook(all_groups):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

Loading Bug reports ... 


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for bug in tqdm_notebook(bugRepo.findall('bug')):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=92.0), HTML(value='')))


92
Loading Bug reports ... 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=133.0), HTML(value='')))


225
Loading Bug reports ... 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=91.0), HTML(value='')))


316
Loading Bug reports ... 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=217.0), HTML(value='')))


533
Loading Bug reports ... 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=158.0), HTML(value='')))


691
Loading Bug reports ... 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=271.0), HTML(value='')))


962
Loading Bug reports ... 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=132.0), HTML(value='')))


1094
Loading Bug reports ... 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=53.0), HTML(value='')))


1147
Loading Bug reports ... 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=541.0), HTML(value='')))


1688
Loading Bug reports ... 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))


1703
Loading Bug reports ... 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=130.0), HTML(value='')))


1833
Loading Bug reports ... 


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25.0), HTML(value='')))


1858

*** All bug reports are are preprocessed and stored as: C:\Users\OUR.000\Desktop\SENG544\Project2-1\Output\allBugReports.pickle ***
	Loading all source codes ... 


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for group in tqdm_notebook(all_groups):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

Loading source files of COLLECTIONS from group:Commons ...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for filename in tqdm_notebook(all_source_files):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=476.0), HTML(value='')))


Loading source files of CONFIGURATION from group:Commons ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=224.0), HTML(value='')))


Loading source files of IO from group:Commons ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=81.0), HTML(value='')))


Loading source files of LANG from group:Commons ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=247.0), HTML(value='')))


Loading source files of DATACMNS from group:Spring ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=552.0), HTML(value='')))


Loading source files of DATAMONGO from group:Spring ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=348.0), HTML(value='')))


Loading source files of DATAREST from group:Spring ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=348.0), HTML(value='')))


Loading source files of LDAP from group:Spring ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=566.0), HTML(value='')))


Loading source files of SEC from group:Spring ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1400.0), HTML(value='')))


Loading source files of SOCIALFB from group:Spring ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=253.0), HTML(value='')))


Loading source files of SPR from group:Spring ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5898.0), HTML(value='')))


Loading source files of ELY from group:Wildfly ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=68.0), HTML(value='')))



*** All source codes are preprocessed and stored as: C:\Users\OUR.000\Desktop\SENG544\Project2-1\Output\allSourceCodes.pickle ***


In [9]:
def loadEverything(outputDir='Output'):
    """Load the two cached dataframes written by the preprocessing step.

    Receives: outputDir, the folder that holds "allBugReports.pickle" and
              "allSourceCodes.pickle" (defaults to "Output", relative to the
              current working directory)
    Returns: (all_projects_bugreports, all_projects_source_codes)
    Raises: whatever pandas.read_pickle raises when a file is missing
    """
    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # cache files produced by this notebook.
    all_projects_bugreports = pd.read_pickle(os.path.join(outputDir,'allBugReports.pickle'))
    print("*** All Bug Reports are Loaded. ***")
    all_projects_source_codes = pd.read_pickle(os.path.join(outputDir,'allSourceCodes.pickle'))
    print("*** All Source Codes are Loaded. ***")
    return all_projects_bugreports, all_projects_source_codes

# Read both cached frames back, then preview the first bug report and the raw
# text of the first source file.
all_projects_bugreports, all_projects_source_codes = loadEverything()
display(all_projects_bugreports.iloc[0],
        all_projects_source_codes.iloc[0]["unprocessed_code"])

*** All Bug Reports are Loaded. ***
*** All Source Codes are Loaded. ***


fix                  [org.apache.commons.collections.map.flat3map.j...
text                                                               NaN
fixdate                                            2006-07-18 22:02:11
summary              Flat3Map.Entry.setValue() overwrites other Ent...
description          Flat3Map&amp;apos;s Entry objects will overwri...
project                                                    COLLECTIONS
average_precision                                                    0
Name: 217, dtype: object

'/*\n *  Licensed to the Apache Software Foundation (ASF) under one or more\n *  contributor license agreements.  See the NOTICE file distributed with\n *  this work for additional information regarding copyright ownership.\n *  The ASF licenses this file to You under the Apache License, Version 2.0\n *  (the "License"); you may not use this file except in compliance with\n *  the License.  You may obtain a copy of the License at\n *\n *      http://www.apache.org/licenses/LICENSE-2.0\n *\n *  Unless required by applicable law or agreed to in writing, software\n *  distributed under the License is distributed on an "AS IS" BASIS,\n *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n *  See the License for the specific language governing permissions and\n *  limitations under the License.\n */\npackage org.apache.commons.collections;\n\nimport java.util.ArrayList;\nimport java.util.EmptyStackException;\n\n/**\n * An implementation of the {@link java.util.Stack}