## Importing Libraries

In [1]:
# NLP
import nltk

In [2]:
# General Libraries
import re
# NLP
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize,word_tokenize
# Pymongo
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

In [3]:
# Check if corpus of stopwords and wordnet exists
try:
    nltk.find('stopwords')
    nltk.find('wordnet')
except LookupError:
    nltk.download('stopwords')
    nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Kishor
[nltk_data]     Satpute\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Kishor
[nltk_data]     Satpute\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
client = MongoClient('localhost',27017)
db = client['eclipse']
collection = db['initial']

# Created new collection to store processed documents
processedCollection = db['preprocessed']

In [5]:
print(db.list_collection_names())

['pairs', 'initial', 'clear']


In [7]:
stop_words = set(stopwords.words('english'))

In [8]:
##### There are 2156 bug reports whose description is blank
# Check bug_id:99873
# Check bug_id:99323

In [13]:
errors = []
docs = []
lemmatizer = WordNetLemmatizer()
snow_stemmer = SnowballStemmer(language='english')
for document in collection.find():
    doc = document.copy()
    for desc in ['description','short_desc']:
        try:
            # To remove timestamp and date
            doc[desc] = re.sub(r"(([A-Z]{2})* [\([0-9]+\/[0-9]+\/[0-9]+ [0-9]+:[0-9]{2}:[0-9]{2} (AM|PM)\))",'',doc[desc])
            
            # To remove XML code
            doc[desc] = re.sub(r"(\<\?xml[a-zA-Z0-9\.\s\=\?\"\-\>\\n\<\:\/\_]*(\<\/)[a-zA-Z0-9\:\_]*\>)",'',doc[desc])
            
            
            # To remove any hyperlinks
            doc[desc] = re.sub(r"(http|https):(\/{2})(www\.)([a-zA-z0-9]*\.([a-z]*)(\.)*)",'',doc[desc])
            doc[desc] = re.sub(r"(http|https):(\/{2}[a-zA-Z0-9\.\-\/\\n\:]*)",'',doc[desc])
            
            
            # To remove error string like in bug_id:99873
            # (e.g-line: 62\n\tServerTypeDefinitionUtil.getServerClassPathEntry)
            doc[desc] = re.sub(r"(line\:\s[0-9]*([\\n\\t])*([a-zA-Z0-9\(\)\$\[\]\\n\\t\s]*\.[a-zA-Z0-9\(\)\,\\n\[\]\s\$\_]*\))*)",
                               '',doc[desc])
            
            # To remove the org. from error eg.(org.eclipse.ui.internal.Workbench.createAndRunWorkbench(Workbench.java:366))
            doc[desc] = re.sub(r"((\()*(org|sun|java|junit|e.g)\.[a-zA-Z0-9\.\$\(\:\s\-\,\_\\]*(\)+|))",'',doc[desc])
            
            # To remove strings like /usr/lib/libthread.so.1  (bug_id:33431)
            doc[desc] = re.sub(r"((\/opt|\/usr)\/[a-zA-Z0-9\/\.\_\,\-]*)",'',doc[desc])
            
            # To remove hexadecimal numbers (bug_id:33431)
            doc[desc] = re.sub(r"(0[xX][a-fA-F0-9]+)",'',doc[desc])
            
            # To remove cpp,c,java code
            doc[desc] = re.sub(r"((\()*[a-zA-Z]+\.(cpp|java)[a-zA-Z0-9\:\\n\#\s\<\>\;\(\,\*\)\{\"\/\+\.\-\_\\n\=]*(\})*)",
                               '',doc[desc])
            
            # To remove text between {}
            doc[desc] = re.sub(r"(\{[a-zA-Z0-9\s\(\)\\n\\t\{\:\<\-\>\=\'\[\]\"\|\*\.\;\,\?]+\})",'',doc[desc])
            
            # To remove testcase(check bug report no:99844)
            doc[desc] = re.sub(r"(Testcase\:.*\})",'',doc[desc])
            
            # To remove all text within () or [] or <>
            doc[desc] = re.sub(r"(\([a-zA-Z0-9\s\+\*\.\,\<\-\>\?\\n\-\'\_\/\$\[\]\(\"\:\#\;\%\!]*\)+|(\[[a-zA-Z0-9\s\:]*\])|(\<[a-zA-Z0-9\_\.\s\:\<\,]*\>+))",
                               '',doc[desc])

            # To remove alphanumeric string like 1GE8YMJ:
            doc[desc] = re.sub(r"([0-9][a-zA-Z0-9]{6}(\:)*)",'',doc[desc])

            # To remove string starting with CVS/
            doc[desc] = re.sub(r"(CVS(\/)*([a-zA-Z]{1,15}\.[a-zA-Z]*)*)",'',doc[desc])

            # Remove string '....'
            doc[desc] = re.sub(r"\.{2,5}",' ',doc[desc])

            # To remove file name 'org.eclipse.gmt.am3.usecase.osgipluginmanagement.zip'
            doc[desc] = re.sub(r"(org\.[a-zA-Z0-9\.\$\=\_\(\:\s]*(zip|gz|tar))",'',doc[desc])

            # To remove strings like 'Authors: Mathieu Vénisse & Guillaume Doux'
            doc[desc] = re.sub(r"(Authors\:[\sA-Za-z\u00C0-\u00ff\&]+)",'',doc[desc])

            # To remove string 'Best regards'
            doc[desc] = re.sub(r"(Best regards\,.+\.)",'',doc[desc])
            
            # To remove strings like OS=linux, ARCH=x86
            doc[desc] = re.sub(r"([A-Z\.]*(\=)+[a-zA-Z0-9\s\.\_]*)",'',doc[desc])
            
            # To remove strings like (- v, - y)
            doc[desc] = re.sub(r"(\-(\s)+[a-zA-Z]+)",'',doc[desc])
            
            # To remove string like ITPVCM:WINNT
            doc[desc] = re.sub(r"([a-zA-Z]+:[a-zA-Z]+)",'',doc[desc])
            
            # To change words like Don't to Dont
            doc[desc] = re.sub(r"(\')",'',doc[desc])

            # To remove other special chacters
            doc[desc] = re.sub(r"[\-\:\?\/\[\]\"\$\>\<\,\!\+\#\*\_\|\;\}\{\%\.]",' ',doc[desc])

#             #To remove numbers like (4.0)
#             doc[desc] = re.sub(r"([A-Z]*[0-9](\s)*[0-9]*)",'',doc[desc])

            # To remove string like (STACK 0) (bug_id: 88623)
            doc[desc] = re.sub(r"([A-Z]+\s[0-9]+)",'',doc[desc])
        
            # To remove unwanted 2-3.$$$ digit numbers
            doc[desc] = re.sub(r"([0-9]([0-9])+\.*[0-9]*)",'',doc[desc])
            
            # To remove all spaces greater than 2
            doc[desc] = re.sub(r"((\s){2,})",' ',doc[desc])

            # Tokenization is carried out on the string
            word_tokens = wordpunct_tokenize(doc[desc])

            # Stores list of words after removing stop words
            wordsFiltered = []

            for word in word_tokens:
                if word.lower() not in stop_words:
                    wordsFiltered.append(word)
            
            
            """SnowStemming"""
            stem_words = []
            for word in wordsFiltered:
                stem_words.append(snow_stemmer.stem(word))
            
            """ Lemmatization:
                Considered using lemmatization instead of snowball stemming
                Stemming does not consider how the words are being used.
            """
#             lemmatizedWords = []
#             for word in wordsFiltered:
#                 lemmatizedWords.append(lemmatizer.lemmatize(word))

            doc[desc] = ' '.join(stem_words)

        except TypeError:
            # 2156 bug reports with no description
            if(len(doc[desc]) == 0):
                continue
            else:
                errors.append(doc['bug_id'])
        except DuplicateKeyError:
            continue
    # Append all doc in docs list
    docs.append(doc)

# Inserting the processes documents in the database
processedCollection.insert_many(docs)

<pymongo.results.InsertManyResult at 0x360e55540>

In [21]:
print(errors)

[]


In [14]:
doc = collection.find_one({"bug_id":"24"})
doc

{'_id': ObjectId('52e9b44854dc1c25ebdb1efc'),
 'bug_id': '24',
 'product': 'Platform',
 'description': "Since refresh() is called before the catchup/release view opens, it should be as fast as possible.\n\nNOTES:\n\nBB (12/04/2001 11:32:05 AM)\n\tThe most promising optimization idea is: Don't return an IServerResourceTree from fetchRemoteTree\n\tbut instead use a call-back to interleave the server communication with updating the sync info.\n\nBB (12/04/2001 11:30:31 AM)\n\tSee this PR for other performance tuning opportunities: 1GAT2V2: ITPVCM:WINNT - Releasing very slow\n\nMichaelV (19/09/2001 2:14:23 PM)\n\n\tHere are some percentages on a refresh from OTT to Paris with auto-build turned OFF\n\n\t\tTotal Time: 97776 ms\n\n\t\t(57026) 58.3% in Subscriber.refresh()\n\t\t\t(40704) 41.6% CVSAdapter.fetchRemoteTree()\n\t\t\t(14918) 15.3% Subscriber.recursiveRefresh()\n\t\t(13242) 13.5% in SharingManager.manage() from  org.eclipse.vcm.ui.sync.Repository.autoManage()\n\t\t\tWhat is autoMana

In [15]:
doc = processedCollection.find_one({"bug_id":"24"})
doc

{'_id': ObjectId('52e9b44854dc1c25ebdb1efc'),
 'bug_id': '24',
 'product': 'Platform',
 'description': 'sinc refresh call catchup releas view open fast possibl note promis optim idea dont return iserverresourcetre fetchremotetre instead use call back interleav server communic updat sync info see pr perform tune opportun slow michaelv percentag refresh ott pari auto build turn total time ms 3 subscrib refresh 6 3 subscrib recursiverefresh 5 sharingmanag manag automanag 5 javamodelmanag resourcechang 4 3 resourcedeltafactori computedelta hot spot socket read socket write string substr 4 6 string indexof 3 6 path segment 3 2 string compareto 3 2 accumulatingprogressmonitor subtask 3 path append 3 2 string concat auto manag time seem spent determin ignor seem mani way improv michaelv seem lot room improv subscrib recursiverefresh howev general approach use vcm determin chang seem inher inefici client follow step take place updat 1 client send inform load file 2 server make one pass determi