## Importing Libraries

In [3]:
# NLP
import nltk

In [13]:
# General Libraries
import re
# NLP
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize,word_tokenize
# Pymongo
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

In [8]:
# Check if corpus of stopwords and wordnet exists
try:
    nltk.find('stopwords')
    nltk.find('wordnet')
except LookupError:
    nltk.download('stopwords')
    nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Kishor
[nltk_data]     Satpute\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Kishor
[nltk_data]     Satpute\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
client = MongoClient('localhost',27017)
db = client['eclipse']
collection = db['initial']

# Created new collection to store processed documents
processedCollection = db['preprocessed']

In [10]:
print(db.list_collection_names())

['pairs', 'initial', 'clear', 'preprocessed']


In [11]:
stop_words = set(stopwords.words('english'))

In [5]:
##### There are 2156 bug reports whose description is blank
# Check bug_id:99873
# Check bug_id:99323

In [15]:
errors = []
docs = []
lemmatizer = WordNetLemmatizer()
snow_stemmer = SnowballStemmer(language='english')
for document in collection.find():
    doc = document.copy()
    for desc in ['description','short_desc']:
        try:
            # To remove timestamp and date
            doc[desc] = re.sub(r"(([A-Z]{2})* [\([0-9]+\/[0-9]+\/[0-9]+ [0-9]+:[0-9]{2}:[0-9]{2} (AM|PM)\))",'',doc[desc])
            
            # To remove XML code
            doc[desc] = re.sub(r"(\<\?xml[a-zA-Z0-9\.\s\=\?\"\-\>\\n\<\:\/\_]*(\<\/)[a-zA-Z0-9\:\_]*\>)",'',doc[desc])
            
            
            # To remove any hyperlinks
            doc[desc] = re.sub(r"(http|https):(\/{2})(www\.)([a-zA-z0-9]*\.([a-z]*)(\.)*)",'',doc[desc])
            doc[desc] = re.sub(r"(http|https):(\/{2}[a-zA-Z0-9\.\-\/\\n\:]*)",'',doc[desc])
            
            
            # To remove error string like in bug_id:99873
            # (e.g-line: 62\n\tServerTypeDefinitionUtil.getServerClassPathEntry)
            doc[desc] = re.sub(r"(line\:\s[0-9]*([\\n\\t])*([a-zA-Z0-9\(\)\$\[\]\\n\\t\s]*\.[a-zA-Z0-9\(\)\,\\n\[\]\s\$\_]*\))*)",
                               '',doc[desc])
            
            # To remove the org. from error eg(org.eclipse.ui.internal.Workbench.createAndRunWorkbench(Workbench.java:366))
            doc[desc] = re.sub(r"((\()*(org|sun|java|junit|e.g)\.[a-zA-Z0-9\.\$\(\:\s\-\,\_\\]*(\)+|))",'',doc[desc])
            
            # To remove strings like /usr/lib/libthread.so.1  (bug_id:33431)
            doc[desc] = re.sub(r"((\/opt|\/usr)\/[a-zA-Z0-9\/\.\_\,\-]*)",'',doc[desc])
            
            # To remove hexadecimal numbers (bug_id:33431)
            doc[desc] = re.sub(r"(0[xX][a-fA-F0-9]+)",'',doc[desc])
            
            # To remove cpp,c,java code
            doc[desc] = re.sub(r"((\()*[a-zA-Z]+\.(cpp|java)[a-zA-Z0-9\:\\n\#\s\<\>\;\(\,\*\)\{\"\/\+\.\-\_\\n\=]*(\})*)",
                               '',doc[desc])
            
            # To remove text between {}
            doc[desc] = re.sub(r"(\{[a-zA-Z0-9\s\(\)\\n\\t\{\:\<\-\>\=\'\[\]\"\|\*\.\;\,\?]+\})",'',doc[desc])
            
            # To remove testcase(check bug report no:99844)
            doc[desc] = re.sub(r"(Testcase\:.*\})",'',doc[desc])
            
            # To remove all text within () or [] or <>
            doc[desc] = re.sub(r"(\([a-zA-Z0-9\s\+\*\.\,\<\-\>\?\\n\-\'\_\/\$\[\]\(\"\:\#\;]*\)+|(\[[a-zA-Z0-9\s\:]*\])|(\<[a-zA-Z0-9\_\.\s\:\<\,]*\>+))",
                               '',doc[desc])

            # To remove alphanumeric string like 1GE8YMJ:
            doc[desc] = re.sub(r"([0-9][a-zA-Z0-9]{6}(\:)*)",'',doc[desc])

            # To remove string starting with CVS/
            doc[desc] = re.sub(r"(CVS\/[a-zA-Z]{1,15})",'',doc[desc])

            # Remove string '....'
            doc[desc] = re.sub(r"\.{2,5}",' ',doc[desc])

            # To remove file name 'org.eclipse.gmt.am3.usecase.osgipluginmanagement.zip'
            doc[desc] = re.sub(r"(org\.[a-zA-Z0-9\.\$\=\_\(\:\s]*(zip|gz|tar))",'',doc[desc])

            # To remove strings like 'Authors: Mathieu Vénisse & Guillaume Doux'
            doc[desc] = re.sub(r"(Authors\:[\sA-Za-z\u00C0-\u00ff\&]+)",'',doc[desc])

            # To remove string 'Best regards'
            doc[desc] = re.sub(r"(Best regards\,.+\.)",'',doc[desc])
            
            # To remove strings like OS=linux, ARCH=x86
            doc[desc] = re.sub(r"([A-Z\.]*(\=)+[a-zA-Z0-9\s\.\_]*)",'',doc[desc])
            
            # To remove strings like (- v, - y)
            doc[desc] = re.sub(r"(\-(\s)*[a-zA-Z]+)",'',doc[desc])

            # To remove other special chacters
            doc[desc] = re.sub(r"[\-\'\:\?\/\[\]\"\$\>\<\,\!\+\#\*\_\|\;\}\{]",' ',doc[desc])

#             #To remove numbers like (4.0)
#             doc[desc] = re.sub(r"([A-Z]*[0-9](\s)*[0-9]*)",'',doc[desc])

            # To remove string like (STACK 0) (bug_id: 88623)
            doc[desc] = re.sub(r"([A-Z]+\s[0-9]+)",'',doc[desc])
        
            # To remove unwanted 2-3 digit numbers
            doc[desc] = re.sub(r"([0-9]([0-9])+)",'',doc[desc])
            
            # To remove all spaces greater than 2
            doc[desc] = re.sub(r"((\s){2,}|\.)",' ',doc[desc])

            # Tokenization is carried out on the string
            word_tokens = wordpunct_tokenize(doc[desc])

            # Stores list of words after removing stop words
            wordsFiltered = []

            for word in word_tokens:
                if word.lower() not in stop_words:
                    wordsFiltered.append(word)
            
            
            """SnowStemming"""
            stem_words = []
            for word in wordsFiltered:
                stem_words.append(snow_stemmer.stem(word))
            
            """ Lemmatization:
                Considered using lemmatization instead of snowball stemming
                Stemming does not consider how the words are being used.
            """
#             lemmatizedWords = []
#             for word in wordsFiltered:
#                 lemmatizedWords.append(lemmatizer.lemmatize(word))

            doc[desc] = ' '.join(stem_words)

        except TypeError:
            # 2156 bug reports with no description
            if(len(doc[desc]) == 0):
                continue
            else:
                errors.append(doc['bug_id'])
        except DuplicateKeyError:
            continue
    # Append all doc in docs list
    docs.append(doc)

# Inserting the processes documents in the database
processedCollection.insert_many(docs)

BulkWriteError: batch op errors occurred, full error: {'writeErrors': [{'index': 0, 'code': 11000, 'errmsg': "E11000 duplicate key error collection: eclipse.preprocessed index: _id_ dup key: { : ObjectId('52e9b44754dc1c25ebdb1ee5') }", 'op': {'_id': ObjectId('52e9b44754dc1c25ebdb1ee5'), 'bug_id': '3', 'product': 'Platform', 'description': 'pr delet indic sync viewer subtl would even true someon vision problem ne good suggest use label compar text viewer may also consid better icon pr file project indic file delet workspac look file workspac empti note appear rightfac arrow sync view sent screen cap ne request clarif arrow clearer arrow present otherwis easi miss distinct especi user attent focuss text compar pane expect workspac file label indic delet repositori file label delet repositori', 'bug_severity': 'normal', 'dup_id': [], 'short_desc': 'sync indic delet', 'priority': 'P5', 'version': '2.0', 'component': 'Team', 'delta_ts': '2010-05-07 10:28:53 -0400', 'bug_status': 'RESOLVED', 'creation_ts': '2001-10-10 21:34:00 -0400', 'resolution': 'FIXED'}}], 'writeConcernErrors': [], 'nInserted': 0, 'nUpserted': 0, 'nMatched': 0, 'nModified': 0, 'nRemoved': 0, 'upserted': []}

In [21]:
print(errors)

[]


In [12]:
doc = processedCollection.find_one({"bug_id":"76219"})
doc

{'_id': ObjectId('52e9c46e54dc1c25ebdc441c'),
 'bug_id': '76219',
 'product': 'JDT',
 'description': 'System 3 1M2 0 code exhibit inconsistant behavior Sun Eclipse compiler interface AA abstract class BB implement AA Note formal type parameter Z AA bounded recursively AA corresponding type argument declared type formal parameter wht method implementation wildcard bound understanding unbounded wildcards implicit bound Object case mean body expression wht z assumed Object method b Eclipse compiler seems reject code reason Bound mismatch type valid substitute bounded parameter type AA hand validly constructed actual parameter wht type AA U would certainly constructed respecting bound Z Perhaps Sun JDK javac accepts code without error know correct behavior posting bug JDK Eclipse bug page',
 'bug_severity': 'normal',
 'dup_id': [],
 'short_desc': '1 5 wildcards method formal parameter type',
 'priority': 'P3',
 'version': '3.1',
 'component': 'Core',
 'delta_ts': '2004-11-04 11:42:30 -0500