In [1]:
import numpy as np
np.random.seed(1337)
import json, re, nltk, string
from nltk.corpus import wordnet
from gensim.models import Word2Vec
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge
from keras.optimizers import RMSprop
from keras.utils import np_utils
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics.pairwise import cosine_similarity

Using TensorFlow backend.


In [2]:
# Input locations for the Chrome issue dumps (absolute Windows paths, written
# as raw strings so the backslashes need no escaping).
# open_bugs_json is the file the preprocessing cell loads with json.load and
# reads 'issue_title' / 'description' from.
open_bugs_json = r'D:\BugTriage\Chrome\deep_data.json'
# Not read by any cell visible in this part of the notebook; presumably the
# closed-issue data used for classifier training -- confirm against later cells.
closed_bugs_json = r'D:\BugTriage\Chrome\classifier_data_0.json'

In [3]:
# --- Word2Vec settings (passed to the Word2Vec constructor) ---
embed_size_word2vec = 200        # forwarded as `size`: length of each word vector
context_window_word2vec = 5      # forwarded as `window`
min_word_frequency_word2vec = 5  # forwarded as `min_count`

# --- Classifier settings ---
# None of these are consumed in the cells visible here; the notes below are
# inferred from the names only and should be confirmed against their usage.
batch_size = 32           # presumably the training mini-batch size
rankK = 10                # presumably the K used for top-K ranking
numCV = 10                # presumably the number of cross-validation folds
min_sentence_length = 15  # presumably a lower bound on tokens per issue
max_sentence_len = 50     # presumably an upper bound on tokens per issue

In [4]:
# Load the raw issue dump: a JSON list of dicts, each of which is read below
# for its 'issue_title' and 'description' fields. strict=False makes the JSON
# decoder accept control characters (e.g. raw newlines) inside string values.
with open(open_bugs_json) as data_file:
    data = json.load(data_file, strict=False)

# all_data collects one token list per issue (title tokens followed by
# description tokens); it is the sentence corpus later handed to Word2Vec.
all_data = []
for item in data:
    #1. Remove \r
    current_title = item['issue_title'].replace('\r', ' ')
    current_desc = item['description'].replace('\r', ' ')
    #2. Remove URLs
    current_desc = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', current_desc)
    #3. Remove Stack Trace
    # str.find returns -1 when the marker is absent; slicing with [:-1] would
    # silently drop the description's last character, so only truncate when
    # the marker was actually found.
    start_loc = current_desc.find("Stack trace:")
    if start_loc != -1:
        current_desc = current_desc[:start_loc]
    #4. Remove hex code
    # NOTE(review): the leading (\w+) requires at least one word character
    # directly before "0x", so a standalone token such as "0x1f" is not
    # removed -- confirm whether that is intended.
    current_desc = re.sub(r'(\w+)0x\w+', '', current_desc)
    current_title = re.sub(r'(\w+)0x\w+', '', current_title)
    #5. Change to lower case
    current_desc = current_desc.lower()
    current_title = current_title.lower()
    #6. Tokenize
    current_desc_tokens = nltk.word_tokenize(current_desc)
    current_title_tokens = nltk.word_tokenize(current_title)
    #7. Strip leading and trailing punctuation marks from every token
    current_desc_filter = [word.strip(string.punctuation) for word in current_desc_tokens]
    current_title_filter = [word.strip(string.punctuation) for word in current_title_tokens]
    #8. Join the lists and drop tokens that became empty after stripping.
    # Build a real list: on Python 3 filter() returns a one-shot iterator,
    # which would be exhausted after the first pass over the corpus, while
    # Word2Vec needs to iterate over every sentence more than once.
    current_data = [word for word in current_title_filter + current_desc_filter if word]
    all_data.append(current_data)

In [None]:
# Train word embeddings on the tokenized issues; the hyperparameters come from
# the configuration cell.
wordvec_model = Word2Vec(all_data, min_count=min_word_frequency_word2vec, size=embed_size_word2vec, window=context_window_word2vec)
# gensim >= 1.0 keeps the vocabulary on the model's KeyedVectors (`model.wv`);
# older releases expose it on the model itself. Fall back to the model so the
# cell works on either.
vocabulary = getattr(wordvec_model, 'wv', wordvec_model).vocab
vocab_size = len(vocabulary)

In [8]:
# Preview only the first few records: displaying the whole `data` list writes
# every loaded issue into the notebook output.
data[:3]

[{'id': 1,
  'issue_id': 2,
  'issue_title': 'Testing if chromium id works',
  'reported_time': '2008-08-30 16:00:21',
  'owner': '',
  'description': '\nwhat steps will reproduce the problem\n1\n2\n3\n\r\nwhat is the expected output what do you see instead\n\r\n\r\nplease use labels and text to provide additional information\n \n'},
 {'id': 2,
  'issue_id': 3,
  'issue_title': 'This is a test',
  'reported_time': '2008-08-31 02:47:11',
  'owner': '',
  'description': '\nproduct version       0214927\r\nurls if applicable  httpwwwgooglecom\r\nother browsers tested\nadd ok or fail after other browsers where you have tested this issue\n     safari 3 fail\r\n    firefox 3 fail\r\n         ie 7 fail\r\n\r\nwhat steps will reproduce the problem\n1 eat\r\n2 sleep\r\n\r\nwhat is the expected result\n3 wake up\r\n\r\nwhat happens instead\n3 sleep continues unabated\r\n\r\nplease provide any additional information below attach a screenshot if \r\npossible\r\n \n'},
 {'id': 3,
  'issue_id': 5,
 