# All Imports

In [111]:
from jira import JIRA
import re
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
from hyperdash import monitor_cell
import warnings
import os
import pandas as pd


from nltk.corpus import stopwords
from collections import Counter
import pprint
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


# Doc2Vec Approach

## Helpful Links

https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5

https://radimrehurek.com/gensim/models/doc2vec.html

http://jira.readthedocs.io/en/latest/examples.html#comments


https://www.codeproject.com/Articles/11835/WordNet-based-semantic-similarity-measurement

https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a

https://pypi.org/project/sematch/1.0.3/

https://stackoverflow.com/questions/16877517/compare-similarity-of-terms-expressions-using-nltk

https://stackoverflow.com/questions/42781292/doc2vec-get-most-similar-documents

https://radimrehurek.com/gensim/models/doc2vec.html

https://www.oreilly.com/learning/how-do-i-compare-document-similarity-using-python

https://stackoverflow.com/questions/44589872/doc2vec-pull-documents-from-inferred-document

https://www.quora.com/Is-doc2vec-suitable-in-information-retrieval-to-calculate-distance-between-query-and-doc

https://machinelearningmastery.com/prepare-text-data-deep-learning-keras/

https://stackoverflow.com/questions/47022246/warning-message-after-importing-gensim-module-in-windows-os

## Util Functions

In [107]:
# Constants
maxResults = 2000
jira_url = 'https://jira.endurance.com'
username = 'bhavul.g'
password = 'insert-your-password-here'
filter_fnb_tickets = 'project = WSE AND status in (Done, Completed) AND (component in ("Support - FnB") OR "Customer Request Type" = "Support - FnB") AND component not in ("Ad hoc") ORDER BY updated DESC, created ASC, component ASC, priority DESC'
stops = set(stopwords.words("english"))
stemmer = SnowballStemmer('english')
ignored_words_string = "reseller, customer, error, c, r, issue, client, ticket, https, screenshot, support, following, name, getting, kindly, please, reference, see, prntscr, information, trying, need, ref, one, hence, pm, jira, adhocnfoway, www, wse, endurance, gmail, says, thank, png, jpg, jpeg, gif, paye, jul, e, still, color, replicate, 2fwww, 2fservlet, refer, dnot, gets, two, april, entire, asked, many, want, fine, attachment, told, understand, regarding, take, keep, latest, someone, science, hello, might, rahul, backend, yesterday, answer, searched"

In [57]:
def remove_code_from_comments(comment_body):
    return re.sub(r'{code:.*}[\s\S]*?{code}','',str(comment_body))

def get_jira_issue_object(authed_jira, jira_name):
    return authed_jira.issue(jira_name)

def get_title(jira_issue_object):
    return jira_issue_object.fields.summary

def get_summary(jira_issue_object):
    return jira_issue_object.fields.description

def get_jira_id(jira_issue_object):
    return jira_issue_object.key

def get_status(jira_issue_object):
    return jira_issue_object.fields.status

def get_list_of_comments(jira_issue_object):
    return jira_issue_object.fields.comment.comments

def get_reqd_comments_data(list_of_comments):
    ticket_dict = {}
    ticket_dict['comments_data'] = [] 
    ticket_dict['comments_corpus'] = []
    for comment in list_of_comments:
        comment_data = {}
        comment_data['emailAddress'] = comment.author.emailAddress
        comment_data['body'] = comment.body
        comment_data['created'] = comment.created
        comment_data['updated'] = comment.updated
        ticket_dict['comments_data'].append(comment_data)
        comment_corpus_data = remove_code_from_comments(comment_data['body'])
        ticket_dict['comments_corpus'].append(comment_corpus_data)
    return ticket_dict['comments_data'],ticket_dict['comments_corpus']

def filter_crawler(authed_jira, jira_filter):
    print("Crawling the filter...")
    filter_tickets = authed_jira.search_issues(jira_filter, maxResults=maxResults)
    tickets_corpus = []
    for ticket in filter_tickets:
        ticket_dict = {}
        jira_id = get_jira_id(ticket)
        ticket_full_data = authed_jira.issue(jira_id)  
        ticket_dict['jiraid'] = jira_id
        ticket_dict['title'] = get_title(ticket_full_data) 
        ticket_dict['summary'] = get_summary(ticket_full_data)
        list_of_comments = get_list_of_comments(ticket_full_data)
        ticket_dict['comments_data'],ticket_dict['comments_corpus'] = get_reqd_comments_data(list_of_comments)
        tickets_corpus.append(ticket_dict)
    print("Crawling done.")
    return tickets_corpus

## Features yet to Add

- Automatic model improvement using human input (cron that checks for corrected bots comments)
- Take assignee's name into picture while calculating the similarity

## PLAN

Actual Thing that runs every 1 hour: 
- find all open issues and get their names
- Maintain a list for which you've alrdy done, and now make api call only for one you've not done
- for the new ones, Get their body
- Find similar ones via gensim model


2nd cron that runs once a week:
- Find all done issues > some creation date and get their names
- Check in the list you maintain the issues which are correctly marked done
- Figure out any new ones
- Mail those people, as well as dhanya.n and swapnil.b and joel.r

3rd cron that runs once a week:
- Make a corpus of data of all documents till last week
- Ignore bots comments in that
- train new model on this
- SOMEHOW CHECK IF THIS IS BETTER THAN PREVIOUS OR NOT


## Training Model

In [58]:
authed_jira = JIRA(jira_url,auth=(username, password))

In [59]:
%%monitor_cell "crawling jirabot"
fnb_tickets_corpus = filter_crawler(authed_jira=authed_jira, jira_filter=filter_fnb_tickets)
tagged_data_only_summary = [TaggedDocument(words=word_tokenize((str(ticket_dict['title'])+" "+str(ticket_dict['summary'])).lower()), tags=[str(ticket_dict['jiraid'])]) for ticket_dict in fnb_tickets_corpus]
tagged_data_all_text = [TaggedDocument(words=word_tokenize((str(ticket_dict['title'])+" "+str(ticket_dict['summary'])+" "+str(ticket_dict['comments_corpus'])).lower()), tags=[str(ticket_dict['jiraid'])]) for ticket_dict in fnb_tickets_corpus]

Crawling the filter...
Crawling done.
This run of "crawling jirabot" ran for 0:03:05 and logs are available locally at: /Users/bhavul.g/.hyperdash/logs/crawling-jirabot/crawling-jirabot_2018-04-22t07-51-22-709265.log


In [47]:
def train_doc2vec_model(model_name_prefix,tagged_data, vec_size, alpha, min_word_count_per_doc, dm, no_of_epochs):
    model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=alpha,
                min_count=min_word_count_per_doc,
                dm =dm,
                workers = 4)
    
    model.build_vocab(tagged_data)
    
    for epoch in range(no_of_epochs):
        model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
        # decrease the learning rate
        model.alpha -= 0.0002
        # fix the learning rate, no decay
        model.min_alpha = model.alpha
    modelname = model_name_prefix+"_d2v_"+str(dm)+"dm_"+str(no_of_epochs)+"epoch_"+str(vec_size)+"vecsize_"+str(alpha)+"alpha.model"
    model.save(modelname)
    print("Model trained:",modelname)

In [None]:
%%monitor_cell "doc2vec Models jirabot"
warnings.filterwarnings(action='once')
epochs_list = [100,500,1000]
size_list = [10,50,100,200]
alpha_list = [0.05, 0.01, 0.025]
dm_list = [0,1]

i = 1
for vec_size in size_list:
    for alpha in alpha_list:
        for epoch in epochs_list:
            for dm in dm_list:
                train_doc2vec_model(str(i)+"_summary",tagged_data_only_summary, vec_size, alpha, 1, dm, epoch)
                train_doc2vec_model(str(i)+"_alldata",tagged_data_all_text, vec_size, alpha, 1, dm, epoch)

In [44]:
# models_list = ['1_alldata_d2v_0dm_1000epoch_100vecsize_0.025alpha.model']
# tickets_dev_set = ['WSE-2607']

In [45]:
df = pd.DataFrame({'Modelname':['TestModel'], 'For':['TEST-123'], 'Related-Tickets':['TEST;TEST'], 'Points':[-10]})
df

Unnamed: 0,For,Modelname,Points,Related-Tickets
0,TEST-123,TestModel,-10,TEST;TEST


In [None]:
%%monitor_cell "doc2vec Models jirabot - devset"

tickets_dev_set = ['WSE-2607','WSE-3608','OFB-457','OFB-557','OFB-582','OFB-634','OFB-1002','OFB-1005','OFB-1030']
models_list = os.listdir('./models')
for modelname in models_list:
    print("processing ",modelname,"....")
    model = Doc2Vec.load("./models/"+str(modelname))
    for ticket in tickets_dev_set:
        data_dict = {}
        test_issue_text_data = ''
        data_dict['Modelname'] = modelname
        data_dict['Points'] = 0

        test_issue = authed_jira.issue(ticket)
        data_dict['For'] = get_jira_id(test_issue)
        
        test_issue_text_data += str(get_title(test_issue))
        test_issue_text_data += str(get_summary(test_issue))
        list_of_comments_test = get_list_of_comments(test_issue)
        for comment in list_of_comments_test:
            test_issue_text_data += str(remove_code_from_comments(comment.body))
        
        test_data = word_tokenize(test_issue_text_data.lower())
        test_issue_vector = model.infer_vector(test_data)
        similar_doc = model.docvecs.most_similar(positive=[test_issue_vector], topn=5)
        data_dict['Related-Tickets'] = ":".join(["("+str(doc)+"|"+str("{0:.3f}".format(similar_score))+")" for (doc,similar_score) in similar_doc])
        df.loc[df.index.max() + 1] = data_dict
    print("all done for model - ",modelname)

In [None]:
df

In [51]:
df.to_csv('dev_set_outputs.csv', index=False)

In [10]:

df

Unnamed: 0,Modelname,For,Related-Tickets,Points
,A,B,C,25


# TfIdf Approach

## Helpful links

https://github.com/RaRe-Technologies/gensim/issues/1560
    
https://marketplace.atlassian.com/plugins/com.deniz.jira.similarissues/server/overview
    
http://www.diss.fu-berlin.de/docs/servlets/MCRFileNodeServlet/FUDOCS_derivate_000000003281/JiraExpertFinder.pdf
    
https://www.kaggle.com/currie32/predicting-similarity-tfidfvectorizer-doc2vec
    
https://stackoverflow.com/questions/12118720/python-tf-idf-cosine-to-find-document-similarity/12128777
    
https://docs.google.com/spreadsheets/d/14DFm-6wamaaOMN0b-4tYd5gOtHpqzdytLT-KUwLuolU/edit#gid=1607381971

https://stackoverflow.com/questions/49134593/how-to-expand-the-words-of-tfidf-vectorizer-in-sklearn-without-retraining-the-wh

https://www.quora.com/How-does-doc2vec-represent-feature-vector-of-a-document-Can-anyone-explain-mathematically-how-the-process-is-done

## Practice

In [99]:
# PRACTICE. TO KNOW IF ALL_WORDS that are getting out are correct ones.


from nltk.corpus import stopwords
from collections import Counter
import pprint
from nltk.stem import SnowballStemmer


ignored_words_list = [x.strip() for x in ignored_words_string.split(',')]
words_data = pd.DataFrame({'word':['test'],'count':[-1]})
final_corpus = []
all_words = []
list_of_docs = []
for ticket_dict in fnb_tickets_corpus:
    doc_cleaned_text = ''
    words = (str(ticket_dict['title'])+" "+str(ticket_dict['summary'])).lower()
    words = re.sub('\\b\d+(?:\.\d+)?\s*', '', words)      # remove a number or decimal num followed by a space
    words = re.sub('[rc]*id\s*:*', '', words)              # remove rid, id fields
    words = re.split('\W+', words)
    words = ' '.join(words)
    words = words.split()
    words = [w for w in words if not w in stops]
    words = [w for w in words if not w in ignored_words_list]
    stemmed_words = [stemmer.stem(word) for word in words]
    all_words += stemmed_words
    doc_cleaned_text = ' '.join(stemmed_words)
    list_of_docs.append(doc_cleaned_text)
    final_corpus.append({'jiraid':ticket_dict['jiraid'], 'words':doc_cleaned_text})

words_dict = Counter(all_words)
words_dict = dict(words_dict)
for key in words_dict.keys():
    if not words_dict[key] == 1:
        words_data.loc[words_data.index.max()+1] = [words_dict[key],key]
words_data.to_csv('words_frequencies1.csv',index=False)

In [101]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfvectorizer = TfidfVectorizer()
tfidf = tfidfvectorizer.fit_transform(list_of_docs)
tfidf_test = tfidfvectorizer.transform()
from sklearn.metrics.pairwise import linear_kernel
cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
related_docs_indices = cosine_similarities.argsort()[:-5:-1]
related_docs_indices

array([  0,  50, 156, 165])

## Util Functions - TfIdf Approach

In [134]:
def clean_document(document_of_words):
    document_of_words = document_of_words.lower()
    document_of_words = re.sub('\\b\d+(?:\.\d+)?\s*', '', document_of_words)      # remove a number or decimal num followed by a space
    document_of_words = re.sub('[rc]*id\s*:*', '', document_of_words)              # remove rid, id fields
    document_of_words = re.split('\W+', document_of_words)                        # remove all non-words (make a list)
    document_of_words = [w for w in document_of_words if not w in stops]           # remove stop words
    document_of_words = [w for w in document_of_words if not w in ignored_words_list]    # remove ignored words
    stemmed_words = [stemmer.stem(word) for word in document_of_words]       # stem each word
    return ' '.join(stemmed_words)
    

def extract_clean_documents_from_corpus(corpus):
    print("Extracting and Cleaning documents...")
    final_corpus = []
    list_of_docs = []
    i = 0
    for ticket_dict in corpus:
        print("Processing ",ticket_dict['title'])
        doc_cleaned_text = ''
        document_of_words = (str(ticket_dict['title'])+" "+str(ticket_dict['summary']))
        doc_cleaned_text = clean_document(document_of_words)
        list_of_docs.append(doc_cleaned_text)
        final_corpus.append({'jiraid':ticket_dict['jiraid'], 'words':doc_cleaned_text, 'index':i})
        i+=1
    return list_of_docs,final_corpus

def find_top_n_similar_documents(n,tfidf_test,tfidf_trainingset,cleaned_training_corpus):
    cosine_similarities = linear_kernel(tfidf_test, tfidf_trainingset).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-n:-1]
    related_jira_ids = []
    for ticket in cleaned_training_corpus:
        if(ticket['index'] in related_docs_indices):
            related_jira_ids.append(ticket['jiraid'])
    return related_docs_indices,related_jira_ids

## Training and Testing

### Connecting To JIRA 

In [114]:
authed_jira = JIRA(jira_url,auth=(username, password))

### Filtering Tickets

In [116]:
%%monitor_cell "crawling jirabot"
fnb_tickets_corpus = filter_crawler(authed_jira=authed_jira, jira_filter=filter_fnb_tickets)

Crawling the filter...
Crawling done.
This run of "crawling jirabot" ran for 0:05:17 and logs are available locally at: /Users/bhavul.g/.hyperdash/logs/crawling-jirabot/crawling-jirabot_2018-04-22t12-27-04-970127.log


### Model

In [None]:
tfidf_model = TfidfVectorizer()
list_of_docs,training_ticket_corpus = extract_clean_documents_from_corpus(fnb_tickets_corpus)
tfidf_trainingset = tfidf_model.fit_transform(list_of_docs)

### Testing

In [138]:
# tickets_dev_set = ['WSE-2607','WSE-3608','OFB-457','OFB-557','OFB-582','OFB-634','OFB-1002','OFB-1005','OFB-1030']
tickets_dev_set = ['WSE-3150']
for ticket in tickets_dev_set:
    test_issue = authed_jira.issue(ticket)
    title = get_title(test_issue)
    summary = get_summary(test_issue)
    document_test = str(title)+" "+str(summary)
    cleaned_document = clean_document(document_test)
    cleaned_document = [cleaned_document]
    tfidf_test = tfidf_model.transform(cleaned_document)
    related_indices, related_jiras = find_top_n_similar_documents(5,tfidf_test[0:1],tfidf_trainingset,training_ticket_corpus)
    print("\n",ticket," >>>> ",related_jiras,"\n")       


 WSE-3150  >>>>  ['WSE-3150', 'WSE-3245', 'WSE-2324', 'WSE-2368'] 

