In [1]:
### Import necessary libraries and functions/ classes
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from string import punctuation
import pandas as pd
stemmer = SnowballStemmer("english")


##uncomment and run if wordnet is not downloaded
##import nltk
##nltk.download('wordnet')

In [2]:
### gets the root of the word
def lemmatize_stemming(word):
    return stemmer.stem(WordNetLemmatizer().lemmatize(word, pos ='v'))

### applies root of a word if word 
def wordCleaner(text):
    result = []
    for token in simple_preprocess(text.translate(str.maketrans('','',punctuation))):
        if token not in STOPWORDS and len(token)>2:
            result.append(lemmatize_stemming(token))
    return result


In [3]:
### summary from CASSANDRA-15302 is provided
example = "Pool is busy (no available connection and the queue has reached its max size 256)))"
print('Example Preproccessed: ')
print(wordCleaner(example))


Example Preproccessed: 
['pool', 'busi', 'avail', 'connect', 'queue', 'reach', 'max', 'size']


In [5]:
#### reads the data keeps only the summary
data = pd.read_csv('ticketData.csv')['summary']
print(data.head())
#### applies the preprocess function to each row
dataProcessed = data.apply(wordCleaner)
print(dataProcessed.head())
print("Number of tickets : {} ".format(len(data)))

0    Remove hook initialization in ctor from Bigtab...
1                             Display DAG from the CLI
2       Add interactivity to pre-commit image building
3                      Add typehint to GCP's Task Hook
4    Yamllint is not needed as prerequisite for pre...
Name: summary, dtype: object
0                  [remov, hook, initi, ctor]
1                         [display, dag, cli]
2     [add, interact, precommit, imag, build]
3           [add, typehint, gcps, task, hook]
4    [yamllint, need, prerequisit, precommit]
Name: summary, dtype: object
Number of tickets : 1228 


In [6]:
#### Creates list of words across the documents
#### Removes words that are too common or too rare
dictionary = gensim.corpora.Dictionary(dataProcessed)
dictionary.filter_extremes(no_above = .5)

In [10]:
#### gets word count for each word that is present in a dictionary
bow_corpus = [dictionary.doc2bow(doc) for  doc in dataProcessed]
### Example Word
exampleWord = bow_corpus[300]
for i in range(len(exampleWord)):
    print('Word {} (\"{}") appears {} time.'.format(exampleWord[i][0],dictionary[exampleWord[i][0]],exampleWord[i][1]))


Word 66 ("connect") appears 1 time.
Word 183 ("size") appears 1 time.
Word 204 ("pool") appears 1 time.


In [11]:
### Trains a lda model and prints out words/ weights in topic
lda_model = gensim.models.LdaMulticore(bow_corpus,num_topics= 5,id2word=dictionary, passes = 2, workers = 2)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx,topic))


Topic: 0 
Words: 0.045*"updat" + 0.031*"use" + 0.030*"fail" + 0.021*"add" + 0.021*"test" + 0.020*"configur" + 0.018*"api" + 0.015*"class" + 0.015*"hive" + 0.015*"implement"
Topic: 1 
Words: 0.081*"add" + 0.040*"support" + 0.024*"fail" + 0.023*"creat" + 0.018*"job" + 0.017*"error" + 0.016*"field" + 0.015*"java" + 0.013*"project" + 0.013*"night"
Topic: 2 
Words: 0.049*"build" + 0.039*"remov" + 0.037*"fail" + 0.020*"improv" + 0.017*"document" + 0.017*"option" + 0.016*"properti" + 0.014*"file" + 0.014*"throw" + 0.014*"partit"
Topic: 3 
Words: 0.031*"support" + 0.021*"new" + 0.020*"releas" + 0.020*"updat" + 0.020*"data" + 0.019*"check" + 0.017*"instal" + 0.016*"doc" + 0.016*"python" + 0.014*"request"
Topic: 4 
Words: 0.039*"fix" + 0.030*"log" + 0.029*"test" + 0.024*"upgrad" + 0.023*"version" + 0.020*"use" + 0.020*"spark" + 0.018*"file" + 0.018*"add" + 0.016*"support"


In [12]:
### Example Scoring to see where it would be classsified
for index, score in sorted(lda_model[exampleWord], key = lambda x: -1*x[1]):
    print("\nScore: {}\t \n Topic: {}". format(score, lda_model.print_topic(index,5)))


Score: 0.797083854675293	 
 Topic: 0.039*"fix" + 0.030*"log" + 0.029*"test" + 0.024*"upgrad" + 0.023*"version"

Score: 0.05195586383342743	 
 Topic: 0.031*"support" + 0.021*"new" + 0.020*"releas" + 0.020*"updat" + 0.020*"data"

Score: 0.050492361187934875	 
 Topic: 0.049*"build" + 0.039*"remov" + 0.037*"fail" + 0.020*"improv" + 0.017*"document"

Score: 0.05040318891406059	 
 Topic: 0.081*"add" + 0.040*"support" + 0.024*"fail" + 0.023*"creat" + 0.018*"job"

Score: 0.05006469041109085	 
 Topic: 0.045*"updat" + 0.031*"use" + 0.030*"fail" + 0.021*"add" + 0.021*"test"


In [13]:
### Testing on new Jira issue:
issueSummary = 'Reorganize public v2 catalog API'
bow_vector = dictionary.doc2bow(wordCleaner(issueSummary))
for index, score in sorted(lda_model[exampleWord], key = lambda x: -1*x[1]):
    print("\nScore: {}\t \n Topic: {}". format(score, lda_model.print_topic(index,5)))



Score: 0.7970552444458008	 
 Topic: 0.039*"fix" + 0.030*"log" + 0.029*"test" + 0.024*"upgrad" + 0.023*"version"

Score: 0.05198429524898529	 
 Topic: 0.031*"support" + 0.021*"new" + 0.020*"releas" + 0.020*"updat" + 0.020*"data"

Score: 0.05049246922135353	 
 Topic: 0.049*"build" + 0.039*"remov" + 0.037*"fail" + 0.020*"improv" + 0.017*"document"

Score: 0.050403278321027756	 
 Topic: 0.081*"add" + 0.040*"support" + 0.024*"fail" + 0.023*"creat" + 0.018*"job"

Score: 0.050064701586961746	 
 Topic: 0.045*"updat" + 0.031*"use" + 0.030*"fail" + 0.021*"add" + 0.021*"test"
