# SCM_POC_NLP

In [1]:
# import libraries  
import numpy as np
import pandas as pd
import nltk
import re, random, os
import string, pprint
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# spacy for basic preprocessing, optional, can use nltk as well (lemmatisation etc.)
import spacy

In [3]:
# gensim for LDA 
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [4]:
# Plotting tool for LDA
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# Imported in this cell because none of the visible cells above brings
# `warnings` into scope; without it the assignment below raises NameError
# on a fresh kernel.
import warnings


def warn1(*args, **kwargs):
    """Do-nothing stand-in for ``warnings.warn``; accepts and ignores any arguments."""
    pass


# Replace warnings.warn so that warnings issued through it are silently dropped
# for the rest of the session.
warnings.warn = warn1

In [9]:
df=pd.read_excel('./SCM_TABLEAU_DAT.xlsx')

In [10]:
df.head(2)

Unnamed: 0,Bugged,Severity,Product_Family,Product_Area,Product,Component,Sub_Component,Root_Cause,Known_Issue,Reason_Code,Summary,Status,Substatus,Date_Opened,Days_Open,Date_Closed
0,No,2,Oracle Platform Cloud,Internet of Things Cloud Service,Internet of Things Cloud Service,Connected Worker,Generic Issues,Cloud-Implementation,No,"14 Undetermined,Cust Abandoned",can't use google from inside oracle network,Closed,Not Entitled,2017-05-06 03:34:25,454,2018-08-02 19:39:28
1,No,2,Oracle Software Cloud,Oracle Enterprise Resource Planning Cloud,Oracle Fusion Inventory Management Cloud Service,Financial Orchestration,Other Setup Tasks,Cloud-Post Go Live,Unspecified,12 Usability/Training-End User,Intercompany ap invoice accrual account is not...,Closed,Resolved with Solution,2017-07-15 19:47:15,391,2018-08-10 10:07:55


In [12]:
df_txt=pd.DataFrame(df.Summary)

In [13]:
df_txt.head()

Unnamed: 0,Summary
0,can't use google from inside oracle network
1,Intercompany ap invoice accrual account is not...
2,"[IMPL SPT]Customer,BankOfAmerica wants to know..."
3,[ER 27439011] Unable to create Multilist attri...
4,Error ORA-00001: unique constraint (FUSION.CML...


In [31]:
#Converting Array to list
data=df_txt.values.tolist()
data[:2]

[["can't use google from inside oracle network"],
 ['Intercompany ap invoice accrual account is not populated']]

In [35]:
# Tokenize with gensim's simple_preprocess: lowercases the words and drops
# tokens that fall outside its min/max length limits.
def sent_to_words(sentences):
    """Tokenize each document with ``gensim.utils.simple_preprocess``.

    Args:
        sentences: iterable of documents. Each item may be a string or a
            list/tuple of cell values (e.g. one row of
            ``DataFrame.values.tolist()``).

    Yields:
        The token list returned by ``simple_preprocess(..., deacc=True)``
        for each document, in input order.
    """
    for sentence in sentences:
        if isinstance(sentence, (list, tuple)):
            # Join the row's cells instead of tokenizing the list's repr, so
            # repr escape sequences (e.g. "\\xa0") cannot leak in as tokens.
            text = " ".join(str(cell) for cell in sentence)
        else:
            text = str(sentence)
        yield gensim.utils.simple_preprocess(text, deacc=True)

In [36]:
data_words = list(sent_to_words(data))
print(data_words[1])

['intercompany', 'ap', 'invoice', 'accrual', 'account', 'is', 'not', 'populated']


##### The code below builds the stop-word list: NLTK's built-in English stop words plus the punctuation characters provided by Python's `string` module.

In [37]:
from nltk.corpus import stopwords
# English stop words from NLTK followed by every punctuation character.
stop_words = [*stopwords.words('english'), *string.punctuation]

In [38]:
data_words

[['can', 'use', 'google', 'from', 'inside', 'oracle', 'network'],
 ['intercompany',
  'ap',
  'invoice',
  'accrual',
  'account',
  'is',
  'not',
  'populated'],
 ['impl',
  'spt',
  'customer',
  'bankofamerica',
  'wants',
  'to',
  'know',
  'they',
  'can',
  'maintain',
  'duplicate',
  'suppliers',
  'in',
  'supplr',
  'mgmt'],
 ['er',
  'unable',
  'to',
  'create',
  'multilist',
  'attribute',
  'type',
  'on',
  'change',
  'object'],
 ['error',
  'ora',
  'unique',
  'constraint',
  'fusion',
  'violated',
  'trade',
  'operation'],
 ['er',
  'breaking',
  'out',
  'security',
  'policy',
  'manage',
  'item',
  'change',
  'order',
  'to',
  'handle',
  'independent',
  'privlleges'],
 ['import',
  'files',
  'succesfull',
  'but',
  'no',
  'owner',
  'spoke',
  'system',
  'at',
  'child',
  'level'],
 ['autoinvoice', 'not', 'honoring', 'le', 'timezone', 'setting'],
 ['unable',
  'to',
  'split',
  'requisition',
  'line',
  'in',
  'process',
  'requistion',
  'screen

#### Removing stopwords and lemmatization

In [20]:

def remove_stopwords(text):
    """Tokenize each document and drop tokens found in the global ``stop_words``.

    Args:
        text: iterable of documents; each is a list/tuple of tokens or a
            raw string.

    Returns:
        A list with one entry per document: the ``simple_preprocess`` tokens
        of that document that are not in ``stop_words``.
    """
    # Build the lookup set once; testing membership against the list was a
    # linear scan for every token.
    stop_set = set(stop_words)
    cleaned = []
    for doc in text:
        if isinstance(doc, (list, tuple)):
            # Join tokens with spaces rather than tokenizing the list's repr.
            raw = " ".join(map(str, doc))
        else:
            raw = str(doc)
        cleaned.append([word for word in simple_preprocess(raw) if word not in stop_set])
    return cleaned

data_words_nostops= remove_stopwords(data_words)


In [42]:

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    Relies on the notebook-level spaCy pipeline ``nlp``, which must be loaded
    before this function is called. Tag names: https://spacy.io/api/annotation

    Args:
        texts: iterable of token lists, one per document.
        allowed_postags: ``token.pos_`` values to keep. The default list is
            only read, never mutated.

    Returns:
        A list with one entry per document: the lemmas of the kept tokens.
    """
    keep = frozenset(allowed_postags)
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the documents through the pipeline in batches and
    # yields results in input order, instead of one nlp() call per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in keep]
        for doc in nlp.pipe(joined)
    ]

In [43]:
from nltk.stem import PorterStemmer

In [45]:
# Load the English pipeline by its package name: the 'en' shortcut link is not
# accepted by spaCy 3. Parser and NER are disabled because lemmatization()
# only reads POS tags and lemmas.
# NOTE(review): assumes the old 'en' link pointed at en_core_web_sm - confirm.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words_nostops)

In [48]:
print(data_lemmatized[0])

['use', 'oracle', 'network']


In [49]:
# Create the dictionary (token -> integer id mapping); the corpus is built in the next cell
id2word = corpora.Dictionary(data_lemmatized)

In [54]:
id2word

<gensim.corpora.dictionary.Dictionary at 0x1d4b3f42eb8>

In [55]:
# Create corpus
corpus = [id2word.doc2bow(text) for text in data_lemmatized]
print(corpus[2])

[(9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)]


In [56]:
corpus

[[(0, 1), (1, 1), (2, 1)],
 [(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1)],
 [(20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)],
 [(27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)],
 [(21, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1)],
 [(44, 1), (45, 1), (46, 1), (47, 1)],
 [(48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1)],
 [(56, 1), (57, 1), (58, 1), (59, 1)],
 [(26, 1),
  (28, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1)],
 [(21, 2),
  (24, 1),
  (40, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1)],
 [(73, 1), (74, 1), (75, 1), (76, 1)],
 [(58, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1)],
 [(82, 1), (83, 1), (84, 1), (85, 1)],
 [(28, 2),
  (77, 1),
  (86, 1),
  (87, 1),
  (88, 1),
  (89,

In [57]:
# human-readable format of corpus (term-frequency)
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('network', 1), ('oracle', 1), ('use', 1)]]

### Topic Model

In [59]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=6,
    passes=10,
    chunksize=100,
    update_every=1,
    alpha='auto',
    per_word_topics=True,
    random_state=100,  # fixed seed for reproducibility
)

In [61]:
# print the topics
pprint.pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

doc_lda

[(0,
  '0.041*"fail" + 0.033*"report" + 0.033*"user" + 0.031*"need" + 0.028*"bug" + '
  '0.023*"search" + 0.021*"task" + 0.021*"miss" + 0.020*"return" + '
  '0.018*"detail"'),
 (1,
  '0.116*"item" + 0.087*"unable" + 0.060*"cost" + 0.028*"service" + '
  '0.016*"upload" + 0.015*"catalog" + 0.013*"access" + 0.013*"load" + '
  '0.013*"document" + 0.013*"management"'),
 (2,
  '0.073*"approval" + 0.058*"not" + 0.055*"can" + 0.045*"request" + '
  '0.043*"accounting" + 0.038*"date" + 0.038*"receipt" + 0.032*"manage" + '
  '0.021*"distribution" + 0.019*"display"'),
 (3,
  '0.124*"order" + 0.083*"error" + 0.061*"shipping" + 0.059*"await" + '
  '0.053*"updateable" + 0.031*"work" + 0.028*"sale" + 0.028*"status" + '
  '0.026*"get" + 0.023*"process"'),
 (4,
  '0.053*"import" + 0.039*"use" + 0.027*"set" + 0.027*"datum" + '
  '0.025*"attribute" + 0.025*"fbdi" + 0.025*"value" + 0.021*"view" + '
  '0.020*"receive" + 0.019*"release"'),
 (5,
  '0.119*"line" + 0.061*"create" + 0.037*"requisition" + 0.035*"

<gensim.interfaces.TransformedCorpus at 0x1d4b45c2390>

In [62]:
# coherence score
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.3667987842104421


In [63]:
# visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis