In [1]:
import os 

import nltk 
# nltk.download('stopwords')
import numpy as np
import pandas as pd

#Gensim 
import gensim 
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#Spacy
import spacy 

#NLTK
from nltk.corpus import stopwords

#Vis
import pyLDAvis
import pyLDAvis.gensim_models 

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Remember the launch directory, then switch into the project data folder so the
# relative workbook path below resolves.
# NOTE(review): the absolute S:/ path is machine-specific, and `old_dir` is never
# used to restore the working directory in the cells shown here.
old_dir = os.getcwd()
os.chdir("S:/asanchez/Edward Jones Stuff")

# read in data into dataframe
df = pd.read_excel('InsightExchange_AllQuestionText_misc_removed.xlsx')
# Keep only the first occurrence of each distinct question text.
df_deduped = df['QuestionText'].drop_duplicates()

# Preview the first few unique questions.
df_deduped.head()

0    Approximately how much time do you spend on th...
1    What is the primary reason that you have <u>no...
2    To what extent do you find these communication...
3    Please indicate how much you agree or disagree...
4    When searching for <b>your Edward Jones' Finan...
Name: QuestionText, dtype: object

In [3]:
# Build the English stop-word list through the fully qualified corpus reader.
# The result is bound to `stopwords`, which shadows the reader imported with
# `from nltk.corpus import stopwords`; going through `nltk.corpus` keeps this
# cell re-runnable instead of failing on the second run with
# "'list' object has no attribute 'words'".
stopwords = nltk.corpus.stopwords.words("english")

In [4]:
# Peek at the first 90 characters of the question stored under index 0.
df_deduped[0][0:90]

'Approximately how much time do you spend on the internet <u>daily</u> (including web brows'

In [5]:
def lemmatization(texts, allowed_tags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize each text, keeping only in-vocabulary tokens with an allowed POS tag.

    Parameters
    ----------
    texts : iterable of str
        Documents to process.
    allowed_tags : collection of str, optional
        Values of ``token.pos_`` to keep. Defaults to nouns, adjectives,
        verbs and adverbs.

    Returns
    -------
    list of str
        One string per input text (same order) holding the kept lemmas joined
        by single spaces; empty when no token of a text qualifies.
    """
    # The parser and NER components are switched off; only tags/lemmas are read below.
    nlp = spacy.load("en_core_web_lg", disable=["parser", "ner"])
    texts_out = []
    # nlp.pipe streams the documents through the pipeline in batches rather
    # than invoking the pipeline once per text.
    for doc in nlp.pipe(texts):
        kept_lemmas = [
            token.lemma_
            for token in doc
            if token.pos_ in allowed_tags and not token.is_oov
        ]
        texts_out.append(" ".join(kept_lemmas))
    return texts_out

In [6]:
# Lemmatize every unique question (the spaCy model is loaded inside the call).
lemmatized_texts = lemmatization(df_deduped)
# Sanity check: first 90 characters of the first lemmatized question.
print(lemmatized_texts[0][0:90])



approximately how much time do spend internet include web browser mobile app


In [7]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess``.

    Accent removal is enabled (``deacc=True``). Returns one token list per
    input text, in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

In [8]:
data_words = gen_words(lemmatized_texts)

In [9]:
print(data_words[0][0:20])

['approximately', 'how', 'much', 'time', 'do', 'spend', 'internet', 'include', 'web', 'browser', 'mobile', 'app']


In [10]:
# bigrams and trigrams
# Learn phrases from the token lists, then learn a second phrase model on the
# bigram-transformed corpus so that longer phrases can form.
bigram_phrases = gensim.models.Phrases(data_words, min_count = 5, threshold = 100)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold = 100)

# Wrap the trained models in Phraser objects; these are what get applied to documents below.
bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

# Helpers that re-apply the phrasers above so the model sees the merged tokens.
def make_bigrams(texts):
    """Return each document in ``texts`` after applying the ``bigram`` phraser."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram[tokens])
    return merged_docs

def make_trigrams(texts):
    """Return each document after applying ``bigram`` and then ``trigram``."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram[tokens]
        merged_docs.append(trigram[with_bigrams])
    return merged_docs

# Apply the phrase models to the tokenized corpus.
data_bigrams = make_bigrams(data_words)
# make_trigrams applies the bigram phraser itself before the trigram phraser,
# so it is given the raw token lists; passing data_bigrams would run the
# bigram transform a second time on already-merged tokens.
data_bigrams_trigrams = make_trigrams(data_words)

In [11]:
# TF-IDF Removal
from gensim.models import TfidfModel

# Build the vocabulary and bag-of-words corpus from the phrase-merged documents.
id2word = corpora.Dictionary(data_bigrams_trigrams)
texts = data_bigrams_trigrams
corpus = [id2word.doc2bow(text) for text in texts]
tfidf = TfidfModel(corpus, id2word=id2word)  # fitted on the unfiltered corpus

low_value = 0.03  # terms scoring below this within a document are dropped from it
words = []  # log of every dropped term (as a string), across all documents
words_missing_in_tfidf = []

# For each document, drop terms whose TF-IDF weight is below `low_value` and
# terms that are absent from the TF-IDF output altogether (zero-weight terms);
# such terms lend little value to the clustering algorithm.
for i, bow in enumerate(corpus):
    doc_tfidf = tfidf[bow]  # evaluate once per document
    tfidf_ids = {token_id for token_id, _ in doc_tfidf}
    low_value_words = [token_id for token_id, score in doc_tfidf if score < low_value]
    # Compute the missing ids BEFORE building `drops`, so the log reflects this
    # document rather than the previous one.
    words_missing_in_tfidf = [token_id for token_id, _ in bow if token_id not in tfidf_ids]
    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[token_id] for token_id in drops)

    drop_ids = set(drops)
    corpus[i] = [entry for entry in bow if entry[0] not in drop_ids]

In [12]:
# Disabled alternative kept as a bare string literal (never executed): builds the
# dictionary and bag-of-words corpus straight from data_words, with no TF-IDF filtering.
''' without tfidf removal 
id2word = corpora.Dictionary(data_words)
corpus = []
for text in data_words:
    new = id2word.doc2bow(text) #create bag of words
    corpus.append(new)
'''

' without tfidf removal \nid2word = corpora.Dictionary(data_words)\ncorpus = []\nfor text in data_words:\n    new = id2word.doc2bow(text) #create bag of words\n    corpus.append(new)\n'

In [13]:
# Disabled debugging snippet kept as a bare string literal (never executed).
'''print(corpus[0][0:20])
word = id2word[[0][:1][0]]'''

'print(corpus[0][0:20])\nword = id2word[[0][:1][0]]'

In [14]:
# Disabled debugging snippet kept as a bare string literal (never executed).
'''
#checking to make sure word map makes sense
print(word)'''

'\n#checking to make sure word map makes sense\nprint(word)'

In [15]:
'''# create LDA model
lda_model = gensim.models.ldamodel.LdaModel(
    corpus = corpus, 
    id2word = id2word, 
    num_topics = 30, # number of topics 
    random_state = 100, 
    update_every = 1, 
    chunksize=100, 
    passes=10, 
    alpha = "auto"
)'''

# create LDA model for train_test_split
lda_model = gensim.models.ldamodel.LdaModel(
    corpus = corpus[:-1], # train on every document except the LAST one, which is held out for testing
    id2word = id2word, 
    num_topics = 25, # number of topics 
    random_state = 100, 
    update_every = 1, 
    chunksize=100, 
    passes=10, 
    alpha = "auto"
)

In [16]:
# testing model with unseen data
# corpus[-1] is the one document left out of training (the model was fit on corpus[:-1]).
test_doc = corpus[-1]
vector = lda_model[test_doc]
print(vector)
# Each pair is (topic id, degree to which the document falls into that topic);
# the pairs are listed in ascending order of topic id.

[(0, 0.087858364), (2, 0.025184842), (3, 0.022488466), (5, 0.16575836), (7, 0.02424734), (8, 0.015853774), (10, 0.08912644), (12, 0.012440146), (13, 0.23622794), (15, 0.020968262), (16, 0.010925544), (18, 0.0132486), (19, 0.09862222), (21, 0.013852575), (23, 0.08812964), (24, 0.013483859)]


In [17]:
# Order the (topic, weight) pairs so the most represented topic comes first.
def Sort(sub_li):
    """Sort ``sub_li`` in place by each item's second element, largest first.

    The list is sorted ascending and then reversed, so items with equal keys
    end up in the reverse of their original relative order. The same list
    object is returned.
    """
    def second_element(item):
        return item[1]

    # Slice assignment keeps the mutation in place, as callers hold the same list.
    sub_li[:] = sorted(sub_li, key=second_element)[::-1]
    return sub_li

# Sort sorts its argument in place and returns it, so `new_vector` and `vector`
# refer to the same (now reordered) list.
new_vector = Sort(vector)
new_vector

[(13, 0.23622794),
 (5, 0.16575836),
 (19, 0.09862222),
 (10, 0.08912644),
 (23, 0.08812964),
 (0, 0.087858364),
 (2, 0.025184842),
 (7, 0.02424734),
 (3, 0.022488466),
 (15, 0.020968262),
 (8, 0.015853774),
 (21, 0.013852575),
 (24, 0.013483859),
 (18, 0.0132486),
 (12, 0.012440146),
 (16, 0.010925544)]

In [18]:
# Disabled snippet kept as a bare string literal (never executed): would create a
# `models` directory and save the trained LDA model into it.
'''#Saving the model (one way)
os.mkdir('models')
lda_model.save('models/test_model.model') '''

"#Saving the model (one way)\nos.mkdir('models')\nlda_model.save('models/test_model.model') "

In [19]:
# Visualizing the Data

In [20]:
# Turn on pyLDAvis notebook mode before displaying the visualization below.
pyLDAvis.enable_notebook()

In [21]:
# Prepare the pyLDAvis data for the trained model.
# Note: this passes the full TF-IDF-filtered corpus, including the last document
# that was held out of training.
vis = pyLDAvis.gensim_models.prepare(lda_model, 
                                     corpus, 
                                     id2word, 
                                     mds="mmds", 
                                     R=30)


  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [22]:
# Display the prepared visualization as the cell output.
vis

In [82]:
# Disabled snippet kept as a bare string literal (never executed): would reload
# the model from the path used by the (also disabled) save snippet.
# NOTE(review): this cell's execution count (82) is out of sequence with the rest.
'''# loading model
new_model = gensim.models.ldamodel.LdaModel.load('models/test_model.model')'''