In [21]:
import re, os
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [22]:
data_dir = "/Users/zack/Customers/ds_work/topic modeling/samples"
os.chdir(data_dir)
csv = "trading_samples.csv"
messages_df = pd.read_csv(csv, low_memory=False)
messages_df.drop_duplicates(inplace=True, ignore_index=True)
messages_df.dropna(inplace=True)
print(len(messages_df))

15578


In [23]:
# stop_words = ["i", "im", "hey", "hello", "hi", "me", "my", "myself", "we", "our", "ours", "ourselves", 
# "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", 
# "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", 
# "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", 
# "have", "has", "had", "having", "do", "u", "does", "did", "doing", "a", "an", "the", "and", "but", "if", 
# "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", 
# "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", 
# "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", 
# "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", 
# "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "dont", "should", 
# "now", "need", "like", "would", "get", "trying", "new", "know", "got", "use", "one", "go", "g", 
# "cant", "please"]

# maybe words?
# stopwords += ["customer", "good", "set", "thank", "thanks", "yes"]
# stopwords += ["help", "home", "looking", "mobile", "number", "phone", "store", "tmobile", "trouble", "want"]

# remove dupes
# stopwords = set(stopwords)
# stop_words = stopwords.words('english')
# stop_words.extend(["thank", "thanks", "yes", "afternoon", "morning", "evening", "trade"])

stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',"you're", \
    "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', \
    'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', \
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', \
    'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', \
    'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 'not',
 'only',
 'own',
 'same',
 'so',
 'than',
 'too',
 'very',
 's',
 't',
 'can',
 'will',
 'just',
 'don',
 "don't",
 'should',
 "should've",
 'now',
 'd',
 'll',
 'm',
 'o',
 're',
 've',
 'y',
 'ain',
 'aren',
 "aren't",
 'couldn',
 "couldn't",
 'didn',
 "didn't",
 'doesn',
 "doesn't",
 'hadn',
 "hadn't",
 'hasn',
 "hasn't",
 'haven',
 "haven't",
 'isn',
 "isn't",
 'ma',
 'mightn',
 "mightn't",
 'mustn',
 "mustn't",
 'needn',
 "needn't",
 'shan',
 "shan't",
 'shouldn',
 "shouldn't",
 'wasn',
 "wasn't",
 'weren',
 "weren't",
 'won',
 "won't",
 'wouldn',
 "wouldn't",
 'thank',
 'thanks',
 'yes',
 'afternoon',
 'morning',
 'evening',
 'trade']

### functions for basic work

In [None]:
def sent_to_words(sentences):
    '''remove single characters, lower-case, simple unicode output
    deacc: removes accents and normalizes alphabet'''
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def remove_stopwords(texts, stop_ws):
    return [[word for word in simple_preprocess(str(sentence)) if word not in stop_ws] for sentence in texts]

def make_bigrams(texts, b_mod):
    return [b_mod[sentence] for sentence in texts]

def make_trigrams(texts, t_mod):
    return [t_mod[bigram_mod[sentence]] for sentence in texts]

def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

### Basic pre-processing of text

In [25]:
input_texts = messages_df['text'].tolist()

# Longer user messages typically contain more than 1 topic
# Decided to limit topic classification samples to 150 characters 
input_texts = [sample for sample in input_texts if len(sample) <= 150]
sent_words = list(sent_to_words(input_texts))
len(sent_words)

13504

In [26]:
# Build the bigram phrases models
# min_count ignores words with occurances lower than the count,
# threshold for generating a phrase is defined by -
# (cnt(a, b) - min_count) * N / (cnt(a) * cnt(b)) > threshold
# with 'a' and 'b' being words, and N being the corpus vocabulary size
bigram = gensim.models.Phrases(sent_words, min_count=1000, threshold=50) # higher threshold fewer phrases.

# excract just the phrases - smaller and faster
bigram_mod = gensim.models.phrases.Phraser(bigram)


In [28]:
# Remove Stop Words
cleaned_words = remove_stopwords(sent_words, stop_words)

# lookup bigrams for cleaned text
sent_words_bigrams = make_bigrams(cleaned_words, bigram_mod)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
processor = spacy.load("en_core_web_trf", disable=['parser', 'ner'])

# lemmatization keeping only noun, adj, vb, adv
lemmatized_data = lemmatization(sent_words_bigrams, processor, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

[[]]


In [29]:
# Create Dictionary - maps normalized words to integer IDs
id2word = corpora.Dictionary(lemmatized_data)

# Term Document Frequency - bag of words with ID and count
corpus = [id2word.doc2bow(text) for text in lemmatized_data]

[[]]


In [30]:
%store cleaned_words
%store sent_words
%store sent_words_bigrams
%store processor
%store corpus
%store id2word
%store lemmatized_data
%store bigram
%store bigram_mod

Stored 'cleaned_words' (list)
Stored 'sent_words' (list)
Stored 'sent_words_bigrams' (list)
Stored 'nlp' (English)
Stored 'corpus' (list)
Stored 'id2word' (Dictionary)
Stored 'data_lemmatized' (list)
Stored 'bigram' (Phrases)
Stored 'bigram_mod' (FrozenPhrases)


In [31]:
# Build Latent Dirichlet Allocation model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           random_state=42,
                                           update_every=0,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [32]:
%store lda_model

Stored 'lda_model' (LdaModel)


In [33]:
# review topics to make sure they've been computed properly
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(27,
  '0.588*"think" + 0.027*"trade" + 0.014*"pro" + 0.012*"max" + 0.010*"value" + '
  '0.000*"gor" + 0.000*"slip" + 0.000*"differentiate" + 0.000*"pristine" + '
  '0.000*"responser"'),
 (77,
  '0.183*"phone" + 0.047*"old" + 0.039*"get" + 0.034*"new" + 0.024*"still" + '
  '0.022*"credit" + 0.019*"customer" + 0.019*"number" + 0.017*"device" + '
  '0.016*"mobile"'),
 (91,
  '0.074*"phone" + 0.038*"senior" + 0.038*"get" + 0.037*"airpod" + '
  '0.033*"line" + 0.026*"monthly" + 0.024*"issue" + 0.024*"iphone" + '
  '0.021*"plan" + 0.018*"phone_number"'),
 (95,
  '0.055*"new" + 0.046*"pro" + 0.043*"phone" + 0.043*"mobile" + 0.032*"get" + '
  '0.028*"name" + 0.028*"line" + 0.028*"store" + 0.024*"device" + '
  '0.022*"customer"'),
 (8,
  '0.076*"phone" + 0.045*"get" + 0.043*"next" + 0.043*"want" + 0.037*"max" + '
  '0.032*"glass" + 0.028*"pro" + 0.027*"back" + 0.026*"th" + 0.024*"camera"'),
 (99,
  '0.151*"phone" + 0.067*"new" + 0.048*"get" + 0.024*"old" + 0.024*"still" + '
  '0.022*"work" + 

### loose analysis of model consistency

In [34]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score - DON'T USE C_V!!
coherence_model_lda = CoherenceModel(model=lda_model, texts=lemmatized_data, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.180563724536997

Coherence Score:  0.3345672949783372


In [35]:
# %store -r cleaned_words
# %store -r sent_words
# %store -r sent_words_bigrams
# %store -r processor
# %store -r corpus
# %store -r id2word
# %store -r lemmatized_data
# %store -r bigram
# %store -r bigram_mod
# %store -r lda_model

# %store -r trigram
# %store -r trigram_mod

In [36]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

  default_term_info = default_term_info.sort_values(
