In [1]:
# Turn off warnings 
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import seaborn as sns
sns.set_palette("Paired", 9)

In [5]:
## loading data
df = pd.read_csv('data/text_df.csv', index_col='Unnamed: 0')
df.head()

Unnamed: 0,Review Text,Rating,Recommended
0,Absolutely wonderful - silky and sexy and comf...,4,1
1,Love this dress! it's sooo pretty. i happene...,5,1
2,I had such high hopes for this dress and reall...,3,0
3,"I love, love, love this jumpsuit. it's fun, fl...",5,1
4,This shirt is very flattering to all due to th...,5,1


In [8]:
pip install -U textblob

Collecting textblob
  Downloading textblob-0.15.3-py2.py3-none-any.whl (636 kB)
[K     |████████████████████████████████| 636 kB 6.1 MB/s eta 0:00:01
Installing collected packages: textblob
Successfully installed textblob-0.15.3
Note: you may need to restart the kernel to use updated packages.


In [9]:
from textblob import TextBlob

In [10]:
testimonial= TextBlob("Textblob is amazingly simple to use. What great fun!")
testimonial.sentiment

Sentiment(polarity=0.39166666666666666, subjectivity=0.4357142857142857)

In [132]:
testimonial= TextBlob("I bigger windows but these are small.")
testimonial.sentiment

Sentiment(polarity=-0.125, subjectivity=0.45)

In [35]:
df['polarity'] = df['Review Text'].apply(lambda x: TextBlob(x).sentiment.polarity)
df

Unnamed: 0,Review Text,Rating,Recommended,polarity
0,Absolutely wonderful - silky and sexy and comf...,4,1,0.633333
1,Love this dress! it's sooo pretty. i happene...,5,1,0.339583
2,I had such high hopes for this dress and reall...,3,0,0.073675
3,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0.550000
4,This shirt is very flattering to all due to th...,5,1,0.512891
...,...,...,...,...
23481,I was very happy to snag this dress at such a ...,5,1,0.552667
23482,"It reminds me of maternity clothes. soft, stre...",3,1,0.091667
23483,"This fit well, but the top was very see throug...",3,0,0.414286
23484,I bought this dress for a wedding i have this ...,3,1,0.322222


In [134]:
df.corr()

Unnamed: 0,Rating,Recommended,polarity
Rating,1.0,0.792568,0.386303
Recommended,0.792568,1.0,0.320786
polarity,0.386303,0.320786,1.0


In [133]:
df.loc[23482,'Review Text']

'It reminds me of maternity clothes. soft, stretchy, shiny material. cut is flattering and drapes nicely. i only found one button to close front... looked awkward. nice long sleeves.\nnot for me but maybe for others. just ok.'

In [39]:
item_868 = pd.read_csv('data/item_868.csv')

In [40]:
item_868.shape

(414, 11)

## LDA MODELING
### Finding the latent topics

In [41]:
import re
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
#import pyLDAvis.gensim  # don't skip this
import pyLDAvis.gensim_models as gensimvis

# Enable logging for gensim - optional
#import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [168]:
# prepare stopwords

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])


# Convert to list
data = item_868['Review Text'].values.tolist()

#data = item_868[item_868['Rating']<4]['Review Text'].values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

['Love this cream sleeveless top....it goes with everything and you can dress '
 'it up or down! this will be a go to top all summer long and probably wear '
 'thru the fall as well with a layered sweater, if needed. i typically wear '
 'small or medium size and got the medium hoping for a little longer length. i '
 'am 57, 34c, and overall wt. of approx. 128 lbs...it fits very nicely . thank '
 'you retailer!']


In [136]:
data

['I liked the color of this top but i didnt really like the ruffled stitching around the middle. it looks like someone just tacked on the bottom half. i bought this for my daughter and she likes it. i think it is comfortable and a good top to knock around in.',
 'The styling of this top is really cute. it fits perfectly on the shoulders and gets bigger at the hem for the baby doll look. my biggest complaint is the quality! its really cheap and feels like the quality i would expect to see at a cheap retailer. it catches lint like crazy and because the hem is just a pearl edge, it curls really badly. i buy quite a bit from here and this is the worst quality item i have seen in a long time. not worth the $$ if paying full price.',
 'Like the other reviewer said this top is extremely wide and boxy. it must be pinned in the picture online. its very frustrating when they do that. thank goodness i didnt pay for shipping! for reference, im 52 and 135 pounds and bought the xxs, its going back f

In [169]:
# tokenizing and cleaning the text

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['love', 'this', 'cream', 'sleeveless', 'top', 'it', 'goes', 'with', 'everything', 'and', 'you', 'can', 'dress', 'it', 'up', 'or', 'down', 'this', 'will', 'be', 'go', 'to', 'top', 'all', 'summer', 'long', 'and', 'probably', 'wear', 'thru', 'the', 'fall', 'as', 'well', 'with', 'layered', 'sweater', 'if', 'needed', 'typically', 'wear', 'small', 'or', 'medium', 'size', 'and', 'got', 'the', 'medium', 'hoping', 'for', 'little', 'longer', 'length', 'am', 'and', 'overall', 'wt', 'of', 'approx', 'lbs', 'it', 'fits', 'very', 'nicely', 'thank', 'you', 'retailer']]


In [170]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=2, threshold=3) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=3)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['love_this', 'cream', 'sleeveless', 'top', 'it_goes', 'with', 'everything', 'and', 'you_can', 'dress', 'it', 'up_or_down', 'this', 'will_be', 'go', 'to', 'top', 'all', 'summer', 'long', 'and', 'probably', 'wear', 'thru', 'the', 'fall', 'as_well', 'with', 'layered', 'sweater', 'if', 'needed', 'typically_wear', 'small', 'or_medium', 'size', 'and', 'got_the', 'medium', 'hoping', 'for', 'little', 'longer', 'length', 'am', 'and', 'overall', 'wt', 'of', 'approx', 'lbs', 'it_fits', 'very', 'nicely', 'thank', 'you', 'retailer']


In [171]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [172]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
#python3 -m spacy download en

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

#nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])

#import en_core_web_trf
#nlp = en_core_web_trf.load()

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['love', 'cream', 'sleeveless', 'top', 'go', 'dress', 'go', 'top', 'summer', 'long', 'probably', 'wear', 'fall', 'well', 'layer', 'sweater', 'need', 'typically_wear', 'small', 'medium', 'size', 'get', 'medium', 'hope', 'little', 'long', 'length', 'overall', 'fit', 'nicely', 'thank', 'retailer']]


In [173]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[1:2])

[[(24, 2), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 2), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1)]]


In [174]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('cream', 1),
  ('dress', 1),
  ('fall', 1),
  ('fit', 1),
  ('get', 1),
  ('go', 2),
  ('hope', 1),
  ('layer', 1),
  ('length', 1),
  ('little', 1),
  ('long', 2),
  ('love', 1),
  ('medium', 2),
  ('need', 1),
  ('nicely', 1),
  ('overall', 1),
  ('probably', 1),
  ('retailer', 1),
  ('size', 1),
  ('sleeveless', 1),
  ('small', 1),
  ('summer', 1),
  ('sweater', 1),
  ('thank', 1),
  ('top', 2),
  ('typically_wear', 1),
  ('wear', 1),
  ('well', 1)]]

In [175]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=8, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=1,
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True)

In [176]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.084*"big" + 0.082*"put" + 0.079*"style" + 0.044*"really" + '
  '0.041*"quality" + 0.037*"bit" + 0.022*"hit" + 0.020*"thin" + 0.019*"blue" + '
  '0.017*"sheer"'),
 (1,
  '0.148*"look" + 0.080*"sweater" + 0.056*"online" + 0.054*"wide" + 0.036*"m" '
  '+ 0.032*"try" + 0.026*"day" + 0.018*"store" + 0.017*"couple" + '
  '0.017*"appear"'),
 (2,
  '0.049*"make" + 0.049*"great" + 0.045*"jean" + 0.044*"large" + 0.036*"low" + '
  '0.034*"cut" + 0.030*"pretty" + 0.027*"hope" + 0.024*"run" + 0.020*"come"'),
 (3,
  '0.104*"soft" + 0.096*"sleeve" + 0.080*"nice" + 0.056*"design" + '
  '0.046*"material" + 0.031*"keep" + 0.029*"slightly" + 0.022*"lbs" + '
  '0.011*"reference" + 0.005*"fringe"'),
 (4,
  '0.051*"fit" + 0.048*"size" + 0.036*"wear" + 0.033*"cute" + 0.028*"model" + '
  '0.026*"get" + 0.025*"however" + 0.024*"black" + 0.024*"person" + '
  '0.024*"back"'),
 (5,
  '0.140*"shirt" + 0.057*"purchase" + 0.056*"order" + 0.038*"comfy" + '
  '0.038*"petite" + 0.031*"perfect" + 0.029*"straig

In [145]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.411911974826841

Coherence Score:  0.3803384928583072


## Another LDA Model

In [177]:
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt

In [178]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# Run in terminal: python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:2])

['love cream sleeveless top go dress be go top summer long probably wear fall as well layered sweater need typically wear small medium size get medium hoping little long length overall lb fit very nicely thank retailer', 'like color top do really ruffled stitch middle look just tack bottom half buy daughter like think be comfortable good top knock around']


In [179]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=2,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{2,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform(data_lemmatized)

In [180]:
# Materialize the sparse data
data_dense = data_vectorized.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

Sparsicity:  2.618166704142052 %


In [181]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=5,               # Number of topics
                                      max_iter=20,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=2,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

LatentDirichletAllocation(batch_size=2, learning_method='online', max_iter=20,
                          n_components=5, n_jobs=-1, random_state=100)


In [182]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -52542.317445369416
Perplexity:  469.21245411531214
{'batch_size': 2,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 20,
 'mean_change_tol': 0.001,
 'n_components': 5,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [190]:
#Grid Search
# Define Search Param
search_params = {'n_components': [5,10, 15], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)

GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [5, 10, 15]})

In [194]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.7, 'n_components': 5}
Best Log Likelihood Score:  -13185.344229351685
Model Perplexity:  452.9042039821211


In [196]:
# Create Document - Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)

# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(data))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(20).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,dominant_topic
Doc0,0.64,0.33,0.01,0.01,0.01,0
Doc1,0.01,0.01,0.01,0.51,0.45,3
Doc2,0.02,0.02,0.93,0.02,0.02,2
Doc3,0.57,0.42,0.01,0.01,0.01,0
Doc4,0.0,0.0,0.11,0.87,0.0,3
Doc5,0.02,0.02,0.02,0.94,0.02,3
Doc6,0.01,0.96,0.01,0.01,0.01,1
Doc7,0.96,0.01,0.01,0.01,0.01,0
Doc8,0.03,0.03,0.03,0.88,0.03,3
Doc9,0.05,0.8,0.05,0.05,0.05,1


In [197]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)

# Assign Column and Index
df_topic_keywords.columns = vectorizer.get_feature_names()
df_topic_keywords.index = topicnames

# View
df_topic_keywords.head()


Unnamed: 0,able,absolutely,accent,accentuate,accord,actual,actually,add,addition,adorable,...,wrinkle,wrong,xl,xs,xsp,xxs,year,yellow,yesterday,zipper
Topic0,0.201366,0.200936,1.200155,0.201774,0.200002,0.200004,3.345366,3.922854,0.200005,0.202277,...,1.198189,2.199563,0.200002,9.832435,0.202477,2.87161,0.200004,0.227159,3.195828,0.200003
Topic1,0.200006,0.645189,1.196477,0.200004,0.200002,0.201397,2.306366,0.200271,0.200005,1.313182,...,0.200012,0.200007,2.19888,0.200822,0.200007,1.528829,2.194552,1.200424,0.200002,2.199989
Topic2,5.198896,7.612091,0.200001,3.198077,0.200125,1.199839,7.771598,4.649641,1.407033,0.200606,...,0.202463,0.200003,0.200312,12.894674,0.200003,2.36469,0.202681,0.208176,0.201679,0.200001
Topic3,1.200095,1.337463,0.200003,1.20014,0.200002,0.200004,0.236645,3.197594,4.993275,3.182625,...,0.200011,1.199204,0.200002,0.737893,1.196902,0.200006,0.201599,0.200011,0.202488,0.200003
Topic4,1.199638,0.204322,0.203364,0.200004,3.199868,1.198757,2.340024,4.029639,1.199682,8.10131,...,1.199326,2.201223,0.200805,23.334175,1.200611,4.034866,4.201165,1.16423,0.200002,0.200003


In [195]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=10):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=10)        

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,size,small,fit,shirt,look,love,order,try,just,wide
Topic 1,wear,look,color,love,long,comfortable,fabric,tunic,fit,great
Topic 2,size,look,wear,just,love,shirt,fit,color,fabric,large
Topic 3,wear,love,shirt,look,really,cute,material,fabric,nice,great
Topic 4,love,color,large,xs,wear,great,way,small,cute,look


In [198]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
panel

### plotting words

In [201]:
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer(stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]


common_words = get_top_n_words(corpus, 30)
df2 = pd.DataFrame(common_words, columns = ['unigram' , 'count'])

fig = go.Figure([go.Bar(x=df2['unigram'], y=df2['count'])])
fig.update_layout(title=go.layout.Title(text="Top 30 unigrams in the question text after removing stop words and lemmatization"))
fig.show()

AttributeError: 'list' object has no attribute 'lower'