In [1]:
%matplotlib inline

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()



In [2]:
# Load the prepared Yelp review table. Missing review bodies are replaced by
# empty strings in `review_text`.
data = pd.read_csv('../../yelp-data/new_data/final_data/manip_data.csv')
review_text = data['text'].fillna('')

# The visualization cells write their HTML exports under 'lda-viz/'. Create the
# folder up front so a fresh checkout does not fail at save time, after the
# slow LDA fits have already run.
if not os.path.isdir('lda-viz'):
    os.makedirs('lda-viz')

In [27]:
# One DataFrame per star rating, selected where the matching rating column equals 1
# (presumably 0/1 indicator columns -- confirm against the data preparation step).
review_text_1 = data.loc[data['rating1'] == 1]
review_text_3 = data.loc[data['rating3'] == 1]
review_text_5 = data.loc[data['rating5'] == 1]

# Fit and visualize LDA model(s) for reviews with a rating of 1 (5 topics)

In [33]:
# Text of the rating-1 reviews, with missing bodies replaced by ''.
# Selected straight from `data` rather than from `review_text_1` itself, so the
# cell can be re-run without a KeyError (the name would otherwise already be a Series).
review_text_1 = data.loc[data.rating1 == 1, 'text'].fillna('')

In [43]:
# Raw term counts for the rating-1 reviews: lowercase the text, strip accents,
# and drop scikit-learn's built-in English stop words.
tf_vectorizer = CountVectorizer(
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
)

# Learn the vocabulary and build the document-term matrix in one pass.
dtm_tf = tf_vectorizer.fit_transform(review_text_1)
print(dtm_tf.shape)

(13244, 30181)


In [44]:
# TF-IDF counterpart of `tf_vectorizer`: it is configured from the count
# vectorizer's own parameters and fitted on the same rating-1 texts.
tfidf_vectorizer = TfidfVectorizer(
    **tf_vectorizer.get_params()
)
dtm_tfidf = tfidf_vectorizer.fit_transform(review_text_1)
print(dtm_tfidf.shape)

(13244, 30181)


In [45]:
# Two 5-topic LDA models with the same seed, one per document-term matrix.
# NOTE(review): `n_topics` is the pre-0.19 scikit-learn spelling; later releases
# renamed it to `n_components`.

# Model on raw term counts (fit() returns the estimator itself).
lda_tf = LatentDirichletAllocation(random_state=0, n_topics=5).fit(dtm_tf)

# Model on TF-IDF weights. The bare fit() call stays last so the fitted
# estimator is echoed as the cell output.
lda_tfidf = LatentDirichletAllocation(random_state=0, n_topics=5)
lda_tfidf.fit(dtm_tfidf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=5, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [46]:
# Rating 1, count-based model: build the pyLDAvis view from whatever `lda_tf`
# currently holds (the 5-topic fit when cells run in order), export it, show it.
rating1_viz_1 = pyLDAvis.sklearn.prepare(
    lda_tf,
    dtm_tf,
    tf_vectorizer,
)
pyLDAvis.save_html(
    rating1_viz_1,
    'lda-viz/rating1-ldavis-1.html',
)
pyLDAvis.display(rating1_viz_1)

In [47]:
# Rating 1, TF-IDF model: build the pyLDAvis view from whatever `lda_tfidf`
# currently holds (the 5-topic fit when cells run in order), export it, show it.
rating1_viz_2 = pyLDAvis.sklearn.prepare(
    lda_tfidf,
    dtm_tfidf,
    tfidf_vectorizer,
)
pyLDAvis.save_html(
    rating1_viz_2,
    'lda-viz/rating1-ldavis-2.html',
)
pyLDAvis.display(rating1_viz_2)

#### Using different MDS (multidimensional scaling) functions

In [48]:
# Same count-based model and matrix, with the `mds` option set to 'mmds'.
# Left as a bare expression so the notebook renders the result.
pyLDAvis.sklearn.prepare(
    lda_tf, dtm_tf, tf_vectorizer,
    mds='mmds',
)

In [49]:
# Same count-based model and matrix, with the `mds` option set to 'tsne'.
# Left as a bare expression so the notebook renders the result.
pyLDAvis.sklearn.prepare(
    lda_tf, dtm_tf, tf_vectorizer,
    mds='tsne',
)

# Fit and visualize LDA model(s) for reviews with a rating of 1 (10 topics)

In [50]:
# Refit both models with 10 topics (same seed). This rebinds `lda_tf` and
# `lda_tfidf`, so the earlier 5-topic fits are no longer reachable by name.
# NOTE(review): `n_topics` is the pre-0.19 scikit-learn spelling; later releases
# renamed it to `n_components`.

# Model on raw term counts (fit() returns the estimator itself).
lda_tf = LatentDirichletAllocation(random_state=0, n_topics=10).fit(dtm_tf)

# Model on TF-IDF weights. The bare fit() call stays last so the fitted
# estimator is echoed as the cell output.
lda_tfidf = LatentDirichletAllocation(random_state=0, n_topics=10)
lda_tfidf.fit(dtm_tfidf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=10, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [51]:
# Rating 1, count-based model: build the pyLDAvis view from whatever `lda_tf`
# currently holds (the 10-topic fit when cells run in order), export it, show it.
rating1_viz_3 = pyLDAvis.sklearn.prepare(
    lda_tf,
    dtm_tf,
    tf_vectorizer,
)
pyLDAvis.save_html(
    rating1_viz_3,
    'lda-viz/rating1-ldavis-3.html',
)
pyLDAvis.display(rating1_viz_3)

In [52]:
# Rating 1, TF-IDF model: build the pyLDAvis view from whatever `lda_tfidf`
# currently holds (the 10-topic fit when cells run in order), export it, show it.
rating1_viz_4 = pyLDAvis.sklearn.prepare(
    lda_tfidf,
    dtm_tfidf,
    tfidf_vectorizer,
)
pyLDAvis.save_html(
    rating1_viz_4,
    'lda-viz/rating1-ldavis-4.html',
)
pyLDAvis.display(rating1_viz_4)

# Fit and visualize LDA model(s) for reviews with a rating of 3 (5 topics)

In [53]:
# Text of the rating-3 reviews, with missing bodies replaced by ''.
# Selected straight from `data` rather than from `review_text_3` itself, so the
# cell can be re-run without a KeyError (the name would otherwise already be a Series).
review_text_3 = data.loc[data.rating3 == 1, 'text'].fillna('')

In [54]:
# Raw term counts for the rating-3 reviews: lowercase the text, strip accents,
# and drop scikit-learn's built-in English stop words.
tf_vectorizer = CountVectorizer(
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
)

# Learn the vocabulary and build the document-term matrix in one pass.
dtm_tf = tf_vectorizer.fit_transform(review_text_3)
print(dtm_tf.shape)

(21870, 41897)


In [55]:
# TF-IDF counterpart of `tf_vectorizer`: it is configured from the count
# vectorizer's own parameters and fitted on the same rating-3 texts.
tfidf_vectorizer = TfidfVectorizer(
    **tf_vectorizer.get_params()
)
dtm_tfidf = tfidf_vectorizer.fit_transform(review_text_3)
print(dtm_tfidf.shape)

(21870, 41897)


In [56]:
# Two 5-topic LDA models with the same seed, one per rating-3 document-term matrix.
# NOTE(review): `n_topics` is the pre-0.19 scikit-learn spelling; later releases
# renamed it to `n_components`.

# Model on raw term counts (fit() returns the estimator itself).
lda_tf = LatentDirichletAllocation(random_state=0, n_topics=5).fit(dtm_tf)

# Model on TF-IDF weights. The bare fit() call stays last so the fitted
# estimator is echoed as the cell output.
lda_tfidf = LatentDirichletAllocation(random_state=0, n_topics=5)
lda_tfidf.fit(dtm_tfidf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=5, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [57]:
# Rating 3, count-based model: build the pyLDAvis view from whatever `lda_tf`
# currently holds (the 5-topic fit when cells run in order), export it, show it.
rating3_viz_1 = pyLDAvis.sklearn.prepare(
    lda_tf,
    dtm_tf,
    tf_vectorizer,
)
pyLDAvis.save_html(
    rating3_viz_1,
    'lda-viz/rating3-ldavis-1.html',
)
pyLDAvis.display(rating3_viz_1)

In [58]:
# Rating 3, TF-IDF model: build the pyLDAvis view from whatever `lda_tfidf`
# currently holds (the 5-topic fit when cells run in order), export it, show it.
rating3_viz_2 = pyLDAvis.sklearn.prepare(
    lda_tfidf,
    dtm_tfidf,
    tfidf_vectorizer,
)
pyLDAvis.save_html(
    rating3_viz_2,
    'lda-viz/rating3-ldavis-2.html',
)
pyLDAvis.display(rating3_viz_2)

#### Using different MDS (multidimensional scaling) functions

In [59]:
# Same count-based model and matrix, with the `mds` option set to 'mmds'.
# Left as a bare expression so the notebook renders the result.
pyLDAvis.sklearn.prepare(
    lda_tf, dtm_tf, tf_vectorizer,
    mds='mmds',
)

In [60]:
# Same count-based model and matrix, with the `mds` option set to 'tsne'.
# Left as a bare expression so the notebook renders the result.
pyLDAvis.sklearn.prepare(
    lda_tf, dtm_tf, tf_vectorizer,
    mds='tsne',
)

# Fit and visualize LDA model(s) for reviews with a rating of 3 (10 topics)

In [61]:
# Refit both models with 10 topics (same seed). This rebinds `lda_tf` and
# `lda_tfidf`, so the earlier 5-topic fits are no longer reachable by name.
# NOTE(review): `n_topics` is the pre-0.19 scikit-learn spelling; later releases
# renamed it to `n_components`.

# Model on raw term counts (fit() returns the estimator itself).
lda_tf = LatentDirichletAllocation(random_state=0, n_topics=10).fit(dtm_tf)

# Model on TF-IDF weights. The bare fit() call stays last so the fitted
# estimator is echoed as the cell output.
lda_tfidf = LatentDirichletAllocation(random_state=0, n_topics=10)
lda_tfidf.fit(dtm_tfidf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=10, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [62]:
# Rating 3, count-based model: build the pyLDAvis view from whatever `lda_tf`
# currently holds (the 10-topic fit when cells run in order), export it, show it.
rating3_viz_3 = pyLDAvis.sklearn.prepare(
    lda_tf,
    dtm_tf,
    tf_vectorizer,
)
pyLDAvis.save_html(
    rating3_viz_3,
    'lda-viz/rating3-ldavis-3.html',
)
pyLDAvis.display(rating3_viz_3)

In [63]:
# Rating 3, TF-IDF model: build the pyLDAvis view from whatever `lda_tfidf`
# currently holds (the 10-topic fit when cells run in order), export it, show it.
rating3_viz_4 = pyLDAvis.sklearn.prepare(
    lda_tfidf,
    dtm_tfidf,
    tfidf_vectorizer,
)
pyLDAvis.save_html(
    rating3_viz_4,
    'lda-viz/rating3-ldavis-4.html',
)
pyLDAvis.display(rating3_viz_4)

# Fit and visualize LDA model(s) for reviews with a rating of 5 (5 topics)

In [64]:
# Text of the rating-5 reviews, with missing bodies replaced by ''.
# Selected straight from `data` rather than from `review_text_5` itself, so the
# cell can be re-run without a KeyError (the name would otherwise already be a Series).
review_text_5 = data.loc[data.rating5 == 1, 'text'].fillna('')

In [65]:
# Raw term counts for the rating-5 reviews: lowercase the text, strip accents,
# and drop scikit-learn's built-in English stop words.
tf_vectorizer = CountVectorizer(
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
)

# Learn the vocabulary and build the document-term matrix in one pass.
dtm_tf = tf_vectorizer.fit_transform(review_text_5)
print(dtm_tf.shape)

(60846, 59213)


In [66]:
# TF-IDF counterpart of `tf_vectorizer`: it is configured from the count
# vectorizer's own parameters and fitted on the same rating-5 texts.
tfidf_vectorizer = TfidfVectorizer(
    **tf_vectorizer.get_params()
)
dtm_tfidf = tfidf_vectorizer.fit_transform(review_text_5)
print(dtm_tfidf.shape)

(60846, 59213)


In [67]:
# Two 5-topic LDA models with the same seed, one per rating-5 document-term matrix.
# NOTE(review): `n_topics` is the pre-0.19 scikit-learn spelling; later releases
# renamed it to `n_components`.

# Model on raw term counts (fit() returns the estimator itself).
lda_tf = LatentDirichletAllocation(random_state=0, n_topics=5).fit(dtm_tf)

# Model on TF-IDF weights. The bare fit() call stays last so the fitted
# estimator is echoed as the cell output.
lda_tfidf = LatentDirichletAllocation(random_state=0, n_topics=5)
lda_tfidf.fit(dtm_tfidf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=5, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [68]:
# Rating 5, count-based model: build the pyLDAvis view from whatever `lda_tf`
# currently holds (the 5-topic fit when cells run in order), export it, show it.
rating5_viz_1 = pyLDAvis.sklearn.prepare(
    lda_tf,
    dtm_tf,
    tf_vectorizer,
)
pyLDAvis.save_html(
    rating5_viz_1,
    'lda-viz/rating5-ldavis-1.html',
)
pyLDAvis.display(rating5_viz_1)

In [69]:
# Rating 5, TF-IDF model: build the pyLDAvis view from whatever `lda_tfidf`
# currently holds (the 5-topic fit when cells run in order), export it, show it.
rating5_viz_2 = pyLDAvis.sklearn.prepare(
    lda_tfidf,
    dtm_tfidf,
    tfidf_vectorizer,
)
pyLDAvis.save_html(
    rating5_viz_2,
    'lda-viz/rating5-ldavis-2.html',
)
pyLDAvis.display(rating5_viz_2)

#### Using different MDS (multidimensional scaling) functions

In [70]:
# Same count-based model and matrix, with the `mds` option set to 'mmds'.
# Left as a bare expression so the notebook renders the result.
pyLDAvis.sklearn.prepare(
    lda_tf, dtm_tf, tf_vectorizer,
    mds='mmds',
)

In [71]:
# Same count-based model and matrix, with the `mds` option set to 'tsne'.
# Left as a bare expression so the notebook renders the result.
pyLDAvis.sklearn.prepare(
    lda_tf, dtm_tf, tf_vectorizer,
    mds='tsne',
)

# Fit and visualize LDA model(s) for reviews with a rating of 5 (10 topics)

In [72]:
# Refit both models with 10 topics (same seed). This rebinds `lda_tf` and
# `lda_tfidf`, so the earlier 5-topic fits are no longer reachable by name.
# NOTE(review): `n_topics` is the pre-0.19 scikit-learn spelling; later releases
# renamed it to `n_components`.

# Model on raw term counts (fit() returns the estimator itself).
lda_tf = LatentDirichletAllocation(random_state=0, n_topics=10).fit(dtm_tf)

# Model on TF-IDF weights. The bare fit() call stays last so the fitted
# estimator is echoed as the cell output.
lda_tfidf = LatentDirichletAllocation(random_state=0, n_topics=10)
lda_tfidf.fit(dtm_tfidf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=10, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [73]:
# Rating 5, count-based model: build the pyLDAvis view from whatever `lda_tf`
# currently holds (the 10-topic fit when cells run in order), export it, show it.
rating5_viz_3 = pyLDAvis.sklearn.prepare(
    lda_tf,
    dtm_tf,
    tf_vectorizer,
)
pyLDAvis.save_html(
    rating5_viz_3,
    'lda-viz/rating5-ldavis-3.html',
)
pyLDAvis.display(rating5_viz_3)

In [74]:
# Rating 5, TF-IDF model: build the pyLDAvis view from whatever `lda_tfidf`
# currently holds (the 10-topic fit when cells run in order), export it, show it.
rating5_viz_4 = pyLDAvis.sklearn.prepare(
    lda_tfidf,
    dtm_tfidf,
    tfidf_vectorizer,
)
pyLDAvis.save_html(
    rating5_viz_4,
    'lda-viz/rating5-ldavis-4.html',
)
pyLDAvis.display(rating5_viz_4)