# Topic Analysis

In [1]:
import datetime

import pandas as pd
import spacy
import re
import string
import numpy as np

import seaborn as sns 
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

import nltk
from nltk.corpus import stopwords
from spacy.tokens import Token
from tqdm import tqdm


import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.coherencemodel import CoherenceModel

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

import pyLDAvis.gensim
pyLDAvis.enable_notebook()
from ipywidgets import interact

%matplotlib inline
np.random.seed(500)

In [2]:
# Load the review dataset. Later cells read its "processed_Review_text",
# "Review Text", "Recommended IND" and "PA_Polarity" columns.
df = pd.read_csv("..//data//Womens Clothing E-Commerce Reviews Sentiment v2.csv")

## Tokenizing the review text

In [3]:
def preprocess(text):
    """Tokenize one review and drop uninformative tokens.

    The text is split with gensim's ``simple_preprocess``. A token survives
    only if it is longer than three characters and is not a member of
    gensim's ``STOPWORDS`` collection.

    Parameters
    ----------
    text
        Review text, handed unchanged to ``gensim.utils.simple_preprocess``.

    Returns
    -------
    list
        The surviving tokens, in the order ``simple_preprocess`` produced them.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        word
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stop_words
    ]

In [4]:
def buildLDA(processed_docs, ip_num_topics=4, no_below=15, no_above=0.5,
             keep_n=100000, passes=2, workers=4, alpha=1, random_state=None):
    """Build a dictionary, BoW/TF-IDF corpora and an LDA model from token lists.

    Parameters
    ----------
    processed_docs
        Iterable of tokenized documents (each a sequence of tokens). It is
        iterated twice: once to build the dictionary, once to build the corpus.
    ip_num_topics : int, default 4
        Number of topics requested from the LDA model.
    no_below, no_above, keep_n
        Forwarded to ``Dictionary.filter_extremes`` to prune the vocabulary.
        The defaults are the values this notebook has always used.
    passes, workers, alpha
        Forwarded to ``gensim.models.LdaMulticore``. The defaults are the
        values this notebook has always used.
    random_state : optional
        Forwarded to ``LdaMulticore``. ``None`` (the default) leaves gensim's
        own seeding behaviour untouched; pass an int to pin the model's RNG.

    Returns
    -------
    tuple
        ``(dictionary, bow_corpus, corpus_tfidf, lda_model_tfidf)`` where
        ``bow_corpus`` is a list of ``doc2bow`` vectors, ``corpus_tfidf`` is
        that corpus passed through a ``TfidfModel`` fitted on it, and
        ``lda_model_tfidf`` is the LDA model trained on ``corpus_tfidf``.
    """
    dictionary = gensim.corpora.Dictionary(processed_docs)

    # Prune the vocabulary before vectorizing so the corpus only carries
    # tokens that survived the frequency filter.
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    # The model is trained on the TF-IDF weighted corpus, not on raw counts.
    lda_model_tfidf = gensim.models.LdaMulticore(
        corpus_tfidf,
        num_topics=ip_num_topics,
        id2word=dictionary,
        passes=passes,
        workers=workers,
        alpha=alpha,
        random_state=random_state,
    )

    return dictionary, bow_corpus, corpus_tfidf, lda_model_tfidf

In [5]:
def LDAtopicSummary(model, displaysubplot=[2,2]):
    """Print one ``Topic: <id> Word: <terms>`` line per topic of ``model``.

    Parameters
    ----------
    model
        Object whose ``print_topics(-1)`` yields ``(topic_id, description)``
        pairs.
    displaysubplot
        Accepted but never read by this function.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    line_format = 'Topic: {} Word: {}'
    for topic_id, topic_terms in model.print_topics(-1):
        print(line_format.format(topic_id, topic_terms))

In [6]:
def format_topics_sentences(ldamodel, corpus, texts):
    """Tabulate the dominant topic of every document next to its text.

    Parameters
    ----------
    ldamodel
        Topic model. It must support ``ldamodel[corpus]`` (yielding, per
        document, a list of ``(topic_id, proportion)`` pairs, or a tuple whose
        first item is that list when ``ldamodel.per_word_topics`` is true) and
        ``ldamodel.show_topic(topic_id)`` returning ``(word, weight)`` pairs.
    corpus
        Vectorized documents to score with ``ldamodel``.
    texts
        One entry per document; wrapped in a ``pd.Series`` and attached as the
        ``Text`` column by positional index.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (proportion rounded
        to 4 decimals), ``Topic_Keywords`` (comma-separated words of the
        dominant topic) and ``Text``.

    Notes
    -----
    Rows are gathered in a list and the frame is built once. The previous
    per-document ``DataFrame.append`` was quadratic and no longer exists in
    pandas 2.x. A document with no topic assignment gets a NaN row instead of
    being skipped, so every later row stays aligned with ``texts``. An empty
    corpus yields an empty frame with the expected columns.
    """
    topic_columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    missing = float("nan")
    keywords_by_topic = {}  # show_topic() is queried once per distinct topic
    records = []

    for row_list in ldamodel[corpus]:
        row = row_list[0] if ldamodel.per_word_topics else row_list

        if len(row) == 0:
            # Keep a placeholder so row i still corresponds to texts[i].
            records.append((missing, missing, missing))
            continue

        # max() returns the first pair with the highest proportion, which is
        # the same element a stable descending sort would put first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])

        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])

        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    sent_topics_df = pd.DataFrame(records, columns=topic_columns)
    # Float topic ids match what the notebook's recorded widget output shows
    # (1.0, 2.0, ...) and leave room for the NaN placeholders.
    sent_topics_df['Dominant_Topic'] = sent_topics_df['Dominant_Topic'].astype(float)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    sent_topics_df.columns = topic_columns + ["Text"]
    return sent_topics_df


## LDA - Non-Recommended Data

### Non-Recommendation & Negative reviews

In [7]:
# Tokenize the processed text of reviews with "Recommended IND" == 0 and
# "PA_Polarity" < 0, then fit an 8-topic LDA model on them.
Neg_NR_processed_docs=df["processed_Review_text"][(df["Recommended IND"]==0 ) & (df["PA_Polarity"]<0)].map(preprocess)
# NOTE: dictionary, bow_corpus, corpus_tfidf and lda_model_tfidf are rebound by
# every later buildLDA cell, so the cells that follow must run before those.
dictionary,bow_corpus,corpus_tfidf, lda_model_tfidf =buildLDA(Neg_NR_processed_docs,8)
# Number of reviews in this subset (shown as the cell output).
len(Neg_NR_processed_docs)

895

In [8]:
# Compute Coherence Score using c_v
coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=Neg_NR_processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.2677924873275451


In [9]:
# Plain-text alternative (disabled): LDAtopicSummary(lda_model_tfidf, [2,4])
# Interactive pyLDAvis view of the current model. It is given the raw
# bag-of-words corpus, whereas buildLDA trains the model on the TF-IDF corpus.
pyLDAvis.gensim.prepare(lda_model_tfidf, bow_corpus, dictionary=lda_model_tfidf.id2word)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Non-Recommendation & Positive reviews

In [10]:
# Reviews with "Recommended IND" == 0 and "PA_Polarity" > 0, tokenized from the
# processed text; these tokens are what the LDA model is fitted on.
Pos_NR_processed_docs=df["processed_Review_text"][(df["Recommended IND"]==0 ) & (df["PA_Polarity"]>0)].map(preprocess)
# Same row filter, but tokenized from the original "Review Text" column; a
# later cell uses these tokens to display example reviews per topic.
Pos_NR_processed_docs_org=df["Review Text"][(df["Recommended IND"]==0 ) & (df["PA_Polarity"]>0)].map(preprocess)
# Rebinds the shared globals (dictionary, bow_corpus, corpus_tfidf,
# lda_model_tfidf) to a 4-topic model for this subset.
dictionary,bow_corpus,corpus_tfidf, lda_model_tfidf =buildLDA(Pos_NR_processed_docs,4)

In [11]:
# Compute Coherence Score using c_v
coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=Pos_NR_processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.2680544876605632


In [12]:
# Plain-text alternative (disabled): LDAtopicSummary(lda_model_tfidf, [2,4])
# Interactive pyLDAvis view of the current model. It is given the raw
# bag-of-words corpus, whereas buildLDA trains the model on the TF-IDF corpus.
pyLDAvis.gensim.prepare(lda_model_tfidf, bow_corpus, dictionary=lda_model_tfidf.id2word)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [13]:
def demo(title):
    """Return up to 20 example reviews whose dominant topic is ``title - 1``.

    ``title`` is a 1-based topic number, so it is shifted back by one before
    being compared with the ``Dominant_Topic`` column of the notebook-level
    ``df_dominant_topic`` frame. Each matching ``Text`` entry (a token
    sequence) is joined into a single space-separated string.

    Returns ``None`` when ``title`` is NaN.
    """
    if np.isnan(title):
        return None
    topic_mask = df_dominant_topic["Dominant_Topic"] == title - 1
    token_lists = df_dominant_topic["Text"][topic_mask].tolist()
    examples = [" ".join(tokens) for tokens in token_lists]
    return examples[:20]

# Pair each document's dominant topic (from the current model and BoW corpus)
# with the tokens of its original "Review Text"; demo() reads this global.
df_dominant_topic = format_topics_sentences(lda_model_tfidf,bow_corpus,list(Pos_NR_processed_docs_org) )
# The dropdown offers topic ids shifted by +1; demo() subtracts 1 again.
interact(demo, title=np.sort(df_dominant_topic["Dominant_Topic"].unique())+1);

interactive(children=(Dropdown(description='title', options=(1.0, 2.0, 3.0, 4.0), value=1.0), Output()), _dom_…

## LDA - Recommended Data

### Recommendation & Negative reviews

In [14]:
# Reviews with "Recommended IND" == 1 and "PA_Polarity" < 0, tokenized from the
# processed text; these tokens are what the LDA model is fitted on.
Neg_R_processed_docs=df["processed_Review_text"][(df["Recommended IND"]==1 ) & (df["PA_Polarity"]<0)].map(preprocess)
# Same row filter, but tokenized from the original "Review Text" column; a
# later cell uses these tokens to display example reviews per topic.
Neg_R_processed_docs_org=df["Review Text"][(df["Recommended IND"]==1 ) & (df["PA_Polarity"]<0)].map(preprocess)

# Rebinds the shared globals (dictionary, bow_corpus, corpus_tfidf,
# lda_model_tfidf) to a 4-topic model for this subset.
dictionary,bow_corpus,corpus_tfidf, lda_model_tfidf =buildLDA(Neg_R_processed_docs,4)
# Number of reviews in this subset (shown as the cell output).
len(Neg_R_processed_docs)

820

In [15]:
# Compute Coherence Score using c_v
coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=Neg_R_processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.29405933945400564


In [17]:
# Plain-text alternative (disabled): LDAtopicSummary(lda_model_tfidf, [2,4])
# Interactive pyLDAvis view of the current model. It is given the raw
# bag-of-words corpus, whereas buildLDA trains the model on the TF-IDF corpus.
pyLDAvis.gensim.prepare(lda_model_tfidf, bow_corpus, dictionary=lda_model_tfidf.id2word)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [18]:
def demo(title):
    """Return up to 20 example reviews whose dominant topic is ``title - 1``.

    ``title`` is a 1-based topic number, so it is shifted back by one before
    being compared with the ``Dominant_Topic`` column of the notebook-level
    ``df_dominant_topic`` frame. Each matching ``Text`` entry (a token
    sequence) is joined into a single space-separated string.

    Returns ``None`` when ``title`` is NaN.
    """
    if np.isnan(title):
        return None
    topic_mask = df_dominant_topic["Dominant_Topic"] == title - 1
    token_lists = df_dominant_topic["Text"][topic_mask].tolist()
    examples = [" ".join(tokens) for tokens in token_lists]
    return examples[:20]

# Pair each document's dominant topic (from the current model and BoW corpus)
# with the tokens of its original "Review Text"; demo() reads this global.
df_dominant_topic = format_topics_sentences(lda_model_tfidf,bow_corpus,list(Neg_R_processed_docs_org) )
# The dropdown offers topic ids shifted by +1; demo() subtracts 1 again.
interact(demo, title=np.sort(df_dominant_topic["Dominant_Topic"].unique())+1);

interactive(children=(Dropdown(description='title', options=(1.0, 2.0, 3.0, 4.0), value=1.0), Output()), _dom_…

### Recommendation & Positive reviews

In [22]:
# Tokenize the processed text of reviews with "Recommended IND" == 1 and
# "PA_Polarity" > 0, then fit a 4-topic LDA model on them.
Pos_R_processed_docs=df["processed_Review_text"][(df["Recommended IND"]==1 ) & (df["PA_Polarity"]>0)].map(preprocess)
# Rebinds the shared globals (dictionary, bow_corpus, corpus_tfidf,
# lda_model_tfidf) used by the cells that follow.
dictionary,bow_corpus,corpus_tfidf, lda_model_tfidf =buildLDA(Pos_R_processed_docs,4)
# Number of reviews in this subset (shown as the cell output).
len(Pos_R_processed_docs)

17605

In [23]:
# Compute Coherence Score using c_v
coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=Pos_R_processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.26608653578502295


In [24]:
# Interactive pyLDAvis view of the current model. It is given the raw
# bag-of-words corpus, whereas buildLDA trains the model on the TF-IDF corpus.
pyLDAvis.gensim.prepare(lda_model_tfidf, bow_corpus, dictionary=lda_model_tfidf.id2word)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
