# Topic Analysis

In [3]:
import datetime

import pandas as pd
import spacy
import re
import string
import numpy as np

import seaborn as sns 
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

import nltk
from nltk.corpus import stopwords
from spacy.tokens import Token
from tqdm import tqdm


import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

import pyLDAvis.gensim
pyLDAvis.enable_notebook()
from ipywidgets import interact

%matplotlib inline
np.random.seed(500)

In [4]:
# Load the reviews CSV; the path is resolved relative to the current working directory.
df = pd.read_csv("..//data//Womens Clothing E-Commerce Reviews Sentiment v2.csv")

In [5]:
## Discard records whose review text is missing or empty

# isna()/isnull() are aliases, so a single notna() check covers missing values;
# the second condition drops rows holding an empty string.
df = df[df.processed_Review_text.notna() & (df.processed_Review_text != "")]

## Tokenizing the review text

In [6]:
def preprocess(text):
    """Tokenize ``text`` and keep only the tokens worth modelling.

    The text is split with ``gensim.utils.simple_preprocess``. A token is kept
    when it is longer than three characters and is not a member of
    ``gensim.parsing.preprocessing.STOPWORDS``.

    Returns:
        list: the surviving tokens, in the order they were produced.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        word
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stop_words
    ]

In [7]:
def buildLDA(processed_docs, ip_num_topics=4, no_below=15, no_above=0.5,
             keep_n=100000, passes=2, workers=4, random_state=None):
    """Build a dictionary, BoW/TF-IDF corpora and an LDA model from token lists.

    Args:
        processed_docs: iterable of token lists (one list per document).
        ip_num_topics: number of topics requested from the LDA model.
        no_below, no_above, keep_n: forwarded to ``Dictionary.filter_extremes``
            to prune the vocabulary before the corpora are built.
        passes, workers: forwarded to ``gensim.models.LdaMulticore``.
        random_state: forwarded to ``LdaMulticore``; the default ``None``
            keeps the previous behaviour.

    Returns:
        tuple: ``(dictionary, bow_corpus, corpus_tfidf, lda_model_tfidf)``.
    """
    dictionary = gensim.corpora.Dictionary(processed_docs)

    # Prune the vocabulary in place before converting documents to bag-of-words.
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    # The LDA model is fitted on the TF-IDF weighted corpus, not the raw counts.
    lda_model_tfidf = gensim.models.LdaMulticore(
        corpus_tfidf,
        num_topics=ip_num_topics,
        id2word=dictionary,
        passes=passes,
        workers=workers,
        random_state=random_state,
    )

    return dictionary, bow_corpus, corpus_tfidf, lda_model_tfidf

In [8]:
def LDAtopicSummary(model, displaysubplot=[2,2]):
    """Print one summary line per topic returned by ``model.print_topics(-1)``.

    Args:
        model: object exposing ``print_topics``; each returned item is unpacked
            as an ``(index, description)`` pair.
        displaysubplot: accepted for interface compatibility; this function
            does not read it.
    """
    line_template = 'Topic: {} Word: {}'
    topic_entries = model.print_topics(-1)
    for topic_id, topic_terms in topic_entries:
        summary_line = line_template.format(topic_id, topic_terms)
        print(summary_line)

In [9]:
def format_topics_sentences(ldamodel, corpus, texts):
    """Tabulate the dominant topic of every document next to its text.

    Args:
        ldamodel: model supporting ``ldamodel[corpus]``, ``show_topic`` and a
            ``per_word_topics`` attribute.
        corpus: corpus passed to ``ldamodel[...]``.
        texts: the documents' texts, in the same order as ``corpus``.

    Returns:
        pandas.DataFrame with columns ``Dominant_Topic``, ``Perc_Contribution``
        (rounded to 4 decimals), ``Topic_Keywords`` (comma-separated words from
        ``show_topic``) and ``Text``.
    """
    topic_columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # show_topic() result per topic, computed once
    records = []

    # Get the main topic in each document
    for row_list in ldamodel[corpus]:
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if len(row) == 0:
            # Keep a placeholder so later rows stay aligned with `texts`.
            records.append((float("nan"), float("nan"), None))
            continue
        # max() returns the first highest-weighted pair, matching a stable
        # descending sort followed by taking element 0.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once instead of appending row by row
    # (DataFrame.append no longer exists in pandas 2.x).
    sent_topics_df = pd.DataFrame(records, columns=topic_columns)

    # Add original text to the end of the output; list() discards any index on
    # `texts` so rows are paired strictly by position.
    contents = pd.Series(list(texts))
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', "Text"]
    return sent_topics_df


## LDA - Non-Recommendation 

### Non-Recommendation & Negative reviews

In [10]:
# Not recommended (Recommended IND == 0) with strictly negative polarity.
Neg_NR_processed_docs = df.loc[
    (df["Recommended IND"] == 0) & (df["PA_Polarity"] < 0),
    "processed_Review_text",
].map(preprocess)
dictionary, bow_corpus, corpus_tfidf, lda_model_tfidf = buildLDA(
    Neg_NR_processed_docs, ip_num_topics=8
)
len(Neg_NR_processed_docs)

893

In [11]:
#LDAtopicSummary(lda_model_tfidf, [2,4])
# Prepare the pyLDAvis view for the current lda_model_tfidf; as the cell's last
# expression, the prepared object is what the notebook displays.
# NOTE(review): buildLDA fits the model on the TF-IDF corpus, while this call
# passes bow_corpus — confirm that mixing the two is intended.
pyLDAvis.gensim.prepare(lda_model_tfidf, bow_corpus, dictionary=lda_model_tfidf.id2word)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Non-Recommendation & Positive reviews

In [12]:
# Not recommended (Recommended IND == 0) with strictly positive polarity.
Pos_NR_processed_docs = df.loc[
    (df["Recommended IND"] == 0) & (df["PA_Polarity"] > 0),
    "processed_Review_text",
].map(preprocess)
dictionary, bow_corpus, corpus_tfidf, lda_model_tfidf = buildLDA(
    Pos_NR_processed_docs, ip_num_topics=9
)

In [13]:
# LDAtopicSummary(lda_model_tfidf, [2,4])
# Prepare the pyLDAvis view for the current lda_model_tfidf; as the cell's last
# expression, the prepared object is what the notebook displays.
# NOTE(review): buildLDA fits the model on the TF-IDF corpus, while this call
# passes bow_corpus — confirm that mixing the two is intended.
pyLDAvis.gensim.prepare(lda_model_tfidf, bow_corpus, dictionary=lda_model_tfidf.id2word)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [14]:
def demo(title):
    """Return up to 20 texts whose dominant topic equals ``title``.

    Reads the notebook-level ``df_dominant_topic`` frame and joins each entry
    of its ``Text`` column into one space-separated string.
    """
    topic_texts = df_dominant_topic.loc[df_dominant_topic["Dominant_Topic"] == title, "Text"]
    # Slice before joining so only the rows actually returned are processed.
    return [" ".join(tokens) for tokens in topic_texts.tolist()[:20]]

df_dominant_topic = format_topics_sentences(lda_model_tfidf, bow_corpus, list(Pos_NR_processed_docs))
# One dropdown entry per topic of the fitted model, instead of repeating the
# topic count as a literal.
interact(demo, title=range(lda_model_tfidf.num_topics));

interactive(children=(Dropdown(description='title', options=(0, 1, 2, 3, 4, 5, 6, 7, 8), value=0), Output()), …

## LDA - Recommended Data

### Recommendation & Negative reviews

In [15]:
# Recommended (Recommended IND == 1) with strictly negative polarity.
Neg_R_processed_docs = df.loc[
    (df["Recommended IND"] == 1) & (df["PA_Polarity"] < 0),
    "processed_Review_text",
].map(preprocess)
dictionary, bow_corpus, corpus_tfidf, lda_model_tfidf = buildLDA(
    Neg_R_processed_docs, ip_num_topics=10
)
len(Neg_R_processed_docs)

817

In [16]:
# LDAtopicSummary(lda_model_tfidf, [2,4])
# Prepare the pyLDAvis view for the current lda_model_tfidf; as the cell's last
# expression, the prepared object is what the notebook displays.
# NOTE(review): buildLDA fits the model on the TF-IDF corpus, while this call
# passes bow_corpus — confirm that mixing the two is intended.
pyLDAvis.gensim.prepare(lda_model_tfidf, bow_corpus, dictionary=lda_model_tfidf.id2word)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [17]:
df_dominant_topic = format_topics_sentences(lda_model_tfidf, bow_corpus, list(Neg_R_processed_docs))
def demo(title):
    """Return up to 20 texts whose dominant topic equals ``title``.

    Reads the notebook-level ``df_dominant_topic`` frame and joins each entry
    of its ``Text`` column into one space-separated string.
    """
    topic_texts = df_dominant_topic.loc[df_dominant_topic["Dominant_Topic"] == title, "Text"]
    # Slice before joining so only the rows actually returned are processed.
    return [" ".join(tokens) for tokens in topic_texts.tolist()[:20]]

# One dropdown entry per topic of the fitted model, instead of repeating the
# topic count as a literal.
interact(demo, title=range(lda_model_tfidf.num_topics));

interactive(children=(Dropdown(description='title', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), value=0), Output()…

### Recommendation & Positive reviews

In [18]:
# Recommended (Recommended IND == 1) with strictly positive polarity.
Pos_R_processed_docs = df.loc[
    (df["Recommended IND"] == 1) & (df["PA_Polarity"] > 0),
    "processed_Review_text",
].map(preprocess)
dictionary, bow_corpus, corpus_tfidf, lda_model_tfidf = buildLDA(
    Pos_R_processed_docs, ip_num_topics=12
)
len(Pos_R_processed_docs)

17609

In [19]:
# Prepare the pyLDAvis view for the current lda_model_tfidf; as the cell's last
# expression, the prepared object is what the notebook displays.
# NOTE(review): buildLDA fits the model on the TF-IDF corpus, while this call
# passes bow_corpus — confirm that mixing the two is intended.
pyLDAvis.gensim.prepare(lda_model_tfidf, bow_corpus, dictionary=lda_model_tfidf.id2word)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [20]:
# LDAtopicSummary(lda_model_tfidf, [2,4])
df_dominant_topic = format_topics_sentences(lda_model_tfidf, bow_corpus, list(Pos_R_processed_docs))
def demo(title):
    """Return up to 20 texts whose dominant topic equals ``title``.

    Reads the notebook-level ``df_dominant_topic`` frame and joins each entry
    of its ``Text`` column into one space-separated string.
    """
    topic_texts = df_dominant_topic.loc[df_dominant_topic["Dominant_Topic"] == title, "Text"]
    # Slice before joining so only the rows actually returned are processed.
    return [" ".join(tokens) for tokens in topic_texts.tolist()[:20]]

# One dropdown entry per topic of the fitted model, instead of repeating the
# topic count as a literal.
interact(demo, title=range(lda_model_tfidf.num_topics));

interactive(children=(Dropdown(description='title', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), value=0), …