In [4]:
# BEST PRACTICE: import built-in packages first
# from PACKAGE import OBJECT lets us bring only what we need into our namespace
from warnings import filterwarnings
# import PACKAGE, brings the all modules into one object named PACKAGE
import re

# BEST PRACTICE: import third-party packages second
# from PACKAGE import OBJECT as ALIAS, renames the object in our namespace
from geopy.geocoders import Nominatim as geopy_Nominatim
from nltk.tokenize import word_tokenize as nltk_word_tokenize
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer as nltk_WordNetLemmatizer
from nltk.tag import pos_tag as nltk_pos_tag
from pandas_profiling import ProfileReport
from wordcloud import (
    # use parenthesis to import multiple objects and even add aliases
    STOPWORDS as wordcloud_STOPWORDS,
    WordCloud
)
import matplotlib.pyplot as plt
import nltk
# aliases can be added when fully importing packages
import pandas as pd
# aliases can also be used when importing an individual module from the package
import plotly.express as px
import plotly.io as pio
import seaborn as sns

# BEST PRACTICE: import custom packages last
# for example: import custom_module as cm

from nltk.corpus import wordnet
import nltk
import re
import contractions
from nltk.corpus import wordnet
from gensim.models import Phrases
from gensim import corpora
import gensim
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
import string
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.stem import WordNetLemmatizer
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

In [5]:
# overwrite the 'cocoon_pharmacy_df' object with a dataframe of our 'new' file
cocoon_pharmacy_df = pd.read_csv(
    '../data/cocoon_pharmacy_location_added.csv',
    # this dataframe was stored as .csv, with the index in the first position
    index_col = 0
)

In [6]:
body_moisturizers = cocoon_pharmacy_df[cocoon_pharmacy_df['product_cat'] == 'Body Moisturisers']

In [7]:
def get_wordnet_pos(treebank_tag):

  if treebank_tag.startswith('J'):
      return wordnet.ADJ
  elif treebank_tag.startswith('V'):
      return wordnet.VERB
  elif treebank_tag.startswith('N'):
      return wordnet.NOUN
  elif treebank_tag.startswith('R'):
      return wordnet.ADV
  else:
      return wordnet.NOUN  
    
def first_preprocessing(pdf):
  stopwords = nltk.corpus.stopwords.words('english')

  eng = pdf.copy(deep = True)
  
  eng['body_review_cleaned'] = eng['body_review'].apply(lambda x: contractions.fix(x.lower().strip())) #lower case, expand contractions, and strip spaces
  eng['body_review_cleaned'] = eng['body_review_cleaned'].apply(lambda x: ' '.join(s for s in x.split() if not any(c.isdigit() for c in s))) #remove anyword containing a digit
  eng['body_review_cleaned'] = eng['body_review_cleaned'].apply(lambda x: "".join([char for char in x if char not in string.punctuation])) #remove punctuations
  eng['body_review_cleaned'] = eng['body_review_cleaned'].apply(lambda x: ' '.join([w for w in x.split() if w not in stopwords])) #remove stopwords
  eng['body_review_cleaned'] = eng['body_review_cleaned'].apply(lambda x: re.sub("(\")?(\')?(“)?(”)?",'',x)) #remove “ and ”
  eng['body_review_cleaned'] = eng['body_review_cleaned'].apply(lambda x: ' '.join([WordNetLemmatizer().lemmatize(word,get_wordnet_pos(pos_tag)) for pos in [nltk.pos_tag(x.split())]  for (word,pos_tag) in pos]))
  
  sentence_stream = [doc.split(" ") for doc in eng['body_review_cleaned'].values]
  
  bigram = gensim.models.Phrases(sentence_stream, min_count=15, threshold=5) # higher threshold fewer phrases.
  trigram = gensim.models.Phrases(bigram[sentence_stream], min_count=15,threshold=5)  

  # Faster way to get a sentence clubbed as a trigram/bigram
  bigram_mod = gensim.models.phrases.Phraser(bigram)
  trigram_mod = gensim.models.phrases.Phraser(trigram)
  
  return pdf, eng, sentence_stream, bigram_mod, trigram_mod
  
def later_preprocessing(text, bigram_mod, trigram_mod):
    
    text = bigram_mod[text.split()] #bigram
    text = trigram_mod[bigram_mod[text]] #trigram
    text = ' '.join([w.strip() for w in text if len(w.strip()) > 2 and w.strip() not in ['no','qc']]) # remove short words

    return text

In [8]:
pdf, eng, sentence_stream, bigram_mod, trigram_mod = first_preprocessing(cocoon_pharmacy_df)
eng['body_review_cleaned'] = eng['body_review_cleaned'].apply(lambda x: later_preprocessing(x,bigram_mod, trigram_mod))

In [9]:
sentence_streams = [doc.split(" ") for doc in eng['body_review_cleaned'].values]
id2word = corpora.Dictionary(sentence_streams)
id2word.filter_extremes(no_below=7, no_above=0.95, keep_n=25000)
corpus = [id2word.doc2bow(text) for text in sentence_streams]
print('Total number of unique words after filtering extremes: {}'.format(len(id2word)))

Total number of unique words after filtering extremes: 236


In [10]:
optimal_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
model_topics = optimal_model.show_topics(formatted=False)

In [11]:
pprint(optimal_model.print_topics())

[(0,
  '0.136*"use" + 0.118*"skin" + 0.046*"cream" + 0.044*"dry_skin" + '
  '0.041*"love" + 0.034*"help" + 0.033*"smell" + 0.025*"really" + 0.023*"year" '
  '+ 0.020*"day"'),
 (1,
  '0.054*"excellent" + 0.043*"cream" + 0.040*"body" + 0.038*"dry" + '
  '0.038*"time" + 0.029*"oil" + 0.025*"greasy" + 0.024*"hand" + 0.023*"skin" + '
  '0.022*"face"'),
 (2,
  '0.374*"good" + 0.093*"like" + 0.055*"one" + 0.048*"work" + 0.045*"balm" + '
  '0.043*"little" + 0.039*"quality" + 0.030*"price" + 0.024*"son" + '
  '0.016*"always"'),
 (3,
  '0.221*"product" + 0.078*"great" + 0.063*"recommend" + 0.053*"buy" + '
  '0.053*"nice" + 0.051*"available" + 0.030*"happy" + 0.028*"perfect" + '
  '0.021*"eczema" + 0.021*"soothe"')]


In [12]:
top_n_words = 10
topics = optimal_model.show_topics(
    num_topics =4, num_words = top_n_words, formatted = False)

for _, infos in topics:
    probs = [prob for _, prob in infos]
    sns.lineplot(range(top_n_words), probs, marker = '*')

plt.xlabel('Word rank')
plt.ylabel('Weights')
plt.title('Weights of Top {} Words in each Topic'.format(top_n_words))
plt.show()



In [13]:
pyLDAvis.enable_notebook()
LDAvis_prepared = gensimvis.prepare(optimal_model, corpus, id2word)
LDAvis_prepared