In [None]:
import pandas as pd
import re
import unicodedata
import contractions
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.style.use("fivethirtyeight")
pd.set_option('display.max_colwidth', 80)
import matplotlib.patheffects as path_effects
import nltk
import numpy as np
import seaborn as sns
import gensim
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import emoji

In [None]:
def to_lower(text):
    """Coerce ``text`` to ``str`` and return it in lower case."""
    as_string = str(text)
    return as_string.lower()

def word_expansion(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def text_formatter(text):
    """Remove URLs, HTML remnants, punctuation and escape sequences from ``text``.

    The substitutions run in a fixed order: markup patterns are handled *before*
    the punctuation pass, because that pass replaces ``/`` and ``&`` and would
    otherwise prevent the ``<br />`` pattern from ever matching.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text with every run of whitespace collapsed to one space.
    """
    # Emoji are rewritten as textual ``:name:`` aliases; the punctuation pass
    # below turns the ':' delimiters and '_' separators into spaces.
    text = emoji.demojize(text)
    # Drop only the URL itself (up to the next whitespace). Matching ``.*`` here
    # would also delete every word that follows the URL on the same line.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'<a href', ' ', text)
    # Must run before '/' is stripped by the punctuation pass.
    text = re.sub(r'<br />', ' ', text)
    # Replace with a space (like a raw '&') so neighbouring words are not glued together.
    text = re.sub(r'&amp;', ' ', text)
    text = re.sub(r'[_"\-;%()|+&=*.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)
    # Remove a literal backslash followed by one word character (e.g. the two characters "\n").
    text = re.sub(r'\\\w', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

        
def to_string(text):
    """Join an iterable of strings into one string separated by single spaces."""
    return str.join(' ', text)

def text_preprocessing(text, expand_contraction = True):
    """Convert one raw review into a list of cleaned, lemmatised tokens.

    Pipeline: lower-case -> (optional) contraction expansion -> character
    clean-up -> tokenisation -> verb lemmatisation. Tokens of length 1 are
    dropped.

    Parameters
    ----------
    text : object
        Raw review text. Non-string values are stringified by ``to_lower``.
    expand_contraction : bool, default True
        Whether to run ``word_expansion`` on the lower-cased text.

    Returns
    -------
    list of str
        The processed tokens; an empty list for missing values.
    """
    # Missing values (None / NaN / pd.NA) would otherwise be stringified to
    # "none" / "nan" and end up as bogus tokens in the counts and the topic model.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return []

    # 1. Convert words to lower case
    text = to_lower(text)

    # 2. Expand contractions
    if expand_contraction:
        text = word_expansion(text)

    # 3. Format words and remove unwanted characters
    text = text_formatter(text)

    # 4. Tokenize each word
    tokens = nltk.WordPunctTokenizer().tokenize(text)

    # 5. Lemmatize each word as a verb; build the lemmatizer once per call
    #    instead of once per token.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in tokens if len(token) > 1]

In [None]:
# Load the reviews CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/MobileAppReviews.csv")

In [None]:
# Token list per review, produced by the preprocessing pipeline
df['reviews_text_clean_list'] = df["reviews_text"].map(text_preprocessing)
# Space-joined version of the same tokens
df['reviews_text_clean'] = df["reviews_text_clean_list"].map(to_string)

In [None]:
# Join all cleaned reviews with spaces so the whitespace split below yields
# individual words. A comma separator would glue the last word of one review
# to the first word of the next (e.g. "great,love") and distort the counts.
review_words = ' '.join(df['reviews_text_clean'].values)

# Count each word
counter = Counter(review_words.split())
most_frequent = counter.most_common(30)

# Bar plot of the 30 most frequent words. A named frame is used instead of `_`,
# which IPython reserves for the last cell output.
fig, ax = plt.subplots(figsize=(20, 10))
most_frequent_df = pd.DataFrame(most_frequent, columns=["words", "count"])
sns.barplot(x='words', y='count', data=most_frequent_df, palette='winter', ax=ax)
plt.xticks(rotation=45);

In [None]:
# NLTK's English stop words plus extra corpus-specific words to exclude
stopwords_list = stopwords.words('english') + ['app', 'phone', 'work', 'time', 'use', 'get']

In [None]:
# Set membership is O(1); testing against the list would rescan it for every token.
stopword_lookup = set(stopwords_list)
df['reviews_text_clean_list'] = [
    [word for word in line if word not in stopword_lookup]
    for line in df['reviews_text_clean_list']
]

In [None]:
# Rebuild the space-joined review strings from the current token lists
df['reviews_text_clean'] = df["reviews_text_clean_list"].map(to_string)

In [None]:
# Join all cleaned reviews with spaces so the whitespace split below yields
# individual words. A comma separator would glue the last word of one review
# to the first word of the next (e.g. "great,love") and distort the counts.
review_words = ' '.join(df['reviews_text_clean'].values)

# Count each word
counter = Counter(review_words.split())
most_frequent = counter.most_common(50)

# Bar plot of the 50 most frequent words. A named frame is used instead of `_`,
# which IPython reserves for the last cell output.
fig, ax = plt.subplots(figsize=(20, 10))
most_frequent_df = pd.DataFrame(most_frequent, columns=["words", "count"])
sns.barplot(x='words', y='count', data=most_frequent_df, palette='winter', ax=ax)
plt.xticks(rotation=45);

In [None]:
# Generate the word cloud
# Configure the cloud first, then fill it from the combined review text
wordcloud = WordCloud(
    collocations=False,
    contour_color="steelblue",
    contour_width=8,
    max_words=200,
    background_color="white",
)
wordcloud.generate(review_words)

# Draw the cloud on a square figure with the axes hidden
fig = plt.figure(num=1, figsize=(10, 10))
plt.axis('off')
plt.imshow(wordcloud)
plt.show()

In [None]:
# One word cloud per genre, shown inline and saved under ../reports/figures/.
# Missing genres are skipped: NaN never compares equal to itself, so it would
# select no reviews, and a float NaN cannot be concatenated into the file path.
for cat in df['genre'].dropna().unique():
    review_words = ','.join(df.loc[df['genre'] == cat, 'reviews_text_clean'].values)
    # Generate the word cloud
    wordcloud = WordCloud(background_color="white",
                          max_words=200,
                          contour_width=8,
                          contour_color="steelblue",
                          collocations=False).generate(review_words)
    # Visualize the word cloud on a fresh figure for every genre
    fig = plt.figure(figsize=(10, 10))
    plt.title(cat)
    plt.axis('off')
    plt.imshow(wordcloud)
    # A '/' inside a genre name would be interpreted as a directory separator.
    filename = "../reports/figures/" + str(cat).replace("/", "_") + ".png"
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure so figures do not accumulate across iterations.
    plt.close(fig)

In [None]:
# Build the gensim dictionary from the per-review token lists
dictionary = gensim.corpora.Dictionary(documents=df['reviews_text_clean_list'])

# Bag-of-words corpus: one doc2bow result per review
corpus = list(map(dictionary.doc2bow, df['reviews_text_clean_list']))

In [None]:
# Compute the c_v coherence score for LDA models with 1 to 29 topics
number_of_topics = []
coherence_score = []
for i in range(1, 30):
    # Fixed seed (same value as the final model) so the coherence curve is
    # reproducible from one run of the notebook to the next.
    lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                        id2word=dictionary,
                                                        iterations=800,
                                                        num_topics=i,
                                                        random_state=100)

    coherence_model_lda = gensim.models.CoherenceModel(model=lda_model,
                                                       texts=df['reviews_text_clean_list'],
                                                       dictionary=dictionary,
                                                       coherence='c_v')

    coherence_lda = coherence_model_lda.get_coherence()

    number_of_topics.append(i)
    coherence_score.append(coherence_lda)

# Create a dataframe of coherence score by number of topics
topic_coherence = pd.DataFrame({'number_of_topics': number_of_topics,
                                'coherence_score': coherence_score})

# Line plot of coherence against the number of topics, saved to the reports folder
sns.set_context("talk")
ax = sns.lineplot(data=topic_coherence, x='number_of_topics', y='coherence_score')
plt.savefig("../reports/figures/coherence_plot.png", dpi=300, bbox_inches='tight')

In [None]:
# Topic count used for the final model
n_topics = 20

# Fit the final LDA model with a fixed random_state
lda_model = gensim.models.ldamulticore.LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=n_topics,
    passes=10,
    iterations=1000,
    chunksize=10,
    alpha='symmetric',
    per_word_topics=True,
    random_state=100,
)

In [None]:
# Print every topic as "Topic: <id> Word: <weighted terms>"
for topic_entry in lda_model.print_topics(num_topics=-1):
    print("Topic: {} Word: {}".format(*topic_entry))