In [None]:
import pandas as pd
import re
import unicodedata
import contractions

import matplotlib.patheffects as path_effects
import nltk
import numpy as np
import seaborn as sns
import gensim
nltk.download(['stopwords', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger', 'vader_lexicon'])
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer

from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import emoji

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import squarify
sns.set_context("talk")

In [None]:
# Load the review data set. Later cells read its `reviews_text`, `topic_id`,
# `genre` and `scores` columns.
df = pd.read_csv("../data/processed_data_new.csv")
# Optional filter (currently disabled): drop the rows whose topic_id is -1.
#df = df[~df["topic_id"].isin([-1])]

In [None]:
def to_lower(text):
    """Coerce ``text`` to ``str`` and return its lower-cased form."""
    as_string = str(text)
    return as_string.lower()

def word_expansion(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def text_formatter(text):
    """Strip markup, URLs and punctuation from ``text``.

    Steps, in order:
      1. emoji are replaced by their textual names (``emoji.demojize``);
      2. URLs are removed (only the URL itself, not the rest of the line);
      3. HTML leftovers (``<a href``, ``<br />``, ``&amp;``) become a space;
      4. punctuation and apostrophes become a space;
      5. a backslash followed by a word character is dropped;
      6. runs of whitespace collapse to a single space.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    text = emoji.demojize(text)
    # Remove just the URL token. The previous pattern (`https?://.*`) consumed
    # everything up to the end of the line, silently discarding any review
    # text that followed a link.
    text = re.sub(r'https?://\S+', ' ', text)
    # HTML fragments have to be handled BEFORE the punctuation pass: that pass
    # replaces '/' with a space, after which '<br />' can no longer match and
    # a stray "br" token would survive.
    text = re.sub(r'<a href', ' ', text)
    text = re.sub(r'<br\s*/?>', ' ', text)
    # Replace the entity with a space (not ''), matching how a literal '&' is
    # treated below, so "a&amp;b" does not become "ab".
    text = re.sub(r'&amp;', ' ', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = re.sub(r'\\\w', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

        
def to_string(text):
    """Join an iterable of string tokens into one space-separated string."""
    separator = ' '
    return separator.join(text)

def text_preprocessing(text, expand_contraction = True):
    """Turn one raw review into a list of lemmatized tokens.

    Pipeline: lower-case -> (optional) contraction expansion -> character
    clean-up (``text_formatter``) -> ``WordPunctTokenizer`` -> verb
    lemmatization. Tokens of length 1 are discarded.

    Parameters
    ----------
    text : object
        The review. Non-string values are stringified by ``to_lower``;
        ``None`` / NaN are treated as "no review".
    expand_contraction : bool, default True
        Whether to run ``word_expansion`` on the text.

    Returns
    -------
    list of str
        Lemmatized tokens (empty for a missing review).
    """
    # A missing value would otherwise be stringified to "none"/"nan" and end up
    # as a bogus word in the corpus. `text != text` is the NaN test.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # 1. Convert words to lower case
    text = to_lower(text)

    # 2. Expand contractions
    if expand_contraction:
        text = word_expansion(text)

    # 3. Format words and remove unwanted characters
    text = text_formatter(text)

    # 4. Tokenize each word
    tokens = nltk.WordPunctTokenizer().tokenize(text)

    # 5. Lemmatize each word as a verb. The lemmatizer is built once per call
    #    instead of once per token.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in tokens if len(token) > 1]

In [None]:
# Sequential row identifier.
df['id'] = range(len(df))
# Token list for every review.
df['reviews_clean_list'] = df["reviews_text"].apply(text_preprocessing)

# NLTK English stop words plus terms that are too generic for this corpus.
stopwords_list = stopwords.words('english')
stopwords_list.extend(['app', 'phone', 'work', 'time', 'use', 'get'])
# Membership is tested once per token of every review; a set makes each test
# O(1) instead of a linear scan over the list.
stopwords_set = set(stopwords_list)

df['reviews_clean_list'] = [[word for word in line if word not in stopwords_set]
                            for line in df['reviews_clean_list']]
# Space-joined version of the token list, used by the text-based steps below.
df['reviews_clean'] = df["reviews_clean_list"].apply(to_string)

# Generate Word Cloud

In [None]:
# Merge every cleaned review into one comma-separated string for the cloud.
review_words = ','.join(df['reviews_clean'].tolist())

cloud_options = dict(
    background_color="white",
    max_words=200,
    contour_width=8,
    contour_color="steelblue",
    collocations=False,
)
wordcloud = WordCloud(**cloud_options).generate(review_words)

# Render the cloud on a square figure without axes.
fig = plt.figure(1, figsize=(10, 10))
plt.axis('off')
plt.imshow(wordcloud)
plt.show()

In [None]:
# One word cloud per genre, saved as ../reports/figures/<genre>.png.
# Rows without a genre are skipped: NaN never compares equal to itself, so the
# filter below would select no reviews, and building the file name from a
# float would raise.
for cat in df['genre'].dropna().unique():
    review_words = ','.join(df.loc[df['genre'] == cat, 'reviews_clean'].tolist())
    # Generate the word cloud
    wordcloud = WordCloud(background_color="white",
                          max_words=200,
                          contour_width=8,
                          contour_color="steelblue",
                          collocations=False).generate(review_words)
    # Visualize the word cloud on a fresh figure (the previous code kept
    # drawing onto figure number 1).
    fig = plt.figure(figsize=(10, 10))
    plt.title(cat)
    plt.axis('off')
    plt.imshow(wordcloud)
    filename = f"../reports/figures/{cat}.png"
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure so a long genre list does not accumulate open figures.
    plt.close(fig)

# Topic Creation

In [None]:
# Build one document per topic: all cleaned reviews that share a topic_id are
# concatenated with a single space. The result has one row per topic_id.
reviews_per_class = df.groupby(['topic_id'], as_index = False).agg({'reviews_clean': ' '.join})

In [None]:
# Calculate term frequency and Inverse document frequency
def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute class-based TF-IDF scores for a set of per-class documents.

    Term frequency is the term count divided by the total number of counted
    terms of the document; the inverse-document-frequency factor is
    ``log(m / total count of the term across all documents)``.

    Parameters
    ----------
    documents : iterable of str
        One (concatenated) document per class.
    m : int
        Numerator of the IDF ratio. The caller in this notebook passes the
        number of individual reviews.
    ngram_range : tuple of (int, int), default (1, 1)
        Forwarded to ``CountVectorizer``.

    Returns
    -------
    tf_idf : numpy.ndarray
        Scores with one row per vocabulary term and one column per document.
    count : CountVectorizer
        The fitted vectorizer (holds the vocabulary).
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)
    # A document that contains no vocabulary term has w == 0. Dividing by it
    # would fill that document's column with NaN, and NaN sorts last in
    # argsort, i.e. it would be picked as the "top" terms. Leave those
    # frequencies at 0 instead.
    tf = np.divide(t.T, w, out=np.zeros(t.T.shape, dtype=float), where=w != 0)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count
  
# Score the per-topic documents. Note that m is the number of rows of `df`
# (individual reviews), not the number of per-topic documents.
tf_idf, count = c_tf_idf(reviews_per_class.reviews_clean.values, m=len(df))

In [None]:
def extract_top_n_words_per_topic(tf_idf, count, reviews_per_class, n=10):
    """Return the ``n`` best-scoring terms of every topic.

    Parameters
    ----------
    tf_idf : numpy.ndarray
        Scores with terms on the first axis and topics on the second.
    count : object
        Fitted vectorizer; only ``get_feature_names_out()`` is used.
    reviews_per_class : mapping or DataFrame
        Must provide ``'topic_id'``, ordered like the columns of ``tf_idf``.
    n : int, default 10
        Number of terms to keep per topic.

    Returns
    -------
    dict
        ``{topic_id: [(term, score), ...]}`` with the best term first.
    """
    vocabulary = count.get_feature_names_out()
    scores_by_topic = tf_idf.T

    # argsort is ascending: keep the last n columns, then flip each row so the
    # highest score comes first.
    ranked_columns = scores_by_topic.argsort()[:, -n:][:, ::-1]

    top_n_words = {}
    for row, topic in enumerate(list(reviews_per_class['topic_id'])):
        top_n_words[topic] = [
            (vocabulary[col], scores_by_topic[row][col]) for col in ranked_columns[row]
        ]
    return top_n_words

# Keep the 5 highest-scoring terms of each topic.
top_n_words = extract_top_n_words_per_topic(tf_idf, count, reviews_per_class, n=5)

In [None]:
def extract_topic_sizes(df):
    """Count the non-null ``reviews_clean`` entries of every topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``topic_id`` and ``reviews_clean``.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Topic ID"`` and ``"Size"``, sorted by ``"Size"`` descending.
    """
    per_topic_counts = df.groupby(['topic_id'])['reviews_clean'].count()
    sizes = per_topic_counts.reset_index()
    sizes = sizes.rename({"topic_id": "Topic ID", "reviews_clean": "Size"}, axis='columns')
    return sizes.sort_values("Size", ascending=False)

# Number of reviews per topic, largest topic first.
topic_sizes = extract_topic_sizes(df)

# Coherence Model

In [None]:
# Keep only the word of every (word, score) pair: one word list per topic.
topics = [[entry[0] for entry in topic_list] for topic_list in top_n_words.values()]

# Vocabulary built from the tokenized reviews.
dictionary = gensim.corpora.Dictionary(df['reviews_clean_list'])

cm = gensim.models.CoherenceModel(
    topics=topics,
    texts=df['reviews_clean_list'],
    dictionary=dictionary,
    coherence='c_v',
)

# Overall score first, then the per-topic breakdown used by the plot below.
coherence_score = cm.get_coherence()
print(coherence_score)
coherence_score_per_topic = cm.get_coherence_per_topic()

In [None]:
# Label every topic with its first two top words (newline-separated) so the
# y axis stays readable.
topics_str = ['\n '.join(t[:2]) for t in topics]
# zip pairs labels and scores by position, so this relies on `topics` and
# `coherence_score_per_topic` being in the same order.
data_topic_score = pd.DataFrame(data=zip(topics_str, coherence_score_per_topic), columns=['Topic', 'Coherence'])

# Tall, narrow figure: one horizontal bar per topic label.
plt.subplots(figsize=(5,30))
sns.barplot(x="Coherence", y="Topic", data=data_topic_score, color="b", ci=None)
plt.yticks(rotation=0)
plt.show()


# Sentiment Analysis

In [None]:
# Single NLTK sentiment analyzer instance, shared by the scoring function below.
sia = SentimentIntensityAnalyzer()

In [None]:
def is_positive(text):
    """Return the ``"pos"`` entry of ``sia.polarity_scores(text)``.

    Despite the function name, the result is the raw ``"pos"`` value of the
    analyzer's score dictionary, not a boolean, and the ``"compound"`` score
    is not consulted.
    """
    return sia.polarity_scores(text)["pos"]

In [None]:
# Score the original review text rather than `reviews_clean`. The cleaned text
# has lost punctuation, letter case and - through the stop-word filter -
# negations such as "not" / "no", all of which the VADER analyzer uses as
# sentiment cues ("not good" would be scored as "good").
# Missing reviews are scored as an empty string.
df["pos_sentiment"] = df["reviews_text"].fillna("").astype(str).apply(is_positive)

In [None]:
sns.set_style("whitegrid")

# Mean positive score per topic, then the 20 topics with the lowest mean.
mean_pos_by_topic = df.groupby(["topic_id"])['pos_sentiment'].mean().reset_index()
result = mean_pos_by_topic.sort_values('pos_sentiment').head(20)

# Bars follow the sorted order of `result` (lowest mean first).
sns.barplot(
    x='topic_id',
    y="pos_sentiment",
    data=result,
    order=result['topic_id'],
    palette="pastel",
)
plt.xticks(rotation=90)
plt.savefig("../reports/figures/sentiment_dist_plot.png", dpi=300, bbox_inches='tight')
plt.show()

# User Issues

In [None]:
# Print the top terms (with scores) of every topic, in ascending topic_id order.
for i in df["topic_id"].sort_values().unique():
    print(i, top_n_words[i], "\n")

In [None]:
# Flatten the top terms of the topics listed in `result` into
# (topic_id, word, score) rows.
df_list = [
    (topic, word_score[0], word_score[1])
    for topic in result["topic_id"].unique()
    for word_score in top_n_words[topic]
]
df_top_topics = pd.DataFrame(df_list, columns=['topic_id', 'word', 'score'])

In [None]:
# One treemap per topic in `result`: tile area follows the term score.
for group_id in result["topic_id"].unique():
    # Filter once and reuse for both sizes and labels (at most 10 terms).
    topic_rows = df_top_topics[df_top_topics['topic_id'] == group_id].head(10)

    # Explicit figure/axes: previously `ax` was bound to the return value of
    # `.set(...)` rather than to the Axes that squarify drew on.
    fig, ax = plt.subplots()
    squarify.plot(sizes=topic_rows['score'].values,
                  label=topic_rows['word'].values,
                  alpha=0.6,
                  ax=ax)
    ax.set(title=f'Topic ID: {group_id}')
    ax.axis('off')
    fig.savefig(f"../reports/figures/top5_dist_{group_id}.png", dpi=300, bbox_inches='tight')
    plt.show()
    # Free the figure before drawing the next topic.
    plt.close(fig)

# User Satisfaction

In [None]:
import statsmodels.api as sm

In [None]:
# Work on a copy of `df` without the topics listed in the code below.
# NOTE(review): the list kept in the next comment line is shorter than the one
# actually used (it lacks 13 and 12) - presumably an earlier version; confirm
# which exclusion list is intended.
#[16, 0, 32, 7, 1, 33, 34, 35, 31]
csf_df = df[~df['topic_id'].isin([16, 0, 32, 7, 1, 33, 34, 35, 31, 13, 12])].copy()

In [None]:
# Satisfaction factor -> topic ids that represent it. Each factor becomes a
# 0/1 indicator column on csf_df.
# NOTE(review): topics 13 and 12 ("Payments") and topics 32 and 7 (listed under
# "Software Issue") are already removed where csf_df is built from df, so
# "Payments" stays 0 for every row and those two "Software Issue" ids never
# match - confirm whether that is intended.
factor_topics = {
    "Usability": [17],
    "Online Service": [2, 30, 10],
    "Biometric Auth": [19],
    "Linked Services": [9, 14, 3],
    "Payments": [13, 12],
    "Software Issue": [-1, 25, 23, 21, 32, 22, 24, 7],
    "Information": [5],
    "Authentication": [20, 27, 18, 26],
    "Documents": [15, 29],
    "Account Access": [8],
}

# Create every indicator column as 0 first ...
for factor in factor_topics:
    csf_df[factor] = 0

# ... then flag the rows whose topic belongs to the factor.
for factor, topic_ids in factor_topics.items():
    csf_df.loc[csf_df["topic_id"].isin(topic_ids), factor] = 1

#csf_df["intercept"] = 1
# Binary target: 1 when the score is 4 or 5, otherwise 0.
csf_df["binary_scores"] = 0
csf_df.loc[csf_df["scores"].isin([4, 5]), "binary_scores"] = 1
csf_df.columns

In [None]:
# Predictor indicators and binary response. "Payments" is not among the
# predictors.
predictor_columns = [
    'Usability', 'Online Service', 'Biometric Auth',
    'Linked Services', 'Software Issue', 'Information',
    'Authentication', 'Documents', 'Account Access',
]
x = csf_df[predictor_columns]
y = csf_df['binary_scores']

# add_constant supplies the constant column for the intercept.
design_matrix = sm.add_constant(x)
binomial_model = sm.GLM(y, design_matrix, family=sm.families.Binomial())

binomial_results = binomial_model.fit()

print(binomial_results.summary())