# Import Libraries and data

In [None]:
import re
from tqdm import tqdm
import numpy as np 
import pandas as pd

import emoji
import contractions
import nltk
import gensim
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.cluster import KMeansClusterer, cosine_distance

from sentence_transformers import SentenceTransformer
import umap
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_context("talk")

In [None]:
# Load the raw review export and discard every row that has a missing value
# in any column.
df = pd.read_csv("../data/MobileAppReviews.csv").dropna()

# Text Cleaning

In [None]:
def to_lower(text):
    """Coerce ``text`` to ``str`` and return it lower-cased.

    Non-string inputs (numbers, ``None``, ...) are converted with ``str()``
    first, so the function never raises on them.
    """
    as_string = str(text)
    return as_string.lower()

def word_expansion(text):
    """Return ``text`` with its contractions expanded by ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def text_formatter(text):
    """Remove emojis, URLs, HTML remnants and punctuation from a string.

    Processing order:
      1. emojis are replaced by their textual alias (``emoji.demojize``);
      2. URLs are removed;
      3. the HTML fragments ``<br />``, ``<a href`` and ``&amp;`` are removed;
      4. the listed punctuation characters and apostrophes become spaces;
      5. a backslash followed by a word character is removed;
      6. every run of whitespace collapses to a single space.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Leading/trailing spaces are not stripped.
    """
    text = emoji.demojize(text)
    # Remove only the URL token itself. Matching `.*` instead would also
    # delete every word that follows the URL on the same line.
    text = re.sub(r'https?://\S+', '', text)
    # HTML remnants have to be handled before the punctuation class below:
    # that class replaces '/', after which '<br />' could never match.
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)
    # Drop literal escape sequences such as a backslash followed by "n".
    text = re.sub(r'\\\w', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

        
def to_string(text):
    """Join an iterable of string tokens into one space-separated string."""
    separator = ' '
    return separator.join(text)

def text_preprocessing(text, expand_contraction = True):
    """Turn a raw review into a list of cleaned, lemmatized tokens.

    Args:
        text: the raw review; it is converted to ``str`` by ``to_lower``.
        expand_contraction: when true, contractions are expanded with
            ``word_expansion`` before the text is formatted.

    Returns:
        A list of tokens. Tokens of length 1 are dropped and the remaining
        ones are lemmatized as verbs (``pos='v'``).
    """
    # 1. Convert words to lower case
    text = to_lower(text)

    # 2. Expand contractions
    if expand_contraction:
        text = word_expansion(text)

    # 3. Format words and remove unwanted characters
    text = text_formatter(text)

    # 4. Tokenize each word
    tokens = nltk.WordPunctTokenizer().tokenize(text)

    # 5. Lemmatize each word. The lemmatizer is created once per call
    #    rather than once per token.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in tokens if len(token) > 1]

In [None]:
# Token-list view of each review: lower-cased, formatted, tokenized and
# lemmatized by text_preprocessing.
df['reviews_text_clean_list'] = df["reviews_text"].apply(text_preprocessing)
# Despite its name this column is only lower-cased; no other cleaning is
# applied to it here.
df['reviews_text_clean'] = df["reviews_text"].apply(to_lower)

In [None]:
# NLTK's English stop words plus a few hand-picked extra words.
stopwords_list = stopwords.words('english')
stopwords_list.extend(['app', 'phone', 'work', 'time', 'use', 'get'])
# Every token of every review is tested for membership, so use a set (O(1)
# lookup) instead of scanning the list each time.
stopwords_set = set(stopwords_list)
df['reviews_text_clean_list'] = [
    [word for word in line if word not in stopwords_set]
    for line in df['reviews_text_clean_list']
]

In [None]:
# Number of tokens left in each review after stop-word removal.
df["token_len"] = df['reviews_text_clean_list'].apply(len)

In [None]:
# Keep only reviews with 5 tokens or more.
# .copy() makes the filtered frame an independent object, so the columns
# assigned to it in later cells do not raise SettingWithCopyWarning.
df = df[df["token_len"] >= 5].copy()

# Topic Model


## 1. Embedding: Sentence to Vector 

In [None]:
# Sentence-embedding model, referenced by its pretrained model name.
model = SentenceTransformer('all-distilroberta-v1')
# The lower-cased review texts are what gets embedded (not the token lists).
data = df['reviews_text_clean'].values
embeddings = model.encode(
    data,
    show_progress_bar=True,
)

## 2. Dimensionality Reduction

In [None]:
# 10-component UMAP projection of the sentence embeddings; this is the
# representation that the clustering step is fitted on.
map_embeddings = umap.UMAP(
    metric='cosine',
    n_components=10,
    n_neighbors=15,
).fit_transform(embeddings)

In [None]:
# Separate 2-component UMAP projection of the same embeddings, used only for
# the scatter plot of the clusters.
map_embeddings_viz = umap.UMAP(
    metric='cosine',
    n_components=2,
    n_neighbors=15,
).fit_transform(embeddings)

## 3. Clustering 

In [None]:
# Fit HDBSCAN on the 10-component UMAP projection.
cluster = hdbscan.HDBSCAN(
    cluster_selection_method='eom',
    metric='euclidean',
    min_cluster_size=100,
).fit(map_embeddings)

In [None]:
# Show every distinct cluster label together with the number of points that
# received it (the plotting code below treats label -1 as outliers).
np.unique(cluster.labels_, return_counts=True)

In [None]:
def plot_clusters(data_2d, cluster_labels):
    """Scatter-plot 2-D points coloured by their cluster label.

    Args:
        data_2d: two-column array-like holding the x and y coordinates.
        cluster_labels: one label per point; ``-1`` marks an outlier.

    Outliers are drawn in grey. All other points are coloured by label with
    the ``hsv_r`` colormap, and a colorbar is added. A new figure is opened
    on every call; nothing is returned.
    """
    points = pd.DataFrame(data_2d, columns=['x', 'y'])
    points['labels'] = cluster_labels

    noise = points[points.labels == -1]
    assigned = points[points.labels != -1]

    plt.figure()
    plt.scatter(noise.x, noise.y, color='#BDBDBD', s=0.05)
    plt.scatter(assigned.x, assigned.y, c=assigned.labels, s=0.05, cmap='hsv_r')
    plt.colorbar()

# Large canvas and font for the cluster overview; the settings must be in
# place before plot_clusters opens its figure.
plt.rcParams.update({
    'figure.figsize': [30, 20],
    'font.size': 18,
})
plot_clusters(map_embeddings_viz, cluster.labels_)

In [None]:
# Attach the HDBSCAN cluster label of each review as its topic id.
df['topic_id'] = cluster.labels_
# Store HDBSCAN's per-sample `probabilities_` value under the column name
# 'contribution'.
df['contribution'] = cluster.probabilities_

In [None]:
# Export the selected review columns together with the topic assignment;
# the DataFrame index is not written to the file.
df[['app_id', 'genre', 'user_name', 'reviews_text', 'scores', 'time', 'topic_id', 'contribution']].to_csv("../data/processed_data_new.csv", index=False)

In [None]:
# Parameter Tuning
# Re-join each review's filtered token list into one space-separated string.
df['reviews_clean'] = df["reviews_text_clean_list"].apply(' '.join)

def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute a class-based TF-IDF matrix for ``documents``.

    Args:
        documents: iterable of strings, one per class.
        m: numerator of the IDF term (the caller in this file passes the
            total number of reviews).
        ngram_range: forwarded to ``CountVectorizer``.

    Returns:
        ``(tf_idf, count)``: ``tf_idf`` has one row per vocabulary term and
        one column per document; ``count`` is the fitted ``CountVectorizer``.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    term_counts = count.transform(documents).toarray()

    # Term frequency: each document's counts divided by that document's
    # total count (terms on rows after the transpose).
    words_per_doc = term_counts.sum(axis=1)
    tf = term_counts.T / words_per_doc

    # Inverse frequency: log of m over each term's total count across all
    # documents, shaped as a column so it broadcasts over the documents.
    term_totals = term_counts.sum(axis=0)
    idf = np.log(m / term_totals).reshape(-1, 1)

    return tf * idf, count

def extract_top_n_words_per_topic(tf_idf, count, reviews_per_class, n=10):
    """Return the ``n`` highest-scoring words for every topic.

    Args:
        tf_idf: score matrix with terms on rows and topics on columns.
        count: fitted vectorizer exposing ``get_feature_names_out()``.
        reviews_per_class: table whose ``'topic_id'`` column lists the topic
            labels in the same order as the columns of ``tf_idf``.
        n: number of words to keep per topic.

    Returns:
        Dict mapping each topic label to a list of ``(word, score)`` tuples
        ordered from highest to lowest score.
    """
    vocabulary = count.get_feature_names_out()
    topic_labels = list(reviews_per_class['topic_id'])
    scores_by_topic = tf_idf.T
    # argsort is ascending, so the last n columns are the n best terms.
    best_columns = scores_by_topic.argsort()[:, -n:]

    top_n_words = {}
    for row, label in enumerate(topic_labels):
        descending = best_columns[row][::-1]
        top_n_words[label] = [
            (vocabulary[col], scores_by_topic[row][col]) for col in descending
        ]
    return top_n_words

# Grid search over HDBSCAN's min_cluster_size. For every candidate value the
# reviews are re-clustered, the top 5 c-TF-IDF words of each cluster are
# extracted, and the c_v coherence of those word lists is stored in
# scores_list under the candidate value.
scores_list = {}

# The tokenized corpus is not modified inside the loop, so the gensim
# dictionary is built once instead of once per candidate.
tokenized_reviews = df['reviews_text_clean_list']
dictionary = gensim.corpora.Dictionary(tokenized_reviews)

for min_cluster_size in tqdm(range(60, 160, 10)):
    # NOTE: this overwrites the global `cluster` and df['topic_id'] set by
    # the earlier clustering cell; after the loop both hold the result for
    # the last candidate value.
    cluster = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric='euclidean',
        cluster_selection_method='eom',
    ).fit(map_embeddings)
    df['topic_id'] = cluster.labels_

    # One concatenated document per cluster label.
    reviews_per_class = df.groupby(['topic_id'], as_index=False).agg({'reviews_clean': ' '.join})
    tf_idf, count = c_tf_idf(reviews_per_class.reviews_clean.values, m=len(df))
    top_n_words = extract_top_n_words_per_topic(tf_idf, count, reviews_per_class, n=5)

    # Keep only the words; the scores are not needed for coherence.
    topics = [[word for word, _ in topic_list] for topic_list in top_n_words.values()]

    cm = gensim.models.CoherenceModel(topics=topics, texts=tokenized_reviews,
                                      dictionary=dictionary,
                                      coherence='c_v')
    coherence_score = cm.get_coherence()
    scores_list[min_cluster_size] = coherence_score