# Import Libraries and Dataset

In [None]:
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from tableone import TableOne

# Apply seaborn's "talk" plotting context to every figure drawn below.
sns.set_context(context="talk")

In [None]:
# Load the raw review data; the path is relative to the notebook's directory.
df = pd.read_csv(filepath_or_buffer="../data/MobileAppReviews.csv")

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

In [None]:
# Parse the raw 'time' column into pandas datetimes so the .dt accessor
# and min/max comparisons work in later cells.
parsed_time = pd.to_datetime(df["time"])
df["time"] = parsed_time

# Generate Descriptive Stats

In [None]:
# Headline facts about the dataset: app / genre coverage, the period the
# reviews span, and how many rows have no review text.
review_times = df["time"]
study_start = review_times.min()
study_end = review_times.max()
unique_genre = df["genre"].unique()
unique_apps = df["app_id"].nunique()

summary_lines = (
    f"Number of apps reviewed: {unique_apps} \n",
    f"Number App categories: {len(unique_genre)} \n",
    f"App categories: {unique_genre} \n",
    f"App data collected between {study_start} to {study_end} \n",
    f"Number of missing reviews: {df['reviews_text'].isna().sum()} \n",
)
for summary_line in summary_lines:
    print(summary_line)

In [None]:
# Reviewer-level statistics.
# Rows whose user_name is the placeholder "A Google user" are treated as
# reviews from unknown users.
is_unknown = df["user_name"] == "A Google user"

# 0/1 flag column; later cells use it for stratification.
df["unknown_user"] = is_unknown.astype("int64")

total_reviews = len(df)
# NOTE: nunique() counts the placeholder name as one user when it is present.
unique_users = df['user_name'].nunique()
unknown_users = int(is_unknown.sum())
mean = df['scores'].mean()

# Only named reviewers can be identified as repeat reviewers.
named_reviews = df[~is_unknown]

# Number of distinct named users that appear on more than one review.
reviews_per_user = named_reviews["user_name"].value_counts()
multi_review_users = int((reviews_per_user > 1).sum())

# Largest number of reviews a single named user left on one app. The previous
# version summed the all-zero `unknown_user` flag and therefore always gave 0.
reviews_per_app_user = named_reviews.groupby(["app_id", "user_name"]).size()
repeated_review = int(reviews_per_app_user.max()) if not reviews_per_app_user.empty else 0
# Original (misspelled) variable name kept in case later cells reference it.
repreated_review = repeated_review

print(f"Total textual reviews: {total_reviews} \n")
print(f"Total unique users: {unique_users} \n")
print(f"Total unknown users: {unknown_users} \n")
print(f"Total users who gave multiple reviews: {multi_review_users} \n")
print(f"Average rating for all apps based on the reviews: {round(mean,2)} \n")
print(f"Repeated reviews per app: {repeated_review} \n")

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [None]:
# Bar chart of how many reviews received each score, saved for the report.
score_plot_path = "../reports/figures/score_dist_plot.png"
ax = sns.countplot(data=df, x='scores', palette="pastel")
plt.savefig(score_plot_path, bbox_inches='tight', dpi=300)

# Generate Summary Measures of the dataset.

In [None]:
# Columns summarised in the table.
columns = ['genre', 'unknown_user', 'scores']

# Columns treated as categorical variables.
categorical = ['genre', 'unknown_user']

# Stratify the table by whether the reviewer is an unknown user.
groupby = 'unknown_user'

# Columns flagged as non-normal.
nonnormal = ['scores']

# Pass everything after the data frame by keyword so each list binds to the
# intended parameter regardless of the positional order in the installed
# tableone release.
# NOTE(review): newer tableone versions appear to add a `continuous`
# parameter between `categorical` and `groupby`, which would silently
# misbind positional arguments — confirm against the pinned version.
mytable = TableOne(
    df,
    columns=columns,
    categorical=categorical,
    groupby=groupby,
    nonnormal=nonnormal,
    pval=False,
)

In [None]:
# Render the summary table as a boxed text grid and print it.
rendered_table = mytable.tabulate(tablefmt="fancy_grid")
print(rendered_table)

In [None]:
# Extract the calendar year in which each review was given.
review_year = df["time"].dt.year
df["time_year"] = review_year

# Explore yearly trends in reviews.

In [None]:
# Count reviews per (genre, year). After count(), every remaining column holds
# the number of non-null values in that group; 'scores' is used as the count.
yearly_count = df.groupby(['genre', 'time_year']).count().reset_index()
plt.subplots(figsize=(20,15))
# DataFrame.pivot only accepts keyword arguments from pandas 2.0 onwards, so
# name index/columns/values explicitly (this also works on older releases).
yearly_count_matrix = yearly_count.pivot(index="genre", columns="time_year", values="scores")
ax = sns.heatmap(yearly_count_matrix, annot=False, cmap='crest')
plt.savefig("../reports/figures/score_time_dist_plot.png", dpi=300, bbox_inches='tight')

In [None]:
# Yearly geometric mean of the scores for each app genre (yearly samples can
# be small, and the median may be biased).
review_yearly = df.groupby(['genre', 'time_year']).scores.apply(stats.gmean).reset_index()
plt.subplots(figsize=(20,15))
# DataFrame.pivot only accepts keyword arguments from pandas 2.0 onwards, so
# name index/columns/values explicitly (this also works on older releases).
review_yearly_matrix = review_yearly.pivot(index="genre", columns="time_year", values="scores")
sns.heatmap(review_yearly_matrix, annot=False, cmap='crest')

# Word Count distribution

In [None]:
import nltk
import re
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import emoji
import unicodedata
import contractions

In [None]:
def to_lower(text):
    """Return *text* coerced to ``str`` and converted to lower case."""
    as_string = str(text)
    return as_string.lower()

def word_expansion(text):
    """Return *text* after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def text_formatter(text):
    """Strip markup, URLs and punctuation from a review string.

    The substitutions run in a deliberate order: HTML fragments and URLs are
    handled *before* the punctuation pass, because that pass replaces ``/``,
    ``:`` and ``.`` with spaces and would otherwise break the patterns that
    need them (previously ``<br />`` could never match for this reason).

    Args:
        text: The review text as a ``str``.

    Returns:
        The cleaned text with every whitespace run collapsed to one space.
        The result is not stripped, so it may start or end with a space.
    """
    # Replace emoji with the textual form produced by emoji.demojize.
    text = emoji.demojize(text)
    # HTML fragments first, while '/' is still present in the text.
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'<a href', ' ', text)
    # Remove only the URL itself, not the rest of the line after it; stop at
    # whitespace, quotes and angle brackets so surrounding markup survives.
    text = re.sub(r'https?://[^\s<>"\']+', ' ', text)
    text = re.sub(r'&amp;', '', text)
    # Punctuation and symbols become spaces so adjacent words stay separated.
    text = re.sub(r'[_"\-;%()|+&=*.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)
    # Drop a literal backslash together with the word character following it.
    text = re.sub(r'\\\w', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

        
def to_string(text):
    """Join an iterable of string tokens into one space-separated string."""
    separator = ' '
    return separator.join(text)

def text_preprocessing(text, expand_contraction = True):
    """Clean a raw review and return its lemmatized tokens.

    Args:
        text: The raw review; it is coerced to ``str`` by ``to_lower``.
        expand_contraction: When true, run ``word_expansion`` on the
            lower-cased text before formatting.

    Returns:
        A list of tokens. Single-character tokens are dropped and the
        remaining ones are lemmatized as verbs (``pos='v'``).
    """
    # Build the tokenizer and lemmatizer once per call instead of
    # constructing a new lemmatizer for every token.
    tokenizer = nltk.WordPunctTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # 1. Convert words to lower case
    text = to_lower(text)

    # 2. Expand contractions
    if expand_contraction:
        text = word_expansion(text)

    # 3. Format words and remove unwanted characters
    text = text_formatter(text)

    # 4. Tokenize each word
    tokens = tokenizer.tokenize(text)

    # 5. Lemmatize each word, skipping single-character tokens
    return [lemmatizer.lemmatize(token, pos='v') for token in tokens if len(token) > 1]

In [None]:
# Tokenise and lemmatise every review; each cell of the new column holds a list of tokens.
cleaned_reviews = df["reviews_text"].apply(text_preprocessing)
df["reviews_text_clean_list"] = cleaned_reviews

In [None]:
# Number of tokens in each cleaned review.
df["Token Length"] = df["reviews_text_clean_list"].apply(len)

In [None]:
# Histogram of tokens per review; the x-axis is limited to 0-125.
length_plot_path = "../reports/figures/len_dist_plot.png"
ax = sns.histplot(data=df, x='Token Length', palette="pastel")
ax.set_xlim(0, 125)
plt.savefig(length_plot_path, bbox_inches='tight', dpi=300)