## App review sentiment analysis

#### Importing Libraries

In [25]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from textblob import TextBlob
import psycopg2
from psycopg2 import sql
from src.utils import check_missing_data
from scripts.db_utils import connect, sql_to_dataframe

#### Download NLTK data

In [14]:
# NLTK resources fetched up front so later cells can run on a fresh machine.
for nltk_resource in (
    'punkt',                       # tokenization data
    'stopwords',                   # stopwords data
    'averaged_perceptron_tagger',  # part-of-speech tagging data
    'maxent_ne_chunker',           # named entity recognition data
    'words',                       # words corpus
    'wordnet',                     # needed by WordNetLemmatizer
):
    nltk.download(nltk_resource)

# Kept as the final expression so the cell still displays the call's return value.
nltk.download('brown')

[nltk_data] Downloading package punkt to /home/vagrant/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vagrant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vagrant/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/vagrant/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/vagrant/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vagrant/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to /home/vagrant/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

#### Load data

In [15]:
# Load the BOA app review data from the database into a DataFrame.
query = """ SELECT * FROM public.google_play_reviews  """

# Open the connection
conn = connect()
try:
    df_reviews = sql_to_dataframe(conn, query)
finally:
    # Always release the connection, even when the query fails,
    # so a failed run does not leave a dangling session behind.
    conn.close()

Connecting..
All good, Connection successful!


#### Data Cleaning

In [16]:
# checking for missing values
missing_data_df_reveiws = check_missing_data(df_reviews)
print(missing_data_df_reveiws)

               Column Name  Missing Values  Percentage Missing
5   review_created_version              87                29.0
7            reply_content             300               100.0
8               replied_at             300               100.0
9              app_version              87                29.0
12               sentiment             300               100.0


In [17]:
# Handle missing values 
df_reviews['review_created_version'] = df_reviews['review_created_version'].ffill()
df_reviews['app_version'] = df_reviews['app_version'].ffill()

In [18]:
# checking for missing values
missing_data_df_reveiws = check_missing_data(df_reviews)
print(missing_data_df_reveiws)

               Column Name  Missing Values  Percentage Missing
5   review_created_version               1            0.333333
7            reply_content             300          100.000000
8               replied_at             300          100.000000
9              app_version               1            0.333333
12               sentiment             300          100.000000


In [19]:
# Drop the rows that still lack version information after the forward fill.
required_version_columns = ['review_created_version', 'app_version']
df_reviews.dropna(subset=required_version_columns, inplace=True)

In [20]:
# checking for missing values
missing_data_df_reveiws = check_missing_data(df_reviews)
print(missing_data_df_reveiws)

      Column Name  Missing Values  Percentage Missing
7   reply_content             299               100.0
8      replied_at             299               100.0
12      sentiment             299               100.0


#### Data Preprocessing

In [21]:
# Preprocess the review text
stop = set(stopwords.words('english'))
lemma = WordNetLemmatizer()
df_reviews['cleaned_text'] = df_reviews['user_comments'].apply(lambda x: ' '.join([lemma.lemmatize(word) for word in str(x).lower().split() if word not in stop]))

#### Perform sentiment analysis

In [22]:
def categorize_sentiment(polarity, threshold=0.0):
    """Map a polarity score to a sentiment label.

    Parameters
    ----------
    polarity : float
        Polarity score; the sign decides the label.
    threshold : float, optional
        Half-width of the neutral band around zero. Scores whose absolute
        value does not exceed it are labelled 'neutral'. The default of 0.0
        reproduces the plain sign-based rule.

    Returns
    -------
    str
        'positive', 'negative' or 'neutral'.

    Raises
    ------
    ValueError
        If ``threshold`` is negative.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")
    if polarity > threshold:
        return 'positive'
    if polarity < -threshold:
        return 'negative'
    # Covers the neutral band and any score that compares false on both
    # sides (e.g. NaN), matching the original fall-through.
    return 'neutral'

def _polarity_of(comment):
    """Return the TextBlob polarity of one raw review comment."""
    return TextBlob(str(comment)).sentiment.polarity


# Score the raw comments (not the cleaned text), then bucket the scores into labels.
df_reviews['sentiment_score'] = df_reviews['user_comments'].apply(_polarity_of)
df_reviews['sentiment'] = df_reviews['sentiment_score'].apply(categorize_sentiment)

In [23]:
# Preview the first rows to sanity-check the new cleaned_text, sentiment_score
# and sentiment columns.
# NOTE(review): the preview includes username and user_image columns — confirm
# this output is fine to share before committing the notebook.
df_reviews.head()

Unnamed: 0,id,review_id,username,user_image,likes,review_created_version,created_at,reply_content,replied_at,app_version,score,user_comments,sentiment,cleaned_text,sentiment_score
1,2,e20abe49-8fe7-42fe-af3a-91399875b21a,Abbatu Ermias,https://play-lh.googleusercontent.com/a/ACg8oc...,0,24.04.23,2024-05-21 05:06:00,,,24.04.23,5,Awe ሃረፍ nawe,neutral,awe ሃረፍ nawe,0.0
2,3,55c833c8-a942-47bb-aaaa-e022f39e28af,Habtsh Darge,https://play-lh.googleusercontent.com/a/ACg8oc...,0,24.04.23,2024-05-20 08:07:00,,,24.04.23,1,በጣም ቀፋፊ አፕ ነው ። ሰርቶ አያውቅም ። በጣም ብዙ ችግር አለበት ከአ...,neutral,በጣም ቀፋፊ አፕ ነው ። ሰርቶ አያውቅም ። በጣም ብዙ ችግር አለበት ከአ...,0.0
3,4,d62d6d95-2900-4038-aa26-c9b42835c971,Usman,https://play-lh.googleusercontent.com/a/ACg8oc...,0,24.04.23,2024-05-19 16:27:00,,,24.04.23,1,This application is very bad . Please modify t...,negative,application bad . please modify like cbe birri...,-0.91
4,5,20c4de51-f5cd-41f3-b21a-efdf8cb4463e,Eskender Million,https://play-lh.googleusercontent.com/a-/ALV-U...,1,24.04.23,2024-05-17 20:37:00,,,24.04.23,1,"It has a very complex, inconvenient to use, an...",negative,"complex, inconvenient use, heavy step, underst...",-0.228958
5,6,f7b29ba8-e90e-4dd6-bf6e-f1480b119456,Leta Teshome,https://play-lh.googleusercontent.com/a-/ALV-U...,2,24.04.23,2024-05-17 00:40:00,,,24.04.23,1,"Never try Apollo. To be honest, this app will ...",positive,"never try apollo. honest, app let down. pictur...",0.036111


In [26]:
# Write the computed sentiment labels back to the reviews table.

# The statement is the same for every row, so build it once.
update_query = sql.SQL("""
    UPDATE public.google_play_reviews
    SET sentiment = %s
    WHERE review_id = %s
    """)

# (sentiment, review_id) pairs, in the same order as the placeholders above.
update_params = list(
    df_reviews[['sentiment', 'review_id']].itertuples(index=False, name=None)
)

# Open the connection
conn = connect()
try:
    cur = conn.cursor()
    try:
        cur.executemany(update_query, update_params)
    finally:
        cur.close()
    # Commit only after every row has been updated successfully.
    conn.commit()
except Exception:
    # Undo any partial update so the table is never left half-written.
    conn.rollback()
    raise
finally:
    # Close the connection whether the update succeeded or not.
    conn.close()

Connecting..
All good, Connection successful!
