# Import libraries

In [1]:
# %pip install langdetect
# %pip install wordcloud

In [2]:
# Import standard libraries
import re
import string
import datetime as dt

# Import data manipulation libraries
import pandas as pd
import numpy as np

# Import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Import NLP libraries
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from langdetect import detect
import emoji
from textblob import TextBlob

# Import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input

# Set random seed for reproducibility
seed = 0
np.random.seed(seed)

# Suppress chained assignment warning
pd.options.mode.chained_assignment = None

2024-11-05 09:12:40.280754: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-05 09:12:40.283261: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-05 09:12:40.290055: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730772760.302158   18406 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730772760.305620   18406 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-05 09:12:40.318108: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

# Load dataset

In [3]:
df = pd.read_csv('../app_review.csv')

In [4]:
df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,9b1e9713-f88e-4547-be41-b87d840089cc,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"Great game, and I enjoy playing it, but now it...",2,1,2.6.0,2024-10-29 13:45:27,,,2.6.0
1,057d6353-31e5-4cf1-9a6d-a9947631bece,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,For me the game itself is very well-made in te...,5,0,2.6.0,2024-10-29 13:38:53,,,2.6.0
2,a78877c7-5ea4-4689-a72f-e8ead2ac8c3a,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,This game is as marvelous as Genshin Impact! T...,5,5,2.6.0,2024-10-29 13:31:21,,,2.6.0
3,59a0b3c8-1b2a-4877-84fb-c76b79b5a924,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Not fold friendly. Cant aim at enemies at the ...,3,0,2.6.0,2024-10-29 12:48:45,,,2.6.0
4,d135b884-683a-4daa-9b2e-d8ef29c69df4,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Other than storage this game is amazing I love...,5,0,,2024-10-29 11:12:16,,,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37018 entries, 0 to 37017
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              37018 non-null  object
 1   userName              37018 non-null  object
 2   userImage             37018 non-null  object
 3   content               37015 non-null  object
 4   score                 37018 non-null  int64 
 5   thumbsUpCount         37018 non-null  int64 
 6   reviewCreatedVersion  26993 non-null  object
 7   at                    37018 non-null  object
 8   replyContent          1191 non-null   object
 9   repliedAt             1191 non-null   object
 10  appVersion            26993 non-null  object
dtypes: int64(2), object(9)
memory usage: 3.1+ MB


In [6]:
df.shape

(37018, 11)

In [7]:
# clean_df[['content','sentiment']].head()

In [8]:
# clean_df.to_csv('clean_app_review.csv', index=False)

# Text Preprocessing

## Cleaning data

In [9]:
clean_df = df.dropna(subset=['content'])
clean_df = clean_df.drop_duplicates(subset=['content'])

clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33428 entries, 0 to 37017
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              33428 non-null  object
 1   userName              33428 non-null  object
 2   userImage             33428 non-null  object
 3   content               33428 non-null  object
 4   score                 33428 non-null  int64 
 5   thumbsUpCount         33428 non-null  int64 
 6   reviewCreatedVersion  24382 non-null  object
 7   at                    33428 non-null  object
 8   replyContent          1178 non-null   object
 9   repliedAt             1178 non-null   object
 10  appVersion            24382 non-null  object
dtypes: int64(2), object(9)
memory usage: 3.1+ MB


In [10]:
clean_df.shape

(33428, 11)

In [11]:
# Function to clean text
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    
    # Remove emojis
    text = emoji.replace_emoji(text, replace='')

    # Remove White Space
    text = text.strip()
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word, wordnet.VERB) for word in tokens]
    
    # Join tokens back to string
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text

# Apply the cleaning function to the 'content' column
clean_df['cleaned_content'] = clean_df['content'].apply(clean_text)

# Display the cleaned data
clean_df[['content', 'cleaned_content']].head()

Unnamed: 0,content,cleaned_content
0,"Great game, and I enjoy playing it, but now it...",great game enjoy play start crash often
1,For me the game itself is very well-made in te...,game wellmade term graphics audio gameplay rea...
2,This game is as marvelous as Genshin Impact! T...,game marvelous genshin impact character unique...
3,Not fold friendly. Cant aim at enemies at the ...,fold friendly cant aim enemies edge theres roo...
4,Other than storage this game is amazing I love...,storage game amaze love lot


In [12]:
clean_df['cleaned_content']

0                  great game enjoy play start crash often
1        game wellmade term graphics audio gameplay rea...
2        game marvelous genshin impact character unique...
3        fold friendly cant aim enemies edge theres roo...
4                              storage game amaze love lot
                               ...                        
37013                                                     
37014                                       beautiful game
37015                dont mind ill download give first rat
37016                                   soro soro jikan da
37017                                                  omg
Name: cleaned_content, Length: 33428, dtype: object

In [13]:
# check if there is any empty string in the cleaned_content column
clean_df[clean_df['cleaned_content'] == '']


Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,cleaned_content
18,6416510f-2821-4a9d-ad3f-27d4aa755e24,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,why not?,5,0,,2024-10-28 18:59:11,,,,
76,5ccf0f09-c19c-4a6c-b124-5bee5c689a6f,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,👍🏻,5,0,2.6.0,2024-10-26 19:08:16,,,2.6.0,
378,bfac5f03-8dfd-4880-89eb-41774ffc3473,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,💓,5,0,2.5.0,2024-10-23 07:10:59,,,2.5.0,
543,6d004706-2970-48e5-80ef-ac71a21b1d7f,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,👍👍,5,0,2.5.0,2024-10-19 06:44:43,,,2.5.0,
706,eb4d0582-7f86-47f8-943c-4401aad558bf,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,❤️❤️,5,0,2.5.0,2024-10-13 00:08:10,,,2.5.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
36687,d6df2861-6b03-452a-94ab-6897bec94108,Jeremiah Winfield,https://play-lh.googleusercontent.com/a-/ALV-U...,I am him,5,0,,2023-04-23 10:35:58,,,,
36840,b9900906-2b2c-4270-9c59-ce2e9a5daf2e,Rence Louis Dela Cruz,https://play-lh.googleusercontent.com/a-/ALV-U...,🏃🏃🏃,5,0,,2023-04-23 10:21:05,,,,
36934,946ba24d-a000-4090-aaa1-9e73664b6a0d,Katelyn Bxtterfly's,https://play-lh.googleusercontent.com/a/ACg8oc...,💖✨,5,0,,2023-04-23 10:15:13,,,,
36941,67c88a93-c6ff-4494-9dd1-2bf5c6d17acc,Deybu,https://play-lh.googleusercontent.com/a-/ALV-U...,🔛🔝,5,0,,2023-04-23 10:14:37,,,,


In [14]:
# drop rows with empty string in the cleaned_content column
clean_df = clean_df[clean_df['cleaned_content'] != '']

In [15]:
from multiprocessing import Pool

# Function to detect language
def detect_lang(text):
    try:
        return detect(text)
    except:
        return "error"

# Function to apply language detection in parallel
def parallel_detect_lang(texts):
    with Pool() as pool:
        return pool.map(detect_lang, texts)

# Apply the parallel language detection to the 'cleaned_content' column
clean_df['lang'] = parallel_detect_lang(clean_df['content'])


In [16]:
# Display the result
clean_df[['content', 'lang']].head()

Unnamed: 0,content,lang
0,"Great game, and I enjoy playing it, but now it...",en
1,For me the game itself is very well-made in te...,en
2,This game is as marvelous as Genshin Impact! T...,en
3,Not fold friendly. Cant aim at enemies at the ...,en
4,Other than storage this game is amazing I love...,en


In [17]:
# show the results which is not en
clean_df[clean_df['lang'] == 'en'][['cleaned_content', 'lang']]

Unnamed: 0,cleaned_content,lang
0,great game enjoy play start crash often,en
1,game wellmade term graphics audio gameplay rea...,en
2,game marvelous genshin impact character unique...,en
3,fold friendly cant aim enemies edge theres roo...,en
4,storage game amaze love lot,en
...,...,...
36998,ohhh shoot,en
37003,way play game,en
37008,iam excite play,en
37009,im ready live experience,en


In [18]:
# drop the rows which are not in English
clean_df = clean_df[clean_df['lang'] == 'en']

In [19]:
clean_df.count()

reviewId                27746
userName                27746
userImage               27746
content                 27746
score                   27746
thumbsUpCount           27746
reviewCreatedVersion    20493
at                      27746
replyContent             1105
repliedAt                1105
appVersion              20493
cleaned_content         27746
lang                    27746
dtype: int64

## Text Labelling

In [20]:
# from transformers import pipeline

# sentiment_pipeline = pipeline('sentiment-analysis', device="cuda")

# clean_df['sentiment'] = clean_df['content'].apply(lambda x: sentiment_pipeline(x)[0]['label'])

In [21]:
# labelling with textblob
from textblob import TextBlob

def sentiment_score(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

def sentiment_subjectivity(text):
    analysis = TextBlob(text)
    return analysis.sentiment.subjectivity

clean_df['sentiment_polarity'] = clean_df['cleaned_content'].apply(sentiment_score)
clean_df['sentiment_subjectivity'] = clean_df['cleaned_content'].apply(sentiment_subjectivity)

def sentiment_label(score):
    if score > 0:
        return 2
    elif score < 0:
        return 0
    else:
        return 1
    
clean_df['sentiment_label'] = clean_df['sentiment_polarity'].apply(sentiment_label)



In [22]:
clean_df[['cleaned_content', 'sentiment_label']]

Unnamed: 0,cleaned_content,sentiment_label
0,great game enjoy play start crash often,2
1,game wellmade term graphics audio gameplay rea...,2
2,game marvelous genshin impact character unique...,2
3,fold friendly cant aim enemies edge theres roo...,2
4,storage game amaze love lot,2
...,...,...
36998,ohhh shoot,1
37003,way play game,0
37008,iam excite play,1
37009,im ready live experience,2


In [23]:
# count the number of positive, negative and neutral reviews
clean_df['sentiment_label'].value_counts()

sentiment_label
2    15507
0     8647
1     3592
Name: count, dtype: int64

### label exploration

In [None]:
# Word Cloud of each sentiment
from wordcloud import WordCloud

def plot_wordcloud(sentiment):
    text = ' '.join(clean_df[clean_df['sentiment_label'] == sentiment]['cleaned_content'])
    wordcloud = WordCloud(width = 800, height = 400, 
                background_color ='white', 
                # stopwords = set(stopwords.words('english')), 
                min_font_size = 10).generate(text)
    plt.figure(figsize = (10, 8), facecolor = None) 
    plt.imshow(wordcloud) 
    plt.axis("off") 
    plt.tight_layout(pad = 0) 
    plt.title(f'{sentiment} Reviews')
    plt.show()
    
plot_wordcloud(2)
plot_wordcloud(0)
plot_wordcloud(1)

ValueError: We need at least 1 word to plot a word cloud, got 0.

In [None]:
# Pisahkan data menjadi fitur (tweet) dan label (sentimen)
X = clean_df['cleaned_content']
y = clean_df['sentiment_label']
 
# Ekstraksi fitur dengan TF-IDF
tfidf = TfidfVectorizer(max_features=200, min_df=17, max_df=0.8 )
X_tfidf = tfidf.fit_transform(X)
 
# Konversi hasil ekstraksi fitur menjadi dataframe
features_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())
 
# Bagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [None]:
# Menampilkan hasil ekstraksi fitur
features_df

Unnamed: 0,10,absolutely,account,actually,add,already,also,always,amaze,annoy,...,want,way,well,wish,without,wont,work,world,would,yet
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.352854,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.348669,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.448825,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
27742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.847504,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
27743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
27744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [None]:
from sklearn.ensemble import RandomForestClassifier
 
# Membuat objek model Random Forest
random_forest = RandomForestClassifier()
 
# Melatih model Random Forest pada data pelatihan
random_forest.fit(X_train.toarray(), y_train)
 
# Prediksi sentimen pada data pelatihan dan data uji
y_pred_train_rf = random_forest.predict(X_train.toarray())
y_pred_test_rf = random_forest.predict(X_test.toarray())
 
# Evaluasi akurasi model Random Forest
accuracy_train_rf = accuracy_score(y_pred_train_rf, y_train)
accuracy_test_rf = accuracy_score(y_pred_test_rf, y_test)
 
# Menampilkan akurasi
print('Random Forest - accuracy_train:', accuracy_train_rf)
print('Random Forest - accuracy_test:', accuracy_test_rf)

Random Forest - accuracy_train: 0.9758965579383673
Random Forest - accuracy_test: 0.8273873873873874
