Import libraries

# CSP650: PROJECT <br>
# TITLE: GOOGLE PLAY APPLICATION CLASSIFICATION BASED ON SENTIMENT ANALYSIS OF REVIEWS <br>
SUPERVISOR: MADAM UMMU FATIHAH BINTI MOHD BAHRIN <br>
SUPERVISEE: AQIL KHAIRY BIN HAMSANI (2021856342) <br>

In [47]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import nltk
import joblib
import re

Load datasets

In [48]:
# Read the review export and immediately keep a reproducible 10% subsample;
# the full dataset was reduced because of memory issues.
data = pd.read_csv("Reviews.csv").sample(frac=0.1, random_state=42)

Download the NLTK stopwords corpus (if necessary)

In [49]:
#nltk.download('stopwords') #uncomment this to update stopwords package


Preprocess data

In [50]:
# English stop words as a set so the per-word membership test below is cheap.
# NOTE: requires the NLTK 'stopwords' corpus to be available locally.
stop = set(nltk.corpus.stopwords.words('english'))

# Keep the untouched text so reviews can later be shown as originally written.
data['original_content'] = data['content']

# Normalise the text: missing reviews become '', everything is lower-cased, and
# every character that is neither a word character nor whitespace is removed.
# The pattern is a raw string so that \w and \s reach the regex engine verbatim
# instead of being parsed as (invalid) Python string escape sequences.
data['content'] = (
    data['content']
    .fillna('')
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

# Drop stop words; split/join also collapses runs of whitespace to single spaces.
data['content'] = data['content'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop)
)

Using TextBlob to classify the dataset

In [None]:
def get_sentiment(text, threshold=0.1):
    """Label *text* as 'positive', 'negative' or 'neutral' from its TextBlob polarity.

    Args:
        text: Review text to score.
        threshold: Polarity magnitude separating 'neutral' from the two polar
            classes. Defaults to 0.1, the cut-off that was previously hard-coded.

    Returns:
        'positive' if polarity >= threshold, 'negative' if polarity <= -threshold,
        otherwise 'neutral'.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity >= threshold:
        return 'positive'
    if polarity <= -threshold:
        return 'negative'
    return 'neutral'


# NOTE(review): labels are computed on the cleaned text, from which stop words
# have been removed; if that list contains negations such as "not", TextBlob
# may score "not good" like "good" -- confirm this is acceptable for labelling.
data['sentiment'] = data['content'].apply(get_sentiment)

Show some of the content and its sentiment after using TextBlob

In [53]:
# Draw five reviews at random and print the original text next to its label.
sample_data = data.sample(n=5)
for review_text, label in zip(sample_data['original_content'], sample_data['sentiment']):
    print(f"Content: {review_text}\nSentiment: {label}\n")

Content: awesome cool fun funny ridiculous funny
Sentiment: positive

Content: I Like the music 🎶 but give pop ok 👌
Sentiment: positive

Content: Buggy and there is ads
Sentiment: neutral

Content: Why did you put silver in Sonic Force's but not blaze if you could please tell me why if not okay whatever but it would be amazing if you could play as silver and I'm not saying Sonic or shadow are bad cheaters but I would love to play silver and blaze and one more thing is I think Sonic needs more charters well if you could please let me know why thank you egg Man well be back by Winter so be careful bye
Sentiment: positive

Content: Love it!
Sentiment: positive



Split datasets for training and testing

In [54]:
# Hold out 10% of the reviews for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['content'],
    data['sentiment'],
    test_size=0.1,
    random_state=42,
)

Vectorize text into numerical format

In [55]:
def vectorize_text(text, vocabulary=None):
    """Encode *text* as a binary bag-of-words vector.

    Args:
        text: Whitespace-separated review text.
        vocabulary: Ordered sequence of words that defines the vector
            positions. Defaults to the module-level ``features`` list, so
            existing one-argument calls keep working.

    Returns:
        A list of ints, one per vocabulary word: 1 if that word occurs as a
        token of *text*, otherwise 0.
    """
    if vocabulary is None:
        vocabulary = features
    # Build the token set once; previously it was rebuilt for every vocabulary word.
    tokens = set(text.split())
    return [int(word in tokens) for word in vocabulary]

# Vocabulary = every distinct token seen in the training reviews.
# Sorting gives a stable column order; a bare set's iteration order for strings
# can change between interpreter runs, which would make the feature matrix
# (and any model trained on it) differ from run to run.
features = sorted(set(' '.join(X_train).split()))

# The entries are only 0/1, so store them as uint8 rather than the default
# integer dtype to keep the dense matrices as small as possible.
X_train_vect = np.array([vectorize_text(x) for x in X_train], dtype=np.uint8)
X_test_vect = np.array([vectorize_text(x) for x in X_test], dtype=np.uint8)

Save features as a file

In [56]:
# Write the vocabulary list to disk under 'features.joblib'.
joblib.dump(value=features, filename='features.joblib')

['features.joblib']

Train the Decision Tree model

In [58]:
# Create the classifier with a fixed seed and fit it on the vectorised training
# reviews. Without the fit call the estimator saved and used for prediction
# below would be untrained and predict() would fail.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_vect, y_train)

Save model using Joblib

In [60]:
# Persist the classifier object to 'decision_tree_model.joblib'.
joblib.dump(value=clf, filename='decision_tree_model.joblib')

['decision_tree_model.joblib']

Load back model into new variable

In [61]:
# Read the saved classifier back from disk into a separate variable.
dt_model = joblib.load(filename='decision_tree_model.joblib')

Evaluate the model

In [62]:
# Predict labels for the held-out reviews and print per-class
# precision / recall / F1 against the reference labels.
y_pred = dt_model.predict(X_test_vect)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

    negative       0.58      0.51      0.54       120
     neutral       0.76      0.81      0.78       180
    positive       0.93      0.93      0.93       844

    accuracy                           0.87      1144
   macro avg       0.75      0.75      0.75      1144
weighted avg       0.86      0.87      0.86      1144



Select 5 random test samples and display their predicted sentiment

In [63]:
# Pick up to five distinct test positions at random. Capping at the test-set
# size avoids the ValueError that choice(..., replace=False) raises when asked
# for more items than exist.
n_samples = min(5, len(X_test))
random_indices = np.random.choice(len(X_test), size=n_samples, replace=False)

random_X_test = X_test.iloc[random_indices]
random_y_test = y_test.iloc[random_indices]

# Predict sentiment for the selected samples
random_X_test_vect = np.array([vectorize_text(x) for x in random_X_test])
random_y_pred = dt_model.predict(random_X_test_vect)

# Print the random samples and their predicted sentiment
for number, (review, actual, predicted) in enumerate(
    zip(random_X_test, random_y_test, random_y_pred), start=1
):
    print(f"Sample {number}:")
    print(f"Review: {review}")
    print(f"Actual Sentiment: {actual}")
    print(f"Predicted Sentiment: {predicted}\n")

Sample 1:
Review: good app easily
Actual Sentiment: positive
Predicted Sentiment: positive

Sample 2:
Review: quality songs low lot disturbance song please fix issue full version app song quality good lite version need fixed
Actual Sentiment: positive
Predicted Sentiment: positive

Sample 3:
Review: good game nc keep legendary mythic monster
Actual Sentiment: positive
Predicted Sentiment: positive

Sample 4:
Review: fantastic fair liked
Actual Sentiment: positive
Predicted Sentiment: positive

Sample 5:
Review: awesome app saved time cost moving around meetings
Actual Sentiment: positive
Predicted Sentiment: positive



Single review test

In [None]:
# Read one review from the user.
input_arr = [input('Enter review')]

# Apply the same cleaning that was applied to the training text (lower-case,
# strip punctuation, drop stop words). Without it, tokens such as "Great" or
# "app!" never match a vocabulary entry and are silently ignored by the model.
cleaned_arr = [
    ' '.join(
        word
        for word in re.sub(r'[^\w\s]', '', review.lower()).split()
        if word not in stop
    )
    for review in input_arr
]

input_vect = np.array([vectorize_text(x) for x in cleaned_arr])
input_pred = dt_model.predict(input_vect)

# Show the review exactly as typed next to the predicted label.
for review, sentiment in zip(input_arr, input_pred):
    print(f"Review: {review}")
    print(f"Sentiment: {sentiment}")
    