Import libraries

# CSP650: PROJECT <br>
# TITLE: GOOGLE PLAY APPLICATION CLASSIFICATION BASED ON SENTIMENT ANALYSIS OF REVIEWS <br>
SUPERVISOR: MADAM UMMU FATIHAH BINTI MOHD BAHRIN <br>
SUPERVISEE: AQIL KHAIRY BIN HAMSANI (2021856342) <br>

In [4]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import nltk
import joblib

Load datasets

In [5]:
# Read the review export and keep a reproducible 20% random subset
# (the sample is reduced because of memory issues with the full file).
data = pd.read_csv("Reviews.csv").sample(frac=0.2, random_state=42)

Update stopwords library (if necessary)

In [6]:
# Fetch the NLTK stopwords corpus only when it is not installed yet, so the
# notebook runs top-to-bottom on a fresh environment without hand-editing this
# cell, and re-running it does not download the corpus again.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

Preprocess data

In [7]:
# English stopwords that are removed from every review.
stop = set(nltk.corpus.stopwords.words('english'))

# Keep the untouched text so it can be shown next to the predicted sentiment.
data['original_content'] = data['content']

# Normalise the text: missing reviews become '', everything is lowercased, and
# any character that is neither a word character nor whitespace is dropped.
# The pattern is a raw string so that \w and \s reach the regex engine verbatim
# instead of being treated as (invalid) Python string escapes.
data['content'] = (
    data['content']
    .fillna('')
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

# NOTE(review): punctuation is stripped before this filter runs, so contractions
# such as "don't" arrive here as "dont" and presumably no longer match the
# stopword list -- confirm this is intended.
data['content'] = data['content'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop])
)

Using TextBlob to classify the dataset

In [8]:
def get_sentiment(text):
    """Map a piece of text to a sentiment label using its TextBlob polarity.

    Returns 'negative' when the polarity is at most -0.1, 'positive' when it
    is at least 0.1, and 'neutral' for anything in between.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity <= -0.1:
        return 'negative'
    if polarity >= 0.1:
        return 'positive'
    return 'neutral'
        
# Label every cleaned review with get_sentiment.
# NOTE(review): the labels are computed on the stopword-stripped text; if the
# stopword list contains negations such as "not", phrases like "not good" are
# scored as "good" -- verify whether labelling original_content is preferable.
data['sentiment'] = data['content'].apply(get_sentiment)

Show some of the content and its sentiment after using TextBlob

In [9]:
# Preview up to five labelled reviews. The seed is fixed so the same rows are
# shown on every run, and the count is capped so a tiny dataset does not raise.
sample_data = data.sample(n=min(5, len(data)), random_state=42)
for _, row in sample_data.iterrows():
    print(f"Content: {row['original_content']}\nSentiment: {row['sentiment']}\n")

Content: Good
Sentiment: positive

Content: This app is most effective and really does help out alot if you do find beginners easy then do the exercise faster it really is effective (#AmNotBot)
Sentiment: positive

Content: Nice
Sentiment: positive

Content: It's a fun way to interact and play with others
Sentiment: positive

Content: very nice and effective app
Sentiment: positive



Split datasets for training and testing

In [10]:
# Hold out 10% of the cleaned reviews (and their sentiment labels) for testing;
# the seed is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data['content'],
    data['sentiment'],
    test_size=0.1,
    random_state=42,
)

Vectorize text into numerical format

In [11]:
def vectorize_text(text):
    """Turn a review into a 0/1 presence vector over the global `features` list.

    Element i of the returned list is 1 when features[i] occurs as a
    whitespace-separated token of `text`, and 0 otherwise.
    """
    # Build the token set once; the original rebuilt it for every feature word.
    tokens = set(text.split())
    return [int(word in tokens) for word in features]

# Vocabulary = every distinct token in the training reviews.
# It is sorted so the column order is the same on every run: the iteration order
# of a set of strings depends on per-process hash randomisation, which would
# otherwise reshuffle the feature columns (and therefore the saved features file
# and the fitted tree) after each kernel restart.
features = sorted({word for review in X_train for word in review.split()})

# One presence vector per review, stacked into 2-D arrays for the classifier.
X_train_vect = np.array([vectorize_text(x) for x in X_train])
X_test_vect = np.array([vectorize_text(x) for x in X_test])

Save features as a file

In [12]:
# Write the vocabulary list to disk; vectorize_text relies on this exact list
# (and its order) to build feature vectors.
joblib.dump(features, 'features.joblib')

['features.joblib']

Train with Decision Tree Algorithm (LIBRARY)

In [13]:
# Fit a decision tree on the 0/1 word-presence vectors; random_state is fixed
# so repeated fits on the same input give the same tree.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_vect, y_train)

Save model using Joblib

In [14]:
# Serialize the fitted classifier to disk with joblib.
joblib.dump(clf, 'decision_tree_model.joblib')

['decision_tree_model.joblib']

Load back model into new variable

In [15]:
# Read the saved classifier back into a separate variable; the evaluation and
# sample-prediction cells use this reloaded copy rather than clf.
# joblib.load unpickles the file, which can execute arbitrary code -- only load
# model files that come from a trusted source.
dt_model = joblib.load('decision_tree_model.joblib')

Evaluate the model

In [16]:
# Predict on the held-out vectors with the reloaded model and print per-class
# precision, recall and F1.
# The reference labels in y_test were produced by get_sentiment (TextBlob), so
# these scores measure agreement with TextBlob's labels.
y_pred = dt_model.predict(X_test_vect)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.75      0.78      0.76       235
     neutral       0.80      0.75      0.77       566
    positive       0.93      0.94      0.94      1486

    accuracy                           0.88      2287
   macro avg       0.82      0.82      0.82      2287
weighted avg       0.88      0.88      0.88      2287



Select 5 random test samples to display their prediction results

In [17]:
# Pick a handful of held-out reviews to inspect. A seeded generator makes the
# selection repeatable, and the count is capped at the size of the test set so
# sampling without replacement cannot fail on a small split.
n_samples = min(5, len(X_test))
random_indices = np.random.default_rng(42).choice(len(X_test), size=n_samples, replace=False)

random_X_test = X_test.iloc[random_indices]
random_y_test = y_test.iloc[random_indices]

# Predict sentiment for the selected samples
random_X_test_vect = np.array([vectorize_text(x) for x in random_X_test])
random_y_pred = dt_model.predict(random_X_test_vect)

# Print each selected (preprocessed) review with its actual and predicted label
for i, (review, actual, predicted) in enumerate(
    zip(random_X_test, random_y_test, random_y_pred), start=1
):
    print(f"Sample {i}:")
    print(f"Review: {review}")
    print(f"Actual Sentiment: {actual}")
    print(f"Predicted Sentiment: {predicted}\n")

Sample 1:
Review: good
Actual Sentiment: positive
Predicted Sentiment: positive

Sample 2:
Review: good app
Actual Sentiment: positive
Predicted Sentiment: positive

Sample 3:
Review: enjoying app love fact listen song search africas country songs ect except adso giving four star
Actual Sentiment: positive
Predicted Sentiment: positive

Sample 4:
Review: love love love app gets better better grade wish products offered bit extensive easier purchase phone
Actual Sentiment: positive
Predicted Sentiment: positive

Sample 5:
Review: innovative voice technology developments
Actual Sentiment: positive
Predicted Sentiment: neutral

