Import libraries

# CSP650: PROJECT <br>
# TITLE: GOOGLE PLAY APPLICATION CLASSIFICATION BASED ON SENTIMENT ANALYSIS OF REVIEWS <br>
SUPERVISOR: MADAM UMMU FATIHAH BINTI MOHD BAHRIN <br>
SUPERVISEE: AQIL KHAIRY BIN HAMSANI (2021856342) <br>

In [1]:
import pandas as pd
import numpy as np
#from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import nltk
import joblib
import re

Load datasets

In [2]:
data = pd.read_csv("Reviews.csv")
data = data.sample(frac=0.05, random_state=42) #reduce samples (memory issues)

Update nltk library (if necessary)

In [3]:
#nltk.download('stopwords') #uncomment this to update stopwords package
#nltk.download('vader_lexicon') #uncomment this to update lexicon package


Preprocess data

In [4]:
stop = set(nltk.corpus.stopwords.words('english'))
data['original_content'] = data['content']
data['content'] = data['content'].fillna('')
data['content'] = data['content'].str.lower()
data['content'] = data['content'].str.replace('[^\w\s]','', regex=True)
data['content'] = data['content'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

#ADDITIONAL PREPROCESS

In [5]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Replace "not" and "no" with "not_" and "no_" respectively
    text = re.sub(r'\bnot\b', 'not_', text)
    text = re.sub(r'\bno\b', 'no_', text)

    return text
data['content'] = data['content'].apply(preprocess_text)

Using TextBlob to classify the dataset

In [6]:
# def get_sentiment(text):
#     blob = TextBlob(text)
#     sentiment = blob.sentiment.polarity
#     if sentiment >= 0.1:
#         return 'positive'
#     elif sentiment <= -0.1:
#         return 'negative'
#     else:
#         return 'neutral'
    
# data['sentiment'] = data['content'].apply(lambda x: get_sentiment(x))

In [7]:
def get_sentiment(text):
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)

    if sentiment_scores['compound'] >= 0.1:
        return 'positive'
    elif sentiment_scores['compound'] <= -0.1:
        return 'negative'
    else:
        return 'neutral'
        
data['sentiment'] = data['content'].apply(lambda x: get_sentiment(x))

Show some of the content and it's sentiment after using TextBlob

In [8]:
sample_data = data.sample(n=5)
for i, row in sample_data.iterrows():
    print(f"Content: {row['original_content']}\nSentiment: {row['sentiment']}\n")

Content: Good app
Sentiment: positive

Content: Excelent cbr reader, fast and smart, good reading tools, better app for for 
comics i found. Really no complains.
Sentiment: positive

Content: Amazing app
Sentiment: positive

Content: A friend recommended the app to me. I particularly like the clarity and the message function.
Sentiment: positive

Content: I couldn't drag the landmark to the desired point. It was frustrating. Pls 
fix it
Sentiment: negative



Split datasets for training and testing

In [9]:
X_train, X_test, y_train, y_test = train_test_split(data['content'], data['sentiment'], test_size=0.1, random_state=42)

Vectorize text into numerical format

In [10]:
def vectorize_text(text):
    return [int(word in set(text.split())) for word in features]

features = list(set(' '.join(X_train).split()))
X_train_vect = np.array([vectorize_text(x) for x in X_train])
X_test_vect = np.array([vectorize_text(x) for x in X_test])

Save features as a file

In [11]:
joblib.dump(features, 'features.joblib')

['features.joblib']

Define parameter grid to find the best max_depth for the tree

In [12]:
param_grid = {
    'max_depth': [None, 50, 100, 200]
}

Train the Decision Tree model using the best max_depth found by grid search

In [13]:
clf = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train_vect, y_train)
clf = grid_search.best_estimator_

tree_depth = clf.tree_.max_depth
print("Depth of the decision tree:", tree_depth)

Depth of the decision tree: 193


Print each and the best depth for the model

In [14]:
for params in grid_search.cv_results_['params']:
    depth = params['max_depth']
    score = grid_search.cv_results_['mean_test_score'][grid_search.cv_results_['params'].index(params)]
    print(f"Depth: {depth}, Mean Test Score: {score}")

print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)


Depth: None, Mean Test Score: 0.8215743440233234
Depth: 50, Mean Test Score: 0.8038872691933916
Depth: 100, Mean Test Score: 0.8184645286686102
Depth: 200, Mean Test Score: 0.8215743440233234
Best Parameters:  {'max_depth': None}
Best Score:  0.8215743440233234


Save model using Joblib

In [15]:
# Save the variable to file
joblib.dump(clf, 'decision_tree_model.joblib')

['decision_tree_model.joblib']

Load back model into new variable

In [16]:
dt_model = joblib.load('decision_tree_model.joblib')

Evaluate the model

In [17]:
y_pred = dt_model.predict(X_test_vect)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.56      0.43      0.48        75
     neutral       0.66      0.76      0.71        88
    positive       0.91      0.92      0.92       409

    accuracy                           0.83       572
   macro avg       0.71      0.70      0.70       572
weighted avg       0.83      0.83      0.83       572



Select 5 random testing to display the result of prediction

In [18]:
random_indices = np.random.choice(len(X_test), size=5, replace=False)

random_X_test = X_test.iloc[random_indices]
random_y_test = y_test.iloc[random_indices]

# Predict sentiment for the selected samples
random_X_test_vect = np.array([vectorize_text(x) for x in random_X_test])
random_y_pred = dt_model.predict(random_X_test_vect)

# Print the random samples and their predicted sentiment
for i in range(5):
    print(f"Sample {i+1}:")
    print(f"Review: {random_X_test.iloc[i]}")
    print(f"Actual Sentiment: {random_y_test.iloc[i]}")
    print(f"Predicted Sentiment: {random_y_pred[i]}\n")

Sample 1:
Review: gd appbut little bit dissatisfied adds
Actual Sentiment: negative
Predicted Sentiment: neutral

Sample 2:
Review: good story best
Actual Sentiment: positive
Predicted Sentiment: positive

Sample 3:
Review: love unlock characters run races
Actual Sentiment: positive
Predicted Sentiment: positive

Sample 4:
Review: really intersting apps whatever like tam tam
Actual Sentiment: positive
Predicted Sentiment: positive

Sample 5:
Review: love lt
Actual Sentiment: positive
Predicted Sentiment: positive



Single review test

In [19]:
input_arr = ["not bad"]
input_vect = np.array([vectorize_text(x) for x in input_arr])
input_pred = dt_model.predict(input_vect)

for i in range(len(input_pred)):
    print(f"Review: {input_arr[i]}")
    print(f"Sentiment: {input_pred[i]}")
    

Review: not bad
Sentiment: negative
