 # Sentiment Analysis of reviews Apps for Android from Amazon

In [1]:
pip install nltk



In [2]:
pip install tokenizers



In [3]:
import nltk
import gzip
import numpy as np
import pandas as pd
from textblob import Word
from nltk.corpus import stopwords
from transformers import BertTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate

In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

## Importing the data

In [5]:
path = "../reviews_Apps_for_Android.json.gz"

In [7]:
#Parsing the data
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF("drive/MyDrive/reviews_Apps_for_Android.json.gz")

In [8]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AUI0OLXAB3KKT,B004A9SDD8,A Customer,"[0, 0]",Glad to finally see this app on the android ma...,5.0,Great app!!!,1301184000.0,"03 27, 2011"
1,A1ZUSQ3TC3EC4C,B004A9SDD8,A. Lissak,"[12, 14]",this app works great on the Kindle Fire... kid...,5.0,Kid loves it,1321574000.0,"11 18, 2011"
2,AC05OAXD72X1V,B004A9SDD8,Allie,"[0, 0]",We love these monkey's and all the concepts th...,4.0,Love these monkeys!,1367366000.0,"05 1, 2013"
3,A2RVMFOKBVM21I,B004A9SDD8,Amazon Customer,"[0, 2]",cannot get my kindle away from my 2 year old g...,5.0,fun fun for toddlers,1350173000.0,"10 14, 2012"
4,A3NBSRGUWQGCMZ,B004A9SDD8,Amazon Customer,"[1, 3]",I start this app up whenever I forget what a f...,1.0,Might be great if it worked,1300838000.0,"03 23, 2011"


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2638173 entries, 0 to 2638172
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   reviewerID      object 
 1   asin            object 
 2   reviewerName    object 
 3   helpful         object 
 4   reviewText      object 
 5   overall         float64
 6   summary         object 
 7   unixReviewTime  float64
 8   reviewTime      object 
dtypes: float64(2), object(7)
memory usage: 201.3+ MB


## Text Preprocessing

In [10]:
# Normalizing Case Folding
df['reviewText'] = df['reviewText'].str.lower()

# Punctuations
df['reviewText'] = df['reviewText'].str.replace('[^\w\s]', '')

# Numbers
df['reviewText'] = df['reviewText'].str.replace('\d', '')

In [11]:
# Stopwords
sw = stopwords.words('english')
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join(x for x in str(x).split() if x not in sw))

In [12]:
# Rarewords
drops = pd.Series(' '.join(df['reviewText']).split()).value_counts()[-1000:]
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join(x for x in x.split() if x not in drops))

In [14]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize each word in the reviewText column
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in word_tokenize(x)]))

# Tokenize each review text and count the occurrences of each token
token_counts = {}

for text in df["reviewText"]:
    tokens = tokenizer.tokenize(text)
    for token in tokens:
        if token in token_counts:
            token_counts[token] += 1
        else:
            token_counts[token] = 1

# Convert the token counts dictionary to a DataFrame
tf = pd.DataFrame(token_counts.items(), columns=["words", "tf"])

print(tf.head())


     words      tf
0     glad   32320
1  finally   21972
2      see  122936
3      app  878593
4  android   41725


## Sentiment Analysis

In [15]:
sia = SentimentIntensityAnalyzer()

In [16]:
# Calculating sentiment scores for example sentence
sia.polarity_scores("This app is very bad, I dont recomend it at all!")

{'neg': 0.312, 'neu': 0.688, 'pos': 0.0, 'compound': -0.623}

In [17]:
df["reviewText"] = df["reviewText"].str.upper()

In [18]:
# Iterate and calculate compound polarity score over each text in "reviewText" column
scores = []
for text in df["reviewText"]:
    if isinstance(text, str):
        score = sia.polarity_scores(text)["compound"]
    else:
        score = 0.0
    scores.append(score)
# Add new column "polarity_score" to the dataset
df["polarity_score"] = scores

In [21]:
# Creating Target column and classify the values into Pos, Neg and Neu
sentiments = []
for text in df["reviewText"]:
    if isinstance(text, str):  # Check if the text is a string
        compound_score = sia.polarity_scores(text)["compound"]
        if compound_score > 0.5:
            sentiment = "pos"
        elif compound_score < -0.4:
            sentiment = "neg"
        else:
            sentiment = "neu"
    else:
        sentiment = "neu"  # Treat non-string values as neutral sentiment
    sentiments.append(sentiment)

# Add new column "target" to the dataset
df["target"] = sentiments

In [22]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,polarity_score,target
0,AUI0OLXAB3KKT,B004A9SDD8,A Customer,"[0, 0]",GLAD FINALLY SEE APP ANDROID MARKET . WIFE IPH...,5.0,Great app!!!,1301184000.0,"03 27, 2011",0.951,pos
1,A1ZUSQ3TC3EC4C,B004A9SDD8,A. Lissak,"[12, 14]",APP WORK GREAT KINDLE FIRE ... KID LOVE PLAY A...,5.0,Kid loves it,1321574000.0,"11 18, 2011",0.9547,pos
2,AC05OAXD72X1V,B004A9SDD8,Allie,"[0, 0]",LOVE MONKEY 'S CONCEPT LEARN WORKING THEM . LO...,4.0,Love these monkeys!,1367366000.0,"05 1, 2013",0.9652,pos
3,A2RVMFOKBVM21I,B004A9SDD8,Amazon Customer,"[0, 2]",CAN NOT GET KINDLE AWAY 2 YEAR OLD GRANDDAUGHT...,5.0,fun fun for toddlers,1350173000.0,"10 14, 2012",0.9037,pos
4,A3NBSRGUWQGCMZ,B004A9SDD8,Amazon Customer,"[1, 3]",START APP WHENEVER FORGET FORCE CLOSE PROMPT L...,1.0,Might be great if it worked,1300838000.0,"03 23, 2011",0.7598,pos


In [23]:
df["reviewText"].isnull().value_counts()

reviewText
False    2638173
Name: count, dtype: int64

In [24]:
# Encode the target variable
label_encoder = LabelEncoder()
df["target"] = label_encoder.fit_transform(df["target"])

In [25]:
df["target"].value_counts()

target
2    2004778
1     481608
0     151787
Name: count, dtype: int64

In [26]:
# Prepare feature matrix X and target vector y
X = df["reviewText"]
y = df["target"]

In [27]:
# Scaling the dataset
# Identify the indices of rows where y is 2
indices_to_drop = np.where(y == 2)[0]

# Calculate the number of rows to drop (60% of the total rows with y == 2)
num_rows_to_drop = int(0.67 * len(indices_to_drop))

# Randomly select 60% of the indices to drop
indices_to_drop = np.random.choice(indices_to_drop, size=num_rows_to_drop, replace=False)

# Drop the selected rows from X
X.drop(indices_to_drop, inplace=True)

# Also drop the corresponding rows from y
y = y.drop(indices_to_drop)

In [28]:
y.value_counts()

target
2    661577
1    481608
0    151787
Name: count, dtype: int64

In [29]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# Then apply TF-IDF vectorization
tf_idf_word_vectorizer = TfidfVectorizer()
X_train_tf_idf_word = tf_idf_word_vectorizer.fit_transform(X_train)
X_test_tf_idf_word = tf_idf_word_vectorizer.transform(X_test)

## Modeling

In [33]:
# Train a Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_tf_idf_word, y_train)

# Evaluate the model
nb_accuracy = nb_model.score(X_test_tf_idf_word, y_test)

print("Testing Accuracy: {:2f}%".format(nb_accuracy * 100))

Testing Accuracy: 65.755323%


In [34]:
# Initialize and train the logistic regression model
logr_model = LogisticRegression(max_iter=1000)
logr_model.fit(X_train_tf_idf_word, y_train)

# Evaluate the model on the test set
logr_accuracy = logr_model.score(X_test_tf_idf_word, y_test)

print("Testing Accuracy: {:2f}%".format(logr_accuracy * 100))

Testing Accuracy: 87.410182%


In [36]:
# Testing the best model on an example review
test_tex = pd.Series(["this product is great",
                        "look at that app very bad",
                        "it was good but I am sure that it fits me"])

# Transform new text data using the fitted TF-IDF vectorizer
test_tex_tfidf = tf_idf_word_vectorizer.transform(test_tex)

# Make predictions
predictions = logr_model.predict(test_tex_tfidf)

print("Predictions:", predictions)

Predictions: [2 0 1]


Obviously the Logistic Regression was better than the Naive Bayes model