In [1]:
#pip install nltk

In [2]:
import pandas as pd
import random
import string
import nltk
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk import classify
from nltk import NaiveBayesClassifier

# Data Preparation.

In [3]:
df = pd.read_csv("reviews_Digital_Music_5.json.gz.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,0,A3EBHHCZO6V2A4,5555991584,"Amaranth ""music fan""","[3, 3]","It's hard to believe ""Memory of Trees"" came ou...",5.0,Enya's last great album,1158019200,"09 12, 2006"
1,1,AZPWAXJG9OJXV,5555991584,bethtexas,"[0, 0]","A clasically-styled and introverted album, Mem...",5.0,Enya at her most elegant,991526400,"06 3, 2001"
2,2,A38IRL0X2T4DPF,5555991584,bob turnley,"[2, 2]",I never thought Enya would reach the sublime h...,5.0,The best so far,1058140800,"07 14, 2003"
3,3,A22IK3I6U76GX0,5555991584,Calle,"[1, 1]",This is the third review of an irish album I w...,5.0,Ireland produces good music.,957312000,"05 3, 2000"
4,4,A1AISPOIIHTHXX,5555991584,"Cloud ""...""","[1, 1]","Enya, despite being a successful recording art...",4.0,4.5; music to dream to,1200528000,"01 17, 2008"


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64706 entries, 0 to 64705
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      64706 non-null  int64  
 1   reviewerID      64706 non-null  object 
 2   asin            64706 non-null  object 
 3   reviewerName    64529 non-null  object 
 4   helpful         64706 non-null  object 
 5   reviewText      64705 non-null  object 
 6   overall         64706 non-null  float64
 7   summary         64706 non-null  object 
 8   unixReviewTime  64706 non-null  int64  
 9   reviewTime      64706 non-null  object 
dtypes: float64(1), int64(2), object(7)
memory usage: 4.9+ MB


Drop all columns that we don't need in the analysis.

In [5]:
df.drop(labels=['Unnamed: 0', 'reviewerID', 'reviewerName', 'helpful', 'unixReviewTime', 'reviewTime'],
        axis='columns', inplace=True)

In [6]:
df.head()

Unnamed: 0,asin,reviewText,overall,summary
0,5555991584,"It's hard to believe ""Memory of Trees"" came ou...",5.0,Enya's last great album
1,5555991584,"A clasically-styled and introverted album, Mem...",5.0,Enya at her most elegant
2,5555991584,I never thought Enya would reach the sublime h...,5.0,The best so far
3,5555991584,This is the third review of an irish album I w...,5.0,Ireland produces good music.
4,5555991584,"Enya, despite being a successful recording art...",4.0,4.5; music to dream to


The overall score is from 1 to 5; we should check the percentage of each score in the dataset.

In [7]:
df['overall'].value_counts(normalize=True)

5.0    0.549872
4.0    0.255556
3.0    0.104921
2.0    0.046518
1.0    0.043134
Name: overall, dtype: float64

Print out the review text of the first review.

In [8]:
df.iloc[0,1]

'It\'s hard to believe "Memory of Trees" came out 11 years ago;it has held up well over the passage of time.It\'s Enya\'s last great album before the New Age/pop of "Amarantine" and "Day without rain." Back in 1995,Enya still had her creative spark,her own voice.I agree with the reviewer who said that this is her saddest album;it is melancholy,bittersweet,from the opening title song."Memory of Trees" is elegaic&majestic.;"Pax Deorum" sounds like it is from a Requiem Mass,it is a dark threnody.Unlike the reviewer who said that this has a "disconcerting" blend of spirituality&sensuality;,I don\'t find it disconcerting at all."Anywhere is" is a hopeful song,looking to possibilities."Hope has a place" is about love,but it is up to the listener to decide if it is romantic,platonic,etc.I\'ve always had a soft spot for this song."On my way home" is a triumphant ending about return.This is truly a masterpiece of New Age music,a must for any Enya fan!'

Print out the summary text of the first review.

In [9]:
df.iloc[0,3]

"Enya's last great album"

Combine the review text and summary into one text.

In [10]:
df['reviewText'] = df['reviewText'] + ' ' + df['summary']
df.drop(labels=['summary'], axis='columns', inplace=True)
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64705 entries, 0 to 64705
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   asin        64705 non-null  object 
 1   reviewText  64705 non-null  object 
 2   overall     64705 non-null  float64
dtypes: float64(1), object(2)
memory usage: 2.0+ MB


We label the sentiment for each review in the dataset. For the overall score of 4 or 5, the sentiment is 'positive'. For the overall score of 2 or 1, the sentiment is 'negative'; if overall score equals to 3, the sentiment is 'neutral'.

In [11]:
df.loc[df['overall'] >= 4, 'sentiment'] = 'positive'
df.loc[df['overall'] <= 2, 'sentiment'] = 'negative'
df.loc[df['overall'] == 3, 'sentiment'] = 'neutral'

We do not need the neutral sentiment in the analysis because opinions of some people may incline toward positive sentiment but they still rate the product at 3. It is also the same in case of negative-inclined opinions. So neutral sentiment is quite ambiguous.

In [12]:
train_dataset = df[df['sentiment'] != 'neutral']

In [13]:
train_dataset['sentiment'].value_counts()

positive    52116
negative     5801
Name: sentiment, dtype: int64

There is imbalance between two labels, the negative sentiment are equivalent to 10% of the positive sentiment. We will use the lesser sentiment as the number of samples.

In [14]:
sample_number = min(train_dataset['sentiment'].value_counts())
sample_number

5801

Sampling for both labels.

In [15]:
train_dataset = train_dataset.groupby('sentiment').apply(lambda x: x.sample(n=sample_number)).reset_index(drop=True)
train_dataset['sentiment'].value_counts()

negative    5801
positive    5801
Name: sentiment, dtype: int64

In [16]:
train_dataset.head()

Unnamed: 0,asin,reviewText,overall,sentiment
0,B000003B6J,This piece of ignorant trash single-handedly r...,1.0,negative
1,B000002IWQ,All they can do with Rush's recordings is play...,1.0,negative
2,B000002OQ3,Jodeci was one of the most popular R&B; groups...,2.0,negative
3,B00008V61C,Ok I'm a big Jay-Z fan but not that big of a f...,2.0,negative
4,B000002GXX,"It's been a while since I bashed the Eagles, s...",1.0,negative


# Build a Classification model using Bag-of-words method.

This will be done by the nltk library and it requires to process all the reviews into a "bag-of-words" before applying classification. To create the word bag, first we have to combine all review into one large string then we tokenize that large string to a list of words - means the bag-of-words.

Create the Positive bag-of-words.

In [17]:
#Create a list of all positive reviews.
positive_reviews = train_dataset.loc[train_dataset['sentiment'] == 'positive']
positive_reviews = positive_reviews['reviewText'].tolist()

#Lower all the words to eliminate duplicates.
positive_reviews = [positive_review.lower() for positive_review in positive_reviews]

#Join all reviews into one large string.
positive_reviews = ' '.join([str(positive_review) for positive_review in positive_reviews])

In [18]:
print(f"Datatype of positive_reviews is {type(positive_reviews)} and it has {len(positive_reviews)} characters!")

Datatype of positive_reviews is <class 'str'> and it has 6799576 characters!


Our bag-of-words does not include stop words in English, punctuations, and whitespaces.

In [19]:
#Create a set of stop words and punctuation.
stop_words = set(stopwords.words('english') + list(string.punctuation))

In [20]:
#Tokenize the list of positive words using whitespace method. It mean that we will separate the words by the whitespces, tabs, or new line.
positive_words = WhitespaceTokenizer().tokenize(positive_reviews)

#Remove stop words and punctuations.
positive_words = [positive_word for positive_word in positive_words if positive_word not in stop_words]

#Remove punctuations that may be connected with the words.
positive_words = [positive_word.strip(string.punctuation) for positive_word in positive_words]

View the most 20 common positive words.

In [21]:
positive_word_frequency = nltk.FreqDist(positive_words)
positive_word_frequency.most_common(20)

[('album', 12594),
 ('song', 6564),
 ('one', 6368),
 ('like', 5809),
 ('songs', 5167),
 ('great', 4148),
 ('music', 4022),
 ('good', 3688),
 ('best', 3544),
 ('cd', 3406),
 ('love', 3222),
 ('first', 2660),
 ('track', 2538),
 ('time', 2478),
 ('really', 2399),
 ('still', 2228),
 ('get', 2177),
 ('it', 2149),
 ('sound', 2115),
 ('would', 2004)]

Create the Negative bag-of-words similarly to creating the Positive bag-of-words.

In [22]:
negative_reviews = train_dataset.loc[train_dataset['sentiment'] == 'negative']
negative_reviews = negative_reviews['reviewText'].tolist()
negative_reviews = [negative_review.lower() for negative_review in negative_reviews]
negative_reviews = ' '.join([str(negative_review) for negative_review in negative_reviews])
negative_words = WhitespaceTokenizer().tokenize(negative_reviews)
negative_words = [negative_word for negative_word in negative_words if negative_word not in stop_words]
negative_words = [negative_word.strip(string.punctuation) for negative_word in negative_words]

In [23]:
negative_word_frequency = nltk.FreqDist(negative_words)
negative_word_frequency.most_common(20)

[('album', 9085),
 ('like', 6279),
 ('songs', 3980),
 ('one', 3912),
 ('song', 3638),
 ('good', 3331),
 ('music', 3069),
 ('cd', 2757),
 ('really', 2377),
 ('even', 2353),
 ('get', 2339),
 ('it', 2121),
 ('would', 1860),
 ('much', 1829),
 ('better', 1665),
 ('first', 1635),
 ('time', 1610),
 ('sound', 1560),
 ('bad', 1457),
 ("i'm", 1440)]

In [24]:
print("Number of Positive word:", len(positive_words))
print("Number of Negative word:", len(negative_words))


Number of Positive word: 668572
Number of Negative word: 504185


More specifically, we will use Naive Bayes Classifier of nltk library. This classifier requires the inputs have to be in the set format (feature, label). More details can be found in nltk documentation.

This function will transform a word in the bag to the required set format (feature, label).

In [25]:
def word_features(words):
    return dict([(word, True) for word in words.split()])

Create the list of (feature, label) sets for the positive and negative bag-of-words.

In [26]:
positive_features = [(word_features(positive_word), 'positive') for positive_word in positive_words]
negative_features = [(word_features(negative_word), 'negative') for negative_word in negative_words]


Final format of the positive and negative bag-of-words.

In [27]:
positive_features[:10]

[({'john': True}, 'positive'),
 ({'denver': True}, 'positive'),
 ({'farewell': True}, 'positive'),
 ({'andromeda': True}, 'positive'),
 ({'true': True}, 'positive'),
 ({'huge': True}, 'positive'),
 ({'hits': True}, 'positive'),
 ({'album': True}, 'positive'),
 ({'nevertheless': True}, 'positive'),
 ({'john': True}, 'positive')]

In [28]:
negative_features[:10]

[({'piece': True}, 'negative'),
 ({'ignorant': True}, 'negative'),
 ({'trash': True}, 'negative'),
 ({'single-handedly': True}, 'negative'),
 ({'ruined': True}, 'negative'),
 ({'rap': True}, 'negative'),
 ({'music': True}, 'negative'),
 ({'misled': True}, 'negative'),
 ({'generation': True}, 'negative'),
 ({'destroyed': True}, 'negative')]

Combine the positive and negative bag-of-words into one big bag then shuffle all the words inside.

In [29]:
feature_label_pairs = positive_features + negative_features
random.shuffle(feature_label_pairs)

Split "the big bag" into train and test dataset with the .8 training split ratio.

In [30]:
train_ratio = 0.8
split_point = int(len(feature_label_pairs)*train_ratio)
train_set, test_set = feature_label_pairs[:split_point], feature_label_pairs[split_point:]

Train and test the model. Accuracy is used to evaluated model's performance.

In [31]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

accuracy = nltk.classify.accuracy(classifier, test_set)
accuracy = round(accuracy*100, 2)
print(f"Accuracy rate of Naive Bayes Classification model: {accuracy}%")

Accuracy rate of Naive Bayes Classification model: 61.79%
