In [None]:
import pandas as pd

# install sentiment analyzer
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# set seed
import random
random.seed(0)

# for metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

## https://datasetsearch.research.google.com/search?query=amazon%20reviews%20dataset%20polarity&docid=p7MRKNwJKo4PvhOzAAAAAA%3D%3D



In [None]:
## download the dataset for analysis
!wget --no-check-certificate \
    'https://s3.amazonaws.com/fast-ai-nlp/amazon_review_polarity_csv.tgz' \
    -O './amazon_review_polarity_csv.tgz'

!tar -xzvf './amazon_review_polarity_csv.tgz'

--2020-12-13 03:22:03--  https://s3.amazonaws.com/fast-ai-nlp/amazon_review_polarity_csv.tgz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.105.62
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.105.62|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 688339454 (656M) [application/x-tar]
Saving to: ‘./amazon_review_polarity_csv.tgz’


2020-12-13 03:22:46 (15.6 MB/s) - ‘./amazon_review_polarity_csv.tgz’ saved [688339454/688339454]

amazon_review_polarity_csv/
amazon_review_polarity_csv/train.csv
amazon_review_polarity_csv/readme.txt
amazon_review_polarity_csv/test.csv


In [None]:
# read the dataset, name the columns and add one column for predicted sentiment
%%time
train = pd.read_csv('amazon_review_polarity_csv/train.csv', header=None)
test = pd.read_csv('amazon_review_polarity_csv/test.csv', header=None)

train.columns = ['label', 'review_title', 'review_text']
test.columns = ['label', 'review_title', 'review_text']

train.loc[:, 'sentiment_predicted'] = None
test.loc[:, 'sentiment_predicted'] = None

print(f'Train shape: {train.shape} - Test shape: {test.shape}')

train.drop(['review_title'], axis=1, inplace=True)
test.drop(['review_title'], axis=1, inplace=True)

Train shape: (3600000, 4) - Test shape: (400000, 4)
CPU times: user 20.6 s, sys: 1.91 s, total: 22.5 s
Wall time: 22.6 s


In [None]:
# Preview the first ten training rows (label, review text, empty prediction column).
train.head(10)

Unnamed: 0,label,review_text,sentiment_predicted
0,2,This sound track was beautiful! It paints the ...,
1,2,I'm reading a lot of reviews saying that this ...,
2,2,This soundtrack is my favorite music of all ti...,
3,2,I truly like this soundtrack and I enjoy video...,
4,2,"If you've played the game, you know how divine...",
5,2,I am quite sure any of you actually taking the...,
6,1,"This is a self-published book, and if you want...",
7,2,I loved Whisper of the wicked saints. The stor...,
8,2,I just finished reading Whisper of the Wicked ...,
9,2,This was a easy to read book that made me want...,


In [None]:
## Display the full text of a negative review: position 6, i.e. the seventh row.
train["review_text"].iloc[6]

'This is a self-published book, and if you want to know why--read a few paragraphs! Those 5 star reviews must have been written by Ms. Haddon\'s family and friends--or perhaps, by herself! I can\'t imagine anyone reading the whole thing--I spent an evening with the book and a friend and we were in hysterics reading bits and pieces of it to one another. It is most definitely bad enough to be entered into some kind of a "worst book" contest. I can\'t believe Amazon even sells this kind of thing. Maybe I can offer them my 8th grade term paper on "To Kill a Mockingbird"--a book I am quite sure Ms. Haddon never heard of. Anyway, unless you are in a mood to send a book to someone as a joke---stay far, far away from this one!'

In [None]:
# One shared VADER analyzer instance, created once and reused by every call.
analyser = SentimentIntensityAnalyzer()


def sentiment_analyzer_scores(sentence):
    """Return the VADER polarity scores for ``sentence``.

    Args:
        sentence: Text to score.

    Returns:
        The value produced by ``analyser.polarity_scores(sentence)``, a
        mapping with the keys ``'neg'``, ``'neu'``, ``'pos'`` and
        ``'compound'``.
    """
    return analyser.polarity_scores(sentence)

In [None]:
%%time
sentiment_analyzer_scores(train.iloc[6]["review_text"])

CPU times: user 2.34 ms, sys: 990 µs, total: 3.33 ms
Wall time: 3.66 ms


{'compound': -0.8653, 'neg': 0.114, 'neu': 0.821, 'pos': 0.065}

In [None]:
%%time
sentiment_analyzer_scores(train.iloc[5]["review_text"])

CPU times: user 4.42 ms, sys: 0 ns, total: 4.42 ms
Wall time: 5.47 ms


{'compound': 0.9886, 'neg': 0.014, 'neu': 0.748, 'pos': 0.238}

In [None]:
%%time

test = test.drop(list(range(50000, len(test))))
print(len(test))
for i in range(len(test)):
    test.loc[i, 'sentiment_predicted'] = sentiment_analyzer_scores(test.iloc[i]["review_text"])['compound']
    #for i in range(50):
    test.loc[i, "sentiment_predicted"] = 1 if test.iloc[i]["sentiment_predicted"] < 0 else 2


50000
CPU times: user 2min 42s, sys: 1.18 s, total: 2min 43s
Wall time: 2min 44s


In [None]:
test.head()

Unnamed: 0,label,review_text,sentiment_predicted
0,2,My lovely Pat has one of the GREAT voices of h...,2
1,2,Despite the fact that I have only played a sma...,2
2,1,I bought this charger in Jul 2003 and it worke...,2
3,2,Check out Maha Energy's website. Their Powerex...,2
4,2,Reviewed quite a bit of the combo players and ...,2


In [None]:
# Precision for label 1 (the negative class); pos_label=1 is scikit-learn's
# default and is spelled out here so the reported class is unambiguous.
print(
    precision_score(
        y_true=test["label"].tolist(),
        y_pred=test["sentiment_predicted"].tolist(),
        pos_label=1,
    )
)

0.8458743376230129


In [None]:
# Recall for label 1 (the negative class); pos_label=1 is scikit-learn's
# default and is spelled out here so the reported class is unambiguous.
print(
    recall_score(
        y_true=test["label"].tolist(),
        y_pred=test["sentiment_predicted"].tolist(),
        pos_label=1,
    )
)

0.45374807114431903


In [None]:
# Share of evaluated reviews whose predicted label equals the true label.
print(
    accuracy_score(
        y_true=test["label"].tolist(),
        y_pred=test["sentiment_predicted"].tolist(),
    )
)

0.69024


In [None]:
# Rows are true labels and columns are predicted labels, in sorted label order (1, 2).
print(
    confusion_matrix(
        y_true=test["label"].tolist(),
        y_pred=test["sentiment_predicted"].tolist(),
    )
)

[[11174 13452]
 [ 2036 23338]]


VADER Model:

```
[[11174 13452]
 [ 2036 23338]]
```

Precision:  0.84587

Recall:     0.45375

Accuracy:   0.69024