Sentiment analysis is part of natural language processing (NLP) where the goal is to identify the subjectivity of text data. 
We will be analyzing amazon reviews and give a numerical rating (1, bad and 5, good). This is a multilabel classification problem.

In [1]:
#required librairies 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import gzip

### nltk python library for natural language processing 

import nltk
import string

####### scikit-learn is a python library for machine learning 

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
#load the data 

!wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz


--2021-10-01 08:47:50--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 495854086 (473M) [application/x-gzip]
Saving to: ‘reviews_Electronics_5.json.gz’


2021-10-01 08:52:01 (1.88 MB/s) - ‘reviews_Electronics_5.json.gz’ saved [495854086/495854086]



In [3]:
#unzip the file and store in pandas dataframe
data_amazon = []
with gzip.open('reviews_Electronics_5.json.gz') as f:
    for l in f:
        data_amazon.append(json.loads(l.strip()))
df = pd.DataFrame.from_dict(data_amazon)

In [4]:
#sample 
df.head(10)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AO94DHGC771SJ,528881469,amazdnu,"[0, 0]",We got this GPS for my husband who is an (OTR)...,5.0,Gotta have GPS!,1370131200,"06 2, 2013"
1,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010"
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010"
3,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2.0,"Great grafics, POOR GPS",1290556800,"11 24, 2010"
4,A24EV6RXELQZ63,528881469,Wayne Smith,"[0, 0]",I've had mine for a year and here's what we go...,1.0,"Major issues, only excuses for support",1317254400,"09 29, 2011"
5,A2JXAZZI9PHK9Z,594451647,"Billy G. Noland ""Bill Noland""","[3, 3]",I am using this with a Nook HD+. It works as d...,5.0,HDMI Nook adapter cable,1388707200,"01 3, 2014"
6,A2P5U7BDKKT7FW,594451647,Christian,"[0, 0]",The cable is very wobbly and sometimes disconn...,2.0,Cheap proprietary scam,1398556800,"04 27, 2014"
7,AAZ084UMH8VZ2,594451647,"D. L. Brown ""A Knower Of Good Things""","[0, 0]",This adaptor is real easy to setup and use rig...,5.0,A Perfdect Nook HD+ hook up,1399161600,"05 4, 2014"
8,AEZ3CR6BKIROJ,594451647,Mark Dietter,"[0, 0]",This adapter easily connects my Nook HD 7&#34;...,4.0,A nice easy to use accessory.,1405036800,"07 11, 2014"
9,A3BY5KCNQZXV5U,594451647,Matenai,"[3, 3]",This product really works great but I found th...,5.0,This works great but read the details...,1390176000,"01 20, 2014"


In [5]:
#take only rating and text 

df_overall_text = df[['reviewText','overall']]

df_overall_text.head(5)

Unnamed: 0,reviewText,overall
0,We got this GPS for my husband who is an (OTR)...,5.0
1,"I'm a professional OTR truck driver, and I bou...",1.0
2,"Well, what can I say. I've had this unit in m...",3.0
3,"Not going to write a long review, even thought...",2.0
4,I've had mine for a year and here's what we go...,1.0


In [6]:
#overall description 

df_overall_text.describe()

Unnamed: 0,overall
count,1689188.0
mean,4.222779
std,1.185632
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [7]:
#missing and nan values

df_overall_text.isna().sum()

reviewText    0
overall       0
dtype: int64

In [8]:
df_overall_text.count()

reviewText    1689188
overall       1689188
dtype: int64

In [9]:
#drop rows with missing values
df_overall_text.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
#count unique ratings unbalanced ??
df_overall_text.overall.value_counts()

sample_size = df_overall_text.overall.value_counts()
print(sample_size)

5.0    1009026
4.0     347041
3.0     142257
1.0     108725
2.0      82139
Name: overall, dtype: int64


In [11]:
#downsamling to be as the least represented rating

df_equal_proportion = pd.DataFrame()
for i in df_overall_text.overall.unique():
  X = df_overall_text[df_overall_text.overall == i].sample(82000)
  df_equal_proportion = df_equal_proportion.append(X)

In [12]:
df_equal_proportion['overall'].value_counts()

3.0    82000
5.0    82000
4.0    82000
2.0    82000
1.0    82000
Name: overall, dtype: int64

In [13]:
#lets load ntlk libraries that we will use to preprocess the text
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet, stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [14]:
stopwords_list = stopwords.words('english')

def ReviewProcessing(df):
  # remove non alphanumeric 
  df['review_non_alphanumeric'] = df.reviewText.str.replace('[^a-zA-Z0-9 ]', '')
  # lowercase
  df.review_non_alphanumeric = df.review_non_alphanumeric.str.lower()
  # split into list
  df.review_non_alphanumeric = df.review_non_alphanumeric.str.split(' ')
  # remove stopwords
  df.review_non_alphanumeric = df.review_non_alphanumeric.apply(lambda x: [item for item in x if item not in stopwords_list])
  return df

In [15]:
#identify and cut down the inflectional forms into a common base word.
def get_wordnet_pos(word):
  tag = nltk.pos_tag([word])[0][1][0].upper()
  tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

  return tag_dict.get(tag, wordnet.NOUN)
#lemmatizing
lemmatizer = nltk.stem.WordNetLemmatizer()
def get_lemmatize(sent):
  return " ".join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sent)])


In [16]:
clean_data = ReviewProcessing(df_equal_proportion)
clean_data.review_non_alphanumeric = clean_data.review_non_alphanumeric.apply(' '.join)
clean_data['review_cleaned_lemmatized'] = clean_data.review_non_alphanumeric.apply(get_lemmatize)

In [17]:
clean_data.head(10)

Unnamed: 0,reviewText,overall,review_non_alphanumeric,review_cleaned_lemmatized
240062,This is by far the best FM transmitter I have ...,5.0,far best fm transmitter ever usedit easy use s...,far best fm transmitter ever usedit easy use s...
555796,I bought a Garmin Nuvi 350 a while back (paid ...,5.0,bought garmin nuvi 350 back paid around 800 fi...,bought garmin nuvi 350 back paid around 800 fi...
63529,Used these more than once in older cars as a r...,5.0,used older cars replacement new set easy inst...,use old car replacement new set easy install i...
67826,Just got theSony VMC15FS A/V Cable for most So...,5.0,got thesony vmc15fs av cable sony minidv dvd ...,get thesony vmc15fs av cable sony minidv dvd c...
1495131,This is just cool. It's actually made by SONY ...,5.0,cool actually made sony customized camera woul...,cool actually make sony customize camera would...
788689,MyNikon D80 10.2MP Digital SLR Camera (Body on...,5.0,mynikon d80 102mp digital slr camera body only...,mynikon d80 102mp digital slr camera body only...
595624,,5.0,,
1117608,The Canadian Kindle Keyboard did not come with...,5.0,canadian kindle keyboard come handy adapter u...,canadian kindle keyboard come handy adapter u ...
665952,It's the cheapest charger i ever had and it wo...,5.0,cheapest charger ever works really liked compl...,cheapest charger ever work really like complai...
695756,I've bought five of these over the last year. ...,5.0,ive bought five last year far better deal walm...,ive bought five last year far well deal walmar...


In [18]:
#save the file to csv 
clean_data.to_csv('amazon_review_lema.csv')

In [19]:
#lets create a pipeline that will vectorize the data as ingrams of two and then uses term frequency-inverse doccument frequency to represent the text numerically
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression


nb = Pipeline([('vectorize', CountVectorizer(ngram_range=(1, 2))),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])


sgd = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier()),
               ])




logreg = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(max_iter=500)),
               ])

In [20]:
x = clean_data['review_cleaned_lemmatized']
y = clean_data['overall']
X_train, X_test, y_train, y_test = train_test_split(x, y, 
                                                    test_size=0.2, stratify=y, random_state = 44)

In [None]:

# Naive Bayes
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
print(accuracy_score(y_test, y_pred_nb))
print(confusion_matrix(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

# SGD Classifier
sgd.fit(X_train, y_train)
y_pred_sgd = sgd.predict(X_test)
print(accuracy_score(y_test, y_pred_sgd))
print(confusion_matrix(y_test, y_pred_sgd))
print(classification_report(y_test, y_pred_sgd))

# Logistic Regression
logreg.fit(X_train, y_train)
y_pred_log = logreg.predict(X_test)
print(accuracy_score(y_test, y_pred_log))
print(confusion_matrix(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))

0.464
[[9468 4960 1289  518  165]
 [3948 7150 3567 1528  207]
 [1869 4010 5431 4542  548]
 [ 829 1561 2459 9638 1913]
 [ 883 1009 1002 7145 6361]]
              precision    recall  f1-score   support

         1.0       0.56      0.58      0.57     16400
         2.0       0.38      0.44      0.41     16400
         3.0       0.40      0.33      0.36     16400
         4.0       0.41      0.59      0.48     16400
         5.0       0.69      0.39      0.50     16400

    accuracy                           0.46     82000
   macro avg       0.49      0.46      0.46     82000
weighted avg       0.49      0.46      0.46     82000

0.47324390243902437
[[13422   993   701   337   947]
 [ 8227  2917  2388  1163  1705]
 [ 3630  1945  4231  3375  3219]
 [ 1255   577  1726  5142  7700]
 [ 1003   258   539  1506 13094]]
              precision    recall  f1-score   support

         1.0       0.49      0.82      0.61     16400
         2.0       0.44      0.18      0.25     16400
         3.0   

In [None]:
#can we improve it better with tunning the hyperparameters gridsearch 


from sklearn.model_selection import GridSearchCV

grid=[{'clf__solver': ['lbfgs', 'sag', 'saga'],
       'clf__C': [0.01, 0.1, 1]}]
lr = GridSearchCV(logreg, param_grid = grid, cv = 5, scoring='accuracy', verbose = 1, n_jobs = -1)
best_model = lr.fit(X_train, y_train)

print(best_model.best_estimator_)
print(best_model.best_score_)

y_pred_grid = best_model.predict(X_test)
print(confusion_matrix(y_test, y_pred_grid))
print(classification_report(y_test, y_pred_grid))
print(accuracy_score(y_test, y_pred_grid))