# Amazon Reviews Dataset
An Analysis by Andrew Bauman and Allison Nowakowski

Fall 2019

In [None]:
#Importing the Amazon Reviews Dataset for Pet Products in the US Marketplace
import itertools
import os
from urllib.request import urlretrieve

import matplotlib.pyplot as plt

def download(url, file):
    """Download `url` to the local path `file` unless that file already exists.

    The payload is first written to `file + ".part"` and only renamed to
    `file` once the transfer has finished. An interrupted download therefore
    never leaves a truncated `file` behind that the existence check would
    mistake for a complete one on the next run.

    Parameters:
        url: Address to fetch, passed to `urlretrieve`.
        file: Destination path as a string.

    Raises:
        Whatever `urlretrieve` raises when the transfer fails; the partial
        file is removed before the exception propagates.
    """
    if os.path.isfile(file):
        return

    print("Download file... " + file + " ...")
    partial = file + ".part"
    try:
        urlretrieve(url, partial)
        os.replace(partial, file)
    finally:
        # After a successful rename there is nothing left to clean up;
        # after a failure this removes the incomplete temporary file.
        if os.path.exists(partial):
            os.remove(partial)
    print("File downloaded")

# Fetch the gzipped TSV of US Pet Products reviews into the working directory;
# download() does nothing when a file with that name is already present.
download('https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Pet_Products_v1_00.tsv.gz','amazon_reviews_us_Pet_Products_v1_00.tsv.gz')
print("All the files are downloaded")

# Verifying that the file was downloaded by listing the working directory.
# This is an IPython shell escape, so it only runs inside a notebook kernel.
!ls 

In [None]:
#Creating a pandas dataframe with the data, skipping bad lines for now.
import pandas as pd

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is its replacement: malformed rows are still skipped
# and a warning is still emitted for each of them.
df = pd.read_csv('amazon_reviews_us_Pet_Products_v1_00.tsv.gz', compression='gzip', sep='\t', header=0, on_bad_lines='warn')
df.head()

In [None]:
# Previewing the data: print a summary of the loaded frame (columns and dtypes)
df.info()

In [None]:
# Cleaning/Simplifying the dataset
# Keep only the seven columns listed below and drop everything else.
# NOTE(review): the earlier comment said only verified purchases are kept, but
# no row filter on 'verified_purchase' is applied here; the column is merely retained.
df=df[['review_id', 'star_rating', 'verified_purchase', 'helpful_votes', 'total_votes', 'review_headline', 'review_body']]
df.info()

In [None]:
# Viewing the data: for each star rating, the number of non-null values
# in every remaining column
import seaborn as sns
df.groupby('star_rating').count()

In [None]:
# Dropping rows with a missing value in any remaining column (this includes
# rows without a review headline or body), then renumbering the index.
df_filtered = df.dropna().reset_index(drop=True)

# Only the last expression of a cell is rendered, so the per-rating column
# counts must be displayed explicitly or they are silently discarded.
display(df_filtered.groupby('star_rating').count())

df_filtered.star_rating.value_counts()

In [None]:
# Viewing the data: number of reviews per star rating.
# The series is passed as `x=` because recent seaborn releases treat the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=df_filtered.star_rating)

In [None]:
# Number of reviews in the 2-star class, which the balancing step treats as
# the minority class. len() counts rows; DataFrame.size would return
# rows x columns and overstate the class size by the number of columns.
df_minority = df_filtered[df_filtered.star_rating==2.0]
df_minority_count = len(df_minority)
df_minority_count

In [None]:
from sklearn.utils import resample
#Balancing the data
# Split the cleaned reviews into one frame per star rating
df_majority = df_filtered[df_filtered.star_rating==5.0]
df_minority = df_filtered[df_filtered.star_rating==2.0]
df_oneStar = df_filtered[df_filtered.star_rating==1.0]
df_threeStars = df_filtered[df_filtered.star_rating==3.0]
df_fourStars = df_filtered[df_filtered.star_rating==4.0]

# Row count of the minority class. This used to be bound to the `.count`
# method object without calling it, and the target size was hard-coded.
df_minority_count = len(df_minority)

# Downsample majority class
df_majority_downsampled = resample(df_majority,
                                  replace=False,                 # sample without replacement
                                  n_samples=df_minority_count,   # to match minority class
                                  random_state=123)              # reproducible results

# Combine minority class with downsampled majority class.
# NOTE(review): the 1-, 3- and 4-star frames are concatenated at full size, so
# only the 5-star class is matched to the 2-star count.
df_downsampled = pd.concat([df_majority_downsampled, df_minority, df_oneStar, df_threeStars, df_fourStars])

# Display new class counts
df_downsampled.star_rating.value_counts()

In [None]:
# Viewing the downsampled data: number of reviews per star rating.
# The series is passed as `x=` because recent seaborn releases treat the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=df_downsampled.star_rating)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# train_test_split returns (train, test) in that order. With test_size=0.33 the
# first frame holds about 67% of the rows for fitting and the second about 33%
# for evaluation. The names were previously swapped, so the model was trained
# on the smaller part.
trainData, testData = train_test_split(df_downsampled, test_size=0.33, random_state=42)


In [None]:
# DataFrame.size is the total number of cells (rows x columns), not the row count
trainData.size

In [None]:
# DataFrame.size is the total number of cells (rows x columns), not the row count
testData.size

In [None]:
#Defining a pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# Three stages: raw token counts -> TF-IDF weighting -> logistic regression
pipeline_steps = [
    ('vect', CountVectorizer()),  # adding stop_words='english' lowers accuracy
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(random_state=42, solver='sag', multi_class='ovr')),
]
text_clf = Pipeline(pipeline_steps)

# Fit on the raw review text, with the star rating as the target
text_clf.fit(trainData.review_body, trainData.star_rating)

In [None]:
# Testing the classifier on the review texts of the test frame
test_docs = testData.review_body
predictions = text_clf.predict(test_docs)

# Evaluating predictive accuracy of the classifier:
# the share of predictions equal to the true star rating
is_correct = predictions == testData.star_rating
np.mean(is_correct)

In [None]:
# Printing the model metrics: scikit-learn's classification report comparing
# the true star ratings of the test frame with the predictions
from sklearn import metrics
print(metrics.classification_report(testData.star_rating, predictions))

Comparing our model's results to state-of-the-art methods:

https://paperswithcode.com/sota/sentiment-analysis-on-amazon-review-full

# NOTE

The code below cannot be run on Google Colab (not enough RAM), so we have been unable to train the model or test its accuracy. It is our attempt to create a version of the above model using Keras and neural networks.

In [None]:
#Neural Network Model -- CANNOT RUN ON GOOGLE COLAB;
import keras
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from sklearn.preprocessing import LabelBinarizer

In [None]:
num_labels = 5       # one output unit per star rating (1-5)
vocab_size = 15000   # number of most frequent words kept by the tokenizer
batch_size = 100

# define Tokenizer with Vocab Size
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(trainData.review_body)

# TF-IDF matrix with one row per review; the model below expects
# `vocab_size` features per row.
x_train = tokenizer.texts_to_matrix(trainData.review_body, mode='tfidf')
x_test = tokenizer.texts_to_matrix(testData.review_body, mode='tfidf')

# One-hot encode the labels. The encoder must be fitted on the star ratings:
# fitting it on `review_body` creates one label column per distinct review
# text, which is both the wrong target and an enormous matrix.
encoder = LabelBinarizer()
encoder.fit(trainData.star_rating)
y_train = encoder.transform(trainData.star_rating)
y_test = encoder.transform(testData.star_rating)

In [None]:
# Stop training as soon as the validation loss fails to improve for one epoch
callbacks_list = [
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=1,
        verbose=1,
    ),
]

# Two hidden layers of 512 ReLU units with dropout, then a softmax over the
# `num_labels` classes
model = Sequential()
model.add(Dense(512, input_shape=(vocab_size,)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(num_labels))
model.add(Activation('softmax'))
model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Hold out 10% of the training rows for validation. Validating on
# (x_train, y_train) itself made `val_loss` a training loss, so early stopping
# could not detect overfitting. The test set is kept untouched for the final
# evaluation.
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=30,
                    verbose=1,
                    callbacks=callbacks_list,
                    validation_split=0.1)

In [None]:
#Evaluating the model
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)

# score[1] is the accuracy metric requested when the model was compiled
print('Test accuracy:', score[1])

text_labels = encoder.classes_

# Show the first ten test reviews with their true and predicted labels.
# The held-out frame in this notebook is `testData` with `review_body` and
# `star_rating` columns; `test_data.filename` / `test_data.category` do not
# exist here. Labels are wrapped in str() because they need not be strings.
sample_probabilities = model.predict(x_test[:10])
for i, probabilities in enumerate(sample_probabilities):
    predicted_label = text_labels[np.argmax(probabilities)]
    print(testData.review_body.iloc[i])
    print('Actual label:' + str(testData.star_rating.iloc[i]))
    print("Predicted label: " + str(predicted_label))

In [None]:
#Evaluating the model
# NOTE(review): this cell repeats the preceding evaluation cell; one of the two
# can be deleted.
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)

# score[1] is the accuracy metric requested when the model was compiled
print('Test accuracy:', score[1])

text_labels = encoder.classes_

# Show the first ten test reviews with their true and predicted labels.
# The held-out frame in this notebook is `testData` with `review_body` and
# `star_rating` columns; `test_data.filename` / `test_data.category` do not
# exist here. Labels are wrapped in str() because they need not be strings.
sample_probabilities = model.predict(x_test[:10])
for i, probabilities in enumerate(sample_probabilities):
    predicted_label = text_labels[np.argmax(probabilities)]
    print(testData.review_body.iloc[i])
    print('Actual label:' + str(testData.star_rating.iloc[i]))
    print("Predicted label: " + str(predicted_label))

In [None]:
#Confusion Matrix

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a short header and draw the confusion matrix on the current figure.

    Parameters:
        cm: 2-D array of counts, indexed as cm[true_label, predicted_label].
        classes: Sequence of class names used as tick labels on both axes.
        normalize: When True, every row is divided by its total so each cell
            shows the share of that true class; rows whose total is zero are
            left at 0 rather than turning into NaN.
        title: Title drawn above the plot.
        cmap: Matplotlib colormap used for the cell colours.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true samples has an all-zero row. Dividing by that
        # zero total yields NaN, which also poisons cm.max() and therefore the
        # text-colour threshold below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    # Cell values are written in white on dark cells and black on light ones
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


y_pred = model.predict(x_test)

# Class names come from the fitted label encoder; `label_names` was never
# defined in this notebook.
class_names = np.asarray(encoder.classes_)

# Both arrays are converted from one-hot / probability rows to class indices.
# The label set is fixed explicitly so the matrix always has one row and one
# column per class, matching the tick labels even if a class never occurs.
cnf_matrix = metrics.confusion_matrix(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1),
                                      labels=np.arange(len(class_names)))

# Plot normalized confusion matrix
fig = plt.figure()
fig.set_size_inches(14, 12, forward=True)

plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

fig.savefig("txt_classification-smote" + ".png", pad_inches=5.0)