# Amazon Reviews Classifier

The goal of this project is to perform Aspect-Based Sentimento Analysis (ASBA) to dive into reviews and analyze the opinions of customers about a product. The project is divided into two parts: the first one is the data preprocessing, where we clean the data and extract the aspects and the second one is the classification, where we use the extracted aspects to classify the reviews.

## Initial Setup

### Import and install required packages

In [1]:
# Install and import required packages

# Uncomment and run the following line if this is your first time running this notebook and you need to install the required packages
#!pip install -r "./requirements.txt"

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import contractions
import warnings
import time
import spacy
import re
from tqdm import tqdm
from textblob import TextBlob
from collections import Counter

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Initiate tdqm
tqdm.pandas()

In [2]:
# Environment settings
# Suppress warnings
warnings.filterwarnings('ignore')

# Expand display options for pandas dataframes
pd.set_option('display.max_colwidth', 200)
pd.options.display.max_columns = 999
pd.options.display.max_rows = 300

### User Definitions

In [3]:
# User definitions

# Dataset: name of the dataset to be used (must a CSV inside the data folder of the project)
dataset_name = "targeted_reviews.csv"

# Aspects to analyze (must be a dictionary with the aspect as key and a list of keywords that define the aspect as value)
aspects = {
    "battery": ["battery"],
    "performance": ["performance", "cpu", "ram", "processor", "gpu", "graphics"],
    "software": ["software", "operating system", "os", "programs"],
    "hardware": ["ports", "usb", "keyboared", "trackpad", "touchpad", "webcam", "cam"],
    "storage": ["storage", "hd", "hard drive", "ssd"],
    "audio": ["audio", "sound", "speakers"],
    "display": ["display", "screen", "resolution", "touchscreen"],
}

### Load the Dataset

In [4]:
# Loading the dataset
raw_data = pd.read_csv('./data/' + dataset_name, encoding='utf-8')

raw_data.head(2)

Unnamed: 0,review_title,review_body,review_author,review_rating,review_date,review_helpful_votes
0,Love it,Best laptop ever!,Carlos,5.0 out of 5 stars,"Reviewed in the United States 🇺🇸 on July 11, 2021",
1,Excelente,Es un Excelente computador llego en el momento que dijeron.,Nicolas Ospina Ospina,5.0 out of 5 stars,"Reviewed in the United States 🇺🇸 on May 25, 2021",


In [5]:
# Get only the relevant columns
data = raw_data[["review_body", "review_rating"]].rename(columns={"review_body": "Review", "review_rating": "Rating"})
data["Rating"] = data["Rating"].str.replace(".0 out of 5 stars", "/5")

data.head()

Unnamed: 0,Review,Rating
0,Best laptop ever!,5/5
1,Es un Excelente computador llego en el momento que dijeron.,5/5
2,Love this machine. have had it a couple months and it's fast battery lasts insanely long.,5/5
3,I’m happy with the product this far.Arrived on time.,5/5
4,Feels faster than any high powered desktop. This thing is unreal. Never even gets warm,5/5


## Data Preprocessing

In [6]:
# Data cleaning
# Remove rows with empty reviews
data = data[data["Review"].notna()]

# Remove extra spaces
data["Review"] = data["Review"].str.replace('\s+', ' ')

# Remove leading and trailing spaces
data["Review"] = data["Review"].str.strip()

# Remove reviews with less than 10 words
data = data[data["Review"].str.split().str.len() > 10]

# Set all reviews to lowercase
data["Review"] = data["Review"].str.lower()

# Reset the index
data = data.reset_index(drop=True)

data.head()

Unnamed: 0,Review,Rating
0,love this machine. have had it a couple months and it's fast battery lasts insanely long.,5/5
1,feels faster than any high powered desktop. this thing is unreal. never even gets warm,5/5
2,"the battery life is awesome, just one charged in a day in case you use it for demanding tasks.",5/5
3,love that its compact and can take it anywhere - fast and easy to use.,5/5
4,so far great learning a mac system though but can find help at every corner of the net,5/5


In [7]:
# Clear the dataset from slang words
def remove_slang(text):
    """
    Correct common slang and abbreviations
    """
    # Example
    #text = re.sub(r"smh", "shake my head", text)
    
    return text

# Expand contractions
def expand_contractions(text):
    """
    Expand contractions (i.e don't -> do not)
    """
    expanded_text = contractions.fix(text, slang=False)
    return expanded_text

In [8]:
data['Review'] = data['Review'].apply(lambda x: remove_slang(x)).apply(lambda x: expand_contractions(x))

In [9]:
# Filter dataset for our aspects
dataset = pd.DataFrame(columns=['Review', 'Rating'])

for aspect, keywords in aspects.items():
    for keyword in keywords:
        dataset = dataset.append(data[data["Review"].str.contains(keyword)])

# Prints the number of reviews for each aspect
for aspect, keywords in aspects.items():
    print("Number of reviews containing the aspect " + aspect + ":", len(dataset[dataset["Review"].str.contains('|'.join(keywords))]))

# Reset the index
filtered_dataset = dataset.reset_index(drop=True)

Number of reviews containing the aspect battery: 2060
Number of reviews containing the aspect performance: 2639
Number of reviews containing the aspect software: 3815
Number of reviews containing the aspect hardware: 2062
Number of reviews containing the aspect storage: 1511
Number of reviews containing the aspect audio: 811
Number of reviews containing the aspect display: 3165


In [10]:
# Extract the sentences from the reviews that contains the aspect values
def extract_sentences(review, aspect, keywords):
    sentences = []
    for sentence in review.split("."):
        if any(word in sentence for word in keywords):
            sentences.append(sentence.strip())
    return sentences

# Convert a list to a string #separated by commas

for aspect, keywords in aspects.items():
    # apply the function to the dataset and separate the sentences by |
    filtered_dataset[aspect[0].upper() + aspect[1:] + " Sentences"] = filtered_dataset["Review"].progress_apply(lambda x: " | ".join(extract_sentences(x, aspect, keywords)))

filtered_dataset.head(5)



100%|██████████| 7057/7057 [00:00<00:00, 86062.21it/s]
100%|██████████| 7057/7057 [00:00<00:00, 56457.18it/s]
100%|██████████| 7057/7057 [00:00<00:00, 53059.81it/s]
100%|██████████| 7057/7057 [00:00<00:00, 47047.31it/s]
100%|██████████| 7057/7057 [00:00<00:00, 37338.22it/s]
100%|██████████| 7057/7057 [00:00<00:00, 44105.70it/s]
100%|██████████| 7057/7057 [00:00<00:00, 61369.92it/s]


Unnamed: 0,Review,Rating,Battery Sentences,Performance Sentences,Software Sentences,Hardware Sentences,Storage Sentences,Audio Sentences,Display Sentences
0,love this machine. have had it a couple months and it is fast battery lasts insanely long.,5/5,have had it a couple months and it is fast battery lasts insanely long,,,,,,
1,"the battery life is awesome, just one charged in a day in case you use it for demanding tasks.",5/5,"the battery life is awesome, just one charged in a day in case you use it for demanding tasks",,,,,,
2,excellent battery life! i love everything about this macbook. the only let down was the camera quality.,5/5,excellent battery life! i love everything about this macbook,,,the only let down was the camera quality,,,
3,absolutely love it!!!! came on time battery life is good i absolutely love the color,5/5,absolutely love it!!!! came on time battery life is good i absolutely love the color,,,absolutely love it!!!! came on time battery life is good i absolutely love the color,,,
4,"this is my first macbook and thank you for changing the shipping restrictions, i was able to buy one. i am still adjusting with the keyboard and the os coming from a long time windows user. the ba...",5/5,the battery life is amazing,"i originally wanted to get the 16gb ram and 1tb storage, but it does not ship to my location",i am still adjusting with the keyboard and the os coming from a long time windows user,,"i originally wanted to get the 16gb ram and 1tb storage, but it does not ship to my location",,


## Predictions

### Polarity Prediction

In [11]:
def polarity(text):
    """
    Predict the polarity of the text using TextBlob.
    Results range from negative to positive on a scale of [-1, +1].
    """
    testimonial = TextBlob(text)
    return round(testimonial.sentiment.polarity, 2)

In [12]:
# Calculate the polarity of the sentences for each aspect (and its keywords)
for aspect, keywords in aspects.items():
    aspect_polarities = []
    for sentence in filtered_dataset[aspect[0].upper() + aspect[1:] + " Sentences"]:
        # if sentence contains the keyword for the aspect
        if any(word in sentence for word in keywords):
            aspect_polarities.append(polarity(sentence))
        else:
            aspect_polarities.append(0)

    filtered_dataset[aspect[0].upper() + aspect[1:] + " Polarity"] = aspect_polarities

In [13]:
filtered_dataset.head()

Unnamed: 0,Review,Rating,Battery Sentences,Performance Sentences,Software Sentences,Hardware Sentences,Storage Sentences,Audio Sentences,Display Sentences,Battery Polarity,Performance Polarity,Software Polarity,Hardware Polarity,Storage Polarity,Audio Polarity,Display Polarity
0,love this machine. have had it a couple months and it is fast battery lasts insanely long.,5/5,have had it a couple months and it is fast battery lasts insanely long,,,,,,,0.08,0.0,0.0,0.0,0.0,0.0,0.0
1,"the battery life is awesome, just one charged in a day in case you use it for demanding tasks.",5/5,"the battery life is awesome, just one charged in a day in case you use it for demanding tasks",,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,excellent battery life! i love everything about this macbook. the only let down was the camera quality.,5/5,excellent battery life! i love everything about this macbook,,,the only let down was the camera quality,,,,0.75,0.0,0.0,-0.08,0.0,0.0,0.0
3,absolutely love it!!!! came on time battery life is good i absolutely love the color,5/5,absolutely love it!!!! came on time battery life is good i absolutely love the color,,,absolutely love it!!!! came on time battery life is good i absolutely love the color,,,,0.73,0.0,0.0,0.73,0.0,0.0,0.0
4,"this is my first macbook and thank you for changing the shipping restrictions, i was able to buy one. i am still adjusting with the keyboard and the os coming from a long time windows user. the ba...",5/5,the battery life is amazing,"i originally wanted to get the 16gb ram and 1tb storage, but it does not ship to my location",i am still adjusting with the keyboard and the os coming from a long time windows user,,"i originally wanted to get the 16gb ram and 1tb storage, but it does not ship to my location",,,0.6,0.38,-0.05,0.0,0.38,0.0,0.0


### Descriptors Identification

In [14]:
def dependency_matching(text, aspect):
    """
    Identify and extract word(s) that are describing
    the aspect term.
    """
    doc = nlp(text)

    tags = ['JJ', 'JJR', 'JJS']
    dependents = ['acomp', 'advmod']

    extraction = []
    for i, token in enumerate(doc):

        # location of aspect in sequence
        if re.search(aspect, token.text):
            aspect_pos = i

        if ((token.dep_ in dependents) or (token.tag_ in tags)) and re.search(aspect, token.head.text):
            extraction.append(token.text)

        if token.dep_ == 'acomp':
            extraction.append(token.text)

            children = [child for child in token.children]
            if len(children) > 0 and str(children[0]).isalpha():
                extraction.insert(0, str(children[0]))

            for t in range(4):
                try:
                    if doc[i-t].dep_ == 'neg':
                        negation = doc[i-t].text
                        extraction.insert(0, negation)
                except:
                    continue

        # look for adjectives near the aspect if no matches were found yet
        if len(extraction) == 0 and i == len(doc)-1:
            for t in range(-6, 6):
                try:
                    if doc[aspect_pos+t].tag_ in tags:
                        if doc[aspect_pos+t].text in extraction:
                            continue
                        extraction.append(doc[aspect_pos+t].text)

                        children = [
                            child for child in doc[aspect_pos+t].children]
                        if len(children) > 0 and str(children[0]).isalpha():
                            extraction.insert(0, str(children[0]))
                except:
                    continue

    return " ".join(extraction)


In [15]:
for aspect, keywords in aspects.items():
    sentences = filtered_dataset[aspect[0].upper() + aspect[1:] + " Sentences"].str.split("|")
    descriptors = []
    for sentence in sentences:
        descriptors.append([dependency_matching(s, aspect) for s in sentence])
    filtered_dataset[aspect[0].upper() + aspect[1:] + " Descriptors"] = descriptors

In [16]:
filtered_dataset.to_csv("./data/filtered_dataset_preprocessed.csv", index=True)

## Visualizations

In [17]:
# Generate a separate dataset for each aspect

dataset = pd.read_csv("./data/filtered_dataset_preprocessed.csv")

for aspect in aspects.keys():
    aspect_dataset = dataset[["Review", "Rating", aspect[0].upper() + aspect[1:] + " Sentences", aspect[0].upper() + aspect[1:] + " Polarity", aspect[0].upper() + aspect[1:] + " Descriptors"]]
    # remove rows with empty sentences
    aspect_dataset = aspect_dataset[aspect_dataset[aspect[0].upper() + aspect[1:] + " Sentences"].notna()]
    aspect_dataset.to_csv("./data/" + aspect + "_dataset.csv", index=True)

In [22]:
# Print the number of reviews for each aspect
for aspect in aspects.keys():
    aspect_dataset = pd.read_csv("./data/" + aspect + "_dataset.csv")
    print("Number of reviews containing the aspect " + aspect + ":", len(aspect_dataset))

Number of reviews containing the aspect battery: 2060
Number of reviews containing the aspect performance: 2639
Number of reviews containing the aspect software: 3815
Number of reviews containing the aspect hardware: 2062
Number of reviews containing the aspect storage: 1511
Number of reviews containing the aspect audio: 811
Number of reviews containing the aspect display: 3165


In [21]:
aspect = list(aspects.keys())[1]
aspect = aspect[0].upper() + aspect[1:]
temp_dataset = dataset[dataset['{} Sentences'.format(aspect)] != '']

# Display the Review, Rating Price Sentences, Price Polarities and Price Descriptors
temp_dataset[['Review', 'Rating', '{} Sentences'.format(aspect), '{} Polarity'.format(aspect), '{} Descriptors'.format(aspect)]].head(10)

Unnamed: 0,Review,Rating,Performance Sentences,Performance Polarity,Performance Descriptors
0,love this machine. have had it a couple months and it is fast battery lasts insanely long.,5/5,,0.0,['']
1,"the battery life is awesome, just one charged in a day in case you use it for demanding tasks.",5/5,,0.0,['']
2,excellent battery life! i love everything about this macbook. the only let down was the camera quality.,5/5,,0.0,['']
3,absolutely love it!!!! came on time battery life is good i absolutely love the color,5/5,,0.0,['']
4,"this is my first macbook and thank you for changing the shipping restrictions, i was able to buy one. i am still adjusting with the keyboard and the os coming from a long time windows user. the ba...",5/5,"i originally wanted to get the 16gb ram and 1tb storage, but it does not ship to my location",0.38,['']
5,"10/10 recommend, battery life is fantastic i charged it when i first got it & it is still at 97%",5/5,,0.0,['']
6,screen quality is great and battery lasts long it takes me a couple of days before putting it on a charger,5/5,,0.0,['']
7,the battery life is amazing and the speaker are good too,5/5,,0.0,['']
8,love this laptop best purchase yet! screen quality is amazing battery lasts forever and camera quality is great!!,5/5,,0.0,['']
9,i was happily surprised to see how good the battery life was compared to my other computers,5/5,,0.0,['']


In [None]:
ax = sns.histplot(dataset["Battery Polarity"], bins=np.arange(-1, 1, 0.2)).set_title('Histogram of Polarities (Aspect = "Battery")')
plt.show()

In [None]:
ax = sns.histplot(dataset["Battery Polarity"], bins=np.arange(-1, 1, 0.2)).set_title('Histogram of Polarities (Aspect = "Battery")')
plt.show()

In [None]:
# Save the Polarities histogram picture
#fig = ax.get_figure()
#fig.savefig('polarity_distribution.png' 
#            dpi=75, 
#            bbox_inches="tight")

In [None]:
# Descriptor Analysis (n-grams)
positives = dataset[dataset["Battery Polarity"] > 0] # polarity greater than 0
negatives = dataset[dataset["Battery Polarity"] < 0] # polarity less than 0

In [None]:
def ngrams(text, n):
    return zip(*[text[i:] for i in range(n)])

def display_ngram_frequency(corpus, n, display):
    """
    Generate a DataFrame of n-grams and their frequencies.
    """
    ngram_counts = Counter(ngrams(corpus.split(), n))
    most_commmon = ngram_counts.most_common(display)

    ngram = []
    count = []
    for i in range(0,len(most_commmon)):
        ngram.append(" ".join(most_commmon[i][0]))
        count.append(most_commmon[i][1])

    if n > 3:
        col = f"{n}-gram"
    if n == 3:
        col = 'Tri-gram'
    if n == 2:
        col = 'Bi-gram'

    return pd.DataFrame(zip(ngram, count), columns=[col, "Count"])

In [None]:
# list all negative descriptors in a single string
descriptors_negative_opinions = negatives["Battery Descriptors"].tolist()
descriptors_negative_opinions = " ".join(descriptors_negative_opinions)

# positives
descriptors_positive_opinions = positives["Battery Descriptors"].tolist()
descriptors_positive_opinions = " ".join(descriptors_positive_opinions)

In [None]:
display_ngram_frequency(descriptors_negative_opinions, n=3, display=10)

### WordClouds

In [None]:
from wordcloud import WordCloud

def generate_wordcloud(text):
    """
    Generate word cloud images.
    """
    wordcloud = WordCloud(collocations=False, background_color="black", max_words=50).generate(text)
    
    # set the figure size
    plt.figure(figsize=[8,10])

    # plot the wordcloud
    plt.imshow(wordcloud, interpolation="bilinear")

    # remove plot axes
    plt.axis("off")

In [None]:
# WordCloud: descriptors extracted from negative opinions
generate_wordcloud(descriptors_negative_opinions)

In [None]:
# WordCloud: descriptors extracted from positive opinions
generate_wordcloud(descriptors_positive_opinions)