In [57]:
# Import necessary libraries
import pandas as pd
import spacy


In [58]:
# Load spaCy's medium English pipeline; every text-processing step below uses it
nlp = spacy.load("en_core_web_md")


In [59]:
# Load the product-review CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer="amazon_product_reviews.csv")

In [60]:
# Peek at the first five rows to confirm the columns loaded as expected
data.head(n=5)

Unnamed: 0,id,dateAdded,dateUpdated,name,asins,brand,categories,primaryCategories,imageURLs,keys,...,reviews.dateSeen,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.username,sourceURLs
0,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...",B00ZV9PXP2,Amazon,"Computers,Electronics Features,Tablets,Electro...",Electronics,https://pisces.bbystatic.com/image2/BestBuy_US...,allnewkindleereaderblack6glarefreetouchscreend...,...,"2018-05-27T00:00:00Z,2017-09-18T00:00:00Z,2017...",False,,0,3,http://reviews.bestbuy.com/3545/5442403/review...,I thought it would be as big as small paper bu...,Too small,llyyue,https://www.newegg.com/Product/Product.aspx%25...
1,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...",B00ZV9PXP2,Amazon,"Computers,Electronics Features,Tablets,Electro...",Electronics,https://pisces.bbystatic.com/image2/BestBuy_US...,allnewkindleereaderblack6glarefreetouchscreend...,...,"2018-05-27T00:00:00Z,2017-07-07T00:00:00Z,2017...",True,,0,5,http://reviews.bestbuy.com/3545/5442403/review...,This kindle is light and easy to use especiall...,Great light reader. Easy to use at the beach,Charmi,https://www.newegg.com/Product/Product.aspx%25...
2,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...",B00ZV9PXP2,Amazon,"Computers,Electronics Features,Tablets,Electro...",Electronics,https://pisces.bbystatic.com/image2/BestBuy_US...,allnewkindleereaderblack6glarefreetouchscreend...,...,2018-05-27T00:00:00Z,True,,0,4,https://reviews.bestbuy.com/3545/5442403/revie...,Didnt know how much i'd use a kindle so went f...,Great for the price,johnnyjojojo,https://www.newegg.com/Product/Product.aspx%25...
3,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...",B00ZV9PXP2,Amazon,"Computers,Electronics Features,Tablets,Electro...",Electronics,https://pisces.bbystatic.com/image2/BestBuy_US...,allnewkindleereaderblack6glarefreetouchscreend...,...,2018-10-09T00:00:00Z,True,177283626.0,3,5,https://redsky.target.com/groot-domain-api/v1/...,I am 100 happy with my purchase. I caught it o...,A Great Buy,Kdperry,https://www.newegg.com/Product/Product.aspx%25...
4,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...",B00ZV9PXP2,Amazon,"Computers,Electronics Features,Tablets,Electro...",Electronics,https://pisces.bbystatic.com/image2/BestBuy_US...,allnewkindleereaderblack6glarefreetouchscreend...,...,2018-05-27T00:00:00Z,True,,0,5,https://reviews.bestbuy.com/3545/5442403/revie...,Solid entry level Kindle. Great for kids. Gift...,Solid entry-level Kindle. Great for kids,Johnnyblack,https://www.newegg.com/Product/Product.aspx%25...


In [61]:
# Summarise each column's name, non-null count and dtype to spot missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   5000 non-null   object 
 1   dateAdded            5000 non-null   object 
 2   dateUpdated          5000 non-null   object 
 3   name                 5000 non-null   object 
 4   asins                5000 non-null   object 
 5   brand                5000 non-null   object 
 6   categories           5000 non-null   object 
 7   primaryCategories    5000 non-null   object 
 8   imageURLs            5000 non-null   object 
 9   keys                 5000 non-null   object 
 10  manufacturer         5000 non-null   object 
 11  manufacturerNumber   5000 non-null   object 
 12  reviews.date         5000 non-null   object 
 13  reviews.dateAdded    1052 non-null   object 
 14  reviews.dateSeen     5000 non-null   object 
 15  reviews.doRecommend  5000 non-null   b

In [62]:
# Preprocess text: remove stopwords and perform text cleaning
def preprocess_text(text):
    """Return a lowercased copy of ``text`` with stop words and non-alphabetic tokens removed.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example ``None`` or the
        NaN float pandas uses for missing text) is treated as an empty review.

    Returns
    -------
    str
        The kept tokens, lowercased and joined by single spaces; ``''`` when
        nothing survives the filtering or the input is not a string.
    """
    # The spaCy pipeline only accepts strings, so short-circuit missing values
    # instead of letting a single bad row abort a whole DataFrame.apply().
    if not isinstance(text, str):
        return ''

    doc = nlp(text)
    # is_alpha tokens contain only letters, so no extra whitespace stripping is needed.
    kept_words = [
        token.text.lower()
        for token in doc
        if token.is_alpha and not token.is_stop
    ]
    return ' '.join(kept_words)

In [63]:
# Drop reviews that have no text, then add a `cleaned_review` column holding
# the preprocessed text of every remaining review.
# The chain builds a new frame rather than mutating in place, and
# reset_index keeps the row labels contiguous (0..n-1) so label-based lookups
# such as data['reviews.text'][0] stay valid even when rows were dropped.
data = (
    data
    .dropna(subset=['reviews.text'])
    .reset_index(drop=True)
    .assign(cleaned_review=lambda frame: frame['reviews.text'].apply(preprocess_text))
)

In [64]:
# Import the textblob, from the textblob library
from textblob import TextBlob

def analyze_sentiment(review, threshold=0.0):
    """Classify a review as 'Positive', 'Negative' or 'Neutral' from its TextBlob polarity.

    Parameters
    ----------
    review : str
        The review text to score.
    threshold : float, optional
        Half-width of the neutral band around zero. Polarity must be strictly
        greater than ``threshold`` to count as positive and strictly less than
        ``-threshold`` to count as negative. The default of ``0.0`` labels any
        non-zero polarity as positive or negative.

    Returns
    -------
    str
        One of ``'Positive'``, ``'Negative'`` or ``'Neutral'``.

    Raises
    ------
    ValueError
        If ``threshold`` is negative.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")

    # Polarity score computed by TextBlob for the review
    polarity = TextBlob(review).sentiment.polarity

    # Classify sentiment based on where the polarity falls relative to the neutral band
    if polarity > threshold:
        return 'Positive'
    if polarity < -threshold:
        return 'Negative'
    return 'Neutral'


In [65]:
# Hand-written reviews used to sanity-check the sentiment classifier
sample_reviews = [
    "This product exceeded my expectations. I love it!",
    "The product arrived damaged and doesn't work properly.",
    "It's an okay product, nothing special.",
]

In [66]:
# Classify each sample review and print it alongside its predicted sentiment.
# map() is lazy, so each review is scored immediately before its line is printed.
for review, sentiment in zip(sample_reviews, map(analyze_sentiment, sample_reviews)):
    print(f"Review: {review}\nSentiment: {sentiment}\n")

Review: This product exceeded my expectations. I love it!
Sentiment: Positive

Review: The product arrived damaged and doesn't work properly.
Sentiment: Neutral

Review: It's an okay product, nothing special.
Sentiment: Positive



In [73]:
# Pick the two reviews to compare by position (.iloc) rather than by row
# label, so the selection still works when the index is no longer 0..n-1
# (for example after rows with missing text have been dropped).
my_review_of_choice_1 = data['reviews.text'].iloc[0]
my_review_of_choice_2 = data['reviews.text'].iloc[1]

# Process the text of each review using spaCy
doc1 = nlp(my_review_of_choice_1)
doc2 = nlp(my_review_of_choice_2)

# Calculate the similarity score between the processed texts of the two reviews
similarity_score = doc1.similarity(doc2)

# Print the similarity score, followed by the two reviews that were compared
print("Similarity score between the two reviews:", similarity_score)
print("Selected review 1:", my_review_of_choice_1)
print("Selected review 2:", my_review_of_choice_2)

Similarity score between the two reviews: 0.757820996637799
Selected review 1: I thought it would be as big as small paper but turn out to be just like my palm. I think it is too small to read on it... not very comfortable as regular Kindle. Would definitely recommend a paperwhite instead.
Selected review 2: This kindle is light and easy to use especially at the beach!!!


Observation

The results of this analysis are influenced by the choice of the underlying spaCy language model. I tried both the small and the medium English models; with the small model, the analysis of the two reviews (the similarity comparison above) did not perform well, possibly because of its limited vocabulary and context understanding. Note that the Positive/Negative/Neutral labels come from TextBlob, so they do not depend on which spaCy model is loaded.