## Data Preparation

In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from textblob import TextBlob
import csv
import re
from tqdm import tqdm

In [3]:
# Load the "raw_review_All_Beauty" configuration of the Amazon Reviews 2023 dataset.
# NOTE(review): trust_remote_code=True allows the dataset repository's loading script
# to execute locally — confirm the source is trusted before re-running.
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)
# Sanity check: show the first record of the "full" split and the container type.
print(dataset["full"][0])
print(type(dataset))

{'rating': 5.0, 'title': 'Such a lovely scent but not overpowering.', 'text': "This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!", 'images': [], 'asin': 'B00YQ6X8EO', 'parent_asin': 'B00YQ6X8EO', 'user_id': 'AGKHLEW2SOWHNMFQIJGBECAF7INQ', 'timestamp': 1588687728923, 'helpful_vote': 0, 'verified_purchase': True}
<class 'datasets.dataset_dict.DatasetDict'>


In [5]:
# Empty frame listing the review fields to keep; the columns are populated in the next cell.
# The 'images' and 'timestamp' fields visible in the sample record above are not carried over.
file = pd.DataFrame(columns=['rating', 'title', 'text', 'asin', 'parent_asin', 'user_id', 'helpful_vote', 'verified_purchase'])

In [7]:
# Copy every kept field out of the "full" split, one column at a time.
# Indexing the split by column name yields that whole column directly, so the
# split is no longer walked row by row once per field (eight full passes, each
# materialising every record as a dict).
full_split = dataset["full"]
for field in ('rating', 'title', 'text', 'asin', 'parent_asin', 'user_id', 'helpful_vote', 'verified_purchase'):
    # list(...) keeps the assigned value a plain Python list, as in the original comprehension.
    file[field] = list(full_split[field])

In [9]:
# Preview the first rows of the assembled frame to confirm the columns were filled.
file.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1,True
2,5.0,Yes!,"Smells good, feels great!",B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2,True
3,1.0,Synthetic feeling,Felt synthetic,B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True
4,5.0,A+,Love it,B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True


In [11]:
# Persist the frame to CSV (index omitted); the Sentiment Analysis section reloads this file by name.
file.to_csv("Amazon_All_Beauty_Reviews_2023.csv", index = False)

## Sentiment Analysis

In [3]:
# Hook tqdm into pandas; the `progress_apply` call used for the model predictions
# later in this section depends on this having been run.
tqdm.pandas()

In [5]:
# Number of reviews scored in this section. Kept as a named constant so the
# sample size can be changed in one place (the progress bar in this notebook
# shows roughly 80 minutes of inference for 20000 rows).
N_REVIEWS = 20000

# Read only the first N_REVIEWS records instead of parsing the whole CSV and
# slicing it afterwards; the resulting rows and index are the same.
data = pd.read_csv("Amazon_All_Beauty_Reviews_2023.csv", nrows=N_REVIEWS)
data.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1,True
2,5.0,Yes!,"Smells good, feels great!",B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2,True
3,1.0,Synthetic feeling,Felt synthetic,B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True
4,5.0,A+,Love it,B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True


In [7]:
def sentiment_score(rating):
    """Map a numeric star rating onto a three-way sentiment class.

    Args:
        rating: Numeric rating to classify.

    Returns:
        'neutral' when the rating equals 3.0, 'positive' when it is greater
        than 3.0, and 'negative' for everything else.
    """
    # Handle the exact midpoint first; everything else splits on the same threshold.
    if rating == 3.0:
        return 'neutral'
    return 'positive' if rating > 3.0 else 'negative'
        
# Ground-truth sentiment class derived from the reviewer's own star rating.
data['label'] = data['rating'].apply(sentiment_score)

In [9]:
# Preview the frame with the newly added `label` column.
data.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,label
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,positive
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1,True,positive
2,5.0,Yes!,"Smells good, feels great!",B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2,True,positive
3,1.0,Synthetic feeling,Felt synthetic,B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True,negative
4,5.0,A+,Love it,B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True,positive


In [11]:
# Imported here, next to its only use, rather than in the import cell at the top of the notebook.
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# Pretrained checkpoint used to score the review text.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
# NOTE(review): truncation is normally a call-time tokenizer option; passing it to
# from_pretrained may have no effect — confirm against the transformers docs.
tokenizer = BertTokenizer.from_pretrained(model_name, truncation=True)
model = BertForSequenceClassification.from_pretrained(model_name)
    
# truncation=True is also handed to the pipeline — presumably so over-long reviews are
# cut to the model's input limit instead of raising; verify.
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, truncation=True)

In [13]:
def analyze_sentiment(text):
    """Score one piece of text with `sentiment_pipeline` and return an integer.

    Args:
        text: The text to classify.

    Returns:
        The first character of the top prediction's label, converted to int
        (the predictions displayed in this notebook are values such as 4 and 5).
    """
    top_prediction = sentiment_pipeline(text)[0]
    leading_char = top_prediction['label'][0]
    return int(leading_char)

# Coerce every entry to str before it is handed to the pipeline.
data['text'] = data['text'].apply(str)
# One pipeline call per row, with a tqdm progress bar.
data['predicted_label'] = data['text'].progress_apply(analyze_sentiment)

100%|██████████████████████████████████████████████████████████████████████████| 20000/20000 [1:20:58<00:00,  4.12it/s]


In [15]:
# Preview the frame with the numeric `predicted_label` column produced by the model.
data.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,label,predicted_label
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,positive,4
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1,True,positive,4
2,5.0,Yes!,"Smells good, feels great!",B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2,True,positive,5
3,1.0,Synthetic feeling,Felt synthetic,B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True,negative,5
4,5.0,A+,Love it,B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True,positive,5


In [17]:
# Collapse the model's numeric predictions into the same three classes as `label`,
# reusing the `sentiment_score` helper already defined earlier in this section
# instead of redefining an identical copy here.
#
# The numeric-dtype guard makes the cell safe to re-run: once the column holds
# class names, applying `sentiment_score` again would compare str with float
# and raise a TypeError.
if pd.api.types.is_numeric_dtype(data['predicted_label']):
    data['predicted_label'] = data['predicted_label'].apply(sentiment_score)

In [19]:
# Preview the frame now that `predicted_label` holds class names comparable with `label`.
data.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,label,predicted_label
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,positive,positive
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1,True,positive,positive
2,5.0,Yes!,"Smells good, feels great!",B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2,True,positive,positive
3,1.0,Synthetic feeling,Felt synthetic,B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True,negative,positive
4,5.0,A+,Love it,B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True,positive,positive


In [21]:
from sklearn import metrics

# Fix the class order explicitly: rows are true labels and columns are predictions,
# both in this order. This matches the alphabetical order used when no labels are
# given, and keeps the matrix 3x3 even if a class happens to be absent from the sample.
SENTIMENT_CLASSES = ['negative', 'neutral', 'positive']
cm = metrics.confusion_matrix(data['label'], data['predicted_label'], labels=SENTIMENT_CLASSES)

print(cm)

# Accuracy as a percentage: correctly classified samples (the diagonal) over all samples.
# np.trace sums the diagonal whatever the matrix size, replacing three hard-coded cells.
accuracy = (np.trace(cm) / np.sum(cm)) * 100

print(accuracy)

[[ 2661   343   103]
 [  623   959   342]
 [  463  1107 13399]]
85.095
