## Data Preparation

In [1]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import csv
import re
from tqdm import tqdm

In [3]:
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)
print(dataset["full"][0])
print(type(dataset))

{'rating': 5.0, 'title': 'Such a lovely scent but not overpowering.', 'text': "This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!", 'images': [], 'asin': 'B00YQ6X8EO', 'parent_asin': 'B00YQ6X8EO', 'user_id': 'AGKHLEW2SOWHNMFQIJGBECAF7INQ', 'timestamp': 1588687728923, 'helpful_vote': 0, 'verified_purchase': True}
<class 'datasets.dataset_dict.DatasetDict'>


In [5]:
file = pd.DataFrame(columns=['rating', 'title', 'text', 'asin', 'parent_asin', 'user_id', 'helpful_vote', 'verified_purchase'])

In [7]:
file["rating"] = [i["rating"] for i in dataset["full"]]
file["title"] = [i["title"] for i in dataset["full"]]
file["text"] = [i["text"] for i in dataset["full"]]
file["asin"] = [i["asin"] for i in dataset["full"]]
file["parent_asin"] = [i["parent_asin"] for i in dataset["full"]]
file["user_id"] = [i["user_id"] for i in dataset["full"]]
file["helpful_vote"] = [i["helpful_vote"] for i in dataset["full"]]
file["verified_purchase"] = [i["verified_purchase"] for i in dataset["full"]]

In [9]:
file.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1,True
2,5.0,Yes!,"Smells good, feels great!",B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2,True
3,1.0,Synthetic feeling,Felt synthetic,B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True
4,5.0,A+,Love it,B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True


In [11]:
file.to_csv("Amazon_All_Beauty_Reviews_2023.csv", index = False)

## Sentiment Analysis

In [3]:
tqdm.pandas()

In [5]:
data = pd.read_csv("Amazon_All_Beauty_Reviews_2023.csv")
data = data[:20000]
data.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1,True
2,5.0,Yes!,"Smells good, feels great!",B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2,True
3,1.0,Synthetic feeling,Felt synthetic,B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True
4,5.0,A+,Love it,B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True


In [7]:
def sentiment_score(rating):
    if (rating > 3.0):
        return 'positive'
    elif (rating == 3.0):
        return 'neutral'
    else:
        return 'negative'
        
data['label'] = data.rating.apply(sentiment_score)

In [9]:
data.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,label
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,positive
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1,True,positive
2,5.0,Yes!,"Smells good, feels great!",B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2,True,positive
3,1.0,Synthetic feeling,Felt synthetic,B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True,negative
4,5.0,A+,Love it,B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True,positive


In [11]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = BertTokenizer.from_pretrained(model_name, truncation=True)
model = BertForSequenceClassification.from_pretrained(model_name)
    
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, truncation=True)

In [13]:
def analyze_sentiment(text):
    result = sentiment_pipeline(text)

    return int(result[0]['label'][0])

data['text'] = data.text.apply(str)
data['predicted_label'] = data.text.progress_apply(analyze_sentiment)

 72%|██████████████████████████████████████████████████████▌                     | 14358/20000 [39:50<11:57,  7.86it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

100%|████████████████████████████████████████████████████████████████████████████| 20000/20000 [53:04<00:00,  6.28it/s]


In [15]:
data.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,label,predicted_label
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,positive,4
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1,True,positive,4
2,5.0,Yes!,"Smells good, feels great!",B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2,True,positive,5
3,1.0,Synthetic feeling,Felt synthetic,B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True,negative,5
4,5.0,A+,Love it,B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True,positive,5


In [17]:
def sentiment_score(rating):
    if (rating > 3.0):
        return 'positive'
    elif (rating == 3.0):
        return 'neutral'
    else:
        return 'negative'
        
data['predicted_label'] = data.predicted_label.apply(sentiment_score)

In [19]:
data.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,label,predicted_label
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,positive,positive
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1,True,positive,positive
2,5.0,Yes!,"Smells good, feels great!",B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2,True,positive,positive
3,1.0,Synthetic feeling,Felt synthetic,B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True,negative,positive
4,5.0,A+,Love it,B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,0,True,positive,positive


In [21]:
from sklearn import metrics

cm = metrics.confusion_matrix(data['label'], data['predicted_label'])

print(cm)

accuracy = ((cm[0,0]+cm[1,1]+cm[2,2]) / (np.sum(cm))) * 100

print(accuracy)

[[ 2661   343   103]
 [  623   959   342]
 [  463  1107 13399]]
85.095


## Text Generation

In [195]:
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_All_Beauty", trust_remote_code=True)
print(dataset["full"][1])
print(type(dataset))

{'main_category': 'All Beauty', 'title': 'Yes to Tomatoes Detoxifying Charcoal Cleanser (Pack of 2) with Charcoal Powder, Tomato Fruit Extract, and Gingko Biloba Leaf Extract, 5 fl. oz.', 'average_rating': 4.5, 'rating_number': 3, 'features': [], 'description': [], 'price': 'None', 'images': {'hi_res': ['https://m.media-amazon.com/images/I/71g1lP0pMbL._SL1500_.jpg', 'https://m.media-amazon.com/images/I/81OqvR94isL._SL1500_.jpg'], 'large': ['https://m.media-amazon.com/images/I/41b+11d5igL.jpg', 'https://m.media-amazon.com/images/I/41j2ocUzCtL.jpg'], 'thumb': ['https://m.media-amazon.com/images/I/41b+11d5igL._SS40_.jpg', 'https://m.media-amazon.com/images/I/41j2ocUzCtL._SS40_.jpg'], 'variant': ['MAIN', 'PT01']}, 'videos': {'title': [], 'url': [], 'user_id': []}, 'store': 'Yes To', 'categories': [], 'details': '{"Item Form": "Powder", "Skin Type": "Acne Prone", "Brand": "Yes To", "Age Range (Description)": "Adult", "Unit Count": "10 Fl Oz", "Is Discontinued By Manufacturer": "No", "Item m

In [177]:
file = pd.DataFrame(columns=['title', 'parent_asin'])

In [191]:
file["title"] = [i for i in dataset["full"]["title"]]
file["parent_asin"] = [i for i in dataset["full"]["parent_asin"]]

In [193]:
file.head()

Unnamed: 0,title,parent_asin
0,"Howard LC0008 Leather Conditioner, 8-Ounce (4-...",B01CUPMQZE
1,Yes to Tomatoes Detoxifying Charcoal Cleanser ...,B076WQZGPM
2,Eye Patch Black Adult with Tie Band (6 Per Pack),B000B658RI
3,"Tattoo Eyebrow Stickers, Waterproof Eyebrow, 4...",B088FKY3VD
4,Precision Plunger Bars for Cartridge Grips – 9...,B07NGFDN6G


In [197]:
file.to_csv("Amazon_All_Beauty_Meta_2023.csv", index = False)

In [23]:
data1 = pd.read_csv("Amazon_All_Beauty_Meta_2023.csv")
data1.head()

Unnamed: 0,title,parent_asin
0,"Howard LC0008 Leather Conditioner, 8-Ounce (4-...",B01CUPMQZE
1,Yes to Tomatoes Detoxifying Charcoal Cleanser ...,B076WQZGPM
2,Eye Patch Black Adult with Tie Band (6 Per Pack),B000B658RI
3,"Tattoo Eyebrow Stickers, Waterproof Eyebrow, 4...",B088FKY3VD
4,Precision Plunger Bars for Cartridge Grips – 9...,B07NGFDN6G


## Positive Product Description

In [171]:
import pandas as pd
from rake_nltk import Rake

product = 'B00YQ6X8EO'

ls = data[data['asin'] == product]['predicted_label'].value_counts('positive')

max1 = max(ls.keys())
    
reviews = [i for i in data.loc[(data['asin'] == product) & (data['predicted_label'] == max1), 'text']]

# Create a pandas DataFrame
df = pd.DataFrame({'reviews': reviews})

# Initialize RAKE
r = Rake()

# Extract keywords from each review
keywords = []
for review in df['reviews']:
    r.extract_keywords_from_text(review)
    keywords.extend(r.get_ranked_phrases())
    
print(keywords)

['smells really good', 'really nice', 'really fine', 'yucky chemicals', 'medium thickness', 'gonna stick', 'feels like', 'want', 'try', 'trick', 'though', 'texture', 'spray', 'say', 'need', 'lot', 'lot', 'hair', 'goes', 'get', 'comparing', 'brands']


In [175]:
import json
import gensim
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import nltk
import random

# Download the necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load data
product_name = data1.loc[data1['parent_asin'] == product]['title']
keywords = keywords[:3]
sentiments = [max1]

# Preprocess the keywords
def preprocess_keywords(keywords):
    processed_keywords = [gensim.utils.simple_preprocess(keyword) for keyword in keywords]
    return processed_keywords

processed_keywords = preprocess_keywords(keywords)

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=processed_keywords, vector_size=100, window=5, min_count=1, workers=4)

# Generate product description based on keywords and sentiments
def generate_product_description(product_name, keywords, sentiments):
    positive_templates = [
        f"The {product_name} is highly praised for its {{}}, making it a top choice among consumers.",
        f"Users love the {product_name} for its {{}}, which sets it apart from the competition.",
        f"One of the standout features of the {product_name} is its {{}}, which has received rave reviews."
    ]
    
    neutral_templates = [
        f"The {product_name} offers {{}} features that cater to a wide range of needs.",
        f"With its  {{}} features, the {product_name} provides a balanced performance.",
        f"Consumers appreciate the  {{}} aspects of the {product_name}, which contribute to its versatility."
    ]
    
    negative_templates = [
        f"However, some users have reported issues with the  {{}} of the {product_name}.",
        f"Despite its many strengths, the {product_name} has faced criticism for its  {{}}.",
        f"While generally well-received, the {product_name} has some drawbacks, particularly its  {{}}."
    ]
    
    description = f"The {product_name} is a remarkable product. "
          
    for keyword, sentiment in zip(keywords, sentiments):
        if sentiment == "positive":
            template = random.choice(positive_templates)
            description += template.format(keyword) + " "
        elif sentiment == "neutral":
            template = random.choice(neutral_templates)
            description += template.format(keyword) + " "
        elif sentiment == "negative":
            template = random.choice(negative_templates)
            description += template.format(keyword) + " "
    
    return description.strip()

# Generate product description
print("Positive product description: ")
description = generate_product_description(product_name, keywords, sentiments)

print(description)

Positive product description: 
The 29624    Herbivore - Natural Sea Mist Texturizing Salt ...
Name: title, dtype: object is a remarkable product. Users love the 29624    Herbivore - Natural Sea Mist Texturizing Salt ...
Name: title, dtype: object for its smells really good, which sets it apart from the competition.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anjana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anjana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Negative Product Description

In [179]:
import pandas as pd
from rake_nltk import Rake

product = 'B00R8DXL44'

ls = data[data['asin'] == product]['predicted_label'].value_counts('positive')

max1 = max(ls.keys())
    
reviews = [i for i in data.loc[(data['asin'] == product) & (data['predicted_label'] == max1), 'text']]

# Create a pandas DataFrame
df = pd.DataFrame({'reviews': reviews})

# Initialize RAKE
r = Rake()

# Extract keywords from each review
keywords = []
for review in df['reviews']:
    r.extract_keywords_from_text(review)
    keywords.extend(r.get_ranked_phrases())
    
print(keywords)

['second coat since', 'let dry overnight', 'apply smoothly', 'quiet thick', 'thick', 'polish', 'adding']


In [181]:
import json
import gensim
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import nltk
import random

# Download the necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load data
product_name = data1.loc[data1['parent_asin'] == product]['title']
keywords = keywords[:3]
sentiments = [max1]

# Preprocess the keywords
def preprocess_keywords(keywords):
    processed_keywords = [gensim.utils.simple_preprocess(keyword) for keyword in keywords]
    return processed_keywords

processed_keywords = preprocess_keywords(keywords)

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=processed_keywords, vector_size=100, window=5, min_count=1, workers=4)

# Generate product description based on keywords and sentiments
def generate_product_description(product_name, keywords, sentiments):
    positive_templates = [
        f"The {product_name} is highly praised for its {{}}, making it a top choice among consumers.",
        f"Users love the {product_name} for its {{}}, which sets it apart from the competition.",
        f"One of the standout features of the {product_name} is its {{}}, which has received rave reviews."
    ]
    
    neutral_templates = [
        f"The {product_name} offers {{}} features that cater to a wide range of needs.",
        f"With its  {{}} features, the {product_name} provides a balanced performance.",
        f"Consumers appreciate the  {{}} aspects of the {product_name}, which contribute to its versatility."
    ]
    
    negative_templates = [
        f"However, some users have reported issues with the  {{}} of the {product_name}.",
        f"Despite its many strengths, the {product_name} has faced criticism for its  {{}}.",
        f"While generally well-received, the {product_name} has some drawbacks, particularly its  {{}}."
    ]
    
    description = f"The {product_name} is a remarkable product. "
          
    for keyword, sentiment in zip(keywords, sentiments):
        if sentiment == "positive":
            template = random.choice(positive_templates)
            description += template.format(keyword) + " "
        elif sentiment == "neutral":
            template = random.choice(neutral_templates)
            description += template.format(keyword) + " "
        elif sentiment == "negative":
            template = random.choice(negative_templates)
            description += template.format(keyword) + " "
    
    return description.strip()

# Generate product description
print("Negative product description: ")
description = generate_product_description(product_name, keywords, sentiments)

print(description)

Negative product description: 
The 57662    China Glaze Nail Polish, Wanderlust 1381
Name: title, dtype: object is a remarkable product. However, some users have reported issues with the  second coat since of the 57662    China Glaze Nail Polish, Wanderlust 1381
Name: title, dtype: object.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anjana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anjana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
