## Data Preparation

In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from textblob import TextBlob
import csv
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Appliances", trust_remote_code=True)
print(dataset["full"][0])
print(type(dataset))

{'rating': 5.0, 'title': 'Work great', 'text': 'work great. use a new one every month', 'images': [], 'asin': 'B01N0TQ0OH', 'parent_asin': 'B01N0TQ0OH', 'user_id': 'AGKHLEW2SOWHNMFQIJGBECAF7INQ', 'timestamp': 1519317108692, 'helpful_vote': 0, 'verified_purchase': True}
<class 'datasets.dataset_dict.DatasetDict'>


In [3]:
file = pd.DataFrame(columns=['rating', 'title', 'text', 'asin', 'parent_asin', 'user_id', 'helpful_vote', 'verified_purchase'])

In [4]:
file["title"] = [i["title"] for i in dataset["full"]]
file["text"] = [i["text"] for i in dataset["full"]]
file["asin"] = [i["asin"] for i in dataset["full"]]
file["parent_asin"] = [i["parent_asin"] for i in dataset["full"]]
file["user_id"] = [i["user_id"] for i in dataset["full"]]
file["helpful_vote"] = [i["helpful_vote"] for i in dataset["full"]]
file["verified_purchase"] = [i["verified_purchase"] for i in dataset["full"]]

In [5]:
file.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase
0,,Work great,work great. use a new one every month,B01N0TQ0OH,B01N0TQ0OH,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True
1,,excellent product,Little on the thin side,B07DD2DMXB,B07DD37QPZ,AHWWLSPCJMALVHDDVSUGICL6RUCA,0,True
2,,Happy customer!,"Quick delivery, fixed the issue!",B082W3Z9YK,B082W3Z9YK,AHZIJGKEWRTAEOZ673G5B3SNXEGQ,0,True
3,,Amazing value,I wasn't sure whether these were worth it or n...,B078W2BJY8,B078W2BJY8,AFGUPTDFAWOHHL4LZDV27ERDNOYQ,0,True
4,,Dryer parts,Easy to install got the product expected to re...,B08C9LPCQV,B08C9LPCQV,AELFJFAXQERUSMTXJQ6SYFFRDWMA,0,True


In [6]:
file.to_csv("Amazon_Reviews_2023.csv", index = False)

## Data Preprocessing

In [7]:
data = pd.read_csv("Amazon_Reviews_2023.csv")
data = data.loc[:10000, :]
data.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase
0,,Work great,work great. use a new one every month,B01N0TQ0OH,B01N0TQ0OH,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True
1,,excellent product,Little on the thin side,B07DD2DMXB,B07DD37QPZ,AHWWLSPCJMALVHDDVSUGICL6RUCA,0,True
2,,Happy customer!,"Quick delivery, fixed the issue!",B082W3Z9YK,B082W3Z9YK,AHZIJGKEWRTAEOZ673G5B3SNXEGQ,0,True
3,,Amazing value,I wasn't sure whether these were worth it or n...,B078W2BJY8,B078W2BJY8,AFGUPTDFAWOHHL4LZDV27ERDNOYQ,0,True
4,,Dryer parts,Easy to install got the product expected to re...,B08C9LPCQV,B08C9LPCQV,AELFJFAXQERUSMTXJQ6SYFFRDWMA,0,True


In [8]:
def clean(text):
    text = re.sub('[^A-Za-z]+', ' ', str(text))
    return text

data['cleaned reviews'] = data['text'].apply(clean)
data.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,cleaned reviews
0,,Work great,work great. use a new one every month,B01N0TQ0OH,B01N0TQ0OH,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,work great use a new one every month
1,,excellent product,Little on the thin side,B07DD2DMXB,B07DD37QPZ,AHWWLSPCJMALVHDDVSUGICL6RUCA,0,True,Little on the thin side
2,,Happy customer!,"Quick delivery, fixed the issue!",B082W3Z9YK,B082W3Z9YK,AHZIJGKEWRTAEOZ673G5B3SNXEGQ,0,True,Quick delivery fixed the issue
3,,Amazing value,I wasn't sure whether these were worth it or n...,B078W2BJY8,B078W2BJY8,AFGUPTDFAWOHHL4LZDV27ERDNOYQ,0,True,I wasn t sure whether these were worth it or n...
4,,Dryer parts,Easy to install got the product expected to re...,B08C9LPCQV,B08C9LPCQV,AELFJFAXQERUSMTXJQ6SYFFRDWMA,0,True,Easy to install got the product expected to re...


In [9]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet

pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}

def token_stop_pos(text):
    tags = pos_tag(word_tokenize(text))
    newlist = []
    for word, tag in tags:
        if word.lower() not in set(stopwords.words('english')):
            newlist.append(tuple([word, pos_dict.get(tag[0])]))
    return newlist

data['POS tagged'] = data['cleaned reviews'].apply(token_stop_pos)
data.head()

[nltk_data] Downloading package punkt to /home/rohit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/rohit/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/rohit/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/rohit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/rohit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,cleaned reviews,POS tagged
0,,Work great,work great. use a new one every month,B01N0TQ0OH,B01N0TQ0OH,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,work great use a new one every month,"[(work, n), (great, a), (use, v), (new, a), (o..."
1,,excellent product,Little on the thin side,B07DD2DMXB,B07DD37QPZ,AHWWLSPCJMALVHDDVSUGICL6RUCA,0,True,Little on the thin side,"[(Little, a), (thin, a), (side, n)]"
2,,Happy customer!,"Quick delivery, fixed the issue!",B082W3Z9YK,B082W3Z9YK,AHZIJGKEWRTAEOZ673G5B3SNXEGQ,0,True,Quick delivery fixed the issue,"[(Quick, a), (delivery, n), (fixed, v), (issue..."
3,,Amazing value,I wasn't sure whether these were worth it or n...,B078W2BJY8,B078W2BJY8,AFGUPTDFAWOHHL4LZDV27ERDNOYQ,0,True,I wasn t sure whether these were worth it or n...,"[(sure, a), (whether, None), (worth, a), (give..."
4,,Dryer parts,Easy to install got the product expected to re...,B08C9LPCQV,B08C9LPCQV,AELFJFAXQERUSMTXJQ6SYFFRDWMA,0,True,Easy to install got the product expected to re...,"[(Easy, n), (install, v), (got, v), (product, ..."


In [10]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

def lemmatize(pos_data):
    lemma_rew = " "
    for word, pos in pos_data:
        if not pos:
            lemma = word
            lemma_rew = lemma_rew + " " + lemma
        else:
            lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
            lemma_rew = lemma_rew + " " + lemma
    return lemma_rew

data['Lemma'] = data['POS tagged'].apply(lemmatize)
data.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,cleaned reviews,POS tagged,Lemma
0,,Work great,work great. use a new one every month,B01N0TQ0OH,B01N0TQ0OH,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,work great use a new one every month,"[(work, n), (great, a), (use, v), (new, a), (o...",work great use new one every month
1,,excellent product,Little on the thin side,B07DD2DMXB,B07DD37QPZ,AHWWLSPCJMALVHDDVSUGICL6RUCA,0,True,Little on the thin side,"[(Little, a), (thin, a), (side, n)]",Little thin side
2,,Happy customer!,"Quick delivery, fixed the issue!",B082W3Z9YK,B082W3Z9YK,AHZIJGKEWRTAEOZ673G5B3SNXEGQ,0,True,Quick delivery fixed the issue,"[(Quick, a), (delivery, n), (fixed, v), (issue...",Quick delivery fix issue
3,,Amazing value,I wasn't sure whether these were worth it or n...,B078W2BJY8,B078W2BJY8,AFGUPTDFAWOHHL4LZDV27ERDNOYQ,0,True,I wasn t sure whether these were worth it or n...,"[(sure, a), (whether, None), (worth, a), (give...",sure whether worth give cost compare origina...
4,,Dryer parts,Easy to install got the product expected to re...,B08C9LPCQV,B08C9LPCQV,AELFJFAXQERUSMTXJQ6SYFFRDWMA,0,True,Easy to install got the product expected to re...,"[(Easy, n), (install, v), (got, v), (product, ...",Easy install get product expect receive


In [11]:
from textblob import TextBlob

def getSubjectivity(review):
    return TextBlob(review).sentiment.subjectivity
    
def getPolarity(review):
    return TextBlob(review).sentiment.polarity

def analysis(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

In [12]:
data['Polarity'] = data['Lemma'].apply(getPolarity)
data['Analysis'] = data['Polarity'].apply(analysis)
data.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,cleaned reviews,POS tagged,Lemma,Polarity,Analysis
0,,Work great,work great. use a new one every month,B01N0TQ0OH,B01N0TQ0OH,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,work great use a new one every month,"[(work, n), (great, a), (use, v), (new, a), (o...",work great use new one every month,0.468182,Positive
1,,excellent product,Little on the thin side,B07DD2DMXB,B07DD37QPZ,AHWWLSPCJMALVHDDVSUGICL6RUCA,0,True,Little on the thin side,"[(Little, a), (thin, a), (side, n)]",Little thin side,-0.29375,Negative
2,,Happy customer!,"Quick delivery, fixed the issue!",B082W3Z9YK,B082W3Z9YK,AHZIJGKEWRTAEOZ673G5B3SNXEGQ,0,True,Quick delivery fixed the issue,"[(Quick, a), (delivery, n), (fixed, v), (issue...",Quick delivery fix issue,0.333333,Positive
3,,Amazing value,I wasn't sure whether these were worth it or n...,B078W2BJY8,B078W2BJY8,AFGUPTDFAWOHHL4LZDV27ERDNOYQ,0,True,I wasn t sure whether these were worth it or n...,"[(sure, a), (whether, None), (worth, a), (give...",sure whether worth give cost compare origina...,0.51875,Positive
4,,Dryer parts,Easy to install got the product expected to re...,B08C9LPCQV,B08C9LPCQV,AELFJFAXQERUSMTXJQ6SYFFRDWMA,0,True,Easy to install got the product expected to re...,"[(Easy, n), (install, v), (got, v), (product, ...",Easy install get product expect receive,0.433333,Positive


In [13]:
data['Analysis'].value_counts()

Analysis
Positive    8149
Neutral     1021
Negative     831
Name: count, dtype: int64

In [14]:

from sentence_transformers import SentenceTransformer, util

# Load Sentence Transformer Model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute Embeddings for Lemmatized Reviews
data['review_embedding'] = data['Lemma'].apply(lambda x: model.encode(x, convert_to_tensor=True))


2024-12-01 21:20:27.649267: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [15]:
# Example User Context
user_context = "Looking for a durable jacket for cold weather"
user_embedding = model.encode(user_context, convert_to_tensor=True)

# Calculate Similarity Scores
data['similarity_score'] = data['review_embedding'].apply(lambda x: util.pytorch_cos_sim(user_embedding, x).item())

# Sort Data by Similarity Score
data_sorted = data.sort_values(by='similarity_score', ascending=False)

In [16]:
# Display Top Matches
print("\nTop Matching Reviews for User Context:")
print(data_sorted[['Lemma', 'similarity_score']].head())


Top Matching Reviews for User Context:
                                                  Lemma  similarity_score
2145                             Decent product durable          0.428468
3420    Material quite thin know weather sunlight ef...          0.418373
8395     job flap tad flimsy Since protect weather okay          0.399216
9829                           love keep cold air dryer          0.397104
8125    Works great Temperature stay even issue area...          0.382547


In [17]:
from collections import Counter
# Extract user preferences based on positive reviews
def extract_keywords(data, num_keywords=10):
    all_words = " ".join(data['Lemma']).split()
    stopwords_list = set(nltk.corpus.stopwords.words('english'))
    filtered_words = [word for word in all_words if word.lower() not in stopwords_list]
    most_common = Counter(filtered_words).most_common(num_keywords)
    keywords = [word for word, _ in most_common]
    return ", ".join(keywords)

# Filter positive reviews for user context
positive_reviews = data[data['Analysis'] == 'Positive']
user_preferences = extract_keywords(positive_reviews)
user_context = f"User preferences include: {user_preferences}."
print("Extracted User Context:")
print(user_context)

Extracted User Context:
User preferences include: br, use, filter, work, one, coffee, make, get, great, water.


In [18]:
from transformers import pipeline

# Load Text Generation Model
# generator = pipeline("text-generation", model="gpt-2")
# Use a pipeline as a high-level helper

generator = pipeline("text-generation", model="openai-community/gpt2")

# Generate Personalized Product Descriptions
def generate_description(user_context, product_details):
    input_prompt = f"Context: {user_context}. Product: {product_details}. Write a personalized product description:"
    result = generator(input_prompt, max_length=50, num_return_sequences=1)
    return result

# Example Product Details
product_details = "This is a lightweight, durable jacket made of waterproof material."

# Generate and Display Personalized Description
personalized_description = generate_description(user_context, product_details)
print("\nPersonalized Product Description:")
print(personalized_description)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Personalized Product Description:
[{'generated_text': 'Context: User preferences include: br, use, filter, work, one, coffee, make, get, great, water.. Product: This is a lightweight, durable jacket made of waterproof material.. Write a personalized product description: this is a simple'}]


In [19]:
# from transformers import pipeline

# # Use distilgpt2 for text generation
# generator = pipeline("text-generation", model="distilgpt2")

# # Generate personalized product description
# context = "Looking for a durable jacket for cold weather."
# product = "This jacket is lightweight, waterproof, durable and designed for extreme conditions."

# input_prompt = f"Context: {context}. Product: {product}. Write a personalized product description:"
# result = generator(input_prompt, max_length=50, num_return_sequences=1)

# print("Generated Description:")
# print(result)

In [20]:
# Summarize product features from the 'text' column
def summarize_product_details(data):
    return ". ".join(data['text'].dropna()[:5])  # Combine first 5 non-null product descriptions

product_details = summarize_product_details(data)
print("\nExtracted Product Details:")
print(product_details)


Extracted Product Details:
work great. use a new one every month. Little on the thin side. Quick delivery, fixed the issue!. I wasn't sure whether these were worth it or not, given the cost compared to the original branded filters.<br /><br />I can happily report that these are a great value and work every bit as good as the original. If you are on the fence worrying whether these are worth it- I can assure you they are.. Easy to install got the product expected to receive


In [21]:
# Load SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute embeddings for lemmatized reviews
data['review_embedding'] = data['Lemma'].apply(lambda x: model.encode(x, convert_to_tensor=True))

# Example User Context (use extracted preferences or manually define)
user_embedding = model.encode(user_context, convert_to_tensor=True)

# Calculate similarity scores
data['similarity_score'] = data['review_embedding'].apply(lambda x: util.pytorch_cos_sim(user_embedding, x).item())

# Sort data by similarity scores
data_sorted = data.sort_values(by='similarity_score', ascending=False)

# Display top matches
print("\nTop Matching Reviews for User Context:")
print(data_sorted[['Lemma', 'similarity_score']].head())


Top Matching Reviews for User Context:
                                                  Lemma  similarity_score
1142    Okay read review still buy br br B Keurig ma...          0.593335
267     excited use cold brew since love many OXO pr...          0.589313
960     Every Care product use good product particul...          0.582313
2282    thing make water taste well believer br br l...          0.580809
8244    filter let water quickly part secret good po...          0.579286


In [22]:
 #Load text generation model
generator = pipeline("text-generation", model="distilgpt2")

# Construct the prompt for the LLM
input_prompt = f"Context: {user_context}. Product: {product_details}. Task: Write a detailed, personalized product description."

# Generate the description
result = generator(input_prompt, max_length=150, truncation=True, num_return_sequences=1)

# Display generated description
print("\nGenerated Product Description:")
print(result[0]['generated_text'])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Generated Product Description:
Context: User preferences include: br, use, filter, work, one, coffee, make, get, great, water.. Product: work great. use a new one every month. Little on the thin side. Quick delivery, fixed the issue!. I wasn't sure whether these were worth it or not, given the cost compared to the original branded filters.<br /><br />I can happily report that these are a great value and work every bit as good as the original. If you are on the fence worrying whether these are worth it- I can assure you they are.. Easy to install got the product expected to receive. Task: Write a detailed, personalized product description. Use my product information and send it to me. I will


In [34]:
from transformers import pipeline

# Example cleaned and refined user preferences and product details
user_preferences = "durable, easy to use, reliable, affordable"
product_details = "This filter works well, is easy to install, and provides great value compared to the original branded filters."

# Load text generation model
generator = pipeline("text-generation", model="EleutherAI/gpt-neo-1.3B")

product_details = 'Jacker with extra lining and thermal layer, it is rugged, and waterproof'
# Construct a more refined prompt
input_prompt = f"Write a detailed product description based on the following user preferences: {user_preferences}. Product details: {product_details}. Be sure to highlight the benefits of the product."

# Generate the description
result = generator(input_prompt, max_length=150, num_return_sequences=1, temperature=0.7)

# Display generated description
print("\nGenerated Product Description:")
print(result[0]['generated_text'])


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Generated Product Description:
Write a detailed product description based on the following user preferences: durable, easy to use, reliable, affordable. Product details: Jacker with extra lining and thermal layer, it is rugged, and waterproof. Be sure to highlight the benefits of the product.

The Jacker is a water resistant, rugged, and durable waterproof jacket. It is easy to use and durable. The Jacker is an excellent product for the job. It is very well made and has a good price. Be sure to highlight the benefits of the product.

This product is one of the best waterproof jackets on the market, and we are in love with it! The Jacker is a great product for the job. It is very well made and has a
