# RevieWiz: Summarize product reviews on Amazon

Uses web scraping and OpenAI APIs


In [1]:
!pip install PyPDF2 langchain openai
!pip install typing-extensions --upgrade
!pip install fpdf
!pip install clean-text
!pip install pypdf
!pip install unidecode



# Section 1 of 2: Web Scraping

In [2]:
import requests #for calling a URL
from bs4 import BeautifulSoup #for web scraping

In [3]:
url = input("Enter the URL of the webpage you want to scrape: ")

Enter the URL of the webpage you want to scrape: https://www.amazon.com/Tasiso-Necklace-Sparkling-Stacking-Minimalist/dp/B0B14L4TMR/ref=sr_1_8?crid=24HXHUJFOCC0M&dib=eyJ2IjoiMSJ9.cbkjRvjnX92VoqH4P3S4ITuDECtU-VGRRc4YZDTF6N3EGdHuyQkAEf0sbsnshqiIkCv3eacXBTl2eeQNoqO6r9JNwnUP8QbC38baRSfqKgIIegkrykwZmaTtArjcGIHg6U1kuptIQu0Zo_it4Dmjt3zl2Is5NS02YAURXmG5XYmtujC-g4FsGF2C3qCzG0niy6cUu4GIiWfTy2kOCx92lpNoC4Ez7N6LDfxcWc0cj3dXuZ8odJETemGFcZFOh5t-KX450DGwunSR81bZunTNLxigkMWDUaDct40Y5ZJVmwA.lIBDElyWopgGU-roaYFrMn91-_CEirmJA70s5Eb6Y6o&dib_tag=se&keywords=necklace&qid=1723139562&sprefix=neckalc%2Caps%2C412&sr=8-8


In [4]:
page = requests.get(url)

In [5]:
soup = BeautifulSoup(page.content,'html.parser') # to get the html code(obtained using page.content) in a readable format
# print(soup)

In [6]:
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
}

In [7]:
review_details="" # to store the all the relevant details of each review as a string
review_elements = soup.select("div.review") # this div section contains reviews
print(review_elements)

[<div class="a-section review aok-relative" data-hook="review" id="R29DY3UN6123XF"><div class="a-row a-spacing-none" id="R29DY3UN6123XF-review-card"><div class="a-section celwidget" id="customer_review-R29DY3UN6123XF"><div class="a-row a-spacing-mini" data-hook="genome-widget"><a class="a-profile" data-a-size="small" href="/gp/profile/amzn1.account.AHZLAECFB6JPQ7H7I67ZUVZWOPKA"><div aria-hidden="true" class="a-profile-avatar-wrapper"><div class="a-profile-avatar"><img class="a-lazy-loaded" data-src="https://images-na.ssl-images-amazon.com/images/S/amazon-avatars-global/9d6ec3e4-22a7-4981-abca-853a4629ac48._CR0,0,375,375_SX48_.jpg" src="https://images-na.ssl-images-amazon.com/images/G/01/x-locale/common/grey-pixel.gif"/><noscript><img src="https://images-na.ssl-images-amazon.com/images/S/amazon-avatars-global/9d6ec3e4-22a7-4981-abca-853a4629ac48._CR0,0,375,375_SX48_.jpg"/></noscript></div></div><div class="a-profile-content"><span class="a-profile-name">Dijuana Banks-Stephens</span></di

In [8]:
title_element = soup.find('span', {'id': 'productTitle'})
if title_element:
    product_name = title_element.get_text(strip=True)

In [9]:
num = 1 # to keep track of the number of reviews

review_details += "Product Name: "+product_name+"\n"

for scrap in review_elements:
  review_details += "Review_num_"+str(num)+"\n"
  num = num+1

  rating_element = scrap.select_one("i.review-rating") # stores the review

  title_element = scrap.select_one("a.review-title") # stores the title of the review

  content_element = scrap.select_one("span.review-text") # stores the rating of the review

  if rating_element:
    rating_text = rating_element.get_text(strip=True)
    review_details += "Rating out of 5: "+ rating_text+"\n"
  if title_element:
    title_text = title_element.get_text(strip=True)
    review_details += "Title: "+ title_text+"\n"
  if content_element:
    content_text = content_element.get_text(strip=True)
    review_details += "Review: "+ content_text+"\n"


In [10]:
print(review_details)

Product Name: Tasiso 14K Gold Filled Herringbone Choker Necklace Set Double Layer Snake Chain Herringbone Chain Necklace Layering Necklace Set Cuban Chain Necklace for Women
Review_num_1
Rating out of 5: 5.0 out of 5 stars
Title: 5.0 out of 5 starsClassic 💋💯
Review: Let me say, The herringbone is a chain that never goes out of style. Its stylish and its elegant shine enhances your outfit! The necklace is strong and sturdy. The clasp is Well made. It does not fade. I wore this chain in the shower, after a night of 🍹. Oops 😬 It’s not affected by the rain. Nor, does the sun’s rays, or The body’s perspiration change a thing! I love the width. It’s not bulky or heavy on the neck. I would definitely be purchasing from this merchant again. Whaaaat a gurl must have plenty of options🤣😂 #LayersRead more
Review_num_2
Rating out of 5: 5.0 out of 5 stars
Title: 5.0 out of 5 starsGreat inexpensive costume jewelry
Review: This necklace is super cute! Dainty and adds just enough pizazz to take your ou

# Converting the text to a pdf
Here, we're converting the text to a pdf and also storing the reviews in a .txt file ("text.txt") to make it easier for the chatbot to read.

In [11]:
from fpdf import *
from pypdf import PdfReader
from pathlib import *

In [12]:
pdf = FPDF()

In [13]:
pdf_location = Path.home()/"/content/amazon_product_reviews_pdf.pdf" # path of the file where the pdf is to be stored

In [14]:
!pip install --upgrade fpdf



In [15]:
from cleantext import clean
clean_reviews = clean(review_details, no_emoji=True) # removing all emojis to prevent UnicodeEncodeError when we add the reviews to a pdf

extracted_text = "text.txt" # to store review_details in a .txt file

In [17]:
import re

def replace_unicode_characters(text):
    # Dictionary for replacing common unicode characters with their equivalents
    replacements = {
        '\U0001f379': '[cocktail]',  # Example: Replacing cocktail emoji with text
    }

    def replace_match(match):
        char = match.group(0)
        return replacements.get(char, '')

    clean_text = re.sub(r'[^\x00-\xFF]', replace_match, text) # replacing characters not supported by latin-1
    return clean_text

clean_reviews = replace_unicode_characters(review_details)

# CREATING AND WRITING TO A PDF:

# Creating a new PDF document
pdf = FPDF()

# Adding a page
pdf.add_page()

pdf.set_font("Arial", size=8)
pdf.cell(200, 10, txt="Amazon Product Reviews", ln=1, align="C")

# Adding the cleaned reviews text to the PDF
pdf.multi_cell(0, 10, txt=clean_reviews, align="L")

# Saving the pdf with name and path
pdf.output("reviews_pdf.pdf")

with open(extracted_text, "w") as file:
    file.write(clean_reviews)

# Section 2 of 2: Creating embeddings and reading from the PDF for the chatbot to answer questions

### Import necessary libraries

In [19]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from openai import OpenAI
import os
import json
from numpy import dot

In [20]:
EXTRACTED_TEXT_FILE_PATH = "text.txt" # text extracted from pdf
EXTRACTED_JSON_PATH = "extracted.json" # snippets and embeddings
os.environ['OPENAI_API_KEY'] = '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'
# this key has been disabled XD

OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-4o-mini"
CHUNK_SIZE = 700 # 700 was a resonable assumption- chunks were created up to 600
CHUNK_OVERLAP = 200 # to create overlap between snippets
CONFIDENCE_SCORE = 0.75 # specify confidence score to filter search results. [0,1] prefered: 0.75

In [21]:
def create_embeddings(file_path: str):

    # Initializing the OpenAI Client and setting the OpenAI API key
    client = OpenAI()
    client.api_key = OPENAI_API_KEY

    snippets = [] # to store text snippets
    text_splitter = CharacterTextSplitter(separator="\n",
                                         chunk_size=CHUNK_SIZE,
                                         chunk_overlap=CHUNK_OVERLAP,
                                         length_function=len)

    with open(file_path, "r", encoding="utf-8") as file: # reading the file
      file_text = file.read()

    # Splitting the text into snippets
    snippets = text_splitter.split_text(file_text)

    # Requesting embeddings for the snippets using the specified model
    response = client.embeddings.create(input=snippets,model=EMBEDDING_MODEL)

    # Extracting embeddings from the response
    embedding_list = [response_object.embedding for response_object in response.data]

    # Creating a JSON object that stores the embeddings and text chunks
    embedding_json = {
        'embeddings': embedding_list,
        'snippets': snippets
    }

    # Converting the data in the JSON object to a formatted JSON string
    json_object = json.dumps(embedding_json, indent=4)

    with open(EXTRACTED_JSON_PATH, 'w', encoding="utf-8") as file:
        file.write(json_object)

In [22]:
def get_embeddings():
    with open(EXTRACTED_JSON_PATH,'r') as file:
        embedding_json = json.load(file) # loading the JSON data into a Python dictionary

    # Returning the embeddings and text chunks from the loaded JSON
    return embedding_json['embeddings'], embedding_json['snippets']

In [23]:
def user_question_embedding_creator(question):
    client = OpenAI()
    response = client.embeddings.create(input=question,model=EMBEDDING_MODEL)
    return response.data[0].embedding # extracting and returning the embedding from the chatbot's response

In [24]:
def answer_users_question(user_question):

    try:
        # Creating an embedding for the user's question
        user_question_embedding = user_question_embedding_creator(user_question)
    except Exception as e:
        return "Oops! An error occurred while creatign embeddings"

    cosine_similarities = [] # Used here to calculate the similarities in embeddings of the document and the user's question
    for embedding in embeddings:
        cosine_similarities.append(dot(user_question_embedding,embedding))

    # Pairing snippets with their respective cosine similarities and sorting them by similarity
    scored_snippets = zip(snippets, cosine_similarities)
    sorted_snippets = sorted(scored_snippets, key=lambda x: x[1], reverse=True)

    all_results = [snipps for snipps, _score in sorted_snippets]

    pdf_description = "This pdf contains all the reviews for a given product on amazon. The purpose of this pdf is for the chatbot to learn what the reviews are, what component of the product has a positive review or a negative review."

    # This is a description of the pdf for the chatbot
    chatbot_system = f"As an AI assistant, you have access to SEARCH RESULTS extracted from a PDF document. "
    f"The PDF is described as: {pdf_description}. "
    f"Your task is to provide the most relevant answer to the user's question based on the SEARCH RESULTS provided. "
    f"Both the SEARCH RESULTS and the USER'S QUESTION are enclosed in triple backticks (```). "
    f"If the information is not available or the question is irrelevant, respond with: 'Sorry, I can't help with that. This is not something the reviews answer. '"

    # Creating a prompt
    prompt = f"""\
    SEARCH RESULTS:
    ```
    {all_results}
    ```
    USER'S QUESTION:
    ```
    {user_question}
    ```
    """

    # Prepare the chat conversation and use GPT model for generating a response
    messages = [{'role':'system', 'content':chatbot_system},
                {'role':'user', 'content':prompt}]

    try:
        client = OpenAI()
        completion = client.chat.completions.create(model=GPT_MODEL,
                                             messages=messages,
                                             temperature=0,
                                             stream=False)
    except Exception as e:
        return f"Oops! An error occurred with the chatbot."

    return completion.choices[0].message.content # contains the chatbot's response

---

# Executing and running the chatbot

In [27]:
PDF_FILE_PATH = "reviews_pdf.pdf"

create_embeddings(EXTRACTED_TEXT_FILE_PATH) # commented because the embeddings have already been created once

embeddings, snippets = get_embeddings() # storing embeddings for the chatbot

## Final Output: Chatbot
To exit leave user input blank and hit return

In [28]:
while True:

    # The user gives an input
    print("USER:")
    user_question = input("")

    if user_question =="": # If the user enters an empty question, the conversation ends (the loop breaks)
        break
    else:
        print("RevieWiz:")

        # Calling answer_users_question() to generate an answer based on the user's question and printing the RevieWiz's response
        print(answer_users_question(user_question=user_question))
        print("----------------------")

USER:
what is the name of the product?
RevieWiz:
The name of the product is "Tasiso 14K Gold Filled Herringbone Choker Necklace Set Double Layer Snake Chain Herringbone Chain Necklace Layering Necklace Set Cuban Chain Necklace for Women."
----------------------
USER:
is the necklace sturdy?
RevieWiz:
Yes, several reviews indicate that the necklace is sturdy. For example, one review mentions that "the necklace is strong and sturdy," and another review highlights that it is "not so dainty that you feel it will break." Overall, the feedback suggests that the necklace is well-made and durable.
----------------------
USER:
does the necklace tarnish easily?
RevieWiz:
Based on the reviews, the necklace does not seem to tarnish easily. One reviewer specifically mentioned that the necklace "does not fade" and remained unaffected by water, rain, or perspiration. Another review highlighted that the necklace looks shiny and has a nice quality, indicating that it maintains its appearance well. Howe