In [None]:
'''
Title: Headphone Webscraping
Author: Aron Kim
Date: 12/17/24

This project aims to demonstrate web scraping skills using customer reviews for the well-known headphone brand Audio-Technica.
The primary goal is to perform sentiment analysis on these reviews to determine whether the product is worth purchasing.
Additionally, I plan to store the scraped data in a CSV file and implement an automated process to update the data monthly with new reviews.

'''

In [3]:
# Starting by importing libraries

import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
import random
import datetime
import time
import smtplib
import csv
import pandas as pd

In [4]:
# Connecting to the website and scraping the product details

URL = 'https://www.amazon.com/Audio-Technica-ATH-M50x-Professional-Monitor-Headphones/dp/B00HVLUR86/?_encoding=UTF8&pd_rd_w=dpbnC&content-id=amzn1.sym.ce7f3edf-012d-48bd-8dd4-138738c29cfa&pf_rd_p=ce7f3edf-012d-48bd-8dd4-138738c29cfa&pf_rd_r=QKTXT3Q7F487PKJYYNYM&pd_rd_wg=K8B1i&pd_rd_r=64cd525c-d896-492b-8093-c951fdaf21b3&ref_=pd_hp_d_btf_gcx_gw_per_1&th=1'

# Browser-like request headers.
# "br" (Brotli) is deliberately not advertised: requests only decodes Brotli
# responses when an optional Brotli package is installed, so asking for it risks
# receiving a compressed body that BeautifulSoup cannot read.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Referer": "https://www.amazon.com/",
    "DNT": "1",
    "Connection": "keep-alive"
}

time.sleep(random.uniform(2, 5))  # Random sleep between 2 and 5 seconds

# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops right here on an HTTP error instead of parsing an error page.
page = requests.get(URL, headers=headers, timeout=30)
page.raise_for_status()

soup1 = BeautifulSoup(page.content, 'html.parser')

# Organizing contents of soup1
soup2 = BeautifulSoup(soup1.prettify(), 'html.parser')


def _require_element(element, label):
    """Return `element`, or raise a readable error if the page did not contain it.

    `find()` returns None when nothing matches; without this check the failure
    would surface as "'NoneType' object has no attribute 'get_text'".
    """
    if element is None:
        raise ValueError(
            f"Could not find the {label} on the page; the layout may have changed "
            "or the request may have been blocked."
        )
    return element


# Scraping the website information
# The ids and classes below were found with the browser's element inspector on the product page

title = _require_element(soup2.find(id='title'), 'title').get_text()

price = _require_element(soup2.find(class_='a-price-whole'), 'price').get_text(strip=True)

rating = _require_element(soup2.find('span', class_='a-icon-alt'), 'rating').get_text(strip=True)

num_reviews = _require_element(soup2.find(id='acrCustomerReviewText'), 'number of reviews').get_text(strip=True)

returns = _require_element(soup2.find('a', id='creturns-policy-anchor-text'), 'returns policy').get_text(strip=True)

print(title)
print(price)
print(rating)
print(num_reviews)
print(returns)



             Audio-Technica ATH-M50x Professional Studio Monitor Headphones, Black
            

149.
4.7 out of 5 stars
27,251 ratings
FREE Returns


In [5]:
# Data cleaning

# The scraped price ends with its decimal point (e.g. "149."). rstrip('.') removes
# that dot only when it is present, so re-running this cell is harmless; slicing
# with [:-1] would cut off a digit on every extra run.
price = price.strip().rstrip('.')
title = title.strip()
rating = rating.strip()
returns = returns.strip()

print(title)
print(price)
print(rating)
print(returns)

Audio-Technica ATH-M50x Professional Studio Monitor Headphones, Black
149
4.7 out of 5 stars
FREE Returns


In [6]:
# Recording the date on which the data was scraped

today = datetime.datetime.now().date()
print(today)

2025-02-06


In [7]:
# Saving the scraped record to a new CSV file

header = ['Title', 'Price', 'Date', 'Rating', 'Returns']
data = [title, price, today, rating, returns]

# Mode 'w' creates (or overwrites) the file; newline='' leaves line endings to the
# csv module so that no blank rows appear between records.
# The header row is written first, followed by the single data row.
with open('TechnicaWebScraping.csv', mode='w', newline='', encoding='UTF8') as csv_file:
    csv.writer(csv_file).writerows([header, data])

In [8]:
# Using the pandas library to read the CSV file back in and check what was stored

# Read the same relative path that the CSV is written to, rather than a
# user-specific absolute path, so the notebook works from any machine or folder.
df = pd.read_csv('TechnicaWebScraping.csv')
print(df)

                                               Title  Price        Date  \
0  Audio-Technica ATH-M50x Professional Studio Mo...    149  2025-02-06   

               Rating       Returns  
0  4.7 out of 5 stars  FREE Returns  


In [9]:
#Creating a function that collects the data I want automatically each time it is called
#It simply wraps the scraping code that I already wrote above

def _scraped_text(element, label, strip=True):
    """Return the text of a parsed element, raising a clear error if it is missing.

    `find()` returns None when nothing matches; without this check the failure
    would surface as "'NoneType' object has no attribute 'get_text'".
    """
    if element is None:
        raise ValueError(
            f"Could not find the {label} on the page; the layout may have changed "
            "or the request may have been blocked."
        )
    return element.get_text(strip=strip)


def check_price():
    """Scrape the product page and append one dated row to TechnicaWebScraping.csv.

    The row holds the title, price, today's date, rating and returns text.
    The file is opened in append mode so that earlier records are kept; the
    header row is written only when the file is missing or empty.

    Raises:
        requests.exceptions.RequestException: the request failed, timed out, or
            returned an HTTP error status.
        ValueError: one of the expected elements was not found on the page.
    """
    import os  # local import: only needed for the header check below

    URL = 'https://www.amazon.com/Audio-Technica-ATH-M50x-Professional-Monitor-Headphones/dp/B00HVLUR86/?_encoding=UTF8&pd_rd_w=dpbnC&content-id=amzn1.sym.ce7f3edf-012d-48bd-8dd4-138738c29cfa&pf_rd_p=ce7f3edf-012d-48bd-8dd4-138738c29cfa&pf_rd_r=QKTXT3Q7F487PKJYYNYM&pd_rd_wg=K8B1i&pd_rd_r=64cd525c-d896-492b-8093-c951fdaf21b3&ref_=pd_hp_d_btf_gcx_gw_per_1&th=1'

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", "Accept-Language": "en-US,en;q=0.9"}

    # The timeout stops an unattended run from hanging forever; raise_for_status()
    # fails fast on an HTTP error instead of parsing an error page.
    page = requests.get(URL, headers=headers, timeout=30)
    page.raise_for_status()

    soup1 = BeautifulSoup(page.content, 'html.parser')

    soup2 = BeautifulSoup(soup1.prettify(), 'html.parser')

    title = _scraped_text(soup2.find(id='title'), 'title', strip=False).strip()

    # The whole-number price is scraped with its trailing decimal point; rstrip('.')
    # removes it only when present, so a price without the dot keeps every digit.
    price = _scraped_text(soup2.find(class_='a-price-whole'), 'price').rstrip('.')

    rating = _scraped_text(soup2.find('span', class_='a-icon-alt'), 'rating')

    returns = _scraped_text(soup2.find('a', id='creturns-policy-anchor-text'), 'returns policy')

    today = datetime.date.today()

    header = ['Title', 'Price', 'Date', 'Rating', 'Returns']
    data = [title, price, today, rating, returns]

    csv_path = 'TechnicaWebScraping.csv'
    needs_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0

    # Append ('a') rather than write ('w') so each run adds a row instead of
    # wiping out the history collected by earlier runs.
    with open(csv_path, 'a', newline='', encoding='UTF8') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(header)
        writer.writerow(data)

In [None]:
# This adds a new price record every month.
# NOTE: the loop never ends on its own, so no cell below this one runs until the
# kernel is interrupted (Kernel > Interrupt).

SECONDS_PER_MONTH = 60 * 60 * 24 * 30   # Multiplying to find the seconds in a 30-day month

try:
    while True:
        try:
            check_price()
        except Exception as error:
            # A single failed scrape should not end the monitoring for good;
            # report it and try again at the next scheduled run.
            print(f"{datetime.date.today()}: price check failed: {error}")
        time.sleep(SECONDS_PER_MONTH)
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop; finish the cell
    # cleanly instead of leaving a traceback.
    print("Price monitoring stopped.")

In [None]:
#From this point on, I will create a sentiment score for the customer reviews.
#The goal is to find a score that tells me whether I should buy the product or not.

In [None]:
# Data cleaning comes first:
# pull the numeric score out of the rating text (e.g. "4.7 out of 5 stars")

import re

# First integer or decimal number in the text; None when the text has no number
rating_match = re.search(r'\d+(\.\d+)?', rating)
rating_value = float(rating_match.group()) if rating_match else None

rating_value

In [None]:
# Checking each scraped value for NaN before deciding whether anything has to be
# filled in with the mean
# Nothing needs filling in because there are no NaNs

for scraped_value in (rating, price, returns):
    print(pd.isna(scraped_value))

In [None]:
# Converting the review count text (e.g. "27,251 ratings") to an integer:
# everything that is not a digit is removed before the conversion

num_reviews_cleaned = re.compile(r'[^\d]').sub('', num_reviews)
num_reviews_asint = int(num_reviews_cleaned)
num_reviews_asint

In [None]:
# Building the sentiment score starts with its raw material:
# the review comments left by customers on the page

review_comments = soup2.find_all('li', attrs={'data-hook': 'review'})

for review_item in review_comments:
    review_body = review_item.find('span', attrs={'class': 'review-text'})
    if review_body is None:
        # This list item has no review-text span, so there is nothing to print
        continue
    print(review_body.get_text(strip=True))

In [None]:
#Using textblob to do a sentiment text analysis
#I am finding the sentiment scores for each review left by the customers
#The score ranges from -1 to 1, where 0 means neutral, x > 0 means positive and x < 0 means negative
#The scores will tell me whether I should buy the product or not

!pip install textblob
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

sentiment_scores = []


for comment in review_comments:
    review_text = comment.find('span', {'class': 'review-text'})
    
    if review_text:
        review_text_clean = review_text.get_text(strip=True)

        sentiment_score = TextBlob(review_text_clean).sentiment.polarity
        print(f"Sentiment Score: {sentiment_score}\n")
        
        sentiment_scores.append(sentiment_score)

if sentiment_scores:
    avg_sentiment_score = sum(sentiment_scores) / len(sentiment_scores)
    print(f"Average Sentiment Score: {avg_sentiment_score}")