# Amazon Web Scraping Project

### Import Required Libraries

In [24]:
from bs4 import BeautifulSoup
import requests
import smtplib
import time
import datetime

### Import Headers and URL

In [29]:
# Request headers (a desktop Chrome user agent and related fields) and the search-results URL
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/116.0.0.0 Safari/537.36"
    ),
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html, application/xhtml+xml, application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}
# Amazon.ca electronics search for "ipad", split across lines for readability
URL = (
    "https://www.amazon.ca/s"
    "?k=ipad&i=electronics&crid=35Y80BUST4QDH"
    "&sprefix=ipad%2Celectronics%2C104&ref=nb_sb_noss_1"
)

### Parse the amazon webpage that lists all of the products
Note that for the purposes of this project, the search query "ipad" was used. The code below parses the initial results page, which lists the different iPad options available.

In [30]:
# Fetch the search-results page. The explicit timeout stops the cell from hanging
# indefinitely if the server never answers (requests applies no timeout by default).
webpage = requests.get(URL, headers=headers, timeout=30)
webpage

<Response [503]>

In [31]:
# Confirm the type of the raw response body before handing it to BeautifulSoup
type(webpage.content)

bytes

In [32]:
# Parse the raw response body with the "html.parser" backend.
# The resulting soup object holds the HTML data of the fetched webpage;
# printing it shows exactly what the server sent back.
soup = BeautifulSoup(webpage.content, "html.parser")
print(soup)


<!DOCTYPE html>
<html><head><meta charset="utf-8"/><meta content="ie=edge" http-equiv="x-ua-compatible"/><meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/><title>Amazon.ca Something Went Wrong / Quelque chose s'est mal passé</title><style>html,body{padding:0;margin:0}img{border:0}#a,#b{background:#232f3e;padding:11px;height:35px}#c{position:absolute;left:22px;top:12px}#e{position:relative;max-width:800px;padding:0 40px 0 171px}#f,#g{height:35px;border:0;font-size:1em}#f{width:100%;margin:0;padding:0 10px;border-radius:4px 0 0 4px}#g{cursor:pointer;background:#febd69;font-weight:bold;border-radius:0 4px 4px 0;-webkit-appearance:none;position:absolute;top:0;right:0;padding:0 12px}@media(max-width:500px){#e{padding-left:0}#b{padding:55px 10px 10px}#c{left:6px}}#h{text-align:center;margin:30px 0}#h img{max-width:90%}#d{display:none}#d[src]{display:inline}</style></head><body><form accept-charset="utf-8" action="/s" id="b" method="GET" role="search"><a h

In [22]:
# Fetch links as a list of tag objects: every <a> tag whose class attribute
# matches the string below (presumably the product links of the search
# results — verify against the live page markup).
# NOTE(review): an empty list means the parsed page contained no such links.
links = soup.find_all("a", attrs={'class':'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'})
links

[]

In [23]:
# Build the full URL of the second listing (index 1) found on the search-results page.
# Guard the lookup first: when the search request is blocked and no product links
# were parsed, a bare links[1] fails with an unhelpful "list index out of range".
if len(links) < 2:
    raise IndexError(
        f"Expected at least 2 product links but found {len(links)}; "
        "the search request was probably blocked - inspect `webpage` and retry."
    )
link = links[1].get('href')
product_list = "https://amazon.ca" + link
print(product_list)

IndexError: list index out of range

### Parse the webpage for a specific product
For the purposes of this project, that would be the second item of the available options on the webpage 

In [18]:
# Fetch the page of the selected product. The explicit timeout stops the cell from
# hanging indefinitely if the server never answers (requests applies no timeout by default).
product_webpage = requests.get(product_list, headers=headers, timeout=30)
product_webpage

<Response [200]>

In [19]:
# Soup object containing the HTML data of the product page,
# parsed with the "html.parser" backend
product_soup = BeautifulSoup(product_webpage.content, "html.parser")
product_soup

<!DOCTYPE html>

<!--[if lt IE 7]> <html lang="en-us" class="a-no-js a-lt-ie9 a-lt-ie8 a-lt-ie7"> <![endif]-->
<!--[if IE 7]>    <html lang="en-us" class="a-no-js a-lt-ie9 a-lt-ie8"> <![endif]-->
<!--[if IE 8]>    <html lang="en-us" class="a-no-js a-lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="a-no-js" lang="en-us"><!--<![endif]--><head>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<title dir="ltr">Amazon.ca</title>
<meta content="width=device-width" name="viewport"/>
<link href="https://images-na.ssl-images-amazon.com/images/G/01/AUIClients/AmazonUI-3c913031596ca78a3768f4e934b1cc02ce238101.secure.min._V1_.css" rel="stylesheet"/>
<script>

if (true === true) {
    var ue_t0 = (+ new Date()),
        ue_csm = window,
        ue = { t0: ue_t0, d: function() { return (+new Date() - ue_t0); } },
        ue_furl = "fls-na.amazon.ca",
        ue_mid = "A2EUQ1WTGCTBG2",

#### Get the Product Information
Please note: These requests no longer work due to Amazon requiring a captcha prompt. An alternate web scraping project is in the works. For all intents and purposes, I do not support illegal web scraping or data mining. This project was made as a learning opportunity, and I will work to find proper alternatives.

For the sake of continuity and a complete project, though, I have added hard-coded values so that the remaining steps (reading and writing the data to a CSV file) can still be demonstrated.

In [34]:

# Selectors that would extract these fields from the product page
# (disabled because the requests no longer work):
#title = product_soup.find("span", attrs={"id":"productTitle"}).get_text()
#price = product_soup.find("span",attrs = {"class":"a-price a-text-price a-size-medium"}).find("span", attrs={"class":"a-offscreen"}).get_text()
#rating = product_soup.find("span", attrs={"class":"a-size-base a-color-base"}).get_text()

# Hard-coded stand-ins for the scraped values
rating = "4.8"
price_dollars = "$579.98"
title_full = "Apple iPad (9th Generation): with A13 Bionic chip, 10.2-inch Retina Display, 256GB, Wi-Fi, 12MP front/8MP Back Camera, Touch ID, All-Day Battery Life – Silver"

# One value per line: title, price, rating
print(title_full, price_dollars, rating, sep="\n")


Apple iPad (9th Generation): with A13 Bionic chip, 10.2-inch Retina Display, 256GB, Wi-Fi, 12MP front/8MP Back Camera, Touch ID, All-Day Battery Life – Silver
$579.98
4.8


#### Clean the Imported Data
The following code would be required if the product_soup.find.get_text() requests worked.

In [43]:
# Tidy the raw strings: trim surrounding whitespace, then drop the first
# character of the price (the "$" sign) so only the number remains.
title = title_full.strip()
price = price_dollars.strip()[1:]
today = datetime.date.today()  # date stamp stored with the observation

# One value per line: price, title, date
print(price, title, today, sep="\n")

579.98
Apple iPad (9th Generation): with A13 Bionic chip, 10.2-inch Retina Display, 256GB, Wi-Fi, 12MP front/8MP Back Camera, Touch ID, All-Day Battery Life – Silver
2023-09-02


### Create the .csv file and write the scraped data to it

In [44]:
# Start a fresh .csv file: a header row naming the columns, followed by the
# first row of data gathered above
import csv

header = ['Title', 'Price', 'Rating', 'Date']
data = [title, price, rating, today]

# Mode 'w' replaces any existing file; newline='' leaves line endings to the csv module
with open('AmazonProductDataset.csv', 'w', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)
    writer.writerows([header, data])
    


In [54]:
# Load the .csv file into a pandas DataFrame to make it easier to read while running the notebook.
# The file is read from the same relative path it was written to, instead of an absolute
# path that only exists on one machine, so the notebook runs from any location.
import pandas as pd
df = pd.read_csv('AmazonProductDataset.csv')
df

Unnamed: 0,Title,Price,Rating,Date
0,Apple iPad (9th Generation): with A13 Bionic c...,579.98,4.8,2023-09-02


In [None]:
# Append the current observation (`data`) as one more row of the existing .csv file.
# Mode 'a+' adds to the end of the file instead of overwriting it; no header row is written here.

with open('AmazonProductDataset.csv', 'a+', newline='',encoding='UTF8') as f:
    writer = csv.writer(f)
    writer.writerow(data)

In [4]:
# Create a function to check the price; it repeats the fetch / clean / record steps written so far.
# If the price drops below 400, it calls send_mail() to email an alert to the user's inbox.
def check_price():
    """Run one price check: fetch the product page, alert on a low price, log the result.

    Reads the notebook-level globals ``product_list`` (product URL) and
    ``headers`` (request headers). The product values are currently
    hard-coded stand-ins because the live selectors are disabled.

    Side effects:
        * performs an HTTP GET of ``product_list``;
        * calls ``send_mail()`` when the price is below the alert threshold;
        * appends one row (title, price, rating, date) to
          'AmazonProductDataset.csv'.

    Raises:
        ValueError: if the cleaned price text cannot be converted to a number.
    """
    import csv  # kept local so the function does not rely on another cell's import

    price_alert_threshold = 400  # dollars; send_mail()'s subject line quotes the same figure

    # The timeout stops a daily check from hanging forever on an unresponsive server.
    page = requests.get(product_list, headers=headers, timeout=30)
    soup1 = BeautifulSoup(page.content, "html.parser")

    # Live selectors (disabled); they should read from the freshly parsed `soup1`:
    #title = soup1.find("span", attrs={"id":"productTitle"}).get_text()
    #price = soup1.find("span",attrs = {"class":"a-price a-text-price a-size-medium"}).find("span", attrs={"class":"a-offscreen"}).get_text()
    #rating = soup1.find("span", attrs={"class":"a-size-base a-color-base"}).get_text()
    title_new = "Apple iPad (9th Generation): with A13 Bionic chip, 10.2-inch Retina Display, 256GB, Wi-Fi, 12MP front/8MP Back Camera, Touch ID, All-Day Battery Life – Silver"
    price_new = "$579.98"
    rating_new = "4.8"

    # Trim whitespace and drop the leading "$" so only the number remains.
    price_ = price_new.strip()[1:]
    title_ = title_new.strip()

    # price_ is text, so convert it before comparing (str < int raises TypeError);
    # thousands separators are removed first so values like "1,299.00" parse too.
    if float(price_.replace(",", "")) < price_alert_threshold:
        send_mail()

    today_date = datetime.date.today()
    data_new = [title_, price_, rating_new, today_date]

    # Append-only: the header row is written once when the file is first created.
    with open('AmazonProductDataset.csv', 'a+', newline='', encoding='UTF8') as f:
        csv.writer(f).writerow(data_new)




In [5]:
# Poll once per day: record the product's title, price and rating, then wait
# 24 hours (86400 seconds) before checking again. The loop has no exit condition.
while True:
    check_price()
    time.sleep(24 * 60 * 60)


In [60]:
# Reload the .csv file to inspect the rows appended so far. The relative path matches
# the one the rows are written to, instead of an absolute path tied to one machine.
import pandas as pd
df_new = pd.read_csv('AmazonProductDataset.csv')
df_new

Unnamed: 0,Title,Price,Rating,Date
0,Apple iPad (9th Generation): with A13 Bionic c...,579.98,4.8,2023-09-02
1,Apple iPad (9th Generation): with A13 Bionic c...,579.98,4.8,2023-09-02
2,Apple iPad (9th Generation): with A13 Bionic c...,579.98,4.8,2023-09-02
3,Apple iPad (9th Generation): with A13 Bionic c...,579.98,4.8,2023-09-02
4,Apple iPad (9th Generation): with A13 Bionic c...,579.98,4.8,2023-09-02
5,Apple iPad (9th Generation): with A13 Bionic c...,579.98,4.8,2023-09-02
6,Apple iPad (9th Generation): with A13 Bionic c...,579.98,4.8,2023-09-02


In [28]:
def send_mail():
    """Email a price-drop alert through Gmail's SMTP-over-SSL endpoint.

    The message is sent from the account address to that same address.
    The account password is read from the ``GMAIL_APP_PASSWORD`` environment
    variable so that no credential is stored in the notebook.

    Raises:
        RuntimeError: if ``GMAIL_APP_PASSWORD`` is not set.
        smtplib.SMTPException: if the login or the delivery fails.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    address = 'akgujral@uwaterloo.ca'  # sender and recipient (the alert goes to the user's own inbox)
    password = os.environ.get('GMAIL_APP_PASSWORD')
    if not password:
        raise RuntimeError(
            "Set the GMAIL_APP_PASSWORD environment variable before sending alerts."
        )

    subject = "ALERT: The iPad you want is below $400! Now's your chance!"
    body = "Quick go get it right now at"
    msg = f"Subject: {subject}\n\n{body}"

    # Port 465 is TLS from the first byte, so no STARTTLS step is needed. The context
    # manager closes the connection even when login or delivery raises.
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
        server.login(address, password)
        # sendmail() takes (from_addr, to_addrs, msg); the recipient list is mandatory.
        server.sendmail(address, [address], msg)