<a href="https://colab.research.google.com/github/adam-bozman/hypothetical-sabbatical/blob/main/WebScrapingAmazonDataBS4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Web Scraping Amazon Data with Python

## Imports

In [172]:
from bs4 import BeautifulSoup
import requests
import lxml
import csv
import pandas as pd
import urllib.parse

##Initializing the Program

####HTTP Request

In [103]:
#To avoid a 503 error, headers is required (exclusive to Amazon)
HEADERS = ({'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
            AppleWebKit/537.36 (KHTML, like Gecko) \
            Chrome/90.0.4430.212 Safari/537.36',
            'Accept-Language': 'en-US, en;q=0.5'})

website = 'https://www.amazon.com/s?k=timberland+boots+for+men&crid=DPMBRVEUI1YS&sprefix=timberland%2Caps%2C283&ref=nb_sb_ss_ts-doa-p_1_10'

#### Get Request

In [104]:
response = requests.get(website, headers = HEADERS)

#### Status Code

In [105]:
response.status_code

200

##Soup Object

In [106]:
soup = BeautifulSoup(response.content, 'html.parser')

##Results

In [107]:
results = soup.find_all('div', {'data-component-type':'s-search-result'})

In [108]:
len(results)

57

In [109]:
results[1]

<div class="sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col s-widget-spacing-small sg-col-4-of-20" data-asin="B000VX04F2" data-component-type="s-search-result" data-index="3" data-uuid="7341b256-6f2f-460d-b934-5284d6183245"><div class="sg-col-inner">
<div cel_widget_id="MAIN-SEARCH_RESULTS-3" class="s-widget-container s-spacing-small s-widget-container-height-small celwidget slot=MAIN template=SEARCH_RESULTS widgetId=search-results_2">
<div class="s-card-container s-overflow-hidden s-expand-height s-include-content-margin s-latency-cf-section s-card-border"><div class="a-section a-spacing-base"><div class="s-product-image-container aok-relative s-image-overlay-grey s-text-center s-padding-left-small s-padding-right-small s-spacing-small s-height-equalized"><span class="rush-component" data-component-type="s-product-image"><a class="a-link-normal s-no-outline" href="/Timberland-Classic-Premium-Black-Nubuck/dp/B000VX04F2/ref=sr_1_2?crid=DPMBRVEUI1YS&amp;keywords=timberland+boot

##Target Data

In [None]:
# Product Name
# Price
# Review Rating
# Review Count
# Product Link

####Name

In [110]:
results[0].find('span', {'class':'a-size-base-plus a-color-base a-text-normal'}).get_text()

"Timberland Men's White Ledge Mid Waterproof Hiking Boot"

####Price

In [117]:
results[0].find('span', {'class':'a-offscreen'}).get_text()

'$90.12'

####Review Rating

In [151]:
results[0].find('span', {'class':'a-icon-alt'}).get_text()

'4.5 out of 5 stars'

####Review Count

In [152]:
results[0].find('span', {'class':'a-size-base a-color-base s-underline-text'}).get_text()

'43,664'

####Relative URL

In [154]:
relative_url = results[0].find('a', {'class':'a-link-normal s-no-outline'}).get('href')

In [155]:
relative_url

'/Timberland-White-Ledge-Waterproof-Brown/dp/B000VX6Y2O/ref=sr_1_1?crid=DPMBRVEUI1YS&keywords=timberland+boots+for+men&qid=1643597021&sprefix=timberland%2Caps%2C283&sr=8-1'

In [156]:
root_url = 'https://www.amazon.com/'

In [157]:
#URLs combined
url_combined = root_url + relative_url

In [158]:
url_combined

'https://www.amazon.com//Timberland-White-Ledge-Waterproof-Brown/dp/B000VX6Y2O/ref=sr_1_1?crid=DPMBRVEUI1YS&keywords=timberland+boots+for+men&qid=1643597021&sprefix=timberland%2Caps%2C283&sr=8-1'

##Create a For Loop

In [168]:
product_name = []
product_price = []
review_rating = []
review_count = []
relative_url = []

for result in results:
    
    # name
    try:
        product_name.append(result.find('span', {'class':'a-size-base-plus a-color-base a-text-normal'}).get_text()) 
    except:
        product_name.append('n/a')
    
    # price
    try:
        product_price.append(result.find('span', {'class':'a-offscreen'}).get_text())
    except:
        product_price.append('n/a')
    
    # review rating
    try:
        review_rating.append(result.find('span', {'class':'a-icon-alt'}).get_text())
    except:
        review_rating.append('n/a')
        
    # review count
    try:
        review_count.append(result.find('span', {'class':'a-size-base a-color-base s-underline-text'}).get_text())
    except:
        review_count.append('n/a')
    
    # relative URL
    try:
        relative_url.append(result.find('a', {'class':'a-link-normal s-no-outline'}).get('href'))
    except:
        relative_url.append('n/a')

###Combine URLs

In [169]:
url_combined = []

for link in relative_url:
    url_combined.append(urllib.parse.urljoin(root_url, link))

##Create Pandas Dataframe

In [170]:
product_overview = pd.DataFrame({'Name': product_name, 'Price':product_price, 'Rating':review_rating,
                                'Review Count': review_count, 'Link': url_combined})

In [171]:
product_overview

Unnamed: 0,Name,Price,Rating,Review Count,Link
0,Timberland Men's White Ledge Mid Waterproof Hi...,$90.12,4.5 out of 5 stars,43664,https://www.amazon.com/Timberland-White-Ledge-...
1,Timberland Men's 6-Inch Premium Waterproof Boot,,4.6 out of 5 stars,14408,https://www.amazon.com/Timberland-Classic-Prem...
2,Timberland PRO Men's Pit Boss 6 Inch Steel Saf...,$102.69,4.4 out of 5 stars,15520,https://www.amazon.com/Timberland-PRO-Pitboss-...
3,Timberland Men's Ankle Chukka Boots,$246.07,4.6 out of 5 stars,2696,https://www.amazon.com/Timberland-Ankle-Chukka...
4,Timberland Men's Anti-Fatigue Hiking Waterproo...,$99.95,4.6 out of 5 stars,7222,https://www.amazon.com/Timberland-Mens-Maddsen...
5,Timberland PRO Men's Pit Boss 6 Inch Soft Toe ...,$98.56,4.5 out of 5 stars,7497,https://www.amazon.com/Timberland-PRO-Pitboss-...
6,"Timberland Men's Earthkeepers Rugged 6"" Boot",$184.95,4.5 out of 5 stars,3665,https://www.amazon.com/Timberland-Earthkeepers...
7,"Timberland Men's Earthkeepers 6"" Boot",$150.00,4.5 out of 5 stars,3971,https://www.amazon.com/Timberland-Earthkeepers...
8,Timberland mens White Ledge Mid Waterproof,$89.95,4.6 out of 5 stars,3361,https://www.amazon.com/Timberland-White-Ledge-...
9,"Timberland PRO Men's 26011 Direct Attach 8"" So...",$134.95,4.6 out of 5 stars,3716,https://www.amazon.com/Timberland-PRO-Direct-A...


##Output in Excel

In [174]:
product_overview.to_csv('Timberland_.csv', index=False)

##Pagination (All Pages)

In [194]:
product_name = []
product_price = []
review_rating = []
review_count = []
relative_url = []

for i in range (1,8):
    #To avoid a 503 error, headers is required (exclusive to Amazon)
    HEADERS = ({'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
                AppleWebKit/537.36 (KHTML, like Gecko) \
                Chrome/90.0.4430.212 Safari/537.36',
                'Accept-Language': 'en-US, en;q=0.5'})
    # website in variable
    website = 'https://www.amazon.com/s?k=timberland+boots+for+men&crid=DPMBRVEUI1YS&sprefix=timberland%2Caps%2C283&ref=nb_sb_ss_ts-doa-p_' + str(i)
    
    # request
    response = requests.get(website, headers = HEADERS)
    
    # soup object
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # results
    results = soup.find_all('div', {'data-component-type':'s-search-result'})
    
    # loop through results
    for result in results:

            # name
            try:
                product_name.append(result.find('span', {'class':'a-size-base-plus a-color-base a-text-normal'}).get_text()) 
            except:
                product_name.append('n/a')
            
            # price
            try:
                product_price.append(result.find('span', {'class':'a-offscreen'}).get_text())
            except:
                product_price.append('n/a')
            
            # review rating
            try:
                review_rating.append(result.find('span', {'class':'a-icon-alt'}).get_text())
            except:
                review_rating.append('n/a')
                
            # review count
            try:
                review_count.append(result.find('span', {'class':'a-size-base a-color-base s-underline-text'}).get_text())
            except:
                review_count.append('n/a')
            
            # relative URL
            try:
                relative_url.append(result.find('a', {'class':'a-link-normal s-no-outline'}).get('href'))
            except:
                relative_url.append('n/a')

In [196]:
url_combined = []

for link in relative_url:
    url_combined.append(urllib.parse.urljoin(root_url, link))

In [197]:
product_overview = pd.DataFrame({'Name':product_name, 'Price':product_price, 'Rating':review_rating,
                                'Review Count':review_count, 'Link':url_combined})

In [198]:
product_overview

Unnamed: 0,Name,Price,Rating,Review Count,Link
0,Timberland Men's White Ledge Mid Waterproof Hi...,$90.12,4.5 out of 5 stars,43664,https://www.amazon.com/Timberland-White-Ledge-...
1,Timberland PRO Men's Pit Boss 6 Inch Steel Saf...,$102.69,4.4 out of 5 stars,15520,https://www.amazon.com/Timberland-PRO-Pitboss-...
2,Timberland Men's 6-Inch Premium Waterproof Boot,,4.6 out of 5 stars,14408,https://www.amazon.com/Timberland-Classic-Prem...
3,Timberland PRO Men's Pit Boss 6 Inch Soft Toe ...,$98.56,4.5 out of 5 stars,7497,https://www.amazon.com/Timberland-PRO-Pitboss-...
4,Timberland Men's Ankle Chukka Boots,$246.07,4.6 out of 5 stars,2696,https://www.amazon.com/Timberland-Ankle-Chukka...
...,...,...,...,...,...
395,Vostey Men's Motorcycle Boots Business Casual ...,$42.49,4.3 out of 5 stars,3145,https://www.amazon.com/gp/slredirect/picassoRe...
396,Golden Fox Men's Leather Wellington Farm & Con...,$109.00,4.2 out of 5 stars,126,https://www.amazon.com/gp/slredirect/picassoRe...
397,Mens Leather Waterproof Snow Boots，Ankle Boots...,$59.99,4.1 out of 5 stars,4,https://www.amazon.com/gp/slredirect/picassoRe...
398,Men's Snow Boots Chukka Casual Boot - Hiking W...,$49.99,4.1 out of 5 stars,136,https://www.amazon.com/gp/slredirect/picassoRe...


##Excel Output

In [199]:
product_overview.to_csv('Timberland_Full.csv', index=False)