In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import time
from datetime import datetime
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

%matplotlib inline

#### Creating a User-Agent

Many websites block requests that appear to come from automated scripts. To fetch a page from a script, we therefore send a User-Agent header with the request. The User-Agent is a string that tells the server what kind of client (for example, which browser and operating system) is sending the request.

You can choose user agents from the following [website](https://developers.whatismybrowser.com/).

In [3]:
# Headers sent along with each HTTP request.
# 'User-Agent' presents the script as a desktop Chrome browser, and
# 'Accept-Language' tells the server that English (US) content is preferred.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}

#### Sending a Request to a URL
A webpage is accessed by its URL (Uniform Resource Locator). With the help of the URL, we will send the request to the webpage for accessing its data.

In [4]:
# Product page to download.
URL = "https://www.amazon.com/Sony-PlayStation-Pro-1TB-Console-4/dp/B07K14XKZH/"

# Send the GET request with the browser-like headers. Without a timeout,
# requests waits indefinitely on a stalled connection and the kernel hangs;
# with one, a requests.exceptions.Timeout is raised after 10 seconds instead.
webpage = requests.get(URL, headers=HEADERS, timeout=10)

#### Creating a Soup of Information

The `webpage` variable contains the response received from the website. We pass the content of the response and the type of parser to the Beautiful Soup constructor.

In [5]:
# Parse the raw response body into a navigable tree using the lxml parser.
soup = BeautifulSoup(markup=webpage.content, features="lxml")

`lxml` is a high-speed parser employed by Beautiful Soup to break down the HTML page into complex Python objects. Generally, there are four kinds of Python Objects obtained:

- `Tag` - It corresponds to HTML or XML tags, which include names and attributes.
- `NavigableString` - It corresponds to the text stored within a tag.
- `BeautifulSoup` - In fact, the entire parsed document.
- `Comment` - Finally, a special type of `NavigableString` that holds the comments embedded in the HTML page.

#### Extracting Product Title

In [7]:
# The <span id="productTitle"> element itself (a Tag object)
title = soup.find("span", id="productTitle")

# The text stored directly inside that tag (a NavigableString object)
title_value = title.string

# Show the type of each object, one per line
print(type(title), type(title_value), sep="\n")

<class 'bs4.element.Tag'>
<class 'bs4.element.NavigableString'>


In [8]:
# Raw NavigableString, still padded with the whitespace found in the page
print(title_value)

# Trim the surrounding whitespace, then show the resulting type and value
title_string = title_value.strip()
print(type(title_string), title_string, sep="\n")

        Sony PlayStation 4 Pro 1TB Console - Black (PS4 Pro)       
<class 'str'>
Sony PlayStation 4 Pro 1TB Console - Black (PS4 Pro)


#### Extracting Product Information

- `The Title of the Product`
- `The Price of the Product`
- `The Rating of the Product`
- `Number of Customer Reviews`
- `Product Availability`

In [13]:
from bs4 import BeautifulSoup
import requests

# Function to extract Product Title
def get_title(soup):
	"""Return the product title from a parsed product page.

	Looks up the ``<span id="productTitle">`` element and returns its
	``.string`` with surrounding whitespace stripped.

	Args:
		soup: Parsed page exposing ``find(name, attrs=...)``.

	Returns:
		The stripped title, or ``""`` when the lookup raises
		``AttributeError`` (for example the span, or its ``.string``, is None).
	"""
	try:
		return soup.find("span", attrs={"id": 'productTitle'}).string.strip()
	except AttributeError:
		return ""

# Function to extract Product Price
def get_price(soup):
	"""Return the offer price text from a parsed product page.

	Searches for a ``<span>`` whose class attribute is
	``a-size-base a-color-price offer-price a-text-normal`` and returns its
	``.string`` with surrounding whitespace stripped.

	Args:
		soup: Parsed page exposing ``find(name, attrs=...)``.

	Returns:
		The stripped price text, or ``""`` when the lookup raises
		``AttributeError`` (for example no such span on the page).
	"""
	offer_price_class = 'a-size-base a-color-price offer-price a-text-normal'
	try:
		price_tag = soup.find("span", attrs={'class': offer_price_class})
		price_text = price_tag.string
		return price_text.strip()
	except AttributeError:
		return ""

# Function to extract Product Rating
def get_rating(soup):
	"""Return the star-rating text from a parsed product page.

	Two lookups are tried in order: first the ``<i>`` element whose class is
	``a-icon a-icon-star a-star-4-5``, then the ``<span>`` whose class is
	``a-icon-alt``. The first one that yields a string is returned with
	surrounding whitespace stripped.

	Only ``AttributeError`` (a missing tag or a ``.string`` of None) counts
	as "not found". Any other exception, including ``KeyboardInterrupt``,
	propagates to the caller, so a long scraping loop can still be interrupted.

	Args:
		soup: Parsed page exposing ``find(name, attrs=...)``.

	Returns:
		The stripped rating text, or ``""`` when neither lookup succeeds.
	"""
	lookups = (
		("i", 'a-icon a-icon-star a-star-4-5'),
		("span", 'a-icon-alt'),
	)
	for tag_name, css_class in lookups:
		try:
			return soup.find(tag_name, attrs={'class': css_class}).string.strip()
		except AttributeError:
			# Tag missing or without a single string: try the next lookup.
			continue
	return ""

# Function to extract Number of User Reviews
def get_review_count(soup):
	"""Return the customer-review count text from a parsed product page.

	Looks up the ``<span id="acrCustomerReviewText">`` element and returns
	its ``.string`` with surrounding whitespace stripped.

	Args:
		soup: Parsed page exposing ``find(name, attrs=...)``.

	Returns:
		The stripped review-count text, or ``""`` when the lookup raises
		``AttributeError`` (for example the span is absent).
	"""
	try:
		count_tag = soup.find("span", attrs={'id': 'acrCustomerReviewText'})
		return count_tag.string.strip()
	except AttributeError:
		return ""

# Function to extract Availability Status
def get_availability(soup):
	"""Return the availability text from a parsed product page.

	Finds the ``<div id="availability">`` element, takes the first
	``<span>`` inside it and returns that span's ``.string`` with
	surrounding whitespace stripped.

	Args:
		soup: Parsed page exposing ``find(name, attrs=...)``.

	Returns:
		The stripped availability text, or ``""`` when any step raises
		``AttributeError`` (for example the div or its span is absent).
	"""
	try:
		availability_div = soup.find("div", attrs={'id': 'availability'})
		return availability_div.find("span").string.strip()
	except AttributeError:
		return ""

# Headers for request: present the script as a desktop browser and state
# that English (US) content is preferred.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}

# The webpage URL
URL = "https://www.amazon.com/Sony-PlayStation-Pro-1TB-Console-4/dp/B07K14XKZH/"

# HTTP Request. Without a timeout, requests waits indefinitely on a stalled
# connection and the kernel hangs; with one, a requests.exceptions.Timeout
# is raised after 10 seconds instead.
webpage = requests.get(URL, headers=HEADERS, timeout=10)

# Soup Object containing all data
soup = BeautifulSoup(webpage.content, "lxml")

# Function calls to display all necessary product information
print("Product Title =", get_title(soup))
print("Product Price =", get_price(soup))
print("Product Rating =", get_rating(soup))
print("Number of Product Reviews =", get_review_count(soup))
print("Availability =", get_availability(soup))

Product Title = Sony PlayStation 4 Pro 1TB Console - Black (PS4 Pro)
Product Price = $541.99
Product Rating = 4.6 out of 5 stars
Number of Product Reviews = 4,080 ratings
Availability = In Stock.


#### Fetching Links from an Amazon Search Result Webpage

In [6]:
# Search-results page to download.
URL = "https://www.amazon.com/s?k=playstation+4&ref=nb_sb_noss_2"

# Present the script as a desktop browser and state that English (US)
# content is preferred.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}

# Without a timeout, requests waits indefinitely on a stalled connection and
# the kernel hangs; with one, a requests.exceptions.Timeout is raised after
# 10 seconds instead.
webpage = requests.get(URL, headers=HEADERS, timeout=10)
soup = BeautifulSoup(webpage.content, 'lxml')

In [7]:
# Class attribute carried by the anchors we want from the results page
result_link_class = 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'

# Every matching <a> element, returned as a list of Tag objects
links = soup.find_all("a", attrs={'class': result_link_class})

In [10]:
# Collect the href attribute of every anchor Tag found on the results page
links_list = [anchor.get('href') for anchor in links]

In [15]:
# Column-oriented store: one list per field, all filled in the same order so
# that position i in every list describes the same product.
amazon_product = {
    'product_title': [],
    'product_price': [],
    'product_rating': [],
    'no_of_reviews': [],
    'get_availability': []
}

# Loop for extracting product details from each link
for link in links_list:

    # The hrefs are assumed to be site-relative, hence the host prefix.
    # Without a timeout, one stalled product page would block the whole loop;
    # with one, a requests.exceptions.Timeout is raised after 10 seconds.
    new_webpage = requests.get("https://www.amazon.com" + link, headers=HEADERS, timeout=10)
    new_soup = BeautifulSoup(new_webpage.content, "lxml")

    # Each helper returns "" when its field is missing, so every list grows
    # by exactly one entry per product.
    amazon_product['product_title'].append(get_title(new_soup))
    amazon_product['product_price'].append(get_price(new_soup))
    amazon_product['product_rating'].append(get_rating(new_soup))
    amazon_product['no_of_reviews'].append(get_review_count(new_soup))
    amazon_product['get_availability'].append(get_availability(new_soup))

In [21]:
# One row per scraped product, one column per key of amazon_product
df = pd.DataFrame(amazon_product)
df.head()

Unnamed: 0,product_title,product_price,product_rating,no_of_reviews,get_availability
0,Sony PlayStation 4 Camera,,4.7 out of 5 stars,"6,585 ratings",
1,PlayStation 4 500GB Console (Renewed),,4.2 out of 5 stars,188 ratings,Only 8 left in stock - order soon.
2,PlayStation 4 Slim 1TB Console (Renewed),,4.4 out of 5 stars,783 ratings,Only 8 left in stock - order soon.
3,Case Club Waterproof Playstation 4 Portable Ga...,,4.4 out of 5 stars,431 ratings,In Stock.
4,BlueFire Stereo Gaming Headset for Playstation...,,4.4 out of 5 stars,"8,118 ratings",In Stock.


In [22]:
# (number of rows, number of columns) of the scraped product table
df.shape

(30, 5)

In [31]:
# product_price holds strings such as "$57.99". Comparing them directly is
# character by character, so "$57.99" would rank above "$541.99".
# Keep only digits and the decimal point, then compare as numbers; prices
# that were not found ("") become NaN and are ignored by max().
price_numeric = pd.to_numeric(
    df['product_price'].str.replace(r'[^0-9.]', '', regex=True),
    errors='coerce',
)

# Row(s) carrying the highest numeric price
df[price_numeric == price_numeric.max()]

Unnamed: 0,product_title,product_price,product_rating,no_of_reviews,get_availability
21,DualShock 4 Wireless Controller for PlayStatio...,$57.99,4.6 out of 5 stars,"135,000 ratings",Only 10 left in stock - order soon.
