# Web Scraping using Beautiful Soup

### Importing Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re

### Part 1 Functions

In [2]:
# Function to extract Product Name
def get_name(soup):

    try:
        title_string=soup.find("span",attrs={"id":"productTitle"}).text.strip()

    except AttributeError:
        title_string = ""

    return title_string

# Function to extract Product Price
def get_price(soup):

    try:
        
        price=soup.find("span",attrs={"class":"a-offscreen"}).text

    except AttributeError:

        price = ""

    return price

# Function to extract Product Rating
def get_rating(soup):

    try:
        rating = soup.find("i", attrs={'class':'a-icon a-icon-star a-star-4-5'}).string.strip()
    
    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class':'a-icon-alt'}).string.strip()
        except:
            rating = ""	

    return rating

# Function to extract Number of User Reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={'id':'acrCustomerReviewText'}).string.strip()

    except AttributeError:
        review_count = ""	

    return review_count


### Part 2 Functions

In [3]:
#Function to extract Product Description
def get_product_description(soup):

    try:
        description=soup.find("div",attrs={"id":"feature-bullets"}).text.strip()

    except AttributeError:
        description = ""

    return description

#Function to extract Manufacturer
def get_manuf(soup):

    try:
        manuf=soup.find("div",attrs={"class":"a-section a-spacing-medium brand-snapshot-flex-row"}).text.strip()

    except AttributeError:
        manuf= ""

    return manuf



### Main Function

In [4]:
if __name__ == '__main__':

    # adding user agent 
    HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57', 'Accept-Language': 'en-US, en;q=0.5'})

    # adding webpage URL
    URL = "https://www.amazon.in/s?k=bags&crid=2M096C61O4MLT&qid=1653308124&sprefix=ba%2Caps%2C283&ref=sr_pg_1"
    
    # Creating URL_list for 20 URL's of product pages
    URL_List=[URL]
    for i in range(19):
        Pre_URL=URL_List[i]
        webpage = requests.get(Pre_URL, headers=HEADERS)
        soup = BeautifulSoup(webpage.content, "html.parser")
        URL2= 'https://www.amazon.in'+soup.find("a", attrs={'class':"s-pagination-item s-pagination-next s-pagination-button s-pagination-separator"}).get('href')
        URL_List.append(URL2)
        
    #Creating Dictionary    
    d = {"Product URL":[],"ASIN":[],"Manufacturer":[], "Product Name":[],"Product Description":[], "Product Price":[], "Rating":[],"Number of reviews":[]}
    
    
    #For each url 
    for urlx in URL_List:
        #printing url for which data will been scraped
        print(urlx)
        
        # HTTP Request
        webpage = requests.get(urlx, headers=HEADERS)
        
        # Soup Object containing all data
        soup = BeautifulSoup(webpage.content, "html.parser")
        
        # Fetch links as List of Tag Objects
        links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

        # Store the links
        links_list = []

        # Loop for extracting links from Tag Objects
        for link in links:
            links_list.append(link.get('href'))

    
        # Loop for extracting product details from each link 
        for link in links_list:
            try:
                new_webpage = requests.get("https://www.amazon.in" + link, headers=HEADERS)
            except:
                continue

            new_soup = BeautifulSoup(new_webpage.content, "html.parser")
            
            # Finding asin using Regular Expression 
            asin=re.search(r'[dg]p%2F([^%2F]+)',link, flags=re.IGNORECASE)
            if(asin==None):
                asin = re.search(r'[dg]p%2F([^%]+)', link, flags=re.IGNORECASE)
            if(asin):
                d["ASIN"].append(asin.group(1))
            else:
                d["ASIN"].append("")
            
            # Finiding Product URL
            d["Product URL"].append("https://www.amazon.in"+link)

            # Function calls to display all necessary product information
            d["Manufacturer"].append(get_manuf(new_soup))
            d["Product Description"].append(get_product_description(new_soup))
            d['Product Name'].append(get_name(new_soup))
            d['Product Price'].append(get_price(new_soup))
            d['Rating'].append(get_rating(new_soup))
            d['Number of reviews'].append(get_review_count(new_soup))
    
    
    amazon_df = pd.DataFrame.from_dict(d)
    

https://www.amazon.in/s?k=bags&crid=2M096C61O4MLT&qid=1653308124&sprefix=ba%2Caps%2C283&ref=sr_pg_1
https://www.amazon.in/s?k=bags&page=2&crid=2M096C61O4MLT&qid=1685196078&sprefix=ba%2Caps%2C283&ref=sr_pg_1
https://www.amazon.in/s?k=bags&page=3&crid=2M096C61O4MLT&qid=1685196080&sprefix=ba%2Caps%2C283&ref=sr_pg_2
https://www.amazon.in/s?k=bags&page=4&crid=2M096C61O4MLT&qid=1685196081&sprefix=ba%2Caps%2C283&ref=sr_pg_3
https://www.amazon.in/s?k=bags&page=5&crid=2M096C61O4MLT&qid=1685196083&sprefix=ba%2Caps%2C283&ref=sr_pg_4
https://www.amazon.in/s?k=bags&page=6&crid=2M096C61O4MLT&qid=1685196084&sprefix=ba%2Caps%2C283&ref=sr_pg_5
https://www.amazon.in/s?k=bags&page=7&crid=2M096C61O4MLT&qid=1685196085&sprefix=ba%2Caps%2C283&ref=sr_pg_6
https://www.amazon.in/s?k=bags&page=8&crid=2M096C61O4MLT&qid=1685196087&sprefix=ba%2Caps%2C283&ref=sr_pg_7
https://www.amazon.in/s?k=bags&page=9&crid=2M096C61O4MLT&qid=1685196088&sprefix=ba%2Caps%2C283&ref=sr_pg_8
https://www.amazon.in/s?k=bags&page=10&crid=

### Checking the Extracted Data

In [7]:
amazon_df

Unnamed: 0,Product URL,ASIN,Manufacturer,Product Name,Product Description,Product Price,Rating,Number of reviews
0,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,B09VGNTDRZ,Red Lemon,Red Lemon Unisex-adult Bange Series Rhombus Sh...,Closure: Zipper PREMIUM MATERIAL: This styl...,"₹1,999.00",4.4 out of 5 stars,73 ratings
1,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,B0B1,Red Lemon,Red Lemon BANGE Multifunctional Waterproof Ant...,➧ PREMIUM MATERIAL: This multifunctional sling...,"₹1,899.00",4.3 out of 5 stars,146 ratings
2,https://www.amazon.in/Wesley-Milestone-Waterpr...,,Wesley,Wesley Milestone 2.0 Casual Waterproof Laptop ...,30L Capacity: The Backpack has a padded laptop...,₹598.00,4.3 out of 5 stars,"10,968 ratings"
3,https://www.amazon.in/American-Tourister-AMT-S...,,American Tourister,American Tourister 32 Ltrs Black Casual Backpa...,"Laptop Compatibility: No, Strap Type: Adjustab...","₹1,199.00",4.0 out of 5 stars,"54,253 ratings"
4,https://www.amazon.in/Skybags-Brat-Black-Casua...,,Skybags,Skybags Brat Black 46 Cms Casual Backpack,Combination of functional & safety features in...,₹630,4.1 out of 5 stars,"4,570 ratings"
...,...,...,...,...,...,...,...,...
429,https://www.amazon.in/Chris-Kate-Polyester-Lig...,,,Chris & Kate Blue 50 litres Foldable Duffle Ba...,<br>Dimensions: 28 cm x 55 cm x 29 cm<br> <...,₹999.00,4.0 out of 5 stars,328 ratings
430,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,B08T1769X9,MOKOBARA,"MOKOBARA Backpack 15.5"" Inch Laptop Backpack F...",CRAFTED WITH INDUSTRY BEST PREMIUM MATERIALS: ...,"₹4,499",4.3 out of 5 stars,168 ratings
431,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,B0BWNCV31,uppercase,uppercase Medium 17 Ltrs Vegan Leather (upto 1...,Care Instructions: Wipe with Damp Cloth Han...,"₹1,600.00",4.1 out of 5 stars,443 ratings
432,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,B07GX3PWNK,Kenneth Cole Reaction,Kenneth Cole Reaction Women's Chevron Quilted ...,polyester Imported polyester lining 1...,"₹3,275.00",4.8 out of 5 stars,237 ratings


### Exporting the Data to csv File

In [6]:
amazon_df.to_csv("amazon_data.csv", header=True, index=False)