# Web Scraping using Beautiful Soup

### Importing Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re

### Part 1 Functions

In [2]:
# Function to extract Product Name
def get_name(soup):

    try:
        title_string=soup.find("span",attrs={"id":"productTitle"}).text.strip()

    except AttributeError:
        title_string = ""

    return title_string

# Function to extract Product Price
def get_price(soup):

    try:
        
        price=soup.find("span",attrs={"class":"a-offscreen"}).text

    except AttributeError:

        price = ""

    return price

# Function to extract Product Rating
def get_rating(soup):

    try:
        rating = soup.find("i", attrs={'class':'a-icon a-icon-star a-star-4-5'}).string.strip()
    
    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class':'a-icon-alt'}).string.strip()
        except:
            rating = ""	

    return rating

# Function to extract Number of User Reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={'id':'acrCustomerReviewText'}).string.strip()

    except AttributeError:
        review_count = ""	

    return review_count


### Part 2 Functions

In [3]:
#Function to extract Product Description
def get_product_description(soup):

    try:
        description=soup.find("div",attrs={"id":"feature-bullets"}).text.strip()

    except AttributeError:
        description = ""

    return description

#Function to extract Manufacturer
def get_manuf(soup):

    try:
        manuf=soup.find("div",attrs={"class":"a-section a-spacing-medium brand-snapshot-flex-row"}).text.strip()

    except AttributeError:
        manuf= ""

    return manuf



### Main Function

In [10]:
if __name__ == '__main__':

    # adding user agent 
    HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'})

    # adding webpage URL
    URL = "https://www.amazon.in/s?k=bags&crid=2M096C61O4MLT&qid=1653308124&sprefix=ba%2Caps%2C283&ref=sr_pg_1"
    
    # Creating URL_list for 20 URL's of product pages
    URL_List=[URL]
    for i in range(19):
        Pre_URL=URL_List[i]
        webpage = requests.get(Pre_URL, headers=HEADERS)
        soup = BeautifulSoup(webpage.content, "html.parser")
        URL2= 'https://www.amazon.in'+soup.find("a", attrs={'class':"s-pagination-item s-pagination-next s-pagination-button s-pagination-separator"}).get('href')
        URL_List.append(URL2)
        
    #Creating Dictionary    
    d = {"Product URL":[],"ASIN":[],"Manufacturer":[], "Product Name":[],"Product Description":[], "Product Price":[], "Rating":[],"Number of reviews":[]}
    
    
    #For each url 
    for urlx in URL_List:
        #printing url for which data will been scraped
        print(urlx)
        
        # HTTP Request
        webpage = requests.get(urlx, headers=HEADERS)
        
        # Soup Object containing all data
        soup = BeautifulSoup(webpage.content, "html.parser")
        
        # Fetch links as List of Tag Objects
        links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

        # Store the links
        links_list = []

        # Loop for extracting links from Tag Objects
        for link in links:
            links_list.append(link.get('href'))

    
        # Loop for extracting product details from each link 
        for link in links_list:
            try:
                new_webpage = requests.get("https://www.amazon.in" + link, headers=HEADERS)
            except:
                continue

            new_soup = BeautifulSoup(new_webpage.content, "html.parser")
            
            # Finding asin using Regular Expression 
            asin=re.search(r'[dg]p%2F([^%2F]+)',link, flags=re.IGNORECASE)
            if asin==None or len(asin.group(1))!=10:
                asin = re.search(r'[dg]p%2F([^%]+)', link, flags=re.IGNORECASE)
            if(asin):
                d["ASIN"].append(asin.group(1))
            else:
                d["ASIN"].append("")
            
            # Finiding Product URL
            d["Product URL"].append("https://www.amazon.in"+link)

            # Function calls to display all necessary product information
            d["Manufacturer"].append(get_manuf(new_soup))
            d["Product Description"].append(get_product_description(new_soup))
            d['Product Name'].append(get_name(new_soup))
            d['Product Price'].append(get_price(new_soup))
            d['Rating'].append(get_rating(new_soup))
            d['Number of reviews'].append(get_review_count(new_soup))
    
    
    amazon_df = pd.DataFrame.from_dict(d)
    

https://www.amazon.in/s?k=bags&crid=2M096C61O4MLT&qid=1653308124&sprefix=ba%2Caps%2C283&ref=sr_pg_1
https://www.amazon.in/s?k=bags&page=2&crid=2M096C61O4MLT&qid=1685679414&sprefix=ba%2Caps%2C283&ref=sr_pg_1
https://www.amazon.in/s?k=bags&page=3&crid=2M096C61O4MLT&qid=1685679416&sprefix=ba%2Caps%2C283&ref=sr_pg_2
https://www.amazon.in/s?k=bags&page=4&crid=2M096C61O4MLT&qid=1685679417&sprefix=ba%2Caps%2C283&ref=sr_pg_3
https://www.amazon.in/s?k=bags&page=5&crid=2M096C61O4MLT&qid=1685679420&sprefix=ba%2Caps%2C283&ref=sr_pg_4
https://www.amazon.in/s?k=bags&page=6&crid=2M096C61O4MLT&qid=1685679421&sprefix=ba%2Caps%2C283&ref=sr_pg_5
https://www.amazon.in/s?k=bags&page=7&crid=2M096C61O4MLT&qid=1685679423&sprefix=ba%2Caps%2C283&ref=sr_pg_6
https://www.amazon.in/s?k=bags&page=8&crid=2M096C61O4MLT&qid=1685679425&sprefix=ba%2Caps%2C283&ref=sr_pg_7
https://www.amazon.in/s?k=bags&page=9&crid=2M096C61O4MLT&qid=1685679426&sprefix=ba%2Caps%2C283&ref=sr_pg_8
https://www.amazon.in/s?k=bags&page=10&crid=

### Checking the Extracted Data

In [11]:
amazon_df

Unnamed: 0,Product URL,ASIN,Manufacturer,Product Name,Product Description,Product Price,Rating,Number of reviews
0,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,B0C3MHG797,,"Swiss Military Hard Shell Travel Backpack UFO,...",Care Instructions: Wipe with Dry Cloth Uniq...,"₹4,199",2.0 out of 5 stars,1 rating
1,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,B0B12VF374,Red Lemon,Red Lemon BANGE Multifunctional Waterproof Ant...,➧ PREMIUM MATERIAL: This multifunctional sling...,"₹1,899.00",4.3 out of 5 stars,151 ratings
2,https://www.amazon.in/Wesley-Milestone-Waterpr...,,Wesley,Wesley Milestone 2.0 Casual Waterproof Laptop ...,30L Capacity: The Backpack has a padded laptop...,₹598.00,4.3 out of 5 stars,"11,025 ratings"
3,https://www.amazon.in/American-Tourister-AMT-S...,,American Tourister,American Tourister 32 Ltrs Black Casual Backpa...,"Laptop Compatibility: No, Strap Type: Adjustab...","₹1,049.00",4.0 out of 5 stars,"54,355 ratings"
4,https://www.amazon.in/Half-Moon-Resistant-Back...,,Half Moon,Half Moon Large 37L Laptop Bag Backpack for me...,This lightweight and spacious laptop bag with ...,₹749,3.9 out of 5 stars,"2,804 ratings"
...,...,...,...,...,...,...,...,...
456,https://www.amazon.in/bg1905-Capacity-Laptop-T...,,,TSPARK Red Lemon bg1905 15.6 inch 35 LTR Capac...,Compartment: Outside 4 compartments including ...,"₹2,949",4.3 out of 5 stars,84 ratings
457,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,B0C1CQWVR9,Impulse,Impulse Seacon 40L Laptop Backpack/Office Bag/...,Care Instructions: Wipe with Dry Cloth SPAC...,"₹1,856",3.8 out of 5 stars,4 ratings
458,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,B085MHDJ93,Half Moon,Half Moon 35L Water Resistant 15.6 inch Laptop...,SPACIOUS AND LIGHTWEIGHT: Light weight & spaci...,₹529,3.9 out of 5 stars,"15,990 ratings"
459,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,B0C1CSS81H,Impulse,Impulse Impbee 45L Laptop Backpack/Office Bag/...,Care Instructions: Wipe with Dry Cloth SPAC...,₹936,Previous page,


### Exporting the Data to csv File

In [12]:
amazon_df.to_csv("amazon_data.csv", header=True, index=False)