# Web Scraping using Beautiful Soup

### Importing Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re

### Part 1 Functions

In [2]:
# Function to extract Product Name
def get_name(soup):
    """Return the product title found in a parsed product page.

    Looks up the ``<span id="productTitle">`` element and returns its text
    with surrounding whitespace removed. An empty string is returned when the
    lookup raises ``AttributeError`` (for example when ``find`` yields
    ``None`` because the element is missing).
    """
    try:
        title_tag = soup.find("span", attrs={"id": "productTitle"})
        return title_tag.text.strip()
    except AttributeError:
        # Element not present on this page
        return ""

# Function to extract Product Price
def get_price(soup):
    """Return the product price text found in a parsed product page.

    Reads the first ``<span class="a-offscreen">`` element and returns its
    text with surrounding whitespace removed, matching what the other
    extractors in this file do. An empty string is returned when the lookup
    raises ``AttributeError`` (for example when ``find`` yields ``None``
    because the element is missing).
    """
    try:
        price = soup.find("span", attrs={"class": "a-offscreen"}).text.strip()
    except AttributeError:
        # Element not present on this page
        price = ""

    return price

# Function to extract Product Rating
def get_rating(soup):
    """Return the product rating text found in a parsed product page.

    Two selectors are tried in order: the ``<i>`` star icon carrying the
    classes ``a-icon a-icon-star a-star-4-5``, then the ``<span>`` with class
    ``a-icon-alt``. The first one whose ``.string`` can be read and stripped
    wins. An empty string is returned when both lookups raise
    ``AttributeError`` (element missing, or ``.string`` is ``None``).

    Only ``AttributeError`` is handled, consistent with the other extractors
    in this file, so unrelated failures are no longer silently swallowed.
    """
    selectors = (
        ("i", {'class': 'a-icon a-icon-star a-star-4-5'}),
        ("span", {'class': 'a-icon-alt'}),
    )

    for tag_name, tag_attrs in selectors:
        try:
            return soup.find(tag_name, attrs=tag_attrs).string.strip()
        except AttributeError:
            # Selector did not match (or had no single string); try the next one
            continue

    return ""

# Function to extract Number of User Reviews
def get_review_count(soup):
    """Return the review-count text found in a parsed product page.

    Reads ``.string`` of the ``<span id="acrCustomerReviewText">`` element
    and strips surrounding whitespace. An empty string is returned when the
    lookup raises ``AttributeError`` (element missing, or ``.string`` is
    ``None``).
    """
    try:
        count_tag = soup.find("span", attrs={'id': 'acrCustomerReviewText'})
        return count_tag.string.strip()
    except AttributeError:
        # Element not present on this page
        return ""


### Part 2 Functions

In [3]:
# Function to extract Product Description
def get_product_description(soup):
    """Return the product description found in a parsed product page.

    Takes the text of the ``<div id="feature-bullets">`` element with
    surrounding whitespace removed. An empty string is returned when the
    lookup raises ``AttributeError`` (for example when ``find`` yields
    ``None`` because the element is missing).
    """
    try:
        bullets_div = soup.find("div", attrs={"id": "feature-bullets"})
        return bullets_div.text.strip()
    except AttributeError:
        # Element not present on this page
        return ""

# Function to extract Manufacturer
def get_manuf(soup):
    """Return the manufacturer/brand text found in a parsed product page.

    Takes the text of the ``<div>`` carrying the classes
    ``a-section a-spacing-medium brand-snapshot-flex-row`` with surrounding
    whitespace removed. An empty string is returned when the lookup raises
    ``AttributeError`` (for example when ``find`` yields ``None`` because the
    element is missing).
    """
    try:
        brand_div = soup.find(
            "div",
            attrs={"class": "a-section a-spacing-medium brand-snapshot-flex-row"},
        )
        return brand_div.text.strip()
    except AttributeError:
        # Element not present on this page
        return ""



### Main Function

In [None]:
# Patterns tried, in order, to locate the ASIN inside a product link.
_ASIN_PATTERNS = (
    # Product path with plain or URL-encoded separators:
    # /dp/<ASIN>/... or %2Fdp%2F<ASIN>%2F...
    re.compile(r'(?:/|%2F)dp(?:/|%2F)([^/%?&#]+)', re.IGNORECASE),
    # Fallback: any URL-encoded dp/gp segment, up to the next escape
    re.compile(r'[dg]p%2F([^%]+)', re.IGNORECASE),
)


# Function to extract the ASIN from a product link
def get_asin(link):
    """Return the ASIN embedded in a product link, or "" when none is found.

    Handles both plain links (``/name/dp/<ASIN>/ref=...``) and redirect links
    whose target is URL-encoded (``...url=%2Fname%2Fdp%2F<ASIN>%2Fref...``).
    The captured value runs up to the next path/query separator, so ASINs
    containing the characters ``2`` or ``F`` are returned in full.
    """
    for pattern in _ASIN_PATTERNS:
        match = pattern.search(link)
        if match:
            return match.group(1)
    return ""


if __name__ == '__main__':

    # add your user agent
    HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'})

    # Site root that relative links are appended to
    BASE_URL = "https://www.amazon.in"

    # Seconds to wait for a response, so a stalled connection cannot hang the run
    REQUEST_TIMEOUT = 30

    # The webpage URL
    URL = "https://www.amazon.in/s?k=bags&crid=2M096C61O4MLT&qid=1653308124&sprefix=ba%2Caps%2C283&ref=sr_pg_1"

    # Creating URL_List of up to 20 search-result page URLs by following the
    # "next" pagination link from each page
    URL_List = [URL]
    for _ in range(19):
        Pre_URL = URL_List[-1]
        try:
            webpage = requests.get(Pre_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as err:
            print("Stopping pagination, request failed:", err)
            break
        print(webpage)
        soup = BeautifulSoup(webpage.content, "html.parser")
        next_link = soup.find("a", attrs={'class': "s-pagination-item s-pagination-next s-pagination-button s-pagination-separator"})
        next_href = next_link.get('href') if next_link is not None else None
        if not next_href:
            # No "next" link (last page or an unexpected response): stop here
            # and scrape the pages collected so far
            break
        URL_List.append(BASE_URL + next_href)

    # Creating Dictionary
    d = {"Product URL":[],"ASIN":[],"Manufacturer":[], "Product Name":[],"Product Description":[], "Product Price":[], "Rating":[],"Number of reviews":[]}

    # For each url
    for urlx in URL_List:
        print(urlx)

        # HTTP Request
        try:
            webpage = requests.get(urlx, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as err:
            print("Skipping page, request failed:", err)
            continue

        # Soup Object containing all data
        soup = BeautifulSoup(webpage.content, "html.parser")

        # Fetch links as List of Tag Objects
        links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

        # Store the links, dropping anchors that carry no href
        links_list = [href for href in (tag.get('href') for tag in links) if href]

        # Loop for extracting product details from each link
        for link in links_list:
            product_url = BASE_URL + link
            try:
                new_webpage = requests.get(product_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            except requests.exceptions.RequestException:
                # Skip products whose page cannot be fetched; nothing has been
                # appended yet, so all columns stay the same length
                continue

            new_soup = BeautifulSoup(new_webpage.content, "html.parser")

            # Finding ASIN from the link
            d["ASIN"].append(get_asin(link))

            # Finding Product URL
            d["Product URL"].append(product_url)

            # Function calls to collect all necessary product information
            d["Manufacturer"].append(get_manuf(new_soup))
            d["Product Description"].append(get_product_description(new_soup))
            d['Product Name'].append(get_name(new_soup))
            d['Product Price'].append(get_price(new_soup))
            d['Rating'].append(get_rating(new_soup))
            d['Number of reviews'].append(get_review_count(new_soup))

    amazon_df = pd.DataFrame.from_dict(d)
    

### Checking the Extracted Data

In [None]:
# Bare expression: shows the scraped DataFrame as the cell output when run in a notebook
amazon_df

### Exporting the Data to csv File

In [None]:
# Write the scraped rows to amazon_data.csv with a header row and without the index column
amazon_df.to_csv(
    path_or_buf="amazon_data.csv",
    index=False,
    header=True,
)