## Site from which the data is scraped::
    Book Depository bestsellers --> https://www.bookdepository.com/bestsellers
    Coding Help --> https://data36.com/how-to-become-a-data-scientist/

## In this notebook we will scrape the following data about bestseller books::
    1. Title
    2. Book format (paperback, hardback, etc.)
    3. Price
    4. Year of publishing

## Let's scrape multiple web pages in one loop :)

In [None]:
## import relevant libraries
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd

## NOTE::
    1. previous_sibling gives the node immediately before the given element (at the same level of the parse tree).
    2. next_sibling gives the node immediately after the given element (at the same level of the parse tree).
    3. previous_siblings iterates over all the nodes before the given element.
    4. next_siblings iterates over all the nodes after the given element.

In [None]:
## IMPORTANT NOTE::
## 1. .find() returns only the first matching element (or None when nothing matches),
## 2. .find_all() returns a list of elements, which means you can iterate over it.

## Scrape bestseller listing pages 1..34 and collect one dictionary per book
## (keys: "title", "format", "year", "price") in the `bestsellers` list.
## A value that is missing on the page is stored as an empty string "".

page = 1
bestsellers = []
while page < 35:        ## "<" instead of "!=" so the loop still terminates if page ever skips past 35
    basic_url = f"https://www.bookdepository.com/bestsellers?page={page}"
    ## timeout (seconds): without it requests.get() can wait forever on a stalled connection
    response = requests.get(basic_url, timeout=30)
    ## fail loudly on an HTTP error instead of silently parsing an error page that contains no books
    response.raise_for_status()
    html = response.content
    soup = bs(html, 'html.parser')
    for book in soup.find_all('div', class_="book-item"):
        bestseller_book = {}         ## a dictionary to store one book's info in each iteration
        bestseller_book["title"] = book.h3.get_text(strip=True)         ## strip=True removes the surrounding whitespace
        bestseller_book["format"] = book.find('p', class_="format").get_text()

        ## Handling missing 'year': find() returns None when the tag is absent,
        ## so calling .get_text() on it raises AttributeError.
        try:
            ## the year is taken as the last four characters of the "published" text
            bestseller_book["year"] = book.find('p', class_="published").get_text()[-4:]
        except AttributeError:
            bestseller_book["year"] = ""

        ## Handling missing 'price'
        price = book.find('p', class_='price')
        if price is None:
            bestseller_book["price"] = ""
        else:
            original_price = price.find('span', class_='rrp')
            if original_price is not None:
                ## when an "rrp" span is present, the node right before it is read as the current price
                price_text = str(original_price.previous_sibling).strip()
            else:
                price_text = str(price.get_text(strip=True))
            ## drop the thousands separator and the rupee sign before converting
            price_text = price_text.replace(",", "").replace("₹", "")
            try:
                bestseller_book["price"] = float(price_text)
            except ValueError:
                ## the text is not a plain "₹" amount (or there was no node before the span);
                ## treat it like a missing price rather than aborting the whole scrape
                bestseller_book["price"] = ""
        bestsellers.append(bestseller_book)
    page += 1

##column_titles = ["Title of book","Book-Format","Publishing-Year","Price"]
##bestsellers.insert(0,column_titles)

In [None]:
## Build a table from the scraped records: one row per book, one column per dictionary key
dataframe_books = pd.DataFrame(data=bestsellers)
dataframe_books        ## last expression of the cell, so the notebook displays the table

In [None]:
## Write the data frame to an Excel workbook in the current working directory
file_name = "Boooks Depository.xlsx"

## `encoding` is intentionally not passed: DataFrame.to_excel() deprecated that
## argument and pandas 2.0 removed it, so passing it raises TypeError there.
dataframe_books.to_excel(file_name)
print("Success!!")

In [None]:
## Books whose 'year' value is the string "2000"
is_year_2000 = dataframe_books['year'].eq("2000")        ## boolean mask, one entry per row
dataframe_books[is_year_2000]

## Note: two ways to request data from a website are shown below, so don't get confused when either the first or the second one is used :)

In [None]:
## Way 1: fetch the page with the third-party `requests` library
basic_url = "https://www.bookdepository.com/bestsellers"
## timeout (seconds): without it requests.get() can wait forever on a stalled connection
response = requests.get(basic_url, timeout=30)
response        ## displays the Response object; its repr shows the HTTP status code, e.g. <Response [200]>

In [None]:
## Way 2: fetch the page with the standard library's urllib
from urllib.request import urlopen
url = "https://www.bookdepository.com/bestsellers"
## timeout (seconds): without it the call blocks for as long as the server keeps the connection stalled
response = urlopen(url, timeout=30)
response        ## displays the response object; call response.close() once you are done reading from it