# Importing Libraries
+ BeautifulSoap : Powerfull Python library used for web scarpping from HTML and XML
+ requests : Python library use to send and receive request from server

In [31]:
from bs4 import BeautifulSoup 
import requests

In [32]:
base_url = 'http://books.toscrape.com/catalogue/category/books_1/page-{}.html'
all_books = []

# Define function - Web crawler 

In [33]:
def web_crawler_site(url) : 
    # Setting up header
    # header contains HTTP header which stimulates a requestcoming from web browser, it is use to prevent a site being blocked
    headers = {
       "user-agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
         }
    # Sending HTTP Request 
    # This line sends an HTTP GET request to the specified url using the requests library, including the custom headers defined earlier. 
    # The response from the server is stored in the variable response. 
    response = requests.get(url,headers = headers)

    # Checking response status_code 
    # if status_code != 200 retrun empty array = failed 
    if response.status_code != 200 : 
        print(f"Failed to retrive information | Status code :  {response.status_code}")
        return []

    # Parsing HTML Content
    # Define parsing - 
    # In web scraping, parsing means taking the raw HTML code of a webpage and breaking it into different parts (like paragraphs, titles, or links) 
    # so we can extract specific information.

    # Initialization of beautiful soap and setting praser to html.praser (default)
    soup = BeautifulSoup(response.text,'html.parser')

    # Finding book element : 
    # Finding book_listing using beatifulSoap's find_all 
    books = soup.find_all('article', class_= 'product_pod')


    # Extacting Book information 
    # Empty array to store extratced information 
    book_info = []

    # Traversing through books 
    for book in books : 
        # Extracting book title from <a><h3> title </h3></a>
        title = book.h3.a['title']
        # similar way extract links
        links = book.h3.a['href']
        # Extracting price from <p> in class price_color in text format
        price = book.find('p', class_= 'price_color').text
        author = 'Anonymous'
        
        # Appending all extracted data into book_info 
        book_info.append({
            'title' : title,
            'links' : links,
            'price' : price,
            'author': author, 
        })
    # finally returing extracted data. 
    return book_info    

# Adjust pages 

In [34]:
import time
# looping through pages
for i in range(1, 6): 
    # The format(i) method replaces {} in the base_url with the current value of i.
    # Example : i = 1, base_url becomes : 'http://books.toscrape.com/catalogue/category/books_1/page-1.html'
    # Example : i = 2, base_url becomes : 'http://books.toscrape.com/catalogue/category/books_1/page-2.html'
    url = base_url.format(i)
    # Called function
    books = web_crawler_site(url)
    # Aggregating the result
    all_books.extend(books)
    # Pausing Between Requests: The time.sleep(1) function call pauses the execution of the loop for 1 second before proceeding to the next iteration. 
    # 1. To prevent rate limit 
    # 2. To respect server load 
    time.sleep(1)  

# Converting into CSV format 

In [35]:
import csv
with open('books.csv','w',newline='',encoding='utf-8') as csvfile : 
    # Defining feature of csv file
    fieldnames = ['title','price','author','links']

    
    # Creating csv writer 
    # This initialize the Dictwriter obj of csv, which write dictionary to csv files 
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    
    # writing header Row 
    # Ensure 1st row of csv file contain all columns items. 
    writer.writeheader()

    # Traversing through all_books 
    for book in all_books : 
        # The writer.writerow(book) method writes each dictionary as a row in the CSV file.
        writer.writerow(book)
print("Scarping completed successfully 🫠, data is saved in books.csv")

Scarping completed successfully 🫠, data is saved in books.csv


# Displaying csv file

In [36]:
import pandas as pd

df = pd.read_csv('books.csv')
df.head(10)

Unnamed: 0,title,price,author,links
0,A Light in the Attic,Â£51.77,Anonymous,../../a-light-in-the-attic_1000/index.html
1,Tipping the Velvet,Â£53.74,Anonymous,../../tipping-the-velvet_999/index.html
2,Soumission,Â£50.10,Anonymous,../../soumission_998/index.html
3,Sharp Objects,Â£47.82,Anonymous,../../sharp-objects_997/index.html
4,Sapiens: A Brief History of Humankind,Â£54.23,Anonymous,../../sapiens-a-brief-history-of-humankind_996...
5,The Requiem Red,Â£22.65,Anonymous,../../the-requiem-red_995/index.html
6,The Dirty Little Secrets of Getting Your Dream...,Â£33.34,Anonymous,../../the-dirty-little-secrets-of-getting-your...
7,The Coming Woman: A Novel Based on the Life of...,Â£17.93,Anonymous,../../the-coming-woman-a-novel-based-on-the-li...
8,The Boys in the Boat: Nine Americans and Their...,Â£22.60,Anonymous,../../the-boys-in-the-boat-nine-americans-and-...
9,The Black Maria,Â£52.15,Anonymous,../../the-black-maria_991/index.html
