In [8]:
# FUNCTIONS

def get_book_detail(driver, url):
    """Load a single book page and return its details as one flat dict.

    Parameters
    ----------
    driver : Selenium WebDriver
        Browser session used to fetch ``url``.
    url : str
        Address of the book detail page.

    Returns
    -------
    dict
        Keys ``'book_name'``, ``'book_price'``, ``'book_star_count'`` and
        ``'book_desc'``, followed by one entry per row of the first
        ``<table>`` on the page (``<th>`` text -> ``<td>`` text).
        ``'book_desc'`` is ``None`` when the page has no
        ``product_description`` block (or nothing follows it).

    Raises
    ------
    IndexError
        If the loaded page contains no ``<div class="content">``.
    """
    # Function-scope imports: this cell is defined before the notebook's
    # import cell, so keeping them here makes the function self-contained.
    import re
    from bs4 import BeautifulSoup

    driver.get(url)
    time.sleep(SLEEP_TIME)
    content_div = driver.find_elements(By.XPATH, "//div[@class='content']")
    if not content_div:
        # Same exception type as the bare ``content_div[0]`` would raise,
        # but with the offending URL in the message.
        raise IndexError(f"No <div class='content'> found at {url}")

    inner_html = content_div[0].get_attribute("innerHTML")
    soup = BeautifulSoup(inner_html, "html.parser")

    book_name = soup.find("h1").text
    book_price = soup.find("p", attrs={"class": "price_color"}).text

    # The rating word is taken from the last CSS class of the star-rating <p>.
    star_elem = soup.find("p", attrs={"class": re.compile('^star-rating ')})
    book_star_count = star_elem["class"][-1]

    # ``find`` returns None when the description header is absent; guard it
    # so one description-less book does not abort the whole scrape.
    desc_header = soup.find("div", attrs={"id": "product_description"})
    desc_elem = desc_header.find_next_sibling() if desc_header is not None else None
    book_desc = desc_elem.text if desc_elem is not None else None

    product_info = {}
    table = soup.find("table")
    table_rows = table.find_all("tr") if table is not None else []
    for row in table_rows:
        header_cell = row.find("th")
        value_cell = row.find("td")
        if header_cell is None or value_cell is None:
            # Skip rows that are not a header/value pair.
            continue
        product_info[header_cell.text] = value_cell.text

    return {
        'book_name': book_name,
        'book_price': book_price,
        'book_star_count': book_star_count,
        'book_desc': book_desc,
        **product_info,
    }


def get_book_urls(driver, url, max_pagination=10):
    """Collect the book detail links from every page of one category.

    Page 1 is ``url`` itself. Page *i* (i >= 2) is derived by swapping the
    last ``"index"`` in ``url`` for ``"page-<i>"``. Pagination stops at the
    first page that yields no matching links, or after ``max_pagination``
    pages, whichever comes first.

    Parameters
    ----------
    driver : Selenium WebDriver
        Browser session used to load each page.
    url : str
        Address of the category's first page.
    max_pagination : int, optional
        Maximum number of pages to visit (inclusive). Defaults to 10.

    Returns
    -------
    list
        ``href`` attribute of every ``<a>`` found under a
        ``<div class="image_container">``, in page order.
    """
    book_elements_xpath = "//div[@class='image_container']//a"

    # Split on the LAST "index" only, so an "index" substring elsewhere in
    # the URL (e.g. in a category slug) is left untouched.
    prefix, placeholder, suffix = url.rpartition("index")

    book_urls = []
    # +1 so that ``max_pagination`` pages are really visited (range is
    # exclusive at the upper end).
    for page_no in range(1, max_pagination + 1):
        if page_no == 1:
            page_url = url
        elif placeholder:
            page_url = f"{prefix}page-{page_no}{suffix}"
        else:
            # No "index" to substitute: every further request would hit the
            # same URL again and only duplicate the links already collected.
            break

        driver.get(page_url)
        time.sleep(SLEEP_TIME)
        book_elements = driver.find_elements(By.XPATH, book_elements_xpath)
        if not book_elements:
            break
        book_urls.extend(element.get_attribute("href") for element in book_elements)

    return book_urls


def get_travel_and_nonfiction_category_urls(driver, url):
    """Return the hrefs of links whose text mentions Travel or Nonfiction.

    Loads ``url`` in ``driver``, pauses for ``SLEEP_TIME`` seconds, then
    gathers the ``href`` attribute of every ``<a>`` matched by the XPath
    below, in the order the driver reports them.
    """
    driver.get(url)
    time.sleep(SLEEP_TIME)

    xpath = "//a[contains(text(),'Travel') or contains(text(),'Nonfiction')]"
    hrefs = []
    for anchor in driver.find_elements(By.XPATH, xpath):
        hrefs.append(anchor.get_attribute("href"))
    return hrefs


def initialize_driver(executable_path='/home/train/web_scraping/Selenium/geckodriver'):
    """Create and return a Firefox WebDriver.

    Parameters
    ----------
    executable_path : str, optional
        Location of the geckodriver binary. Defaults to the path this
        notebook was originally written against, so existing zero-argument
        calls behave as before; pass another path to run elsewhere.

    Returns
    -------
    The object returned by ``webdriver.Firefox(...)``.
    """
    options = webdriver.FirefoxOptions()
    options.add_argument("--start-maximized")
    # NOTE(review): newer Selenium 4 releases replace the ``executable_path``
    # keyword with ``service=Service(path)``. It is kept here because the
    # installed Selenium version is not visible from this notebook; confirm
    # before upgrading.
    return webdriver.Firefox(executable_path=executable_path, options=options)


In [10]:
# PROCESS

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

SLEEP_TIME = 1  # Seconds passed to time.sleep() after each driver.get() in the scraping functions

def main():
    """Scrape every book of the selected categories into a DataFrame.

    Starts a browser with ``initialize_driver()``, resolves the category
    URLs from the site's home page, walks each category's book links and
    collects one record per book. Each record is the dict returned by
    ``get_book_detail`` plus a ``'cat_url'`` key naming the category page
    it was reached from.

    Returns
    -------
    pandas.DataFrame
        One row per scraped book.
    """
    import pandas as pd

    BASE_URL = "https://books.toscrape.com/"
    driver = initialize_driver()
    data = []
    try:
        category_urls = get_travel_and_nonfiction_category_urls(driver, BASE_URL)
        for cat_url in category_urls:
            book_urls = get_book_urls(driver, cat_url)
            for book_url in book_urls:
                book_data = get_book_detail(driver, book_url)
                book_data["cat_url"] = cat_url
                data.append(book_data)
    finally:
        # Always shut the browser down, even when a page fails to parse;
        # otherwise every run leaves a browser/driver process behind.
        driver.quit()

    return pd.DataFrame(data)


# Run the whole scrape. main() starts a Firefox session and loads every
# category and book page, sleeping SLEEP_TIME seconds after each load.
df = main()
# Sanity check: first record and the (rows, columns) shape of the result.
print(df.head(1))
print(df.shape)


# Define the file path to save the DataFrame as an Excel file
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
excel_file_path = "/home/train/Desktop/books_details_new.xlsx"
df.to_excel(excel_file_path, index=False)  # Save the DataFrame to an Excel file without row indices
print(f"\nBook details saved to '{excel_file_path}'.")  # Print confirmation message with file path


                 book_name book_price book_star_count                                book_desc               UPC Product Type Price (excl. tax) Price (incl. tax)    Tax             Availability Number of reviews                                  cat_url
0  It's Only the Himalayas     £45.17             Two  “Wherever you go, whatever you do, j...  a22124811bfa8350        Books            £45.17            £45.17  £0.00  In stock (19 available)                 0  https://books.toscrape.com/catalogue...
(121, 12)

Book details saved to '/home/train/Desktop/books_details_new.xlsx'.
