In [1]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Build the Chrome options: each flag below is passed to the browser unchanged.
chrome_options = Options()
for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(flag)

# The value returned by ChromeDriverManager().install() is handed to Service,
# which is then used to start the Chrome session.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Open the search page that the scraping loop starts from.
web = 'https://www.audible.in/search'
driver.get(web)

# Print the page title as a quick check that the page was reached.
print(driver.title)

# Lists to store data (one entry per scraped product, kept in lockstep)
book_titles = []
book_authors = []
book_dates = []
book_lengths = []

# Set maximum pages to scrape (change this for more pages)
max_pages = 100
current_page = 1

# XPath shared by the explicit wait and the element lookup below.
PRODUCT_XPATH = '//li[contains(@class, "productListItem")]'


def _field_text(product, xpath, label=""):
    """Return the text of the first element under *product* matching *xpath*.

    Args:
        product: The WebElement for one product list item.
        xpath: XPath, relative to *product*, of the element to read.
        label: Optional label text (e.g. "Length: ") removed from the result.

    Returns:
        The element text with *label* removed, or "N/A" when *product* has no
        matching element.
    """
    try:
        text = product.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        # Only a missing element is treated as "no data"; any other driver
        # error (or Ctrl-C) propagates instead of being silently swallowed.
        return "N/A"
    return text.replace(label, "") if label else text


while current_page <= max_pages:
    print(f"Scraping page {current_page}...")

    # Wait until product list items are loaded. A timeout ends the loop
    # instead of raising, so rows collected so far are still saved.
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, PRODUCT_XPATH))
        )
    except TimeoutException:
        print("No products loaded within 10 seconds. Ending pagination.")
        break

    for product in driver.find_elements(By.XPATH, PRODUCT_XPATH):
        book_titles.append(
            _field_text(product, './/h3[contains(@class, "bc-heading")]')
        )
        # NOTE(review): the authorLabel item is expected to read
        # "By: <name>"; the previous "Narrated by: " prefix belongs to the
        # narrator label and never matched here -- confirm against the page.
        book_authors.append(
            _field_text(product, './/li[contains(@class, "authorLabel")]', "By: ")
        )
        book_dates.append(
            _field_text(
                product,
                './/li[contains(@class, "releaseDateLabel")]',
                "Release date: ",
            )
        )
        book_lengths.append(
            _field_text(product, './/li[contains(@class, "runtimeLabel")]', "Length: ")
        )

    # Pagination: read the next-page link and navigate to it
    try:
        next_button = driver.find_element(
            By.XPATH, '//span[contains(@class, "nextButton")]//a'
        )
    except NoSuchElementException:
        print("Next button not found. Ending pagination.")
        break

    next_page_url = next_button.get_attribute('href')
    if not next_page_url:
        print("No more pages found.")
        break

    driver.get(next_page_url)  # Navigate to the next page
    # Short pause between page requests; readiness of the new page is
    # checked by the explicit wait at the top of the loop.
    time.sleep(2)
    current_page += 1

# Scraping is finished, so shut the browser session down.
driver.quit()

# Map each output column name to the list collected for it.
columns = {
    "title": book_titles,
    "author": book_authors,
    "release_date": book_dates,
    "length": book_lengths,
}
df_books = pd.DataFrame(columns)

# Turn the "N/A" placeholders into proper missing values before export.
df_books = df_books.replace("N/A", pd.NA)

# Write the table to disk without the index column.
df_books.to_csv('audible_books.csv', index=False)
print("Scraping complete. Data saved to audible_books.csv.")


Audiobooks | Audible.in
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scrap