In [49]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

# URL for the webpage to scrape
url = 'https://www.audible.in/cat/Arts-Entertainment/Art-Audiobooks/21881934031?ref_pageloadid=not_applicable&pf_rd_p=e32be111-8c00-4812-b46d-c1d31ffed468&pf_rd_r=0C24CVH3TFGB9NR0WFR7&plink=0ncUe3X3JRT4Q9dJ&pageLoadId=ANfr5CjGzdAH73GH&creativeId=9f32d386-8d17-4de5-8a99-a22191fa0921&ref=a_categories_c0_subCat_1'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Find all book titles
span_element = soup.find_all('span', class_="bc-text oneLine bc-spacing-top-s1 bc-size-title2 bc-color-base bc-text-bold bc-text-ellipses")

# Extract the book titles into a list
book_titles = [str(i.text).strip() for i in span_element]

# Find all links to book detail pages
links = soup.find_all('a', href=re.compile(r'^/pd/'))

# Extract the relative URLs (without the base URL)
url2_1 = set()
for link in links:
    if 'href' in link.attrs:
        url2 = link['href']
        url2_1.add(url2)

# Create an empty list to store extracted data
data = []

# Loop over the book titles and URLs
for book_title, url_suffix in zip(book_titles, url2_1):
    urll = 'https://www.audible.in' + url_suffix
    response2 = requests.get(urll)
    soup2 = BeautifulSoup(response2.content, 'html.parser')

    # Extract the rating of the book
    rating = soup2.find('span', class_="bc-text bc-pub-offscreen")
    rating = rating.text.strip() if rating else 'N/A'

    # Extract author, narrator, and length information
    written_by = None
    narrated_by = None
    length = None

    a = soup2.find_all('div', class_='slot centerSlot')
    for div in a:
        li_elements = div.find_all('li', class_='bc-list-item')
        for li in li_elements:
            li_elements2 = li.text.strip()
            text = li_elements2.split('\n')

            # Process each line to find relevant details
            for line in text:
                if 'Written by' in line:
                    written_by = line.replace('Written by:', '').strip()
                if 'Narrated by' in line:
                    narrated_by = line.replace('Narrated by:', '').strip()
                if 'Length:' in line:
                    length = line.replace('Length:', '').strip()

    # Store the extracted data in a dictionary and append it to the list
    data.append({
        'Book Title': book_title,
        'URL': urll,
        'Rating': rating,
        'Written by': written_by,
        'Narrated by': narrated_by,
        'Length': length
    })

# Convert the list of dictionaries into a pandas DataFrame
df = pd.DataFrame(data)


In [50]:
df = df.drop_duplicates(subset=['Book Title'])

In [51]:
df.fillna('N/A',inplace=True)

In [52]:
df['Rating'] = df['Rating'].str.replace(' out of 5 stars', '', regex=False)

In [53]:
# Assuming you have a 'Full Text' column in your DataFrame
df['Book Title'] = df.apply(lambda row: row['Full Text'] if '...' in row['Book Title'] else row['Book Title'], axis=1)

In [54]:

# Function to convert time to total minutes
def convert_to_minutes(time_str):
    hours = 0
    minutes = 0
    # Extract hours and minutes from the time string
    match = re.match(r'(\d+)\s*hrs?\s*and\s*(\d+)\s*mins?', time_str)
    if match:
        hours = int(match.group(1))
        minutes = int(match.group(2))
    # Return total minutes
    return hours * 60 + minutes


In [55]:
df['Length']= df['Length'].apply(convert_to_minutes)

In [60]:
df

Unnamed: 0,Book Title,URL,Rating,Written by,Narrated by,Length
0,The Steal Like an Artist Audio Trilogy,https://www.audible.in/pd/Cultura-y-Arte-en-Ex...,,,,0
1,The Red Book,https://www.audible.in/pd/La-republica-romana-...,,,,0
2,Malcolm Gladwell with Robert Krulwich at the 9...,https://www.audible.in/pd/The-World-According-...,,,,0
3,7 Secrets of Shiva,https://www.audible.in/pd/Escultura-romanica-C...,,,,0
4,Monuments Men,https://www.audible.in/pd/Culture-Audiobook/B0...,5.0,,,0
5,The Renaissance,https://www.audible.in/pd/Your-Brain-on-Art-Au...,,,,0
12,All the Beauty in the World,https://www.audible.in/pd/All-the-Beauty-in-th...,5.0,,,0
13,Playing to the Gallery,https://www.audible.in/pd/Wabi-Sabi-for-Artist...,5.0,,,0
14,500 Quotes from Peacemakers,https://www.audible.in/pd/500-Quotes-from-Peac...,,,,0
15,The Godfather Notebook,https://www.audible.in/pd/B0DP3K1PT2,,,,0
