In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt

In [None]:
# Root of the NYT best-seller list pages; a list page lives at
# <root>/<YYYY/MM/DD>/<list-slug>/
BESTSELLER_BASE_URL = 'https://www.nytimes.com/books/best-sellers'

# Publication date of the lists to scrape, in the URL's YYYY/MM/DD form.
LIST_DATE = '2024/09/15'

# One slug per list. Each slug must appear exactly once: a repeated slug
# scrapes the same list twice and duplicates its rows downstream.
LIST_SLUGS = [
    'combined-print-and-e-book-nonfiction',
    'combined-print-and-e-book-fiction',
    'hardcover-nonfiction',
    'hardcover-fiction',
    'trade-fiction-paperback',
    'paperback-nonfiction',
    'advice-how-to-and-miscellaneous',
    'childrens-middle-grade-hardcover',
    'picture-books',
    'series-books',
    'young-adult-hardcover',
]

urls = [f'{BESTSELLER_BASE_URL}/{LIST_DATE}/{slug}/' for slug in LIST_SLUGS]

In [None]:
def _text_or_none(parent, tag_name, **attrs):
    """Return the stripped text of the first matching descendant, or None if there is none."""
    node = parent.find(tag_name, **attrs)
    return node.text.strip() if node is not None else None


def _strip_by_prefix(author):
    """Remove one leading 'by ' from an author line, leaving the rest of the text untouched."""
    # A blanket str.replace('by ', '') would also eat "by " inside names
    # (e.g. "Abby Jimenez" -> "AbJimenez"), so only the prefix is removed.
    if author is not None and author.startswith('by '):
        return author[len('by '):]
    return author


def scrape_data(url, timeout=30):
    """Scrape one best-seller list page into a list of per-book dictionaries.

    Parameters
    ----------
    url : str
        Address of the list page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    list of dict
        One dict per ``<li class="css-1m0jikr">`` that contains an
        ``<article itemprop="itemListElement">``. Keys: 'Weekwise History',
        'Name', 'Author', 'Publisher', 'Description', 'Image Link', 'ISBN_1',
        'ISBN_2', 'Position', 'URL', 'Input URL'. A field whose element is
        missing from the page is None.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection failures or when ``timeout`` elapses.
    """
    # Fetch the page; fail loudly on an error status instead of silently
    # parsing an error page into zero rows.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    books_data = []

    for li in soup.find_all('li', class_='css-1m0jikr'):
        article = li.find('article', itemprop='itemListElement')
        if article is None:
            continue

        image = article.find('img', itemprop='image')

        # <meta> children carry the machine-readable fields.
        meta_tags = article.find_all('meta')
        isbn_tags = [meta.get('content') for meta in meta_tags if meta.get('itemprop') == 'isbn']
        position = next((meta.get('content') for meta in meta_tags if meta.get('itemprop') == 'position'), None)
        url_meta = next((meta.get('content') for meta in meta_tags if meta.get('itemprop') == 'url'), None)

        books_data.append({
            'Weekwise History': _text_or_none(article, 'p', class_='css-1o26r9v'),
            'Name': _text_or_none(article, 'h3', itemprop='name'),
            'Author': _strip_by_prefix(_text_or_none(article, 'p', itemprop='author')),
            'Publisher': _text_or_none(article, 'p', itemprop='publisher'),
            'Description': _text_or_none(article, 'p', itemprop='description'),
            # .get() so an <img> without a src yields None rather than KeyError.
            'Image Link': image.get('src') if image is not None else None,
            # Only the first two ISBNs are kept; any further ones are ignored.
            'ISBN_1': isbn_tags[0] if len(isbn_tags) > 0 else None,
            'ISBN_2': isbn_tags[1] if len(isbn_tags) > 1 else None,
            'Position': position,
            'URL': url_meta,
            'Input URL': url
        })

    return books_data

In [None]:
# Scrape every list page in turn, pooling the per-book rows into one flat list
all_books_data = []
for list_url in urls:
    all_books_data += scrape_data(list_url)


In [None]:
# Convert list of dictionaries to pandas DataFrame
df = pd.DataFrame(all_books_data)
df['date'] = dt.date.today().strftime("%Y-%m-%d")


In [None]:
# Bare expression so the notebook renders the frame with its rich (HTML) display
# instead of a wrapped plain-text dump
df


          Weekwise History                      Name  \
0     23 weeks on the list    THE ANXIOUS GENERATION   
1            New this week     AT WAR WITH OURSELVES   
2     96 weeks on the list           HILLBILLY ELEGY   
3    209 weeks on the list  THE BODY KEEPS THE SCORE   
4      3 weeks on the list               WHAT'S NEXT   
..                     ...                       ...   
135   42 weeks on the list                   MURTAGH   
136    5 weeks on the list       SUCH CHARMING LIARS   
137    8 weeks on the list    THE DARKNESS WITHIN US   
138   35 weeks on the list             RUTHLESS VOWS   
139   12 weeks on the list    THE SHADOWS BETWEEN US   

                                    Author         Publisher  \
0                           Jonathan Haidt     Penguin Press   
1                            H.R. McMaster            Harper   
2                               J.D. Vance            Harper   
3                      Bessel van der Kolk           Penguin   
4    Me

In [None]:
# Row and column counts of the combined DataFrame
df.shape

(140, 10)

In [None]:
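# Export the DataFrame to a CSV file (intentionally disabled; uncomment the
# next line to write books_data.csv to the working directory)
# df.to_csv('books_data.csv', index=False)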


In [None]:
# Show the first five rows of the combined DataFrame
df.head()

Unnamed: 0,Weekwise History,Name,Author,Publisher,Description,Image Link,ISBN_1,ISBN_2,Position,URL,Input URL,date
0,23 weeks on the list,THE ANXIOUS GENERATION,Jonathan Haidt,Penguin Press,A co-author of “The Coddling of the American M...,https://storage.googleapis.com/du-prd/books/im...,593655036,9780593655030,1,https://www.nytimes.com/books/best-sellers/202...,https://www.nytimes.com/books/best-sellers/202...,2024-09-13
1,New this week,AT WAR WITH OURSELVES,H.R. McMaster,Harper,The author of “Battlegrounds” and former natio...,https://storage.googleapis.com/du-prd/books/im...,62899503,9780062899507,2,https://www.nytimes.com/books/best-sellers/202...,https://www.nytimes.com/books/best-sellers/202...,2024-09-13
2,96 weeks on the list,HILLBILLY ELEGY,J.D. Vance,Harper,The Yale Law School graduate and 2024 Republic...,https://storage.googleapis.com/du-prd/books/im...,62300547,9780062300546,3,https://www.nytimes.com/books/best-sellers/202...,https://www.nytimes.com/books/best-sellers/202...,2024-09-13
3,209 weeks on the list,THE BODY KEEPS THE SCORE,Bessel van der Kolk,Penguin,"How trauma affects the body and mind, and inno...",https://storage.googleapis.com/du-prd/books/im...,670785938,9780670785933,4,https://www.nytimes.com/books/best-sellers/202...,https://www.nytimes.com/books/best-sellers/202...,2024-09-13
4,3 weeks on the list,WHAT'S NEXT,Melissa Fitzgerald and Mary McCormack,Dutton,Two cast members of “The West Wing” share insi...,https://storage.googleapis.com/du-prd/books/im...,593184548,9780593184547,5,https://www.nytimes.com/books/best-sellers/202...,https://www.nytimes.com/books/best-sellers/202...,2024-09-13
