In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def fetch_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching data: {e}")
        return None

def parse_data(html):
    soup = BeautifulSoup(html, 'lxml')
    books = soup.find_all('article', class_='product_pod')

    titles = []
    prices = []
    ratings = []

    for book in books:
        title = book.h3.a['title']
        titles.append(title)

        price = book.find('p', class_='price_color').text
        prices.append(price)

        rating = book.p['class'][1]
        ratings.append(rating)

    return pd.DataFrame({'Title': titles, 'Price': prices, 'Rating': ratings})

url = "http://books.toscrape.com/"
html = fetch_data(url)

if html:
    df = parse_data(html)
    df.to_csv('books_data.csv', index=False, encoding='utf-8')
    print("Book data extracted and saved to books_data.csv")
    print(df.head())
    print(df.info())
    print(df.isnull().sum())
else:
    print("Failed to retrieve or parse data.")

from google.colab import files
files.download('books_data.csv')


Book data extracted and saved to books_data.csv
                                   Title    Price Rating
0                   A Light in the Attic  Â£51.77  Three
1                     Tipping the Velvet  Â£53.74    One
2                             Soumission  Â£50.10    One
3                          Sharp Objects  Â£47.82   Four
4  Sapiens: A Brief History of Humankind  Â£54.23   Five
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   20 non-null     object
 1   Price   20 non-null     object
 2   Rating  20 non-null     object
dtypes: object(3)
memory usage: 608.0+ bytes
None
Title     0
Price     0
Rating    0
dtype: int64


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>