# Web scraping book data from books.toscrape.com

In [2]:
import requests
from bs4 import BeautifulSoup
import json
import time
# Import required libraries
from lxml import etree
import json
from typing import Dict
import os

In [None]:

# Browser-like User-Agent sent with the scraper's HTTP requests.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

def get_soup(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup tree.

    The request carries the module-level ``headers`` (the same ones
    ``scrape_books`` sends) and a timeout so a stalled connection cannot
    hang the scraper indefinitely.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        BeautifulSoup: The parsed HTML document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()  # Ensure request was successful
    # Parse the raw bytes (as scrape_books does) so BeautifulSoup detects the
    # page encoding itself instead of relying on requests' guess.
    return BeautifulSoup(response.content, 'html.parser')

# Extract category from book detail page
def get_book_category(book_url):
    """Return the category name shown in a book detail page's breadcrumb.

    The breadcrumb is expected to read Home > Books > Category > Book Title,
    so the category is the text of its third link.

    Args:
        book_url: URL of the book detail page.

    Returns:
        The stripped category text, or 'Unknown' when the page has no
        breadcrumb list or the breadcrumb has fewer than three links.
    """
    crumbs = get_soup(book_url).find('ul', class_='breadcrumb')
    if not crumbs:
        return 'Unknown'

    anchors = crumbs.find_all('a')
    # Index 0 is Home, 1 is Books, 2 is the category.
    if len(anchors) < 3:
        return 'Unknown'
    return anchors[2].text.strip()

def _parse_book(book, page_url):
    """Build the record for one ``article.product_pod`` element.

    Args:
        book: BeautifulSoup tag of a single book on a listing page.
        page_url: URL of the listing page the tag came from; the relative
            links inside the tag are resolved against it.

    Returns:
        dict: title, price, rating, availability, image_url, product_url
        and category of the book.
    """
    from urllib.parse import urljoin

    # Links are relative to the listing page ("catalogue/x/index.html" on the
    # front page, "x/index.html" on later pages); urljoin resolves both forms
    # without string surgery on the path.
    product_url = urljoin(page_url, book.select_one('h3 a')['href'])
    return {
        "title": book.h3.a['title'],
        "price": book.select_one('p.price_color').text.strip(),
        # The class list is read as ['star-rating', '<rating word>'].
        "rating": book.select_one('p.star-rating')['class'][1],
        "availability": book.select_one('p.availability').text.strip(),
        "image_url": urljoin(page_url, book.select_one('img')['src']),
        "product_url": product_url,
        "category": get_book_category(product_url),
    }


def scrape_books():
    """Scrape every listing page of books.toscrape.com and save the result.

    Pages are requested one after another until the server answers 404 or a
    page contains no books. Each book's detail page is also fetched (through
    ``get_book_category``) to read its category. The collected records are
    written to ``books_datav2.json`` in the current directory.

    Returns:
        list[dict] | None: The scraped book records, or ``None`` if any
        error occurred (the error is printed, nothing is written).
    """
    base_url = "https://books.toscrape.com/"
    output_file = 'books_datav2.json'
    request_timeout = 30  # seconds; keeps a stalled connection from hanging
    books_data = []
    page = 1

    try:
        while True:
            # The first page lives at the site root, the rest under catalogue/.
            if page == 1:
                url = base_url
            else:
                url = base_url + f'catalogue/page-{page}.html'

            print(f"Scraping page {page}...")
            response = requests.get(url, headers=headers, timeout=request_timeout)

            # A 404 marks the end of the pagination.
            if response.status_code == 404:
                break

            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            books = soup.select('article.product_pod')
            if not books:
                break

            books_data.extend(_parse_book(book, url) for book in books)

            # NOTE: a politeness delay between pages is currently disabled.
            # time.sleep(1)
            page += 1

        # Save to JSON file
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(books_data, f, indent=4, ensure_ascii=False)

        print(f"\nData successfully scraped and saved to {output_file}")
        return books_data

    except Exception as e:
        # Notebook-level boundary: report the failure and signal it with None.
        print(f"An error occurred: {str(e)}")
        return None

# Run the scraper
if __name__ == "__main__":
    books_data = scrape_books()
    # scrape_books returns None on failure and may return an empty list.
    if books_data:
        print(f"\nTotal books scraped: {len(books_data)}")
        print("\nSample Book Data:")
        # ensure_ascii=False keeps characters such as the pound sign readable,
        # matching how the data is written to the JSON file.
        print(json.dumps(books_data[0], indent=2, ensure_ascii=False))

## Converting the JSON file to XML and validation using RelaxNG

In [29]:


def create_book_element(doc: etree.ElementTree, book_data: Dict, inde) -> etree.Element:
    """Append a ``<book>`` element built from one JSON record to ``doc``.

    Every key of ``book_data`` becomes a child element whose tag is the key
    with underscores replaced by hyphens and whose text is ``str(value)``.

    Args:
        doc: Parent node the new ``<book>`` element is attached to.
        book_data: Mapping of field name to value for a single book.
        inde: Position of the book in the source list; not used here.

    Returns:
        The newly created ``<book>`` element.
    """
    book_node = etree.SubElement(doc, "book")

    for field_name in book_data:
        tag = field_name.replace("_", "-")
        etree.SubElement(book_node, tag).text = str(book_data[field_name])

    return book_node

def validate_xml(xml_file: str, schema_file: str) -> bool:
    """
    Check an XML file against a RelaxNG schema file.

    Prints the outcome; on a failed validation each entry of the validator's
    error log is printed with its line number. Any exception (including a
    missing input file) is printed and reported as a failed validation.

    Returns True if valid, False otherwise
    """
    try:
        # Fail early, with a clear message, when either input is missing.
        xml_missing = not os.path.exists(xml_file)
        if xml_missing:
            raise FileNotFoundError(f"XML file not found: {xml_file}")
        schema_missing = not os.path.exists(schema_file)
        if schema_missing:
            raise FileNotFoundError(f"Schema file not found: {schema_file}")

        document = etree.parse(xml_file)
        schema_tree = etree.parse(schema_file)
        validator = etree.RelaxNG(schema_tree)

        outcome = validator.validate(document)

        if not outcome:
            print("❌ XML validation failed!")
            for entry in validator.error_log:
                print(f"Line {entry.line}: {entry.message}")
        else:
            print("✅ XML document is valid against the schema")

        return outcome

    except Exception as exc:
        print(f"Validation error: {str(exc)}")
        return False

def json_to_xml(json_file: str, xml_file: str, schema_file: str = 'books_schema.rng'):
    """Convert JSON book data to XML format and validate it.

    Reads a JSON list of book records, writes them as ``<book>`` children of
    a ``<books>`` root element, then validates the written file against a
    RelaxNG schema.

    Args:
        json_file: Path of the JSON file to read (UTF-8).
        xml_file: Path of the XML file to write (UTF-8, pretty-printed, with
            an XML declaration).
        schema_file: Path of the RelaxNG schema used for validation. Defaults
            to ``'books_schema.rng'``, the path that was previously
            hard-coded.

    Returns:
        bool: True if the XML was written and validated successfully; False
        if validation failed or any error occurred (the error is printed).
    """
    try:
        # Read JSON data
        with open(json_file, 'r', encoding='utf-8') as f:
            books_data = json.load(f)

        # Create root element
        root = etree.Element("books")

        # Add each book to the XML tree
        for i, book_data in enumerate(books_data):
            create_book_element(root, book_data, i)

        # Create XML tree and save to file
        tree = etree.ElementTree(root)
        tree.write(
            xml_file,
            pretty_print=True,
            encoding='utf-8',
            xml_declaration=True
        )

        print(f"Successfully converted JSON to XML. Saved as {xml_file}")

        # Validate the XML
        if validate_xml(xml_file, schema_file):
            print("XML structure is valid")
            return True

        print("XML structure is invalid")
        return False

    except Exception as e:
        # Notebook-level boundary: report the failure and signal it with False.
        print(f"An error occurred: {str(e)}")
        return False

# Run the conversion and validation
if __name__ == "__main__":
    # Convert the scraped JSON to XML, validate it, and report the outcome.
    result = json_to_xml('books_datav2.json', 'books_datav2.xml')
    outcome = 'successful' if result else 'failed'
    print(f"\nConversion and validation {outcome}")

Successfully converted JSON to XML. Saved as books_datav2.xml
✅ XML document is valid against the schema
XML structure is valid

Conversion and validation successful


In [3]:
from BaseXClient import BaseXClient

# Bind both handles up front so the cleanup below can tell what was actually
# opened; if the connection itself fails, neither name would otherwise exist.
session = None
query = None

try:
    # Connect to BaseX server
    # NOTE(review): host, port and admin/admin credentials are hard-coded;
    # move them to configuration before using this outside a local setup.
    session = BaseXClient.Session('localhost', 1984, 'admin', 'admin')
    session.execute("open BookCatalog")

    # Drop old db if exists
    # session.execute("drop db BookCatalog")

    # # Load XML content
    # with open('books_datav2.xml', 'r', encoding='utf-8') as f:
    #     xml_content = f.read()

    # # Create new database
    # session.execute("create db BookCatalog " + xml_content)

    # Query database: titles of all entries priced below 315 pounds.
    # NOTE(review): this selects /books/item, while json_to_xml in this file
    # writes <book> children under <books> - confirm which document the
    # BookCatalog database was built from.
    query = session.query("""
        for $b in /books/item
        where number(substring-after($b/price, '£')) < 315
        return data($b/title/text())
    """)

    # Print results
    for typecode, item in query.iter():
        print("typecode=%d" % typecode)
        print("item=%s" % str(item))

except Exception as e:
    print("Error:", e)

finally:
    # Close whatever was opened, query first, and always attempt to close the
    # session even if closing the query fails.
    try:
        if query is not None:
            query.close()
    finally:
        if session is not None:
            session.close()


typecode=37
item=A Light in the Attic
typecode=37
item=Tipping the Velvet
typecode=37
item=Soumission
typecode=37
item=Sharp Objects
typecode=37
item=Sapiens: A Brief History of Humankind
typecode=37
item=The Requiem Red
typecode=37
item=The Dirty Little Secrets of Getting Your Dream Job
typecode=37
item=The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull
typecode=37
item=The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics
typecode=37
item=The Black Maria
