In [25]:
import csv
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Site entry points. Everything is derived from BASE_URL so the host is
# spelled out exactly once; URL is kept as an alias for backward compatibility.
BASE_URL = "https://books.toscrape.com/"
CATALOGUE_URL = BASE_URL + "catalogue/"
URL = BASE_URL

# Parallel accumulators filled by the scraping loop: position i of every list
# describes the same book. They are (re)created here so that re-running the
# cell starts from empty lists instead of appending to a previous run.
titles = []
prices = []
image_urls = []
categories = []
descriptions = []

# Crawl the listing pages and, for every book found, its detail page.
#
# Reads:  BASE_URL, CATALOGUE_URL
# Fills:  titles, prices, image_urls, categories, descriptions (one entry per
#         book, appended together so the lists always stay the same length)
LAST_PAGE = 9          # last listing page fetched; the catalogue may have more (50?) — TODO confirm before a full crawl
REQUEST_TIMEOUT = 10   # seconds; without a timeout requests.get() can block forever
POLITENESS_DELAY = 1   # seconds to wait after each book, to be nice to the server

for page in range(1, LAST_PAGE + 1):
    # Page 1 is the site root; every later page lives under catalogue/.
    url = BASE_URL if page == 1 else f"{CATALOGUE_URL}page-{page}.html"

    print(f"Scraping page {page} of {LAST_PAGE}...")

    # Fail loudly on HTTP errors instead of parsing an error page.
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    # Each book on a listing page is an <article class="product_pod">.
    books = soup.find_all('article', class_='product_pod')

    for book in books:
        title = book.h3.a['title']

        # Drop the first character of the price text (the currency symbol).
        price = book.find('p', class_='price_color').text[1:]

        # The href is relative to the listing page it came from, so resolving
        # it against that page's URL works for both the root and catalogue/.
        book_url = urljoin(url, book.h3.a['href'])

        book_page = requests.get(book_url, timeout=REQUEST_TIMEOUT)
        book_page.raise_for_status()
        book_soup = BeautifulSoup(book_page.content, 'html.parser')

        # First <img> on the detail page; its src is relative to that page.
        image = book_soup.find('img')
        image_url = urljoin(book_url, image['src'])

        # Third breadcrumb entry is used as the category name.
        category = book_soup.find('ul', class_='breadcrumb').find_all('li')[2].text.strip()

        # The description is the <p> that follows the #product_description
        # heading; some books have neither, so fall back to a generic text.
        description_elem = book_soup.find('div', id='product_description')
        description_para = (
            description_elem.find_next_sibling('p')
            if description_elem is not None else None
        )
        if description_para is not None:
            description = description_para.text.strip()
        else:
            description = f"A {category} book"  # Fallback description

        # Append all fields only after every extraction succeeded, so an
        # exception half-way through a book cannot desynchronise the lists.
        titles.append(title)
        prices.append(price)
        image_urls.append(image_url)
        categories.append(category)
        descriptions.append(description)

        time.sleep(POLITENESS_DELAY)


# Turn the five parallel lists into one record per book. Indexing is driven by
# len(titles), so every other list must be at least that long.
books_data = [
    {
        'title': titles[row],
        'price': prices[row],
        'image_url': image_urls[row],
        'category': categories[row],
        'description': descriptions[row],
    }
    for row in range(len(titles))
]

# Save to CSV file: header first, then one line per book record.
csv_columns = ['title', 'price', 'image_url', 'category', 'description']
with open('books.csv', 'w', newline='', encoding='utf-8') as csv_file:
    book_writer = csv.DictWriter(csv_file, fieldnames=csv_columns)
    book_writer.writeheader()
    for record in books_data:
        book_writer.writerow(record)



Scraping page 1 of 50...
Scraping page 2 of 50...
Scraping page 3 of 50...
Scraping page 4 of 50...
Scraping page 5 of 50...
Scraping page 6 of 50...
Scraping page 7 of 50...
Scraping page 8 of 50...
Scraping page 9 of 50...


In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Read the books data produced by the scraping cell, plus the user table.
books_df = pd.read_csv('books.csv')
users_df = pd.read_csv('Users.csv')

# Create Categories.csv
#
# Names listed here are not turned into categories of their own.
EXCLUDED_CATEGORIES = {'Default', 'Add a comment'}

# Filter BEFORE numbering: numbering first would leave gaps in the ids and
# could leave id 1 unassigned, while the Products cell falls back to
# category_id 1 for any book whose category is not in this table.
unique_categories = books_df['category'].dropna().unique()
kept_categories = [
    category for category in unique_categories
    if category not in EXCLUDED_CATEGORIES
]

categories_data = [
    {
        'id': i,
        'name': category,
        'description': f'Books in the {category} category'
    }
    for i, category in enumerate(kept_categories, 1)
]

# Explicit columns keep the frame (and the CSV header) well-formed even when
# no category survives the filter.
categories_df = pd.DataFrame(categories_data, columns=['id', 'name', 'description'])
categories_df.to_csv('Categories.csv', index=False)


In [11]:
# Build Products.csv: every scraped book becomes 2-4 listings, each with its
# own seller, a price within +/-20% of the scraped price and a random stock.
#
# Reads:  books_df, categories_df
# Writes: products_data (list of dicts, shuffled), products_df, Products.csv
products_data = []
product_id = 1
num_sellers = 9  # seller ids are drawn from 1..num_sellers — assumes those ids exist in Users.csv, TODO confirm

# Category name -> id, built once instead of scanning categories_df twice for
# every single listing.
category_id_by_name = dict(zip(categories_df['name'], categories_df['id']))
FALLBACK_CATEGORY_ID = 1  # used when the book's category has no row in categories_df

for _, book in books_df.iterrows():
    num_listings = random.randint(2, 4)

    # Per-book values do not change between listings of the same book.
    base_price = float(book['price'])
    category_id = category_id_by_name.get(book['category'], FALLBACK_CATEGORY_ID)

    for k in range(num_listings):
        price_variation = random.uniform(-0.2, 0.2)
        adjusted_price = round(base_price * (1 + price_variation), 2)

        products_data.append({
            'id': product_id,
            'name': book['title'],
            'description': book['description'],
            'image_url': book['image_url'],
            'category_id': category_id,
            'seller_id': random.randint(1, num_sellers),
            'price': adjusted_price,
            'inventory': random.randint(0, 100)
        })
        product_id += 1

        # Between two listings of the same book, skip another 3-7 ids so the
        # listings of one book do not get consecutive ids.
        if k < num_listings - 1:
            product_id += random.randint(3, 7)

# Shuffle so that rows of the same book are not adjacent in the output file.
random.shuffle(products_data)

products_df = pd.DataFrame(products_data)
products_df.to_csv('Products.csv', index=False)

In [12]:
# Build Purchases.csv: each user in users_df gets 20-30 purchases of randomly
# chosen products, dated 1-365 days before the moment the cell runs.
#
# Reads:  users_df, products_data
# Writes: purchases_data, purchases_df, Purchases.csv
purchases_data = []
purchase_id = 1

for buyer_id in users_df['id']:
    for _ in range(random.randint(20, 30)):
        chosen_product = random.choice(products_data)
        purchased_at = datetime.now() - timedelta(days=random.randint(1, 365))

        order = {
            'id': purchase_id,
            'uid': buyer_id,
            'pid': chosen_product['id'],
            'time_purchased': purchased_at.strftime('%Y-%m-%d %H:%M:%S'),
            'fulfilled': random.choice([True, False]),
            'quantity': random.randint(1, 3),
            'coupon_code': None,  # no coupons are generated
        }
        purchases_data.append(order)
        purchase_id += 1

purchases_df = pd.DataFrame(purchases_data)
purchases_df.to_csv('Purchases.csv', index=False)

In [13]:
# Build ProductReviews.csv: roughly 70% of purchases receive a product review
# with a 1-5 score, dated 1-14 days after the purchase.
#
# Reads:  purchases_data
# Writes: product_reviews_data, product_reviews_df, ProductReviews.csv
#
# NOTE(review): purchases can be as recent as one day old, so a review dated
# up to 14 days later may lie in the future — confirm that is acceptable.
# NOTE(review): a user who bought the same product twice can end up with two
# reviews for it — verify against the target schema.
product_reviews_data = []
review_id = 1

for purchase in purchases_data:
    # Skip the ~30% of purchases that do not get a review.
    if not random.random() < 0.7:
        continue

    # Score is drawn before the delay, keeping the order of random draws.
    score = random.randint(1, 5)
    bought_at = datetime.strptime(purchase['time_purchased'], '%Y-%m-%d %H:%M:%S')
    reviewed_at = bought_at + timedelta(days=random.randint(1, 14))

    product_reviews_data.append({
        'id': review_id,
        'uid': purchase['uid'],
        'pid': purchase['pid'],
        'rscore': score,
        'time_reviewed': reviewed_at.strftime('%Y-%m-%d %H:%M:%S'),
        'for_seller': False,
    })
    review_id += 1

product_reviews_df = pd.DataFrame(product_reviews_data)
product_reviews_df.to_csv('ProductReviews.csv', index=False)


In [14]:
# Build SellerReviews.csv: roughly 50% of purchases receive a review of the
# seller of the purchased product, with a 1-5 score dated 1-14 days after the
# purchase.
#
# Reads:  purchases_data, products_data
# Writes: seller_reviews_data, seller_reviews_df, SellerReviews.csv
#
# NOTE(review): the same user can review the same seller several times —
# verify against the target schema.
seller_reviews_data = []
review_id = 1

# Product id -> seller id, built once so each purchase is resolved with a dict
# lookup instead of a linear scan over products_data. setdefault keeps the
# first occurrence, matching what a front-to-back search would find.
seller_by_product = {}
for listing in products_data:
    seller_by_product.setdefault(listing['id'], listing['seller_id'])

for purchase in purchases_data:
    if random.random() < 0.5:  # 50% chance of leaving a seller review
        # Raises KeyError if the purchase refers to an unknown product id.
        seller_id = seller_by_product[purchase['pid']]
        seller_reviews_data.append({
            'id': review_id,
            'uid': purchase['uid'],
            'sid': seller_id,
            'rscore': random.randint(1, 5),
            'time_reviewed': (datetime.strptime(purchase['time_purchased'], '%Y-%m-%d %H:%M:%S') +
                            timedelta(days=random.randint(1, 14))).strftime('%Y-%m-%d %H:%M:%S'),
            'for_seller': True
        })
        review_id += 1

seller_reviews_df = pd.DataFrame(seller_reviews_data)
seller_reviews_df.to_csv('SellerReviews.csv', index=False)