In [2]:
import csv
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, date 
import time 

### Scrap the books (name, price, rate) for each category and put them into a CSV & Excel file
### https://books.toscrape.com/

In [3]:
response = requests.request('GET',"https://books.toscrape.com/")
soup = BeautifulSoup(response.content, 'html.parser')


In [4]:
### original attempt, scrapped as it didn't account for multiple pages per category ###
#==============================================================================================#
# #def scrape_categories():
    
# all_categories = soup.find('div',attrs = { 'class':'side_categories'}).find('ul').find_all('a')
# categories_names = []
# categories_links = []
# categories = []

# for category in all_categories:
#     categories_names.append(category.get_text().strip())
#     categories_links.append(category.get('href'))

# for index in range(len(all_categories)):
#     categories.append((categories_names[index] , 'https://books.toscrape.com/'+categories_links[index]))

# #return categories

In [5]:
#attempt 2 
def scrape_pages():
    
    all_categories = soup.find('div',attrs = { 'class':'side_categories'}).find('ul').find_all('a')
    
    categories_names = []       #all main categories 
    categories_main_pages = []  #index/main page of each category '../../index.html'
    categories_links = []       #all links/directories for all categories' main page
    all_pages = []   #all pages' data (category,link)
    
        #save each category's name & link to lists 'names' and 'links'
    for category in all_categories:
        categories_names.append(category.get_text().strip())
        categories_links.append(category.get('href'))
    
        #loop by number of categories
    for index in range(len(all_categories)):
        
            #save category's main page data to list 'main_pages'
        categories_main_pages.append((categories_names[index] , 'https://books.toscrape.com/'+categories_links[index]))
        
            #detect 'next' button in category's main page
        category_response = requests.request('GET',categories_main_pages[index][1])
        category_soup = BeautifulSoup(category_response.content,'html.parser')
        next_btn = category_soup.find('li',attrs={'class':'current'})
        
            #if there is no "next" button (category has single page), add main page data to list 'all_pages' 
        if next_btn == None:   
            all_pages.append((categories_names[index] , 'https://books.toscrape.com/'+categories_links[index]))
            
        else:
                #detect how many pages are there for the category 
            count = int(next_btn.get_text().split()[-1])
            
                #loop by number of pages and add each page's data to list 'all_pages'
            for i in range(1,count+1):
                all_pages.append((categories_names[index] , 'https://books.toscrape.com/'+categories_links[index][:-10]+f'page-{i}.html'))
                
    return all_pages

In [6]:
def scrape_books(all_pages):
    
    book_entries = [] #each book's data (category, name, price, rating)
    
        #loop by each page in , find each book's information and save it in list 'books' 
    for category in range(50,len(all_pages)):     #skips pages with category 'Books' (from 0 to 49) as it contains all other books
        
        response = requests.request('GET',all_pages[category][1])
        soup = BeautifulSoup(response.content,'html.parser')
        books = soup.find_all('article',attrs={'class':'product_pod'}) #finds all the books in the page and puts them in a list 
        
            #loop by number of books found in page
        for book in range(len(books)):
            
                #collect book info
            cat = all_pages[category][0]
            name = books[book].find('h3').find('a').get('title')
            price = books[book].find('p',attrs={'class':'price_color'}).get_text()[1:] + " £"
            rating = books[book].find('p').get('class')[1] + " star(s)"
            
                #append book info to list 'book_entries'
            book_entries.append({"cat": cat, "name": name, "price":price, "rating": rating})
            
    return book_entries


In [11]:
def scrape_the_website():
    
    print("scrapping website pages, please wait...")
    pages = scrape_pages()
    
    print("scraping website pages complete, scrapping book data...")
    book_data = scrape_books(pages)
    
    print("scraping book data complete, writing file...")
    time.sleep(2)
    
    with open('books to scrape web scrapping.csv','w') as f:
        writer = csv.DictWriter(f,fieldnames = ['cat','name','price','rating'])
        writer.writeheader()
        for i in range(len(book_data)):
            writer.writerow(book_data[i])     
    print("done.")

In [13]:
scrape_the_website()

scrapping website pages, please wait...
scraping website pages complete, scrapping book data...
scraping book data complete, writing file...
done.


In [14]:
data = pd.read_csv('books to scrape web scrapping.csv')

In [15]:
data.sample(10)

Unnamed: 0,cat,name,price,rating
294,Childrens,The White Cat and the Monk: A Retelling of the...,58.08 �,Four star(s)
830,Poetry,Twenty Love Poems and a Song of Despair,30.95 �,Four star(s)
456,Default,Maude (1883-1993):She Grew Up with the country,18.02 �,Two star(s)
498,Default,Every Last Word,46.47 �,Three star(s)
719,Fantasy,Harry Potter and the Order of the Phoenix (Har...,31.63 �,Four star(s)
518,Default,Modern Day Fables,47.44 �,Two star(s)
221,Womens Fiction,Something Borrowed (Darcy & Rachel #1),48.96 �,Five star(s)
306,Childrens,Maybe Something Beautiful: How Art Transformed...,22.54 �,One star(s)
273,Fiction,Atlas Shrugged,26.58 �,Five star(s)
440,Music,Love Is a Mix Tape (Music #1),18.03 �,One star(s)


In [16]:
len(data)    #1000 Books

1000