In [1]:
import re

import requests
from bs4 import BeautifulSoup

In [2]:
!pwd

/home/anthelix/Documents/projetGit/WebScraping


In [3]:
main_url = "http://books.toscrape.com/index.html"
result = requests.get(main_url)
scraped = BeautifulSoup(result.text, 'html.parser')

In [4]:
def getAndParseURL(url):
    response = requests.get(url)
    scraped = BeautifulSoup(response.text, 'html.parser')
    return(scraped)

In [13]:
#Find book URLs on the main page
scraped.find("article", class_ = "product_pod").div.a.get('href')
main_page_products_urls = [x.div.a.get('href') for x in scraped.findAll("article", class_ = "product_pod")]
print(str(len(main_page_products_urls)) + " fetched products URLs")
print("One example:")
print(main_page_products_urls[0])

6 fetched products URLs
One example:
../neither-here-nor-there-travels-in-europe_198/index.html


In [6]:
def getBooksURLs(url):
    '''
    retrieve book links on any given page of the website
    '''
    scraped = getAndParseURL(url)
    # remove the index.html part of the base url before returning the results
    return(["/".join(url.split("/")[:-1]) + "/" + x.div.a.get('href') for x in scraped.findAll("article", class_ = "product_pod")])

In [7]:
#Find book categories URLs on the main page
# Getting the URLs of subsections of a website can be very useful if we want to scrape a specific part of it
categories_urls = [main_url + x.get('href') for x in scraped.find_all("a", href=re.compile("catalogue/category/books"))]
categories_urls = categories_urls[1:] # we remove the first one because it corresponds to all the books

print(str(len(categories_urls)) + " fetched categories URLs")
print("Some examples:")
categories_urls[:5]

50 fetched categories URLs
Some examples:


['http://books.toscrape.com/index.htmlcatalogue/category/books/travel_2/index.html',
 'http://books.toscrape.com/index.htmlcatalogue/category/books/mystery_3/index.html',
 'http://books.toscrape.com/index.htmlcatalogue/category/books/historical-fiction_4/index.html',
 'http://books.toscrape.com/index.htmlcatalogue/category/books/sequential-art_5/index.html',
 'http://books.toscrape.com/index.htmlcatalogue/category/books/classics_6/index.html']

In [8]:
# Get all pages URLs
# store all the results into a list
pages_urls = [main_url]

scraped = getAndParseURL(pages_urls[0])

# while we get two matches, this means that the webpage contains a 'previous' and a 'next' button
# if there is only one button, this means that we are either on the first page or on the last page
# we stop when we get to the last page

while len(scraped.findAll("a", href=re.compile("page"))) == 2 or len(pages_urls) == 1:
    
    # get the new complete url by adding the fetched URL to the base URL (and removing the .html part of the base URL)
    new_url = "/".join(pages_urls[-1].split("/")[:-1]) + "/" + scraped.findAll("a", href=re.compile("page"))[-1].get("href")
    
    # add the URL to the list
    pages_urls.append(new_url)
    
    # parse the next page
    scraped = getAndParseURL(new_url)
    

print(str(len(pages_urls)) + " fetched URLs")
print("Some examples:")
pages_urls[:5]

50 fetched URLs
Some examples:


['http://books.toscrape.com/index.html',
 'http://books.toscrape.com/catalogue/page-2.html',
 'http://books.toscrape.com/catalogue/page-3.html',
 'http://books.toscrape.com/catalogue/page-4.html',
 'http://books.toscrape.com/catalogue/page-5.html']

In [9]:
#We could have just created this list by incrementing ‘page-X.html’ until 50.
#but if the number of pages changes??
#One solution could be to increment the value until we get on a 404 page.
#The 200 code indicates that there is no error. The 404 code tells us that the page was not found.
result = requests.get("http://books.toscrape.com/catalogue/page-50.html")
print("status code for page 50: " + str(result.status_code))

result = requests.get("http://books.toscrape.com/catalogue/page-51.html")
print("status code for page 51: " + str(result.status_code))

status code for page 50: 200
status code for page 51: 404


In [10]:
#with the changes
pages_urls = []

new_page = "http://books.toscrape.com/catalogue/page-1.html"
while requests.get(new_page).status_code == 200:
    pages_urls.append(new_page)
    new_page = pages_urls[-1].split("-")[0] + "-" + str(int(pages_urls[-1].split("-")[1].split(".")[0]) + 1) + ".html"
    

print(str(len(pages_urls)) + " fetched URLs")
print("Some examples:")
pages_urls[:5]

50 fetched URLs
Some examples:


['http://books.toscrape.com/catalogue/page-1.html',
 'http://books.toscrape.com/catalogue/page-2.html',
 'http://books.toscrape.com/catalogue/page-3.html',
 'http://books.toscrape.com/catalogue/page-4.html',
 'http://books.toscrape.com/catalogue/page-5.html']

In [11]:
#Get all products URLs

booksURLs = []
for page in pages_urls:
    booksURLs.extend(getBooksURLs(page))
    
print(str(len(booksURLs)) + " fetched URLs")
print("Some examples:")
booksURLs[:5]

1000 fetched URLs
Some examples:


['http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
 'http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html',
 'http://books.toscrape.com/catalogue/soumission_998/index.html',
 'http://books.toscrape.com/catalogue/sharp-objects_997/index.html',
 'http://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html']

In [12]:
#Get product data     book title, price, availability, image, category, rating
names = []
prices = []
nb_in_stock = []
img_urls = []
categories = []
ratings = []

# scrape data for every book URL: this may take some time
for url in booksURLs:
    scraped = getAndParseURL(url)
    # product name
    names.append(scraped.find("div", class_ = re.compile("product_main")).h1.text)
    # product price
    prices.append(scraped.find("p", class_ = "price_color").text[2:]) # get rid of the pound sign
    # number of available products
    nb_in_stock.append(re.sub("[^0-9]", "", scraped.find("p", class_ = "instock availability").text)) # get rid of non numerical characters
    # image url
    img_urls.append(url.replace("index.html", "") + scraped.find("img").get("src"))
    # product category
    categories.append(scraped.find("a", href = re.compile("../category/books/")).get("href").split("/")[3])
    # ratings
    ratings.append(scraped.find("p", class_ = re.compile("star-rating")).get("class")[1])
    
# add data into pandas df
import pandas as pd

scraped_data = pd.DataFrame({'name': names, 'price': prices, 'nb_in_stock': nb_in_stock, "url_img": img_urls, "product_category": categories, "rating": ratings})
scraped_data.head()

Unnamed: 0,name,price,nb_in_stock,url_img,product_category,rating
0,A Light in the Attic,51.77,22,http://books.toscrape.com/catalogue/a-light-in...,poetry_23,Three
1,Tipping the Velvet,53.74,20,http://books.toscrape.com/catalogue/tipping-th...,historical-fiction_4,One
2,Soumission,50.1,20,http://books.toscrape.com/catalogue/soumission...,fiction_10,One
3,Sharp Objects,47.82,20,http://books.toscrape.com/catalogue/sharp-obje...,mystery_3,Four
4,Sapiens: A Brief History of Humankind,54.23,20,http://books.toscrape.com/catalogue/sapiens-a-...,history_32,Five


Some data cleaning may be useful before using them:

    transform the ratings into numerical values
    remove the numbers in the product_category column

https://github.com/jonathanoheix/scraping_basics_with_beautifulsoup/blob/master/scraping_basics_with_beautifulsoup.ipynb