In [1]:
from bs4 import BeautifulSoup

import requests
import pandas as pd
import time
from random import randrange

In [2]:
# Number of search-result pages to crawl; each page contains 30 books.
NUMBER_OF_PAGES = 2
# NOTE(review): URL is not referenced by any other cell visible in this
# notebook (the crawler builds its own page URLs) - confirm it is still needed.
URL = 'https://www.bookdepository.com/search?searchLang=123&page=1' 

In [3]:
def getSoupObj(url, timeout=30):
    """Download ``url`` and return its body parsed with BeautifulSoup.

    The request is retried until it succeeds. After each failure the function
    waits before trying again: 5 seconds the first time, 5 seconds longer on
    every subsequent failure.

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before the attempt counts as
            failed. Without a timeout a stalled connection would block forever.

    Returns:
        A BeautifulSoup object built from the response body using the
        'html.parser' parser.
    """
    wait_seconds = 5
    while True:
        try:
            response = requests.get(url, timeout=timeout)
            break
        # Only network/HTTP-level failures are retried. A bare ``except`` would
        # also swallow KeyboardInterrupt and genuine programming errors, which
        # makes the loop impossible to stop or debug.
        except requests.exceptions.RequestException:
            print('Failed now you should wait', wait_seconds, 'seconds.')
            time.sleep(wait_seconds)
            wait_seconds += 5

    return BeautifulSoup(response.content, 'html.parser')

In [4]:
# Extractor functions: each one reads a single book property from a parsed page
def getTitle(soupObj):
    """Return the stripped text of the ``<h1 itemprop="name">`` element.

    Returns None when the page has no such element.
    """
    heading = soupObj.find('h1', {'itemprop': 'name'})
    if heading is None:
        return None
    return heading.text.strip()
    
def getPagesCount(soupObj):
    """Return the book's page count as an int.

    Reads the ``<span itemprop="numberOfPages">`` element, whose text looks
    like ``"384 pages"``.

    Returns:
        The page count, or None when the element is missing or its text does
        not start with a number.
    """
    pages_tag = soupObj.find('span', {'itemprop': 'numberOfPages'})
    if pages_tag is None:
        return None

    tokens = pages_tag.text.split()
    if not tokens:
        return None

    # Only the leading token carries the number. Ignoring the rest copes with
    # both "pages" and the singular "page"; dropping commas copes with
    # thousands separators such as "1,024 pages".
    try:
        return int(tokens[0].replace(',', ''))
    except ValueError:
        # Same policy as getPrice: an unparseable value becomes None instead
        # of aborting the crawl.
        return None
    
def getDatePublished(soupObj):
    """Return the stripped text of ``<span itemprop="datePublished">``.

    The value is returned as the page shows it (a string, not a parsed date).
    Returns None when the element is absent.
    """
    date_tag = soupObj.find('span', {'itemprop': 'datePublished'})
    if date_tag is None:
        return None
    published = date_tag.text.strip()
    return published
    
def getCategory(soupObj):
    """Collect the book's categories from the breadcrumb trail.

    Returns:
        A set holding the stripped text of every ``<a>`` found inside
        ``<ol class="breadcrumb">``, or None when the page has no breadcrumb.
        Being a set, duplicates collapse and the on-page order is not kept.
    """
    breadcrumb = soupObj.find('ol', {'class': 'breadcrumb'})
    if breadcrumb is None:
        return None

    categories = set()
    # Calling the tag directly looks up all matching descendants.
    for link in breadcrumb('a'):
        categories.add(link.text.strip())
    return categories

def getBestSellerRank(soupObj):
    """Return the bestsellers rank as an int.

    The rank is read from the ``<span>`` inside the last ``<li>`` of
    ``<ul class="biblio-info">``; thousands separators are removed before
    parsing.

    Returns:
        The rank, or None when the list is missing or empty, when its last
        item has no ``<span>``, or when the span text is not a number.
    """
    biblio_info = soupObj.find('ul', {'class': 'biblio-info'})
    if biblio_info is None:
        return None

    entries = biblio_info('li')
    if not entries:
        return None

    # The last entry is not guaranteed to be the rank, so both a missing span
    # and non-numeric text are reported as "no rank" rather than raising and
    # aborting the whole crawl.
    rank_tag = entries[-1].find('span')
    if rank_tag is None:
        return None
    try:
        return int(rank_tag.text.strip().replace(',', ''))
    except ValueError:
        return None

def getRating(soupObj):
    """Return the rating from ``<span itemprop="ratingValue">`` as a float.

    Returns None when the element is absent.
    """
    rating_tag = soupObj.find('span', {'itemprop': 'ratingValue'})
    if rating_tag is None:
        return None
    rating_text = rating_tag.text.strip()
    return float(rating_text)
     
def ratingCount(soupObj):
    """Return the number of ratings as an int.

    The value is taken from the ``content`` attribute of
    ``<meta itemprop="ratingCount">``. Returns None when the tag is absent.
    """
    meta_tag = soupObj.find('meta', {'itemprop': 'ratingCount'})
    if meta_tag is None:
        return None
    return int(meta_tag['content'])

def getAuthor(soupObj):
    """Return the stripped text of the first ``<span itemprop="name">``.

    Returns None when the page has no such element.
    """
    author_tag = soupObj.find('span', {'itemprop': 'name'})
    if author_tag is None:
        return None
    return author_tag.text.strip()

def getBackType(soupObj):
    """Return the stripped text of the first ``<li>`` in the meta-info list.

    The list is ``<ul class="meta-info hidden-md">``. Returns None when that
    list is absent from the page.
    """
    meta_list = soupObj.find('ul', {'class': 'meta-info hidden-md'})
    if meta_list is None:
        return None
    first_item = meta_list.find('li')
    return first_item.text.strip()

def getPrice(soupObj):
    """Return the sale price shown on the page as a float.

    The text of ``<span class="sale-price">`` is stripped and its first
    character is dropped (presumably the currency symbol - confirm against
    the live page) before the remainder is parsed.

    Returns:
        The price, or None when the element is missing or the remaining text
        is not a valid float.
    """
    price_tag = soupObj.find('span', {'class': 'sale-price'})
    if price_tag is None:
        return None

    amount_text = price_tag.text.strip()[1:]
    try:
        return float(amount_text)
    except ValueError:
        return None
    
def getUrlOfGoodReads(url):
    """Return the Goodreads URL linked from the book page at ``url``.

    The page is downloaded with getSoupObj and searched for
    ``<a class="exit-to-goodreads">``. The link's href has '/external/GR'
    removed and is appended to 'https://www.goodreads.com/book/show'.

    Returns None when the page has no such link.
    """
    page = getSoupObj(url)
    link = page.find('a', {'class': 'exit-to-goodreads'})
    if link is None:
        return None
    book_path = link['href'].replace('/external/GR', '')
    return 'https://www.goodreads.com/book/show' + book_path

def extractNumberOfTextReviews(url, max_attempts=10):
    """Return the text-review count published on the page at ``url``.

    The page is fetched up to ``max_attempts`` times until it contains a
    ``<meta itemprop="reviewCount">`` tag. Every attempt is preceded by a
    random pause of 1 or 2 seconds.

    Args:
        url: page to read; a falsy value (None, '') short-circuits to None
            without any network access.
        max_attempts: maximum number of downloads before giving up.

    Returns:
        The tag's ``content`` attribute exactly as found (a string), or None
        when ``url`` is falsy or no attempt produced the tag.
    """
    if not url:
        return None

    for _ in range(max_attempts):
        time.sleep(randrange(1, 3))
        page = getSoupObj(url)
        review_meta = page.find('meta', {'itemprop': 'reviewCount'})
        if review_meta:
            return review_meta['content']
    return None


In [5]:
def getProp(soupObj):
    """Extract every scraped property of one book page into a dict.

    Each key maps to the result of the matching extractor function called on
    ``soupObj``; extractors run in the order listed below, which is also the
    key order of the returned dict.
    """
    extractors = (
        ('title', getTitle),
        ('pages-count', getPagesCount),
        ('date-published', getDatePublished),
        ('category', getCategory),
        ('bestsellers-rank', getBestSellerRank),
        ('rating', getRating),
        ('rating-count', ratingCount),
        ('author', getAuthor),
        ('back-type', getBackType),
        ('price-ILS', getPrice),
    )
    return {key: extract(soupObj) for key, extract in extractors}

In [6]:
def exctractBooksUrls(number_of_pages=None):
    """Crawl the search-result pages and return the URL of every listed book.

    Pages are requested in order starting at page 1, and a progress line is
    printed for each one.

    Args:
        number_of_pages: how many result pages to crawl. Defaults to the
            notebook-level NUMBER_OF_PAGES when omitted.

    Returns:
        A list of absolute book URLs in the order they were found.
    """
    if number_of_pages is None:
        number_of_pages = NUMBER_OF_PAGES

    page_base_url = 'https://www.bookdepository.com/search?searchLang=123&page='
    book_base_url = 'https://www.bookdepository.com'
    books_urls = []

    for page_number in range(1, number_of_pages + 1):
        url = f'{page_base_url}{page_number}'
        print('page number', page_number, url)
        current_page = getSoupObj(url)

        for book in current_page('div', {'class': 'book-item'}):
            # An item without an <h3><a href=...> title link cannot be
            # followed; skip it rather than let one malformed entry abort
            # the whole crawl.
            heading = book.find('h3')
            link = heading.find('a') if heading is not None else None
            href = link.get('href') if link is not None else None
            if not href:
                continue
            books_urls.append(f'{book_base_url}{href}')

    return books_urls
    

In [7]:
def buildDictForDataFrame(booksUrls):
    """Scrape every URL in ``booksUrls`` into a column-oriented dict.

    Each book page is downloaded with getSoupObj, its properties are read
    with getProp, and every property is appended to the list stored under
    the same key. A progress line is printed per book.

    Returns:
        A dict mapping column name to a list of values. The
        'text-reviews-count' list is created here but left empty, because
        getProp does not produce that key.
    """
    columns = (
        'title',
        'pages-count',
        'date-published',
        'category',
        'bestsellers-rank',
        'rating',
        'rating-count',
        'text-reviews-count',
        'author',
        'back-type',
        'price-ILS',
    )
    # A comprehension gives every column its own independent list.
    df_dict = {column: [] for column in columns}

    for ctr, url in enumerate(booksUrls, start=1):
        print('book number', ctr)
        book = getProp(getSoupObj(url))
        for key, value in book.items():
            df_dict[key].append(value)
    return df_dict
        

In [8]:
def getUrlOfGoodReads(url):
    """Map a book page URL to the Goodreads URL it links to.

    Fetches ``url`` with getSoupObj and looks for
    ``<a class="exit-to-goodreads">``. Returns None if the link is absent;
    otherwise returns the Goodreads base URL followed by the link's href
    with '/external/GR' removed.
    """
    # NOTE(review): a function with this same name and body is already defined
    # in an earlier cell; on a top-to-bottom run this later definition is the
    # one that stays bound to the name.
    goodreads_prefix = 'https://www.goodreads.com/book/show'
    anchor = getSoupObj(url).find('a', {'class': 'exit-to-goodreads'})
    if anchor is None:
        return None
    return goodreads_prefix + anchor['href'].replace('/external/GR', '')

In [9]:
def extractNumberOfTextReviews(url):
    """Return the text-review count published on the page at ``url``.

    The page is fetched up to 10 times until it contains a
    ``<meta itemprop="reviewCount">`` tag. The first attempt is immediate;
    each retry is preceded by a random pause of 1 or 2 seconds.

    Args:
        url: page to read; a falsy value (None, '') short-circuits to None
            without any network access.

    Returns:
        The tag's ``content`` attribute exactly as found (a string), or None
        when ``url`` is falsy or no attempt produced the tag.
    """
    # NOTE(review): this redefines a function of the same name from an
    # earlier cell; on a top-to-bottom run this definition is the one in use.
    if not url:
        return None

    for attempt in range(10):
        if attempt:
            # Pause before re-requesting: retrying back-to-back just repeats
            # the same request against a server that did not return the tag.
            time.sleep(randrange(1, 3))
        page = getSoupObj(url)
        review_meta = page.find('meta', {'itemprop': 'reviewCount'})
        if review_meta:
            return review_meta['content']
    return None


In [10]:
def buildListOfUrl(path):
    """Read a text file holding one URL per line and return the URLs.

    Surrounding whitespace (including a stray carriage return) is stripped
    from every line and blank lines are skipped, so a trailing newline at the
    end of the file does not produce an empty entry.

    Args:
        path: location of the text file.

    Returns:
        A list of non-empty strings in file order; empty for an empty file.
    """
    with open(path, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]

In [11]:
def load_csv(path):
    """Load the CSV file at ``path`` into a pandas DataFrame.

    The file is read with ``pd.read_csv`` and no extra options.
    """
    frame = pd.read_csv(path)
    return frame

In [12]:
# Crawl the search-result pages and collect the URL of every listed book.
books_urls = exctractBooksUrls()

page number 1 https://www.bookdepository.com/search?searchLang=123&page=1
page number 2 https://www.bookdepository.com/search?searchLang=123&page=2


In [13]:
# Download each book page and gather its properties column by column.
# The 'text-reviews-count' column is still empty after this step.
df_dict = buildDictForDataFrame(books_urls)

book number 1
book number 2
book number 3
book number 4
book number 5
book number 6
book number 7
book number 8
book number 9
book number 10
book number 11
book number 12
book number 13
book number 14
book number 15
book number 16
book number 17
book number 18
book number 19
book number 20
book number 21
book number 22
book number 23
book number 24
book number 25
book number 26
book number 27
book number 28
book number 29
book number 30
book number 31
book number 32
book number 33
book number 34
book number 35
book number 36
book number 37
book number 38
book number 39
book number 40
book number 41
book number 42
book number 43
book number 44
book number 45
book number 46
book number 47
book number 48
book number 49
book number 50
book number 51
book number 52
book number 53
book number 54
book number 55
book number 56
book number 57
book number 58
book number 59
book number 60


In [14]:
# Look up each book's Goodreads page and read its text-review count.
# The column is rebuilt from scratch and assigned in one step, so re-running
# this cell cannot leave 'text-reviews-count' longer than the other columns
# (appending to the existing list would).
text_review_counts = []
for ctr, url in enumerate(books_urls, start=1):
    print('good reads', ctr)

    gr_url = getUrlOfGoodReads(url)
    text_review_counts.append(extractNumberOfTextReviews(gr_url))

df_dict['text-reviews-count'] = text_review_counts

good reads 1
good reads 2
good reads 3
good reads 4
good reads 5
good reads 6
good reads 7
good reads 8
good reads 9
good reads 10
good reads 11
good reads 12
good reads 13
good reads 14
good reads 15
good reads 16
good reads 17
good reads 18
good reads 19
good reads 20
good reads 21
good reads 22
good reads 23
good reads 24
good reads 25
good reads 26
good reads 27
good reads 28
good reads 29
good reads 30
good reads 31
good reads 32
good reads 33
good reads 34
good reads 35
good reads 36
good reads 37
good reads 38
good reads 39
good reads 40
good reads 41
good reads 42
good reads 43
good reads 44
good reads 45
good reads 46
good reads 47
good reads 48
good reads 49
good reads 50
good reads 51
good reads 52
good reads 53
good reads 54
good reads 55
good reads 56
good reads 57
good reads 58
good reads 59
good reads 60


In [15]:
# Turn the column-name -> list-of-values dict into a DataFrame.
df = pd.DataFrame(df_dict)

In [16]:
# Show the assembled DataFrame as the cell output.
df

Unnamed: 0,title,pages-count,date-published,category,bestsellers-rank,rating,rating-count,text-reviews-count,author,back-type,price-ILS
0,It Ends With Us: The most heartbreaking novel ...,384,02 Aug 2016,"{Adult & Contemporary Romance, Romance, Povert...",2,4.43,570742,64523,Colleen Hoover,Paperback,46.55
1,Sapiens : A Brief History of Humankind,512,01 May 2015,"{Popular Science, Social & Cultural History, H...",58,4.39,705492,42344,Yuval Noah Harari,Paperback,62.48
2,Dear Zoo : A Lift-the-flap Book,18,08 May 2007,"{Baby Books, Animal Stories}",194,4.28,23537,693,Rod Campbell,Board book,39.79
3,The Barefoot Investor : The Only Money Guide Y...,296,01 Jul 2018,"{Investment & Securities, Finance & Accounting...",7327,4.42,14836,1273,Scott Pape,Paperback,76.6
4,Milk and Honey,208,08 Jul 2016,"{Poetry By Individual Poets, Mind, Body & Spir...",291,4.03,467374,33409,Rupi Kaur,Paperback,58.5
5,Where the Crawdads Sing,384,20 Dec 2019,"{Crime Fiction, Thriller Books, Crime, Contemp...",27,4.46,1485146,120704,Delia Owens,Paperback,45.42
6,The Very Hungry Caterpillar,24,29 Sep 1994,"{Classic Books for Children, Children's Genera...",103,4.29,441851,8802,Eric Carle,Board book,42.59
7,The Power of Now : (20th Anniversary Edition),224,07 Jan 2016,"{Religious & Spiritual Leaders, Mind, Body & S...",174,4.13,263373,11740,Eckhart Tolle,Paperback,55.92
8,The Midnight Library : The No.1 Sunday Times b...,304,18 Feb 2021,"{People & Places, Contemporary Fiction}",9,4.08,702381,86435,Matt Haig,Paperback,48.94
9,Giraffes Can't Dance,32,01 May 2014,"{Children's Fiction, Numbers & Counting, Story...",129,4.27,28625,1629,Giles Andreae,Paperback,41.52


In [18]:
# Save the scraped table to 'book_deposits.csv' (path relative to the working directory).
# NOTE(review): to_csv is called with its defaults, so the row index is written
# as an extra leading column - confirm that is wanted.
df.to_csv('book_deposits.csv')