Some information about the project:

1. Define the problem you dealing with
2. Defining the data and which way is best to retrive it
3. Analyzing the data you have retrived
4. Visualize EDA
5. Further data investigation (which algorithems to use)
6. Solution implementation

The exersice: <br/>
Crawl over book deposit website, generate a dataset. <br/>
and make a model that can predict rather the book will be best seller or not


In [48]:
# Used for enable to see the result in the notebook
%matplotlib inline

from bs4 import BeautifulSoup

import requests
import pandas as pd
import time
from random import randrange

from typing import Final, Any
import matplotlib.pyplot as plt

For this our data source will be `https://www.bookdepository.com`. Let's fetch some data

# 1. Fetching all data

In [28]:
# Number of pages to crawl each page contain 30 books
NUMBER_OF_PAGES: Final[int] = 2
URL: Final[str] = 'https://www.bookdepository.com/search?searchLang=<language-id>&page=<page-number>' # Example url: 'https://www.bookdepository.com/search?searchLang=123&page=1'

In [29]:
# TODO: Make this function async

# Loading soup object by url address
def get_soup_obj(url: str) -> BeautifulSoup:
    seconds: int = 5

    while True:
        try:
            r = requests.get(url)
            break
        except:
            print(f'Failed now you should wait {seconds}, seconds.')
            time.sleep(seconds)
            seconds += 5
        
    soupObj = BeautifulSoup(r.content, 'html.parser')
    return soupObj

In [30]:
# Extracting features from the given page
def getTitle(soupObj: BeautifulSoup) -> str:
    title_name = soupObj.find('h1', {'itemprop': 'name'})
    return title_name.text.strip() if title_name is not None else None
    
def getPagesCount(soupObj: BeautifulSoup) -> int:
    pages_count = soupObj.find('span', {'itemprop': 'numberOfPages'})
    return int(pages_count.text.strip().replace(' pages', '')) if pages_count is not None else None
    
def getDatePublished(soupObj: BeautifulSoup) -> str:
    date_published = soupObj.find('span', {'itemprop': 'datePublished'})
    return date_published.text.strip() if date_published is not None else None
    
def getCategory(soupObj: BeautifulSoup) -> list[str]:
    item_ol = soupObj.find('ol', {'class': 'breadcrumb'})
    if item_ol is None:
        return None

    return {t.text.strip() for t in item_ol('a')} 

def getBestSellerRank(soupObj: BeautifulSoup) -> int:
    biblio_info = soupObj.find('ul', {'class':'biblio-info'})
    if biblio_info is None or biblio_info('li') == []:
        return None

    return int(biblio_info('li')[-1].find('span').text.strip().replace(',', ''))

def getRating(soupObj: BeautifulSoup) -> float:
    book_rating = soupObj.find('span', {'itemprop': 'ratingValue'})
    return float(book_rating.text.strip()) if book_rating is not None else None
     
def ratingCount(soupObj: BeautifulSoup) -> int:
    rating_count=soupObj.find('meta', {'itemprop': 'ratingCount'})
    return int(rating_count['content']) if  rating_count is not None else None

def getAuthor(soupObj: BeautifulSoup) -> str:
    author=soupObj.find('span', {'itemprop': 'name'})
    return author.text.strip() if author is not None else None

def getBackType(soupObj: BeautifulSoup) -> str:
    backtype_ul=soupObj.find('ul',{'class':'meta-info hidden-md'})
    return backtype_ul.find('li').text.strip() if backtype_ul is not None else None

def getPrice(soupObj: BeautifulSoup) -> float:
    book_price = soupObj.find('span', {'class': 'sale-price'})
    #ValueError
    if book_price is not None:
        try:
            return float(book_price.text.strip()[1:]) 
        except ValueError:
            return None
    return None
    
def getUrlOfGoodReads(url: str) -> str:
    s = get_soup_obj(url)
    
    exit_to_goodreads = s.find('a', {'class':'exit-to-goodreads'}) 
    if exit_to_goodreads is not None:
        return 'https://www.goodreads.com/book/show' + exit_to_goodreads['href'].replace('/external/GR','')

    return None

def extractNumberOfTextReviews(url: str):
    found = True
    if not url:
        return None

    for i in range(10):
        time.sleep(randrange(1, 3))
        s = get_soup_obj(url)
        number_of_text_reviews = s.find('meta', {'itemprop': 'reviewCount'})
        if number_of_text_reviews:
            break

    if not number_of_text_reviews:
        return None
        
    return number_of_text_reviews['content']


In [31]:
# Extract properties book by book to a list 
def getProp(soupObj: BeautifulSoup) -> dict[str, Any]:
    prop = {
        'title': getTitle(soupObj) ,
        'pages-count': getPagesCount(soupObj),
        'date-published': getDatePublished(soupObj),
        'category': getCategory(soupObj),
        'bestsellers-rank': getBestSellerRank(soupObj),
        'rating':  getRating(soupObj),
        'rating-count': ratingCount(soupObj),
        'author': getAuthor(soupObj),
        'back-type': getBackType(soupObj),
        'price-ILS': getPrice(soupObj)
    }

    return prop

In [32]:
# Crawl the urls of the books from the main page
# page after page
def exctractBooksUrls(pages: int) -> list[str]:
    books_urls = []
    site_domain: str = 'https://www.bookdepository.com'
    page_base_url = f'{site_domain}/search?searchLang=123&page='

    for i in range(pages):
        url: str = f'{page_base_url}{i + 1}'
        print('page number', i + 1, url)

        currentPage: BeautifulSoup = get_soup_obj(url)
        items = currentPage('div', {'class': 'book-item'})
        
        for book in items:
            book_url = book.find('h3').find('a')['href']
            books_urls.append(f'{site_domain}{book_url}')    

    return books_urls
    

In [33]:
def buildDictForDataFrame(booksUrls: list[str]) -> dict[str, list]:
    df_dict: dict[str, list] = {
        'title': [],
        'pages-count': [],
        'date-published': [],
        'category': [],
        'bestsellers-rank': [],
        'rating': [],
        'rating-count': [],
        'text-reviews-count': [],
        'author': [],
        'back-type': [],
        'price-ILS': []
    }

    counter: int = 0
    for url in booksUrls:
        counter += 1
        print('book number', counter)

        book_soup = get_soup_obj(url)
        book = getProp(book_soup)
        
        for key in book:
            df_dict[key].append(book[key])
            
    return df_dict

In [34]:
def getUrlOfGoodReads(url: str):
    s: BeautifulSoup = get_soup_obj(url)
    exit_to_goodreads = s.find('a', {'class':'exit-to-goodreads'}) 

    if exit_to_goodreads is not None:
        return 'https://www.goodreads.com/book/show' + exit_to_goodreads['href'].replace('/external/GR','')
    
    return None

In [35]:
# def extractNumberOfTextReviews(url: str):
#     found: bool = True
#     if not url:
#         return None

#     for i in range(10):
#         s: BeautifulSoup = get_soup_obj(url)
#         number_of_text_reviews = s.find('meta', {'itemprop': 'reviewCount'})
#         if number_of_text_reviews:
#             break

#     if not number_of_text_reviews:
#         return None
        
#     return number_of_text_reviews['content']


In [36]:
def buildListOfUrl(path: str):
    l = []

    with open(path, 'r') as f:
        l = f.read().split('\n')
        
    return l

In [37]:
def load_csv(path: str) -> None:
    return pd.read_csv(path)

In [38]:
books_urls = exctractBooksUrls(2)

page number 1 https://www.bookdepository.com/search?searchLang=123&page=1
page number 2 https://www.bookdepository.com/search?searchLang=123&page=2


In [39]:
df_dict = buildDictForDataFrame(books_urls)
print(df_dict)

book number 1
book number 2
book number 3
book number 4
book number 5
book number 6
book number 7
book number 8
book number 9
book number 10
book number 11
book number 12
book number 13
book number 14
book number 15
book number 16
book number 17
book number 18
book number 19
book number 20
book number 21
book number 22
book number 23
book number 24
book number 25
book number 26
book number 27
book number 28
book number 29
book number 30
book number 31
book number 32
book number 33
book number 34
book number 35
book number 36
book number 37
book number 38
book number 39
book number 40
book number 41
book number 42
book number 43
book number 44
book number 45
book number 46
book number 47
book number 48
book number 49
book number 50
book number 51
book number 52
book number 53
book number 54
book number 55
book number 56
book number 57
book number 58
book number 59
book number 60
{'title': ["It Ends With Us: The most heartbreaking novel you'll ever read : The most heartbreaking novel you

In [40]:
ctr: int = 1
for url in books_urls:
    print('good reads', ctr)
    ctr += 1
    
    gr_url = getUrlOfGoodReads(url)
    df_dict['text-reviews-count'].append(extractNumberOfTextReviews(gr_url))

good reads 1
good reads 2
good reads 3
good reads 4
good reads 5
good reads 6
good reads 7
good reads 8
good reads 9
good reads 10
good reads 11
good reads 12
good reads 13
good reads 14
good reads 15
good reads 16
good reads 17
good reads 18
good reads 19
good reads 20
good reads 21
good reads 22
good reads 23
good reads 24
good reads 25
good reads 26
good reads 27
good reads 28
good reads 29
good reads 30
good reads 31
good reads 32
good reads 33
good reads 34
good reads 35
good reads 36
good reads 37
good reads 38
good reads 39
good reads 40
good reads 41
good reads 42
good reads 43
good reads 44
good reads 45
good reads 46
good reads 47
good reads 48
good reads 49
good reads 50
good reads 51
good reads 52
good reads 53
good reads 54
good reads 55
good reads 56
good reads 57
good reads 58
good reads 59
good reads 60


In [41]:
df = pd.DataFrame(df_dict)

In [42]:
df.head()

Unnamed: 0,title,pages-count,date-published,category,bestsellers-rank,rating,rating-count,text-reviews-count,author,back-type,price-ILS
0,It Ends With Us: The most heartbreaking novel ...,384,02 Aug 2016,"{Romance, Family & Relationships, Safety In Th...",2,4.43,573274,65345,Colleen Hoover,Paperback,46.55
1,Sapiens : A Brief History of Humankind,512,01 May 2015,"{Early Man, Social & Cultural History, History...",72,4.39,705951,42404,Yuval Noah Harari,Paperback,62.48
2,Dear Zoo : A Lift-the-flap Book,18,08 May 2007,"{Baby Books, Animal Stories}",184,4.28,23550,692,Rod Campbell,Board book,39.73
3,The Barefoot Investor : The Only Money Guide Y...,296,01 Jul 2018,"{Personal Finance, Investment & Securities, Fi...",7383,4.42,14836,1276,Scott Pape,Paperback,76.6
4,Milk and Honey,208,08 Jul 2016,"{Poetry By Individual Poets, Mind, Body & Spir...",291,4.03,469183,33454,Rupi Kaur,Paperback,53.2


Good work, now let's save this for further investigation

In [45]:
df.to_csv('../data/book_deposits.csv')

# 2. Visuallizing the data

First thing let's see what we are dealing with

In [47]:
df = pd.read_csv('../data/book_deposits.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,pages-count,date-published,category,bestsellers-rank,rating,rating-count,text-reviews-count,author,back-type,price-ILS
0,0,It Ends With Us: The most heartbreaking novel ...,384,02 Aug 2016,"{'Romance', 'Family & Relationships', 'Safety ...",2,4.43,573274,65345,Colleen Hoover,Paperback,46.55
1,1,Sapiens : A Brief History of Humankind,512,01 May 2015,"{'Early Man', 'Social & Cultural History', 'Hi...",72,4.39,705951,42404,Yuval Noah Harari,Paperback,62.48
2,2,Dear Zoo : A Lift-the-flap Book,18,08 May 2007,"{'Baby Books', 'Animal Stories'}",184,4.28,23550,692,Rod Campbell,Board book,39.73
3,3,The Barefoot Investor : The Only Money Guide Y...,296,01 Jul 2018,"{'Personal Finance', 'Investment & Securities'...",7383,4.42,14836,1276,Scott Pape,Paperback,76.6
4,4,Milk and Honey,208,08 Jul 2016,"{'Poetry By Individual Poets', 'Mind, Body & S...",291,4.03,469183,33454,Rupi Kaur,Paperback,53.2
