In [2]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re

In [3]:
# Browser-like request headers sent with every page request so the scraper
# looks like a regular Chrome session.
headers = {
    # Host the requests are addressed to; must match the site actually being
    # fetched (www.amazon.com), not the site the header set was copied from.
    'authority': 'www.amazon.com',
    'dnt': '1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}

In [4]:
# Scrape the first two pages of Amazon's best-selling books.
# Every book becomes one row in ``data_list``:
#     [name, author, rating, num_user_rating, price]

# Placeholder stored for any field that is not present on the page.
MISSING = -999

data_list = []
for pagenum in range(1, 3):

    # get request to the url
    url = 'https://www.amazon.com/best-sellers-books-Amazon/zgbs/books/ref=zg_bs_pg_2?_encoding=UTF8&pg=' + str(pagenum)
    # A timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(url, headers=headers, timeout=30)
    # Stop on an HTTP error status instead of trying to parse an error page.
    r.raise_for_status()

    content = r.content

    # Name the parser explicitly so the parse result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(content, 'html.parser')
    s = soup.find('ol', attrs={'id': "zg-ordered-list"})
    if s is None:
        # Without this check the failure would surface as an opaque
        # "'NoneType' object has no attribute ..." error on the next line.
        raise RuntimeError(
            "Best-seller list (ol#zg-ordered-list) not found on page %d; "
            "the request may have been blocked or the page layout changed."
            % pagenum
        )

    dl = s.find_all('span', attrs={'class': 'aok-inline-block zg-item'})

    for d in dl:
        # Titles are rendered with one of two class strings; ``find`` returns
        # None for the one that is absent, which raises AttributeError on
        # ``.text`` and triggers the fallback lookup.
        try:
            name = d.find('div', attrs={'class': 'p13n-sc-truncate p13n-sc-line-clamp-1 p13n-sc-truncate-desktop-type2'}).text.strip()
        except AttributeError:
            name = d.find('div', attrs={'class': 'p13n-sc-truncate p13n-sc-line-clamp-1'}).text.strip()

        # Same two-variant lookup for the author line.
        try:
            author = d.find('span', attrs={'class': 'a-size-small a-color-base'}).text.strip()
        except AttributeError:
            author = d.find('div', attrs={'class': 'a-row a-size-small'}).text.strip()

        # A book without a price node is recorded with the placeholder instead
        # of aborting the whole scrape.
        price_ = d.find('span', attrs={'class': 'p13n-sc-price'})
        price = price_.text.strip() if price_ is not None else MISSING

        rating_ = d.find('span', attrs={'class': 'a-icon-alt'})

        if rating_ is not None:
            rating = rating_.text.strip().replace(" out of 5 stars", "")
            num_user_rating_ = d.find('a', attrs={'class': 'a-size-small a-link-normal'})
            if num_user_rating_ is not None:
                # Drop thousands separators so the count can be cast to a number later.
                num_user_rating = num_user_rating_.text.strip().replace(",", "")
            else:
                num_user_rating = MISSING
        else:
            # -999 is set for the books with rating unavailable
            rating = MISSING
            num_user_rating = MISSING

        # adding the result of all requests into one list
        data_list.append([name, author, rating, num_user_rating, price])



In [5]:
# Tabulate the scraped rows (one book per row) and cast the rating and the
# rating count to numeric dtypes for further use.
df = pd.DataFrame(
    data_list,
    columns=['Name', 'Author', 'Rating', 'Num_user_rated', 'Price'],
).assign(
    Num_user_rated=lambda frame: pd.to_numeric(frame["Num_user_rated"]),
    Rating=lambda frame: pd.to_numeric(frame["Rating"]),
)

In [6]:
class Book:
    """One best-seller entry: title, author, rating, rating count and price."""

    def __init__(self, title, author, rating, num_users_rated, price):
        """
        Args:
            title: Title of the book
            author: Author of the book
            rating: Rating given out of 5
            num_users_rated: Number of users that rated the book
            price: Price of the book
        """

        self.title = title
        self.author = author
        self.rating = rating
        self.num_users_rated = num_users_rated
        self.price = price

    def pretty_print(self):
        """Prints the title, author, rating and price on one line,
        followed by an empty line.
        """

        print('"%s" by %s with rating %s costs %s \n' % (self.title, self.author, self.rating, self.price))

    def rating_eval(self, min_ratings=5000):
        """Checks whether the number of ratings given is considered significant.

        Args:
            min_ratings: The rating is considered SSR only when strictly more
                than this many users rated the book. Defaults to 5000.

        Returns:
            "Statistically significant rating - SSR" when ``num_users_rated``
            exceeds ``min_ratings``, otherwise
            "Not statistically significant rating - NSSR".
        """

        if self.num_users_rated > min_ratings:
            return "Statistically significant rating - SSR"
        return "Not statistically significant rating - NSSR"
        

In [7]:
# Try-out: turn every row of the table into a Book object.
for row in df.itertuples():
    single_book = Book(
        title=row.Name,
        author=row.Author,
        rating=row.Rating,
        num_users_rated=row.Num_user_rated,
        price=row.Price,
    )
    # Uncomment to inspect each book while iterating:
    # print(single_book.rating_eval())
    # single_book.pretty_print()

In [8]:
class Library:
    """Ranking and title-search helpers over the best-seller table.

    The methods read a DataFrame with the columns ``Name``, ``Author``,
    ``Rating``, ``Num_user_rated`` and ``Price`` and print the selected rows
    through ``Book.pretty_print``.
    """

    def __init__(self, data=None):
        """
        Args:
            data: Optional DataFrame to query. When omitted, the notebook's
                global ``df`` is used; it is looked up each time a method
                runs, so ``Library()`` keeps working as before.
        """

        # Attribute of type list called book_list, an empty list for now.
        # None of the methods below read it.
        self.book_list = []
        self._data = data

    def _frame(self):
        """Returns the DataFrame the queries run against."""
        return df if self._data is None else self._data

    @staticmethod
    def _words(text):
        """Splits ``text`` into lower-case words.

        Every character other than ASCII letters, digits, underscore, space
        and apostrophe is removed first, so "Fox," and "fox" compare equal.
        """
        return re.sub(r"[^a-zA-Z0-9_ ']", '', str(text)).lower().split()

    @staticmethod
    def _print_books(rows):
        """Pretty-prints every row of ``rows`` as a Book."""
        for row in rows.itertuples():
            Book(row.Name, row.Author, row.Rating, row.Num_user_rated, row.Price).pretty_print()

    def get_top_5(self):

        """Prints the information related to the 5 books that have the highest
        rating using the method pretty_print.
        """

        five_highest = self._frame().sort_values("Rating", ascending=False).iloc[0:5]
        self._print_books(five_highest)

    def simple_search(self, title):

        """
        Args:
            title: Title searched: string

        Returns:
            None. Prints the information related to all books with titles
            exactly matching the searched title using the method pretty_print.
        """

        frame = self._frame()
        self._print_books(frame[frame['Name'] == str(title)])

    def complex_search(self, title):

        """
        Args:
            title: Title searched: string

        Returns:
            None after printing (using the method pretty_print) the book(s)
            whose title matches exactly or, failing that, the book(s) whose
            titles contain the highest number of occurrences of the searched
            words; all tied books are printed.
            If no such book, returns "Nothing matched your search".
        """

        frame = self._frame()
        title = str(title)
        search_result = frame[frame['Name'] == title]

        if len(search_result) == 0:
            search_words = self._words(title)

            # One score per title: occurrences of every search word, summed,
            # so each word of the query contributes, not only the last one.
            scores = []
            for name in frame['Name']:
                title_words = self._words(name)
                scores.append(sum(title_words.count(word) for word in search_words))

            # default=0 covers an empty table, where max() would otherwise raise.
            best_score = max(scores, default=0)
            if best_score == 0:
                return "Nothing matched your search"

            # Keep every title that reaches the best score, in table order.
            best_positions = [pos for pos, score in enumerate(scores) if score == best_score]
            search_result = frame.iloc[best_positions]

        self._print_books(search_result)
     

In [9]:
# Demo: create a Library and look a book up by a couple of words from its title.
a = Library()

# complex_search prints the matching book(s) via pretty_print, or returns
# "Nothing matched your search" (shown as the cell result) when nothing matches.
a.complex_search("Fox horse")


"The Boy, the Mole, the Fox and the Horse" by Charlie Mackesy with rating 4.9 costs $14.03 



- A simple Amazon scraper that extracts best-seller books' details and prices from Amazon.com using Python Requests and BeautifulSoup.
- A browser-like User-Agent header is set so that Amazon lets us view the content.
- -999 is used as a placeholder (imputed value) for unavailable/missing data.
- Some basic regex and string manipulation are used to clean the data for further use.
- Complex search is done by finding the book title that has the highest number of occurrences of the words from the searched string.
