In [1]:
from bs4 import BeautifulSoup
import requests
import time
from datetime import datetime
from urllib.parse import urljoin
from openpyxl import Workbook
import pytz

In [2]:
# Column titles for the first worksheet row.
headers = [
    "Date",
    "Time",
    "Book Title",
    "Author",
    "Category",
    "Original Price",
    "Sell Price",
    "Discount",
    "Best Seller Merit",
    "E-book Availability",
    "E-book Price",
    "Ratings Count",
    "Reviews Count",
    "Rating",
    "Publisher",
    "Language",
    "Country",
]

# Timezone object for the 'Asia/Dhaka' zone.
bst = pytz.timezone('Asia/Dhaka')

# Start from an empty workbook and write the header row to its active sheet.
wb = Workbook()
ws = wb.active
ws.append(headers)

In [None]:
# Scrape the Rokomari search-result pages `page_number`..`total_page`, open every
# product linked from each listing, and append one worksheet row per product.
#
# Relies on `bst`, `wb` and `ws` created in the previous cell.
total_page = 166
page_number = 101

SEARCH_URL = 'https://www.rokomari.com/search?term=book&search_type=BOOK&page='
REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks the run forever
OUTPUT_PATH = "C:/Users/asmto/OneDrive/Desktop/rokomari web scrapping projects/rokomari_books_informations2.xlsx"

# Everything that can go wrong while walking markup that is missing or shaped
# differently than expected (None has no .find, empty .contents, a Tag where a
# string was expected, ...).
_PARSE_ERRORS = (AttributeError, IndexError, TypeError, ValueError)

# Worksheet column order after the leading Date / Time columns.
_FIELDS = ["book_title", "book_author", "category", "original_price", "sell_price",
           "discount", "best_seller_merit", "ebook_availability", "ebook_price",
           "ratings_count", "reviews_count", "rating", "publisher_name",
           "language_name", "country_name"]

# Selectors that differ between the two detail-page layouts handled below.
_BOOK_LAYOUT = {
    "author": ("p", "details-book-info__content-author"),
    "category": ("div", "details-book-info__content-category d-flex align-items-center"),
    "price_class": "details-book-info__content-book-price",
    "counts_class": "details-book-info__content-rating",
    "rating_class": "review-wrapper__rating-summary",
    "country_label": "Country",
    "language_label": "Language",
}
_STATIONARY_LAYOUT = {
    "author": ("p", "details-stationary__brand"),
    "category": ("p", "details-stationary__category"),
    "price_class": "details-stationary__price",
    "counts_class": "details-stationary__stars",
    "rating_class": "details-ratings-review__rating-summary",
    "country_label": "Country of Origin",
    "language_label": "Version",
}


def _safe_text(getter):
    """Return ``getter()``, or "" when the markup it walks is missing or malformed."""
    try:
        return getter()
    except _PARSE_ERRORS:
        return ""


def _fetch_soup(target_url):
    """Download ``target_url`` and return it parsed by BeautifulSoup.

    Raises ``requests.RequestException`` on connection problems or timeout.
    """
    response = requests.get(target_url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    parsed = BeautifulSoup(response.content, "html.parser")
    # The prettified markup is parsed a second time, exactly as before, so the
    # whitespace inside extracted text is unchanged by this refactor.
    return BeautifulSoup(parsed.prettify(), "html.parser")


def _listing_links(listing_soup):
    """Return the href of the first anchor in every 'book-list-wrapper' div.

    Wrappers without an anchor or without an href are skipped instead of
    raising or yielding ``None``.
    """
    links = []
    for wrapper in listing_soup.find_all("div", class_='book-list-wrapper'):
        anchor = wrapper.find("a")
        href = anchor.get('href') if anchor is not None else None
        if href:
            links.append(href)
    return links


def _first_token(text):
    """Return the first whitespace-separated token of ``text``, or ""."""
    tokens = text.split()
    return tokens[0] if tokens else ""


def _rating_counts(main, counts_class):
    """Return ``(ratings_count, reviews_count)`` read from the '<ratings> | <reviews>' summary.

    Either value is "" when the summary, or its half of it, is absent.
    """
    summary = _safe_text(
        lambda: main.find("div", class_=counts_class).find("span", class_="ml-2").text.strip()
    )
    # partition() tolerates a missing '|' separator; unpacking split('|') raised ValueError.
    ratings_part, _, reviews_part = summary.partition("|")
    return _first_token(ratings_part), _first_token(reviews_part)


def _discount_text(price_div):
    """Return the save-message text inside ``price_div``, or "" when there is none."""
    if price_div is None or price_div.find("span", class_="js--save-message") is None:
        return ""
    return _safe_text(lambda: price_div.find(class_="js--save-message").text.strip())


def _spec_value(cells, label):
    """Return the text of the <td> that follows the first cell whose text equals ``label``."""
    for cell in cells:
        if cell.text.strip() == label:
            return _safe_text(lambda: cell.find_next_sibling("td").get_text(strip=True))
    return ""


def _book_publisher(specs):
    """Publisher link text from the bordered table of the book layout, or ""."""
    return _safe_text(
        lambda: specs.find("table", class_="table table-bordered")
        .find("td", class_="publisher-link").find("a").text.strip()
    )


def _stationary_publisher(specs):
    """Text of the first link found in a 'proDetailValue' cell, or ""."""
    if specs is None:
        return ""
    for cell in specs.find_all("td", class_="proDetailValue"):
        link = cell.find("a")
        if link is not None:
            return link.text.strip()
    return ""


def _extract_record(soup, main, title_node, specs, layout, publisher_name):
    """Build the field dict (keys listed in ``_FIELDS``) for one detail page.

    ``main`` is the layout's main info container, ``title_node`` its <h1> (may be
    None), ``specs`` the specification container (may be None) and ``layout`` one
    of the selector tables above. Missing pieces become "" ("No" for e-book
    availability).
    """
    price_div = main.find("div", class_=layout["price_class"])
    ebook_div = main.find("div", class_="ebook-buy-now d-flex align-items-center")
    # Collected once per page, so the language lookup can never fall back on
    # cells left over from a previously scraped product.
    spec_cells = specs.find_all("td") if specs is not None else []
    ratings_count, reviews_count = _rating_counts(main, layout["counts_class"])
    author_tag, author_class = layout["author"]
    category_tag, category_class = layout["category"]

    return {
        "book_title": _safe_text(lambda: title_node.contents[0].strip()),
        "book_author": _safe_text(
            lambda: main.find(author_tag, class_=author_class).find("a").text.strip()),
        "category": _safe_text(
            lambda: main.find(category_tag, class_=category_class).find("a").text.strip()),
        "original_price": _safe_text(lambda: price_div.find("strike").text.strip()),
        "sell_price": _safe_text(lambda: main.find("span", class_="sell-price").text.strip()),
        "discount": _discount_text(price_div),
        "best_seller_merit": _safe_text(
            lambda: main.find("p", class_="best-seller-badge").find("span").text.strip()),
        "ebook_availability": "Yes" if ebook_div is not None else "No",
        "ebook_price": _safe_text(
            lambda: ebook_div.find("p", class_="ebook-price").text.strip()),
        "ratings_count": ratings_count,
        "reviews_count": reviews_count,
        "rating": _safe_text(
            lambda: soup.find("div", class_=layout["rating_class"])
            .find("h3", class_="summary-title").text.strip()),
        "publisher_name": publisher_name,
        "language_name": _spec_value(spec_cells, layout["language_label"]),
        "country_name": _spec_value(spec_cells, layout["country_label"]),
    }


def _parse_detail_page(soup):
    """Return the field dict for a detail page, or None when neither layout matches."""
    book_main = soup.find("div", class_="col details-book-main-info align-self-center")
    book_header = soup.find("div", class_="details-book-main-info__header")
    if book_main is not None and book_header is not None:
        specs = soup.find("div", id="book-additional-specification")
        return _extract_record(soup, book_main, book_header.find("h1"), specs,
                               _BOOK_LAYOUT, _book_publisher(specs))

    stationary_main = soup.find("div", class_="details-stationary__details h-100")
    if stationary_main is not None:
        specs = soup.find("table", class_="specification-section__table border-top pt-3 mt-4")
        return _extract_record(soup, stationary_main,
                               stationary_main.find("h1", class_="title mb-0"), specs,
                               _STATIONARY_LAYOUT, _stationary_publisher(specs))
    return None


def _print_record(record, current_date, current_time, listing_page):
    """Echo one scraped record to stdout in the notebook's progress format."""
    print(f"Information of {record['book_title']}: {listing_page}\n\n")
    print("Current Date:", current_date)
    print("Current Time:", current_time)
    print(f"Book Title:{record['book_title']}")
    print(f"Book Author Name:{record['book_author']}")
    print(f"Book Catagory:{record['category']}")
    print(f"Original Price:{record['original_price']} tk")
    print(f"Sell Price:{record['sell_price']} tk")
    print(f"Discount Rate:{record['discount']}")
    print(f"Best seller merit:{record['best_seller_merit']}")
    print(f"E-book availability:{record['ebook_availability']}")
    print(f"E-book price:{record['ebook_price']}")
    print(f"Total Ratings:{record['ratings_count']}")
    print(f"Total Review:{record['reviews_count']}")
    print(f"Ratings:{record['rating']}")
    print(f"Publication name:{record['publisher_name']}")
    print(f"Language:{record['language_name']}")
    print(f"Country name:{record['country_name']}")
    print("-----------------\n\n")


try:
    while page_number <= total_page:

        print(f"---------------data is loading from {page_number}---------\n\n")
        url = SEARCH_URL + str(page_number)
        try:
            listing_soup = _fetch_soup(url)
        except requests.RequestException as error:
            # Report and move on; one unreachable listing should not end the run.
            print(f"Could not load page {page_number}: {error}")
            page_number = page_number + 1
            continue

        for link in _listing_links(listing_soup):
            full_link = urljoin(url, link)  # resolve relative hrefs against the listing URL
            try:
                book_soup = _fetch_soup(full_link)
            except requests.RequestException as error:
                print(f"Could not load {full_link}: {error}")
                continue

            record = _parse_detail_page(book_soup)
            if record is None:
                print("Book title not found for:", full_link)
                continue

            # Date and time come from the same Asia/Dhaka instant so the two
            # columns cannot disagree around midnight or on a non-local machine.
            now = datetime.now(bst)
            current_date = now.date()
            current_time = now.strftime('%I:%M:%S %p')

            ws.append([current_date, current_time] + [record[field] for field in _FIELDS])
            _print_record(record, current_date, current_time, page_number)

        print(f"\n\n---------------data is loaded of {page_number}---------\n\n")
        page_number = page_number + 1
finally:
    # Save whatever has been collected, even if the loop is interrupted or fails.
    wb.save(OUTPUT_PATH)

# NOTE(review): after a complete run this prints the last page number + 1 — confirm intent.
print(f"Total loaded page {page_number}")

---------------data is loading from 101---------


Information of এক অলৌকিক ঘটনা: 101


Current Date: 2024-04-15
Current Time: 09:21:44 PM
Book Title:এক অলৌকিক ঘটনা
Book Author Name:সানিয়াসনাইন খান
Book Catagory:When 8-12: Religious Books
Original Price:TK. 100 tk
Sell Price:TK. 86 tk
Discount Rate:You Save TK. 14 (14%)
Best seller merit:
E-book availability:No
E-book price:
Total Ratings:3
Total Review:2
Ratings:5.0
Publication name:Banglaprokash
Language:Bangla & English
Country name:Bangladesh
-----------------


Information of নজরুলের শিশু কিশোর সাহিত্য: 101


Current Date: 2024-04-15
Current Time: 09:21:46 PM
Book Title:নজরুলের শিশু কিশোর সাহিত্য
Book Author Name:কাজী নজরুল ইসলাম
Book Catagory:শিশু-কিশোর সাহিত্য/রচনা সমগ্র
Original Price: tk
Sell Price:TK. 400 tk
Discount Rate:
Best seller merit:
E-book availability:No
E-book price:
Total Ratings:1
Total Review:1
Ratings:5.0
Publication name:নজরুল ইন্সটিটিউট
Language:বাংলা
Country name:বাংলাদেশ
-----------------


Information of 