In [1]:
from selenium import webdriver
import time
import json
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


DRIVER_PATH = r'C:\Users\abdul.saboor\Documents\chromedriver.exe'
time_out = 20

def get_element_text_from(parent,by_name):
    
    try:
        return parent.find_element_by_class_name(by_name).text
    except Exception as e:
        pass
    return ""

def close_pop_up(driver):
    try:
        button = driver.find_element_by_xpath("/html/body/div[3]/div/div/div[1]/button")
        print("Closing Pop-up")
        button.click()
    except:
        pass
    return driver


def page_load_wait(driver):
    try:
        myElem = WebDriverWait(driver, time_out).until(EC.presence_of_element_located((By.CLASS_NAME, 'quotes')))
    except TimeoutException:
        print("Loading took too much time!")
    
def open_browser(url):
    # Open Browser and redirect to URL
    driver = webdriver.Chrome(executable_path=DRIVER_PATH)
    driver.get(url)
    page_load_wait(driver)
    return driver

def get_quotes_from_pages(page, page_number, re_try = True):
    try:
        quotes = []
        for div in page.find_elements_by_class_name("quote"):
            quote_text_author = get_element_text_from(div, "quoteText").split("\n")
            quote = quote_text_author[0]
            author = quote_text_author[1]
            tags = get_element_text_from(div, "greyText").replace("tags: ","")
            likes = get_element_text_from(div, "right")
            if likes:
                likes = likes.split()[0]
            else:
                likes = 0
            quotes.append({
                "Page": page_number,
                "Quote" : quote,
                "Author/Book": author,
                "Tags": tags,
                "Likes": likes,
            })
        print(f"Quotes found on this page: {len(quotes)}")
    except:
        if re_try:
            page = close_pop_up(page)
            get_quotes_from_pages(page, page_number, re_try = False)
    return quotes

def move_to_next_page(driver):
    try:
        driver = close_pop_up(driver)
        next_button = driver.find_element_by_class_name("next_page")
        if not next_button.get_property('disabled'):
            print("\n---------------------- Moving on next Page \n")
            next_button.click()
            page_load_wait(driver)
            return driver
    except Exception as e:
        pass
    driver.close()
    return None
    
def get_page_number(driver):
    try:
        return driver.find_element_by_class_name("current").text
    except Exception as e:
        try:
            get_url = driver.current_url 
            if "page=" in get_url:
                return get_url.split("page=")[1]
        except:
            print(f"Exception in get page number: {e}")
            return 0

In [2]:
url = 'https://www.goodreads.com/work/quotes/4835472-o-alquimista'
driver = open_browser(url)

In [3]:
quotes = []
last_page_number = 0
while True:
    page_number = get_page_number(driver)
    print(f"************************************************************* PAGE: {page_number}")
    if last_page_number==page_number:
        break
    else:
        last_page_number = page_number
    print(f"URL: { driver.current_url }")
    quotes.extend(get_quotes_from_pages(driver, page_number))
    driver = move_to_next_page(driver)
    if not driver:
        break
print("Scraping finished")


In [14]:
result = pd.DataFrame(quotes)

In [15]:
result

Unnamed: 0,Page,Quote,Author/Book,Tags,Likes
0,1,"“And, when you want something, all the univers...","― Paulo Coelho, The Alchemist",inspirational,18366
1,1,“It's the possibility of having a dream come t...,"― Paulo Coelho, The Alchemist",inspirational,14844
2,1,"“When we love, we always strive to become bett...","― Paulo Coelho, The Alchemist","hope, inspiration, love, santiago",12265
3,1,“One is loved because one is loved. No reason ...,"― Paulo Coelho, The Alchemist",love,11818
4,1,“There is only one thing that makes a dream im...,"― Paulo Coelho, The Alchemist","achievement, dreams, failure, fear",11164
...,...,...,...,...,...
1834,63,“...the world we live in will be either better...,"― Paulo Coelho, The Alchemist",,0
1835,63,“El Universo fue creado por una lengua que tod...,"― Paulo Coelho, El alquimista",,0
1836,64,"“Sometimes it's better to be with the sheep, w...","― Paulo Coelho, The Alchemist",,0
1837,64,“What’s the world’s greatest lie?” the boy ask...,“It’s this: that at a certain point in our liv...,,0


In [16]:
result.to_csv("alquimista.csv", index=False,encoding="utf-8-sig")