In [None]:
# install selenium
!pip install selenium


In [7]:
#imports
import pandas as pd
import sys
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')  # Add the ChromeDriver path to the system path for execution.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import time
import requests
import re
import numpy as np
pd.set_option('mode.chained_assignment', None)  # turn off the chained-assignment warning (pandas default is 'warn')


In [8]:
#inputs are the df and timeout - how many seconds to wait for a book's review count to load

#input columns read from every row
_SG_REQUIRED_COLUMNS = ('Title', 'Authors', 'ISBN/UID', 'Format')

#columns added to the df, in the order they are appended
_SG_OUTPUT_COLUMNS = ('book_id', 'pub_year', 'genres', 'moods', 'series', 'series_name',
                      'blurb', 'reviews', 'star_rating', 'minutes', 'pages')


def _sg_failed_record():
    """Return the placeholder record stored when a book could not be scraped at all."""
    record = dict.fromkeys(
        ('book_id', 'pages', 'minutes', 'pub_year', 'series',
         'series_name', 'blurb', 'reviews', 'star_rating'),
        np.nan,
    )
    #fresh lists per record so rows never share one list object
    record['genres'] = []
    record['moods'] = []
    return record


def _sg_search_term(title, author, isbn):
    """Build the URL-encoded search term: the ISBN/UID if present, else title + author.

    Returns an empty string when the row has nothing to search with.
    """
    from urllib.parse import quote

    if not pd.isna(isbn):
        #a numeric ISBN column that contains blanks is read as float; drop the spurious '.0'
        if isinstance(isbn, float) and isbn.is_integer():
            isbn = int(isbn)
        isbn_text = str(isbn).strip()
        if isbn_text:
            return quote(isbn_text, safe='')

    words = []
    if not pd.isna(title):
        words.append(str(title).strip())
    if not pd.isna(author):
        #multiple authors are comma separated in the input; search with plain spaces
        words.append(str(author).replace(", ", " ").strip())
    #quote() escapes spaces as %20 and also &, #, ? etc. that would otherwise break the query
    return quote(" ".join(word for word in words if word), safe='')


def _sg_parse_length(text, book_format):
    """Return (minutes, pages) parsed from the edition text; the one that does not apply is NaN."""
    minutes = np.nan
    pages = np.nan
    if book_format == 'audio':
        #audio length is given as hours and minutes; convert it to minutes
        audio_match = re.search(r'(\d+)\s*hours?,\s*(\d+)\s*minutes?', text)
        if audio_match:
            minutes = int(audio_match.group(1)) * 60 + int(audio_match.group(2))
    else:
        pages_match = re.search(r"(\d+)\s+pages", text)
        if pages_match:
            pages = int(pages_match.group(1))
    return minutes, pages


def _sg_parse_pub_year(text):
    """Return the four-digit year following 'first pub' in the edition text, or NaN."""
    year_match = re.search(r"first pub (\d{4})", text)
    return int(year_match.group(1)) if year_match else float('nan')


def _sg_scrape_book(wd, search_url, book_format, timeout):
    """Open the first search result for search_url and return its scraped fields as a dict.

    Errors while loading the search page or locating the first result propagate to the
    caller; every individual field after that is best-effort and falls back to NaN / [] / False.
    """
    record = {}

    #get the url for the book search
    wd.get(search_url)

    #get the link to the book
    book_link = wd.find_element(By.XPATH, "//h1[@class='font-bold text-xl']/a").get_attribute('href')

    #we can also get the book id from the website
    record['book_id'] = book_link.split("/books/")[-1]

    #go to the new link
    wd.get(book_link)

    #MINUTES/PAGES + PUBLISHED YEAR BLOCK
    #the text is reset for every book, so a failed lookup can never reuse the previous book's text
    try:
        info_text = wd.find_element(By.XPATH, '/html/body/div[1]/div/main/div/div[3]/div/div[2]/p').text.strip()
    except Exception:
        info_text = ''
    record['minutes'], record['pages'] = _sg_parse_length(info_text, book_format)
    record['pub_year'] = _sg_parse_pub_year(info_text)

    #GENRES BLOCK
    try:
        #get genres (teal text)
        record['genres'] = [genre.text for genre in wd.find_elements(By.CLASS_NAME, 'text-teal-700') if genre.text.strip()]
    except Exception:
        record['genres'] = []

    #MOOD/PACING BLOCK
    try:
        #get moods (pink text)
        record['moods'] = [mood.text for mood in wd.find_elements(By.CLASS_NAME, 'text-pink-500') if mood.text.strip()]
    except Exception:
        record['moods'] = []

    #SERIES BLOCK
    #NOTE(review): the existence check and the name use different XPaths (div[4] vs div[3]) - confirm against the page
    try:
        #find series, and if it exists then get series name
        wd.find_element(By.XPATH, "/html/body/div[1]/div/main/div/div[4]/div[1]/div[2]/div[1]/h3/p[2]/a")
        series_name = wd.find_element(By.XPATH, "/html/body/div[1]/div/main/div/div[3]/div/div[2]/div[1]/h3/p[1]/a[1]").text
        series = True
    #if no series is found series is set to false and returns a series name of null
    except Exception:
        series = False
        series_name = np.nan
    record['series'] = series
    record['series_name'] = series_name

    #click read more button to get blurb info (not every page has one)
    try:
        read_more_button = WebDriverWait(wd, 1).until(EC.element_to_be_clickable((By.CLASS_NAME, "read-more-btn")))
        read_more_button.click()
    except Exception:
        pass

    #BLURB BLOCK
    try:
        #find blurb, otherwise return as null
        record['blurb'] = wd.find_elements(By.CLASS_NAME, "trix-content")[0].text
    except Exception:
        record['blurb'] = np.nan

    #NUMBER OF REVIEWS BLOCK
    #this can take awhile to load, adjust timeout to how long you are willing to wait for the review count to pop up
    try:
        reviews_text = WebDriverWait(wd, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "inverse-link"))
        ).text
        #no match raises AttributeError on .group, which lands in the except below
        reviews_match = re.search(r'([\d,]+)\s*reviews?', reviews_text)
        record['reviews'] = int(reviews_match.group(1).replace(',', ''))
    except Exception:
        record['reviews'] = np.nan

    #AVERAGE STAR RATING BLOCK
    try:
        #get star rating and convert to float
        record['star_rating'] = float(wd.find_element(By.CLASS_NAME, "average-star-rating").text)
    except Exception:
        record['star_rating'] = np.nan

    return record


def sg_scraper(df,timeout=60):
    """Look up every row of df on app.thestorygraph.com and add the scraped fields as columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Title', 'Authors', 'ISBN/UID' and 'Format'. A row is
        searched by ISBN/UID when present, otherwise by title + author.
    timeout : int, default 60
        Seconds to wait for each book's review count element to appear.

    Returns
    -------
    pandas.DataFrame
        The same df object, modified in place, with the columns book_id, pub_year, genres,
        moods, series, series_name, blurb, reviews, star_rating, minutes and pages added.

    Raises
    ------
    KeyError
        If a non-empty df lacks one of the required columns.

    Rows that cannot be scraped get NaN / [] placeholders and a printed message.
    An empty df gets the new (empty) columns without starting a browser.
    """
    master = 'app.thestorygraph.com' #site we are using
    search_master = '/browse?search_term=' #search url
    results = []  #we are using this to store results for each row

    if len(df) > 0:
        #fail before launching a browser if the input cannot be processed
        missing = [column for column in _SG_REQUIRED_COLUMNS if column not in df.columns]
        if missing:
            raise KeyError(f"sg_scraper: input DataFrame is missing required column(s): {missing}")

        from selenium.common.exceptions import WebDriverException

        # Create chrome instance and configure
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        chrome_options.add_argument('user-agent=name')
        wd = webdriver.Chrome(options=chrome_options)

        try:
            wd.get('https://'+master+search_master)

            #dismiss the cookie popup if it shows up; its absence must not abort the scrape
            try:
                close_button = WebDriverWait(wd, 10).until(
                    EC.element_to_be_clickable((By.ID, "close-cookies-popup"))
                )
                close_button.click()
            except WebDriverException:
                pass

            #iterate through each row
            for index, row in df.iterrows():
                book_format = row['Format']

                #the scraper will search with isbn first, or with title + author if isbn is null
                search = _sg_search_term(row['Title'], row['Authors'], row['ISBN/UID'])
                if not search:
                    print(f"Skipping row {index}: no ISBN/UID, title or author to search with")
                    results.append(_sg_failed_record())
                    continue
                search_url = f"https://{master}{search_master}{search}"

                try:
                    record = _sg_scrape_book(wd, search_url, book_format, timeout)
                except (requests.exceptions.RequestException, AttributeError, TypeError, KeyError) as e:
                    print(f"Error processing {search_url}: {e}")
                    record = _sg_failed_record()
                except Exception as e:
                    print(f"An unexpected error occurred(general): {e}")
                    record = _sg_failed_record()

                results.append(record)
        finally:
            #always shut the browser down, even when the scrape is interrupted or crashes
            wd.quit()

    # Create new columns from the results (every record carries all output keys)
    for column in _SG_OUTPUT_COLUMNS:
        df[column] = [record[column] for record in results]

    return df



In [9]:
# load the export CSV to enrich; change the path to wherever your file lives
# sg_scraper reads the 'Title', 'Authors', 'ISBN/UID' and 'Format' columns of this frame
df = pd.read_csv('storygraphExport.csv')

In [None]:
# timeout (in seconds) is the longest sg_scraper waits for each book's review count to appear
# note: sg_scraper adds its columns to df itself and returns that same object,
# so `scraped` and `df` refer to one DataFrame after this cell
scraped = sg_scraper(df,timeout=60)

In [11]:
# write the enriched table to disk
# NOTE(review): the DataFrame index is written as an unnamed first column; pass index=False if that is unwanted
scraped.to_csv('scraped_storygraphExport.csv') #change this to whatever name you like