# Web Scraping GoodReads.com with Selenium/Python
For a full write-up, see the accompanying blog post: https://readingtheroad.wordpress.com/2019/08/11/web-scraping-in-python-using-selenium/

## Importing libraries

In [1]:
# Selenium for controlling web-browsers
from selenium import webdriver
# For data storage and manipulation
import pandas as pd
# For regex
import re

# Filesystem path to the chromedriver binary that is handed to
# webdriver.Chrome when the browser is launched. The value is specific to
# one machine; point it at your own chromedriver before running.
EXE_PATH = "/Users/aprakash/Documents/personal/chromedriver"

## Helper functions

In [2]:
def goodreads_login(driver, username, password):
    """Log into the page.

    Assuming that the browser is already on the login page, type the
    credentials into the "user_email" and "user_password" fields and click
    the element named "next".

    Args:
    driver(object): selenium browser object
    username(str): username
    password(str): password

    Returns:
    None
    """
    # The find_element_by_* helpers were removed from Selenium 4; the generic
    # find_element(by, value) call is available in both old and new releases.
    # "id" and "name" are the string values of By.ID and By.NAME.
    driver.find_element("id", "user_email").send_keys(username)
    driver.find_element("id", "user_password").send_keys(password)
    driver.find_element("name", "next").click()
    
    
def go_to_page(driver, pagename):
    """Load a web page in the tab that currently has focus.

    Args:
    driver(object): selenium browser object
    pagename(str): address of the page to open

    Returns:
    None
    """
    driver.get(pagename)
    
    
def click_till_disabled(driver, class_name):
    """Click the element with the given class name, unless it is disabled.

    Looks up the first element carrying ``class_name`` on the open page and
    clicks it. Nothing is clicked when no such element exists or when the
    element's class attribute contains the word 'disabled'.

    Args:
    driver(object): selenium browser object
    class_name(str): class name of the button to be clicked

    Returns:
    bool: True if the element was clicked, False otherwise
    """
    from selenium.common.exceptions import NoSuchElementException

    # Only the lookup can legitimately report a missing element, so it is the
    # only statement guarded by the try block.
    try:
        # find_element_by_class_name was removed from Selenium 4;
        # "class name" is the string value of By.CLASS_NAME.
        button = driver.find_element("class name", class_name)
    except NoSuchElementException:
        return False

    if 'disabled' in button.get_attribute('class'):
        return False

    button.click()
    return True


def open_new_tab(driver, page_name, tab_location):
    """Open a new tab, switch focus to it and load a page.

    A blank tab is opened from the current tab through JavaScript, focus is
    moved to the window handle at index ``tab_location``, and ``page_name``
    is loaded there.

    Args:
    driver(object): selenium browser object
    page_name(str): url to be opened in this tab
    tab_location(int): index into driver.window_handles of the new tab

    Returns:
    None
    """
    # Open a blank tab
    driver.execute_script('''window.open("about:blank", "_blank");''')
    # Switch focus to the new tab. driver.switch_to_window() is deprecated and
    # absent from Selenium 4; driver.switch_to.window() is the supported form.
    driver.switch_to.window(driver.window_handles[tab_location])
    # Load the requested page in the focused tab
    driver.get(page_name)
    
    
def regex_token(text, prefix, suffix):
    """Extract a token from a string.

    Extract (with regex) the first occurrence of the shortest group of
    characters found between prefix and suffix. Both prefix and suffix are
    used as regular-expression patterns, not literal strings, and the token
    is read from capture group 1, so prefix should not contain capturing
    groups of its own.

    Args:
    text(str): string in which token exists
    prefix(str): regex pattern before token
    suffix(str): regex pattern after token

    Returns:
    string: extracted token

    Raises:
    ValueError: if text contains no prefix...suffix match
    """
    match = re.search(prefix + r'(.*?)' + suffix, text)
    if match is None:
        # Fail with a message that names the inputs instead of the opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError(
            "no token between {!r} and {!r} in {!r}".format(prefix, suffix, text)
        )
    return match.group(1)


def get_book_genre(driver, href):
    """Open a book details page and get its genres.

    Opens the goodreads.com book page in a new tab, collects the text of
    every element that carries both the 'actionLinkLite' and
    'bookPageGenreLink' classes, then closes the tab and returns focus to
    the handle at index ``n_tabs - 1`` (the last tab that existed before the
    new one was opened).

    Args:
    driver(object): selenium browser object
    href(str): link to book page

    Returns:
    list(str): list of genres associated with book
    """
    n_tabs = len(driver.window_handles)
    open_new_tab(driver, href, n_tabs)

    try:
        # find_elements_by_class_name was removed from Selenium 4. The original
        # compound class name 'actionLinkLite.bookPageGenreLink' is written as
        # the equivalent CSS selector ("css selector" == By.CSS_SELECTOR).
        links = driver.find_elements(
            "css selector", ".actionLinkLite.bookPageGenreLink"
        )
        # Read each element's text once (every .text is a browser round trip).
        # Entries containing 'users' are dropped; presumably these are the
        # "N users" shelf counters rather than genre names.
        labels = [link.text for link in links]
        genres = [label for label in labels if 'users' not in label]
    finally:
        # Always close the book tab and restore focus, even if scraping
        # failed, so a single bad page does not leave stray tabs behind.
        driver.close()
        driver.switch_to.window(driver.window_handles[n_tabs - 1])
    return genres


def get_books_on_page(driver):
    """Given a GoodReads.com page listing books, return book information.

    Finds every element with classes 'field' and 'title' on the open page
    and, for each one, reads the text of its 'value' child and the href of
    the first link inside that child.

    Args:
    driver(object): selenium browser object

    Returns:
    Pandas DataFrame(object): one row per book with the columns
    'Book Name' and 'Book Link'; empty (but with both columns) when the page
    lists no books
    """
    column_names = ['Book Name', 'Book Link']
    rows = []

    # find_elements_by_class_name was removed from Selenium 4. The compound
    # class name 'field.title' is expressed as the equivalent CSS selector;
    # "css selector", "class name" and "tag name" are the string values of
    # By.CSS_SELECTOR, By.CLASS_NAME and By.TAG_NAME.
    for cell in driver.find_elements("css selector", ".field.title"):
        try:
            value = cell.find_element("class name", "value")
            book_name = value.text
            book_link = value.find_element("tag name", "a").get_attribute('href')
        except Exception:
            # Deliberate best-effort: cells without a 'value' child or a link
            # (presumably header cells) are skipped. Catching Exception rather
            # than using a bare except lets Ctrl-C still stop a long scrape.
            continue
        rows.append([book_name, book_link])

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(rows, columns=column_names)

## Web Scraping Protocol

### Logging in

In [None]:
# Step 1. Launch a Chrome session driven by the chromedriver at EXE_PATH
driver = webdriver.Chrome(executable_path=EXE_PATH)
# Step 2. Open the sign-in page and submit the credentials.
# NOTE(review): USERNAME and PASSWORD are not defined in the cells shown
# here; they must be set before this cell is run.
go_to_page(driver, 'https://www.goodreads.com/user/sign_in')
goodreads_login(driver, USERNAME, PASSWORD)


### Getting to the Friends tab

In [None]:
# Load the friends list in the current tab
go_to_page(driver, 'https://www.goodreads.com/friend')


### Extracting information about friends and their books

In [None]:
# Step 1. Create the global storage space to store information about friends
column_names = ['Book Name', 'Book Link']
all_book_data = pd.DataFrame(columns=column_names)
# One frame per friend is collected here and concatenated once at the end;
# DataFrame.append was removed in pandas 2.0 and re-copied the whole table on
# every call.
book_frames = []

# Step 2. Creating some flags to mark the process
CLASS_NAME = 'next_page'
next_icon = True  # Checking when to stop navigation through friends lists
N_FRIENDS = 0  # Counting the number of friends analyzed

# Step 3. Scraping through all friends
while next_icon:

    # Getting the list of friends featured on this page.
    # find_elements_by_class_name was removed from Selenium 4;
    # "class name" is the string value of By.CLASS_NAME.
    friends = driver.find_elements("class name", 'userLink')

    # Iterating through each friend
    for friend in friends:

        N_FRIENDS += 1

        # Getting the profile link of each friend
        href = friend.get_attribute('href')
        # Extracting the userid of the friend (the text between 'show/' and
        # the first '-' of the profile link)
        userid = regex_token(href, 'show/', '-')

        # Getting the link for their 'Read' shelf
        new_link = "https://www.goodreads.com/review/list/{}?shelf=read".format(userid)
        # Open new tab for each friend's read list
        n_tabs = len(driver.window_handles)
        open_new_tab(driver, new_link, n_tabs)

        try:
            # Getting information from friend's reading list.
            # Only the books on the page that opens are read; the shelf is
            # not paginated through.
            books = get_books_on_page(driver)
            book_frames.append(books)
        finally:
            # Close the tab for the friend's reading list and return to the
            # friends page even if scraping failed, so no tab is leaked.
            # switch_to.window replaces the removed switch_to_window.
            driver.close()
            driver.switch_to.window(driver.window_handles[n_tabs - 1])

    # Go to next page if it exists
    next_icon = click_till_disabled(driver, CLASS_NAME)

# Step 4. Combine the per-friend frames into a single table
if book_frames:
    all_book_data = pd.concat(book_frames, ignore_index=True)

### Getting genres related to each book

In [None]:
# Flat list of every genre seen, one entry per (book row, genre) pair
genres_list = []
# Genres already scraped, keyed by book link. The same book can appear on
# several friends' shelves; each distinct page is opened only once.
genre_cache = {}

for index, row in all_book_data.iterrows():

    book_link = row['Book Link']
    if book_link not in genre_cache:
        genre_cache[book_link] = get_book_genre(driver, book_link)
    genres = genre_cache[book_link]

    # Store the genres of this row as one comma-separated string
    all_book_data.loc[index, 'Genre'] = ','.join(genres)
    # Duplicated books still contribute once per row, as before
    genres_list.extend(genres)

### Post-processing

In [None]:
# Save the scraped table to data.csv (relative path)
all_book_data.to_csv(path_or_buf='data.csv')

In [None]:
# Getting the most common genres
genre_data = pd.DataFrame({'Genre': genres_list})
genre_data.loc[:, 'Genre'].value_counts()

In [None]:
# Counting how often each book name occurs across all scraped shelves
all_book_data.loc[:, 'Book Name'].value_counts()