# Scraping

This notebook contains all the code used to scrape book information from Goodreads.com.

In [1]:
# import statements

import re
import ssl #only necessary for Mac
import time
from urllib.request import Request, urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# SSL context passed to the urlopen() calls in the cells below.
# NOTE(review): ssl._create_unverified_context() is a private stdlib helper
# that turns off certificate verification, so the HTTPS responses are not
# authenticated. Acceptable for a one-off scrape; do not reuse for anything
# sensitive.
context = ssl._create_unverified_context()  # only necessary for Mac

In [2]:
# Get list of must read books
def scrape_table(url, max_page, context):
    """
    Scrape a paginated list and collect the link to every book in its table.

    Pages url + "1" up to url + str(max_page) are downloaded in order. On each
    page the first <table> is searched for <a itemprop="url"> anchors, the
    href of every anchor is extracted, and every second href (starting with
    the first) is kept. Presumably the anchors alternate book link / author
    link -- TODO confirm against the page markup.

    Parameters:
    url: link to page where table is located; the page number is appended
     directly to it (e.g. a link ending in "?page=")
    max_page: how many pages of the table are wanted
    context: ssl context passed to urlopen (only necessary for Mac in order
     to get the right ssl key)

    Return:
    Dataframe with a single column 'Book link'. Each entry is the href of a
    book INCLUDING its surrounding double quotes, exactly as matched in the
    page source. If a page contains no table, scraping stops there and the
    links gathered so far are returned.
    """
    all_titles = []
    for page in range(1, max_page + 1):
        # `with` closes the connection even when reading the body fails.
        with urlopen(Request(url + str(page)), context=context) as response:
            responseText = response.read()

        soup = BeautifulSoup(responseText, 'html.parser')
        tables = soup.select("table")
        if not tables:
            # Nothing to parse on this page; keep what was already collected
            # instead of failing with an IndexError.
            print(f"No table found on page {page}, stopping")
            break

        anchors = tables[0].find_all('a', {'itemprop': 'url'})
        # Each match is a (quoted href, bare href) tuple. The quoted form is
        # the one stored, so the output format stays what callers expect.
        hrefs = re.findall(r'href=("(.*?)")', str(anchors))
        all_titles += [quoted for quoted, _ in hrefs[::2]]
        print(f"Done scraping page {page}")

    return pd.DataFrame(all_titles, columns=['Book link'])

First, a table with all the books on a given list is scraped; it is later used as a look-up when gathering information about each book.

In [None]:
# Scrape the first `max_page` pages of the "Books That Everyone Should Read
# At Least Once" list on goodreads (the top 2000 books, per the original note).
# The page number is appended to `url_good` by scrape_table.
max_page = 20
url_good = "https://www.goodreads.com/list/show/264.Books_That_Everyone_Should_Read_At_Least_Once?page="

table_of_books = scrape_table(url=url_good, max_page=max_page, context=context)

In [None]:
# Same scrape for the "Worst Rated Books on Goodreads" list.
# Reuses `max_page` defined in the previous cell.
url_bad = "https://www.goodreads.com/list/show/23974.Worst_Rated_Books_on_Goodreads?page="
table_of_books_neg = scrape_table(url=url_bad, max_page=max_page, context=context)

Next, the information about each book must be retrieved. To do this, two functions are defined. The first retrieves the reviews of a given book, and the second retrieves all the necessary information about the book and gathers it in a dataframe.

In [None]:
# Get information from must read books

def get_reviews(target_reviews, max_rev, max_attempts=5):
    """
    The function gets a list of target reviews, opens the individual page of
    each review and scrapes the raw text.

    For every entry, the href of its first <a> tag is appended to main_link,
    that page is downloaded, and the text of its first
    <div itemprop="reviewBody"> is collected.

    Note: main_link and context are NOT parameters. They are read from the
    notebook's global namespace and must be defined before this is called.

    Parameter:
    target_reviews: Result set from soup.find_all with list of all review entries
    max_rev: upper bound on how many entries of target_reviews are processed
    max_attempts: how many times a single review page is requested before the
     review is given up on and skipped (default 5)

    Return:
    a list with each review as a raw text entry. Reviews whose page could not
    be downloaded, or whose page has no review body, are left out.
    """
    reviews = []
    for i in range(min(max_rev, len(target_reviews))):
        anchors = target_reviews[i].find_all("a")
        if not anchors or not anchors[0].get('href'):
            # No link to follow for this entry.
            continue
        link = main_link + anchors[0]['href']

        # Bounded retry with a growing pause. Retrying forever with no delay
        # never terminates on a permanent error (e.g. a deleted review) and
        # hammers the server while doing so.
        link_text = None
        for attempt in range(1, max_attempts + 1):
            try:
                with urlopen(Request(link), context=context) as link_response:
                    status = link_response.getcode()
                    body = link_response.read()
                if status == 200:
                    link_text = body
                    break
                print(f"    Unexpected HTTP status {status} for {link}")
            except Exception as inst:
                # Deliberately broad: scraping is best-effort and any failure
                # of a single request should only cost a retry.
                print(inst)
            if attempt < max_attempts:
                time.sleep(attempt)

        if link_text is None:
            print(f"    Review no. {i} skipped after {max_attempts} failed attempts")
            continue

        subsoup = BeautifulSoup(link_text, 'html.parser')

        tmp_review = subsoup.find_all('div', itemprop='reviewBody')
        if len(tmp_review) < 1:
            continue

        reviews.append(tmp_review[0].get_text())
        print(f"    Review no. {i} acquired")
    return reviews


In [None]:
def scrape_book_info(main_link, table_of_books, context, start_entry, end_entry, max_rev, max_attempts=5):
    """
    The function uses the links from table_of_books to retrieve information
    about each book.

    Parameter:
    main_link: base url that every book link is appended to
    table_of_books: dataframe with a 'Book link' column holding the link to
     each book (stored with surrounding double quotes)
    context: only necessary for Mac in order to get the right ssl key
    start_entry: position of the first book in table_of_books to scrape
    end_entry: position one past the last book to scrape; values beyond the
     end of table_of_books are clamped to its length
    max_rev: parameter allowing you to decide how many reviews you want per book
    max_attempts: how many times a book page is requested before the book is
     given up on and skipped (default 5)

    Return:
    Dataframe with one row per successfully downloaded book, including:
    title: title of the book
    rating: goodreads rating of the book (text as found on the page)
    author: text of the first <span itemprop="name"> on the page
    rating_count: number of ratings as an integer
    related: list of related works, wrapped in a one-element list
    genre: top genre picked for this book (None if the page lists none)
    review: list of raw text of the reviews of the book
    Fields whose element is missing from the page are None.
    """
    columns = ['title', 'rating', 'author', 'rating_count', 'related', 'genre', 'review']
    # Rows are collected in a list and turned into a dataframe once at the
    # end; DataFrame.append no longer exists in pandas 2.x.
    records = []
    last_entry = min(end_entry, len(table_of_books))
    for i in range(start_entry, last_entry):
        print(f'Starting scraping of entry no. {i}')
        # Links are stored as '"/book/show/..."'. Strip the quotes directly
        # rather than eval()-ing text that was scraped from the web.
        book_path = str(table_of_books['Book link'].iloc[i]).strip('"')
        responseText = _fetch_book_page(main_link + book_path, context, max_attempts)
        if responseText is None:
            print(f'Entry no. {i} skipped after {max_attempts} failed attempts')
            continue

        soup = BeautifulSoup(responseText, 'html.parser')

        book_title = _first_text(soup.find_all("h1"))
        rating = _first_text(soup.find_all('span', {'itemprop': 'ratingValue'}))
        author = _first_text(soup.find_all('span', {'itemprop': 'name'}))

        target_rating_count = soup.find_all('meta', {'itemprop': 'ratingCount'})
        if target_rating_count:
            rating_count = int(target_rating_count[0].attrs['content'])
        else:
            rating_count = None

        # Related works: alt text of the first image inside the first link
        # of every <li class="cover">.
        related = []
        for rel in soup.find_all('li', class_='cover'):
            links = rel.find_all("a")
            images = links[0].find_all('img') if links else []
            if images and images[0].has_attr('alt'):
                related.append(images[0]['alt'])

        target_genre = soup.find_all('a', class_='actionLinkLite bookPageGenreLink')
        genre = target_genre[0].get_text() if target_genre else None

        target_reviews = soup.find_all('div', class_='reviewHeader uitext stacked')
        reviews = get_reviews(target_reviews, max_rev)

        records.append({'title': book_title,
                        'rating': rating,
                        'author': author,
                        'rating_count': rating_count,
                        # wrapped in a one-element list (shape kept for
                        # compatibility with earlier output)
                        'related': [related],
                        'genre': genre,
                        # the downloaded reviews are stored, as the Return
                        # section promises
                        'review': reviews})

    return pd.DataFrame(records, columns=columns)


def _fetch_book_page(url, context, max_attempts):
    """Download url with bounded retries; return the body bytes, or None on failure."""
    for attempt in range(1, max_attempts + 1):
        try:
            with urlopen(Request(url), context=context) as response:
                status = response.getcode()
                body = response.read()
            if status == 200:
                return body
            print(f"Unexpected HTTP status {status} for {url}")
        except Exception as inst:
            # Deliberately broad: any failure of one request only costs a retry.
            print(inst)
        if attempt < max_attempts:
            # Growing pause so a struggling or rate-limiting server is not hammered.
            time.sleep(attempt)
    return None


def _first_text(tags):
    """Return the stripped text of the first tag in tags, or None when tags is empty."""
    return tags[0].get_text(strip=True) if tags else None

In [None]:
# Information about each book is scraped in chunks of CHUNK_SIZE books, and
# after every chunk everything gathered so far is saved as a pickle.
# This is done so that a failure mid-scrape does not lose the finished chunks.
main_link = "https://www.goodreads.com"
CHUNK_SIZE = 10
MAX_REVIEWS = 15

# Chunk boundaries come from the actual table length. A hard-coded upper
# bound runs past the end of the table, and pairing consecutive arange
# values leaves the final chunk unscraped.
n_books = len(table_of_books)

chunk_frames = []
total_df = pd.DataFrame(columns = ['title', 'rating', 'related', 'genre'])
for start in range(0, n_books, CHUNK_SIZE):
    end = min(start + CHUNK_SIZE, n_books)
    book_info = scrape_book_info(main_link, table_of_books, context, start, end, MAX_REVIEWS)
    chunk_frames.append(book_info)
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x;
    # ignore_index gives every book a unique row label.
    total_df = pd.concat(chunk_frames, ignore_index=True)
    total_df.to_pickle('extended_book_info.pcl')
  