<a href="https://colab.research.google.com/github/carrieacheung/rory-gilmore-booklist-analysis/blob/main/collect_goodreads_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries

from bs4 import BeautifulSoup
import requests
import re
import collections
import time
import pandas as pd

In [None]:
def get_title(soup):
  '''
  get_title extracts the book title from the BeautifulSoup object

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :title: {string} The title of the book, or '' if it cannot be extracted
  '''
  try:
    title_tag = soup.find('h1', {'id': 'bookTitle'}) # The <h1> tag holding the book title
    return title_tag.get_text(strip=True) # Just the text of the <h1> tag, whitespace trimmed
  except Exception: # Best-effort scrape; unlike a bare except, Ctrl-C and SystemExit still propagate
    return '' # If no title is found

In [None]:
def get_book_number(soup):
  '''
  get_book_number extracts the book number from the BeautifulSoup object

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :book_number: {string} The number of the book in the series, or '' if it cannot be extracted
  '''
  try:
    series_heading = soup.find('h2', {'id': 'bookSeries'}) # The <h2> tag wrapping the series <a> tag
    series_text = series_heading.find('a').get_text(strip=True) # Text of the <a> tag with the series details
    return re.search(r"#(\d+)", series_text).group(1) # The digits that follow the first '#'
  except Exception: # Best-effort scrape; unlike a bare except, Ctrl-C and SystemExit still propagate
    return '' # If no book number is found

In [None]:
def get_series_title(soup):
  '''
  get_series_title extracts the series title from the BeautifulSoup object

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :series_title: {string} The title of the series the book is in, or '' if it cannot be extracted
  '''
  try:
    series_heading = soup.find('h2', {'id': 'bookSeries'}) # The <h2> tag wrapping the series <a> tag
    series_text = series_heading.find('a').get_text(strip=True) # Text of the <a> tag with the series details
    return re.search(r"\(([^)]*)\s#", series_text).group(1) # Text between '(' and the whitespace before '#'
  except Exception: # Best-effort scrape; unlike a bare except, Ctrl-C and SystemExit still propagate
    return '' # If no series title is found

In [None]:
def get_series_length(soup):
  '''
  get_series_length extracts the length of the series from the BeautifulSoup object

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :series_length: {string} The length of the series that the book is in, or '' if it cannot be extracted
  '''
  try:
    series_list = soup.find('div', {'class': 'seriesList'}) # The <div> tag containing the <span> tag
    sibling_text = series_list.find('span').next_sibling.strip() # The text right after the <span>, which holds the series length
    return re.search(r"(\d+)\sbooks", sibling_text).group(1) # The digits in front of 'books'
  except Exception: # Best-effort scrape; unlike a bare except, Ctrl-C and SystemExit still propagate
    return '' # If no series length is found

In [None]:
def get_author(soup):
  '''
  get_author extracts the author(s) from the BeautifulSoup object

  Only contributors with no label, or with the label '(Goodreads Author)', are
  returned; contributors carrying any other label are left out.

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :author: {string} The author(s) comma separated, or '' if they cannot be extracted
  '''
  try:
    primary_authors = [] # Names of the contributors that count as authors
    for container in soup.find_all('div', {'class': 'authorName__container'}): # Loops through each author element
      name = container.find('span', {'itemprop': 'name'}).get_text(strip=True) # Text of the <span> tag with the author's name
      label_tag = container.find('span', {'class': 'greyText'}) # The <span> tag with the author's label, if any
      # Authors without a label are treated the same as those labelled '(Goodreads Author)'
      if not label_tag or label_tag.get_text(strip=True) == '(Goodreads Author)':
        primary_authors.append(name)
    return ', '.join(primary_authors) # Joins the author(s) comma separated
  except Exception: # Best-effort scrape; unlike a bare except, Ctrl-C and SystemExit still propagate
    return '' # If no author is found

In [None]:
def get_publication_date(soup):
  '''
  get_publication_date extracts the month, day and year of publication from the BeautifulSoup object

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :publication_date: {list} Contains strings of the month, day and year; each part is '' when it is not found
  '''
  try:
    # Prefers the <nobr> tag; if it is missing, falls back to the details <div>
    date_tag = soup.find('nobr', {'class': 'greyText'}) or soup.find('div', {'id': 'details'})
    date_text = date_tag.get_text(strip=True) # Extracts just the text from the tag
    patterns = (
      r'ublished.*?(January|February|March|April|May|June|July|August|September|October|November|December)', # Month name
      # Day: a standalone 1-2 digit number, optionally with an ordinal suffix. The digit
      # look-arounds stop it from matching the first digits of the year when no day is given
      r'ublished.*?(?<!\d)(\d{1,2})(?:st|nd|rd|th)?(?!\d)',
      r'ublished.*?(?<!\d)(\d{4})(?!\d)', # Year: a standalone 4 digit number
    )
    publication_date = []
    for pattern in patterns:
      found = re.search(pattern, date_text)
      publication_date.append(found.group(1) if found else '') # '' for any part that is missing
    return publication_date
  except Exception: # Best-effort scrape; unlike a bare except, Ctrl-C and SystemExit still propagate
    return ['', '', ''] # If month, day and year are not found

In [None]:
def get_number_of_pages(soup):
  '''
  get_number_of_pages extracts the number of pages in the book from the BeautifulSoup object

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :number_of_pages: {string} The number of pages, or '' if it cannot be extracted
  '''
  try:
    pages_tag = soup.find('span', {'itemprop': 'numberOfPages'}) # The <span> tag containing the number of pages
    pages_text = pages_tag.get_text(strip=True) # Extracts just the text from the tag
    return re.search(r'\b\d+\b', pages_text).group(0) # The first standalone number in the text
  except Exception: # Best-effort scrape; unlike a bare except, Ctrl-C and SystemExit still propagate
    return '' # If the number of pages is not found

In [None]:
def get_genres(soup):
  '''
  get_genres extracts the genres of book from the BeautifulSoup object

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :genres: {string} The genres of the book comma separated, or '' if none can be extracted
  '''
  try:
    genre_list = [] # Initializes a list
    for container in soup.find_all('div', {'class': 'elementList'}): # Loops through each candidate <div> tag
      genre_links = container.find_all('a', {'class': 'actionLinkLite bookPageGenreLink'}) # The <a> tags with the genre and subgenre
      if not genre_links:
        continue # A <div> without genre links is skipped so it cannot wipe out the genres already collected
      # If a subgenre is present the last link is used as the genre, else the only link is the genre
      genre_list.append(genre_links[-1].get_text(strip=True))
    return ', '.join(genre_list) # Joins the genres comma separated
  except Exception: # Best-effort scrape; unlike a bare except, Ctrl-C and SystemExit still propagate
    return '' # If the genres are not found

In [None]:
def get_similar_books(soup):
  '''
  get_similar_books extracts all of the similar books from the BeautifulSoup object

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :similar_books: {string} The similar books as 'title by author' entries joined with ',, ', or '' if none are found
  '''
  try:
    carousel = soup.find('div', {'class': 'bookCarousel'}) # The <div> tag containing the <li> tags
    similar_books_list = [] # Initialize list
    for cover in carousel.find_all('li', {'class': 'cover'}): # Loops through the similar books
      try:
        book_title = cover.find('img')['alt'] # The title of a similar book
        # Captures each run of text between a literal \"> (backslash, quote, >) and the next '<';
        # the second capture is taken as the author
        # NOTE(review): presumably matches escaped markup embedded in the item - confirm against the page source
        book_author = re.findall(r'(?<=\\">)(.+?)(?=<)', str(cover))[1]
        similar_book = book_title + ' by ' + book_author # Combines the title and author
      except Exception:
        continue # Skips an entry that cannot be parsed; Ctrl-C and SystemExit are no longer swallowed
      similar_books_list.append(similar_book) # Add the similar book to the list
    return ',, '.join(similar_books_list) # Join the list together with ',, ' between entries
  except Exception: # Best-effort scrape; unlike a bare except, Ctrl-C and SystemExit still propagate
    return '' # If the similar books are not found

In [None]:
def get_rating_value(soup):
  '''
  get_rating_value extracts the rating value of the book from the BeautifulSoup object

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :rating_value: {string} The rating value as shown on the page, or '' if it cannot be extracted
  '''
  try:
    rating_tag = soup.find('span', {'itemprop': 'ratingValue'}) # The <span> tag containing the rating value
    return rating_tag.get_text(strip=True) # Extracts just the text from the tag
  except Exception: # Best-effort scrape; unlike a bare except, Ctrl-C and SystemExit still propagate
    return '' # If the rating value is not found

In [None]:
def get_number_of_ratings(soup):
  '''
  get_number_of_ratings extracts the number of ratings the book has from the BeautifulSoup object

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :number_of_ratings: {string} The number of ratings, or '' if it cannot be extracted
  '''
  try:
    ratings_meta = soup.find('meta', {'itemprop': 'ratingCount'}) # The <meta> tag containing the number of ratings
    return ratings_meta['content'] # The value of the tag's content attribute
  except Exception: # Best-effort scrape; unlike a bare except, Ctrl-C and SystemExit still propagate
    return '' # If the number of ratings is not found

In [None]:
def get_rating_distribution(soup):
  '''
  get_rating_distribution extracts the number of ratings for each value (1-5) from the BeautifulSoup object

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :rating_distribution: {dictionary} Keys '5' to '1', each holding the matching count as a string ('' when not found)
  '''
  star_labels = ('5', '4', '3', '2', '1') # The first number found is stored under '5', the last under '1'
  try:
    # Searches the page source for the first bracketed list of five integers
    counts = re.search(r'\[(\d+)\, (\d+)\, (\d+)\, (\d+)\, (\d+)\]', str(soup)).groups()
  except Exception: # Best-effort scrape; unlike a bare except, Ctrl-C and SystemExit still propagate
    counts = ('',) * len(star_labels) # If the rating distribution is not found
  return dict(zip(star_labels, counts))

In [None]:
def get_number_of_reviews(soup):
  '''
  get_number_of_reviews extracts the number of reviews the book has from the BeautifulSoup object

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :number_of_reviews: {string} The number of reviews, or '' if it cannot be extracted
  '''
  try:
    reviews_meta = soup.find('meta', {'itemprop': 'reviewCount'}) # The <meta> tag containing the number of reviews
    return reviews_meta['content'] # The value of the tag's content attribute
  except Exception: # Best-effort scrape; unlike a bare except, Ctrl-C and SystemExit still propagate
    return '' # If the number of reviews is not found

In [None]:
def get_cover_img(soup):
  '''
  get_cover_img extracts the cover image from the BeautifulSoup object

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :cover_img: {string} The src attribute of the cover <img> tag, or '' if it cannot be extracted
  '''
  try:
    cover_container = soup.find('div', {'class': 'bookCoverPrimary'}) # The <div> tag containing the cover img
    return cover_container.find('img')['src'] # The src attribute of the first <img> tag inside it
  except Exception: # Best-effort scrape; unlike a bare except, Ctrl-C and SystemExit still propagate
    return '' # If the cover image is not found

In [None]:
def get_synopsis(soup):
  '''
  get_synopsis extracts the book synopsis from the BeautifulSoup object

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :synopsis: {string} The book's synopsis, or '' if it cannot be extracted
  '''
  try:
    description = soup.find('div', {'id': 'description'}) # The <div> tag containing the <span> tag
    synopsis = description.find('span').get_text(strip=True) # Text of the first <span> tag inside it
    # Fixes the apostrophes. The full entity '&apos;' is replaced first so its ';' is not
    # left behind; the bare '&apos' form is then handled as before
    return synopsis.replace("&apos;", "'").replace("&apos", "'")
  except Exception: # Best-effort scrape; unlike a bare except, Ctrl-C and SystemExit still propagate
    return '' # If the synopsis is not found

In [None]:
def get_book_reviews(soup):
  '''
  get_book_reviews extracts the book reviews from the BeautifulSoup object

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :book_reviews: {string} All of the reviews as '<stars>_<review text>' entries joined with ',, ', or '' if they cannot be extracted
  '''
  try:
    all_reviews = [] # Initialize list
    for book_review in soup.find_all('div', {'class': 'friendReviews elementListBrown'}): # Loops through all the reviews
      # The attrs filter must be a dict; the previous set literal {'class', 'staticStar p10'} was a typo
      star_tags = book_review.find_all('span', {'class': 'staticStar p10'}) # The <span> tags matching the star rating class
      star_rating = str(len(star_tags)) # The number of matching tags is used as the star rating
      review_body = book_review.find('span', {'class': 'readable'}) # The <span> tag containing the review
      review_spans = review_body.find_all('span') # The <span> tags holding the displayed and not displayed review text
      if len(review_spans) == 1: # If only one displayed review
        full_review = review_spans[0].get_text(strip=True) # Text of the only <span> tag
      else:
        text_id = review_body.find('a', {'href': '#'})['data-text-id'] # The id suffix stored on the <a> tag
        full_review_id = 'freeText' + text_id # The id of the <span> tag holding the full review
        full_review = review_body.find('span', {'id': full_review_id}).get_text(strip=True) # Text of the full review
      all_reviews.append(star_rating + '_' + full_review) # Combines the star rating and the review
    return ',, '.join(all_reviews) # Joins all the reviews with ',, ' between entries
  except Exception: # Best-effort scrape; unlike a bare except, Ctrl-C and SystemExit still propagate
    return '' # If the book reviews are not found

In [None]:
def get_goodreads_data(soup):
  '''
  get_goodreads_data extracts all of the wanted data from the BeautifulSoup object

  [Args]
  :soup: {BeautifulSoup object} Created from the book's goodreads webpage source code

  [Returns]
  :goodreads_data: {dictionary} The book's data from goodreads, one string value per column name
  '''
  # These two helpers each feed several columns, so they are called once and the result reused
  # instead of re-parsing the page for every column
  month, day, year = get_publication_date(soup)
  rating_distribution = get_rating_distribution(soup)
  goodreads_data = {'Title': get_title(soup),
                    'Author(s)': get_author(soup),
                    'Number in Series': get_book_number(soup),
                    'Series Title': get_series_title(soup),
                    'Length of Series': get_series_length(soup),
                    'Publication Month': month,
                    'Publication Day': day,
                    'Publication Year': year,
                    'Number of Pages': get_number_of_pages(soup),
                    'Genres': get_genres(soup),
                    'Similar Books': get_similar_books(soup),
                    'Rating': get_rating_value(soup),
                    'Number of Ratings': get_number_of_ratings(soup),
                    'Number of 5-Star Ratings': rating_distribution['5'],
                    'Number of 4-Star Ratings': rating_distribution['4'],
                    'Number of 3-Star Ratings': rating_distribution['3'],
                    'Number of 2-Star Ratings': rating_distribution['2'],
                    'Number of 1-Star Ratings': rating_distribution['1'],
                    'Number of Reviews': get_number_of_reviews(soup),
                    'Cover Image Tag': get_cover_img(soup),
                    'Synopsis': get_synopsis(soup)}
  return goodreads_data

In [None]:
def main(ids_path='/content/drive/MyDrive/Colab Notebooks/Rory Gilmore Book List/goodreads_ids.txt',
         csv_path='/content/drive/MyDrive/Colab Notebooks/Rory Gilmore Book List/goodreads_data.csv',
         reviews_path='/content/drive/MyDrive/Colab Notebooks/Rory Gilmore Book List/book_reviews.txt',
         max_attempts=10):
  '''
  main scrapes every book listed in the ids file and saves the collected data and reviews

  [Args]
  :ids_path: {string} Path of the txt file with one goodreads id per line
  :csv_path: {string} Path the csv file with the book data is written to
  :reviews_path: {string} Path the txt file with the book reviews is written to
  :max_attempts: {int} How many times a page is requested before giving up on finding a title

  [Returns]
  None. Writes the csv file and the reviews txt file
  '''
  with open(ids_path, 'r') as f: # Opens a txt file with the goodreads ids
    goodreads_ids = [line.strip() for line in f if line.strip()] # Cleans each goodreads id; blank lines are skipped
  rg_challenge_dict = collections.defaultdict(list) # Initialize a dictionary that the goodreads data will be added to
  review_chunks = [] # One reviews string per book, joined once at the end
  for goodreads_id in goodreads_ids: # Loop through each book
    url = 'https://www.goodreads.com/book/show/' + goodreads_id # The goodreads url for the book
    print(url)
    title = '' # Initializes the value of the title
    soup = BeautifulSoup('', 'html.parser') # Empty parse tree, so the book still gets a (blank) row if every request fails
    for attempt in range(max_attempts): # Runs until a title can be found, at most max_attempts times
      if attempt:
        time.sleep(2) # Pauses before a retry instead of re-requesting immediately
      try:
        page = requests.get(url, timeout=30) # Gets the html webpage; the timeout keeps a stalled request from hanging the run
      except requests.RequestException as err:
        print('Request failed: ' + str(err)) # A network error counts as a failed attempt
        continue
      soup = BeautifulSoup(page.content, "html.parser") # Creates a parse tree
      title = get_title(soup) # Extracts the title from the parse tree
      if title:
        break # The page was correctly requested
    goodreads_data = get_goodreads_data(soup) # Extracts all of the data from goodreads
    for key, value in goodreads_data.items(): # Loops through each column
      rg_challenge_dict[key].append(value) # Appends the books data to the dictionary
    review_chunks.append('Title: ' + title + '|| ' + get_book_reviews(soup) + ' ||| ') # The book's reviews as one string
    time.sleep(2) # Takes a break for the goodreads server
  rg_df = pd.DataFrame(rg_challenge_dict) # Converts the dictionary with data from all of the books into a dataframe
  rg_df.to_csv(csv_path) # Saves the dataframe as a csv file
  with open(reviews_path, 'w', encoding='utf-8') as text_file: # Creates a new txt file and closes it even if the write fails
    text_file.write(''.join(review_chunks)) # Saves the string with the reviews to the txt file

In [None]:
# Run the scraper only when this file is executed directly, not when it is imported
if __name__ == '__main__':
    main()

https://www.goodreads.com/book/show/40961427-1984
https://www.goodreads.com/book/show/1487493.Absolute_Rage
https://www.goodreads.com/book/show/53835.The_Age_of_Innocence
https://www.goodreads.com/book/show/2956.The_Adventures_of_Huckleberry_Finn
https://www.goodreads.com/book/show/24583.The_Adventures_of_Tom_Sawyer
https://www.goodreads.com/book/show/60671823-alice-s-adventures-in-wonderland
https://www.goodreads.com/book/show/96123.All_the_President_s_Men
https://www.goodreads.com/book/show/469571.All_the_Pretty_Horses
https://www.goodreads.com/book/show/3985.The_Amazing_Adventures_of_Kavalier_Clay
https://www.goodreads.com/book/show/22165.American_Steel
https://www.goodreads.com/book/show/331319.An_American_Tragedy
https://www.goodreads.com/book/show/16299.And_Then_There_Were_None
https://www.goodreads.com/book/show/9800581-the-andy-warhol-diaries
https://www.goodreads.com/book/show/252577.Angela_s_Ashes
https://www.goodreads.com/book/show/92250.Millennium_Approaches
https://www.goo