# Goodreads: Science Fiction Books by Female Authors (Scraping to a CSV)
Scrape the fields below, and save as a CSV file.

| Field | Example |
| --- | --- |
| Rank | 1 |
| Title | The Handmaid's Tale |
| Author | Margaret Atwood |
| Score | score: 30,733 |
| Votes | 314 people voted |
| Rating | 4.09 avg rating — 1,101,120 ratings |

In [4]:
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [5]:
# Fetch the Goodreads list page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors (403, 404, ...) here
# instead of letting an error page be parsed and fail later with a confusing
# "'NoneType' object has no attribute 'text'" inside the scraping loop.
response = requests.get(
    'https://www.goodreads.com/list/show/6934.Science_Fiction_Books_by_Female_Authors',
    timeout=30,
)
response.raise_for_status()

# Name the parser explicitly so the parse tree does not depend on which optional parser
# libraries happen to be installed, and to avoid bs4's GuessedAtParserWarning.
doc = BeautifulSoup(response.text, 'html.parser')

In [6]:
# Collect every <tr> element in the parsed page; the loop below treats each one as a book.
# NOTE(review): assumes the page contains no <tr> other than book rows -- otherwise the
# .find(...) lookups in that loop return None and raise AttributeError. TODO confirm.
books = doc.find_all('tr')

In [7]:
# Build one dict per table row, keyed by the output field name.
rows = []
for book in books:
    # The href="#" anchor carries the score text; the vote count is the next <a> sibling,
    # so look the anchor up once and reuse it for both fields.
    score_tag = book.find(href="#")
    rows.append({
        'Rank': book.find(class_="number").text.strip(),
        'Title': book.find(class_="bookTitle").text.strip(),
        'Author': book.find(class_="authorName").text.strip(),
        'Score': score_tag.text.strip(),
        'Votes': score_tag.find_next_sibling('a').text,
        'Rating': book.find(class_="minirating").text.strip(),
    })
rows

[{'Rank': '1',
  'Title': "The Handmaid's Tale",
  'Author': 'Margaret Atwood',
  'Score': 'score: 30,733',
  'Votes': '314 people voted',
  'Rating': '4.09 avg rating — 1,102,318 ratings'},
 {'Rank': '2',
  'Title': 'The Hunger Games (The Hunger Games, #1)',
  'Author': 'Suzanne Collins',
  'Score': 'score: 28,553',
  'Votes': '292 people voted',
  'Rating': '4.33 avg rating — 5,742,147 ratings'},
 {'Rank': '3',
  'Title': 'Frankenstein',
  'Author': 'Mary Wollstonecraft Shelley',
  'Score': 'score: 21,909',
  'Votes': '224 people voted',
  'Rating': '3.78 avg rating — 1,023,439 ratings'},
 {'Rank': '4',
  'Title': 'A Wrinkle in Time (Time Quintet, #1)',
  'Author': "Madeleine L'Engle",
  'Score': 'score: 18,720',
  'Votes': '196 people voted',
  'Rating': '4.01 avg rating — 903,270 ratings'},
 {'Rank': '5',
  'Title': 'The Left Hand of Darkness (Hainish Cycle #4)',
  'Author': 'Ursula K. Le Guin',
  'Score': 'score: 17,920',
  'Votes': '184 people voted',
  'Rating': '4.06 avg rating

## Cleaning Up

In [120]:
import re

# Scrape the raw text for every output column. Columns that will later be split apart by
# regex start out as copies of the same source text:
#   Title / Number in Series / Series  <- the bookTitle text
#   Rating / Number of Ratings         <- the minirating text
rows = []
for book in books:
    title_text = book.find(class_="bookTitle").text.strip()
    rating_text = book.find(class_="minirating").text.strip()
    score_tag = book.find(href="#")

    rows.append({
        'Rank': book.find(class_="number").text.strip(),
        'Title': title_text,
        'Author': book.find(class_="authorName").text.strip(),
        'Number in Series': title_text,
        'Series': title_text,
        'Score': score_tag.text.strip(),
        # The vote count is the <a> element that follows the score anchor.
        'Votes': score_tag.find_next_sibling('a').text,
        'Rating': rating_text,
        'Number of Ratings': rating_text,
    })

df = pd.DataFrame(rows)
df.head()

Unnamed: 0,Author,Number in Series,Number of Ratings,Rank,Rating,Score,Series,Title,Votes
0,Margaret Atwood,The Handmaid's Tale,"4.09 avg rating — 1,102,318 ratings",1,"4.09 avg rating — 1,102,318 ratings","score: 30,733",The Handmaid's Tale,The Handmaid's Tale,314 people voted
1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)","4.33 avg rating — 5,742,147 ratings",2,"4.33 avg rating — 5,742,147 ratings","score: 28,553","The Hunger Games (The Hunger Games, #1)","The Hunger Games (The Hunger Games, #1)",292 people voted
2,Mary Wollstonecraft Shelley,Frankenstein,"3.78 avg rating — 1,023,439 ratings",3,"3.78 avg rating — 1,023,439 ratings","score: 21,909",Frankenstein,Frankenstein,224 people voted
3,Madeleine L'Engle,"A Wrinkle in Time (Time Quintet, #1)","4.01 avg rating — 903,270 ratings",4,"4.01 avg rating — 903,270 ratings","score: 18,720","A Wrinkle in Time (Time Quintet, #1)","A Wrinkle in Time (Time Quintet, #1)",196 people voted
4,Ursula K. Le Guin,The Left Hand of Darkness (Hainish Cycle #4),"4.06 avg rating — 98,786 ratings",5,"4.06 avg rating — 98,786 ratings","score: 17,920",The Left Hand of Darkness (Hainish Cycle #4),The Left Hand of Darkness (Hainish Cycle #4),184 people voted


In [122]:
import re
# Regex patterns describing the text to strip from each raw scraped column.
# Every ".*" is greedy, so "up to the last X" below means the last X in the string.

# A space, an opening parenthesis, and everything through the last closing parenthesis,
# e.g. " (The Hunger Games, #1)".
title = "( [(].*[)])"
# Everything from the start of the string through the last " #".
series_number = "(.* #)"
# Everything from the start of the string through the last "(".
series = "(.*[(])"
# Everything from the start of the string through the last space, e.g. the "score: " prefix.
score = "(.* )"
# From the first space to the end of the string, e.g. " avg rating — 1,102,318 ratings".
rating = "( .*$)"
# Everything from the start of the string through the last " — " separator.
num_ratings = "(.* — )"
# From the first space to the end of the string, e.g. " people voted".
votes = "( .*)$"

In [123]:
# Reduce each raw scraped column to just its value.

# Title: drop the trailing " (Series, #N)" suffix so only the book title remains.
df['Title'] = df['Title'].str.replace(title, '', regex = True)

# Series: strip everything up to the opening parenthesis, then the trailing " #N)",
# then the comma that may be left in front of it.
# NOTE: a title with no parenthesised series suffix matches none of these patterns,
# so for such books this column keeps the full title.
df['Series'] = df['Series'].str.replace(series, '', regex = True)
df['Series'] = df['Series'].str.replace('( #[0-9]*[)]$)', '', regex = True)
df['Series'] = df['Series'].str.replace('(,$)', '', regex = True)

# Number in Series: capture the text between the last " #" and the closing ")" at the
# end of the raw title, e.g. "1" from "... (The Hunger Games, #1)".
# This replaces the earlier "delete every letter" approach, which used the character
# class [a-zA-z'] (a typo for A-Z that also matches [ \ ] ^ _ `) and left behind spaces,
# punctuation and any digits that were part of the title itself. Books without a series
# number now get a missing value (NaN), which to_csv writes as an empty field.
df['Number in Series'] = df['Number in Series'].str.extract(r' #([^()#]*)[)]$', expand=False)

# Number of Ratings: remove the "<avg> avg rating — " prefix, then the " ratings" suffix.
df['Number of Ratings'] = df['Number of Ratings'].str.replace(num_ratings, '', regex = True)
df['Number of Ratings'] = df['Number of Ratings'].str.replace('( .*$)', '', regex = True)

# Rating: keep only the leading average, e.g. "4.09".
df['Rating'] = df['Rating'].str.replace(rating, '', regex = True)

# Score: remove the "score: " prefix.
df['Score'] = df['Score'].str.replace(score, '', regex = True)

# Votes: remove the " people voted" suffix.
df['Votes'] = df['Votes'].str.replace(votes, '', regex = True)

In [124]:
# Preview the first rows to check that each column now holds only its cleaned value.
df.head()

Unnamed: 0,Author,Number in Series,Number of Ratings,Rank,Rating,Score,Series,Title,Votes
0,Margaret Atwood,,1102318,1,4.09,30733,The Handmaid's Tale,The Handmaid's Tale,314
1,Suzanne Collins,1.0,5742147,2,4.33,28553,The Hunger Games,The Hunger Games,292
2,Mary Wollstonecraft Shelley,,1023439,3,3.78,21909,Frankenstein,Frankenstein,224
3,Madeleine L'Engle,1.0,903270,4,4.01,18720,Time Quintet,A Wrinkle in Time,196
4,Ursula K. Le Guin,4.0,98786,5,4.06,17920,Hainish Cycle,The Left Hand of Darkness,184


In [125]:
# Write the cleaned table to Goodreads.csv; index=False leaves out the DataFrame's row index.
df.to_csv("Goodreads.csv", index=False)