In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Goodreads list page that the following cells scrape.
LIST_URL = 'https://www.goodreads.com/list/show/6934.Science_Fiction_Books_by_Female_Authors'

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(LIST_URL, timeout=30)
# Stop here on an HTTP error rather than parsing an error page into an empty table.
response.raise_for_status()

# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the parsed tree can differ between machines.
doc = BeautifulSoup(response.text, 'html.parser')

In [3]:
# Collect one record per book row; every list entry is a <tr> carrying the
# schema.org Book itemtype.
book_rows = doc.find_all('tr', itemtype='http://schema.org/Book')

booklist = []
for row in book_rows:
    # The first '#' anchor in a row is read as the score; the next sibling
    # anchor after it is read as the vote count.
    score_link = row.find('a', href='#')
    booklist.append({
        'Rank': row.find(class_='number').text.strip(),
        'Name': row.find(class_='bookTitle').text.strip(),
        'Author': row.find(class_='authorName').text.strip(),
        'Score': score_link.text.strip(),
        'Votes': score_link.find_next_sibling('a').text.strip(),
        'Rating': row.find('span', class_='minirating').text.strip(),
    })

# Build the frame once from the accumulated records and preview it.
df = pd.DataFrame(booklist)
df.head()

Unnamed: 0,Author,Name,Rank,Rating,Score,Votes
0,Margaret Atwood,The Handmaid's Tale,1,"4.09 avg rating — 1,101,568 ratings","score: 30,733",314 people voted
1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",2,"4.33 avg rating — 5,741,857 ratings","score: 28,553",292 people voted
2,Mary Wollstonecraft Shelley,Frankenstein,3,"3.78 avg rating — 1,023,245 ratings","score: 21,909",224 people voted
3,Madeleine L'Engle,"A Wrinkle in Time (Time Quintet, #1)",4,"4.01 avg rating — 902,852 ratings","score: 18,720",196 people voted
4,Ursula K. Le Guin,The Left Hand of Darkness (Hainish Cycle #4),5,"4.06 avg rating — 98,748 ratings","score: 17,920",184 people voted


In [4]:
import re

In [5]:
# Reduce the scraped display strings to their bare values. Each pattern has
# exactly one capture group and is called with expand=False, so every extract
# returns a Series that can be assigned to a single column. (Two nested groups
# would make str.extract return a two-column frame instead.)

# "4.09 avg rating — 1,101,568 ratings" -> "4.09"
df['Rating'] = df['Rating'].str.extract(r'^(\d[.]\d\d)', expand=False)
# "score: 30,733" -> "30,733"; the whitespace after the colon stays outside the group.
df['Score'] = df['Score'].str.extract(r'score:\s*(.*)$', expand=False)
# "314 people voted" -> "314"; commas are allowed so a count like "1,314" is not cut at "1".
df['Votes'] = df['Votes'].str.extract(r'^([\d,]+)', expand=False)
# Title is Name without a trailing "(...)" suffix and without the space before it.
# Matching lazily up to the end of the string keeps punctuation such as ':' or '-'
# inside the title instead of truncating at it.
df['Title'] = df.Name.str.extract(r'^(.*?)(?:\s*\([^()]*\))?\s*$', expand=False)
df.head(10)

Unnamed: 0,Author,Name,Rank,Rating,Score,Votes,Title
0,Margaret Atwood,The Handmaid's Tale,1,4.09,30733,314,The Handmaid's Tale
1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",2,4.33,28553,292,The Hunger Games
2,Mary Wollstonecraft Shelley,Frankenstein,3,3.78,21909,224,Frankenstein
3,Madeleine L'Engle,"A Wrinkle in Time (Time Quintet, #1)",4,4.01,18720,196,A Wrinkle in Time
4,Ursula K. Le Guin,The Left Hand of Darkness (Hainish Cycle #4),5,4.06,17920,184,The Left Hand of Darkness
5,Veronica Roth,"Divergent (Divergent, #1)",6,4.21,13326,138,Divergent
6,Suzanne Collins,"Catching Fire (The Hunger Games, #2)",7,4.29,12749,133,Catching Fire
7,Lois Lowry,"The Giver (The Giver, #1)",8,4.12,12399,129,The Giver
8,Octavia E. Butler,Kindred,9,4.23,11070,116,Kindred
9,Ursula K. Le Guin,The Dispossessed (Hainish Cycle #6),10,4.21,10731,112,The Dispossessed


In [8]:
# Series name: the text inside the trailing "(...)" that comes before the '#'
# marker, e.g. "(Time Quintet, #1)" -> "Time Quintet" and
# "(Hainish Cycle #4)" -> "Hainish Cycle". The optional comma and surrounding
# whitespace are matched outside the capture group so they are not kept, and
# the lazy group lets series names end in any character (including a digit).
# Names without such a suffix yield NaN.
df['Series'] = df.Name.str.extract(r'\(([^()]*?)\s*,?\s*#[^()]*\)\s*$', expand=False)
df.head(20)

Unnamed: 0,Author,Name,Rank,Rating,Score,Votes,Title,Series
0,Margaret Atwood,The Handmaid's Tale,1,4.09,30733,314,The Handmaid's Tale,
1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",2,4.33,28553,292,The Hunger Games,The Hunger Games
2,Mary Wollstonecraft Shelley,Frankenstein,3,3.78,21909,224,Frankenstein,
3,Madeleine L'Engle,"A Wrinkle in Time (Time Quintet, #1)",4,4.01,18720,196,A Wrinkle in Time,Time Quintet
4,Ursula K. Le Guin,The Left Hand of Darkness (Hainish Cycle #4),5,4.06,17920,184,The Left Hand of Darkness,Hainish Cycle
5,Veronica Roth,"Divergent (Divergent, #1)",6,4.21,13326,138,Divergent,Divergent
6,Suzanne Collins,"Catching Fire (The Hunger Games, #2)",7,4.29,12749,133,Catching Fire,The Hunger Games
7,Lois Lowry,"The Giver (The Giver, #1)",8,4.12,12399,129,The Giver,The Giver
8,Octavia E. Butler,Kindred,9,4.23,11070,116,Kindred,
9,Ursula K. Le Guin,The Dispossessed (Hainish Cycle #6),10,4.21,10731,112,The Dispossessed,Hainish Cycle


In [9]:
# Series position: the number that follows '#' inside the trailing "(...)".
# Anchoring on '#' avoids picking up digits that belong to the title or the
# series name, and \d+ keeps multi-digit positions such as "#10" intact
# (an optional decimal part like "#0.5" is kept too). Names without a
# "#n" suffix yield NaN.
df['Series Number'] = df.Name.str.extract(r'#(\d+(?:\.\d+)?)[^()]*\)\s*$', expand=False)
df.head()

Unnamed: 0,Author,Name,Rank,Rating,Score,Votes,Title,Series,Series Number
0,Margaret Atwood,The Handmaid's Tale,1,4.09,30733,314,The Handmaid's Tale,,
1,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",2,4.33,28553,292,The Hunger Games,The Hunger Games,1.0
2,Mary Wollstonecraft Shelley,Frankenstein,3,3.78,21909,224,Frankenstein,,
3,Madeleine L'Engle,"A Wrinkle in Time (Time Quintet, #1)",4,4.01,18720,196,A Wrinkle in Time,Time Quintet,1.0
4,Ursula K. Le Guin,The Left Hand of Darkness (Hainish Cycle #4),5,4.06,17920,184,The Left Hand of Darkness,Hainish Cycle,4.0


In [10]:
# The raw 'Name' column has been split into Title / Series / Series Number
# above, so it is removed from the frame before export.
df = df.drop('Name', axis='columns')
df.head(15)

Unnamed: 0,Author,Rank,Rating,Score,Votes,Title,Series,Series Number
0,Margaret Atwood,1,4.09,30733,314,The Handmaid's Tale,,
1,Suzanne Collins,2,4.33,28553,292,The Hunger Games,The Hunger Games,1.0
2,Mary Wollstonecraft Shelley,3,3.78,21909,224,Frankenstein,,
3,Madeleine L'Engle,4,4.01,18720,196,A Wrinkle in Time,Time Quintet,1.0
4,Ursula K. Le Guin,5,4.06,17920,184,The Left Hand of Darkness,Hainish Cycle,4.0


In [11]:
# Write the cleaned table to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="Goodreads.csv", index=False)