# Imports

In [2]:
import requests
from bs4 import BeautifulSoup
import time
import json
import pandas as pd
import numpy as np 

# Scrape links to all books with summaries available

## Testing - scrape first page (first 50 books)

In [2]:
#URL of the main listing page, which links to every book that has summaries available

url = 'https://novelguide.com/novelguides'

In [3]:
#Request the listing page and check that the request is successful (status code 200 expected)
#A timeout is passed so the cell cannot hang indefinitely if the server never answers

response = requests.get(url, timeout=30)
response.status_code

200

In [4]:
#Extract the body of the response as text (the html of the listing page)

html = response.text

In [5]:
#Parse the html with BeautifulSoup, using the lxml parser

soup = BeautifulSoup(html, 'lxml')

In [6]:
#Cells of every row of the book table except the first row
# (the first row is skipped - presumably a header row; confirm against the page)
td_list = [tr.find_all('td') for tr in soup.find('table', {'class':'cols-2'}).find_all('tr')[1:]]

#Build the absolute link for every table cell that contains an anchor
book_links = []
for row_cells in td_list:
    for cell in row_cells:
        anchor = cell.find('a')  #looked up once and reused below
        if anchor is not None:
            book_links.append('https://novelguide.com' + anchor['href'])

book_links

['https://novelguide.com/index.php/a-portrait-of-the-artist-as-a-young-man',
 'https://novelguide.com/index.php/dubliners',
 'https://novelguide.com/index.php/middlemarch',
 'https://novelguide.com/index.php/the-diary-of-a-young-girl',
 'https://novelguide.com/index.php/jurassic-park/index.html',
 'https://novelguide.com/index.php/beowulf',
 'https://novelguide.com/index.php/everything-is-illuminated',
 'https://novelguide.com/index.php/inherit-the-wind/index.html',
 'https://novelguide.com/index.php/go-tell-it-on-the-mountain',
 'https://novelguide.com/index.php/a-farewell-to-arms',
 'https://novelguide.com/index.php/the-age-of-innocence',
 'https://novelguide.com/index.php/bonfire-of-the-vanities',
 'https://novelguide.com/index.php/across-five-aprils',
 'https://novelguide.com/index.php/the-color-purple',
 'https://novelguide.com/index.php/the-assistant',
 'https://novelguide.com/index.php/aristotles-politics/index.html',
 'https://novelguide.com/index.php/the-count-of-monte-cristo'

In [7]:
#Number of links scraped from the first page

len(book_links)

50

## Scrape links to all books

In [2]:
#Visit every listing page of the website (8 total) and, for each book found, add a dictionary
# holding its title and link to the list of book links.
#The first page is the bare listing url; the following pages are addressed through the `page`
# query parameter (values 1 to 7).
#A page that cannot be fetched or parsed is reported and skipped, so the remaining pages are
# still requested.

book_links = []
for page_index in range(8):
    next_page = page_index + 1  #1-based page number, used only in the progress messages
    if page_index == 0:
        url = 'https://novelguide.com/novelguides'
    else:
        url = f'https://novelguide.com/novelguides?items_per_page=50&page={page_index}'

    response = requests.get(url, timeout=30)  #timeout so a stalled connection cannot hang the cell
    time.sleep(0.5)  #short pause between consecutive requests

    if response.status_code != 200:
        print(f'request {next_page} failed with status code {response.status_code}')
        continue

    print(f'request {next_page} successful')
    soup = BeautifulSoup(response.text, 'lxml')

    table = soup.find('table', {'class':'cols-2'})
    if table is None:
        #find() returns None when nothing matches; report it instead of raising AttributeError
        print(f'no book table found on page {next_page}')
        continue

    #The first table row is skipped (presumably a header row - confirm against the page)
    for tr in table.find_all('tr')[1:]:
        for td in tr.find_all('td'):
            anchor = td.find('a')
            if anchor is not None:
                book = {}
                book['title'] = anchor.text
                book['link'] = 'https://novelguide.com' + anchor['href']
                book_links.append(book)

    print(f'books from page {next_page} added')


request 1 successful
books from page 1 added
request 2 successful
books from page 2 added
request 3 successful
books from page 3 added
request 4 successful
books from page 4 added
request 5 successful
books from page 5 added
request 6 successful
books from page 6 added
request 7 successful
books from page 7 added
request 8 successful
books from page 8 added


In [3]:
#Examine the first 10 entries of the list of book links

book_links[:10]

[{'title': 'The Ambassadors',
  'link': 'https://novelguide.com/the-ambassadors'},
 {'title': 'Ivanhoe', 'link': 'https://novelguide.com/ivanhoe'},
 {'title': 'Crito', 'link': 'https://novelguide.com/crito'},
 {'title': 'Emma', 'link': 'https://novelguide.com/emma'},
 {'title': 'The Assistant', 'link': 'https://novelguide.com/the-assistant'},
 {'title': 'A Passage to India',
  'link': 'https://novelguide.com/a-passage-to-india'},
 {'title': 'Henry VIII',
  'link': 'https://novelguide.com/henry-viii/index.html'},
 {'title': 'Merry Wives of Windsor',
  'link': 'https://novelguide.com/merry-wives-of-windsor'},
 {'title': 'Antigone', 'link': 'https://novelguide.com/antigone-jean-anouilh'},
 {'title': 'A Clockwork Orange',
  'link': 'https://novelguide.com/a-clockwork-orange'}]

In [4]:
#Number of entries in the book links list (books collected across all pages)

len(book_links)

382

## Save book links to json file

In [5]:
#Write the list of title/link dictionaries to disk so later steps can reload it
book_links_path = './data/working_data/book_summaries/book_links.json'
with open(book_links_path, 'w') as out_file:
    json.dump(book_links, out_file)

# Scrape links to all book summaries

## Scrape links

In [6]:
#Read the book links saved in the last step back in as a dataframe (columns: title, link)

book_links_df = pd.read_json('./data/working_data/book_summaries/book_links.json')
book_links_df.head()

Unnamed: 0,title,link
0,The Ambassadors,https://novelguide.com/the-ambassadors
1,Ivanhoe,https://novelguide.com/ivanhoe
2,Crito,https://novelguide.com/crito
3,Emma,https://novelguide.com/emma
4,The Assistant,https://novelguide.com/the-assistant


In [8]:
#Iterate through the book links and create a new list of dictionaries, each containing the title of a
# book and the links to all of the chapter summaries for this book.
#One entry is appended per book even when its page cannot be fetched or parsed (its 'summaries' list
# is then empty), so the result always has the same length as book_links_df.

#ids of the nested <div> containers leading to the book navigation block, outermost first
nav_container_ids = ('mainWrapper', 'header', 'content_top_full', 'block-booknavigation-3')

book_summary_links = []

for i, row in book_links_df.iterrows():

    book = {}
    book['title'] = row['title']
    book['summaries'] = []

    url = row['link']
    response = requests.get(url, timeout=30)  #timeout so a stalled connection cannot hang the loop
    time.sleep(0.5)  #short pause between consecutive requests

    if response.status_code==200:
        soup = BeautifulSoup(response.text, 'lxml')

        #Descend one container at a time; find() returns None when a container is missing
        nav_block = soup
        for container_id in nav_container_ids:
            nav_block = nav_block.find('div', {'id': container_id})
            if nav_block is None:
                break

        if nav_block is None:
            print(f'book {i}: navigation block not found, no summary links added')
        else:
            for li in nav_block.find_all('li'):
                anchor = li.find('a')
                #Keep only navigation entries whose link text mentions "summary"
                if anchor is not None and 'summary' in anchor.text.lower():
                    book['summaries'].append('https://novelguide.com' + anchor['href'])
            print(f'book {i} summary links added')
    else:
        print(f'book {i}: request failed with status code {response.status_code}')

    book_summary_links.append(book)


book 0 summary links added
book 1 summary links added
book 2 summary links added
book 3 summary links added
book 4 summary links added
book 5 summary links added
book 6 summary links added
book 7 summary links added
book 8 summary links added
book 9 summary links added
book 10 summary links added
book 11 summary links added
book 12 summary links added
book 13 summary links added
book 14 summary links added
book 15 summary links added
book 16 summary links added
book 17 summary links added
book 18 summary links added
book 19 summary links added
book 20 summary links added
book 21 summary links added
book 22 summary links added
book 23 summary links added
book 24 summary links added
book 25 summary links added
book 26 summary links added
book 27 summary links added
book 28 summary links added
book 29 summary links added
book 30 summary links added
book 31 summary links added
book 32 summary links added
book 33 summary links added
book 34 summary links added
book 35 summary links added
bo

In [10]:
#Examine the first two entries of the list created

book_summary_links[:2]

[{'title': 'The Ambassadors',
  'summaries': ['https://novelguide.com/the-ambassadors/novel-summary']},
 {'title': 'Ivanhoe',
  'summaries': ['https://novelguide.com/ivanhoe/novel-summary']}]

In [11]:
#Number of entries in book summary links list - confirmed to be the same as book links
#NOTE: an entry is appended for every book regardless of whether its request succeeded, so an equal
# count alone does not show that summary links were found for every book

len(book_summary_links)

382

## Save book summary links to json file

In [12]:
#Write the per-book summary links to disk so later steps can reload them
summary_links_path = './data/working_data/book_summaries/book_summary_links.json'
with open(summary_links_path, 'w') as out_file:
    json.dump(book_summary_links, out_file)

# Filter to usable summaries
Keep only the texts for which both the full text and the chapter summaries are accessible.
<br>
(see 01_02_crosscheck_summaries_text.ipynb for determination of usable texts)

In [31]:
#Reload the list of per-book summary links saved in the last step

summary_links_path = './data/working_data/book_summaries/book_summary_links.json'
with open(summary_links_path) as saved_links:
    book_summary_links = json.load(saved_links)

In [15]:
#Load the dictionary of usable books (created in 01_02_crosscheck_summaries_texts.ipynb)

usable_books_path = './data/working_data/usable_books.json'
with open(usable_books_path) as saved_books:
    usable_books = json.load(saved_books)

In [16]:
#Examine the keys of the usable books dictionary (the book titles)

usable_books.keys()

dict_keys(['Julius Caesar', 'David Copperfield', 'Crito', 'Madame Bovary', 'Of Human Bondage', 'Oliver Twist', 'Cyrano de Bergerac', 'A Tale of Two Cities', 'Cymbeline', "Uncle Tom's Cabin", 'Little Women', 'Walden', 'Macbeth', "Gulliver's Travels", 'The Awakening', 'Dubliners', 'Bleak House', 'A Christmas Carol', 'Beowulf', 'Merry Wives of Windsor', 'The Portrait of a Lady', 'Silas Marner', 'Wuthering Heights', 'The Call of the Wild', 'Middlemarch', 'The Canterbury Tales', 'Siddhartha', "A Connecticut Yankee in King Arthur's Court", 'Les Miserables', 'Divine Comedy', 'Ivanhoe', 'My Antonia', 'Hard Times', 'Phaedo', 'Candide', 'Henry VIII', 'Native Son', 'Meno', 'Notes from the Underground', 'The Count of Monte Cristo', 'Daisy Miller', 'The Comedy of Errors', 'King Lear', 'Adam Bede', 'Crime and Punishment', 'The Beast in the Jungle', 'As You Like It', 'The Adventures of Tom Sawyer', 'The Age of Innocence', 'The Scarlet Letter', 'The Secret Sharer', 'An Ideal Husband', 'Timon of Athens

In [17]:
#Keep the scraped summary entries (title + summary links) whose title is also a key of usable_books
#Membership is tested against the dictionary itself rather than a list rebuilt on every iteration

usable_summaries = [summary for summary in book_summary_links if summary['title'] in usable_books]

In [18]:
#Number of scraped books whose title is also in the usable books dictionary

len(usable_summaries)

80

# Scrape summaries

## Summary scraper

In [101]:
#Create empty lists to fill with titles and texts of chapter summaries, along with the corresponding book title for each
#The three lists are always appended to together, so they stay the same length
summary_titles = []
summary_texts = []
book_titles = []

#find() steps from the parsed page down to the container holding both the summary heading and its text
content_container_steps = [
    ('div', {'id':'mainWrapper'}),
    ('div', {'id':'header'}),
    ('div', {'id':'page-wrapper'}),
    ('div', {'id':'main-content'}),
    ('div', {'id':'content-inner'}),
    ('div', {'id':'content-inner-inner'}),
]

#find() steps from that container down to the element whose text is the summary
summary_text_steps = [
    ('div', {'id':'content-content'}),
    ('div', {'id':'block-novelguide-content'}),
    ('div', {'class':'content'}),
    ('div', {'class':'clear-block'}),
    ('div', {'class':'content clear-block'}),
    ('div', {}),
]


def _descend(node, steps):
    """Apply successive find(name, attrs) calls starting at node.

    Returns the element reached after the last step, or None as soon as node is None
    or one of the steps finds nothing.
    """
    for name, attrs in steps:
        if node is None:
            return None
        node = node.find(name, attrs)
    return node


#For each book in the list of usable summaries, iterate through the corresponding list of links to chapter
# summaries and add the title and text of each summary to the corresponding list.
#A summary whose request fails or whose page does not have the expected layout is reported and skipped,
# so one bad page does not discard everything scraped so far.
#b_num counts books and s_num counts the summaries added for the current book (progress messages only)
for b_num, book in enumerate(usable_summaries, start=1):
    s_num = 1
    for s_url in book['summaries']:
        s_response = requests.get(s_url, timeout=30)  #timeout so a stalled connection cannot hang the loop
        time.sleep(0.5)  #short pause between consecutive requests

        if s_response.status_code != 200:
            print(f'Error with summary request: {s_url} (status code {s_response.status_code})')
            continue

        s_soup = BeautifulSoup(s_response.text, 'lxml')
        content_inner_inner = _descend(s_soup, content_container_steps)
        text_element = _descend(content_inner_inner, summary_text_steps)
        title_element = _descend(content_inner_inner, [('h1', {'class':'title'})])

        if text_element is None or title_element is None:
            print(f'Unexpected page layout, summary skipped: {s_url}')
            continue

        summary_titles.append(title_element.text)
        summary_texts.append(text_element.text)
        book_titles.append(book['title'])
        print(f'book {b_num} summary {s_num} added')
        s_num += 1
                    

book 1 summary 1 added
book 2 summary 1 added
book 2 summary 2 added
book 2 summary 3 added
book 3 summary 1 added
book 3 summary 2 added
book 3 summary 3 added
book 3 summary 4 added
book 3 summary 5 added
book 3 summary 6 added
book 3 summary 7 added
book 3 summary 8 added
book 3 summary 9 added
book 3 summary 10 added
book 3 summary 11 added
book 3 summary 12 added
book 3 summary 13 added
book 3 summary 14 added
book 4 summary 1 added
book 4 summary 2 added
book 4 summary 3 added
book 4 summary 4 added
book 4 summary 5 added
book 4 summary 6 added
book 4 summary 7 added
book 4 summary 8 added
book 4 summary 9 added
book 4 summary 10 added
book 4 summary 11 added
book 4 summary 12 added
book 4 summary 13 added
book 4 summary 14 added
book 4 summary 15 added
book 5 summary 1 added
book 5 summary 2 added
book 5 summary 3 added
book 5 summary 4 added
book 5 summary 5 added
book 5 summary 6 added
book 5 summary 7 added
book 5 summary 8 added
book 5 summary 9 added
book 5 summary 10 added

In [103]:
#Total number of book chapter summaries scraped (length of the list of summary texts)

len(summary_texts)

862

## Create dataframe of summaries scraped

In [105]:
#Start the dataframe with a placeholder column of ones (column 0), one row per scraped summary
#The row count comes from the scraped list, so a scrape of any size fits without editing this cell
df_summaries = pd.DataFrame(np.ones(len(summary_titles)))

In [106]:
#Add the scraped summary titles as the 'title' column
df_summaries['title'] = summary_titles

In [107]:
#Display the dataframe so far (placeholder column 0 plus 'title')
df_summaries

Unnamed: 0,0,title
0,1.0,Ivanhoe: Summary
1,1.0,Crito: Novel Summary: Chapter 1
2,1.0,NovelGuide: Crito: Novel Summary: Chapter 2
3,1.0,NovelGuide: Crito: Novel Summary: Chapter 3
4,1.0,Henry VIII: Summary: The Prologue
...,...,...
857,1.0,The Iliad: Novel Summary: Chapters 23-24
858,1.0,The Hound of the Baskervilles: Novel Summary: ...
859,1.0,The Hound of the Baskervilles: Novel Summary: ...
860,1.0,The Hound of the Baskervilles: Novel Summary: ...


In [109]:
#Add the scraped summary texts as the 'text' column
df_summaries['text'] = summary_texts

In [110]:
#Add the title of the book each summary belongs to as the 'book_title' column
df_summaries['book_title'] = book_titles

In [111]:
#Preview the dataframe without the placeholder column 0
# (without inplace=True, drop returns a new dataframe; df_summaries itself is not modified here)
df_summaries.drop(columns=0)

Unnamed: 0,title,text,book_title
0,Ivanhoe: Summary,Chapter I\n\tThe story is set in England in th...,Ivanhoe
1,Crito: Novel Summary: Chapter 1,"In 399 B.C., Athens sought someone to blame fo...",Crito
2,NovelGuide: Crito: Novel Summary: Chapter 2,"Crito enters the cell, depressed himself at th...",Crito
3,NovelGuide: Crito: Novel Summary: Chapter 3,"In this part of the dialogue, Socrates gets to...",Crito
4,Henry VIII: Summary: The Prologue,\n\tThe Prologue enters and explains that he i...,Henry VIII
...,...,...,...
857,The Iliad: Novel Summary: Chapters 23-24,Chapter 23: The Greeks finish their mourning f...,The Iliad
858,The Hound of the Baskervilles: Novel Summary: ...,\nNote: All page numbers in this summary and ...,The Hound of the Baskervilles
859,The Hound of the Baskervilles: Novel Summary: ...,\nSummary: Mortimer reads to Holmes and Watso...,The Hound of the Baskervilles
860,The Hound of the Baskervilles: Novel Summary: ...,\nSummary: Holmes urges Watson to report any ...,The Hound of the Baskervilles


In [112]:
#Remove the placeholder column 0 for good
#errors='ignore' keeps the cell safe to re-run once the column has already been removed
df_summaries = df_summaries.drop(columns=0, errors='ignore')

In [113]:
#Check the first rows after removing the placeholder column
df_summaries.head()

Unnamed: 0,title,text,book_title
0,Ivanhoe: Summary,Chapter I\n\tThe story is set in England in th...,Ivanhoe
1,Crito: Novel Summary: Chapter 1,"In 399 B.C., Athens sought someone to blame fo...",Crito
2,NovelGuide: Crito: Novel Summary: Chapter 2,"Crito enters the cell, depressed himself at th...",Crito
3,NovelGuide: Crito: Novel Summary: Chapter 3,"In this part of the dialogue, Socrates gets to...",Crito
4,Henry VIII: Summary: The Prologue,\n\tThe Prologue enters and explains that he i...,Henry VIII


In [114]:
#Dimensions of the final dataframe as (rows, columns)
df_summaries.shape

(862, 3)

## Export summaries dataframe to csv

In [115]:
#Write the summaries to csv; with the default settings the dataframe index is written too,
# as the first (unnamed) column
df_summaries.to_csv('./data/working_data/book_summaries/summaries.csv') 