# Imports

In [31]:
import pandas as pd
import json

# List of book titles with accessible summaries

In [5]:
# Load the scraped mapping of book titles to their chapter-summary links

summary_links_path = './data/working_data/book_summaries/book_summary_links.json'
df_summaries = pd.read_json(summary_links_path)

In [6]:
# Preview the first two rows of the loaded summary-links table
df_summaries.head(2)

Unnamed: 0,title,summaries
0,By Night in Chile,[https://novelguide.com/by-night-in-chile/nove...
1,Julius Caesar,[https://novelguide.com/julius-caesar/summarie...


In [7]:
# Create list of book titles for which chapter summary links have been scraped.
# The 'title' column is read directly rather than row by row: the previous loop
# passed index *labels* to the positional .iloc accessor, which is only correct
# when the index happens to be exactly 0..n-1, and it built a full row Series
# on every iteration.

summary_titles = df_summaries['title'].tolist()

In [8]:
# Spot-check the first ten summary titles
summary_titles[:10]

['By Night in Chile',
 'Julius Caesar',
 'Antigone',
 "Aristotle's Politics",
 'Odysseus',
 'David Copperfield',
 'Crito',
 'Madame Bovary',
 'Henry IV Part 2',
 'Kidnapped']

# List of titles with full texts

In [9]:
# Load the scraped (unfiltered) mapping of book titles to their full-text links

text_links_path = './data/working_data/book_texts/book_text_links_unfiltered.json'
df_texts = pd.read_json(text_links_path)

In [10]:
# Preview the first two rows of the loaded full-text links table
df_texts.head(2)

Unnamed: 0,title,link
0,"An ""Attic"" Philosopher, entire by Emile Souvestre",http://www.fullbooks.com/An-Attic-Philosopher-...
1,"An ""Attic"" Philosopher, v1 by Emile Souvestre",http://www.fullbooks.com/An-Attic-Philosopher-...


In [11]:
# Create list of book titles for which full text links have been scraped.
# The 'title' column is read directly rather than row by row: the previous loop
# passed index *labels* to the positional .iloc accessor, which is only correct
# when the index happens to be exactly 0..n-1, and it built a full row Series
# on every iteration.

text_titles = df_texts['title'].tolist()

In [12]:
# Spot-check the first ten full-text titles
text_titles[:10]

['An "Attic" Philosopher, entire by Emile Souvestre',
 'An "Attic" Philosopher, v1 by Emile Souvestre',
 'An "Attic" Philosopher, v2 by Emile Souvestre',
 'An "Attic" Philosopher, v3 by Emile Souvestre',
 '"Co. Aytch" by Sam R. Watkins',
 'The "Goldfish" by Arthur Train',
 '"In Darkest England and The Way Out" by General William Booth',
 '"Speaking of Operations--" by Irvin S. Cobb',
 "'Lena Rivers by Mary J. Holmes",
 "'Tis Sixty Years Since by Charles Francis Adams"]

# Cross check lists

In [23]:
# Find the titles that appear in both the summary list and the full-text list.
# The two sources do not spell titles identically (full-text titles are usually
# longer and include the author), so a match is approximated as "the summary
# title occurs as a substring of the full-text title".

# Results are collected as {'summary title': 'corresponding full text title'}.
# When several full-text titles contain the same summary title, the LAST one in
# text_titles is the one that is kept.

usable_books = {}

for summary_title in summary_titles:
    candidates = [text_title for text_title in text_titles if summary_title in text_title]
    if candidates:
        usable_books[summary_title] = candidates[-1]

In [25]:
# Display the {'summary title': 'full text title'} dictionary built by the
# substring matching above, so that bad matches can be spotted by eye

usable_books

{'Julius Caesar': 'The Tragedie of Julius Caesar by William Shakespeare',
 'David Copperfield': 'David Copperfield by Charles Dickens',
 'Crito': 'Crito by Plato',
 'Madame Bovary': 'Madame Bovary by Gustave Flaubert',
 'Kidnapped': 'A Kidnapped Santa Claus by L. Frank Baum',
 'Night': 'Twelfth Night; or, What You Will by William Shakespeare [Hudson edition]',
 'Of Human Bondage': 'Of Human Bondage by W. Somerset Maugham',
 'Oliver Twist': 'Oliver Twist by Charles Dickens',
 'Cyrano de Bergerac': 'Cyrano de Bergerac by Edmond Rostand',
 'A Tale of Two Cities': 'A Tale of Two Cities, by Charles Dickens [A story of the French Revolution]',
 'Cymbeline': 'The Tragedie of Cymbeline by William Shakespeare',
 "Uncle Tom's Cabin": "Uncle Tom's Cabin, Young Folks' Edition by Harriet Beecher Stowe",
 'Little Women': 'Little Women by Louisa May Alcott',
 'Walden': 'Walden, by Henry David Thoreau',
 'Macbeth': 'The Tragedie of Macbeth by William Shakespeare',
 "Gulliver's Travels": "Gulliver's Tr

In [27]:
# Summary titles whose substring match is obviously wrong (hand-picked after
# inspecting the dictionary); these keys are removed from the dictionary next

to_drop = [
    'Kidnapped',
    'Night',
    'Emma',
    'The Chosen',
    'Dracula',
    'Beloved',
    'Pygmalion',
    'Common Sense',
    'Washington Square',
    'Kindred',
    'Ulysses',
    'Jazz',
    'The Prince',
    'Robinson Crusoe',
    'Consolation of Philosophy',
]

In [28]:
# Drop the manually selected titles from the usable books dictionary.
# pop(key, None) is used instead of `del` so the cell is idempotent: running it
# a second time (or listing a title that was never matched) no longer raises
# KeyError.

for key in to_drop:
    usable_books.pop(key, None)

In [30]:
# Final number of titles left in the usable books dictionary after the drops

len(usable_books)

75

# Save usable books list to json file

In [33]:
# Persist the {'summary title': 'full text title'} mapping as JSON
usable_books_path = './data/working_data/usable_books.json'

with open(usable_books_path, 'w') as out_file:
    json.dump(usable_books, out_file)