# Imports

In [1]:
import pandas as pd
import json

# Read in data

In [2]:
#Read in summaries data

df_summaries = pd.read_csv('./data/working_data/book_summaries/summaries.csv')

In [3]:
df_summaries.head(2)

Unnamed: 0.1,Unnamed: 0,title,text,book_title
0,0,Ivanhoe: Summary,Chapter I\n\tThe story is set in England in th...,Ivanhoe
1,1,Crito: Novel Summary: Chapter 1,"In 399 B.C., Athens sought someone to blame fo...",Crito


In [4]:
df_summaries.drop(columns='Unnamed: 0', inplace=True)

In [5]:
df_summaries.head(2)

Unnamed: 0,title,text,book_title
0,Ivanhoe: Summary,Chapter I\n\tThe story is set in England in th...,Ivanhoe
1,Crito: Novel Summary: Chapter 1,"In 399 B.C., Athens sought someone to blame fo...",Crito


In [6]:
#Read in full texts data

df_texts = pd.read_csv('./data/working_data/book_texts/full_texts.csv')

In [7]:
df_texts.head(2)

Unnamed: 0.1,Unnamed: 0,title,text
0,0,The Scarlet Letter by Hawthorne,The Scarlet LetterbyHawthorne\n\n\n\n\n\r\nTHE...
1,1,The Hound of the Baskervilles by A. Conan Doyle,The Hound of the BaskervillesbyA. Conan Doyle\...


In [8]:
df_texts.drop(columns='Unnamed: 0', inplace=True)

In [9]:
df_texts.head(2)

Unnamed: 0,title,text
0,The Scarlet Letter by Hawthorne,The Scarlet LetterbyHawthorne\n\n\n\n\n\r\nTHE...
1,The Hound of the Baskervilles by A. Conan Doyle,The Hound of the BaskervillesbyA. Conan Doyle\...


# Match up book titles in two dataframes

In [10]:
#Number of columns in df_texts

len(df_texts)

45

In [11]:
#Number of columns in df_summaries

len(df_summaries['book_title'].unique())

51

In [12]:
#Open dictionary of full text book titles to summary book titles

with open('./data/working_data/fulltext_to_summary_dict.json') as json_file:
    fulltext_to_summary = json.load(json_file)

In [13]:
fulltext_to_summary

{'The Scarlet Letter by Hawthorne': 'The Scarlet Letter',
 'The Hound of the Baskervilles by A. Conan Doyle': 'The Hound of the Baskervilles',
 'The Life of Henry VIII by William Shakespeare [Dunlap edition]': 'Henry VIII',
 'Of Human Bondage by W. Somerset Maugham': 'Of Human Bondage',
 'The Complete Works of William Shakespeare Cymbeline': 'Cymbeline',
 'Ivanhoe by Walter Scott': 'Ivanhoe',
 'My Antonia by Willa Cather': 'My Antonia',
 'The Call of the Wild, by Jack London': 'The Call of the Wild',
 'The Adventures of Tom Sawyer, Complete by Mark Twain (Samuel Clemens)': 'The Adventures of Tom Sawyer',
 'Candide by Voltaire': 'Candide',
 'The Age of Innocence by Edith Wharton': 'The Age of Innocence',
 'The Canterbury Tales and Other Poems by Geoffrey Chaucer': 'The Canterbury Tales',
 'Daisy Miller, by Henry James': 'Daisy Miller',
 'Pride and Prejudice by Jane Austen': 'Pride and Prejudice',
 'Winesburg, Ohio by Sherwood Anderson': 'Winesburg, Ohio',
 'The Adventures of Huckleberry

In [14]:
#For every book in full text, add corresponding title from df_summaries to 'book_titles' list

book_titles = []
for title in df_texts['title']:
    book_titles.append(fulltext_to_summary[title])

In [15]:
#Create new column of shorter form titles from df_summaries in df_texts using 'book_titles' list created in last step

df_texts['book_title'] = book_titles

In [16]:
#Check that new column has been added correctly

df_texts.head(20)

Unnamed: 0,title,text,book_title
0,The Scarlet Letter by Hawthorne,The Scarlet LetterbyHawthorne\n\n\n\n\n\r\nTHE...,The Scarlet Letter
1,The Hound of the Baskervilles by A. Conan Doyle,The Hound of the BaskervillesbyA. Conan Doyle\...,The Hound of the Baskervilles
2,The Life of Henry VIII by William Shakespeare ...,The Life of Henry VIIIbyWilliam Shakespeare [D...,Henry VIII
3,Of Human Bondage by W. Somerset Maugham,Of Human BondagebyW. Somerset Maugham\n\n\n\n\...,Of Human Bondage
4,The Complete Works of William Shakespeare Cymb...,The Complete Works of William Shakespeare Cymb...,Cymbeline
5,Ivanhoe by Walter Scott,"IvanhoebyWalter Scott\n\n\n\n\r\nPrepared by ""...",Ivanhoe
6,My Antonia by Willa Cather,My AntoniabyWilla Cather\r\n\r\nsoftware or an...,My Antonia
7,"The Call of the Wild, by Jack London","The Call of the Wild, by Jack London\n\n\n\n\n...",The Call of the Wild
8,"The Adventures of Tom Sawyer, Complete by Mark...","The Adventures of Tom Sawyer, CompletebyMark T...",The Adventures of Tom Sawyer
9,Candide by Voltaire,"CandidebyVoltaire\r\n\r\neditions, all of whic...",Candide


In [17]:
#Testing drop of original longer form 'title' column in df_texts

df_texts.drop(columns='title')

Unnamed: 0,text,book_title
0,The Scarlet LetterbyHawthorne\n\n\n\n\n\r\nTHE...,The Scarlet Letter
1,The Hound of the BaskervillesbyA. Conan Doyle\...,The Hound of the Baskervilles
2,The Life of Henry VIIIbyWilliam Shakespeare [D...,Henry VIII
3,Of Human BondagebyW. Somerset Maugham\n\n\n\n\...,Of Human Bondage
4,The Complete Works of William Shakespeare Cymb...,Cymbeline
5,"IvanhoebyWalter Scott\n\n\n\n\r\nPrepared by ""...",Ivanhoe
6,My AntoniabyWilla Cather\r\n\r\nsoftware or an...,My Antonia
7,"The Call of the Wild, by Jack London\n\n\n\n\n...",The Call of the Wild
8,"The Adventures of Tom Sawyer, CompletebyMark T...",The Adventures of Tom Sawyer
9,"CandidebyVoltaire\r\n\r\neditions, all of whic...",Candide


In [18]:
#Permanently drop original longer form 'title' column in df_texts

df_texts.drop(columns='title', inplace=True)

# Match up / Rename columns in two dataframes

In [19]:
#Testing rename of 'title' column in df_summaries to more specific 'chapter_title'

df_summaries.rename(columns= {'title':'chapter_title'})

Unnamed: 0,chapter_title,text,book_title
0,Ivanhoe: Summary,Chapter I\n\tThe story is set in England in th...,Ivanhoe
1,Crito: Novel Summary: Chapter 1,"In 399 B.C., Athens sought someone to blame fo...",Crito
2,NovelGuide: Crito: Novel Summary: Chapter 2,"Crito enters the cell, depressed himself at th...",Crito
3,NovelGuide: Crito: Novel Summary: Chapter 3,"In this part of the dialogue, Socrates gets to...",Crito
4,Henry VIII: Summary: The Prologue,\n\tThe Prologue enters and explains that he i...,Henry VIII
...,...,...,...
857,The Iliad: Novel Summary: Chapters 23-24,Chapter 23: The Greeks finish their mourning f...,The Iliad
858,The Hound of the Baskervilles: Novel Summary: ...,\nNote: All page numbers in this summary and ...,The Hound of the Baskervilles
859,The Hound of the Baskervilles: Novel Summary: ...,\nSummary: Mortimer reads to Holmes and Watso...,The Hound of the Baskervilles
860,The Hound of the Baskervilles: Novel Summary: ...,\nSummary: Holmes urges Watson to report any ...,The Hound of the Baskervilles


In [20]:
#Permanently rename 'title' column in df_summaries to more specific 'chapter_title'

df_summaries.rename(columns= {'title':'chapter_title'}, inplace=True)

In [21]:
#Check rename and names of other columns in df_summaries

df_summaries.head(2)

Unnamed: 0,chapter_title,text,book_title
0,Ivanhoe: Summary,Chapter I\n\tThe story is set in England in th...,Ivanhoe
1,Crito: Novel Summary: Chapter 1,"In 399 B.C., Athens sought someone to blame fo...",Crito


In [22]:
#Check names of columns in df_texts

df_texts.head(2)

Unnamed: 0,text,book_title
0,The Scarlet LetterbyHawthorne\n\n\n\n\n\r\nTHE...,The Scarlet Letter
1,The Hound of the BaskervillesbyA. Conan Doyle\...,The Hound of the Baskervilles


In [23]:
#For clarity, rename 'text in df_summaries to 'chapter_summary' and 'text' in df_texts to 'full_text'

df_summaries.rename(columns={'text':'chapter_summary'}, inplace=True)
df_texts.rename(columns={'text':'full_text'}, inplace=True)

In [24]:
#Check renamings in df_summaries

df_summaries.head(2)

Unnamed: 0,chapter_title,chapter_summary,book_title
0,Ivanhoe: Summary,Chapter I\n\tThe story is set in England in th...,Ivanhoe
1,Crito: Novel Summary: Chapter 1,"In 399 B.C., Athens sought someone to blame fo...",Crito


In [25]:
#Check renamings in df_texts

df_texts.head(2)

Unnamed: 0,full_text,book_title
0,The Scarlet LetterbyHawthorne\n\n\n\n\n\r\nTHE...,The Scarlet Letter
1,The Hound of the BaskervillesbyA. Conan Doyle\...,The Hound of the Baskervilles


# Filter dataframes to contain same book titles

In [26]:
#Number of unique book titles in df_texts

len(df_texts)

45

In [27]:
#Number of unique book titles in df_summaries

len(df_summaries['book_title'].unique())

51

In [28]:
#Create set of unique book titles in df_texts

df_texts_set = set(df_texts['book_title'])
len(df_texts_set)

45

In [29]:
#Create set of unique book titles in df_summaries

df_summaries_set = set(df_summaries['book_title'].unique())
len(df_summaries_set)

51

In [30]:
#Use set.intersection to find titles in both df_texts and df_summaries sets - 30 titles total

text_summary_titles = df_texts_set.intersection(df_summaries_set)
len(text_summary_titles)

30

In [31]:
#In order to create filtered subset of df_summaries only including titles in both df_summaries and df_texts, start with new one-row dataframe containing the first title
# in text_summary_titles (list created above)

df_summaries_filtered = df_summaries[df_summaries['book_title'] == list(text_summary_titles)[0]]

In [32]:
#For the rest of the titles in text_summary_titles, create one-row data frames with the title's summary data and add to the bottom of the current dataframe

for i in range(1, len(text_summary_titles)):
    df_summaries_filtered = pd.concat([df_summaries_filtered, df_summaries[df_summaries['book_title'] == list(text_summary_titles)[i]]])

In [33]:
#Check number of titles in new dataframe - should be 30

len(df_summaries_filtered['book_title'].unique())

30

In [34]:
#Check size of new summaries dataframe

len(df_summaries_filtered)

501

In [35]:
#In order to create filtered subset of df_texts only including titles in both df_summaries and df_texts, start with new one-row dataframe containing the first title
# in text_summary_titles (list created above)

df_texts_filtered = df_texts[df_texts['book_title'] == list(text_summary_titles)[0]]

In [36]:
#For the rest of the titles in text_summary_titles, create one-row data frames with the title's text data and add to the bottom of the current dataframe

for i in range(1, len(text_summary_titles)):
    df_texts_filtered = pd.concat([df_texts_filtered, df_texts[df_texts['book_title'] == list(text_summary_titles)[i]]])

In [37]:
#Check number of titles in new dataframe - should be 30

len(df_texts_filtered['book_title'])

30

# Check for & address duplicates in summaries data

In [38]:
#Check for dataframe duplicates - 146 found

df_summaries_filtered.duplicated().sum()

146

In [40]:
#Check shape of overall dataframe - a signficant number of rows are duplicates

df_summaries_filtered.shape

(501, 3)

In [41]:
#Check duplicates specifically in 'chapter_title' column - matches expected value

df_summaries_filtered['chapter_title'].duplicated().sum()

146

Dropping duplicates will result in a somewhat significant data loss. However, this seems to be the best option at this point. Including duplicates would cause these texts to have a disproportionate effect when used in models. As noted previously, gathering and continuing to develop and refine models with more data will be a planned future step for this project.

In [44]:
#Testing drop of all duplicates in dataframe

df_summaries_filtered.drop_duplicates()

Unnamed: 0,chapter_title,chapter_summary,book_title
96,Candide: Summary: Chapters 1-2,Chapter 1\n Voltaire begins his picaresque tal...,Candide
97,Candide: Novel Summary: Chapters 1-2,Chapter 1: Voltaire begins his picaresque tale...,Candide
98,Candide: Novel Summary: Chapters 3-4,Chapter 3\nChapter 3 takes place in the midst ...,Candide
99,Candide: Novel Summary: Chapters 5-6,Chapter 5\nTraveling to Lisbon on a business t...,Candide
100,Candide: Novel Summary: Chapters 7-8,Chapter 7\nThe strange old woman takes Candide...,Candide
...,...,...,...
681,"Winesburg, Ohio: Novel Summary: The Book of th...",Summary\n\tAn old writer has a bed that his ca...,"Winesburg, Ohio"
682,"Winesburg, Ohio: Novel Summary: The Philosophe...",The Philosopher\n\tDoctor Parcival is one of t...,"Winesburg, Ohio"
683,"Winesburg, Ohio: Novel Summary: The Strength o...",The Strength of God\n\tReverend Curtis Hartman...,"Winesburg, Ohio"
684,"Winesburg, Ohio: Novel Summary: The Untold Lie","Summary\n\tRay Pearson, an older farmhand, is ...","Winesburg, Ohio"


In [45]:
#Permanently drop duplicates from dataframe

df_summaries_dup_dropped = df_summaries_filtered.drop_duplicates()

In [46]:
#Check new shape of dataframe

df_summaries_dup_dropped.shape

(355, 3)

# Simplify summary chapter titles 

Upon examination of the full list of chapter titles opened in a spearate text editor, it can be seen that almost all of the chapter titles have a format of 
<br>
-'Book Title': Novel Summary: Chapter 'chapter number(s)'- 
<br>
<br>
'Book Title: Novel Summary: ' will be dropped from each chapter title to simplify chapter designations

In [47]:
#Check index of current summaries df -  to be used in loop to build simplified chapter titles column

df_summaries_dup_dropped.index

Int64Index([ 96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
            ...
            676, 677, 678, 679, 680, 681, 682, 683, 684, 253],
           dtype='int64', length=355)

In [48]:
#Testing reset of index 

df_summaries_dup_dropped.reset_index()

Unnamed: 0,index,chapter_title,chapter_summary,book_title
0,96,Candide: Summary: Chapters 1-2,Chapter 1\n Voltaire begins his picaresque tal...,Candide
1,97,Candide: Novel Summary: Chapters 1-2,Chapter 1: Voltaire begins his picaresque tale...,Candide
2,98,Candide: Novel Summary: Chapters 3-4,Chapter 3\nChapter 3 takes place in the midst ...,Candide
3,99,Candide: Novel Summary: Chapters 5-6,Chapter 5\nTraveling to Lisbon on a business t...,Candide
4,100,Candide: Novel Summary: Chapters 7-8,Chapter 7\nThe strange old woman takes Candide...,Candide
...,...,...,...,...
350,681,"Winesburg, Ohio: Novel Summary: The Book of th...",Summary\n\tAn old writer has a bed that his ca...,"Winesburg, Ohio"
351,682,"Winesburg, Ohio: Novel Summary: The Philosophe...",The Philosopher\n\tDoctor Parcival is one of t...,"Winesburg, Ohio"
352,683,"Winesburg, Ohio: Novel Summary: The Strength o...",The Strength of God\n\tReverend Curtis Hartman...,"Winesburg, Ohio"
353,684,"Winesburg, Ohio: Novel Summary: The Untold Lie","Summary\n\tRay Pearson, an older farmhand, is ...","Winesburg, Ohio"


In [49]:
#Permanently reset index

df_summaries_dup_dropped.reset_index(inplace=True)

In [50]:
#Examine df with new index

df_summaries_dup_dropped.head(15)

Unnamed: 0,index,chapter_title,chapter_summary,book_title
0,96,Candide: Summary: Chapters 1-2,Chapter 1\n Voltaire begins his picaresque tal...,Candide
1,97,Candide: Novel Summary: Chapters 1-2,Chapter 1: Voltaire begins his picaresque tale...,Candide
2,98,Candide: Novel Summary: Chapters 3-4,Chapter 3\nChapter 3 takes place in the midst ...,Candide
3,99,Candide: Novel Summary: Chapters 5-6,Chapter 5\nTraveling to Lisbon on a business t...,Candide
4,100,Candide: Novel Summary: Chapters 7-8,Chapter 7\nThe strange old woman takes Candide...,Candide
5,101,Candide: Novel Summary: Chapters 9-10,Chapter 9\nSeeing what he believes to be anoth...,Candide
6,102,Candide: Novel Summary: Chapters 11-12,Chapter 11\nIn this chapter the old woman begi...,Candide
7,103,Candide: Novel Summary: Chapters 13-14,Chapter 13\n As the story returns to the prese...,Candide
8,104,Candide: Novel Summary: Chapters 15-16,"Chapter 15\nThe commander, Cunégonde's brother...",Candide
9,105,Candide: Novel Summary: Chapters 17-18,Chapter 17\n In this chapter the reader is int...,Candide


In [51]:
#Drop original index column

df_summaries_dup_dropped.drop(columns='index', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_summaries_dup_dropped.drop(columns='index', inplace=True)


In [52]:
#Examine changes to df

df_summaries_dup_dropped.head()

Unnamed: 0,chapter_title,chapter_summary,book_title
0,Candide: Summary: Chapters 1-2,Chapter 1\n Voltaire begins his picaresque tal...,Candide
1,Candide: Novel Summary: Chapters 1-2,Chapter 1: Voltaire begins his picaresque tale...,Candide
2,Candide: Novel Summary: Chapters 3-4,Chapter 3\nChapter 3 takes place in the midst ...,Candide
3,Candide: Novel Summary: Chapters 5-6,Chapter 5\nTraveling to Lisbon on a business t...,Candide
4,Candide: Novel Summary: Chapters 7-8,Chapter 7\nThe strange old woman takes Candide...,Candide


In [53]:
#For 'chapter_title' column in each row of summaries df, add shortened chapter title to list

shortened_ch_titles = []
for i in df_summaries_dup_dropped.index:
    bk_title = df_summaries_dup_dropped.iloc[i]['book_title']
    shortened_title = df_summaries_dup_dropped.iloc[i]['chapter_title'].replace(f'{bk_title}: Novel Summary: ', '')
    shortened_ch_titles.append(shortened_title)

In [54]:
#Examine newly created list

shortened_ch_titles[:20]

['Candide: Summary: Chapters 1-2',
 'Chapters 1-2',
 'Chapters 3-4',
 'Chapters 5-6',
 'Chapters 7-8',
 'Chapters 9-10',
 'Chapters 11-12',
 'Chapters 13-14',
 'Chapters 15-16',
 'Chapters 17-18',
 'Chapters 19-20',
 'Chapters 21-22',
 'Chapters 23-24',
 'Chapters 25-26',
 'Chapters 27-28',
 'The Age of Innocence: Novel Summary',
 'Chapters 1-3',
 'Chapters 4-6',
 'Chapters 7-9',
 'Chapters 10-12']

In [55]:
#Used list to add new shortened chapter titles column to summaries df

df_summaries_dup_dropped['chapters'] = shortened_ch_titles

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_summaries_dup_dropped['chapters'] = shortened_ch_titles


In [57]:
#Examine changes to df

df_summaries_dup_dropped.head(15)

Unnamed: 0,chapter_title,chapter_summary,book_title,chapters
0,Candide: Summary: Chapters 1-2,Chapter 1\n Voltaire begins his picaresque tal...,Candide,Candide: Summary: Chapters 1-2
1,Candide: Novel Summary: Chapters 1-2,Chapter 1: Voltaire begins his picaresque tale...,Candide,Chapters 1-2
2,Candide: Novel Summary: Chapters 3-4,Chapter 3\nChapter 3 takes place in the midst ...,Candide,Chapters 3-4
3,Candide: Novel Summary: Chapters 5-6,Chapter 5\nTraveling to Lisbon on a business t...,Candide,Chapters 5-6
4,Candide: Novel Summary: Chapters 7-8,Chapter 7\nThe strange old woman takes Candide...,Candide,Chapters 7-8
5,Candide: Novel Summary: Chapters 9-10,Chapter 9\nSeeing what he believes to be anoth...,Candide,Chapters 9-10
6,Candide: Novel Summary: Chapters 11-12,Chapter 11\nIn this chapter the old woman begi...,Candide,Chapters 11-12
7,Candide: Novel Summary: Chapters 13-14,Chapter 13\n As the story returns to the prese...,Candide,Chapters 13-14
8,Candide: Novel Summary: Chapters 15-16,"Chapter 15\nThe commander, Cunégonde's brother...",Candide,Chapters 15-16
9,Candide: Novel Summary: Chapters 17-18,Chapter 17\n In this chapter the reader is int...,Candide,Chapters 17-18


# Export filtered & homogenized dataframes to csv files

In [58]:
df_texts_filtered.to_csv('./data/working_data/book_texts/full_texts_filtered.csv', index=False)
df_summaries_filtered.to_csv('./data/working_data/book_summaries/summaries_filtered.csv', index=False)

# Create new dataframe including book titles, chapter titles, chapter summaries, & book full texts (to be used in process of dividing full texts into chapters) 

In [59]:
#For each row in the summaries df, add the corresponding book full text to a list

full_texts = []

for i in df_summaries_dup_dropped.index:
    book = df_summaries_dup_dropped.iloc[i]['book_title']
    full_text = df_texts_filtered[df_texts_filtered['book_title']==book]['full_text']
    full_texts.append(full_text)

In [60]:
#Create a copy of the most recent edit of the summaries df

summaries_and_fulltexts = df_summaries_dup_dropped.copy(deep=True)

In [61]:
#Add list of full texts created to 'summaries_and_fulltexts' df as new 'full_text' column

summaries_and_fulltexts['full_text'] = full_texts

In [62]:
#Examine new dataframe

summaries_and_fulltexts.head()

Unnamed: 0,chapter_title,chapter_summary,book_title,chapters,full_text
0,Candide: Summary: Chapters 1-2,Chapter 1\n Voltaire begins his picaresque tal...,Candide,Candide: Summary: Chapters 1-2,"9 CandidebyVoltaire\r\n\r\neditions, all of..."
1,Candide: Novel Summary: Chapters 1-2,Chapter 1: Voltaire begins his picaresque tale...,Candide,Chapters 1-2,"9 CandidebyVoltaire\r\n\r\neditions, all of..."
2,Candide: Novel Summary: Chapters 3-4,Chapter 3\nChapter 3 takes place in the midst ...,Candide,Chapters 3-4,"9 CandidebyVoltaire\r\n\r\neditions, all of..."
3,Candide: Novel Summary: Chapters 5-6,Chapter 5\nTraveling to Lisbon on a business t...,Candide,Chapters 5-6,"9 CandidebyVoltaire\r\n\r\neditions, all of..."
4,Candide: Novel Summary: Chapters 7-8,Chapter 7\nThe strange old woman takes Candide...,Candide,Chapters 7-8,"9 CandidebyVoltaire\r\n\r\neditions, all of..."


# Export summaries & full texts dataframe to csv

In [63]:
summaries_and_fulltexts.to_csv('./data/working_data/summaries_and_full_texts.csv', index=False)