# Appendix 1: Data Cleaning

In [25]:
import re
import bs4 #this may be appendices only
import requests #this may be appendices only
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from datetime import datetime
import datetime
import seaborn as sns
from   sklearn.linear_model import LinearRegression, LogisticRegression
from   sklearn.model_selection import train_test_split
import statsmodels.formula.api as smf
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score

In [26]:
# Load the LibraryReads archive from its Excel export.
libraryreads = pd.read_excel("LibraryReadsMasterList.xlsx")

# Normalise the headers: lower-case, spaces replaced by underscores, then any
# remaining leading/trailing whitespace trimmed.
orig_lr_col = libraryreads.columns
new_lr_col = [header.lower().replace(' ', '_').strip() for header in orig_lr_col]
libraryreads.columns = new_lr_col

# Swap the verbose LibraryReads headers for the short names used in the rest
# of the notebook.
lr_column_aliases = {
    'list_date': 'lr_my',
    'author_(last,_first)': 'author',
    'top_pick': 'lr_top',
    'hall_of_fame': 'lr_hof',
    'year-end_favorite_list': 'lr_yfav',
    'nominator/library': 'nom_lib_lr',
}
libraryreads = libraryreads.rename(columns=lr_column_aliases)


#review rows without title (likely header rows for each month's list)
has_title = libraryreads['title'].notna()  # compute the filter once and reuse it below
libraryreads[~has_title].to_csv('LibraryReads_NoTitle.csv')
#confirmed that all rows without a title are header only
##Note: December 2016/January 2017 were combined into a single list, but it doesn't appear additional books were nominated/included (verified)

# Summary of the rows that are kept. `describe` must be *called*; note that the
# result is only rendered when this is the last expression of a cell.
libraryreads[has_title].describe()

# Remove month header rows and reset the index. reset_index() keeps the old
# labels in a column named 'index'.
libraryreads = libraryreads[has_title].reset_index()


#Find bad ISBN numbers - there are some with alphabet characters and one with internal spaces
# Cast to pandas' string dtype so the .str accessor works, then strip whitespace.
# Assigning the .tolist() result back stores the stripped values as plain Python objects.
libraryreads['isbn'] = libraryreads['isbn'].astype('string')
libraryreads['isbn']=libraryreads.isbn.str.strip().tolist()
# Inspection only: the next two expressions have no side effects and are not
# rendered unless they are the last expression of the cell.
libraryreads.isbn.str.isdigit().value_counts()
libraryreads[libraryreads.isbn.str.isdigit()==False]

#update ISBN numbers for those 4 rows to be all digits, to transform col into int
#index 127, Party of 2 - use hardcover ISBN, not trade paperback
libraryreads.loc[127,'isbn']='9780593100813'
libraryreads.loc[127,'isbn']
#index 246 and 311: strip out '-'
# NOTE(review): this comment previously named index 336, but the code edits row 311 - confirm which row is intended.
libraryreads.loc[246,'isbn']=libraryreads.loc[246,'isbn'].replace('-','')
libraryreads.loc[311,'isbn']=libraryreads.loc[311,'isbn'].replace('-','')
#index 529 - should be 0, not o
libraryreads.loc[529,'isbn']='9780385534246'

# Inspection only: list any ISBN that is not 13 characters long.
libraryreads[libraryreads.isbn.str.len() != 13]

#ISBN from booklist, corroborated by NoveList: 9780062460226
libraryreads.loc[527,'isbn']='9780062460226'

#fix type of pub_date column: some cells were read as text instead of datetimes

#index 21 - pubdate for The Kindest Lie: 2/2/2021
libraryreads.loc[21,'pub_date'] = datetime.datetime.strptime('02/02/2021','%m/%d/%Y')

#index 353 looks fine from here: 11/28/2018 (stored as text, so parse it)
libraryreads.loc[353,'pub_date'] = datetime.datetime.strptime(libraryreads.pub_date[353],'%m/%d/%Y')

#Handful of May dates were entered incorrectly - 3 digits for the day, 1 for the month.
# Collect every row whose pub_date is still not a datetime after the two manual
# fixes above (a single scan; the earlier duplicate scan was discarded unused).
pub_date_fix_list = [
    i for i in libraryreads.index
    if not isinstance(libraryreads.pub_date[i], datetime.datetime)
]

for i in pub_date_fix_list:
    raw_date = libraryreads.pub_date[i]
    # Prepend '0' to the one-digit month and drop the character at position 2
    # (presumably the surplus leading digit of the day - confirm against the data).
    repaired = '0' + raw_date[0:2] + raw_date[3:]
    libraryreads.loc[i,'pub_date'] = datetime.datetime.strptime(repaired,'%m/%d/%Y')

# All ISBNs are digit-only strings at this point, so they can become numbers.
libraryreads['isbn']=pd.to_numeric(libraryreads['isbn'])
#libraryreads['isbn'].to_csv('LibraryReads_isbn.csv') #output ISBN list to CSV for manual search in the LC catalog

In [27]:
# The 'index' column was created by reset_index() on the LibraryReads archive;
# discard it and rely on the pandas index instead.
libraryreads = libraryreads.drop('index', axis='columns')

See Appendix III for the code that web-scrapes the Library of Congress Catalog and generates the loc_results_v2.csv file.

In [28]:
loc_results = pd.read_csv('loc_results_v2.csv',index_col=0)

#turn string back into a list
# Strip the quote, bracket and full-stop characters from the stored text.
# regex=False makes the literal match explicit ('.' and '[' are regex metacharacters).
to_remove=['\'',']','.','[']
for i in to_remove:
    loc_results['genres'] = loc_results['genres'].str.replace(i,'',regex=False)


def _split_genres(raw):
    '''
    Split a comma-separated genre string into a list of trimmed names.

    Duplicate form/genre classifications are removed while keeping the order
    of first appearance, so the result does not depend on string hashing and
    is identical from run to run. Non-string input (e.g. a missing value)
    yields an empty list.
    '''
    if not isinstance(raw, str):
        return []
    return list(dict.fromkeys(part.strip() for part in raw.split(',')))


# One list of unique genres per Library of Congress record; works for any
# index, not just a 0..n-1 range.
loc_results['genres'] = loc_results['genres'].map(_split_genres)


In [29]:

# Normalise the Library of Congress ISBNs to stripped text so that
# non-numeric values can be located.
loc_results['isbn']=loc_results['isbn'].str.rstrip()
loc_results['isbn'] = loc_results['isbn'].astype('string')
loc_results['isbn']=loc_results.isbn.str.strip().tolist()
#print(loc_results.isbn.str.isdigit().value_counts()) #check if removing leading or trailing spaces rectifies any of the non-integer ISBN values (no anticipated successes)
loc_results[loc_results.isbn.str.isdigit()==False]  # inspection only

#ISBN 10, not ISBN 13 was first ISBN listed in Library of Congress Data. Replace with the ISBN 13 value.
# Assign the replacements as *strings*: an int stored here is skipped by
# .str.isdigit(), which would make the verification below pass trivially.
loc_results.loc[148,'isbn']='9780451495587'
loc_results.loc[458,'isbn']='9780316243391'
loc_results.loc[747,'isbn']='9780062694058'
loc_results.loc[960,'isbn']='9780062132529'

#verify replacement worked
remaining_bad_isbn = loc_results[loc_results.isbn.str.isdigit()==False]
assert remaining_bad_isbn.empty, 'non-numeric ISBNs remain in loc_results'
loc_results['isbn']=pd.to_numeric(loc_results['isbn'])


In [30]:
# Spread each record's genre list (length n) across n positional columns,
# aligned on the Library of Congress frame's own index.
genres_split = pd.DataFrame(loc_results['genres'].to_list(), index=loc_results.index)

# Attach the positional columns and give them readable headers genre_0 .. genre_7.
genre_column_names = {position: 'genre_' + str(position) for position in range(8)}
loc_results = pd.concat([loc_results, genres_split], axis=1).rename(columns=genre_column_names)

# drop_duplicates raises an error when a column holds lists, so drop the list
# column and keep only the per-position genre columns.
loc_dedup = loc_results.drop(columns='genres').drop_duplicates()

In [31]:
full_df = pd.merge(libraryreads,loc_dedup,how='left',on='isbn') #merge datasets
#939 rows, but 936 distinct ISBNs in the libraryReads data

# Count rows per ISBN. Naming the counts 'isbn' explicitly keeps the
# `isbn_mult.isbn` column available on pandas >= 2.0, where value_counts()
# otherwise names its result 'count'.
isbn_mult = full_df['isbn'].value_counts().rename('isbn').to_frame()
multiple_row_isbn = list(isbn_mult[isbn_mult.isbn > 1].index)

#look at all records for ISBNs that appear multiple times in the data
for i in multiple_row_isbn:
    print(full_df.loc[full_df.isbn==i][['permalink','isbn']])
#isbn = 9781616149987 - first record is edition with 14 page preview of forthcoming novel. No genres listed in either record. Retain row 801 (no additional pages)
#isbn = 9780062282569 -  retain record in row 769 - multiple publishers listed with the correct pub year, row 769 corresponds with publisher named in LibraryReads. Row 770 is the large print edition.
#isbn = 9781476705996 - retain line 899. Visual inspection doesn't suggest any reason to prefer permalink over the other.

full_df = full_df.drop([800,770,900])

#confirm no more duplicate ISBNs
isbn_mult = full_df['isbn'].value_counts().rename('isbn').to_frame()
multiple_row_isbn = list(isbn_mult[isbn_mult.isbn > 1].index)

for i in multiple_row_isbn:
    print(full_df[full_df.isbn==i])
# Only report 'none' when the check really found no remaining duplicates.
if not multiple_row_isbn:
    print('none')


                           permalink           isbn
800  https://lccn.loc.gov/2015514100  9781616149987
801  https://lccn.loc.gov/2014016018  9781616149987
                           permalink           isbn
769  https://lccn.loc.gov/2015472648  9780062282569
770  https://lccn.loc.gov/2014049330  9780062282569
                           permalink           isbn
899  https://lccn.loc.gov/2014656682  9781476705996
900  https://lccn.loc.gov/2014656748  9781476705996
none


In [32]:
#clean up publisher information by removing formatting differences
full_df['publisher']=full_df['publisher'].astype(str)
full_df['publisher']=full_df['publisher'].str.lower()
full_df['publisher']=full_df['publisher'].str.rstrip()
full_df['publisher']=full_df['publisher'].str.lstrip()
#Cleaning publisher names by visual inspection and validation
pd.set_option('display.max_rows',140)
full_df['publisher'].value_counts().sort_index()

ace                                 7
algonquin books                    14
algonuin                            1
amistad                             1
amy einhorn/putnam                  1
atlantic monthly press              3
atria                               2
atria books                        17
atria/37 ink                        1
atria/emily bestler                 2
avon                               23
ballantine                          2
ballantine books                   34
ballatine books                     1
bantam                              7
beacon press                        1
bellevue literary press             1
berkley                            70
berkley crime prime                 1
berkley jove                        7
berkley prime crime                 1
berkley/jove                        5
bloomsbury                          2
bloomsbury publishing               2
bloomsbury usa                      2
blue rider press                    1
broadway boo

In [33]:
#Publisher clean up

def publisher_replace_manual(publishername,*row): #create a function for updates after gaining a better understanding of how many updates may be required
    '''
    Set the publisher of the given rows of full_df to `publishername`.

    publishername: the name to write into the 'publisher' column.
    *row: one or more row labels of full_df to update.

    Modifies the module-level full_df in place; returns None.
    '''
    # Pass an explicit list of labels: a tuple as a .loc row key is ambiguous,
    # because pandas also uses tuples for multi-level keys.
    full_df.loc[list(row),'publisher']=publishername
    #print('complete')
def publisher_find_manual(namev1,namev2):
    '''
    Print namev1 followed by the row labels of full_df whose publisher
    equals namev1 or namev2.
    '''
    # Build the filter once instead of re-filtering the frame on every iteration.
    matches = (full_df.publisher==namev1) | (full_df.publisher==namev2)
    # tolist() yields plain Python labels, so the printed list stays readable.
    row_index = full_df.loc[matches].index.tolist()
    print(namev1,row_index)


# Maps a publisher name to the row labels of full_df found for it by publisher_find.
publisher_lookups={}

def publisher_find(namev1,namev2):
    '''
    Record, under the key namev1 in publisher_lookups, the row labels of
    full_df whose publisher equals namev1 or namev2.
    '''
    # Build the filter once instead of re-filtering the frame on every iteration.
    matches = (full_df.publisher==namev1) | (full_df.publisher==namev2)
    # tolist() stores plain Python labels rather than NumPy scalars.
    publisher_lookups[namev1]=full_df.loc[matches].index.tolist()
def publisher_replace(publishername,row_index): #create a function for updates after gaining a better understanding of how many updates may be required
    '''
    Write `publishername` into the 'publisher' column of full_df for every
    row label listed in `row_index`, one label at a time.
    '''
    for label in row_index:
        full_df.loc[label,'publisher']=publishername
    #print('complete')

# 'algonuin' was confirmed to be a typo of 'algonquin books'.
full_df.loc[505, 'publisher'] = 'algonquin books'

# 'atria' is the same as 'atria books'; names of the form 'atria/xxx' indicate
# a publisher within the umbrella of Atria and are left alone.
for atria_row in (417, 513):
    full_df.loc[atria_row, 'publisher'] = 'atria books'

# Show the rows using 'ballatine books' or 'ballantine'
# (confirmed both are synonyms for ballantine books).
ballantine_variants = (full_df.publisher == 'ballatine books') | (full_df.publisher == 'ballantine')
full_df.loc[ballantine_variants, ['permalink', 'author', 'title', 'isbn']]

Unnamed: 0,permalink,author,title,isbn
321,https://lccn.loc.gov/2018039473,"Letts, Elizabeth",Finding Dorothy: A Novel,9780525622109
363,,"Picoult, Jodi",A Spark of Light: A Novel,9781984828095
803,https://lccn.loc.gov/2014023994,"Picoult, Jodi",Leaving Time,9780345544926


In [34]:
# Each entry is [name to keep, variant spelling]. publisher_find stores the
# matching rows under the first name, which is later written back to them.
publisher_find_list = [
    ['berkley prime crime', 'berkley crime prime'],  # berkley prime crime matches LC record
    ['berkley/jove', 'berkley jove'],  # LC records vary within both of these. Use berkley/jove, more consistent with format from atria imprints
    ['doubleday', 'doubleday books'],
    ['harlequin mira', 'mira'],  # Harlequin MIRA is in the LC, confirmed on Harlequin publishers as MIRA.
    ['hqn books', 'hqn'],  # majority are hqn books, no Library of Congress records to use for validation/comparison
    ['little, brown and company', 'little brown'],  # little, brown and company
    ['mcd/farrar, straus & giroux', 'mcd'],
    ['minotaur books', 'minotaur'],
    ['park row books', 'park row'],
    ['pegasus books', 'pegasus'],
    ['quirk books', 'quirk'],
    ['riverhead books', 'riverhead'],
    ['simon & schuster', 'simon and schuster'],
    ['sourcebooks landmark', 'sourcebooks/landmark'],
    ['viking', 'viking adult'],
]

for kept_name, variant_name in publisher_find_list:
    publisher_find(kept_name, variant_name)

In [35]:
# Write each looked-up publisher name back onto the rows recorded for it.
for kept_name, matching_rows in publisher_lookups.items():
    publisher_replace(kept_name, matching_rows)

In [36]:
#more than 2 options/deeper investigation required
# Print the candidate rows for each publisher family so the row labels used in
# the manual replacements below can be checked against the output.
# str.contains is a substring match, so each query also returns any other
# publisher whose name contains the search text.
print(full_df[full_df['publisher'].str.contains('bloomsbury')][['permalink','isbn','publisher']])
print(full_df[full_df['publisher'].str.contains('giroux')][['permalink','isbn','publisher']])#LC lists separately, do not combine
print(full_df[full_df.publisher.str.contains('penguin')][['permalink','isbn','publisher']]) #Penguin books and penguin press are both found in LC, penguin should be penguin books by LC link
publisher_find_manual('putnam','putnam adult') #these should all go to GP Putnam's sons
print(full_df[full_df.publisher.str.contains('tor')][['permalink','isbn','publisher']])

# Manual consolidation: first argument is the publisher name to write, the
# remaining arguments are the row labels of full_df to update.
publisher_replace_manual('ballantine books',803,321,363)
publisher_replace_manual('bloomsbury publishing',33,83,302,325,595,741)
publisher_replace_manual('penguin books',795)
publisher_replace_manual('g.p. putnam\'s sons',539,548,760,776,872)
publisher_replace_manual('tor.com',32,66,145)
publisher_replace_manual('tor books',35,78,104,272,287,337,454,510,641,774,825,904)

                           permalink           isbn              publisher
33   https://lccn.loc.gov/2020023175  9781635575422             bloomsbury
83   https://lccn.loc.gov/2020009930  9781635575637  bloomsbury publishing
302  https://lccn.loc.gov/2020275372  9781635572582             bloomsbury
325                              NaN  9781635570298  bloomsbury publishing
595                              NaN  9781632864499         bloomsbury usa
741  https://lccn.loc.gov/2014020071  9781619634442         bloomsbury usa
                           permalink           isbn  \
345  https://lccn.loc.gov/2017057595  9780374123697   
407  https://lccn.loc.gov/2017038361  9780374228194   
484  https://lccn.loc.gov/2016059400  9780374203108   
678  https://lccn.loc.gov/2015002963  9780374290252   

                       publisher  
345  mcd/farrar, straus & giroux  
407  mcd/farrar, straus & giroux  
484  mcd/farrar, straus & giroux  
678      farrar, straus & giroux  
                        

In [37]:
# Re-list the publisher names alphabetically with their row counts to review the result of the clean-up.
full_df['publisher'].value_counts().sort_index()

ace                                 7
algonquin books                    15
amistad                             1
amy einhorn/putnam                  1
atlantic monthly press              3
atria books                        19
atria/37 ink                        1
atria/emily bestler                 2
avon                               23
ballantine books                   37
bantam                              7
beacon press                        1
bellevue literary press             1
berkley                            70
berkley prime crime                 2
berkley/jove                       12
bloomsbury publishing               6
blue rider press                    1
broadway books                      1
celadon books                       3
crooked lane books                  3
crown                              18
custom house                        1
daw                                 1
del rey                            21
delacorte press                    11
dey street b

In [38]:
#Determine how many LibraryReads rows do not have Library of Congress data, proxied with the existence of a permalink.
# describe() reports `count`, the number of non-missing permalinks; rows not counted there have no permalink.
full_df['permalink'].describe()


count                                 856
unique                                856
top       https://lccn.loc.gov/2015300007
freq                                    1
Name: permalink, dtype: object

In [39]:
#4th pass of Library of Congress: few results, manual input more efficient than webscraping
# Row label of full_df -> (permalink, genres listed in genre_0, genre_1, ... order).
manual_loc_records = {
    47: ('https://lccn.loc.gov/2020015957',
         ['Biographical fiction']),
    183: ('https://lccn.loc.gov/2020275791',
          ['Chick lit', 'Fiction', 'Humorous fiction', 'Romance fiction', 'Love stories']),
    516: ('https://lccn.loc.gov/2017299737',
          ['Paranormal romance stories', 'Fantasy fiction', 'Fiction', 'Romance fiction', 'Love stories']),
    231: ('https://lccn.loc.gov/2019010863',
          ['Love stories']),
    708: ('https://lccn.loc.gov/2015298168',
          ['Essays']),
    633: ('https://lccn.loc.gov/2015025112',
          ['Psychological fiction', 'Suspense fiction']),
    834: ('https://lccn.loc.gov/2013048524',
          ['Love stories']),
    310: ('https://lccn.loc.gov/2019297682',
          ['Fiction']),
    325: ('https://lccn.loc.gov/2018276149',
          ['Fantasy fiction', 'Science fiction']),
}

for row_label, (permalink, genres) in manual_loc_records.items():
    full_df.loc[row_label, 'permalink'] = permalink
    # The k-th genre of the record goes into column genre_k.
    for position, genre in enumerate(genres):
        full_df.loc[row_label, 'genre_{}'.format(position)] = genre

In [40]:
# Re-check permalink coverage after the manual Library of Congress entries.
full_df['permalink'].describe()

count                                 865
unique                                865
top       https://lccn.loc.gov/2015300007
freq                                    1
Name: permalink, dtype: object

In [41]:
# Columns that mark a book with an 'x' in the LibraryReads archive.
indicator_vars = ['lr_top','lr_hof','debut']
for indicator in indicator_vars:
    print(indicator, full_df[indicator].value_counts())
    # Convert to 0/1 integer flags. Also accepting an existing 1 makes the cell
    # safe to re-run: a second pass no longer resets every flag to 0.
    full_df[indicator]=full_df[indicator].apply(lambda flag: 1 if flag == 'x' or flag == 1 else 0)
    print(indicator,full_df[indicator].value_counts())


lr_top x    86
Name: lr_top, dtype: int64
lr_top 0    850
1     86
Name: lr_top, dtype: int64
lr_hof x    77
Name: lr_hof, dtype: int64
lr_hof 0    859
1     77
Name: lr_hof, dtype: int64
debut x    181
Name: debut, dtype: int64
debut 0    755
1    181
Name: debut, dtype: int64


In [42]:
#error on The Vineyard at Painted Moon: not a Hall of Fame book
# Select the row by ISBN rather than by row label and clear its Hall of Fame flag.
full_df.loc[full_df.isbn==9781335912794,'lr_hof']=0

Clean up/consolidate genre names

In [43]:
#LibraryReads assigned genres
#full_df.loc[full_df.loc[full_df.genre=='Literary Fic'].index[0],'genre'] = 'Literary Fiction'
#full_df.loc[full_df.loc[full_df.genre=='Short stories'].index[0],'genre'] = 'Short Stories'
#full_df.loc[full_df.loc[full_df.genre=='Suspense/Thiller'].index[0],'genre'] = 'Suspense/Thriller'

#Library of Congress genres
genre_variables = ['genre_0','genre_1','genre_2','genre_3','genre_4','genre_5','genre_6','genre_7']
for i in genre_variables:
    # Remove full stops, trim and lower-case. regex=False states explicitly that
    # '.' is a literal full stop and not the regex "any character" wildcard.
    full_df[i] = full_df[i].str.replace('.','',regex=False).str.strip().str.lower()

# Each entry is [genre label as found, label to replace it with].
genre_updates = [['young adult works','young adult fiction'],['thriller fiction','thrillers (fiction)'],['suspence fiction','suspense fiction'],
 ['spy stories','spy fiction'],['medical novels','medical fiction'],['lovet stories','love stories'],
 ['humorous stories','humorous fiction'],['ghost stories fiction','ghost stories'],['ghost storeis','ghost stories'],
 ['fictional works','fiction'],['fiction / thrillers', 'thrillers (fiction)'],['dystopian fiction','dystopias'],
 ['detective fiction','detective and mystery fiction'],['detective and mystery stories','detective and mystery fiction'],
 ['christmas fiction','christmas stories']]

    
def fiction_update_wide(current_genre, new_genre, genre_col):
    '''
    Replace `current_genre` with `new_genre` in column `genre_col` of full_df.

    Only cells equal to `current_genre` are changed. The update uses a boolean
    mask, so rows that merely share an index label with a matching row are
    left untouched. Modifies the module-level full_df in place; returns None.
    '''
    full_df.loc[full_df[genre_col]==current_genre, genre_col] = new_genre

# Apply every [found label, replacement label] pair to every Library of
# Congress genre column.
for found_label, replacement_label in genre_updates:
    for genre_column in genre_variables:
        fiction_update_wide(found_label, replacement_label, genre_column)


In [44]:
# Work on an independent copy so that later changes to wide_df do not alter full_df.
wide_df=full_df.copy() #Analysis refers to the combined dataset as the wide format

In [45]:
#Flag HoF in wide data
hof_year = [2019,2020,2021]


def _hof_flags(list_date):
    '''
    Return (hof_pd, hof_compare1, hof_compare2) as 0/1 flags for one list date.

    list_date: text whose first four characters are the year and whose last
    two characters are the month number.

    hof_pd       - 1 for a year listed in hof_year, or October-December 2018
    hof_compare1 - 1 for April 2016 - Sept 2018
    hof_compare2 - 1 for Oct 2015 - Mar 2018
    '''
    year = int(list_date[:4])
    if year in hof_year:
        return 1, 0, 0
    month = int(list_date[-2:])
    if year == 2018 and month in (10, 11, 12):
        return 1, 0, 0
    compare1 = (
        (year == 2018 and 1 <= month <= 9)
        or year == 2017
        or (year == 2016 and 4 <= month <= 12)
    )
    compare2 = (
        (year == 2018 and 1 <= month <= 3)
        or year == 2017
        or (year == 2016 and 1 <= month <= 12)
        or (year == 2015 and 10 <= month <= 12)
    )
    return 0, int(compare1), int(compare2)


# Compute the flags once per row, in row order. The values of lr_my must not
# be used as row labels: wide_df.loc[<list date>, col] = value appends a new
# row labelled with the date instead of updating the book's own row.
hof_flags = [_hof_flags(list_date) for list_date in wide_df['lr_my']]
wide_df['hof_pd'] = [flags[0] for flags in hof_flags]
wide_df['hof_compare1'] = [flags[1] for flags in hof_flags] #April 2016 - Sept 2018
wide_df['hof_compare2'] = [flags[2] for flags in hof_flags] #Oct 2015 - Mar 2018


In [52]:
# NOTE(review): this writes full_df, not wide_df, so the hof_pd / hof_compare1 /
# hof_compare2 columns that were added to wide_df only are not part of
# wide_df.csv - confirm which frame the analysis expects.
full_df.to_csv('wide_df.csv') #wide format data referred to in analysis


## Create Long Form data set

In [54]:
# Reshape to one row per (ISBN, genre column) pair.
long_df = pd.melt(full_df, id_vars=['isbn'], value_vars=['genre_0', 'genre_1',
       'genre_2', 'genre_3', 'genre_4', 'genre_5', 'genre_6', 'genre_7'])

#NaN values after genre_0 aren't adding information.
#if genre_0 = NaN, indicates that either no Library of Congress Data exists or there is no genre/form field in the catalog data
redundant_rows = long_df.loc[(long_df.variable !='genre_0') & (long_df.value.isna())].index
long_df=long_df.drop(redundant_rows)

# regex=False states explicitly that the space is matched literally.
long_df.value = long_df.value.str.replace(' ','_',regex=False)
LibofCongress_genres = long_df.value.value_counts()
print(len(LibofCongress_genres.loc[LibofCongress_genres > 1]),'genres identified in Library of Congress Records more than once')
print(len(LibofCongress_genres.loc[LibofCongress_genres == 1]),'genres identified in Library of Congress Records once')
long_df.head()

46 genres identified in Library of Congress Records more than once
27 genres identified in Library of Congress Records once


Unnamed: 0,isbn,variable,value
0,9780063050006,genre_0,mystery_fiction
1,9780440001584,genre_0,fantasy_fiction
2,9780593100585,genre_0,love_stories
3,9781250767943,genre_0,love_stories
4,9781250268822,genre_0,suspense_fiction


In [55]:
# Attach the columns of wide_df to every long-format genre row, matching on ISBN; the left join keeps every row of long_df.
long_df=pd.merge(long_df, wide_df,how='left', on = 'isbn')

In [56]:
# Allow the full list of genres to be displayed.
pd.set_option('display.max_rows', 120)

# Lower-case the genre labels, recount them, and show the counts in ascending order.
long_df['value'] = long_df['value'].str.lower()
LibofCongress_genres = long_df['value'].value_counts()
LibofCongress_genres.sort_values()
#long_df.loc[long_df['value']=='suspense fiction'].index


choose-your-own_stories              1
autobiographical_comics              1
utopian_fiction                      1
sea_stories                          1
mystery_in_literature                1
etc                                  1
coming_of_age_fiction                1
novellas                             1
strips                               1
martial_arts_fiction                 1
fiction_/_literary                   1
time-travel_fiction                  1
musical_fiction                      1
humor                                1
bible_fiction                        1
biographies                          1
comic_books                          1
diary_fiction                        1
juvenile_works                       1
christian_fiction                    1
short_stories                        1
comics_(graphic_works)               1
epic_fiction                         1
popular_works                        1
autobiographical_fiction             1
true_crime_stories       

In [58]:
# Save the long-format data (one row per ISBN and genre) to long_df.csv.
long_df.to_csv('long_df.csv')