## Read dataset

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 30)

In [2]:
df = pd.read_csv('docs/books_1.Best_Books_Ever.csv')
df.head(2)

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,bookFormat,edition,pages,publisher,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FOR...,English,9780439023481,"['Young Adult', 'Fiction',...","['Katniss Everdeen', 'Peet...",Hardcover,First Edition,374,Scholastic Press,09/14/08,,['Locus Award Nominee for ...,6376780,"['3444695', '1921313', '74...",96.0,"['District 12, Panem', 'Ca...",https://i.gr-assets.com/im...,2993816,30516,5.09
1,2.Harry_Potter_and_the_Ord...,Harry Potter and the Order...,Harry Potter #5,"J.K. Rowling, Mary GrandPr...",4.5,There is a door at the end...,English,9780439358071,"['Fantasy', 'Young Adult',...","['Sirius Black', 'Draco Ma...",Paperback,US Edition,870,Scholastic Inc.,09/28/04,06/21/03,['Bram Stoker Award for Wo...,2507623,"['1593642', '637516', '222...",98.0,['Hogwarts School of Witch...,https://i.gr-assets.com/im...,2632233,26923,7.38


In [3]:
# total lines and columns
df.shape

(52478, 25)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52478 entries, 0 to 52477
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   bookId            52478 non-null  object 
 1   title             52478 non-null  object 
 2   series            23470 non-null  object 
 3   author            52478 non-null  object 
 4   rating            52478 non-null  float64
 5   description       51140 non-null  object 
 6   language          48672 non-null  object 
 7   isbn              52478 non-null  object 
 8   genres            52478 non-null  object 
 9   characters        52478 non-null  object 
 10  bookFormat        51005 non-null  object 
 11  edition           4955 non-null   object 
 12  pages             50131 non-null  object 
 13  publisher         48782 non-null  object 
 14  publishDate       51598 non-null  object 
 15  firstPublishDate  31152 non-null  object 
 16  awards            52478 non-null  object

## Dataset cleaning 

### Remove duplicates

In [5]:
# Remove duplicates
print('Number of rows before: ', df['bookId'].size)
df0 = df.drop_duplicates()
print('Number of rows after: ', df0['bookId'].size)
df0['bookId'].nunique()

Number of rows before:  52478
Number of rows after:  52428


52424

### Verify `bookId` uniquness and clean `price`

In [6]:
# check if bookId is unique.
df0.groupby('bookId')['bookId'].count().sort_values(ascending=False)

bookId
24903989-widz-ci                            2
635270.The_Planet_Pirates                   2
975953.Time_of_the_Dragons                  2
60614.Diamond_Dogs                          2
1.Harry_Potter_and_the_Half_Blood_Prince    1
                                           ..
19271017-my-story                           1
1927111.Love_Com_Vol_5                      1
1927112.Love_Com_Vol_6                      1
192722.Candle_in_the_Darkness               1
999985.Horrid_Henry_s_Underpants            1
Name: bookId, Length: 52424, dtype: int64

In [7]:
# Above related issue is with 'price' column. 2 slightly different prices for the same bookId
df0[df0['bookId'].isin(['60614.Diamond_Dogs',
                        '24903989-widz-ci',
                        '635270.The_Planet_Pirates'
                        '975953.Time_of_the_Dragons'])].sort_values('bookId')

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,bookFormat,edition,pages,publisher,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
37422,24903989-widz-ci,Widzę cię,Trilogia dei sensi #1,Irene Cao,3.22,Gdyby dało się uchwycić pr...,Polish,9788379991587,"['Romance', 'Erotica', 'Co...","['Elena Kyler', 'Leonardo']",Paperback,,304,Sonia Draga,January 14th 2015,11/06/13,[],1292,"['216', '321', '405', '235...",73.0,['Venice (Italy)'],https://i.gr-assets.com/im...,87,1,7.28
37476,24903989-widz-ci,Widzę cię,Trilogia dei sensi #1,Irene Cao,3.22,Gdyby dało się uchwycić pr...,Polish,9788379991587,"['Romance', 'Erotica', 'Co...","['Elena Kyler', 'Leonardo']",Paperback,,304,Sonia Draga,January 14th 2015,11/06/13,[],1292,"['216', '321', '405', '235...",73.0,['Venice (Italy)'],https://i.gr-assets.com/im...,87,1,7.32
37401,60614.Diamond_Dogs,Diamond Dogs,,Alan Watt (Goodreads Author),3.46,Neil Garvin is a seventeen...,English,9780446677844,"['Fiction', 'Mystery', 'Co...",[],Paperback,,256,Grand Central Publishing,September 1st 2001,09/01/00,['ALA Alex Award (2001)'],320,"['54', '100', '115', '42',...",84.0,[],https://i.gr-assets.com/im...,87,1,6.27
37455,60614.Diamond_Dogs,Diamond Dogs,,Alan Watt (Goodreads Author),3.46,Neil Garvin is a seventeen...,English,9780446677844,"['Fiction', 'Mystery', 'Co...",[],Paperback,,256,Grand Central Publishing,September 1st 2001,09/01/00,['ALA Alex Award (2001)'],320,"['54', '100', '115', '42',...",84.0,[],https://i.gr-assets.com/im...,87,1,6.06


In [8]:
# It could be substituted by the mean, but it is just 4 lines and the difference between prices is negligiable.
# Choosen solution is to remove second price deduplicating by tall except his column.

fix_col = list(df0.columns)
fix_col.remove('price')

df0 = df0.drop_duplicates(fix_col)

# problem solved
df0.groupby('bookId')['bookId'].count().sort_values(ascending=False)

bookId
1.Harry_Potter_and_the_Half_Blood_Prince    1
4068710-ufo-ifo                             1
40670008-before-the-fall                    1
40670312-the-one                            1
4067103-ransom-my-heart                     1
                                           ..
19271017-my-story                           1
1927111.Love_Com_Vol_5                      1
1927112.Love_Com_Vol_6                      1
192722.Candle_in_the_Darkness               1
999985.Horrid_Henry_s_Underpants            1
Name: bookId, Length: 52424, dtype: int64

In [9]:
# Update price format. From string like '1.189.88' to double 1189.88
df01 = df0.copy()
pat = r'\.(\d{3})'
repl = lambda m: m.group().replace(".", "")
df01['price'] = df01['price'].str.replace(pat, repl, regex=True)

df01[df01['bookId']=='3018318']


Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,bookFormat,edition,pages,publisher,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
5067,3018318,جدارية,,Mahmoud Darwish,4.31,"""هزمتك يا موت الفنون جميعه...",Arabic,9781855134966,"['Poetry', 'Literature', '...",[],Paperback,,105,رياض الريس للكتب والنشر,2001,10/28/00,['رياض الريس'],5502,"['3033', '1505', '689', '1...",95.0,[],https://i.gr-assets.com/im...,582,6,1189.88


### Clean `isbn`

In [10]:
# Issue many rows with "isbn" = 9999999999999
df01.groupby('isbn')['isbn'].count().sort_values(ascending=False).head(5)

isbn
9999999999999    4350
9781250166548       2
9780765326355       2
9780312429980       2
0000195166000       1
Name: isbn, dtype: int64

In [11]:
# book usually has a unique ISBN code. In order to use it as part of primary key for the book table
# the rows containing '9999999999999' will be substituted with numeric part of 'bookId'
df02 = df01.copy()
df02['isbn_new'] = df01['bookId'].str.extract(r'^(\d+)')
df02['isbn'] = np.where(df02['isbn'] =='9999999999999', df02['isbn_new'], df02['isbn'])
df03 = df02.drop(columns=['isbn_new'])

print('Number of unique ISBN: ', df03['isbn'].nunique())
print('Number of unique "bookId": ', df03['bookId'].nunique())

Number of unique ISBN:  52421
Number of unique "bookId":  52424


### The ISBN identifies not only the particular publication but its publisher. If there is a change of publisher, then the new publisher must assign one of their own ISBNs to the new publication.

In [12]:
# There are 1 incorrect line
print(df03.groupby(['isbn'])['publisher'].nunique().sort_values(ascending=False).head(5))
df03[df03['isbn'] == '9780312429980'].sort_values('title').head(3)

isbn
9780312429980    2
0000195166000    1
9781423143543    1
9781423144335    1
9781423145509    1
Name: publisher, dtype: int64


Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,bookFormat,edition,pages,publisher,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
1296,7826803-wolf-hall,Wolf Hall,Thomas Cromwell #1,Hilary Mantel (Goodreads A...,3.88,England in the 1520s is a ...,English,9780312429980,"['Historical Fiction', 'Fi...","['Anne Boleyn', 'Thomas Mo...",Paperback,,604,Picador USA,August 31st 2010,04/30/09,"['Booker Prize (2009)', 'O...",166123,"['62692', '51592', '29341'...",86.0,['Putney (United Kingdom)'...,https://i.gr-assets.com/im...,4109,57,1.5
25628,19380923-wolf-hall,Wolf Hall,Thomas Cromwell #1,Hilary Mantel (Goodreads A...,3.88,Tudor England. Henry VIII ...,English,9780312429980,"['Historical Fiction', 'Fi...","['Anne Boleyn', 'Thomas Mo...",Kindle Edition,,672,Fourth Estate,January 16th 2010,04/30/09,"['Booker Prize (2009)', 'O...",166308,"['62767', '51656', '29360'...",86.0,['Putney (United Kingdom)'...,https://i.gr-assets.com/im...,98,1,6.91


In [13]:
# drop the line with less info
print('Lines before: ', df03.shape[0])
df04 = df03.loc[~((df03["isbn"] == '9780312429980') & (df03["bookFormat"] == 'Kindle Edition'))]
print('Lines after: ', df04.shape[0])

Lines before:  52424
Lines after:  52423


### The same ISBN cannot be used for different formats of the book, whether the book is in printed or electronic form. That means hardcover and softcover versions of the book have to be assigned separate ISBNs.

In [14]:
df04.groupby(['isbn'])['bookFormat'].nunique().sort_values(ascending=False).head(5)

isbn
9780765326355    2
9781250166548    2
9781452373218    1
9781452303680    1
9781452305080    1
Name: bookFormat, dtype: int64

In [15]:
# Update isbn with digits from bookId to make it unique
import re
df05 = df04.copy()

def replace_isbn(row):
    if row['isbn'] in ('9781250166548', '9780765326355'):
        m = re.search(r'^\d+', row['bookId'])
        return m.group()
    else:
        return row['isbn']
    
df05['isbn'] = df05.apply(replace_isbn, axis=1)
df05[df05['isbn']=='9781250166548']

df05.groupby(['isbn'])['bookFormat'].nunique().sort_values(ascending=False).head(5)

isbn
0000195166000    1
9781452365862    1
9781452300795    1
9781452301679    1
9781452303680    1
Name: bookFormat, dtype: int64

### Clean `description`

In [16]:
# '\"' found in one book description is impending load as it is not escaped.
df05['description'] = df05['description'].str.replace(r'\\"', '', regex=True)

In [17]:
pd.set_option('display.max_colwidth', 50)
df05.groupby(['isbn'])['description'].nunique().sort_values(ascending=False).head(4)

isbn
0000195166000    1
9781452082448    1
9781451695199    1
9781451695656    1
Name: description, dtype: int64

### basic clean `publishDate`

In [18]:
# too long values for date
df05['publishDate'].str.len().max()

209.0

In [19]:
# max sensible data string length is 20. But there are over 300 longer entries to data column.

pd.set_option('display.max_colwidth', 100)

print(" Quantity of lines for 'publishDate' with length over 20: ", df05[(df05['publishDate'].str.len() > 20)]['publishDate'].count())
df05[df05['publishDate'].str.len() > 20].publishDate.head(2)
#df06['publishDate'].isna().sum()
#with np.printoptions(threshold=np.inf):
 #   print(dft['publishDate'].unique())

 Quantity of lines for 'publishDate' with length over 20:  304


2989    Best Books to Read When the Snow Is Falling\r\n\r\n3,839 books — 3,426 voters\r\nI Had No Idea T...
3560    Most Interesting World\r\n\r\n3,055 books — 2,474 voters\r\nThe Best Omnibuses and Box Sets\r\n\...
Name: publishDate, dtype: object

In [20]:
# Remove not date format data
df07 = df05.copy()
df07['publishDate'] = np.where(df07['publishDate'].str.len() > 20, np.NaN, df07['publishDate'])
df07['publishDate'] = np.where(df07['publishDate'].str.contains(r'\w*^$', regex=True), np.NaN, df07['publishDate'])
#df07[df07['publishDate'].str.len() > 20]
#df07['publishDate'].isna().sum
df07['publishDate'].count()

51240

In [21]:
# Book (title and author) should have 'firstPublishDate'
df07.groupby(['title', 'author'])['firstPublishDate'].nunique().sort_values(ascending=False).head(5)


title                                                       author                                            
A Song of Ice and Fire                                      George R.R. Martin                                    2
The Little House Collection                                 Laura Ingalls Wilder, Garth Williams (Illustrator)    2
Eugene Onegin                                               Alexander Pushkin, James E. Falen (Translator)        2
Oscar and Lucinda                                           Peter Carey                                           1
Motorcycles, Sushi and One Strange Book (Enhanced Edition)  Nancy N. Rue                                          1
Name: firstPublishDate, dtype: int64

In [22]:
#  Update values for 'firstPublishDate' for 3 inconsistent rows
df08 = df07.copy()

df08['firstPublishDate'] = np.where(df08['isbn'] =='9780192838995', '10/30/33', df08['firstPublishDate'])
df08['firstPublishDate'] = np.where(df08['isbn'] =='9780060529963', '10/30/32', df08['firstPublishDate'])
df08['firstPublishDate'] = np.where(df08['isbn'] =='9780345529053', '10/28/00', df08['firstPublishDate'])

### Clean `pages`

In [23]:
# Pages column should contains integers.
df09 = df08.copy()
df09['pages'] =  df09['pages'].str.replace('1 page', '', regex=True)

### Prepare list like columns  for loading to postgres

In [24]:
df09['setting']

0                                          ['District 12, Panem', 'Capitol, Panem', 'Panem (United States)']
1                         ['Hogwarts School of Witchcraft and Wizardry (United Kingdom)', 'London, England']
2                                                                       ['Maycomb, Alabama (United States)']
3        ['United Kingdom', 'Derbyshire, England (United Kingdom)', 'England', 'Hertfordshire, England (U...
4        ['Forks, Washington (United States)', 'Phoenix, Arizona (United States)', 'Washington (state) (U...
                                                        ...                                                 
52473                                                                                                     []
52474                                                                                                     []
52475                                                                                                     []
52476              

In [25]:
import pandas as pd
from ast import literal_eval
df10 = df09.copy()
for col in ['genres', 'setting', 'ratingsByStars', 'awards']:
    df10[col] = df09[col].apply(literal_eval)

In [26]:
df10.setting.head(2)

0                       [District 12, Panem, Capitol, Panem, Panem (United States)]
1    [Hogwarts School of Witchcraft and Wizardry (United Kingdom), London, England]
Name: setting, dtype: object

## Save clean total dataset

In [27]:
df10.to_csv('docs/data-preparation_total.csv', index=False) #, lineterminator='\n')

In [28]:
df10.columns

Index(['bookId', 'title', 'series', 'author', 'rating', 'description',
       'language', 'isbn', 'genres', 'characters', 'bookFormat', 'edition',
       'pages', 'publisher', 'publishDate', 'firstPublishDate', 'awards',
       'numRatings', 'ratingsByStars', 'likedPercent', 'setting', 'coverImg',
       'bbeScore', 'bbeVotes', 'price'],
      dtype='object')

## Prepare tables for load and save

### 1 . book

In [29]:
# prepare to load characters as a list (postgres array)
df11 = df10.copy()
to_replace = {'^\[(\'|\")': '{\"',
              '(\'|\")\]$': '\"}',
              '(\'|\"), (\'|\")': '\", \"',
              '\[\]': '',
              ',(\'|\")': '\"'}
to_repl = {'^\[': '{',
              '\]$': '}'}
df11['characters'] = df10['characters'].replace(to_replace, regex=True)
df11['characters'].head(2)

0    {"Katniss Everdeen", "Peeta Mellark", "Cato (Hunger Games)", "Primrose Everdeen", "Gale Hawthorn...
1    {"Sirius Black", "Draco Malfoy", "Ron Weasley", "Petunia Dursley", "Vernon Dursley", "Dudley Dur...
Name: characters, dtype: object

In [30]:
# book table (title, author, firstPublishDate, characters)
book = df11[['title', 'author', 'firstPublishDate', 'characters']]
book.head(5)

Unnamed: 0,title,author,firstPublishDate,characters
0,The Hunger Games,Suzanne Collins,,"{""Katniss Everdeen"", ""Peeta Mellark"", ""Cato (Hunger Games)"", ""Primrose Everdeen"", ""Gale Hawthorn..."
1,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré (Illustrator)",06/21/03,"{""Sirius Black"", ""Draco Malfoy"", ""Ron Weasley"", ""Petunia Dursley"", ""Vernon Dursley"", ""Dudley Dur..."
2,To Kill a Mockingbird,Harper Lee,07/11/60,"{""Scout Finch"", ""Atticus Finch"", ""Jem Finch"", ""Arthur Radley"", ""Mayella Ewell"", ""Aunt Alexandra""..."
3,Pride and Prejudice,"Jane Austen, Anna Quindlen (Introduction)",01/28/13,"{""Mr. Bennet"", ""Mrs. Bennet"", ""Jane Bennet"", ""Elizabeth Bennet"", ""Mary Bennet"", ""Kitty Bennet"", ..."
4,Twilight,Stephenie Meyer,10/05/05,"{""Edward Cullen"", ""Jacob Black"", ""Laurent"", ""Renee"", ""Bella Swan"", ""Billy Black"", ""Esme Cullen"",..."


In [31]:
book.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52423 entries, 0 to 52477
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             52423 non-null  object
 1   author            52423 non-null  object
 2   firstPublishDate  31121 non-null  object
 3   characters        52423 non-null  object
dtypes: object(4)
memory usage: 2.0+ MB


In [32]:
book.to_csv('docs/book.csv', index=False)

### 2. genre

In [33]:
df10.groupby(['title', 'author'])['genres'].count().sort_values(ascending=False)

title                                           author                               
Limits of Destiny                               Sharlyn G. Branson (Goodreads Author)    5
Poderosa                                        Sérgio Klein                             3
The Way of Kings                                Brandon Sanderson (Goodreads Author)     3
The Spaghetti Set: Family Served Italian Style  Rose Marie Boyd (Goodreads Author)       2
Ice Station                                     Matthew Reilly                           2
                                                                                        ..
I, Claudius                                     Robert Graves                            1
I, Claudius/Claudius the God                    Robert Graves                            1
I, Coriander                                    Sally Gardner                            1
I, Cosmo                                        Carlie Sorosiak (Goodreads Author)       1
신의 탑

In [34]:
# book genre (title, author, genres)
genre = df10[['title', 'author', 'genres']]
genre.to_csv('docs/genre.csv', index=False)

In [35]:
# remove lines without genre info
genre = genre[genre["genres"].str.len() != 0]

In [36]:
# Function to get the row with maximum length
def get_longest_row(group):
    return group.loc[group['genres'].str.len().idxmax()]
# Grouping by 'title' and 'author', and selecting the genre with maximum length
genre = genre.groupby(['title', 'author']).apply(get_longest_row).reset_index(drop=True)


In [37]:
genre.groupby(['title', 'author'])['genres'].count().sort_values(ascending=False).head(3)

title                                                  author                       
!از قر و قمبیل‌های قلمی بی‌قال و قیل                   بزرگمهر حسین پور                 1
The Book of Psalms: A Translation with Commentary      Robert Alter                     1
The Book of Mormon: Another Testament of Jesus Christ  Joseph Smith Jr. (Translator)    1
Name: genres, dtype: int64

In [38]:
genre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47775 entries, 0 to 47774
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   47775 non-null  object
 1   author  47775 non-null  object
 2   genres  47775 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB


In [39]:
# one genre per book line
genre = genre.explode('genres')
genre.head(5)

Unnamed: 0,title,author,genres
0,!از قر و قمبیل‌های قلمی بی‌قال و قیل,بزرگمهر حسین پور,Comics
0,!از قر و قمبیل‌های قلمی بی‌قال و قیل,بزرگمهر حسین پور,Humor
1,"""A Problem from Hell"": America and the Age of Genocide",Samantha Power,Nonfiction
1,"""A Problem from Hell"": America and the Age of Genocide",Samantha Power,History
1,"""A Problem from Hell"": America and the Age of Genocide",Samantha Power,Politics


In [40]:
genre.to_csv('docs/genre.csv', index=False)

### 3. setting

In [41]:
# setting table (title, author, setting)
df10.groupby(['title', 'author'])['setting'].count().sort_values(ascending=False)

title                                           author                               
Limits of Destiny                               Sharlyn G. Branson (Goodreads Author)    5
Poderosa                                        Sérgio Klein                             3
The Way of Kings                                Brandon Sanderson (Goodreads Author)     3
The Spaghetti Set: Family Served Italian Style  Rose Marie Boyd (Goodreads Author)       2
Ice Station                                     Matthew Reilly                           2
                                                                                        ..
I, Claudius                                     Robert Graves                            1
I, Claudius/Claudius the God                    Robert Graves                            1
I, Coriander                                    Sally Gardner                            1
I, Cosmo                                        Carlie Sorosiak (Goodreads Author)       1
신의 탑

In [42]:
setting = df10[['title', 'author', 'setting']]
setting = setting[setting['setting'].str.len() != 0]
setting.head(2)

Unnamed: 0,title,author,setting
0,The Hunger Games,Suzanne Collins,"[District 12, Panem, Capitol, Panem, Panem (United States)]"
1,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré (Illustrator)","[Hogwarts School of Witchcraft and Wizardry (United Kingdom), London, England]"


In [43]:
setting.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11568 entries, 0 to 52472
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    11568 non-null  object
 1   author   11568 non-null  object
 2   setting  11568 non-null  object
dtypes: object(3)
memory usage: 361.5+ KB


In [44]:
# Function to get the row with maximum length
def get_longest_row(group):
    return group.loc[group['setting'].str.len().idxmax()]

# Grouping by 'title' and 'author', and selecting the genre with maximum length
setting = setting.groupby(['title', 'author']).apply(get_longest_row).reset_index(drop=True)

In [45]:
setting.groupby(['title', 'author'])['setting'].count().sort_values(ascending=False).head(3)

title                                     author                                
"Slowly, Slowly, Slowly," said the Sloth  Eric Carle, Jane Goodall (Foreword by)    1
The Black Arrow                           Robert Louis Stevenson                    1
The Black Circle                          Patrick Carman (Goodreads Author)         1
Name: setting, dtype: int64

In [46]:
# one setting per book line
setting = setting.explode('setting')
setting.head(5)

Unnamed: 0,title,author,setting
0,"""Slowly, Slowly, Slowly,"" said the Sloth","Eric Carle, Jane Goodall (Foreword by)",Amazon Rainforest
1,#Nerd,Cambria Hebert (Goodreads Author),Maryland (United States)
2,#scandal,Sarah Ockler (Goodreads Author),"Lavender Oaks, Colorado (United States)"
3,'Salem's Lot,Stephen King (Goodreads Author),"Jerusalem's Lot, Maine (United States)"
4,'Til Death,Sharon Sala (Goodreads Author),Kentucky (United States)


In [47]:
setting.to_csv('docs/setting.csv', index=False)

### 4. awards

In [48]:
# award table (title, author, award)
df10.groupby(['isbn'])['awards'].count().sort_values(ascending=False)

isbn
0000195166000    1
9781461107033    1
9781461053743    1
9781461063520    1
9781461065715    1
                ..
9780440415992    1
9780440416432    1
9780440416487    1
9780440416548    1
 978097736462    1
Name: awards, Length: 52423, dtype: int64

In [49]:
#award table (isbn, award)
award = df10[['isbn', 'awards']]
award.info()
award = award[award['awards'].str.len() != 0]
award.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52423 entries, 0 to 52477
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   isbn    52423 non-null  object
 1   awards  52423 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10603 entries, 0 to 52475
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   isbn    10603 non-null  object
 1   awards  10603 non-null  object
dtypes: object(2)
memory usage: 248.5+ KB


In [50]:
# Function to get the row with maximum length
def get_longest_row(group):
    return group.loc[group['awards'].str.len().idxmax()]

# Grouping by 'title' and 'author', and selecting the genre with maximum length
award = award.groupby(['isbn']).apply(get_longest_row).reset_index(drop=True)

In [51]:
# one award line per book
award = award.explode('awards')
award.head(5)

Unnamed: 0,isbn,awards
0,195170342,Pulitzer Prize for History (2005)
0,195170342,Ambassador Book Award for American Studies (2005)
0,195170342,Massachusetts Book Award Nominee for Nonfiction (2005)
0,195170342,National Book Award Finalist for Nonfiction (2004)
1,31809014745,ECPA Christian Book Award


In [52]:
# Extract award year information
awards = award.copy()
awards['award'] = awards['awards'].str.extract(r'(.*)\s\(\d{4}\)$')
awards['award_year'] = awards['awards'].str.extract(r'\((\d{4})\)$')
awards['award'] = np.where(awards['award'].isna(), awards['awards'], awards['award'])
awards = awards.drop(columns='awards')
awards[awards['award'].isna()]

Unnamed: 0,isbn,award,award_year


In [53]:
awards.head(3)

Unnamed: 0,isbn,award,award_year
0,195170342,Pulitzer Prize for History,2005
0,195170342,Ambassador Book Award for American Studies,2005
0,195170342,Massachusetts Book Award Nominee for Nonfiction,2005


In [54]:
awards.to_csv('docs/award.csv', index=False)

### 5. star

In [55]:
# star table (isbn, ratingsByStars) cum 8
df10.groupby(['isbn'])['ratingsByStars'].count().sort_values(ascending=False)

isbn
0000195166000    1
9781461107033    1
9781461053743    1
9781461063520    1
9781461065715    1
                ..
9780440415992    1
9780440416432    1
9780440416487    1
9780440416548    1
 978097736462    1
Name: ratingsByStars, Length: 52423, dtype: int64

In [56]:
star = df10[['isbn', 'ratingsByStars']]
print(star.info())
star = star[star['ratingsByStars'].str.len() != 0]
star.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52423 entries, 0 to 52477
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   isbn            52423 non-null  object
 1   ratingsByStars  52423 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 51001 entries, 0 to 52477
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   isbn            51001 non-null  object
 1   ratingsByStars  51001 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


In [57]:
# Function to get the row with maximum length
def get_longest_row(group):
    return group.loc[group['ratingsByStars'].str.len().idxmax()]

# Grouping by 'title' and 'author', and selecting the genre with maximum length
star = star.groupby(['isbn']).apply(get_longest_row).reset_index(drop=True)

In [58]:
# one star line per publication
star = star.explode('ratingsByStars')
star['star']= -(star.groupby('isbn')['ratingsByStars'].cumcount()-5)
star.head(10)

Unnamed: 0,isbn,ratingsByStars,star
0,195166000,74,5
0,195166000,92,4
0,195166000,80,3
0,195166000,19,2
0,195166000,7,1
1,195170342,6136,5
1,195170342,4396,4
1,195170342,2255,3
1,195170342,638,2
1,195170342,286,1


In [59]:
star.to_csv('docs/star.csv', index=False)

### 6. publication table

In [60]:
# publication table (isbn,series,rating,description,language,
#bookFormat,edition,pages,publisher,publishDate,numRatings ,likedPercent, coverImg,
#'bbeScore', 'bbeVotes', 'price')
publication = df10[['isbn', 'bookFormat', 'publisher', 'publishDate', 'description', 'series', 'rating',
                    'language', 'edition', 'pages', 'numRatings', 'likedPercent', 'coverImg',
                    'bbeScore', 'bbeVotes', 'price', 'author', 'title']]
publication.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 52423 entries, 0 to 52477
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   isbn          52423 non-null  object 
 1   bookFormat    50950 non-null  object 
 2   publisher     48731 non-null  object 
 3   publishDate   51240 non-null  object 
 4   description   51087 non-null  object 
 5   series        23441 non-null  object 
 6   rating        52423 non-null  float64
 7   language      48622 non-null  object 
 8   edition       4949 non-null   object 
 9   pages         50080 non-null  object 
 10  numRatings    52423 non-null  int64  
 11  likedPercent  51802 non-null  float64
 12  coverImg      51818 non-null  object 
 13  bbeScore      52423 non-null  int64  
 14  bbeVotes      52423 non-null  int64  
 15  price         38079 non-null  object 
 16  author        52423 non-null  object 
 17  title         52423 non-null  object 
dtypes: float64(2), int64(3), o

In [61]:
publication.to_csv('docs/publication.csv', index=False)

In [62]:
publication.columns


Index(['isbn', 'bookFormat', 'publisher', 'publishDate', 'description',
       'series', 'rating', 'language', 'edition', 'pages', 'numRatings',
       'likedPercent', 'coverImg', 'bbeScore', 'bbeVotes', 'price', 'author',
       'title'],
      dtype='object')

#### Please note that `bookId` column was droped and was substituted with `isbn`, `title` and `author`
#### as uniquely identifying rows in this dataset.

#### New columns `award_year` and `star` was added to better distribute available information.
