## Read dataset

In [82]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 30)

In [83]:
df = pd.read_csv('books_1.Best_Books_Ever.csv')
df.head(2)

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,bookFormat,edition,pages,publisher,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FOR...,English,9780439023481,"['Young Adult', 'Fiction',...","['Katniss Everdeen', 'Peet...",Hardcover,First Edition,374,Scholastic Press,09/14/08,,['Locus Award Nominee for ...,6376780,"['3444695', '1921313', '74...",96.0,"['District 12, Panem', 'Ca...",https://i.gr-assets.com/im...,2993816,30516,5.09
1,2.Harry_Potter_and_the_Ord...,Harry Potter and the Order...,Harry Potter #5,"J.K. Rowling, Mary GrandPr...",4.5,There is a door at the end...,English,9780439358071,"['Fantasy', 'Young Adult',...","['Sirius Black', 'Draco Ma...",Paperback,US Edition,870,Scholastic Inc.,09/28/04,06/21/03,['Bram Stoker Award for Wo...,2507623,"['1593642', '637516', '222...",98.0,['Hogwarts School of Witch...,https://i.gr-assets.com/im...,2632233,26923,7.38


In [84]:
# total lines and columns
df.shape

(52478, 25)

## Dataset cleaning 

### Remove duplicates

In [85]:
# Remove duplicates
print('Number of rows before: ', df['bookId'].size)
df0 = df.drop_duplicates()
print('Number of rows after: ', df0['bookId'].size)
df0['bookId'].nunique()

Number of rows before:  52478
Number of rows after:  52428


52424

### Verify `bookId` uniquness and clean `price`

In [86]:
# check if bookId is unique.
df0.groupby('bookId')['bookId'].count().sort_values(ascending=False)

bookId
24903989-widz-ci                            2
635270.The_Planet_Pirates                   2
975953.Time_of_the_Dragons                  2
60614.Diamond_Dogs                          2
1.Harry_Potter_and_the_Half_Blood_Prince    1
                                           ..
19271017-my-story                           1
1927111.Love_Com_Vol_5                      1
1927112.Love_Com_Vol_6                      1
192722.Candle_in_the_Darkness               1
999985.Horrid_Henry_s_Underpants            1
Name: bookId, Length: 52424, dtype: int64

In [87]:
# Above related issue is with 'price' column. 2 slightly different prices for the same bookId
df0[df0['bookId'].isin(['60614.Diamond_Dogs',
                        '24903989-widz-ci',
                        '635270.The_Planet_Pirates'
                        '975953.Time_of_the_Dragons'])].sort_values('bookId')

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,bookFormat,edition,pages,publisher,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
37422,24903989-widz-ci,Widzę cię,Trilogia dei sensi #1,Irene Cao,3.22,Gdyby dało się uchwycić pr...,Polish,9788379991587,"['Romance', 'Erotica', 'Co...","['Elena Kyler', 'Leonardo']",Paperback,,304,Sonia Draga,January 14th 2015,11/06/13,[],1292,"['216', '321', '405', '235...",73.0,['Venice (Italy)'],https://i.gr-assets.com/im...,87,1,7.28
37476,24903989-widz-ci,Widzę cię,Trilogia dei sensi #1,Irene Cao,3.22,Gdyby dało się uchwycić pr...,Polish,9788379991587,"['Romance', 'Erotica', 'Co...","['Elena Kyler', 'Leonardo']",Paperback,,304,Sonia Draga,January 14th 2015,11/06/13,[],1292,"['216', '321', '405', '235...",73.0,['Venice (Italy)'],https://i.gr-assets.com/im...,87,1,7.32
37401,60614.Diamond_Dogs,Diamond Dogs,,Alan Watt (Goodreads Author),3.46,Neil Garvin is a seventeen...,English,9780446677844,"['Fiction', 'Mystery', 'Co...",[],Paperback,,256,Grand Central Publishing,September 1st 2001,09/01/00,['ALA Alex Award (2001)'],320,"['54', '100', '115', '42',...",84.0,[],https://i.gr-assets.com/im...,87,1,6.27
37455,60614.Diamond_Dogs,Diamond Dogs,,Alan Watt (Goodreads Author),3.46,Neil Garvin is a seventeen...,English,9780446677844,"['Fiction', 'Mystery', 'Co...",[],Paperback,,256,Grand Central Publishing,September 1st 2001,09/01/00,['ALA Alex Award (2001)'],320,"['54', '100', '115', '42',...",84.0,[],https://i.gr-assets.com/im...,87,1,6.06


In [88]:
df0['price'].sort_values()

39591    0.84
43494    0.84
47302    0.84
39453    0.84
47182    0.84
         ... 
52467     NaN
52468     NaN
52471     NaN
52473     NaN
52474     NaN
Name: price, Length: 52428, dtype: object

In [89]:
# It could be substituted by the mean, but it is just 4 lines and the difference between prices is negligiable.
# Choosen solution is to remove second price deduplicating by tall except his column.

fix_col = list(df0.columns)
fix_col.remove('price')

df0 = df0.drop_duplicates(fix_col)

df0.groupby('bookId')['bookId'].count().sort_values(ascending=False)

bookId
1.Harry_Potter_and_the_Half_Blood_Prince    1
4068710-ufo-ifo                             1
40670008-before-the-fall                    1
40670312-the-one                            1
4067103-ransom-my-heart                     1
                                           ..
19271017-my-story                           1
1927111.Love_Com_Vol_5                      1
1927112.Love_Com_Vol_6                      1
192722.Candle_in_the_Darkness               1
999985.Horrid_Henry_s_Underpants            1
Name: bookId, Length: 52424, dtype: int64

In [90]:
# Update price format. From string like '1.189.88' to double 1189.88
df01 = df0.copy()
pat = r'\.(\d{3})'
repl = lambda m: m.group().replace(".", "")
df01['price'] = df01['price'].str.replace(pat, repl, regex=True)

df01[df01['bookId']=='3018318']


Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,bookFormat,edition,pages,publisher,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
5067,3018318,جدارية,,Mahmoud Darwish,4.31,"""هزمتك يا موت الفنون جميعه...",Arabic,9781855134966,"['Poetry', 'Literature', '...",[],Paperback,,105,رياض الريس للكتب والنشر,2001,10/28/00,['رياض الريس'],5502,"['3033', '1505', '689', '1...",95.0,[],https://i.gr-assets.com/im...,582,6,1189.88


### Clean `isbn`

In [91]:
# Issue many rows with "isbn" = 9999999999999
df01.groupby('isbn')['isbn'].count().sort_values(ascending=False)

isbn
9999999999999    4350
9781250166548       2
9780765326355       2
9780312429980       2
0000195166000       1
                 ... 
9780552556804       1
9780552557573       1
9780552561563       1
9780552562522       1
 978097736462       1
Name: isbn, Length: 48072, dtype: int64

In [92]:
# book usually has a unique ISBN code. In order to use it as part of primary key for the book table
# the rows containing '9999999999999' will be substituted with numeric part of 'bookId'
df02 = df01.copy()
df02['isbn_new'] = df01['bookId'].str.extract(r'^(\d+)')
df02['isbn'] = np.where(df02['isbn'] =='9999999999999', df02['isbn_new'], df02['isbn'])
df03 = df02.drop(columns=['isbn_new'])

print('Number of unique ISBN: ', df03['isbn'].nunique())
print('Number of unique "bookId": ', df03['bookId'].nunique())

Number of unique ISBN:  52421
Number of unique "bookId":  52424


In [93]:
# Difference between 'bookId' unique count and 'isbn' unique count is 'bookFormat' variation.
print(df03.groupby(['isbn'])['bookId'].count().sort_values(ascending=False).head(5))
df03[df03['isbn'].isin(['9780765326355', '9781250166548','9780312429980'])].sort_values('title')

isbn
9780765326355    2
9781250166548    2
9780312429980    2
0000195166000    1
9781461107033    1
Name: bookId, dtype: int64


Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,bookFormat,edition,pages,publisher,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
23159,34704992-edgedancer,Edgedancer,The Stormlight Archive #2.5,Brandon Sanderson (Goodrea...,4.29,\r\n From #1 New York Tim...,,9781250166548,"['Fantasy', 'Fiction', 'Ep...",['Lift'],Kindle Edition,,272,Tor Books,October 3rd 2017,11/22/16,[],39199,"['17521', '16063', '5009',...",98.0,[],https://i.gr-assets.com/im...,99,1,9.81
27486,34703445-edgedancer,Edgedancer,The Stormlight Archive #2.5,Brandon Sanderson (Goodrea...,4.29,From #1 New York Times bes...,,9781250166548,"['Fantasy', 'Fiction', 'Ep...",['Lift'],Hardcover,,272,Tor Books,October 17th 2017,11/22/16,[],39206,"['17526', '16065', '5009',...",98.0,[],https://i.gr-assets.com/im...,97,1,9.81
342,7235533-the-way-of-kings,The Way of Kings,The Stormlight Archive #1,Brandon Sanderson (Goodrea...,4.63,From #1 New York Times bes...,English,9780765326355,"['Fantasy', 'Fiction', 'Ep...","['Kaladin Stormblessed', '...",Hardcover,,1007,Tor Books,08/31/10,,['Locus Award Nominee for ...,302877,"['221465', '60345', '14100...",98.0,['Roshar'],https://i.gr-assets.com/im...,34012,359,22.74
32761,9188338-the-way-of-kings,The Way of Kings,The Stormlight Archive #1,Brandon Sanderson (Goodrea...,4.63,From #1 New York Times bes...,English,9780765326355,"['Fantasy', 'Fiction', 'Ep...","['Kaladin Stormblessed', '...",Kindle Edition,,1137,Tor Books,August 31st 2010,,['Locus Award Nominee for ...,304529,"['222482', '60729', '14210...",98.0,['Roshar'],https://i.gr-assets.com/im...,93,1,22.47
1296,7826803-wolf-hall,Wolf Hall,Thomas Cromwell #1,Hilary Mantel (Goodreads A...,3.88,England in the 1520s is a ...,English,9780312429980,"['Historical Fiction', 'Fi...","['Anne Boleyn', 'Thomas Mo...",Paperback,,604,Picador USA,August 31st 2010,04/30/09,"['Booker Prize (2009)', 'O...",166123,"['62692', '51592', '29341'...",86.0,['Putney (United Kingdom)'...,https://i.gr-assets.com/im...,4109,57,1.5
25628,19380923-wolf-hall,Wolf Hall,Thomas Cromwell #1,Hilary Mantel (Goodreads A...,3.88,Tudor England. Henry VIII ...,English,9780312429980,"['Historical Fiction', 'Fi...","['Anne Boleyn', 'Thomas Mo...",Kindle Edition,,672,Fourth Estate,January 16th 2010,04/30/09,"['Booker Prize (2009)', 'O...",166308,"['62767', '51656', '29360'...",86.0,['Putney (United Kingdom)'...,https://i.gr-assets.com/im...,98,1,6.91


### Clean `description` . There should be one unique description for `isbn`

In [94]:
pd.set_option('display.max_colwidth', None)

df03.groupby(['isbn'])['description'].nunique().sort_values(ascending=False)

isbn
9780765326355    2
9780312429980    2
9781250166548    2
0000195166000    1
9781452096834    1
                ..
9780340599099    0
9780753556030    0
20896128         0
9780340564622    0
17259397         0
Name: description, Length: 52421, dtype: int64

In [95]:
df03[df03['isbn'].isin(['9780765326355', '9781250166548','9780312429980'])][['title', 'isbn', 'bookFormat', 'description']].sort_values('title')

Unnamed: 0,title,isbn,bookFormat,description
23159,Edgedancer,9781250166548,Kindle Edition,"\r\n From #1 New York Times bestselling author Brandon Sanderson, a special gift edition of Edgedancer, a short novel of the Stormlight Archive.\r\nThree years ago, Lift asked a goddess to stop her from growing older--a wish she believed was granted. Now, in Edgedancer, the barely teenage nascent Knight Radiant finds that time stands still for no one. Although the young Azish emperor granted her safe haven from an executioner she knows only as Darkness, court life is suffocating the free-spirited Lift, who can't help heading to Yeddaw when she hears the relentless Darkness is there hunting people like her with budding powers. The downtrodden in Yeddaw have no champion, and Lift knows she must seize this awesome responsibility.\r\n Other books by Brandon Sanderson\r\n\r\n \r\n The Cosmere\r\n \r\n \r\n \r\n \r\n The Mistborn Saga\r\n \r\n Mistborn: The Final Empire\r\n \r\n The Well of Ascension\r\n \r\n The Hero of Ages\r\n \r\n Alloy of Law\r\n \r\n Shadows of Self\r\n \r\n Bands of Mourning\r\nThe Stormlight ArchiveThe Way of KingsWords of RadianceEdgedancer (Novella)Oathbringer (forthcoming)\r\n Collection\r\n \r\n Arcanum Unbounded\r\n\r\n Other Cosmere Titles\r\n \r\n Elantris\r\n \r\n Warbreaker\r\n \r\n Rithmatist\r\n\r\n \r\n The Alcatraz vs. the Evil Librarians Series\r\n \r\n \r\n Alcatraz vs. the Evil Librarians\r\n \r\n The Scrivener's Bones\r\n \r\n The Knights of Crystallia\r\n \r\n The Shattered Lens\r\n \r\n The Dark Talent\r\n\r\n \r\n The Reckoners Series\r\n \r\n \r\n Steelheart\r\n \r\n Firefight\r\n \r\n Calamity\r\nAt the Publisher's request, this title is being sold without Digital Rights Management Software (DRM) applied."
27486,Edgedancer,9781250166548,Hardcover,"From #1 New York Times bestselling author Brandon Sanderson, a special gift edition of Edgedancer, a short novel of the Stormlight Archive (previously published in Arcanum Unbounded).Three years ago, Lift asked a goddess to stop her from growing older--a wish she believed was granted. Now, in Edgedancer, the barely teenage nascent Knight Radiant finds that time stands still for no one. Although the young Azish emperor granted her safe haven from an executioner she knows only as Darkness, court life is suffocating the free-spirited Lift, who can't help heading to Yeddaw when she hears the relentless Darkness is there hunting people like her with budding powers. The downtrodden in Yeddaw have no champion, and Lift knows she must seize this awesome responsibility."
342,The Way of Kings,9780765326355,Hardcover,"From #1 New York Times bestselling author Brandon Sanderson, The Way of Kings, book one of The Stormlight Archive begins an incredible new saga of epic proportion.Roshar is a world of stone and storms. Uncanny tempests of incredible power sweep across the rocky terrain so frequently that they have shaped ecology and civilization alike. Animals hide in shells, trees pull in branches, and grass retracts into the soilless ground. Cities are built only where the topography offers shelter.It has been centuries since the fall of the ten consecrated orders known as the Knights Radiant, but their Shardblades and Shardplate remain: mystical swords and suits of armor that transform ordinary men into near-invincible warriors. Men trade kingdoms for Shardblades. Wars were fought for them, and won by them.One such war rages on a ruined landscape called the Shattered Plains. There, Kaladin, who traded his medical apprenticeship for a spear to protect his little brother, has been reduced to slavery. In a war that makes no sense, where ten armies fight separately against a single foe, he struggles to save his men and to fathom the leaders who consider them expendable.Brightlord Dalinar Kholin commands one of those other armies. Like his brother, the late king, he is fascinated by an ancient text called The Way of Kings. Troubled by over-powering visions of ancient times and the Knights Radiant, he has begun to doubt his own sanity.Across the ocean, an untried young woman named Shallan seeks to train under an eminent scholar and notorious heretic, Dalinar's niece, Jasnah. Though she genuinely loves learning, Shallan's motives are less than pure. As she plans a daring theft, her research for Jasnah hints at secrets of the Knights Radiant and the true cause of the war.The result of over ten years of planning, writing, and world-building, The Way of Kings is but the opening movement of the Stormlight Archive, a bold masterpiece in the making.Speak again the ancient oaths:Life before death.Strength before weakness.Journey before Destination.and return to men the Shards they once bore.The Knights Radiant must stand again."
32761,The Way of Kings,9780765326355,Kindle Edition,"From #1 New York Times bestselling author Brandon Sanderson, The Way of Kings, Book One of the Stormlight Archive begins an incredible new saga of epic proportion.Roshar is a world of stone and storms. Uncanny tempests of incredible power sweep across the rocky terrain so frequently that they have shaped ecology and civilization alike. Animals hide in shells, trees pull in branches, and grass retracts into the soilless ground. Cities are built only where the topography offers shelter.It has been centuries since the fall of the ten consecrated orders known as the Knights Radiant, but their Shardblades and Shardplate remain: mystical swords and suits of armor that transform ordinary men into near-invincible warriors. Men trade kingdoms for Shardblades. Wars were fought for them, and won by them.One such war rages on a ruined landscape called the Shattered Plains. There, Kaladin, who traded his medical apprenticeship for a spear to protect his little brother, has been reduced to slavery. In a war that makes no sense, where ten armies fight separately against a single foe, he struggles to save his men and to fathom the leaders who consider them expendable.Brightlord Dalinar Kholin commands one of those other armies. Like his brother, the late king, he is fascinated by an ancient text called The Way of Kings. Troubled by over-powering visions of ancient times and the Knights Radiant, he has begun to doubt his own sanity.Across the ocean, an untried young woman named Shallan seeks to train under an eminent scholar and notorious heretic, Dalinar's niece, Jasnah. Though she genuinely loves learning, Shallan's motives are less than pure. As she plans a daring theft, her research for Jasnah hints at secrets of the Knights Radiant and the true cause of the war.The result of over ten years of planning, writing, and world-building, The Way of Kings is but the opening movement of the Stormlight Archive, a bold masterpiece in the making.Speak again the ancient oaths:Life before death.Strength before weakness.Journey before Destination.and return to men the Shards they once bore.The Knights Radiant must stand again."
1296,Wolf Hall,9780312429980,Paperback,"England in the 1520s is a heartbeat from disaster. If the king dies without a male heir, the country could be destroyed by civil war. Henry VIII wants to annul his marriage of twenty years and marry Anne Boleyn. The pope and most of Europe opposes him. Into this impasse steps Thomas Cromwell: a wholly original man, a charmer and a bully, both idealist and opportunist, astute in reading people, and implacable in his ambition. But Henry is volatile: one day tender, one day murderous. Cromwell helps him break the opposition, but what will be the price of his triumph?"
25628,Wolf Hall,9780312429980,Kindle Edition,"Tudor England. Henry VIII is on the throne, but has no heir. Cardinal Wolsey is charged with securing his divorce. Into this atmosphere of distrust comes Thomas Cromwell - a man as ruthlessly ambitious in his wider politics as he is for himself. His reforming agenda is carried out in the grip of a self-interested parliament and a king who fluctuates between romantic passions and murderous rages."


In [96]:
df04 = df03.copy()
correct_value_book1 = df04.loc[(df04.isbn == '9781250166548') & (df04.bookFormat == 'Hardcover'), 'description'].values[0]
df04.loc[(df04.isbn == '9781250166548') & (df04.bookFormat == 'Kindle Edition'), 'description'] = correct_value_book1

correct_value_book2 = df04.loc[(df04.isbn == '9780765326355') & (df04.bookFormat == 'Hardcover'), 'description'].values[0]
df04.loc[(df04.isbn == '9780765326355') & (df04.bookFormat == 'Kindle Edition'), 'description'] = correct_value_book2

correct_value_book3 = df04.loc[(df04.isbn == '9780312429980') & (df04.bookFormat == 'Paperback'), 'description'].values[0]
df04.loc[(df04.isbn == '9780312429980') & (df04.bookFormat == 'Kindle Edition'), 'description'] = correct_value_book3

df04[df04['isbn'].isin(['9780765326355', '9781250166548','9780312429980'])] \
                        [['title', 'isbn', 'bookFormat', 'description']] \
                          .sort_values('title')

Unnamed: 0,title,isbn,bookFormat,description
23159,Edgedancer,9781250166548,Kindle Edition,"From #1 New York Times bestselling author Brandon Sanderson, a special gift edition of Edgedancer, a short novel of the Stormlight Archive (previously published in Arcanum Unbounded).Three years ago, Lift asked a goddess to stop her from growing older--a wish she believed was granted. Now, in Edgedancer, the barely teenage nascent Knight Radiant finds that time stands still for no one. Although the young Azish emperor granted her safe haven from an executioner she knows only as Darkness, court life is suffocating the free-spirited Lift, who can't help heading to Yeddaw when she hears the relentless Darkness is there hunting people like her with budding powers. The downtrodden in Yeddaw have no champion, and Lift knows she must seize this awesome responsibility."
27486,Edgedancer,9781250166548,Hardcover,"From #1 New York Times bestselling author Brandon Sanderson, a special gift edition of Edgedancer, a short novel of the Stormlight Archive (previously published in Arcanum Unbounded).Three years ago, Lift asked a goddess to stop her from growing older--a wish she believed was granted. Now, in Edgedancer, the barely teenage nascent Knight Radiant finds that time stands still for no one. Although the young Azish emperor granted her safe haven from an executioner she knows only as Darkness, court life is suffocating the free-spirited Lift, who can't help heading to Yeddaw when she hears the relentless Darkness is there hunting people like her with budding powers. The downtrodden in Yeddaw have no champion, and Lift knows she must seize this awesome responsibility."
342,The Way of Kings,9780765326355,Hardcover,"From #1 New York Times bestselling author Brandon Sanderson, The Way of Kings, book one of The Stormlight Archive begins an incredible new saga of epic proportion.Roshar is a world of stone and storms. Uncanny tempests of incredible power sweep across the rocky terrain so frequently that they have shaped ecology and civilization alike. Animals hide in shells, trees pull in branches, and grass retracts into the soilless ground. Cities are built only where the topography offers shelter.It has been centuries since the fall of the ten consecrated orders known as the Knights Radiant, but their Shardblades and Shardplate remain: mystical swords and suits of armor that transform ordinary men into near-invincible warriors. Men trade kingdoms for Shardblades. Wars were fought for them, and won by them.One such war rages on a ruined landscape called the Shattered Plains. There, Kaladin, who traded his medical apprenticeship for a spear to protect his little brother, has been reduced to slavery. In a war that makes no sense, where ten armies fight separately against a single foe, he struggles to save his men and to fathom the leaders who consider them expendable.Brightlord Dalinar Kholin commands one of those other armies. Like his brother, the late king, he is fascinated by an ancient text called The Way of Kings. Troubled by over-powering visions of ancient times and the Knights Radiant, he has begun to doubt his own sanity.Across the ocean, an untried young woman named Shallan seeks to train under an eminent scholar and notorious heretic, Dalinar's niece, Jasnah. Though she genuinely loves learning, Shallan's motives are less than pure. As she plans a daring theft, her research for Jasnah hints at secrets of the Knights Radiant and the true cause of the war.The result of over ten years of planning, writing, and world-building, The Way of Kings is but the opening movement of the Stormlight Archive, a bold masterpiece in the making.Speak again the ancient oaths:Life before death.Strength before weakness.Journey before Destination.and return to men the Shards they once bore.The Knights Radiant must stand again."
32761,The Way of Kings,9780765326355,Kindle Edition,"From #1 New York Times bestselling author Brandon Sanderson, The Way of Kings, book one of The Stormlight Archive begins an incredible new saga of epic proportion.Roshar is a world of stone and storms. Uncanny tempests of incredible power sweep across the rocky terrain so frequently that they have shaped ecology and civilization alike. Animals hide in shells, trees pull in branches, and grass retracts into the soilless ground. Cities are built only where the topography offers shelter.It has been centuries since the fall of the ten consecrated orders known as the Knights Radiant, but their Shardblades and Shardplate remain: mystical swords and suits of armor that transform ordinary men into near-invincible warriors. Men trade kingdoms for Shardblades. Wars were fought for them, and won by them.One such war rages on a ruined landscape called the Shattered Plains. There, Kaladin, who traded his medical apprenticeship for a spear to protect his little brother, has been reduced to slavery. In a war that makes no sense, where ten armies fight separately against a single foe, he struggles to save his men and to fathom the leaders who consider them expendable.Brightlord Dalinar Kholin commands one of those other armies. Like his brother, the late king, he is fascinated by an ancient text called The Way of Kings. Troubled by over-powering visions of ancient times and the Knights Radiant, he has begun to doubt his own sanity.Across the ocean, an untried young woman named Shallan seeks to train under an eminent scholar and notorious heretic, Dalinar's niece, Jasnah. Though she genuinely loves learning, Shallan's motives are less than pure. As she plans a daring theft, her research for Jasnah hints at secrets of the Knights Radiant and the true cause of the war.The result of over ten years of planning, writing, and world-building, The Way of Kings is but the opening movement of the Stormlight Archive, a bold masterpiece in the making.Speak again the ancient oaths:Life before death.Strength before weakness.Journey before Destination.and return to men the Shards they once bore.The Knights Radiant must stand again."
1296,Wolf Hall,9780312429980,Paperback,"England in the 1520s is a heartbeat from disaster. If the king dies without a male heir, the country could be destroyed by civil war. Henry VIII wants to annul his marriage of twenty years and marry Anne Boleyn. The pope and most of Europe opposes him. Into this impasse steps Thomas Cromwell: a wholly original man, a charmer and a bully, both idealist and opportunist, astute in reading people, and implacable in his ambition. But Henry is volatile: one day tender, one day murderous. Cromwell helps him break the opposition, but what will be the price of his triumph?"
25628,Wolf Hall,9780312429980,Kindle Edition,"England in the 1520s is a heartbeat from disaster. If the king dies without a male heir, the country could be destroyed by civil war. Henry VIII wants to annul his marriage of twenty years and marry Anne Boleyn. The pope and most of Europe opposes him. Into this impasse steps Thomas Cromwell: a wholly original man, a charmer and a bully, both idealist and opportunist, astute in reading people, and implacable in his ambition. But Henry is volatile: one day tender, one day murderous. Cromwell helps him break the opposition, but what will be the price of his triumph?"


In [97]:
# '\"' found in one book description is impending load as it is not escaped.
#df04[df04['description'].str.contains(r'\\"', na=False)]
df04['description'] = df04['description'].str.replace(r'\\"', '', regex=True)

In [98]:
pd.set_option('display.max_colwidth', 50)
df04.groupby(['isbn'])['description'].nunique().sort_values(ascending=False)

isbn
0000195166000    1
978145208533     1
9781451695656    1
9781451697131    1
9781451697186    1
                ..
9781556618673    0
9780753556030    0
20896128         0
9780340564622    0
17259397         0
Name: description, Length: 52421, dtype: int64

### Clean `author`

In [73]:
#pd.set_option('display.max_colwidth', None)
#df05 = df04.copy()
#df05[['author']].head(5)

In [74]:
# Separate distinct contributors to the book
#df05['author'] = df05['author'].str.split(',')
#df05['author']

In [75]:
#pd.set_option('display.max_colwidth', 30)
#df06 = df05.explode('author')
#df06.head(5)

### basic clean `publishDate`

In [104]:
# too long values for date
df04['publishDate'].str.len().max()

209.0

In [105]:
# max sensible data string length is 20. But there are over 300 longer entries to data column.

pd.set_option('display.max_colwidth', 100)

print(" Quantity of lines for 'publishDate' with length over 20: ", df04[(df04['publishDate'].str.len() > 20)]['publishDate'].count())
df04[df04['publishDate'].str.len() > 20].publishDate.head(2)
#df06['publishDate'].isna().sum()
#with np.printoptions(threshold=np.inf):
 #   print(dft['publishDate'].unique())

 Quantity of lines for 'publishDate' with length over 20:  304


2989    Best Books to Read When the Snow Is Falling\r\n\r\n3,839 books — 3,426 voters\r\nI Had No Idea T...
3560    Most Interesting World\r\n\r\n3,055 books — 2,474 voters\r\nThe Best Omnibuses and Box Sets\r\n\...
Name: publishDate, dtype: object

In [106]:
# Remove not date format data and unify date format.
df07 = df04.copy()
df07['publishDate'] = np.where(df07['publishDate'].str.len() > 20, np.NaN, df07['publishDate'])
df07['publishDate'] = np.where(df07['publishDate'].str.contains(r'\w*^$', regex=True), np.NaN, df07['publishDate'])
#df07[df07['publishDate'].str.len() > 20]
#df07['publishDate'].isna().sum
df07['publishDate'].count()

51241

### Clean `pages`

In [107]:
# Pages column should contains integers.
df08 = df07.copy()
df08['pages'] =  df08['pages'].str.replace('1 page', '', regex=True)

### Prepared list column for loading to postgres

In [108]:
df08['ratingsByStars']

0          ['3444695', '1921313', '745221', '171994', '93557']
1            ['1593642', '637516', '222366', '39573', '14526']
2          ['2363896', '1333153', '573280', '149952', '80794']
3           ['1617567', '816659', '373311', '113934', '76770']
4        ['1751460', '1113682', '1008686', '542017', '548674']
                                 ...                          
52473                        ['311', '310', '197', '42', '11']
52474                              ['16', '14', '5', '2', '0']
52475                   ['2109', '1868', '1660', '647', '390']
52476                            ['77', '78', '59', '19', '5']
52477                           ['106', '73', '42', '17', '8']
Name: ratingsByStars, Length: 52424, dtype: object

In [109]:
df09 = df08.copy()
to_replace = {'^\[(\'|\")': '{\"',
              '(\'|\")\]$': '\"}',
              '(\'|\"), (\'|\")': '\", \"',
              '\[\]': '',
              ',(\'|\")': '\"'}
for col in ['genres', 'characters', 'setting', 'ratingsByStars', 'awards']:
    df09[col] = df09[col].replace(to_replace, regex=True)
df09['genres'].head(2)

0    {"Young Adult", "Fiction", "Dystopia", "Fantasy", "Science Fiction", "Romance", "Adventure", "Te...
1    {"Fantasy", "Young Adult", "Fiction", "Magic", "Childrens", "Adventure", "Audiobook", "Middle Gr...
Name: genres, dtype: object

## Save clean dataset

In [110]:
df09.to_csv('data-preparation.csv', index=False) #, lineterminator='\n')

In [111]:
df09.shape

(52424, 25)

In [244]:
df09.columns

Index(['bookId', 'title', 'series', 'author', 'rating', 'description',
       'language', 'isbn', 'genres', 'characters', 'bookFormat', 'edition',
       'pages', 'publisher', 'publishDate', 'firstPublishDate', 'awards',
       'numRatings', 'ratingsByStars', 'likedPercent', 'setting', 'coverImg',
       'bbeScore', 'bbeVotes', 'price'],
      dtype='object')

In [48]:
df09.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68629 entries, 0 to 52477
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   bookId            68629 non-null  object 
 1   title             68629 non-null  object 
 2   series            30140 non-null  object 
 3   author            68629 non-null  object 
 4   rating            68629 non-null  float64
 5   description       66910 non-null  object 
 6   language          64006 non-null  object 
 7   isbn              68629 non-null  object 
 8   genres            68629 non-null  object 
 9   characters        68629 non-null  object 
 10  bookFormat        66876 non-null  object 
 11  edition           7287 non-null   object 
 12  pages             65708 non-null  object 
 13  publisher         64434 non-null  object 
 14  publishDate       67280 non-null  object 
 15  firstPublishDate  42522 non-null  object 
 16  awards            68629 non-null  object

In [268]:
# isbn will be a primary key for book table 
print('Number of unique books from original dataset: ',df09['bookId'].nunique()) 
print('Number of unique books by clean isbn with 3 books having same isbn but different format: ',df09['isbn'].nunique()) 

Number of unique books from original dataset:  52424
Number of unique books by clean isbn with 3 books having same isbn but different format:  52421


In [85]:
import ast  # for literal string parsing
# parse, because contents in data cells were loaded as strings

for i in range(len(df07)):
    s = df07.iat[i,20]
    #(f"row {i} before: {type(s)}")to_replace = {'^\[(\'|\")': '{\"',
              '(\'|\")\]$': '\'}"',
              '(\'|\"), (\'|\")': '\", \"',
              '\[\]': '',
              ',(\'|\")': '\"'}
for col in ['genres', 'characters', 'setting', 'ratingsByStars', 'awards']:
    df08[col] = df08[col].replace(to_replace, regex=True)
df08['genres'].head(2)
    df07.iat[i, 20] = ast.literal_eval(s)
    #print(f"       after: {type(s)}, len={len(s)}")

In [88]:
df08 = df07.copy()
df08 = df08.explode('genres')
df08[df08['isbn']=='1885']

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,bookFormat,edition,pages,publisher,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
3,1885.Pride_and_Prejudice,Pride and Prejudice,,Jane Austen,4.26,"Alternate cover edition of ISBN 9780679783268Since its immediate success in 1813, Pride and Prej...",English,1885,Classics,"[Mr. Bennet, Mrs. Bennet, Jane Bennet, Elizabeth Bennet, Mary Bennet, Kitty Bennet, Lydia Bennet...",Paperback,"Modern Library Classics, USA / CAN",279,Modern Library,10/10/00,01/28/13,[],2998241,"[1617567, 816659, 373311, 113934, 76770]",94.0,"[United Kingdom, Derbyshire, England (United Kingdom), England, Hertfordshire, England (United K...",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1320399351l/1885.jpg,1983116,20452,
3,1885.Pride_and_Prejudice,Pride and Prejudice,,Jane Austen,4.26,"Alternate cover edition of ISBN 9780679783268Since its immediate success in 1813, Pride and Prej...",English,1885,Fiction,"[Mr. Bennet, Mrs. Bennet, Jane Bennet, Elizabeth Bennet, Mary Bennet, Kitty Bennet, Lydia Bennet...",Paperback,"Modern Library Classics, USA / CAN",279,Modern Library,10/10/00,01/28/13,[],2998241,"[1617567, 816659, 373311, 113934, 76770]",94.0,"[United Kingdom, Derbyshire, England (United Kingdom), England, Hertfordshire, England (United K...",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1320399351l/1885.jpg,1983116,20452,
3,1885.Pride_and_Prejudice,Pride and Prejudice,,Jane Austen,4.26,"Alternate cover edition of ISBN 9780679783268Since its immediate success in 1813, Pride and Prej...",English,1885,Romance,"[Mr. Bennet, Mrs. Bennet, Jane Bennet, Elizabeth Bennet, Mary Bennet, Kitty Bennet, Lydia Bennet...",Paperback,"Modern Library Classics, USA / CAN",279,Modern Library,10/10/00,01/28/13,[],2998241,"[1617567, 816659, 373311, 113934, 76770]",94.0,"[United Kingdom, Derbyshire, England (United Kingdom), England, Hertfordshire, England (United K...",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1320399351l/1885.jpg,1983116,20452,
3,1885.Pride_and_Prejudice,Pride and Prejudice,,Jane Austen,4.26,"Alternate cover edition of ISBN 9780679783268Since its immediate success in 1813, Pride and Prej...",English,1885,Historical Fiction,"[Mr. Bennet, Mrs. Bennet, Jane Bennet, Elizabeth Bennet, Mary Bennet, Kitty Bennet, Lydia Bennet...",Paperback,"Modern Library Classics, USA / CAN",279,Modern Library,10/10/00,01/28/13,[],2998241,"[1617567, 816659, 373311, 113934, 76770]",94.0,"[United Kingdom, Derbyshire, England (United Kingdom), England, Hertfordshire, England (United K...",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1320399351l/1885.jpg,1983116,20452,
3,1885.Pride_and_Prejudice,Pride and Prejudice,,Jane Austen,4.26,"Alternate cover edition of ISBN 9780679783268Since its immediate success in 1813, Pride and Prej...",English,1885,Literature,"[Mr. Bennet, Mrs. Bennet, Jane Bennet, Elizabeth Bennet, Mary Bennet, Kitty Bennet, Lydia Bennet...",Paperback,"Modern Library Classics, USA / CAN",279,Modern Library,10/10/00,01/28/13,[],2998241,"[1617567, 816659, 373311, 113934, 76770]",94.0,"[United Kingdom, Derbyshire, England (United Kingdom), England, Hertfordshire, England (United K...",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1320399351l/1885.jpg,1983116,20452,
3,1885.Pride_and_Prejudice,Pride and Prejudice,,Jane Austen,4.26,"Alternate cover edition of ISBN 9780679783268Since its immediate success in 1813, Pride and Prej...",English,1885,Historical,"[Mr. Bennet, Mrs. Bennet, Jane Bennet, Elizabeth Bennet, Mary Bennet, Kitty Bennet, Lydia Bennet...",Paperback,"Modern Library Classics, USA / CAN",279,Modern Library,10/10/00,01/28/13,[],2998241,"[1617567, 816659, 373311, 113934, 76770]",94.0,"[United Kingdom, Derbyshire, England (United Kingdom), England, Hertfordshire, England (United K...",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1320399351l/1885.jpg,1983116,20452,
3,1885.Pride_and_Prejudice,Pride and Prejudice,,Jane Austen,4.26,"Alternate cover edition of ISBN 9780679783268Since its immediate success in 1813, Pride and Prej...",English,1885,Novels,"[Mr. Bennet, Mrs. Bennet, Jane Bennet, Elizabeth Bennet, Mary Bennet, Kitty Bennet, Lydia Bennet...",Paperback,"Modern Library Classics, USA / CAN",279,Modern Library,10/10/00,01/28/13,[],2998241,"[1617567, 816659, 373311, 113934, 76770]",94.0,"[United Kingdom, Derbyshire, England (United Kingdom), England, Hertfordshire, England (United K...",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1320399351l/1885.jpg,1983116,20452,
3,1885.Pride_and_Prejudice,Pride and Prejudice,,Jane Austen,4.26,"Alternate cover edition of ISBN 9780679783268Since its immediate success in 1813, Pride and Prej...",English,1885,Historical Romance,"[Mr. Bennet, Mrs. Bennet, Jane Bennet, Elizabeth Bennet, Mary Bennet, Kitty Bennet, Lydia Bennet...",Paperback,"Modern Library Classics, USA / CAN",279,Modern Library,10/10/00,01/28/13,[],2998241,"[1617567, 816659, 373311, 113934, 76770]",94.0,"[United Kingdom, Derbyshire, England (United Kingdom), England, Hertfordshire, England (United K...",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1320399351l/1885.jpg,1983116,20452,
3,1885.Pride_and_Prejudice,Pride and Prejudice,,Jane Austen,4.26,"Alternate cover edition of ISBN 9780679783268Since its immediate success in 1813, Pride and Prej...",English,1885,Classic Literature,"[Mr. Bennet, Mrs. Bennet, Jane Bennet, Elizabeth Bennet, Mary Bennet, Kitty Bennet, Lydia Bennet...",Paperback,"Modern Library Classics, USA / CAN",279,Modern Library,10/10/00,01/28/13,[],2998241,"[1617567, 816659, 373311, 113934, 76770]",94.0,"[United Kingdom, Derbyshire, England (United Kingdom), England, Hertfordshire, England (United K...",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1320399351l/1885.jpg,1983116,20452,
3,1885.Pride_and_Prejudice,Pride and Prejudice,,Jane Austen,4.26,"Alternate cover edition of ISBN 9780679783268Since its immediate success in 1813, Pride and Prej...",English,1885,Adult,"[Mr. Bennet, Mrs. Bennet, Jane Bennet, Elizabeth Bennet, Mary Bennet, Kitty Bennet, Lydia Bennet...",Paperback,"Modern Library Classics, USA / CAN",279,Modern Library,10/10/00,01/28/13,[],2998241,"[1617567, 816659, 373311, 113934, 76770]",94.0,"[United Kingdom, Derbyshire, England (United Kingdom), England, Hertfordshire, England (United K...",https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1320399351l/1885.jpg,1983116,20452,


In [122]:
pd.set_option('display.max_colwidth', 200)
df08 = df07.copy()
#df08[['genres']]

In [154]:
to_replace = {'^\[(\'|\")': '{\"',
              '(\'|\")\]$': '\'}"',
              '(\'|\"), (\'|\")': '\", \"',
              '\[\]': '',
              ',(\'|\")': '\"'}
for col in ['genres', 'characters', 'setting', 'ratingsByStars', 'awards']:
    df08[col] = df08[col].replace(to_replace, regex=True)
df08['genres'].head(2)

0              {"Young Adult", "Fiction", "Dystopia", "Fantasy", "Science Fiction", "Romance", "Adventure", "Teen", "Post Apocalyptic", "Action'}"
1    {"Fantasy", "Young Adult", "Fiction", "Magic", "Childrens", "Adventure", "Audiobook", "Middle Grade", "Classics", "Science Fiction Fantasy'}"
Name: genres, dtype: object

In [105]:
df07[df07['characters'].str.contains('Moresbury')]['characters']

2252    ['Port Moresbury,', 'Kit Moresbury']
Name: characters, dtype: object

## Prepare tables for load

In [28]:
# author table (isbn, author)
authors = df07[['isbn', 'author']]
authors.head(5)

Unnamed: 0,isbn,author
0,9780439023481,Suzanne Collins
1,9780439358071,J.K. Rowling
1,9780439358071,Mary GrandPré (Illustrator)
2,2657,Harper Lee
3,1885,Jane Austen


In [29]:
# genre table (isbn, genres)
genres = df07[['isbn', 'genres']]
genres = genres.explode('genres')
genres.head(5)

Unnamed: 0,isbn,genres
0,9780439023481,"['Young Adult', 'Fiction', 'Dystopia', 'Fantasy', 'Science Fiction', 'Romance', 'Adventure', 'Te..."
1,9780439358071,"['Fantasy', 'Young Adult', 'Fiction', 'Magic', 'Childrens', 'Adventure', 'Audiobook', 'Middle Gr..."
1,9780439358071,"['Fantasy', 'Young Adult', 'Fiction', 'Magic', 'Childrens', 'Adventure', 'Audiobook', 'Middle Gr..."
2,2657,"['Classics', 'Fiction', 'Historical Fiction', 'School', 'Literature', 'Young Adult', 'Historical..."
3,1885,"['Classics', 'Fiction', 'Romance', 'Historical Fiction', 'Literature', 'Historical', 'Novels', '..."


In [92]:
# characters table (isbn, character)
characters = df07[['isbn', 'characters']]
characters = characters.explode('characters')
characters.head(5)

Unnamed: 0,isbn,characters
0,9780439023481,Katniss Everdeen
0,9780439023481,Peeta Mellark
0,9780439023481,Cato (Hunger Games)
0,9780439023481,Primrose Everdeen
0,9780439023481,Gale Hawthorne


In [94]:
#awards table (isbn, award)
awards = df07[['isbn', 'awards']]
awards = awards.explode('awards')
awards.head(5)

Unnamed: 0,isbn,awards
0,9780439023481,Locus Award Nominee for Best Young Adult Book (2009)
0,9780439023481,Georgia Peach Book Award (2009)
0,9780439023481,Buxtehuder Bulle (2009)
0,9780439023481,Golden Duck Award for Young Adult (Hal Clement Award) (2009)
0,9780439023481,Grand Prix de l'Imaginaire Nominee for Roman jeunesse étranger (2010)


### further cleaning for date , maybe when analyzing, or as suggestion.

In [557]:
print(df07['publishDate'].head(4))
with np.printoptions(threshold=np.inf):
    print(df07['publishDate'].unique())

0    09/14/08
1    09/28/04
1    09/28/04
2    05/23/06
Name: publishDate, dtype: object
['09/14/08' '09/28/04' '05/23/06' '10/10/00' '09/06/06' '03/14/06'
 '04/28/96' '09/16/02' '09/25/12' '04/01/99' '01/10/12' '06/23/07'
 '10/07/64' '10/28/02' '03/28/06' '11/22/05' '06/01/04' '12/01/00'
 '02/04/03' '03/03/87' '11/29/11' '02/28/12' '10/01/99' '01/01/04'
 '04/15/14' '12/31/02' '02/28/99' '03/27/07' '09/30/04' '02/10/09'
 '05/06/03' '11/01/03' '06/29/00' '10/01/01' '01/08/02' '09/23/13'
 '05/12/86' '09/01/98' '06/24/03' '01/30/01' '07/15/03' '03/01/06'
 '06/01/07' '11/07/17' '08/28/05' '09/01/06' '09/01/97' '10/28/00'
 '10/28/88' '11/30/06' '08/29/06' '10/28/03' '05/01/07' '10/28/95'
 '01/12/99' '03/08/18' '05/28/04' '04/28/98' '01/24/06' '09/04/04'
 '10/01/19' '02/04/02' '05/01/90' '10/22/98' '06/28/75' '10/28/98'
 '04/06/04' '10/28/06' '07/21/07' '02/01/63' '10/16/12' '07/26/05'
 '02/01/05' '06/01/98' '03/28/73' '09/16/08' '12/17/13' '09/03/13'
 '12/10/19' '05/30/06' '05/21/19' '10/02

In [531]:
df07[df07['publishDate'].str.contains('214', na=False)]

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,bookFormat,edition,pages,publisher,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
11231,23277468-world-peace,World Peace: The Voice of a Mountain Bird,,Amit Ray,4.62,"This is a fable of a mountain bird, who had a vision to change the world and bring peace on eart...",English,9789382123262,"['Inspirational', 'Spirituality', 'Self Help', 'Nonfiction']",[],Paperback,,164,Inner Light Publishers,September 9th 214,,[],108,"['80', '21', '4', '0', '3']",97.0,[],https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1411874198l/23277468.jpg,200,2,10.94
11231,23277468-world-peace,World Peace: The Voice of a Mountain Bird,,Banani Ray (Goodreads Author),4.62,"This is a fable of a mountain bird, who had a vision to change the world and bring peace on eart...",English,9789382123262,"['Inspirational', 'Spirituality', 'Self Help', 'Nonfiction']",[],Paperback,,164,Inner Light Publishers,September 9th 214,,[],108,"['80', '21', '4', '0', '3']",97.0,[],https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1411874198l/23277468.jpg,200,2,10.94


In [515]:
#df07['publishDate'] = np.where(df07['publishDate'].str.contains('T'),
 #                              df07['publishDate'].str.extract('^(.{10})'),
  #                             df07['publishDate'])
#df07['publishDate'] = pd.to_datetime(df07['publishDate'], errors='coerce', format='%m/%d/%y')
df08 = df07.copy()
#df08['publishDate'] = pd.to_datetime(df08['publishDate'], errors='ignore', format='%B %dth %Y')

In [513]:
print(df08['publishDate'].head(4))
with np.printoptions(threshold=np.inf):
    print(df08['publishDate'].unique())

0   2008-09-14
1   2004-09-28
1   2004-09-28
2   2006-05-23
Name: publishDate, dtype: datetime64[ns]
['2008-09-14T00:00:00.000000000' '2004-09-28T00:00:00.000000000'
 '2006-05-23T00:00:00.000000000' '2000-10-10T00:00:00.000000000'
 '2006-09-06T00:00:00.000000000' '2006-03-14T00:00:00.000000000'
 '1996-04-28T00:00:00.000000000' '2002-09-16T00:00:00.000000000'
 '2012-09-25T00:00:00.000000000' '1999-04-01T00:00:00.000000000'
 '2012-01-10T00:00:00.000000000' '2007-06-23T00:00:00.000000000'
 '2064-10-07T00:00:00.000000000' '2002-10-28T00:00:00.000000000'
 '2006-03-28T00:00:00.000000000' '2005-11-22T00:00:00.000000000'
 '2004-06-01T00:00:00.000000000' '2000-12-01T00:00:00.000000000'
 '2003-02-04T00:00:00.000000000' '1987-03-03T00:00:00.000000000'
 '2011-11-29T00:00:00.000000000' '2012-02-28T00:00:00.000000000'
 '1999-10-01T00:00:00.000000000' '2004-01-01T00:00:00.000000000'
 '2014-04-15T00:00:00.000000000' '2002-12-31T00:00:00.000000000'
 '1999-02-28T00:00:00.000000000' '2007-03-27T00:00:00.

In [514]:
df08['publishDate'].count()

66482

In [558]:
def set_date(col):
    # date_formates = ["21 June, 2018", "12/11/2018 09:15:32", "April-21" ]
    #date_formats = ["%d %B, %Y", "%d/%m/%Y %H:%M:%S", "%B-%y", "%d %B, %Y", "%m/%d/Y"] # Can add different date formats to this list to test
    date_formats = ['%m/%d/%y', '%B %dst %Y', '%B %dnd %Y', '%B %dth %Y', '%Y']
    for x in date_formats:
        col = pd.to_datetime(col, errors="ignore", format= f"{x}")

    col = pd.to_datetime(col, errors="coerce") # To remove errors in the columns like strings or numbers
    return col

In [559]:
df08 = df07.copy()
df08['publishDate2'] = set_date(df08['publishDate'])
df08['publishDate2']

0       2008-09-14
1       2004-09-28
1       2004-09-28
2       2006-05-23
3       2000-10-10
           ...    
52475   2011-03-18
52476   2011-09-01
52476   2011-09-01
52476   2011-09-01
52477   2011-05-08
Name: publishDate2, Length: 68629, dtype: datetime64[ns]

In [560]:
df08[(df08['publishDate'].isna()) & (~df08['publishDate'].isna())]['publishDate']

Series([], Name: publishDate, dtype: object)

In [561]:
df08[df08['publishDate']=='1424']

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,bookFormat,edition,pages,publisher,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price,publishDate2
12872,6097536,إصلاح القلوب,,عمرو خالد,4.18,سلسلة من المحاضرات للداعية عمرو خالد بعنوان إصلاح القلوب,Arabic,6097536,"['Religion', 'Nonfiction', 'Islam']",[],Paperback,,318,الدار العربية للعلوم,1424,,[],756,"['377', '210', '116', '31', '22']",93.0,[],https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1232037376l/6097536.jpg,189,2,,NaT
12872,6097536,إصلاح القلوب,,عمرو خالد,4.18,سلسلة من المحاضرات للداعية عمرو خالد بعنوان إصلاح القلوب,Arabic,6097536,"['Religion', 'Nonfiction', 'Islam']",[],Paperback,,318,الدار العربية للعلوم,1424,,[],756,"['377', '210', '116', '31', '22']",93.0,[],https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1232037376l/6097536.jpg,189,2,,NaT


In [133]:
df08[df08['price'].isna()]

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,bookFormat,edition,pages,publisher,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price


In [141]:
df08['price'] =  df08['price'].fillna('')

In [168]:
df08['description'] = df08['description'].str.replace(r'[\t\r\n]+', '', regex=True)

In [146]:
df08['isbn'].isna().sum()

0

In [177]:
df05[df05['description'].str.contains(r'\\"', na=False)]

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,bookFormat,edition,pages,publisher,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
49329,32182929-lietuva-atsiskleid-ia,Lietuva atsiskleidžia: 99 LGBT+ istorijos,,"[Romas Zabarauskas, Arcana Femina]",4.43,"„Lietuva atsiskleidžia: 99 LGBT+ istorijos\"" pristato skirtingas Lietuvos LGBT+ asmenų patirtis. Teisininkė, elektrikas, šokėjas, aktyvistė, bedarbis, politologas, studentė ir daugybė kitų dalinas...",Lithuanian,9786099577111,"['Nonfiction', 'LGBT']",[],Paperback,First edition,256,Naratyvas,2016,,[],60,"['36', '16', '6', '2', '0']",97.0,[],https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1474616977l/32182929._SY475_.jpg,43,1,


In [156]:
df08 = df08.replace('\\r', '', regex=True)

In [169]:
df08.to_csv('data-preparation.csv', index=False, lineterminator='\n')