# CHILDREN'S BOOKS REVIEWS EDA AND DATA CLEANING

## Importing the libraries and data

In [1]:
import pandas as pd
import numpy as np

pd.options.display.max_rows=350
pd.options.display.max_columns=40


import warnings
warnings.filterwarnings('ignore')

In [2]:
filename2 = '../../ALLWOMEN/_MODULE 7/datasets/goodreads_reviews_children.json' #change your path here
data2 = pd.read_json(filename2,lines=True)

In [3]:
data2.shape

(734640, 11)

In [4]:
reviews=data2.copy()

## EDA and Data Cleaning: Dataset of reviews

In [5]:
reviews.head(3)

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,23310161,f4b4b050f4be00e9283c92a814af2670,4,Fun sequel to the original.,Tue Nov 17 11:37:35 -0800 2015,Tue Nov 17 11:38:05 -0800 2015,,,7,0
1,8842281e1d1347389f2ab93d60773d4d,17290220,22d424a2b0057b18fb6ecf017af7be92,5,One of my favorite books to read to my 5 year ...,Sat Nov 08 08:54:03 -0800 2014,Wed Jan 25 13:56:12 -0800 2017,Tue Jan 24 00:00:00 -0800 2017,,4,0
2,8842281e1d1347389f2ab93d60773d4d,6954929,50ed4431c451d5677d98dd25ca8ec106,5,One of the best and most imaginative childrens...,Thu Oct 23 13:46:20 -0700 2014,Thu Oct 23 13:47:00 -0700 2014,,,6,1


In [6]:
reviews.book_id.nunique()
# We have fewer unique book_id in this dataset than in the description dataset

123946

In [7]:
reviews.columns

Index(['user_id', 'book_id', 'review_id', 'rating', 'review_text',
       'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
       'n_comments'],
      dtype='object')

Let's drop some columns we don't need in this project

In [8]:
reviews=reviews.drop(['date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
       'n_comments'], axis=1)

In [9]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734640 entries, 0 to 734639
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      734640 non-null  object
 1   book_id      734640 non-null  int64 
 2   review_id    734640 non-null  object
 3   rating       734640 non-null  int64 
 4   review_text  734640 non-null  object
dtypes: int64(2), object(3)
memory usage: 28.0+ MB


In [10]:
reviews.duplicated().sum() 

0

In [12]:
def get_uniques(df,lim=20):
    for col in df.columns:
        array=list(df[col].values)
        uniques=set(array)
        if len(uniques) < lim:
            print(col, ":", uniques)
        else:
            print(col, ":", len(uniques), "unique values.")

In [13]:
get_uniques(reviews)

user_id : 92667 unique values.
book_id : 123946 unique values.
review_id : 734640 unique values.
rating : {0, 1, 2, 3, 4, 5}
review_text : 704137 unique values.


In [14]:
reviews.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
book_id,734640.0,9520406.0,10028760.0,5.0,432463.0,6411331.0,17349203.0,36469877.0
rating,734640.0,3.818344,1.240084,0.0,3.0,4.0,5.0,5.0


### User_id

In [15]:
reviews.user_id.value_counts()

a2d6dd1685e5aa0a72c9410f8f55e056    5322
9003d274774f4c47e62f77600b08ac1d    4482
97e2ce2141fa1c880967d78aec3c14fa    3898
6ac35fe952c608da50153d64f616291b    2836
751efd615712748ea54bd36da6e521aa    2824
                                    ... 
168938145a6f059f642bd0eb2b183d5a       1
e4ccf5f545ce85f1eab37856c537b52d       1
54208fdc5f05c626a2765b0d3c480d56       1
e786399c1f21e504609c29808a8d7ec8       1
d699e86772624042c140ac94ca504c91       1
Name: user_id, Length: 92667, dtype: int64

There is an user_id with 5322 different reviews. 

### Rating

In [16]:
reviews.rating.value_counts()

4    253185
5    251400
3    148210
2     40006
0     31113
1     10726
Name: rating, dtype: int64

There are 31113 ratings with a value of 0. We will take a look at this later when we merge the dataframes

### review_id  & review_text

In [17]:
reviews.review_id.nunique()

734640

In [18]:
reviews.review_text.value_counts()

SM                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     2181
A                                                                                                                                                                                                                                                                                                                                                                                                                                           

As we can see, there are 734640 reviews with unique id, but some of the texts of the reviews are repeated.
We will also deal with this after merging the datasets.

## Merging the reviews with the descriptions dataset cleaned before in notebook1

In [20]:
data3=pd.read_csv('descriptionsdfclean.csv') #this is our file from notebook 1

In [21]:
bookstomerge=data3.copy()

In [22]:
bookstomerge.shape

(62395, 15)

In [23]:
bookstomerge.columns

Index(['isbn', 'text_reviews_count', 'is_ebook', 'average_rating',
       'description', 'format', 'publisher', 'num_pages', 'isbn13',
       'publication_year', 'book_id', 'ratings_count', 'title',
       'descriptiondetect', 'titledetect'],
      dtype='object')

In [24]:
reviews.columns

Index(['user_id', 'book_id', 'review_id', 'rating', 'review_text'], dtype='object')

In [25]:
df = pd.merge(reviews,bookstomerge , on=["book_id"])

In [26]:
df.shape

(537462, 19)

In [27]:
df.book_id.nunique() #we have lost 32 books after the merging.

62363

## EDA and Data cleaning of the merged DF

In [28]:
df.isnull().sum().sum()

0

In [29]:
get_uniques(df)

user_id : 73368 unique values.
book_id : 62363 unique values.
review_id : 537462 unique values.
rating : {0, 1, 2, 3, 4, 5}
review_text : 516088 unique values.
isbn : 61037 unique values.
text_reviews_count : 971 unique values.
is_ebook : {False, True}
average_rating : 285 unique values.
description : 62363 unique values.
format : 73 unique values.
publisher : 5366 unique values.
num_pages : 610 unique values.
isbn13 : 62095 unique values.
publication_year : 107 unique values.
ratings_count : 3830 unique values.
title : 62363 unique values.
descriptiondetect : 62363 unique values.
titledetect : 62363 unique values.


### Ratings

In [30]:
df.rating.value_counts()

4    187677
5    183112
3    108046
2     28667
0     22345
1      7615
Name: rating, dtype: int64

In [31]:
df[df.rating==0].nunique() 

user_id                5916
book_id               13067
review_id             22345
rating                    1
review_text           17250
isbn                  12944
text_reviews_count      965
is_ebook                  2
average_rating          237
description           13067
format                   29
publisher              1487
num_pages               448
isbn13                13044
publication_year         86
ratings_count          3175
title                 13067
descriptiondetect     13067
titledetect           13067
dtype: int64

In [32]:
df[df.rating==0]

Unnamed: 0,user_id,book_id,review_id,rating,review_text,isbn,text_reviews_count,is_ebook,average_rating,description,format,publisher,num_pages,isbn13,publication_year,ratings_count,title,descriptiondetect,titledetect
9,ff6d428e075e07702736c9d047b2c046,23310161,c2cad3a2a5b77fe8b8e2e11d466482f8,0,I''m not sure what it is about this kid Duncan...,0399172750,1231,False,4.43,The companion to the #1 blockbuster bestseller...,Hardcover,Philomel Books,36.0,9780399172755,2015,8924,The Day the Crayons Came Home,the companion to the #1 blockbuster bestseller...,the day the crayons came home
57,a33a6e4ca1c8896001a5336fe3bf0ecd,23310161,8bef232bfb6b4e5f6c7d49aad582577d,0,"A beautiful, beautiful story, not only for chi...",0399172750,1231,False,4.43,The companion to the #1 blockbuster bestseller...,Hardcover,Philomel Books,36.0,9780399172755,2015,8924,The Day the Crayons Came Home,the companion to the #1 blockbuster bestseller...,the day the crayons came home
61,9cc3f4b9d56415191cfc24f6f59703d9,23310161,445c2db1565476d1235e86827b433cee,0,"So, so funny! My favorites are Maroon Crayon (...",0399172750,1231,False,4.43,The companion to the #1 blockbuster bestseller...,Hardcover,Philomel Books,36.0,9780399172755,2015,8924,The Day the Crayons Came Home,the companion to the #1 blockbuster bestseller...,the day the crayons came home
83,4473faba02735ae33825baa557747eab,23310161,1a41bd60925de579955dcc7efee07210,0,Read some children's books for a short group a...,0399172750,1231,False,4.43,The companion to the #1 blockbuster bestseller...,Hardcover,Philomel Books,36.0,9780399172755,2015,8924,The Day the Crayons Came Home,the companion to the #1 blockbuster bestseller...,the day the crayons came home
109,5c7f8ab40ef794ea7c5eea62dfe8b8eb,23310161,36f422be8140cb05472e5969f85698b1,0,Another great crayon adventure!,0399172750,1231,False,4.43,The companion to the #1 blockbuster bestseller...,Hardcover,Philomel Books,36.0,9780399172755,2015,8924,The Day the Crayons Came Home,the companion to the #1 blockbuster bestseller...,the day the crayons came home
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537347,cd7d78cff86b42120270ba32267dbb8f,3681950,c5fddb6598d8f79117269ce09d6ab28a,0,** spoiler alert ** \n I really enjoyed this b...,0670831964,3,False,3.62,Does a new name make a new you? Which name? Wh...,Hardcover,Viking Books for Young Readers,0.0,9780670831968,1990,6,Me and My Name,does a new name make a new you? which name? wh...,me and my name
537354,12458c01d2d31e64784fdea05b4fdd4e,23342191,4cc9ba4a23c92daeb9d6d0f49b13cf0d,0,"""The Boxcar Children guide to adventure: a how...",1497698928,4,True,4.37,The Boxcar Children have long been known for b...,ebook,Albert Whitman Company,144.0,9781497698925,2014,14,The Boxcar Children Guide to Adventure: A How-...,the boxcar children have long been known for b...,the boxcar children guide to adventure: a how-...
537381,453d7864278771acecb3a36ab42d0112,14802030,4d73092e30ae59d827acf79c4d2f0add,0,more of a 13yr olds book.couldnt finish,0141338881,3,False,3.90,Hannah & Joey have been best friends forever. ...,Paperback,Puffin Books,190.0,9780141338880,2011,31,Driftwood. Cathy Cassidy,hannah & joey have been best friends forever. ...,driftwood. cathy cassidy
537400,4e71cf218924a68104a10318d1e3d0fe,9995871,9270a3035d52a4bfc73cbbbc21fa4c29,0,-two girls \n -penpals \n - two different stor...,1416940227,5,False,4.12,"Sincerely, Sophie\nEleven year old Sophie Turn...",Paperback,Simon & Schuster Books for Young Readers,416.0,9781416940227,2011,27,"Sincerely: Sincerely, Sophie, Sincerely, Katie","sincerely, sophie\neleven year old sophie turn...","sincerely: sincerely, sophie, sincerely, katie"


## Language detection of the reviews

In [33]:
def preprocess_df(df):
    
    def process_string(x):
                 
        x = x.lower()
        return x
     
    df['review_text_detect'] = df['review_text'].apply(process_string)
    
    return df

In [34]:
df=preprocess_df(df)

In [35]:
from langdetect import detect

In [36]:
# write the function that detects the language
def language_detection(text):
    try:
        return detect(text)
    except:
        return None

#And neither this code to avoid file overwritting.
df.to_csv('reviews_langdetect.csv', index=False)

After running the language detect function, we have a new csv file. We are importing it below to avoid running the langdetect function code every time we restart the kernel.

### Importing the dataset after applying langdetec 

In [37]:
df=pd.read_csv('reviews_langdetect.csv') #comment this line if you are running the code above

In [38]:
df.shape

(537462, 21)

In [39]:
df.columns

Index(['user_id', 'book_id', 'review_id', 'rating', 'review_text', 'isbn',
       'text_reviews_count', 'is_ebook', 'average_rating', 'description',
       'format', 'publisher', 'num_pages', 'isbn13', 'publication_year',
       'ratings_count', 'title', 'descriptiondetect', 'titledetect',
       'review_text_detect', 'language'],
      dtype='object')

##  2º Round of Data Cleaning

In [49]:
df.language.value_counts()

en    507284
Name: language, dtype: int64

In [41]:
df[df.language!= 'en']

Unnamed: 0,user_id,book_id,review_id,rating,review_text,isbn,text_reviews_count,is_ebook,average_rating,description,format,publisher,num_pages,isbn13,publication_year,ratings_count,title,descriptiondetect,titledetect,review_text_detect,language
0,8842281e1d1347389f2ab93d60773d4d,23310161,f4b4b050f4be00e9283c92a814af2670,4,Fun sequel to the original.,0399172750,1231,False,4.43,The companion to the #1 blockbuster bestseller...,Hardcover,Philomel Books,36.0,9780399172755,2015,8924,The Day the Crayons Came Home,the companion to the #1 blockbuster bestseller...,the day the crayons came home,fun sequel to the original.,it
39,fdacc154118f88267b3b7bed7aa51080,23310161,a9b68ab76c24fbf12f0d3fdfadabaa53,4,Yo quiero unos crayones asi:(.,0399172750,1231,False,4.43,The companion to the #1 blockbuster bestseller...,Hardcover,Philomel Books,36.0,9780399172755,2015,8924,The Day the Crayons Came Home,the companion to the #1 blockbuster bestseller...,the day the crayons came home,yo quiero unos crayones asi:(.,es
54,2746c2efcd96500931cf35e7a5967175,23310161,f83c80a8c987aea45e42194bba9058bb,4,"(April 7, 2016) https://i.ytimg.com/vi/ZHle5Gr...",0399172750,1231,False,4.43,The companion to the #1 blockbuster bestseller...,Hardcover,Philomel Books,36.0,9780399172755,2015,8924,The Day the Crayons Came Home,the companion to the #1 blockbuster bestseller...,the day the crayons came home,"(april 7, 2016) https://i.ytimg.com/vi/zhle5gr...",ro
87,c44a8be67a73ef1fab68c8585a60205f,23310161,98179e387c9d60c72c55f7d75e156bbe,5,Laugh out loud funny!,0399172750,1231,False,4.43,The companion to the #1 blockbuster bestseller...,Hardcover,Philomel Books,36.0,9780399172755,2015,8924,The Day the Crayons Came Home,the companion to the #1 blockbuster bestseller...,the day the crayons came home,laugh out loud funny!,fr
89,9d79d624c10936da9a659b45cc2ed2e1,23310161,b5a39cf5352117b7e95ad8501e3b4bc0,4,Esteban!,0399172750,1231,False,4.43,The companion to the #1 blockbuster bestseller...,Hardcover,Philomel Books,36.0,9780399172755,2015,8924,The Day the Crayons Came Home,the companion to the #1 blockbuster bestseller...,the day the crayons came home,esteban!,es
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537427,e84d333d55c67b86fdadfb013150d78c,18224172,a876341a2d86af09fbe44fb1d28891bb,5,I liked this book.,1490591400,7,False,3.67,"An adventure for readers aged 199 - 9, stuffed...",Paperback,Createspace,310.0,9781490591407,2013,17,Like Clockwork - A Complete Adventure Serial,"an adventure for readers aged 199 - 9, stuffed...",like clockwork - a complete adventure serial,i liked this book.,af
537448,0f777b72a1a35f00f7a948bdfd2dae7d,94752,e224e25aebe35316375c02b89b48d38f,2,http://bottomshelfbooks.blogspot.com/...,158642114X,5,False,3.94,"Work, Work, Workis the story of an industrious...",Hardcover,Steerforth,32.0,9781586421144,2006,52,"Work, Work, Work","work, work, workis the story of an industrious...","work, work, work",http://bottomshelfbooks.blogspot.com/...,
537449,0f777b72a1a35f00f7a948bdfd2dae7d,2102369,5741fa4719d0fbbc4d12a186dbc620ef,3,http://bottomshelfbooks.blogspot.com/...,0761452958,3,False,3.53,Yoshi the tanuki--a Japanese raccoon-dog--lear...,Hardcover,Two Lions,32.0,9780761452959,2007,12,The Furry-Legged Teapot,yoshi the tanuki--a japanese raccoon-dog--lear...,the furry-legged teapot,http://bottomshelfbooks.blogspot.com/...,
537450,0f777b72a1a35f00f7a948bdfd2dae7d,291366,bb847402b6571b3252ebc2c366a6bea6,3,http://bottomshelfbooks.blogspot.com/...,0395922720,12,False,4.13,"On June 14, 1940, Hans and Margret Rey fled Pa...",Hardcover,HMH Books for Young Readers,64.0,9780395922729,1998,53,The Original Curious George,"on june 14, 1940, hans and margret rey fled pa...",the original curious george,http://bottomshelfbooks.blogspot.com/...,


In [42]:
df[df.language== 'en']

Unnamed: 0,user_id,book_id,review_id,rating,review_text,isbn,text_reviews_count,is_ebook,average_rating,description,format,publisher,num_pages,isbn13,publication_year,ratings_count,title,descriptiondetect,titledetect,review_text_detect,language
1,d1e368a7d2870eb6fbf6e0d350568a2d,23310161,87fb7685b0b1aa774cc14c330bdcfadf,4,Great sequel! Lost crayons write their owner p...,0399172750,1231,False,4.43,The companion to the #1 blockbuster bestseller...,Hardcover,Philomel Books,36.0,9780399172755,2015,8924,The Day the Crayons Came Home,the companion to the #1 blockbuster bestseller...,the day the crayons came home,great sequel! lost crayons write their owner p...,en
2,7370e3e727f4fdae3ab82f054838d73d,23310161,b8c59999e366608b789114d6fdd52a69,5,Great sequel to the original.,0399172750,1231,False,4.43,The companion to the #1 blockbuster bestseller...,Hardcover,Philomel Books,36.0,9780399172755,2015,8924,The Day the Crayons Came Home,the companion to the #1 blockbuster bestseller...,the day the crayons came home,great sequel to the original.,en
3,ec58d8af7f69e19781abed9d25287a57,23310161,db7984daa9a8c3b49f4c7f9f73ec86ac,5,I loved this book as much as the 1st one.,0399172750,1231,False,4.43,The companion to the #1 blockbuster bestseller...,Hardcover,Philomel Books,36.0,9780399172755,2015,8924,The Day the Crayons Came Home,the companion to the #1 blockbuster bestseller...,the day the crayons came home,i loved this book as much as the 1st one.,en
4,cfe4738a5d926fc158d4fcb1287fe2c5,23310161,f3c100afbe1495fc51bff4d6c3cb7138,4,It was a cute follow-up to the clever first book.,0399172750,1231,False,4.43,The companion to the #1 blockbuster bestseller...,Hardcover,Philomel Books,36.0,9780399172755,2015,8924,The Day the Crayons Came Home,the companion to the #1 blockbuster bestseller...,the day the crayons came home,it was a cute follow-up to the clever first book.,en
5,5a7355dd76fb7649912b783dffe604e1,23310161,cde0bf6dfb9ff09ef13494daa0209284,5,Not as good as the original but fun to have a ...,0399172750,1231,False,4.43,The companion to the #1 blockbuster bestseller...,Hardcover,Philomel Books,36.0,9780399172755,2015,8924,The Day the Crayons Came Home,the companion to the #1 blockbuster bestseller...,the day the crayons came home,not as good as the original but fun to have a ...,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537456,12104bd507c4ed850f90b22243905be0,15767746,8cc43e5a9131e54309e1720dcac1db1e,3,The Red Gorilla of Oz was alright. I was doing...,147742007X,2,False,3.41,"Sebastian is the clumsy, heedless prince of th...",Paperback,CreateSpace,168.0,9781477420072,2012,6,The Red Gorilla of Oz,"sebastian is the clumsy, heedless prince of th...",the red gorilla of oz,the red gorilla of oz was alright. i was doing...,en
537457,0c6a008c66d219aa559aa717d48515a9,139404,d7e673822b2d8683be98d7c34b598354,4,This is a great book for creating a home or fo...,0977706303,7,False,3.94,Nominated for the 2007 Anne Izard Storytellers...,Paperback,Mythic Yoga Studio,114.0,9780977706303,2006,79,Teaching Yoga to Children Through Story,nominated for the 2007 anne izard storytellers...,teaching yoga to children through story,this is a great book for creating a home or fo...,en
537458,3ad19263e87d15206aa3465aa29d59d9,15263392,b8cc2beada2f149ee51146282899801d,4,Andy Griffith's enough said. The king of child...,0230700756,5,False,3.92,Packed with full colour illustrations and diag...,Hardcover,MacMillan Children's Books,96.0,9780230700758,2008,34,What Bumosaur Is That?: An Illustrated Guide t...,packed with full colour illustrations and diag...,what bumosaur is that?: an illustrated guide t...,andy griffith's enough said. the king of child...,en
537459,300842609e2359ae76c4fd6ff60704be,3108554,ce06686588ccd26a4b86f900f15dc0ad,3,A short read about the fear of whats to come w...,019271614X,1,False,4.00,Although life in London during World War II ha...,not defined,Oxford University Press,0.0,9780192716149,1991,13,Paper Faces,although life in london during world war ii ha...,paper faces,a short read about the fear of whats to come w...,en


In [43]:
df=df[df.language== 'en']

In [44]:
df.shape

(507284, 21)

In [45]:
get_uniques(df)

user_id : 68265 unique values.
book_id : 60904 unique values.
review_id : 507284 unique values.
rating : {0, 1, 2, 3, 4, 5}
review_text : 497142 unique values.
isbn : 59599 unique values.
text_reviews_count : 971 unique values.
is_ebook : {False, True}
average_rating : 284 unique values.
description : 60904 unique values.
format : 71 unique values.
publisher : 5267 unique values.
num_pages : 603 unique values.
isbn13 : 60649 unique values.
publication_year : 107 unique values.
ratings_count : 3827 unique values.
title : 60904 unique values.
descriptiondetect : 60904 unique values.
titledetect : 60904 unique values.
review_text_detect : 496634 unique values.
language : {'en'}


In [46]:
df.review_text.value_counts()

O                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            616
Fantastic books for young girls getting into reading!! Great stories about friendship and life lessons. The characters 

In [47]:
df.review_text_detect.value_counts()

o                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  617
fantastic books for young girls getting into reading!! great stories about friendship and life lessons. the characters deal with all sorts of situations and often find responsible solutions to problems. \n i loved this series growing up and wanted to start my own babysitti

In [48]:
df[df.review_text_detect.map(df.review_text_detect.value_counts()) >= 2].sort_values('review_text_detect', ascending=False)

Unnamed: 0,user_id,book_id,review_id,rating,review_text,isbn,text_reviews_count,is_ebook,average_rating,description,format,publisher,num_pages,isbn13,publication_year,ratings_count,title,descriptiondetect,titledetect,review_text_detect,language
324853,a1aa241333c125d7149699d2e9a1b037,1306065,6cc1da563df43711e48eff3bf1abb801,4,{my generic MTH reveiew} \n Miles is now fully...,0375830316,112,False,3.96,Jack and Annie travel back in time to a desert...,Hardcover,Random House Books for Young Readers,128.0,9780375830310,2005,3812,"Season of the Sandstorms (Magic Tree House, #34)",jack and annie travel back in time to a desert...,"season of the sandstorms (magic tree house, #34)",{my generic mth reveiew} \n miles is now fully...,en
324868,a1aa241333c125d7149699d2e9a1b037,301898,9e34ef350f8c6fd2d54c2d7a3ea93a5f,4,{my generic MTH reveiew} \n Miles is now fully...,0375830332,142,False,3.99,MERLIN HAS ASKED Jack and Annie to help on ano...,Hardcover,Random House for Young Readers,128.0,9780375830334,2005,4379,"Carnival at Candlelight (Magic Tree House, #33)",merlin has asked jack and annie to help on ano...,"carnival at candlelight (magic tree house, #33)",{my generic mth reveiew} \n miles is now fully...,en
522452,a1aa241333c125d7149699d2e9a1b037,2288956,872ad62212e2a075da3e6fc4760e0a7d,0,{my generic MTH reveiew} \n Miles is now fully...,0375837280,8,False,4.05,Merlin the Magician will not eat or sleep or s...,Paperback,Random House for Young Readers,128.0,9780375837289,2009,119,"Dragon of the Red Dawn (Magic Tree House, #37)",merlin the magician will not eat or sleep or s...,"dragon of the red dawn (magic tree house, #37)",{my generic mth reveiew} \n miles is now fully...,en
39357,630f1c10f4e76be80e44500189d28ad0,6327,bb2fdd9729a001d4df6fe9b7132036c5,4,Young Readers Choice 1986,0590032496,4302,False,4.17,This is not a fairy tale. This is about realwi...,Paperback,Scholastic Inc.,208.0,9780590032490,1997,227206,The Witches,this is not a fairy tale. this is about realwi...,the witches,young readers choice 1986,en
165671,630f1c10f4e76be80e44500189d28ad0,105999,cb72399dba3f55524885e42079e043ef,3,Young Readers Choice 1986,0380709589,1191,False,3.70,Beverly Cleary's timeless Newbery Medal-winnin...,Paperback,HarperCollins,134.0,9780380709588,2000,29366,"Dear Mr. Henshaw (Leigh Botts, #1)",beverly cleary's timeless newbery medal-winnin...,"dear mr. henshaw (leigh botts, #1)",young readers choice 1986,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16745,42ac014557825144b1a71a2e918a8c2c,157993,bb02db6e00c9485670793aa57bff5ba1,5,"""And now here is my secret, a very simple secr...",0156012197,16639,False,4.28,"Moral allegory and spiritual autobiography, Th...",Paperback,"Harcourt, Inc.",93.0,9780156012195,2000,763309,The Little Prince,"moral allegory and spiritual autobiography, th...",the little prince,"""and now here is my secret, a very simple secr...",en
17142,78f7aed272cc12087e19ce2bcc20a5de,157993,9159ce21e7aa717463fa7dc481696ede,4,"""And now here is my secret, a very simple secr...",0156012197,16639,False,4.28,"Moral allegory and spiritual autobiography, Th...",Paperback,"Harcourt, Inc.",93.0,9780156012195,2000,763309,The Little Prince,"moral allegory and spiritual autobiography, th...",the little prince,"""and now here is my secret, a very simple secr...",en
220210,6e58a5249317da1274e82a6f97822f13,7779,3e07ed91948d425ff7b6d153c420e01a,5,"""A person's a person, no matter how small.""",0679800034,994,False,4.16,Surely among the most lovable of all Dr. Seuss...,Paperback,Random House Books for Young Readers,64.0,9780679800033,1990,81660,Horton Hears a Who!,surely among the most lovable of all dr. seuss...,horton hears a who!,"""a person's a person, no matter how small.""",en
220193,f6eb32a7b73ba3e653302623817c9b71,7779,3903b6f117be1780b4681b3da82af06e,5,"""A person's a person, no matter how small.""",0679800034,994,False,4.16,Surely among the most lovable of all Dr. Seuss...,Paperback,Random House Books for Young Readers,64.0,9780679800033,1990,81660,Horton Hears a Who!,surely among the most lovable of all dr. seuss...,horton hears a who!,"""a person's a person, no matter how small.""",en


In [50]:
df=df.sort_values('text_reviews_count', ascending= False)

In [51]:
df.head(4)

Unnamed: 0,user_id,book_id,review_id,rating,review_text,isbn,text_reviews_count,is_ebook,average_rating,description,format,publisher,num_pages,isbn13,publication_year,ratings_count,title,descriptiondetect,titledetect,review_text_detect,language
20820,b10fe6b69d79b31aea00153d46fd16c5,3636,82574e06eb02f3ccb1cc2423d4276938,5,SPECTACULAR! This book is simply elegant:),385732554,49850,False,4.12,Twelve-year-old Jonas lives in a seemingly ide...,Paperback,Ember,208.0,9780385732550,2006,1311422,"The Giver (The Giver, #1)",twelve-year-old jonas lives in a seemingly ide...,"the giver (the giver, #1)",spectacular! this book is simply elegant:),en
21734,39d0410410b2af134b3dcc66f7a21f5f,3636,da1fadd6ce53c0ab01bcbd1d58c010d9,5,The best YA science fiction I've ever read!,385732554,49850,False,4.12,Twelve-year-old Jonas lives in a seemingly ide...,Paperback,Ember,208.0,9780385732550,2006,1311422,"The Giver (The Giver, #1)",twelve-year-old jonas lives in a seemingly ide...,"the giver (the giver, #1)",the best ya science fiction i've ever read!,en
21756,02a401f76c4cb789b41a03c86aba8333,3636,ff2f2796810b34be15a4cd6adcdf1504,4,Before I started reading the Giver I thought I...,385732554,49850,False,4.12,Twelve-year-old Jonas lives in a seemingly ide...,Paperback,Ember,208.0,9780385732550,2006,1311422,"The Giver (The Giver, #1)",twelve-year-old jonas lives in a seemingly ide...,"the giver (the giver, #1)",before i started reading the giver i thought i...,en
21755,1d4f47fc8b9c55a2b4b44a32a4243ac3,3636,20d31eb9a3c111b19a34b5f551a3bdf5,5,I just re-read this book. The first time I rea...,385732554,49850,False,4.12,Twelve-year-old Jonas lives in a seemingly ide...,Paperback,Ember,208.0,9780385732550,2006,1311422,"The Giver (The Giver, #1)",twelve-year-old jonas lives in a seemingly ide...,"the giver (the giver, #1)",i just re-read this book. the first time i rea...,en


In [52]:
df1=df.drop_duplicates(subset=['review_text_detect'], keep= 'first')

In [53]:
df1.shape

(496634, 21)

In [54]:
df1.isnull().sum().sum()

0

In [55]:
df1.duplicated().sum()

0

In [56]:
get_uniques(df1)

user_id : 67843 unique values.
book_id : 60004 unique values.
review_id : 496634 unique values.
rating : {0, 1, 2, 3, 4, 5}
review_text : 496634 unique values.
isbn : 58707 unique values.
text_reviews_count : 971 unique values.
is_ebook : {False, True}
average_rating : 284 unique values.
description : 60004 unique values.
format : 70 unique values.
publisher : 5242 unique values.
num_pages : 602 unique values.
isbn13 : 59753 unique values.
publication_year : 106 unique values.
ratings_count : 3824 unique values.
title : 60004 unique values.
descriptiondetect : 60004 unique values.
titledetect : 60004 unique values.
review_text_detect : 496634 unique values.
language : {'en'}


In [57]:
df1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
book_id,496634.0,9579397.0,10028250.0,5.0,357664.0,6609748.0,17380069.0,36444005.0
rating,496634.0,3.862239,1.172741,0.0,3.0,4.0,5.0,5.0
text_reviews_count,496634.0,2131.312,6835.664,1.0,32.0,133.0,814.0,49850.0
average_rating,496634.0,3.984773,0.3003256,1.0,3.82,4.01,4.18,5.0
num_pages,496634.0,119.7532,127.4989,0.0,32.0,48.0,195.0,3816.0
publication_year,496634.0,2006.687,9.89018,1899.0,2003.0,2009.0,2013.0,2018.0
ratings_count,496634.0,69650.99,247322.9,1.0,154.0,952.0,11951.0,1876252.0


In [58]:
df=df1.drop(['language'],axis=1)

In [59]:
df.shape

(496634, 20)

In [60]:
df.publisher.value_counts().head(50)

HarperCollins                                 27742
HMH Books for Young Readers                   18650
Candlewick Press                              16938
Scholastic Press                              11652
Little, Brown Books for Young Readers         11287
Puffin Books                                  10589
Disney-Hyperion                               10232
Random House Books for Young Readers           9746
Chronicle Books                                8229
Scholastic Inc.                                8003
Atheneum Books for Young Readers               7564
Scholastic                                     7139
Dial Books                                     6804
HarperCollins Publishers                       6479
Greenwillow Books                              6393
Ember                                          5754
Knopf Books for Young Readers                  5528
Simon  Schuster Books for Young Readers        5477
Harry N. Abrams                                5315
Scholastic P

In [61]:
df.publisher.replace({"HarperCollins Publishers": "HarperCollins", 
                      'Scholastic Press':'Scholastic', 'Scholastic Inc.':'Scholastic',
                      'Scholastic, Inc.':'Scholastic', 'Scholastic Paperbacks':'Scholastic','Puffin':'Puffin Books', 
                      'Knopf Books for Young Readers': 'Knopf',
                     'Candlewick Press': 'Candlewick', }, inplace=True)

### Saving the complete clean DF

In [62]:
df.to_csv('FINAL_COMPLETE_DF.csv', index=False)

### And saving a df just with the final book_id to merge later with the descriptions_df

In [63]:
listbook_id=df.book_id.unique()
book_id = pd.DataFrame(listbook_id, columns=["book_id"])

In [64]:
book_id.to_csv('book_id_total.csv', index= False)