In [2]:
# Dataset: Amazon Books Reviews
# Author: Mohamed Bekheet
# Source: Mohamed Bekheet. (2022). Amazon Books Reviews [Data set]. Kaggle. https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
# URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
# This dataset contains 2 files.
# The first file, the reviews file, contains about 3M user reviews covering 212404 unique books. It is
# part of the Amazon Review Dataset, which contains product reviews and metadata from Amazon, including
# 142.8 million reviews spanning May 1996 - July 2014.
# Reviews Dataset Data Dictionary:
# -Id:                  The Id of the book
# -Title:               Book title
# -Price:               The price of the book
# -User_id:             Id of the user who rated the book
# -profileName:         Name of the user who rated the book
# -review/helpfulness:  Helpfulness rating of the review, e.g. 2/3
# -review/score:        Rating from 0 to 5 for the book
# -review/time:         Time the review was given
# -review/summary:      The summary of a text review
# -review/text:         The full text of a review

# The second file, the Books Details file, contains detailed information about 212404 unique books. It was built by using
# the Google Books API to get detailed information about the books rated in the first file.
# This file contains:

# Book Details Dataset Data Dictionary:
# Title:            Book title
# description:      Description of the book
# authors:          Names of the book's authors
# image:            URL of the book cover
# previewLink:      Link to access this book on Google Books
# publisher:        Name of the publisher
# publishedDate:    The date of publication
# infoLink:         Link to get more information about the book on Google Books
# categories:       Genres of the book
# ratingsCount:     Number of ratings for the book (the dataset page words this as "averaging rating" - verify)

# Both datasets are linked through the unique title of the book.

In [3]:
# Step 0. Load libraries and custom modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import re
# ------------  PREPROCESSING -------------
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer



In [4]:
# Read the raw reviews CSV into a DataFrame, then print its column/dtype
# summary so the schema can be checked before any cleaning.
df_rating_raw = pd.read_csv(filepath_or_buffer='../data/raw/Books_rating.csv')
df_rating_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 10 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Id                  object 
 1   Title               object 
 2   Price               float64
 3   User_id             object 
 4   profileName         object 
 5   review/helpfulness  object 
 6   review/score        float64
 7   review/time         int64  
 8   review/summary      object 
 9   review/text         object 
dtypes: float64(2), int64(1), object(7)
memory usage: 228.9+ MB


In [5]:
# Peek at 10 random review rows. The sample is seeded so that the same rows
# are shown on every Restart & Run All (an unseeded sample changes each run).
df_rating_raw.sample(10, random_state=42)

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
967172,0590847740,The Creepy Creations of Professor Shock (Give ...,,,,2/2,5.0,1045008000,Goosebumps:Creepy Creations of Professor Shock,Goosebumps: Creepy Creations of Professor Shoc...
2065331,0451521196,Wuthering Heights (Signet classics),,A26QQ25ME462ZH,Dr. Janet C. Herrmann,0/0,5.0,1358553600,Great Literature,Others pale by this . Am so glad that i can ha...
2925844,0743500067,The Attorney (Paul Madriani Novels),,A11776PFBYRAYY,"Steven B. Stern ""author""",0/0,5.0,1359590400,Martini's Specialty,If you like reading about the behind the scene...
564143,0310920663,KJV Prophecy Marked Reference Study Bible,,A27F6VUZ4JFHCL,"Balraj Bhachoo ""Raj""",2/39,1.0,1256169600,A fake bible,I saw this heretic on God TV with those two bi...
2947402,0689846002,"New York Is English, Chattanooga Is Creek. (Ri...",17.99,A1SYLII0808HD6,M. Allen Greenbaum,4/4,4.0,1135728000,"4 1/2 A Mad Hatter, Metropolitan Smatter...Gat...","For decades now, San Francisco theatregoers ha..."
1883925,0471230693,Venture Capital and Private Equity: A Casebook...,,,,20/41,2.0,1030406400,Large investment in time for little payback in...,If you are interested in quickly learning abou...
1184500,0520219295,The Georgian Feast: The Vibrant Culture and Sa...,24.33,A1KE1JDMGCCNDV,Alina Kostina,14/14,5.0,992476800,A wonderful find,This book is precious just by the fact that it...
1134187,B000GS6CQM,The Color Purple,,AT375YKDU4KJA,Callie A. Collins,4/8,3.0,1002672000,A Celebration of the Human Spirit?,"As an objective study of racism, feminism, and..."
2265177,B000HMY3ZY,Dr Jekyll And Mr Hyde,,A3QK6W0AMKR03L,C,1/1,4.0,1348876800,Dr. Kekyll and Mr. Hyde,My son had to read this book in high school so...
1506886,B000K1W00M,"Secret On Ararat - Babylon Rising, Book 2",,A3ASIAGLNYSEWW,Minnie D. Thornton,0/1,5.0,1128988800,"Babylon Rising, The secret on Ararat",This book is as well-written as all of Tim LaH...


In [11]:
# The 30 titles with the most review rows (value_counts sorts by descending count).
df_rating_raw['Title'].value_counts().iloc[:30]

Title
The Hobbit                                                          22023
Pride and Prejudice                                                 20371
Atlas Shrugged                                                      12513
Wuthering Heights                                                   10780
The Giver                                                            7644
Great Expectations                                                   7421
Harry Potter and The Sorcerer's Stone                                6796
Of Mice and Men                                                      6728
Brave New World                                                      6312
Mere Christianity                                                    6053
The Picture of Dorian Gray                                           5883
Persuasion                                                           5498
The Great Gatsby                                                     5291
Fahrenheit 451                  

In [3]:
# Read the raw book-details CSV into a DataFrame, then print its
# column/dtype/non-null summary.
df_data_raw = pd.read_csv(filepath_or_buffer='../data/raw/books_data.csv')
df_data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212404 entries, 0 to 212403
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Title          212403 non-null  object 
 1   description    143962 non-null  object 
 2   authors        180991 non-null  object 
 3   image          160329 non-null  object 
 4   previewLink    188568 non-null  object 
 5   publisher      136518 non-null  object 
 6   publishedDate  187099 non-null  object 
 7   infoLink       188568 non-null  object 
 8   categories     171205 non-null  object 
 9   ratingsCount   49752 non-null   float64
dtypes: float64(1), object(9)
memory usage: 16.2+ MB


In [5]:
# Peek at 10 random book-detail rows. The sample is seeded so that the same
# rows are shown on every Restart & Run All (an unseeded sample changes each run).
df_data_raw.sample(10, random_state=42)

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
55420,Exploring Genesis,"""John Phillips writes with enthusiasm and clar...",['John Phillips'],http://books.google.com/books/content?id=kkASY...,http://books.google.com/books?id=kkASYvZl0GMC&...,Kregel Academic,2001-09-01,http://books.google.com/books?id=kkASYvZl0GMC&...,['Religion'],7.0
50485,Preface to the Study of Paul,This 'preface' to the study of Paul attempts t...,['Stephen Westerholm'],http://books.google.com/books/content?id=U-LYA...,http://books.google.com/books?id=U-LYAAAAMAAJ&...,Eerdmans Publishing Company,1997,http://books.google.com/books?id=U-LYAAAAMAAJ&...,['Religion'],
43046,Thicker Than Water,Much of what you've heard about plastic pollut...,['Erica Cirino'],http://books.google.com/books/content?id=2mE7E...,http://books.google.com/books?id=2mE7EAAAQBAJ&...,Island Press,2021-10-07,http://books.google.com/books?id=2mE7EAAAQBAJ&...,['Nature'],
122785,Siege of St. Augustine,,['Ricardo Torres-Reyes'],,http://books.google.nl/books?id=TToTPwAACAAJ&d...,,1972,http://books.google.nl/books?id=TToTPwAACAAJ&d...,"['Saint Augustine Expedition, Fla., 1740']",
119244,Selected stories (The World's classics),Shares twelve stories starring the legendary d...,['Arthur Conan Doyle'],http://books.google.com/books/content?id=uEfbA...,http://books.google.nl/books?id=uEfbAgAAQBAJ&p...,Oxford University Press,2014-04,http://books.google.nl/books?id=uEfbAgAAQBAJ&d...,['Fiction'],
201069,She Who Dreams: A Journey into Healing through...,Wanda Burch dreamt that she would die at a cer...,['Wanda Burch'],http://books.google.com/books/content?id=at6GH...,http://books.google.com/books?id=at6GHjVSS8sC&...,New World Library,2010-11-17,https://play.google.com/store/books/details?id...,['Self-Help'],
53413,Quilt Inspirations from Africa : A Caravan of ...,With its mesmerizing colors and exotic designs...,"['Kaye England', 'Mary Elizabeth Johnson']",http://books.google.com/books/content?id=PW-bP...,http://books.google.com/books?id=PW-bPAAACAAJ&...,Ntc Publishing Group,2000-01-01,http://books.google.com/books?id=PW-bPAAACAAJ&...,['Crafts & Hobbies'],
99660,Lethal Justice,,,,,,,,,
108499,10 Most Common Chess Mistakes,Grandmaster Larry Evans draws upon his vast ex...,['Larry Evans'],http://books.google.com/books/content?id=UcIHA...,http://books.google.com/books?id=UcIHAAAACAAJ&...,Cardoza,2002-09-17,http://books.google.com/books?id=UcIHAAAACAAJ&...,['Games'],
48490,Congressional government;: A study in American...,,['Woodrow Wilson'],http://books.google.com/books/content?id=cmUqA...,http://books.google.com/books?id=cmUqAQAAIAAJ&...,,1898,https://play.google.com/store/books/details?id...,['United States'],


In [9]:
# (rows, columns) of the raw reviews frame.
df_rating_raw.shape

(3000000, 10)

In [10]:
# (rows, columns) of the raw book-details frame.
df_data_raw.shape

(212404, 10)

In [12]:
# Distribution of review scores: number of review rows per distinct score value.
df_rating_raw.loc[:, 'review/score'].value_counts()

review/score
5.0    1807343
4.0     585616
3.0     254295
1.0     201688
2.0     151058
Name: count, dtype: int64

In [13]:
# Missing-value count for each column of the raw reviews frame.
df_rating_raw.isnull().sum(axis=0)

Id                          0
Title                     208
Price                 2518829
User_id                561787
profileName            561905
review/helpfulness          0
review/score                0
review/time                 0
review/summary            407
review/text                 8
dtype: int64

In [14]:
# Missing-value count for each column of the raw book-details frame.
df_data_raw.isnull().sum(axis=0)

Title                 1
description       68442
authors           31413
image             52075
previewLink       23836
publisher         75886
publishedDate     25305
infoLink          23836
categories        41199
ratingsCount     162652
dtype: int64